diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 25117544d6a849..62fac085897ab6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1178,9 +1178,9 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9", FeatureAddNoCarryInsts, FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts, FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16, FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureSupportsXNACK, - FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, - FeatureNegativeScratchOffsetBug, FeatureGWS, FeatureDefaultComponentZero, - FeatureVmemWriteVgprInOrder + FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess, + FeatureUnalignedDSAccess, FeatureNegativeScratchOffsetBug, FeatureGWS, + FeatureDefaultComponentZero, FeatureVmemWriteVgprInOrder ] >; @@ -1199,9 +1199,9 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10", FeatureVOP3Literal, FeatureDPP8, FeatureExtendedImageInsts, FeatureNoDataDepHazard, FeaturePkFmacF16Inst, FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureG16, - FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureImageInsts, - FeatureGDS, FeatureGWS, FeatureDefaultComponentZero, - FeatureMaxHardClauseLength63, + FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess, + FeatureUnalignedDSAccess, FeatureImageInsts, FeatureGDS, FeatureGWS, + FeatureDefaultComponentZero, FeatureMaxHardClauseLength63, FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts, FeatureVmemWriteVgprInOrder @@ -1223,9 +1223,9 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11", FeatureVOP3Literal, FeatureDPP8, FeatureExtendedImageInsts, FeatureNoDataDepHazard, FeaturePkFmacF16Inst, FeatureA16, FeatureFastDenormalF32, FeatureG16, - FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureGDS, - FeatureGWS, FeatureDefaultComponentZero, - 
FeatureMaxHardClauseLength32, + FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess, + FeatureUnalignedDSAccess, FeatureGDS, FeatureGWS, + FeatureDefaultComponentZero, FeatureMaxHardClauseLength32, FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts, FeatureVmemWriteVgprInOrder ] @@ -1246,9 +1246,9 @@ def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12", FeatureVOP3Literal, FeatureDPP8, FeatureNoDataDepHazard, FeaturePkFmacF16Inst, FeatureA16, FeatureFastDenormalF32, FeatureG16, - FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, - FeatureTrue16BitInsts, FeatureDefaultComponentBroadcast, - FeatureMaxHardClauseLength32, + FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess, + FeatureUnalignedDSAccess, FeatureTrue16BitInsts, + FeatureDefaultComponentBroadcast, FeatureMaxHardClauseLength32, FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts, FeatureAgentScopeFineGrainedRemoteMemoryAtomics ] diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 3f4f42377d56ee..d701bf037fdfa6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -387,8 +387,8 @@ bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, // them later if they may access private memory. We don't have enough context // here, and legalization can handle it. 
if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) { - return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) && - ChainSizeInBytes <= ST->getMaxPrivateElementSize(); + return (Alignment >= 4 || ST->hasUnalignedScratchAccessEnabled()) && + ChainSizeInBytes <= ST->getMaxPrivateElementSize(); } return true; } diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 19458126093167..1ea3beb2855d69 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -591,6 +591,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, return UnalignedScratchAccess; } + bool hasUnalignedScratchAccessEnabled() const { + return UnalignedScratchAccess && UnalignedAccessMode; + } + bool hasUnalignedAccessMode() const { return UnalignedAccessMode; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 3d8e03521e2b90..8c197f23149612 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1824,26 +1824,16 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl( Subtarget->hasUnalignedDSAccessEnabled(); } - if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) { - bool AlignedBy4 = Alignment >= Align(4); - if (IsFast) - *IsFast = AlignedBy4; - - return AlignedBy4 || - Subtarget->enableFlatScratch() || - Subtarget->hasUnalignedScratchAccess(); - } - // FIXME: We have to be conservative here and assume that flat operations // will access scratch. If we had access to the IR function, then we // could determine if any private memory was used in the function. 
- if (AddrSpace == AMDGPUAS::FLAT_ADDRESS && - !Subtarget->hasUnalignedScratchAccess()) { + if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS || + AddrSpace == AMDGPUAS::FLAT_ADDRESS) { bool AlignedBy4 = Alignment >= Align(4); if (IsFast) *IsFast = AlignedBy4; - return AlignedBy4; + return AlignedBy4 || Subtarget->hasUnalignedScratchAccessEnabled(); } // So long as they are correct, wide global memory operations perform better diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll index ce528467cd35b4..6e2e88f22600a8 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll @@ -2428,11 +2428,54 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) { ; UNALIGNED_GFX9-LABEL: store_load_i64_unaligned: ; UNALIGNED_GFX9: ; %bb.0: ; %bb ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v1, 15 -; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v2, 0 -; UNALIGNED_GFX9-NEXT: scratch_store_dwordx2 v0, v[1:2], off +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v4, 15 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v1, 4, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v2, 2, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v3, 1, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v0, v4, off ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX9-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v4, 0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v6, 6, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v3, v4, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v5, 3, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v2, v4, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v5, v4, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v7, 5, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v1, v4, off +; 
UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v7, v4, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v8, 7, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v6, v4, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v8, v4, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v4, v0, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr6 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr1 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr3 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr5 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr8 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr7 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr2 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr0 +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v4, v3, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v4, v2, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v4, v5, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v4, v1, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v4, v7, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v4, v6, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v4, v8, off glc ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2441,30 +2484,143 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) { ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v1, 15 ; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v2, 0 -; UNALIGNED_GFX10-NEXT: scratch_store_dwordx2 v0, v[1:2], off +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; 
UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v3, 4, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v5, 2, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v0, v1, off ; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; UNALIGNED_GFX10-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc dlc +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v1, 3, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v6, 5, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v7, 6, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v8, 7, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v4, v2, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v5, v2, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v1, v2, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v3, v2, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v6, v2, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v7, v2, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v8, v2, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v0, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v4, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v5, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v1, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v3, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v6, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v7, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; 
UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v8, off glc dlc ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX10-NEXT: s_setpc_b64 s[30:31] ; ; UNALIGNED_GFX940-LABEL: store_load_i64_unaligned: ; UNALIGNED_GFX940: ; %bb.0: ; %bb ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX940-NEXT: v_mov_b64_e32 v[2:3], 15 -; UNALIGNED_GFX940-NEXT: scratch_store_dwordx2 v0, v[2:3], off sc0 sc1 +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v4, 15 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v1, 4, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v2, 2, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v3, 1, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v0, v4, off sc0 sc1 ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_load_dwordx2 v[0:1], v0, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v4, 0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v6, 6, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v3, v4, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v5, 3, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v2, v4, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v5, v4, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v7, 5, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v1, v4, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v7, v4, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v8, 7, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v6, v4, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v8, v4, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v4, v0, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr6 +; 
UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr1 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr3 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr5 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr8 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr7 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr2 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr0 +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v4, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v4, v2, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v4, v5, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v4, v1, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v4, v7, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v4, v6, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v4, v8, off sc0 sc1 ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX940-NEXT: s_setpc_b64 s[30:31] ; ; UNALIGNED_GFX11-LABEL: store_load_i64_unaligned: ; UNALIGNED_GFX11: ; %bb.0: ; %bb ; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX11-NEXT: v_mov_b32_e32 v1, 15 -; UNALIGNED_GFX11-NEXT: v_mov_b32_e32 v2, 0 -; UNALIGNED_GFX11-NEXT: scratch_store_b64 v0, v[1:2], off dlc +; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v3, 4, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v5, 2, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v0, v1, off dlc ; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; UNALIGNED_GFX11-NEXT: scratch_load_b64 v[0:1], v0, off glc dlc +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v6, 5, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 
v7, 6, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v8, 7, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v4, v2, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v5, v2, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v1, v2, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v3, v2, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v6, v2, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v7, v2, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v8, v2, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v0, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v4, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v5, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v1, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v3, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v6, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v7, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v8, off glc dlc ; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -2475,12 +2631,39 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) { ; UNALIGNED_GFX12-NEXT: s_wait_samplecnt 0x0 ; UNALIGNED_GFX12-NEXT: s_wait_bvhcnt 0x0 ; UNALIGNED_GFX12-NEXT: s_wait_kmcnt 0x0 -; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v1, 15 -; 
UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v2, 0 +; UNALIGNED_GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0 ; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 -; UNALIGNED_GFX12-NEXT: scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v1, off scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 -; UNALIGNED_GFX12-NEXT: scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:1 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:3 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:4 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:5 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:6 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:7 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:1 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:2 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:3 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:4 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:5 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: 
s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:6 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v0, v0, off offset:7 scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 ; UNALIGNED_GFX12-NEXT: s_setpc_b64 s[30:31] bb: @@ -2572,59 +2755,293 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) { ; UNALIGNED_GFX9-LABEL: store_load_v3i32_unaligned: ; UNALIGNED_GFX9: ; %bb.0: ; %bb ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX9-NEXT: s_mov_b32 s2, 3 -; UNALIGNED_GFX9-NEXT: s_mov_b32 s1, 2 -; UNALIGNED_GFX9-NEXT: s_mov_b32 s0, 1 -; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v3, s2 -; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v2, s1 -; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v1, s0 -; UNALIGNED_GFX9-NEXT: scratch_store_dwordx3 v0, v[1:3], off +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v3, 1 +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v1, 2 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v2, 2, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v4, 1, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v0, v3, off ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX9-NEXT: scratch_load_dwordx3 v[0:2], v0, off glc +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v3, 0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v6, 4, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v7, 6, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v9, 8, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v10, 10, v0 +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v12, 3 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v4, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v5, 3, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v2, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v5, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v8, 5, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v6, v1, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; 
UNALIGNED_GFX9-NEXT: scratch_store_byte v8, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v1, 7, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v7, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v1, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v11, 9, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v9, v12, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v11, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v12, 11, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v10, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v12, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v0, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr5 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr9 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr1 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr8 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr2 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr12 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr4 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr11 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr7 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr6 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr10 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr0 +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v4, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v2, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v5, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v6, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v8, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt 
vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v7, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v1, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v9, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v11, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v10, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v12, off glc ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX9-NEXT: s_setpc_b64 s[30:31] ; ; UNALIGNED_GFX10-LABEL: store_load_v3i32_unaligned: ; UNALIGNED_GFX10: ; %bb.0: ; %bb ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX10-NEXT: s_mov_b32 s2, 3 -; UNALIGNED_GFX10-NEXT: s_mov_b32 s1, 2 -; UNALIGNED_GFX10-NEXT: s_mov_b32 s0, 1 -; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v3, s2 -; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v2, s1 -; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v1, s0 -; UNALIGNED_GFX10-NEXT: scratch_store_dwordx3 v0, v[1:3], off +; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v1, 1 +; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v3, 0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v0 +; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v2, 2 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v4, 2, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v0, v1, off ; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; UNALIGNED_GFX10-NEXT: scratch_load_dwordx3 v[0:2], v0, off glc dlc +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v1, 3, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v6, 4, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v7, 5, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v5, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v4, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v8, 6, v0 +; 
UNALIGNED_GFX10-NEXT: scratch_store_byte v1, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v6, v2, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v7, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v2, 7, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v9, 8, v0 +; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v10, 3 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v11, 9, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v12, 10, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v13, 11, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v8, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v2, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v9, v10, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v11, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v12, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v13, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v0, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v5, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v4, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v1, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v6, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v7, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v8, off glc dlc +; 
UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v2, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v9, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v11, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v12, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v13, off glc dlc ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX10-NEXT: s_setpc_b64 s[30:31] ; ; UNALIGNED_GFX940-LABEL: store_load_v3i32_unaligned: ; UNALIGNED_GFX940: ; %bb.0: ; %bb ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX940-NEXT: s_mov_b32 s2, 3 -; UNALIGNED_GFX940-NEXT: s_mov_b32 s1, 2 -; UNALIGNED_GFX940-NEXT: s_mov_b32 s0, 1 -; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v4, s2 -; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v3, s1 -; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v2, s0 -; UNALIGNED_GFX940-NEXT: scratch_store_dwordx3 v0, v[2:4], off sc0 sc1 +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v3, 1 +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v1, 2 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v2, 2, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v4, 1, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v0, v3, off sc0 sc1 ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_load_dwordx3 v[0:2], v0, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v3, 0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v6, 4, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v7, 6, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v9, 8, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v10, 10, v0 +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v12, 3 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v4, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v5, 3, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v2, 
v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v5, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v8, 5, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v6, v1, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v8, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v1, 7, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v7, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v1, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v11, 9, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v9, v12, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v11, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v12, 11, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v10, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v12, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v0, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr5 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr9 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr1 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr8 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr2 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr12 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr4 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr11 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr7 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr6 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr10 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr0 +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v4, 
off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v2, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v5, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v6, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v8, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v7, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v1, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v9, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v11, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v10, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v12, off sc0 sc1 ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX940-NEXT: s_setpc_b64 s[30:31] ; ; UNALIGNED_GFX11-LABEL: store_load_v3i32_unaligned: ; UNALIGNED_GFX11: ; %bb.0: ; %bb ; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX11-NEXT: s_mov_b32 s2, 3 -; UNALIGNED_GFX11-NEXT: s_mov_b32 s1, 2 -; UNALIGNED_GFX11-NEXT: s_mov_b32 s0, 1 -; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v2, s1 -; UNALIGNED_GFX11-NEXT: v_mov_b32_e32 v1, s0 -; UNALIGNED_GFX11-NEXT: scratch_store_b96 v0, v[1:3], off dlc +; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_add_nc_u32 v4, 2, v0 +; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v10, 3 :: v_dual_add_nc_u32 v5, 1, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v0, v1, off dlc ; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; 
UNALIGNED_GFX11-NEXT: scratch_load_b96 v[0:2], v0, off glc dlc +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v6, 4, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v7, 5, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v5, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v4, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v8, 6, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v1, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v6, v2, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v7, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v2, 7, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v9, 8, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v11, 9, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v12, 10, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v13, 11, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v8, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v2, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v9, v10, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v11, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v12, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v13, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v0, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v5, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v4, 
off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v1, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v6, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v7, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v8, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v2, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v9, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v11, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v12, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v13, off glc dlc ; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -2635,16 +3052,57 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) { ; UNALIGNED_GFX12-NEXT: s_wait_samplecnt 0x0 ; UNALIGNED_GFX12-NEXT: s_wait_bvhcnt 0x0 ; UNALIGNED_GFX12-NEXT: s_wait_kmcnt 0x0 -; UNALIGNED_GFX12-NEXT: s_mov_b32 s2, 3 -; UNALIGNED_GFX12-NEXT: s_mov_b32 s1, 2 -; UNALIGNED_GFX12-NEXT: s_mov_b32 s0, 1 -; UNALIGNED_GFX12-NEXT: s_wait_alu 0xfffe -; UNALIGNED_GFX12-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v2, s1 -; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v1, s0 +; UNALIGNED_GFX12-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 0 +; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v3, 2 +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v1, off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:1 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: 
scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:3 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v1, 3 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:5 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:6 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:7 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v1, off offset:8 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:9 scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 -; UNALIGNED_GFX12-NEXT: scratch_store_b96 v0, v[1:3], off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:10 scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 -; UNALIGNED_GFX12-NEXT: scratch_load_b96 v[0:2], v0, off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:11 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:1 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:2 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:3 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:4 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: 
s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:5 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:6 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:7 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:8 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:9 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:10 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v0, v0, off offset:11 scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 ; UNALIGNED_GFX12-NEXT: s_setpc_b64 s[30:31] bb: @@ -2742,64 +3200,382 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) { ; UNALIGNED_GFX9-LABEL: store_load_v4i32_unaligned: ; UNALIGNED_GFX9: ; %bb.0: ; %bb ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX9-NEXT: s_mov_b32 s3, 4 -; UNALIGNED_GFX9-NEXT: s_mov_b32 s2, 3 -; UNALIGNED_GFX9-NEXT: s_mov_b32 s1, 2 -; UNALIGNED_GFX9-NEXT: s_mov_b32 s0, 1 -; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v4, s3 -; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v3, s2 -; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v2, s1 -; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v1, s0 -; UNALIGNED_GFX9-NEXT: scratch_store_dwordx4 v0, v[1:4], off -; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX9-NEXT: scratch_load_dwordx4 v[0:3], v0, off glc +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v3, 1 +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v1, 2 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v2, 2, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v4, 1, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v0, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: 
v_mov_b32_e32 v3, 0 +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v6, 4 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v7, 4, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v8, 6, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v10, 8, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v11, 10, v0 +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v13, 3 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v14, 12, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v15, 14, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v4, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v5, 3, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v2, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v5, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v9, 5, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v7, v1, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v9, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v1, 7, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v8, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v1, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v12, 9, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v10, v13, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v12, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v13, 11, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v11, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v13, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v16, 13, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v14, v6, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v16, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; 
UNALIGNED_GFX9-NEXT: v_add_u32_e32 v6, 15, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v15, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v6, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v0, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v4, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v2, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v5, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v7, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v9, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v8, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v1, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v10, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v12, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v11, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v13, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v14, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v16, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v15, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr11 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr4 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr15 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr10 +; UNALIGNED_GFX9-NEXT: ; kill: 
killed $vgpr7 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr13 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr5 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr14 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr12 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr8 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr2 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr1 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr9 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr16 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr0 +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v0, v6, off glc ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX9-NEXT: s_setpc_b64 s[30:31] ; ; UNALIGNED_GFX10-LABEL: store_load_v4i32_unaligned: ; UNALIGNED_GFX10: ; %bb.0: ; %bb ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX10-NEXT: s_mov_b32 s3, 4 -; UNALIGNED_GFX10-NEXT: s_mov_b32 s2, 3 -; UNALIGNED_GFX10-NEXT: s_mov_b32 s1, 2 -; UNALIGNED_GFX10-NEXT: s_mov_b32 s0, 1 -; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v4, s3 -; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v3, s2 -; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v2, s1 -; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v1, s0 -; UNALIGNED_GFX10-NEXT: scratch_store_dwordx4 v0, v[1:4], off +; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v1, 1 +; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v2, 2 +; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v3, 0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v6, 4, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v0, v1, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v1, 3, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v5, 2, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v7, 5, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v4, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v5, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v1, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt 
null, 0x0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v9, 6, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v6, v2, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v7, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v2, 7, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v10, 8, v0 +; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v11, 3 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v12, 9, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v9, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v13, 10, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v2, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v10, v11, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v12, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v11, 11, v0 +; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v8, 4 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v14, 12, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v15, 13, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v16, 14, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v17, 15, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v13, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v11, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v14, v8, off ; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; UNALIGNED_GFX10-NEXT: scratch_load_dwordx4 v[0:3], v0, off glc dlc +; UNALIGNED_GFX10-NEXT: scratch_store_byte v15, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v16, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v17, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; 
UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v0, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v4, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v5, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v1, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v6, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v7, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v9, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v2, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v10, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v12, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v13, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v11, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v14, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v15, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v16, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v17, off glc dlc ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX10-NEXT: s_setpc_b64 s[30:31] ; ; UNALIGNED_GFX940-LABEL: store_load_v4i32_unaligned: ; UNALIGNED_GFX940: ; %bb.0: ; %bb ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX940-NEXT: s_mov_b32 s3, 4 -; 
UNALIGNED_GFX940-NEXT: s_mov_b32 s2, 3 -; UNALIGNED_GFX940-NEXT: s_mov_b32 s1, 2 -; UNALIGNED_GFX940-NEXT: s_mov_b32 s0, 1 -; UNALIGNED_GFX940-NEXT: v_mov_b64_e32 v[4:5], s[2:3] -; UNALIGNED_GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; UNALIGNED_GFX940-NEXT: scratch_store_dwordx4 v0, v[2:5], off sc0 sc1 +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v3, 1 +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v1, 2 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v2, 2, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v4, 1, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v0, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v3, 0 +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v6, 4 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v7, 4, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v8, 6, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v10, 8, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v11, 10, v0 +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v13, 3 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v14, 12, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v15, 14, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v4, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v5, 3, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v2, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v5, v3, off sc0 sc1 ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_load_dwordx4 v[0:3], v0, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v9, 5, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v7, v1, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v9, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v1, 7, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v8, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v1, v3, off sc0 sc1 
+; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v12, 9, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v10, v13, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v12, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v13, 11, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v11, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v13, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v16, 13, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v14, v6, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v16, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v6, 15, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v15, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v6, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v0, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v4, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v2, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v5, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v7, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v9, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v8, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v1, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: 
s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v10, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v12, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v11, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v13, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v14, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v16, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v15, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr11 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr4 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr15 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr10 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr7 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr13 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr5 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr14 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr12 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr8 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr2 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr1 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr9 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr16 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr0 +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v0, v6, off sc0 sc1 ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX940-NEXT: s_setpc_b64 s[30:31] ; ; UNALIGNED_GFX11-LABEL: store_load_v4i32_unaligned: ; UNALIGNED_GFX11: ; %bb.0: ; %bb ; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX11-NEXT: s_mov_b32 s3, 4 -; UNALIGNED_GFX11-NEXT: s_mov_b32 s2, 3 -; UNALIGNED_GFX11-NEXT: s_mov_b32 s1, 2 -; UNALIGNED_GFX11-NEXT: s_mov_b32 s0, 1 -; 
UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2 -; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 -; UNALIGNED_GFX11-NEXT: scratch_store_b128 v0, v[1:4], off dlc +; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_add_nc_u32 v4, 1, v0 +; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v11, 3 :: v_dual_add_nc_u32 v6, 4, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v0, v1, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v0 +; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v8, 4 :: v_dual_add_nc_u32 v5, 2, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v7, 5, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v4, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v5, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v1, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v9, 6, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v6, v2, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v7, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v2, 7, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v10, 8, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v12, 9, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v9, v3, off dlc ; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; UNALIGNED_GFX11-NEXT: scratch_load_b128 v[0:3], v0, off glc dlc +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v13, 10, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v2, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v10, v11, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v12, v3, off dlc +; 
UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v11, 11, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v14, 12, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v15, 13, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v16, 14, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v17, 15, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v13, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v11, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v14, v8, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v15, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v16, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v17, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v0, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v4, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v5, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v1, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v6, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v7, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v9, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v2, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v10, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v12, off glc dlc +; 
UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v13, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v11, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v14, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v15, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v16, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v17, off glc dlc ; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -2810,17 +3586,74 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) { ; UNALIGNED_GFX12-NEXT: s_wait_samplecnt 0x0 ; UNALIGNED_GFX12-NEXT: s_wait_bvhcnt 0x0 ; UNALIGNED_GFX12-NEXT: s_wait_kmcnt 0x0 -; UNALIGNED_GFX12-NEXT: s_mov_b32 s3, 4 -; UNALIGNED_GFX12-NEXT: s_mov_b32 s2, 3 -; UNALIGNED_GFX12-NEXT: s_mov_b32 s1, 2 -; UNALIGNED_GFX12-NEXT: s_mov_b32 s0, 1 -; UNALIGNED_GFX12-NEXT: s_wait_alu 0xfffe -; UNALIGNED_GFX12-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2 -; UNALIGNED_GFX12-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; UNALIGNED_GFX12-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 0 +; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v3, 2 +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v1, off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:1 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:3 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v3, 
off offset:4 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v1, 3 +; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v3, 4 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:5 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:6 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:7 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v1, off offset:8 scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 -; UNALIGNED_GFX12-NEXT: scratch_store_b128 v0, v[1:4], off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:9 scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 -; UNALIGNED_GFX12-NEXT: scratch_load_b128 v[0:3], v0, off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:10 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:11 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v3, off offset:12 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:13 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:14 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:15 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:1 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:2 scope:SCOPE_SYS +; 
UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:3 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:4 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:5 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:6 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:7 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:8 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:9 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:10 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:11 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:12 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:13 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:14 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v0, v0, off offset:15 scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 ; UNALIGNED_GFX12-NEXT: s_setpc_b64 s[30:31] bb: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir index b1d7d36f9912e7..032ca7c0d4fee9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir @@ -483,40 
+483,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX11PLUS-LABEL: name: test_load_flat_s16_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX12-LABEL: name: test_load_flat_s16_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - 
; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s16_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -664,40 +646,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 2) + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX11PLUS-LABEL: name: test_load_flat_s32_align2 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: 
[[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 2) + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX12-LABEL: name: test_load_flat_s32_align2 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 2) + ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s32_align2 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -798,70 +762,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; 
GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR2]](s32) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 1) + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX11PLUS-LABEL: name: test_load_flat_s32_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; 
GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[OR2]](s32) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 1) + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX12-LABEL: name: test_load_flat_s32_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: 
[[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: $vgpr0 = COPY [[OR2]](s32) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 1) + ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s32_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -1247,76 +1163,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = 
G_ANYEXT [[OR1]](s32) - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] - ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[OR2]](s64) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 2) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) ; ; GFX11PLUS-LABEL: name: test_load_flat_s64_align2 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL 
[[ANYEXT]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] - ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[OR2]](s64) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 2) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) ; ; GFX12-LABEL: name: test_load_flat_s64_align2 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[OR2]](s64) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 
2) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s64_align2 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -1485,130 +1347,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: 
[[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[OR6]](s64) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 1) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) ; ; GFX11PLUS-LABEL: name: test_load_flat_s64_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 
1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; 
GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[OR6]](s64) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 1) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) ; ; GFX12-LABEL: name: test_load_flat_s64_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; 
GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[OR6]](s64) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD 
[[COPY]](p0) :: (load (s64), align 1) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s64_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -2075,87 +1829,24 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s16) from unknown-address + 8) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s16) from unknown-address + 10) - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], 
[[C1]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) - ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 2) + ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; ; GFX11PLUS-LABEL: name: test_load_flat_s96_align2 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) - ; 
GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s16) from unknown-address + 8) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s16) from unknown-address + 10) - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) - ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 2) + ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; ; GFX12-LABEL: name: test_load_flat_s96_align2 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from 
unknown-address + 6) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s16) from unknown-address + 8) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s16) from unknown-address + 10) - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) - ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 2) + ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s96_align2 @@ -2369,165 +2060,24 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; 
GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = 
G_OR [[SHL5]], [[OR3]] - ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX9PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX9PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX9PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) - ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 1) + ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; ; GFX11PLUS-LABEL: name: test_load_flat_s96_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD 
[[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], 
[[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX11PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX11PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX11PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX11PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX11PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX11PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = 
G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) - ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 1) + ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; ; GFX12-LABEL: name: test_load_flat_s96_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: 
[[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64) - ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX12-NEXT: 
[[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) - ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 1) + ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s96_align1 @@ -3334,210 +2884,24 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address 
+ 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX9PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; 
GFX9PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX9PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX9PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 - ; GFX9PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX9PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX9PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX9PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX9PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX9PLUS-NEXT: 
[[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX9PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) - ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 1) + ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; ; GFX11PLUS-LABEL: name: test_load_flat_s128_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) 
- ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX11PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX11PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD 
[[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX11PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX11PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX11PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX11PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX11PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 - ; GFX11PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX11PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX11PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX11PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX11PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX11PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL 
[[LOAD3]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX11PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) - ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 1) + ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; ; GFX12-LABEL: name: test_load_flat_s128_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: 
[[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64) - ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], 
[[ZEXTLOAD6]] - ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX12-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 - ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x 
s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) - ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 1) + ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s128_align1 @@ -4132,133 +3496,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: 
[[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX9PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR6]](s64) - ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p0) :: (load (p1), align 1) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1) ; ; GFX11PLUS-LABEL: name: test_load_flat_p1_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; 
GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = 
G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX11PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR6]](s64) - ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p0) :: (load (p1), align 1) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1) ; ; GFX12-LABEL: name: test_load_flat_p1_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: 
[[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; 
GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR6]](s64) - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p0) :: (load (p1), align 1) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_p1_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -4662,79 +3915,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) 
= G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] - ; GFX9PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR2]](s64) - ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p4) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4), align 2) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4) ; ; GFX11PLUS-LABEL: name: test_load_flat_p4_align2 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: 
[[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] - ; GFX11PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR2]](s64) - ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p4) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4), align 2) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4) ; ; GFX12-LABEL: name: test_load_flat_p4_align2 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) - ; GFX12-NEXT: 
[[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] - ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR2]](s64) - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p4) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4), align 2) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_p4_align2 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -4906,133 +4102,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: 
[[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX9PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR6]](s64) - ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p4) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4), align 1) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4) ; ; GFX11PLUS-LABEL: name: test_load_flat_p4_align1 ; 
GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from 
unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX11PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR6]](s64) - ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p4) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4), align 1) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4) ; ; GFX12-LABEL: name: test_load_flat_p4_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = 
G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: 
[[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR6]](s64) - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p4) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4), align 1) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_p4_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -5274,43 +4359,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5), align 2) + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](p5) ; ; GFX11PLUS-LABEL: name: test_load_flat_p5_align2 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) 
:: (load (s16)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5), align 2) + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](p5) ; ; GFX12-LABEL: name: test_load_flat_p5_align2 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) - ; GFX12-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5), align 2) + ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p5) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_p5_align2 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -5416,73 +4480,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; 
GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5), align 1) + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](p5) ; ; GFX11PLUS-LABEL: name: test_load_flat_p5_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load 
(s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5), align 1) + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](p5) ; ; GFX12-LABEL: name: test_load_flat_p5_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; 
GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) - ; GFX12-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5), align 1) + ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p5) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_p5_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -5732,40 +4745,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX11PLUS-LABEL: 
name: test_load_flat_v2s8_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX12-LABEL: name: test_load_flat_v2s8_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2s8_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -6158,121 +5153,106 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; 
GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 2) + ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]] - ; GFX9PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C3]](s32) + ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C2]](s32) + ; GFX9PLUS-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) ; GFX9PLUS-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; GFX9PLUS-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]] + ; 
GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; GFX9PLUS-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX9PLUS-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]] - ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C5]](s16) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]] + ; GFX9PLUS-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]] + ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C4]](s16) + ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL1]] ; GFX9PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; GFX9PLUS-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]] + ; GFX9PLUS-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]] ; GFX9PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32) - ; GFX9PLUS-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]] - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C5]](s16) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL3]] - ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) - ; GFX9PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR4]](s32) + ; GFX9PLUS-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] + ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) + ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL2]] + ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL 
[[ZEXT1]], [[C1]](s32) + ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]] + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR3]](s32) ; ; GFX11PLUS-LABEL: name: test_load_flat_v3s8_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 2) + ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]] - ; GFX11PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C3]](s32) + ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX11PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], 
[[C2]](s32) + ; GFX11PLUS-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) ; GFX11PLUS-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; GFX11PLUS-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]] + ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; GFX11PLUS-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] ; GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX11PLUS-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]] - ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C5]](s16) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]] + ; GFX11PLUS-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]] + ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C4]](s16) + ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL1]] ; GFX11PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; GFX11PLUS-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]] + ; GFX11PLUS-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]] ; GFX11PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32) - ; GFX11PLUS-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]] - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C5]](s16) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL3]] - ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) - ; GFX11PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[OR4]](s32) + ; GFX11PLUS-NEXT: 
[[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] + ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) + ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL2]] + ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX11PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32) + ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]] + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[OR3]](s32) ; ; GFX12-LABEL: name: test_load_flat_v3s8_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 2) + ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR 
[[SHL1]], [[OR]] - ; GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C1]](s32) - ; GFX12-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C3]](s32) + ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C2]](s32) + ; GFX12-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) ; GFX12-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; GFX12-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]] + ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; GFX12-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX12-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]] - ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C5]](s16) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]] + ; GFX12-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]] + ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C4]](s16) + ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL1]] ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; GFX12-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]] + ; GFX12-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]] ; GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32) - ; GFX12-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]] - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C5]](s16) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL3]] - ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) - ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL 
[[ZEXT1]], [[C3]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] - ; GFX12-NEXT: $vgpr0 = COPY [[OR4]](s32) + ; GFX12-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] + ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) + ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL2]] + ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32) + ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]] + ; GFX12-NEXT: $vgpr0 = COPY [[OR3]](s32) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v3s8_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -6503,40 +5483,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 2) + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX11PLUS-LABEL: name: test_load_flat_v4s8_align2 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD 
[[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 2) + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX12-LABEL: name: test_load_flat_v4s8_align2 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 2) + ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v4s8_align2 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -6638,70 +5600,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: 
[[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR2]](s32) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 1) + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX11PLUS-LABEL: name: test_load_flat_v4s8_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: 
[[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[OR2]](s32) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 1) + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX12-LABEL: name: test_load_flat_v4s8_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: 
(load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: $vgpr0 = COPY [[OR2]](s32) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 1) + ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v4s8_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -7185,40 +6099,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16)) - ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>), align 2) + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v2s16_align2 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16)) - ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: 
[[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>), align 2) + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) ; ; GFX12-LABEL: name: test_load_flat_v2s16_align2 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16)) - ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX12-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>), align 2) + ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2s16_align2 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -7327,70 +6223,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: 
[[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>), align 1) + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v2s16_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; 
GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>), align 1) + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) ; ; GFX12-LABEL: name: test_load_flat_v2s16_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], 
[[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX12-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>), align 1) + ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2s16_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -8291,36 +7139,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: 
[[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) + ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2, align 1) + ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4, align 1) + ; GFX9PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9PLUS-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9PLUS-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = 
G_BITCAST [[UV]](<2 x s16>) ; GFX9PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32) + ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32) ; GFX9PLUS-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9PLUS-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) ; GFX9PLUS-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) @@ -8334,36 +7168,22 @@ body: | ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) 
from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) + ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2, align 1) + ; GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4, align 1) + ; GFX11PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX11PLUS-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX11PLUS-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) ; GFX11PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32) + ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX11PLUS-NEXT: 
[[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32) ; GFX11PLUS-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX11PLUS-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) ; GFX11PLUS-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) @@ -8377,36 +7197,22 @@ body: | ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) - ; GFX12-NEXT: 
[[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) + ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2, align 1) + ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4, align 1) + ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX12-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX12-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) ; GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32) + ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32) ; GFX12-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) ; GFX12-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) @@ -8765,70 +7571,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load 
(s16)) - ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX9PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) - ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>), align 2) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v4s16_align2 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16)) - ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 
2 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX11PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX11PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) - ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>), align 2) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) ; ; GFX12-LABEL: name: test_load_flat_v4s16_align2 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16)) - ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) 
:: (load (s16) from unknown-address + 2) - ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>), align 2) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v4s16_align2 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -9005,124 +7763,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], 
[[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], 
[[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32) - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX9PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) - ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>), align 1) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v4s16_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: 
[[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32) - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX11PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX11PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) - ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; GFX11PLUS-NEXT: 
[[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>), align 1) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) ; ; GFX12-LABEL: name: test_load_flat_v4s16_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = 
G_SHL [[LOAD2]], [[C1]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32) - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>), align 1) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v4s16_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -10686,133 +9342,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX9PLUS-NEXT: 
[[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s16) from unknown-address + 8) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s16) from unknown-address + 10) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR3]](s32) - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s16) from unknown-address + 12) - ; 
GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s16) from unknown-address + 14) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR4]](s32) - ; GFX9PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C3]](s32) - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s64) = G_OR [[SHL5]], [[ZEXT1]] - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR2]](s64), [[OR5]](s64) - ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 2) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v2s64_align2 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from 
unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s16) from unknown-address + 8) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s16) from unknown-address + 10) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR3]](s32) - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s16) from unknown-address + 12) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s16) from unknown-address + 14) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR4]](s32) - ; GFX11PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY 
[[C3]](s32) - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s64) = G_OR [[SHL5]], [[ZEXT1]] - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR2]](s64), [[OR5]](s64) - ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 2) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>) ; ; GFX12-LABEL: name: test_load_flat_v2s64_align2 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s64) 
= G_SHL [[ANYEXT]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s16) from unknown-address + 8) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s16) from unknown-address + 10) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR3]](s32) - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s16) from unknown-address + 12) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s16) from unknown-address + 14) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR4]](s32) - ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C3]](s32) - ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s64) = G_OR [[SHL5]], [[ZEXT1]] - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR2]](s64), [[OR5]](s64) - ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 2) + ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2s64_align2 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -11100,235 
+9645,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from 
unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX9PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX9PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX9PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] - ; GFX9PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX9PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD 
[[PTR_ADD9]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX9PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] - ; GFX9PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] - ; GFX9PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) - ; GFX9PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX9PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX9PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] - ; GFX9PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX9PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX9PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] - ; GFX9PLUS-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] - ; GFX9PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) - ; GFX9PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX9PLUS-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX9PLUS-NEXT: 
[[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64) - ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v2s64_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT 
[[OR2]](s32) - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX11PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX11PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX11PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = 
G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX11PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] - ; GFX11PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX11PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX11PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] - ; GFX11PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] - ; GFX11PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) - ; GFX11PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX11PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX11PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] - ; GFX11PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX11PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX11PLUS-NEXT: 
[[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] - ; GFX11PLUS-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] - ; GFX11PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) - ; GFX11PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX11PLUS-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX11PLUS-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64) - ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>) ; ; GFX12-LABEL: name: test_load_flat_v2s64_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD 
[[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX12-NEXT: [[C6:%[0-9]+]]:_(s64) = 
G_CONSTANT i64 8 - ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] - ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] - ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32) - ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] - ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) - ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] - ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from 
unknown-address + 14) - ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] - ; GFX12-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32) - ; GFX12-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] - ; GFX12-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) - ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX12-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX12-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64) - ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1) + ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2s64_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -12078,342 +10410,42 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1) + ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], 
[[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL 
[[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX9PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX9PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX9PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] - ; GFX9PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX9PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX9PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] - ; GFX9PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] - ; GFX9PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) - ; GFX9PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) - ; GFX9PLUS-NEXT: 
[[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX9PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX9PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] - ; GFX9PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX9PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX9PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] - ; GFX9PLUS-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] - ; GFX9PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) - ; GFX9PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX9PLUS-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX9PLUS-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] - ; GFX9PLUS-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; GFX9PLUS-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p0) :: (load (s8) from unknown-address + 16) - ; GFX9PLUS-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p0) :: (load (s8) from unknown-address + 17) - ; GFX9PLUS-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32) - ; 
GFX9PLUS-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[ZEXTLOAD12]] - ; GFX9PLUS-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p0) :: (load (s8) from unknown-address + 18) - ; GFX9PLUS-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD17]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p0) :: (load (s8) from unknown-address + 19) - ; GFX9PLUS-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD14]] - ; GFX9PLUS-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[OR15]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[OR14]] - ; GFX9PLUS-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR16]](s32) - ; GFX9PLUS-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p0) :: (load (s8) from unknown-address + 20) - ; GFX9PLUS-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p0) :: (load (s8) from unknown-address + 21) - ; GFX9PLUS-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[ZEXTLOAD15]] - ; GFX9PLUS-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p0) :: (load (s8) from unknown-address + 22) - ; GFX9PLUS-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD21]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p0) :: (load (s8) from unknown-address + 23) - ; GFX9PLUS-NEXT: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR18:%[0-9]+]]:_(s32) = G_OR [[SHL18]], [[ZEXTLOAD17]] - ; GFX9PLUS-NEXT: 
[[SHL19:%[0-9]+]]:_(s32) = G_SHL [[OR18]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR19:%[0-9]+]]:_(s32) = G_OR [[SHL19]], [[OR17]] - ; GFX9PLUS-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[OR19]](s32) - ; GFX9PLUS-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX9PLUS-NEXT: [[SHL20:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT2]], [[COPY2]](s32) - ; GFX9PLUS-NEXT: [[OR20:%[0-9]+]]:_(s64) = G_OR [[SHL20]], [[ZEXT2]] + ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p0) :: (load (s64) from unknown-address + 16, align 1) + ; GFX9PLUS-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[LOAD]](<2 x s64>) ; GFX9PLUS-NEXT: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF - ; GFX9PLUS-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64), [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>) - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64), [[OR20]](s64), [[UV3]](s64) + ; GFX9PLUS-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64), [[UV4:%[0-9]+]]:_(s64), [[UV5:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>) + ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[UV]](s64), [[UV1]](s64), [[LOAD1]](s64), [[UV5]](s64) ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s64>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v3s64_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1) + ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD 
[[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: 
[[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX11PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX11PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX11PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX11PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] - ; GFX11PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX11PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX11PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] - ; GFX11PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32) - ; 
GFX11PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] - ; GFX11PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) - ; GFX11PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX11PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX11PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] - ; GFX11PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX11PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX11PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] - ; GFX11PLUS-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] - ; GFX11PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) - ; GFX11PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX11PLUS-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX11PLUS-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] - ; GFX11PLUS-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; GFX11PLUS-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p0) :: (load (s8) from unknown-address + 16) - ; GFX11PLUS-NEXT: 
[[PTR_ADD16:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p0) :: (load (s8) from unknown-address + 17) - ; GFX11PLUS-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[ZEXTLOAD12]] - ; GFX11PLUS-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p0) :: (load (s8) from unknown-address + 18) - ; GFX11PLUS-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD17]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p0) :: (load (s8) from unknown-address + 19) - ; GFX11PLUS-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD14]] - ; GFX11PLUS-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[OR15]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[OR14]] - ; GFX11PLUS-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR16]](s32) - ; GFX11PLUS-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p0) :: (load (s8) from unknown-address + 20) - ; GFX11PLUS-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p0) :: (load (s8) from unknown-address + 21) - ; GFX11PLUS-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[ZEXTLOAD15]] - ; GFX11PLUS-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p0) :: (load (s8) from unknown-address + 22) - ; GFX11PLUS-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD21]], [[C]](s64) - ; 
GFX11PLUS-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p0) :: (load (s8) from unknown-address + 23) - ; GFX11PLUS-NEXT: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR18:%[0-9]+]]:_(s32) = G_OR [[SHL18]], [[ZEXTLOAD17]] - ; GFX11PLUS-NEXT: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[OR18]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR19:%[0-9]+]]:_(s32) = G_OR [[SHL19]], [[OR17]] - ; GFX11PLUS-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[OR19]](s32) - ; GFX11PLUS-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX11PLUS-NEXT: [[SHL20:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT2]], [[COPY2]](s32) - ; GFX11PLUS-NEXT: [[OR20:%[0-9]+]]:_(s64) = G_OR [[SHL20]], [[ZEXT2]] + ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p0) :: (load (s64) from unknown-address + 16, align 1) + ; GFX11PLUS-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[LOAD]](<2 x s64>) ; GFX11PLUS-NEXT: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF - ; GFX11PLUS-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64), [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>) - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64), [[OR20]](s64), [[UV3]](s64) + ; GFX11PLUS-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64), [[UV4:%[0-9]+]]:_(s64), [[UV5:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>) + ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[UV]](s64), [[UV1]](s64), [[LOAD1]](s64), [[UV5]](s64) ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s64>) ; ; GFX12-LABEL: name: test_load_flat_v3s64_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x 
s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1) + ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - 
; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX12-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] - ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] - ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL 
[[OR8]], [[C3]](s32) - ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] - ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) - ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] - ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] - ; GFX12-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32) - ; GFX12-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] - ; GFX12-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) - ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX12-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX12-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] - ; GFX12-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; GFX12-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64) - ; GFX12-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p0) :: (load (s8) from unknown-address + 16) - ; GFX12-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C]](s64) - ; GFX12-NEXT: 
[[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p0) :: (load (s8) from unknown-address + 17) - ; GFX12-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32) - ; GFX12-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[ZEXTLOAD12]] - ; GFX12-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p0) :: (load (s8) from unknown-address + 18) - ; GFX12-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD17]], [[C]](s64) - ; GFX12-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p0) :: (load (s8) from unknown-address + 19) - ; GFX12-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD14]] - ; GFX12-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[OR15]], [[C3]](s32) - ; GFX12-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[OR14]] - ; GFX12-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR16]](s32) - ; GFX12-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p0) :: (load (s8) from unknown-address + 20) - ; GFX12-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p0) :: (load (s8) from unknown-address + 21) - ; GFX12-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32) - ; GFX12-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[ZEXTLOAD15]] - ; GFX12-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p0) :: (load (s8) from unknown-address + 22) - ; GFX12-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD21]], [[C]](s64) - ; GFX12-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p0) :: (load (s8) from unknown-address + 23) - ; GFX12-NEXT: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], 
[[C1]](s32) - ; GFX12-NEXT: [[OR18:%[0-9]+]]:_(s32) = G_OR [[SHL18]], [[ZEXTLOAD17]] - ; GFX12-NEXT: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[OR18]], [[C3]](s32) - ; GFX12-NEXT: [[OR19:%[0-9]+]]:_(s32) = G_OR [[SHL19]], [[OR17]] - ; GFX12-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[OR19]](s32) - ; GFX12-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX12-NEXT: [[SHL20:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT2]], [[COPY2]](s32) - ; GFX12-NEXT: [[OR20:%[0-9]+]]:_(s64) = G_OR [[SHL20]], [[ZEXT2]] + ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p0) :: (load (s64) from unknown-address + 16, align 1) + ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[LOAD]](<2 x s64>) ; GFX12-NEXT: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF - ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64), [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>) - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64), [[OR20]](s64), [[UV3]](s64) + ; GFX12-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64), [[UV4:%[0-9]+]]:_(s64), [[UV5:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>) + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[UV]](s64), [[UV1]](s64), [[LOAD1]](s64), [[UV5]](s64) ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s64>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v3s64_align1 @@ -13306,441 +11338,33 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1) + ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = 
G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load 
(s8) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX9PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX9PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX9PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] - ; GFX9PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX9PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX9PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] - ; GFX9PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], 
[[C3]](s32) - ; GFX9PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] - ; GFX9PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) - ; GFX9PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX9PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX9PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] - ; GFX9PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX9PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX9PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] - ; GFX9PLUS-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] - ; GFX9PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) - ; GFX9PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX9PLUS-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX9PLUS-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64) - ; GFX9PLUS-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; GFX9PLUS-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = 
G_ZEXTLOAD [[PTR_ADD15]](p0) :: (load (s8) from unknown-address + 16) - ; GFX9PLUS-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p0) :: (load (s8) from unknown-address + 17) - ; GFX9PLUS-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[ZEXTLOAD12]] - ; GFX9PLUS-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p0) :: (load (s8) from unknown-address + 18) - ; GFX9PLUS-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD17]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p0) :: (load (s8) from unknown-address + 19) - ; GFX9PLUS-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD14]] - ; GFX9PLUS-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[OR15]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[OR14]] - ; GFX9PLUS-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR16]](s32) - ; GFX9PLUS-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p0) :: (load (s8) from unknown-address + 20) - ; GFX9PLUS-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p0) :: (load (s8) from unknown-address + 21) - ; GFX9PLUS-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[ZEXTLOAD15]] - ; GFX9PLUS-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p0) :: (load (s8) from unknown-address + 22) - ; GFX9PLUS-NEXT: 
[[PTR_ADD22:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD21]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p0) :: (load (s8) from unknown-address + 23) - ; GFX9PLUS-NEXT: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR18:%[0-9]+]]:_(s32) = G_OR [[SHL18]], [[ZEXTLOAD17]] - ; GFX9PLUS-NEXT: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[OR18]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR19:%[0-9]+]]:_(s32) = G_OR [[SHL19]], [[OR17]] - ; GFX9PLUS-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[OR19]](s32) - ; GFX9PLUS-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX9PLUS-NEXT: [[SHL20:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT2]], [[COPY2]](s32) - ; GFX9PLUS-NEXT: [[OR20:%[0-9]+]]:_(s64) = G_OR [[SHL20]], [[ZEXT2]] - ; GFX9PLUS-NEXT: [[PTR_ADD23:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C6]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD18:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD23]](p0) :: (load (s8) from unknown-address + 24) - ; GFX9PLUS-NEXT: [[PTR_ADD24:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD19:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD24]](p0) :: (load (s8) from unknown-address + 25) - ; GFX9PLUS-NEXT: [[SHL21:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD19]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR21:%[0-9]+]]:_(s32) = G_OR [[SHL21]], [[ZEXTLOAD18]] - ; GFX9PLUS-NEXT: [[PTR_ADD25:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD20:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD25]](p0) :: (load (s8) from unknown-address + 26) - ; GFX9PLUS-NEXT: [[PTR_ADD26:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD25]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD26]](p0) :: (load (s8) from unknown-address + 27) - ; GFX9PLUS-NEXT: [[SHL22:%[0-9]+]]:_(s32) = G_SHL [[LOAD6]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR22:%[0-9]+]]:_(s32) = G_OR [[SHL22]], [[ZEXTLOAD20]] - ; GFX9PLUS-NEXT: [[SHL23:%[0-9]+]]:_(s32) = G_SHL [[OR22]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR23:%[0-9]+]]:_(s32) 
= G_OR [[SHL23]], [[OR21]] - ; GFX9PLUS-NEXT: [[ZEXT3:%[0-9]+]]:_(s64) = G_ZEXT [[OR23]](s32) - ; GFX9PLUS-NEXT: [[PTR_ADD27:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD21:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD27]](p0) :: (load (s8) from unknown-address + 28) - ; GFX9PLUS-NEXT: [[PTR_ADD28:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD27]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD22:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD28]](p0) :: (load (s8) from unknown-address + 29) - ; GFX9PLUS-NEXT: [[SHL24:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD22]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR24:%[0-9]+]]:_(s32) = G_OR [[SHL24]], [[ZEXTLOAD21]] - ; GFX9PLUS-NEXT: [[PTR_ADD29:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD27]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD23:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD29]](p0) :: (load (s8) from unknown-address + 30) - ; GFX9PLUS-NEXT: [[PTR_ADD30:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD29]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD30]](p0) :: (load (s8) from unknown-address + 31) - ; GFX9PLUS-NEXT: [[SHL25:%[0-9]+]]:_(s32) = G_SHL [[LOAD7]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR25:%[0-9]+]]:_(s32) = G_OR [[SHL25]], [[ZEXTLOAD23]] - ; GFX9PLUS-NEXT: [[SHL26:%[0-9]+]]:_(s32) = G_SHL [[OR25]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR26:%[0-9]+]]:_(s32) = G_OR [[SHL26]], [[OR24]] - ; GFX9PLUS-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[OR26]](s32) - ; GFX9PLUS-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX9PLUS-NEXT: [[SHL27:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT3]], [[COPY3]](s32) - ; GFX9PLUS-NEXT: [[OR27:%[0-9]+]]:_(s64) = G_OR [[SHL27]], [[ZEXT3]] - ; GFX9PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR20]](s64), [[OR27]](s64) - ; GFX9PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s64>), [[BUILD_VECTOR1]](<2 x s64>) + ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[PTR_ADD]](p0) :: (load (<2 x s64>) from 
unknown-address + 16, align 1) + ; GFX9PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>) ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<4 x s64>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v4s64_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1) + ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = 
G_ZEXT [[OR2]](s32) - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX11PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX11PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX11PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = 
G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX11PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] - ; GFX11PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX11PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX11PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] - ; GFX11PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] - ; GFX11PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) - ; GFX11PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX11PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX11PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] - ; GFX11PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX11PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX11PLUS-NEXT: 
[[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] - ; GFX11PLUS-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] - ; GFX11PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) - ; GFX11PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX11PLUS-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX11PLUS-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64) - ; GFX11PLUS-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; GFX11PLUS-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p0) :: (load (s8) from unknown-address + 16) - ; GFX11PLUS-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p0) :: (load (s8) from unknown-address + 17) - ; GFX11PLUS-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[ZEXTLOAD12]] - ; GFX11PLUS-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p0) :: (load (s8) from unknown-address + 18) - ; GFX11PLUS-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD17]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p0) :: (load (s8) from unknown-address + 19) - ; GFX11PLUS-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD14]] - ; GFX11PLUS-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[OR15]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR 
[[SHL16]], [[OR14]] - ; GFX11PLUS-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR16]](s32) - ; GFX11PLUS-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p0) :: (load (s8) from unknown-address + 20) - ; GFX11PLUS-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p0) :: (load (s8) from unknown-address + 21) - ; GFX11PLUS-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[ZEXTLOAD15]] - ; GFX11PLUS-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p0) :: (load (s8) from unknown-address + 22) - ; GFX11PLUS-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD21]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p0) :: (load (s8) from unknown-address + 23) - ; GFX11PLUS-NEXT: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR18:%[0-9]+]]:_(s32) = G_OR [[SHL18]], [[ZEXTLOAD17]] - ; GFX11PLUS-NEXT: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[OR18]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR19:%[0-9]+]]:_(s32) = G_OR [[SHL19]], [[OR17]] - ; GFX11PLUS-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[OR19]](s32) - ; GFX11PLUS-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX11PLUS-NEXT: [[SHL20:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT2]], [[COPY2]](s32) - ; GFX11PLUS-NEXT: [[OR20:%[0-9]+]]:_(s64) = G_OR [[SHL20]], [[ZEXT2]] - ; GFX11PLUS-NEXT: [[PTR_ADD23:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C6]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD18:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD23]](p0) :: (load (s8) from unknown-address + 24) - ; GFX11PLUS-NEXT: [[PTR_ADD24:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C]](s64) - ; GFX11PLUS-NEXT: 
[[ZEXTLOAD19:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD24]](p0) :: (load (s8) from unknown-address + 25) - ; GFX11PLUS-NEXT: [[SHL21:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD19]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR21:%[0-9]+]]:_(s32) = G_OR [[SHL21]], [[ZEXTLOAD18]] - ; GFX11PLUS-NEXT: [[PTR_ADD25:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD20:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD25]](p0) :: (load (s8) from unknown-address + 26) - ; GFX11PLUS-NEXT: [[PTR_ADD26:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD25]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD26]](p0) :: (load (s8) from unknown-address + 27) - ; GFX11PLUS-NEXT: [[SHL22:%[0-9]+]]:_(s32) = G_SHL [[LOAD6]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR22:%[0-9]+]]:_(s32) = G_OR [[SHL22]], [[ZEXTLOAD20]] - ; GFX11PLUS-NEXT: [[SHL23:%[0-9]+]]:_(s32) = G_SHL [[OR22]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR23:%[0-9]+]]:_(s32) = G_OR [[SHL23]], [[OR21]] - ; GFX11PLUS-NEXT: [[ZEXT3:%[0-9]+]]:_(s64) = G_ZEXT [[OR23]](s32) - ; GFX11PLUS-NEXT: [[PTR_ADD27:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD21:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD27]](p0) :: (load (s8) from unknown-address + 28) - ; GFX11PLUS-NEXT: [[PTR_ADD28:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD27]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD22:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD28]](p0) :: (load (s8) from unknown-address + 29) - ; GFX11PLUS-NEXT: [[SHL24:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD22]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR24:%[0-9]+]]:_(s32) = G_OR [[SHL24]], [[ZEXTLOAD21]] - ; GFX11PLUS-NEXT: [[PTR_ADD29:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD27]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD23:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD29]](p0) :: (load (s8) from unknown-address + 30) - ; GFX11PLUS-NEXT: [[PTR_ADD30:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD29]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD30]](p0) :: (load (s8) from 
unknown-address + 31) - ; GFX11PLUS-NEXT: [[SHL25:%[0-9]+]]:_(s32) = G_SHL [[LOAD7]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR25:%[0-9]+]]:_(s32) = G_OR [[SHL25]], [[ZEXTLOAD23]] - ; GFX11PLUS-NEXT: [[SHL26:%[0-9]+]]:_(s32) = G_SHL [[OR25]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR26:%[0-9]+]]:_(s32) = G_OR [[SHL26]], [[OR24]] - ; GFX11PLUS-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[OR26]](s32) - ; GFX11PLUS-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX11PLUS-NEXT: [[SHL27:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT3]], [[COPY3]](s32) - ; GFX11PLUS-NEXT: [[OR27:%[0-9]+]]:_(s64) = G_OR [[SHL27]], [[ZEXT3]] - ; GFX11PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR20]](s64), [[OR27]](s64) - ; GFX11PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s64>), [[BUILD_VECTOR1]](<2 x s64>) + ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[PTR_ADD]](p0) :: (load (<2 x s64>) from unknown-address + 16, align 1) + ; GFX11PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>) ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<4 x s64>) ; ; GFX12-LABEL: name: test_load_flat_v4s64_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1) + ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; 
GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], 
[[ZEXTLOAD5]] - ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX12-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] - ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] - ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32) - ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] - ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) - ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], 
[[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] - ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] - ; GFX12-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32) - ; GFX12-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] - ; GFX12-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) - ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX12-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX12-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64) - ; GFX12-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; GFX12-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64) - ; GFX12-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p0) :: (load (s8) from unknown-address + 16) - ; GFX12-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p0) :: (load (s8) from unknown-address + 17) - ; GFX12-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32) - ; GFX12-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[ZEXTLOAD12]] - ; GFX12-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], 
[[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p0) :: (load (s8) from unknown-address + 18) - ; GFX12-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD17]], [[C]](s64) - ; GFX12-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p0) :: (load (s8) from unknown-address + 19) - ; GFX12-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD14]] - ; GFX12-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[OR15]], [[C3]](s32) - ; GFX12-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[OR14]] - ; GFX12-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR16]](s32) - ; GFX12-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p0) :: (load (s8) from unknown-address + 20) - ; GFX12-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p0) :: (load (s8) from unknown-address + 21) - ; GFX12-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32) - ; GFX12-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[ZEXTLOAD15]] - ; GFX12-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p0) :: (load (s8) from unknown-address + 22) - ; GFX12-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD21]], [[C]](s64) - ; GFX12-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p0) :: (load (s8) from unknown-address + 23) - ; GFX12-NEXT: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) - ; GFX12-NEXT: [[OR18:%[0-9]+]]:_(s32) = G_OR [[SHL18]], [[ZEXTLOAD17]] - ; GFX12-NEXT: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[OR18]], [[C3]](s32) - ; GFX12-NEXT: [[OR19:%[0-9]+]]:_(s32) = G_OR [[SHL19]], [[OR17]] - ; GFX12-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[OR19]](s32) - ; GFX12-NEXT: 
[[COPY2:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX12-NEXT: [[SHL20:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT2]], [[COPY2]](s32) - ; GFX12-NEXT: [[OR20:%[0-9]+]]:_(s64) = G_OR [[SHL20]], [[ZEXT2]] - ; GFX12-NEXT: [[PTR_ADD23:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C6]](s64) - ; GFX12-NEXT: [[ZEXTLOAD18:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD23]](p0) :: (load (s8) from unknown-address + 24) - ; GFX12-NEXT: [[PTR_ADD24:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD19:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD24]](p0) :: (load (s8) from unknown-address + 25) - ; GFX12-NEXT: [[SHL21:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD19]], [[C1]](s32) - ; GFX12-NEXT: [[OR21:%[0-9]+]]:_(s32) = G_OR [[SHL21]], [[ZEXTLOAD18]] - ; GFX12-NEXT: [[PTR_ADD25:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD20:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD25]](p0) :: (load (s8) from unknown-address + 26) - ; GFX12-NEXT: [[PTR_ADD26:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD25]], [[C]](s64) - ; GFX12-NEXT: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD26]](p0) :: (load (s8) from unknown-address + 27) - ; GFX12-NEXT: [[SHL22:%[0-9]+]]:_(s32) = G_SHL [[LOAD6]], [[C1]](s32) - ; GFX12-NEXT: [[OR22:%[0-9]+]]:_(s32) = G_OR [[SHL22]], [[ZEXTLOAD20]] - ; GFX12-NEXT: [[SHL23:%[0-9]+]]:_(s32) = G_SHL [[OR22]], [[C3]](s32) - ; GFX12-NEXT: [[OR23:%[0-9]+]]:_(s32) = G_OR [[SHL23]], [[OR21]] - ; GFX12-NEXT: [[ZEXT3:%[0-9]+]]:_(s64) = G_ZEXT [[OR23]](s32) - ; GFX12-NEXT: [[PTR_ADD27:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD21:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD27]](p0) :: (load (s8) from unknown-address + 28) - ; GFX12-NEXT: [[PTR_ADD28:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD27]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD22:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD28]](p0) :: (load (s8) from unknown-address + 29) - ; GFX12-NEXT: [[SHL24:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD22]], [[C1]](s32) - ; GFX12-NEXT: [[OR24:%[0-9]+]]:_(s32) = 
G_OR [[SHL24]], [[ZEXTLOAD21]] - ; GFX12-NEXT: [[PTR_ADD29:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD27]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD23:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD29]](p0) :: (load (s8) from unknown-address + 30) - ; GFX12-NEXT: [[PTR_ADD30:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD29]], [[C]](s64) - ; GFX12-NEXT: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD30]](p0) :: (load (s8) from unknown-address + 31) - ; GFX12-NEXT: [[SHL25:%[0-9]+]]:_(s32) = G_SHL [[LOAD7]], [[C1]](s32) - ; GFX12-NEXT: [[OR25:%[0-9]+]]:_(s32) = G_OR [[SHL25]], [[ZEXTLOAD23]] - ; GFX12-NEXT: [[SHL26:%[0-9]+]]:_(s32) = G_SHL [[OR25]], [[C3]](s32) - ; GFX12-NEXT: [[OR26:%[0-9]+]]:_(s32) = G_OR [[SHL26]], [[OR24]] - ; GFX12-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[OR26]](s32) - ; GFX12-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX12-NEXT: [[SHL27:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT3]], [[COPY3]](s32) - ; GFX12-NEXT: [[OR27:%[0-9]+]]:_(s64) = G_OR [[SHL27]], [[ZEXT3]] - ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR20]](s64), [[OR27]](s64) - ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s64>), [[BUILD_VECTOR1]](<2 x s64>) + ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[PTR_ADD]](p0) :: (load (<2 x s64>) from unknown-address + 16, align 1) + ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>) ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<4 x s64>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v4s64_align1 @@ -14762,210 +12386,24 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], 
[[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD 
[[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX9PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX9PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX9PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 - ; GFX9PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX9PLUS-NEXT: 
[[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX9PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX9PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX9PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX9PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX9PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) - ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 1) + ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v2p1_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: 
(load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: 
[[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX11PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX11PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX11PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX11PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX11PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX11PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX11PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 - ; GFX11PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX11PLUS-NEXT: 
[[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX11PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX11PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX11PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX11PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX11PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) - ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 1) + ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; ; GFX12-LABEL: name: test_load_flat_v2p1_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from 
unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = 
G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64) - ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX12-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 - ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], 
[[C1]](s32) - ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) - ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 1) + ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2p1_align1 @@ -15422,124 +12860,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; 
GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR 
[[SHL4]], [[ZEXTLOAD5]] - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9PLUS-NEXT: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) - ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p0) :: (load (<2 x p3>), align 1) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v2p3_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: 
[[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX11PLUS-NEXT: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) - ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p0) :: (load (<2 x p3>), align 1) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; GFX12-LABEL: name: test_load_flat_v2p3_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; 
GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = 
G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX12-NEXT: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p0) :: (load (<2 x p3>), align 1) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2p3_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir index 741f878c86f8b6..6d93112aae1a06 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir @@ -636,27 +636,15 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: 
[[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) + ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX10-LABEL: name: test_load_private_s16_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) + ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX11-LABEL: name: test_load_private_s16_align1 ; GFX11: liveins: $vgpr0 @@ -702,15 +690,27 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address 
+ 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[OR]](s32) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s16_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR]](s32) %0:_(p5) = COPY $vgpr0 %1:_(s16) = G_LOAD %0 :: (load (s16), align 1, addrspace 5) %2:_(s32) = G_ANYEXT %1 @@ -853,27 +853,15 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], 
[[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX10-LABEL: name: test_load_private_s32_align2 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX11-LABEL: name: test_load_private_s32_align2 ; GFX11: liveins: $vgpr0 @@ -919,15 +907,27 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; 
UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[OR]](s32) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s32_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR]](s32) %0:_(p5) = COPY $vgpr0 %1:_(s32) = G_LOAD %0 :: (load (s32), align 2, addrspace 5) $vgpr0 = COPY %1 @@ -1012,47 +1012,15 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; 
GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: $vgpr0 = COPY [[OR2]](s32) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX10-LABEL: name: test_load_private_s32_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], 
[[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: $vgpr0 = COPY [[OR2]](s32) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX11-LABEL: name: test_load_private_s32_align1 ; GFX11: liveins: $vgpr0 @@ -1118,15 +1086,47 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: 
[[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[OR2]](s32) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s32_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; 
UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR2]](s32) %0:_(p5) = COPY $vgpr0 %1:_(s32) = G_LOAD %0 :: (load (s32), align 1, addrspace 5) $vgpr0 = COPY %1 @@ -1529,39 +1529,27 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: 
[[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]] - ; GFX9-NEXT: $vgpr0 = COPY [[OR1]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[OR]](s32) ; ; GFX10-LABEL: name: test_load_private_s24_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]] - ; GFX10-NEXT: $vgpr0 = COPY [[OR1]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[OR]](s32) ; ; GFX11-LABEL: name: test_load_private_s24_align1 ; GFX11: liveins: $vgpr0 @@ -1631,27 +1619,39 @@ body: 
| ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[OR]](s32) + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]] + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[OR1]](s32) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s24_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY 
$vgpr0 - ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR]](s32) + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]] + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR1]](s32) %0:_(p5) = COPY $vgpr0 %1:_(s24) = G_LOAD %0 :: (load (s24), align 1, addrspace 5) %2:_(s32) = G_ANYEXT %1 @@ -2147,42 +2147,22 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: 
(load (s16), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) ; ; GFX10-LABEL: name: test_load_private_s64_align2 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = 
G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) ; ; GFX11-LABEL: name: test_load_private_s64_align2 @@ -2245,15 +2225,51 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p5) :: (load (s64), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; 
UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[OR2]](s64) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s64_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p5) :: (load (s64), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; 
UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[OR2]](s64) %0:_(p5) = COPY $vgpr0 %1:_(s64) = G_LOAD %0 :: (load (s64), align 2, addrspace 5) $vgpr0_vgpr1 = COPY %1 @@ -2386,78 +2402,22 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 
(s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = 
G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) + ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) ; ; GFX10-LABEL: name: test_load_private_s64_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, 
addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load 
(s32) from unknown-address + 4, align 1, addrspace 5) + ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) ; ; GFX11-LABEL: name: test_load_private_s64_align1 @@ -2556,15 +2516,87 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p5) :: (load (s64), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], 
[[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) + ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[OR6]](s64) ; ; UNALIGNED_GFX12-LABEL: name: 
test_load_private_s64_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p5) :: (load (s64), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; 
UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) + ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[OR6]](s64) %0:_(p5) = COPY $vgpr0 %1:_(s64) = G_LOAD %0 :: (load (s64), align 1, addrspace 5) $vgpr0_vgpr1 = COPY %1 @@ -2742,53 +2774,14 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: 
[[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD 
[[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; 
GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; @@ -2796,53 +2789,14 @@ body: | ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], 
[[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: 
[[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; @@ -2974,16 +2928,108 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) + ; UNALIGNED_GFX11-NEXT: 
[[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, 
addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, 
addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s96_align16 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD 
[[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; 
UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) %0:_(p5) = COPY $vgpr0 %1:_(s96) = G_LOAD %0 :: (load (s96), align 1, addrspace 5) @@ -3381,28 +3427,14 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: 
[[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, 
addrspace 5) + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 2, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; @@ -3410,28 +3442,14 @@ body: | ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = 
G_CONSTANT i32 8 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 2, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; @@ -3513,16 +3531,58 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: 
[[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; ; UNALIGNED_GFX12-LABEL: name: 
test_load_private_s96_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = 
G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) %0:_(p5) = COPY $vgpr0 %1:_(s96) = G_LOAD %0 :: (load (s96), align 2, addrspace 5) @@ -3701,53 +3761,14 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: 
[[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; 
GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; @@ -3755,53 +3776,14 @@ body: | ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = 
G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: 
[[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD 
[[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; @@ -3933,16 +3915,108 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; 
UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: 
[[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s96_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; 
UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR 
[[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR 
[[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) %0:_(p5) = COPY $vgpr0 %1:_(s96) = G_LOAD %0 :: (load (s96), align 1, addrspace 5) @@ -4166,68 +4240,17 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL 
[[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX9-NEXT: 
[[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX9-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX9-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX9-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX9-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX9-NEXT: 
[[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX9-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; @@ -4235,68 +4258,17 @@ body: | ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = 
G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; 
GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX10-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX10-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX10-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX10-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX10-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD 
[[PTR_ADD11]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX10-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX10-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX10-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; @@ -4458,16 +4430,138 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 
5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL 
[[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL 
[[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX11-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; UNALIGNED_GFX11-NEXT: 
$vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s128_align16 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: 
[[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: 
[[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX12-NEXT: 
[[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) %0:_(p5) = COPY $vgpr0 %1:_(s128) = G_LOAD %0 :: (load (s128), align 1, addrspace 5) @@ -4928,35 +5022,17 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = 
G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 2, addrspace 5) + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD 
[[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 2, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; @@ -4964,35 +5040,17 @@ body: | ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) 
:: (load (s16) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 2, addrspace 5) + ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 2, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x 
s32>) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; @@ -5088,16 +5146,72 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from 
unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32) + ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s128_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: 
[[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD 
[[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32) + ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) %0:_(p5) = COPY $vgpr0 %1:_(s128) = G_LOAD %0 :: (load (s128), align 2, addrspace 5) @@ -5321,68 +5435,17 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) 
:: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD 
[[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX9-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX9-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX9-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], 
[[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX9-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX9-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX9-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; @@ -5390,68 +5453,17 @@ body: | ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load 
(s32) from unknown-address + 4, align 1, addrspace 5) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD 
[[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX10-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX10-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD 
[[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX10-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX10-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX10-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX10-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX10-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX10-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; @@ -5613,16 +5625,138 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: 
[[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; 
UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) 
+ ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX11-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR 
[[SHL11]], [[OR9]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s128_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR 
[[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], 
[[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, 
addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) %0:_(p5) = COPY $vgpr0 %1:_(s128) = G_LOAD %0 :: (load (s128), align 1, addrspace 5) @@ -5932,42 +6066,22 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from 
unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[MV]](p1) ; ; GFX10-LABEL: name: test_load_private_p1_align2 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 
6, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[MV]](p1) ; ; GFX11-LABEL: name: test_load_private_p1_align2 @@ -6030,15 +6144,53 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p5) :: (load (p1), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD 
[[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] + ; UNALIGNED_GFX11-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR2]](s64) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_p1_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p5) :: (load (p1), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, 
addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] + ; UNALIGNED_GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR2]](s64) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1) %0:_(p5) = COPY $vgpr0 %1:_(p1) = G_LOAD %0 :: (load (p1), align 2, addrspace 5) $vgpr0_vgpr1 = COPY %1 @@ -6171,78 +6323,22 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from 
unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) 
from unknown-address + 4, align 1, addrspace 5) + ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[MV]](p1) ; ; GFX10-LABEL: name: test_load_private_p1_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: 
[[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) + ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[MV]](p1) ; ; GFX11-LABEL: name: test_load_private_p1_align1 @@ -6341,15 +6437,89 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p5) :: (load (p1), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: 
[[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: 
[[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) + ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] + ; UNALIGNED_GFX11-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR6]](s64) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_p1_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p5) :: (load (p1), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: 
[[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; 
UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) + ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] + ; UNALIGNED_GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR6]](s64) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1) %0:_(p5) = COPY $vgpr0 %1:_(p1) = G_LOAD %0 :: (load (p1), align 1, addrspace 5) $vgpr0_vgpr1 = COPY %1 @@ -6494,29 +6664,15 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: 
[[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR]](s32) - ; GFX9-NEXT: $vgpr0 = COPY [[INTTOPTR]](p3) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 2, addrspace 5) + ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](p3) ; ; GFX10-LABEL: name: test_load_private_p3_align2 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR]](s32) - ; GFX10-NEXT: $vgpr0 = COPY [[INTTOPTR]](p3) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 2, addrspace 5) + ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](p3) ; ; GFX11-LABEL: name: test_load_private_p3_align2 ; GFX11: liveins: $vgpr0 @@ -6564,15 +6720,29 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](p3) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: 
[[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR]](s32) + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[INTTOPTR]](p3) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_p3_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p3) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[INTTOPTR]](p3) %0:_(p5) = COPY $vgpr0 %1:_(p3) = G_LOAD %0 :: (load (p3), align 2, addrspace 5) $vgpr0 = COPY %1 @@ -6660,49 +6830,15 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, 
addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) - ; GFX9-NEXT: $vgpr0 = COPY [[INTTOPTR]](p3) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 1, addrspace 5) + ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](p3) ; ; GFX10-LABEL: name: test_load_private_p3_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 
2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) - ; GFX10-NEXT: $vgpr0 = COPY [[INTTOPTR]](p3) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 1, addrspace 5) + ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](p3) ; ; GFX11-LABEL: name: test_load_private_p3_align1 ; GFX11: liveins: $vgpr0 @@ -6770,15 +6906,49 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](p3) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], 
[[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[INTTOPTR]](p3) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_p3_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p3) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], 
[[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[INTTOPTR]](p3) %0:_(p5) = COPY $vgpr0 %1:_(p3) = G_LOAD %0 :: (load (p3), align 1, addrspace 5) $vgpr0 = COPY %1 @@ -6923,29 +7093,15 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) - ; GFX9-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p5) :: (load (p5), align 2, 
addrspace 5) + ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](p5) ; ; GFX10-LABEL: name: test_load_private_p5_align2 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) - ; GFX10-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p5) :: (load (p5), align 2, addrspace 5) + ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](p5) ; ; GFX11-LABEL: name: test_load_private_p5_align2 ; GFX11: liveins: $vgpr0 @@ -6993,15 +7149,29 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p5) :: (load (p5), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](p5) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], 
[[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_p5_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p5) :: (load (p5), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p5) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) %0:_(p5) = COPY $vgpr0 %1:_(p5) = G_LOAD %0 :: (load (p5), align 2, addrspace 5) $vgpr0 = COPY %1 @@ -7089,49 +7259,15 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR 
[[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) - ; GFX9-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p5) :: (load (p5), align 1, addrspace 5) + ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](p5) ; ; GFX10-LABEL: name: test_load_private_p5_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from 
unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) - ; GFX10-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p5) :: (load (p5), align 1, addrspace 5) + ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](p5) ; ; GFX11-LABEL: name: test_load_private_p5_align1 ; GFX11: liveins: $vgpr0 @@ -7199,15 +7335,49 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p5) :: (load (p5), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](p5) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; 
UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_p5_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p5) :: (load (p5), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p5) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; 
UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) %0:_(p5) = COPY $vgpr0 %1:_(p5) = G_LOAD %0 :: (load (p5), align 1, addrspace 5) $vgpr0 = COPY %1 @@ -7357,30 +7527,20 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[LSHR]](s32) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR 
[[LOAD]], [[C]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LSHR]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; ; GFX10-LABEL: name: test_load_private_v2s8_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[LSHR]](s32) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C]](s32) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LSHR]](s32) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; ; GFX11-LABEL: name: test_load_private_v2s8_align1 @@ -7437,20 +7597,30 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; UNALIGNED_GFX11-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C]](s32) - ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), 
[[LSHR]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[LSHR]](s32) ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v2s8_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; UNALIGNED_GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C]](s32) - ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LSHR]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: 
[[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[LSHR]](s32) ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(p5) = COPY $vgpr0 %1:_(<2 x s8>) = G_LOAD %0 :: (load (<2 x s8>), align 1, addrspace 5) @@ -7938,81 +8108,71 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]] - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C1]](s32) - ; GFX9-NEXT: 
[[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C3]](s32) + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C2]](s32) + ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]] + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]] - ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C5]](s16) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]] + ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]] + ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C4]](s16) + ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL1]] ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; GFX9-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]] + ; GFX9-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]] ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32) - ; GFX9-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]] - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C5]](s16) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL3]] - ; GFX9-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) - ; GFX9-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] - ; GFX9-NEXT: $vgpr0 = COPY [[OR4]](s32) + 
; GFX9-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] + ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) + ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL2]] + ; GFX9-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32) + ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]] + ; GFX9-NEXT: $vgpr0 = COPY [[OR3]](s32) ; ; GFX10-LABEL: name: test_load_private_v3s8_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) - ; 
GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]] - ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C1]](s32) - ; GFX10-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C3]](s32) + ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C2]](s32) + ; GFX10-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; GFX10-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]] + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; GFX10-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX10-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]] - ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C5]](s16) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]] + ; GFX10-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]] + ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C4]](s16) + ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL1]] ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; GFX10-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]] + ; GFX10-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]] ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32) - ; GFX10-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]] - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C5]](s16) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL3]] - ; GFX10-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) - ; GFX10-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) - ; 
GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] - ; GFX10-NEXT: $vgpr0 = COPY [[OR4]](s32) + ; GFX10-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] + ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) + ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL2]] + ; GFX10-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX10-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32) + ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]] + ; GFX10-NEXT: $vgpr0 = COPY [[OR3]](s32) ; ; GFX11-LABEL: name: test_load_private_v3s8_align1 ; GFX11: liveins: $vgpr0 @@ -8168,71 +8328,81 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - 
; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; UNALIGNED_GFX11-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C2]](s32) - ; UNALIGNED_GFX11-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C3]](s32) ; UNALIGNED_GFX11-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; UNALIGNED_GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; UNALIGNED_GFX11-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] + ; UNALIGNED_GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; UNALIGNED_GFX11-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]] ; UNALIGNED_GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; UNALIGNED_GFX11-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]] - ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 - ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C4]](s16) - ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL1]] + ; UNALIGNED_GFX11-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]] + ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C5]](s16) + ; 
UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]] ; UNALIGNED_GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; UNALIGNED_GFX11-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]] + ; UNALIGNED_GFX11-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]] ; UNALIGNED_GFX11-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32) - ; UNALIGNED_GFX11-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] - ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) - ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL2]] - ; UNALIGNED_GFX11-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) - ; UNALIGNED_GFX11-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) - ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32) - ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]] - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[OR3]](s32) + ; UNALIGNED_GFX11-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]] + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C5]](s16) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL3]] + ; UNALIGNED_GFX11-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; UNALIGNED_GFX11-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[OR4]](s32) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v3s8_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) 
:: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; UNALIGNED_GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C2]](s32) - ; UNALIGNED_GFX12-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C3]](s32) ; UNALIGNED_GFX12-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; UNALIGNED_GFX12-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] + ; 
UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; UNALIGNED_GFX12-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]] ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; UNALIGNED_GFX12-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]] - ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 - ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C4]](s16) - ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL1]] + ; UNALIGNED_GFX12-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]] + ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C5]](s16) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]] ; UNALIGNED_GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; UNALIGNED_GFX12-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]] + ; UNALIGNED_GFX12-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]] ; UNALIGNED_GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32) - ; UNALIGNED_GFX12-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] - ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) - ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL2]] - ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) - ; UNALIGNED_GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) - ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32) - ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]] - ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR3]](s32) + ; UNALIGNED_GFX12-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]] + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C5]](s16) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND2]], 
[[SHL3]] + ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; UNALIGNED_GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR4]](s32) %0:_(p5) = COPY $vgpr0 %1:_(<3 x s8>) = G_LOAD %0 :: (load (<3 x s8>), align 1, addrspace 5) %2:_(s24) = G_BITCAST %1 @@ -8658,136 +8828,34 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = 
G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9-NEXT: 
[[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX9-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX9-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX9-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX9-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX9-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX9-NEXT: 
[[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; GFX10-LABEL: name: test_load_private_v16s8_align16 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: 
[[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD 
[[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX10-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX10-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX10-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX10-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX10-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from 
unknown-address + 14, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX10-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX10-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX10-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; GFX11-LABEL: name: test_load_private_v16s8_align16 @@ -8944,15 +9012,137 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], 
[[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: 
[[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX11-NEXT: 
[[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX11-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v16s8_align16 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: 
[[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load 
(s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: 
(load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; 
UNALIGNED_GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) %0:_(p5) = COPY $vgpr0 %1:_(<16 x s8>) = G_LOAD %0 :: (load (<16 x s8>), align 1, addrspace 5) %2:_(<4 x s32>) = G_BITCAST %1 @@ -9107,27 +9297,15 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 2, addrspace 5) + ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) ; ; GFX10-LABEL: name: test_load_private_v2s16_align2 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX10-NEXT: 
[[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX10-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 2, addrspace 5) + ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) ; ; GFX11-LABEL: name: test_load_private_v2s16_align2 ; GFX11: liveins: $vgpr0 @@ -9173,15 +9351,27 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v2s16_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; 
UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:_(p5) = COPY $vgpr0 %1:_(<2 x s16>) = G_LOAD %0 :: (load (<2 x s16>), align 2, addrspace 5) $vgpr0 = COPY %1 @@ -9278,47 +9468,15 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: 
[[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 1, addrspace 5) + ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) ; ; GFX10-LABEL: name: test_load_private_v2s16_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX10-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = 
G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 1, addrspace 5) + ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) ; ; GFX11-LABEL: name: test_load_private_v2s16_align1 ; GFX11: liveins: $vgpr0 @@ -9384,15 +9542,47 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) + ; UNALIGNED_GFX11-NEXT: 
[[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v2s16_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) + ; UNALIGNED_GFX12-NEXT: 
[[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:_(p5) = COPY $vgpr0 %1:_(<2 x s16>) = G_LOAD %0 :: (load (<2 x s16>), align 1, addrspace 5) $vgpr0 = COPY %1 @@ -9824,27 +10014,26 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 2, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[LOAD]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = 
G_UNMERGE_VALUES [[DEF]](<4 x s16>) - ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) - ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32) - ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC3]](s16) ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) @@ -9853,27 +10042,26 @@ body: | ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX10-NEXT: 
[[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 2, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[LOAD]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) + ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX10-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) - ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) - ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32) - ; GFX10-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; GFX10-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) - ; GFX10-NEXT: 
[[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX10-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX10-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; GFX10-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC3]](s16) ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) @@ -10215,41 +10403,26 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; 
GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 4, align 1, addrspace 5) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[LOAD]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 
x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) - ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) - ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32) - ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC3]](s16) ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) @@ -10258,41 +10431,26 @@ body: | ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = 
G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) + ; GFX10-NEXT: 
[[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 4, align 1, addrspace 5) + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[LOAD]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) + ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX10-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) - ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) - ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32) - ; GFX10-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; GFX10-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX10-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX10-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; GFX10-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR 
[[TRUNC]](s16), [[TRUNC3]](s16) ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) @@ -10445,22 +10603,36 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: 
[[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) ; UNALIGNED_GFX11-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; UNALIGNED_GFX11-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) ; UNALIGNED_GFX11-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; UNALIGNED_GFX11-NEXT: 
[[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32) ; UNALIGNED_GFX11-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; UNALIGNED_GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) ; UNALIGNED_GFX11-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) @@ -10474,22 +10646,36 @@ body: | ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; 
UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) ; UNALIGNED_GFX12-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; UNALIGNED_GFX12-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) ; UNALIGNED_GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; 
UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; UNALIGNED_GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32) ; UNALIGNED_GFX12-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; UNALIGNED_GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) ; UNALIGNED_GFX12-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) @@ -10827,44 +11013,22 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 2, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX9-NEXT: 
[[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[PTR_ADD]](p5) :: (load (<2 x s16>) from unknown-address + 4, align 2, addrspace 5) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[LOAD]](<2 x s16>), [[LOAD1]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) ; ; GFX10-LABEL: name: test_load_private_v4s16_align2 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 2, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) 
= G_TRUNC [[LOAD3]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[PTR_ADD]](p5) :: (load (<2 x s16>) from unknown-address + 4, align 2, addrspace 5) + ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[LOAD]](<2 x s16>), [[LOAD1]](<2 x s16>) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) ; ; GFX11-LABEL: name: test_load_private_v4s16_align2 @@ -10929,15 +11093,47 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p5) :: (load (<4 x s16>), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 + ; UNALIGNED_GFX11-NEXT: 
[[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; UNALIGNED_GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v4s16_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p5) :: (load (<4 x s16>), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) + ; 
UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; UNALIGNED_GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) %0:_(p5) = COPY $vgpr0 %1:_(<4 x s16>) = G_LOAD %0 :: (load (<4 x s16>), align 2, addrspace 5) $vgpr0_vgpr1 = COPY %1 @@ -11091,80 +11287,22 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD 
[[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) 
= G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[PTR_ADD]](p5) :: (load (<2 x s16>) from unknown-address + 4, align 1, addrspace 5) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[LOAD]](<2 x s16>), [[LOAD1]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) ; ; GFX10-LABEL: name: test_load_private_v4s16_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX10-NEXT: 
[[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[PTR_ADD]](p5) :: (load (<2 x s16>) from unknown-address + 4, align 1, addrspace 5) + ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[LOAD]](<2 x s16>), [[LOAD1]](<2 x s16>) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) ; ; GFX11-LABEL: name: test_load_private_v4s16_align1 @@ -11265,15 
+11403,83 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p5) :: (load (<4 x s16>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD 
[[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32) + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; UNALIGNED_GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v4s16_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p5) :: (load 
(<4 x s16>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load 
(s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32) + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; UNALIGNED_GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) %0:_(p5) = COPY $vgpr0 %1:_(<4 x s16>) = G_LOAD %0 :: (load (<4 x s16>), align 1, addrspace 5) $vgpr0_vgpr1 = COPY %1 @@ -11582,42 +11788,22 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX9-NEXT: 
[[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; ; GFX10-LABEL: name: test_load_private_v2s32_align2 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD 
[[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; ; GFX11-LABEL: name: test_load_private_v2s32_align2 @@ -11680,15 +11866,43 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p5) :: (load (<2 x s32>), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: 
(load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v2s32_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p5) :: (load (<2 x s32>), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: 
[[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(p5) = COPY $vgpr0 %1:_(<2 x s32>) = G_LOAD %0 :: (load (<2 x s32>), align 2, addrspace 5) $vgpr0_vgpr1 = COPY %1 @@ -11821,78 +12035,22 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; 
GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], 
[[OR3]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; ; GFX10-LABEL: name: test_load_private_v2s32_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: 
[[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; ; GFX11-LABEL: name: test_load_private_v2s32_align1 @@ -11991,15 +12149,79 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<2 x 
s32>) = G_LOAD [[COPY]](p5) :: (load (<2 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = 
G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v2s32_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p5) :: (load (<2 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD 
[[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: 
[[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(p5) = COPY $vgpr0 %1:_(<2 x s32>) = G_LOAD %0 :: (load (<2 x s32>), align 1, addrspace 5) $vgpr0_vgpr1 = COPY %1 @@ -12174,106 +12396,28 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = 
G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: 
[[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) ; ; GFX10-LABEL: name: test_load_private_v3s32_align16 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: 
[[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: 
[[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; 
GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) ; ; GFX11-LABEL: name: test_load_private_v3s32_align16 @@ -12400,15 +12544,107 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](<3 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; 
UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: 
[[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v3s32_align16 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](<3 x s32>) + ; UNALIGNED_GFX12-NEXT: 
[[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, 
addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, 
addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) %0:_(p5) = COPY $vgpr0 %1:_(<3 x s32>) = G_LOAD %0 :: (load (<3 x s32>), align 1, addrspace 5) $vgpr0_vgpr1_vgpr2 = COPY %1 @@ -12764,136 +13000,34 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD 
[[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX9-NEXT: 
[[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX9-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX9-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX9-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX9-NEXT: 
[[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX9-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX9-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; GFX10-LABEL: name: test_load_private_v4s32_align16 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: 
[[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], 
[[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX10-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX10-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX10-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX10-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR 
[[SHL9]], [[ZEXTLOAD9]] - ; GFX10-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX10-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX10-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX10-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; GFX11-LABEL: name: test_load_private_v4s32_align16 @@ -13050,15 +13184,137 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX11-NEXT: 
[[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, 
addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, 
addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX11-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY 
[[BUILD_VECTOR]](<4 x s32>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v4s32_align16 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 
4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], 
[[ZEXTLOAD6]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL 
[[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) %0:_(p5) = COPY $vgpr0 %1:_(<4 x s32>) = G_LOAD %0 :: (load (<4 x s32>), align 1, addrspace 5) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -13493,70 +13749,34 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) 
= G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 2, addrspace 5) + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 2, 
addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; GFX10-LABEL: name: test_load_private_v4s32_align2 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD 
[[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 2, addrspace 5) + ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 2, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; GFX11-LABEL: name: test_load_private_v4s32_align2 @@ -13647,15 +13867,71 @@ body: | ; UNALIGNED_GFX11: 
liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; 
UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v4s32_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: 
[[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD 
[[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) %0:_(p5) = COPY $vgpr0 %1:_(<4 x s32>) = G_LOAD %0 :: (load (<4 x s32>), align 2, addrspace 5) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -13875,136 +14151,34 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; 
GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, 
addrspace 5) - ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX9-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX9-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX9-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX9-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], 
[[ZEXTLOAD11]] - ; GFX9-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX9-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; GFX10-LABEL: name: test_load_private_v4s32_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; 
GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: 
[[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX10-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX10-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX10-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX10-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX10-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], 
[[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX10-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX10-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX10-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; GFX11-LABEL: name: test_load_private_v4s32_align1 @@ -14161,15 +14335,137 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: 
[[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; 
UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; 
UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX11-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v4s32_align1 ; 
UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; 
UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) 
+ ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; 
UNALIGNED_GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) %0:_(p5) = COPY $vgpr0 %1:_(<4 x s32>) = G_LOAD %0 :: (load (<4 x s32>), align 1, addrspace 5) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -15262,68 +15558,17 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) + ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, 
addrspace 5) ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32) - ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: 
[[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX9-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX9-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - 
; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX9-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX9-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX9-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR8]](s32), [[OR11]](s32) + ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD2]](s32), [[LOAD3]](s32) ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; @@ -15331,68 +15576,17 @@ body: | ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) + ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: 
[[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL 
[[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32) - ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX10-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX10-NEXT: 
[[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX10-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX10-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX10-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX10-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX10-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR8]](s32), [[OR11]](s32) + ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD2]](s32), [[LOAD3]](s32) ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; @@ -15552,15 +15746,155 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p5) :: (load (<2 x s64>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD 
[[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = 
G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) + ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) 
= G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX11-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] + ; UNALIGNED_GFX11-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX11-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] + ; UNALIGNED_GFX11-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) + ; UNALIGNED_GFX11-NEXT: 
[[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) + ; UNALIGNED_GFX11-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v2s64_align16 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p5) :: (load (<2 x s64>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], 
[[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) + ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; 
UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] + ; UNALIGNED_GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = 
G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX12-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] + ; UNALIGNED_GFX12-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) + ; UNALIGNED_GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) + ; UNALIGNED_GFX12-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) %0:_(p5) = COPY $vgpr0 %1:_(<2 x s64>) = G_LOAD %0 :: (load (<2 x s64>), align 1, addrspace 5) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -18178,98 +18512,23 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD 
[[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], 
[[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR 
[[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) - ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX9-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX9-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX9-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX9-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX9-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX9-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX9-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p5) :: (load (s8) from unknown-address + 16, addrspace 5) - ; GFX9-NEXT: 
[[PTR_ADD16:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD15]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p5) :: (load (s8) from unknown-address + 17, addrspace 5) - ; GFX9-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32) - ; GFX9-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[ZEXTLOAD12]] - ; GFX9-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p5) :: (load (s8) from unknown-address + 18, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD17]], [[C]](s32) - ; GFX9-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p5) :: (load (s8) from unknown-address + 19, addrspace 5) - ; GFX9-NEXT: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR13:%[0-9]+]]:_(s32) = G_OR [[SHL13]], [[ZEXTLOAD14]] - ; GFX9-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[OR13]], [[C3]](s32) - ; GFX9-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[OR12]] - ; GFX9-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C1]](s32) - ; GFX9-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p5) :: (load (s8) from unknown-address + 20, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD19]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p5) :: (load (s8) from unknown-address + 21, addrspace 5) - ; GFX9-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32) - ; GFX9-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD15]] - ; GFX9-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p5) :: (load (s8) from unknown-address + 22, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD21]], [[C]](s32) - ; GFX9-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p5) :: (load (s8) from 
unknown-address + 23, addrspace 5) - ; GFX9-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) - ; GFX9-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[ZEXTLOAD17]] - ; GFX9-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[OR16]], [[C3]](s32) - ; GFX9-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[OR15]] - ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR11]](s32), [[OR14]](s32), [[OR17]](s32) + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD2]], [[C]](s32) + ; GFX9-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p5) :: (load (s32) from unknown-address + 16, align 1, addrspace 5) + ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD2]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s32) from unknown-address + 20, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD3]](s32), [[LOAD4]](s32), [[LOAD5]](s32) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>) ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) @@ -18280,98 +18539,23 @@ body: | ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = 
G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from 
unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD 
[[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) - ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX10-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX10-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX10-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX10-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX10-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX10-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX10-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX10-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX10-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p5) :: (load (s8) from unknown-address + 16, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD15]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p5) :: (load (s8) from 
unknown-address + 17, addrspace 5) - ; GFX10-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32) - ; GFX10-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[ZEXTLOAD12]] - ; GFX10-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p5) :: (load (s8) from unknown-address + 18, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD17]], [[C]](s32) - ; GFX10-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p5) :: (load (s8) from unknown-address + 19, addrspace 5) - ; GFX10-NEXT: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR13:%[0-9]+]]:_(s32) = G_OR [[SHL13]], [[ZEXTLOAD14]] - ; GFX10-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[OR13]], [[C3]](s32) - ; GFX10-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[OR12]] - ; GFX10-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p5) :: (load (s8) from unknown-address + 20, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD19]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p5) :: (load (s8) from unknown-address + 21, addrspace 5) - ; GFX10-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32) - ; GFX10-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD15]] - ; GFX10-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p5) :: (load (s8) from unknown-address + 22, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD21]], [[C]](s32) - ; GFX10-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p5) :: (load (s8) from unknown-address + 23, addrspace 5) - ; GFX10-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) - ; GFX10-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR 
[[SHL16]], [[ZEXTLOAD17]] - ; GFX10-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[OR16]], [[C3]](s32) - ; GFX10-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[OR15]] - ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR11]](s32), [[OR14]](s32), [[OR17]](s32) + ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD2]], [[C]](s32) + ; GFX10-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p5) :: (load (s32) from unknown-address + 16, align 1, addrspace 5) + ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD2]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s32) from unknown-address + 20, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD3]](s32), [[LOAD4]](s32), [[LOAD5]](s32) ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>) ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) @@ -18616,12 +18800,99 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) - ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; 
UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[PTR_ADD]](p5) :: (load (<3 x s32>) from unknown-address + 12, align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD1]](<3 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, 
addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, 
addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX11-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR 
[[SHL11]], [[OR9]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p5) :: (load (s8) from unknown-address + 16, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD15]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p5) :: (load (s8) from unknown-address + 17, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[ZEXTLOAD12]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p5) :: (load (s8) from unknown-address + 18, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD17]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p5) :: (load (s8) from unknown-address + 19, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR13:%[0-9]+]]:_(s32) = G_OR [[SHL13]], [[ZEXTLOAD14]] + ; UNALIGNED_GFX11-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[OR13]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[OR12]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p5) :: (load (s8) from unknown-address + 20, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD19]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p5) :: (load (s8) from unknown-address + 21, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32) + ; 
UNALIGNED_GFX11-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD15]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p5) :: (load (s8) from unknown-address + 22, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD21]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p5) :: (load (s8) from unknown-address + 23, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[ZEXTLOAD17]] + ; UNALIGNED_GFX11-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[OR16]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[OR15]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR11]](s32), [[OR14]](s32), [[OR17]](s32) + ; UNALIGNED_GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>) ; UNALIGNED_GFX11-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) ; UNALIGNED_GFX11-NEXT: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) @@ -18631,12 +18902,99 @@ body: | ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) - ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; 
UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[PTR_ADD]](p5) :: (load (<3 x s32>) from unknown-address + 12, align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD1]](<3 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, 
addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, 
addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR 
[[SHL11]], [[OR9]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p5) :: (load (s8) from unknown-address + 16, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD15]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p5) :: (load (s8) from unknown-address + 17, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[ZEXTLOAD12]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p5) :: (load (s8) from unknown-address + 18, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD17]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p5) :: (load (s8) from unknown-address + 19, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR13:%[0-9]+]]:_(s32) = G_OR [[SHL13]], [[ZEXTLOAD14]] + ; UNALIGNED_GFX12-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[OR13]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[OR12]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p5) :: (load (s8) from unknown-address + 20, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD19]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p5) :: (load (s8) from unknown-address + 21, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32) + ; 
UNALIGNED_GFX12-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD15]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p5) :: (load (s8) from unknown-address + 22, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD21]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p5) :: (load (s8) from unknown-address + 23, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[ZEXTLOAD17]] + ; UNALIGNED_GFX12-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[OR16]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[OR15]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR11]](s32), [[OR14]](s32), [[OR17]](s32) + ; UNALIGNED_GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>) ; UNALIGNED_GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) ; UNALIGNED_GFX12-NEXT: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) @@ -18818,49 +19176,23 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: 
[[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 2, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST 
[[BUILD_VECTOR]](<3 x s32>) - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s16) from unknown-address + 16, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX9-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p5) :: (load (s16) from unknown-address + 18, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD4]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C3]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s16) from unknown-address + 20, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s16) from unknown-address + 22, addrspace 5) - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR3]](s32), [[OR4]](s32), [[OR5]](s32) + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX9-NEXT: 
[[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 2, addrspace 5) + ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD2]], [[C]](s32) + ; GFX9-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p5) :: (load (s32) from unknown-address + 16, align 2, addrspace 5) + ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD2]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s32) from unknown-address + 20, align 2, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD3]](s32), [[LOAD4]](s32), [[LOAD5]](s32) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>) ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) @@ -18871,49 +19203,23 @@ body: | ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = 
G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 2, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD 
[[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s16) from unknown-address + 16, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p5) :: (load (s16) from unknown-address + 18, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD4]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C3]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s16) from unknown-address + 20, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s16) from unknown-address + 22, addrspace 5) - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR3]](s32), [[OR4]](s32), [[OR5]](s32) + ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 2, addrspace 5) + ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD2]], [[C]](s32) + ; GFX10-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p5) :: (load (s32) from unknown-address + 16, align 2, addrspace 5) + ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD2]], [[C1]](s32) + ; 
GFX10-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s32) from unknown-address + 20, align 2, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD3]](s32), [[LOAD4]](s32), [[LOAD5]](s32) ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>) ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) @@ -19060,12 +19366,50 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) - ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[PTR_ADD]](p5) :: (load (<3 x s32>) from unknown-address + 12, align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD1]](<3 x s32>) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from 
unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; 
UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s16) from unknown-address + 16, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p5) :: (load (s16) from unknown-address + 18, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD4]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s16) from unknown-address + 20, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s16) from unknown-address + 22, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR3]](s32), [[OR4]](s32), [[OR5]](s32) + ; UNALIGNED_GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>) ; UNALIGNED_GFX11-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) ; UNALIGNED_GFX11-NEXT: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) @@ -19075,12 +19419,50 @@ body: | ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = 
G_BITCAST [[LOAD]](<3 x s32>) - ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[PTR_ADD]](p5) :: (load (<3 x s32>) from unknown-address + 12, align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD1]](<3 x s32>) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], 
[[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s16) from unknown-address + 16, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p5) :: (load (s16) from unknown-address + 18, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD4]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load 
(s16) from unknown-address + 20, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s16) from unknown-address + 22, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR3]](s32), [[OR4]](s32), [[OR5]](s32) + ; UNALIGNED_GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>) ; UNALIGNED_GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) ; UNALIGNED_GFX12-NEXT: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll index ea10547da6ab7f..3fc5d0d4b279eb 100644 --- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @@ -475,8 +475,14 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: scratch_store_short off, v0, s0 offset:4 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: scratch_load_dword v0, off, s0 -; FLATSCR-NEXT: scratch_load_dword v1, off, s0 offset:2 +; FLATSCR-NEXT: scratch_load_ushort v0, off, s0 offset:2 +; FLATSCR-NEXT: scratch_load_ushort v3, off, s0 +; FLATSCR-NEXT: s_waitcnt vmcnt(1) +; FLATSCR-NEXT: v_mov_b32_e32 v1, v0 +; FLATSCR-NEXT: scratch_load_short_d16_hi v1, off, s0 offset:4 +; FLATSCR-NEXT: s_mov_b32 s0, 0x5040100 +; FLATSCR-NEXT: s_waitcnt vmcnt(1) +; FLATSCR-NEXT: v_perm_b32 v0, v0, v3, s0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; FLATSCR-NEXT: s_endpgm @@ -537,8 +543,13 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) 
nocapture readonly %i ; FLATSCR_GFX10-NEXT: scratch_store_short off, v0, s0 offset:4 ; FLATSCR_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; FLATSCR_GFX10-NEXT: s_clause 0x1 -; FLATSCR_GFX10-NEXT: scratch_load_dword v0, off, s0 -; FLATSCR_GFX10-NEXT: scratch_load_dword v1, off, s0 offset:2 +; FLATSCR_GFX10-NEXT: scratch_load_ushort v0, off, s0 offset:2 +; FLATSCR_GFX10-NEXT: scratch_load_ushort v3, off, s0 +; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(1) +; FLATSCR_GFX10-NEXT: v_mov_b32_e32 v1, v0 +; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) +; FLATSCR_GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x5040100 +; FLATSCR_GFX10-NEXT: scratch_load_short_d16_hi v1, off, s0 offset:4 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) ; FLATSCR_GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; FLATSCR_GFX10-NEXT: s_endpgm @@ -561,8 +572,13 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i ; GFX11-NEXT: scratch_store_b16 off, v0, off offset:4 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v0, off, off -; GFX11-NEXT: scratch_load_b32 v1, off, off offset:2 +; GFX11-NEXT: scratch_load_u16 v0, off, off offset:2 +; GFX11-NEXT: scratch_load_u16 v3, off, off +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x5040100 +; GFX11-NEXT: scratch_load_d16_hi_b16 v1, off, off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/flat-address-space.ll b/llvm/test/CodeGen/AMDGPU/flat-address-space.ll index 0ad53083d0ff3f..12593e3760fd3e 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-address-space.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-address-space.ll @@ -123,10 +123,8 @@ define amdgpu_kernel void @zextload_flat_i16(ptr addrspace(1) noalias %out, ptr } ; GCN-LABEL: flat_scratch_unaligned_load: -; GCN: flat_load_{{ubyte|u8}} -; GCN: 
flat_load_{{ubyte|u8}} -; GCN: flat_load_{{ubyte|u8}} -; GCN: flat_load_{{ubyte|u8}} +; GFX9: flat_load_dword +; GFX10PLUS: flat_load_{{dword|b32}} define amdgpu_kernel void @flat_scratch_unaligned_load() { %scratch = alloca i32, addrspace(5) %fptr = addrspacecast ptr addrspace(5) %scratch to ptr @@ -136,10 +134,8 @@ define amdgpu_kernel void @flat_scratch_unaligned_load() { } ; GCN-LABEL: flat_scratch_unaligned_store: -; GCN: flat_store_{{byte|b8}} -; GCN: flat_store_{{byte|b8}} -; GCN: flat_store_{{byte|b8}} -; GCN: flat_store_{{byte|b8}} +; GFX9: flat_store_dword +; GFX10PLUS: flat_store_{{dword|b32}} define amdgpu_kernel void @flat_scratch_unaligned_store() { %scratch = alloca i32, addrspace(5) %fptr = addrspacecast ptr addrspace(5) %scratch to ptr diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll index 1dd18b4228fe5e..9d43efbdf07b1f 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll @@ -16,47 +16,18 @@ define void @issue63986(i64 %0, i64 %idxprom) { ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_mov_b32_e32 v9, s7 ; CHECK-NEXT: v_mov_b32_e32 v8, s6 -; CHECK-NEXT: flat_load_ubyte v10, v[8:9] offset:5 -; CHECK-NEXT: flat_load_ubyte v11, v[8:9] offset:6 -; CHECK-NEXT: flat_load_ubyte v12, v[8:9] offset:7 -; CHECK-NEXT: flat_load_ubyte v13, v[8:9] offset:3 -; CHECK-NEXT: flat_load_ubyte v14, v[8:9] offset:2 -; CHECK-NEXT: flat_load_ubyte v15, v[8:9] offset:1 -; CHECK-NEXT: flat_load_ubyte v16, v[8:9] -; CHECK-NEXT: flat_load_ubyte v17, v[8:9] offset:4 -; CHECK-NEXT: flat_load_ubyte v18, v[8:9] offset:13 -; CHECK-NEXT: flat_load_ubyte v19, v[8:9] offset:14 -; CHECK-NEXT: flat_load_ubyte v20, v[8:9] offset:15 -; CHECK-NEXT: flat_load_ubyte v21, v[8:9] offset:11 -; CHECK-NEXT: flat_load_ubyte v22, v[8:9] offset:10 -; CHECK-NEXT: flat_load_ubyte v23, v[8:9] offset:9 -; CHECK-NEXT: flat_load_ubyte v24, v[8:9] 
offset:8 -; CHECK-NEXT: flat_load_ubyte v25, v[8:9] offset:12 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; CHECK-NEXT: s_add_u32 s4, s4, 1 ; CHECK-NEXT: s_addc_u32 s5, s5, 0 -; CHECK-NEXT: v_add_co_u32_e32 v8, vcc, s6, v6 +; CHECK-NEXT: v_mov_b32_e32 v13, s7 +; CHECK-NEXT: v_add_co_u32_e32 v12, vcc, s6, v6 ; CHECK-NEXT: v_cmp_ge_u64_e64 s[8:9], s[4:5], 2 -; CHECK-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v7, vcc +; CHECK-NEXT: v_addc_co_u32_e32 v13, vcc, v13, v7, vcc ; CHECK-NEXT: s_add_u32 s6, s6, 16 ; CHECK-NEXT: s_addc_u32 s7, s7, 0 ; CHECK-NEXT: s_and_b64 vcc, exec, s[8:9] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[8:9], v13 offset:3 -; CHECK-NEXT: flat_store_byte v[8:9], v14 offset:2 -; CHECK-NEXT: flat_store_byte v[8:9], v15 offset:1 -; CHECK-NEXT: flat_store_byte v[8:9], v16 -; CHECK-NEXT: flat_store_byte v[8:9], v12 offset:7 -; CHECK-NEXT: flat_store_byte v[8:9], v11 offset:6 -; CHECK-NEXT: flat_store_byte v[8:9], v10 offset:5 -; CHECK-NEXT: flat_store_byte v[8:9], v17 offset:4 -; CHECK-NEXT: flat_store_byte v[8:9], v21 offset:11 -; CHECK-NEXT: flat_store_byte v[8:9], v22 offset:10 -; CHECK-NEXT: flat_store_byte v[8:9], v23 offset:9 -; CHECK-NEXT: flat_store_byte v[8:9], v24 offset:8 -; CHECK-NEXT: flat_store_byte v[8:9], v20 offset:15 -; CHECK-NEXT: flat_store_byte v[8:9], v19 offset:14 -; CHECK-NEXT: flat_store_byte v[8:9], v18 offset:13 -; CHECK-NEXT: flat_store_byte v[8:9], v25 offset:12 +; CHECK-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; CHECK-NEXT: s_cbranch_vccz .LBB0_2 ; CHECK-NEXT: ; %bb.3: ; %loop-memcpy-residual-header ; CHECK-NEXT: s_mov_b32 s4, 0 @@ -128,47 +99,18 @@ define void @issue63986(i64 %0, i64 %idxprom) { ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v10, s10 ; CHECK-NEXT: v_mov_b32_e32 v11, s11 -; CHECK-NEXT: flat_load_ubyte v12, v[10:11] offset:5 -; CHECK-NEXT: flat_load_ubyte v13, v[10:11] offset:6 -; CHECK-NEXT: flat_load_ubyte v14, v[10:11] offset:7 -; 
CHECK-NEXT: flat_load_ubyte v15, v[10:11] offset:3 -; CHECK-NEXT: flat_load_ubyte v16, v[10:11] offset:2 -; CHECK-NEXT: flat_load_ubyte v17, v[10:11] offset:1 -; CHECK-NEXT: flat_load_ubyte v18, v[10:11] -; CHECK-NEXT: flat_load_ubyte v19, v[10:11] offset:4 -; CHECK-NEXT: flat_load_ubyte v20, v[10:11] offset:13 -; CHECK-NEXT: flat_load_ubyte v21, v[10:11] offset:14 -; CHECK-NEXT: flat_load_ubyte v22, v[10:11] offset:15 -; CHECK-NEXT: flat_load_ubyte v23, v[10:11] offset:11 -; CHECK-NEXT: flat_load_ubyte v24, v[10:11] offset:10 -; CHECK-NEXT: flat_load_ubyte v25, v[10:11] offset:9 -; CHECK-NEXT: flat_load_ubyte v26, v[10:11] offset:8 -; CHECK-NEXT: flat_load_ubyte v27, v[10:11] offset:12 +; CHECK-NEXT: flat_load_dwordx4 v[10:13], v[10:11] +; CHECK-NEXT: v_mov_b32_e32 v15, s11 ; CHECK-NEXT: s_add_u32 s14, s14, 1 -; CHECK-NEXT: v_add_co_u32_e32 v10, vcc, s10, v2 -; CHECK-NEXT: v_addc_co_u32_e32 v11, vcc, v11, v3, vcc +; CHECK-NEXT: v_add_co_u32_e32 v14, vcc, s10, v2 +; CHECK-NEXT: v_addc_co_u32_e32 v15, vcc, v15, v3, vcc ; CHECK-NEXT: s_addc_u32 s15, s15, 0 ; CHECK-NEXT: s_add_u32 s10, s10, 16 ; CHECK-NEXT: v_cmp_ge_u64_e32 vcc, s[14:15], v[4:5] ; CHECK-NEXT: s_addc_u32 s11, s11, 0 ; CHECK-NEXT: s_or_b64 s[12:13], vcc, s[12:13] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[10:11], v15 offset:3 -; CHECK-NEXT: flat_store_byte v[10:11], v16 offset:2 -; CHECK-NEXT: flat_store_byte v[10:11], v17 offset:1 -; CHECK-NEXT: flat_store_byte v[10:11], v18 -; CHECK-NEXT: flat_store_byte v[10:11], v14 offset:7 -; CHECK-NEXT: flat_store_byte v[10:11], v13 offset:6 -; CHECK-NEXT: flat_store_byte v[10:11], v12 offset:5 -; CHECK-NEXT: flat_store_byte v[10:11], v19 offset:4 -; CHECK-NEXT: flat_store_byte v[10:11], v23 offset:11 -; CHECK-NEXT: flat_store_byte v[10:11], v24 offset:10 -; CHECK-NEXT: flat_store_byte v[10:11], v25 offset:9 -; CHECK-NEXT: flat_store_byte v[10:11], v26 offset:8 -; CHECK-NEXT: flat_store_byte v[10:11], v22 offset:15 -; CHECK-NEXT: 
flat_store_byte v[10:11], v21 offset:14 -; CHECK-NEXT: flat_store_byte v[10:11], v20 offset:13 -; CHECK-NEXT: flat_store_byte v[10:11], v27 offset:12 +; CHECK-NEXT: flat_store_dwordx4 v[14:15], v[10:13] ; CHECK-NEXT: s_andn2_b64 exec, exec, s[12:13] ; CHECK-NEXT: s_cbranch_execnz .LBB0_14 ; CHECK-NEXT: .LBB0_15: ; %Flow20 @@ -251,23 +193,11 @@ define void @issue63986_reduced_expanded(i64 %idxprom) { ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: .LBB1_8: ; %post-loop-memcpy-expansion ; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: v_mov_b32_e32 v3, v2 +; CHECK-NEXT: v_mov_b32_e32 v4, v2 +; CHECK-NEXT: v_mov_b32_e32 v5, v2 ; CHECK-NEXT: s_and_b64 vcc, exec, 0 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:3 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:2 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:1 -; CHECK-NEXT: flat_store_byte v[0:1], v2 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:7 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:6 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:5 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:4 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:11 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:10 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:9 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:8 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:15 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:14 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:13 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:12 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: .LBB1_9: ; %loop-memcpy-expansion2 ; CHECK-NEXT: s_mov_b64 vcc, vcc ; CHECK-NEXT: s_cbranch_vccz .LBB1_9 diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll index 0a76e169e9c385..8c28fac0d839c2 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll @@ -10,108 +10,21 @@ define amdgpu_kernel void @memcpy_p0_p0_minsize(ptr %dest, ptr readonly %src) #0 ; 
CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 -; CHECK-NEXT: flat_load_ubyte v4, v[0:1] -; CHECK-NEXT: flat_load_ubyte v5, v[0:1] offset:1 -; CHECK-NEXT: flat_load_ubyte v6, v[0:1] offset:2 -; CHECK-NEXT: flat_load_ubyte v7, v[0:1] offset:3 -; CHECK-NEXT: flat_load_ubyte v8, v[0:1] offset:4 -; CHECK-NEXT: flat_load_ubyte v9, v[0:1] offset:5 -; CHECK-NEXT: flat_load_ubyte v10, v[0:1] offset:6 -; CHECK-NEXT: flat_load_ubyte v11, v[0:1] offset:7 -; CHECK-NEXT: flat_load_ubyte v12, v[0:1] offset:8 -; CHECK-NEXT: flat_load_ubyte v13, v[0:1] offset:9 -; CHECK-NEXT: flat_load_ubyte v14, v[0:1] offset:10 -; CHECK-NEXT: flat_load_ubyte v15, v[0:1] offset:11 -; CHECK-NEXT: flat_load_ubyte v16, v[0:1] offset:12 -; CHECK-NEXT: flat_load_ubyte v17, v[0:1] offset:13 -; CHECK-NEXT: flat_load_ubyte v18, v[0:1] offset:14 -; CHECK-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-NEXT: v_mov_b32_e32 v12, s3 +; CHECK-NEXT: v_mov_b32_e32 v11, s2 +; CHECK-NEXT: flat_load_ubyte v13, v[11:12] offset:46 +; CHECK-NEXT: flat_load_ushort v14, v[11:12] offset:44 +; CHECK-NEXT: flat_load_dwordx3 v[8:10], v[11:12] offset:32 +; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[11:12] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[11:12] +; CHECK-NEXT: v_mov_b32_e32 v12, s1 +; CHECK-NEXT: v_mov_b32_e32 v11, s0 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[2:3], v4 -; CHECK-NEXT: flat_store_byte v[2:3], v5 offset:1 -; CHECK-NEXT: flat_store_byte v[2:3], v6 offset:2 -; CHECK-NEXT: flat_store_byte v[2:3], v7 offset:3 -; CHECK-NEXT: flat_store_byte v[2:3], v8 offset:4 -; CHECK-NEXT: flat_store_byte v[2:3], v9 offset:5 -; CHECK-NEXT: flat_store_byte v[2:3], v10 offset:6 -; CHECK-NEXT: flat_store_byte v[2:3], v11 offset:7 -; CHECK-NEXT: flat_store_byte v[2:3], v12 offset:8 -; CHECK-NEXT: flat_store_byte v[2:3], v13 
offset:9 -; CHECK-NEXT: flat_store_byte v[2:3], v14 offset:10 -; CHECK-NEXT: flat_store_byte v[2:3], v15 offset:11 -; CHECK-NEXT: flat_store_byte v[2:3], v16 offset:12 -; CHECK-NEXT: flat_store_byte v[2:3], v17 offset:13 -; CHECK-NEXT: flat_store_byte v[2:3], v18 offset:14 -; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:30 -; CHECK-NEXT: flat_load_ubyte v5, v[0:1] offset:29 -; CHECK-NEXT: flat_load_ubyte v6, v[0:1] offset:28 -; CHECK-NEXT: flat_load_ubyte v7, v[0:1] offset:27 -; CHECK-NEXT: flat_load_ubyte v8, v[0:1] offset:26 -; CHECK-NEXT: flat_load_ubyte v9, v[0:1] offset:25 -; CHECK-NEXT: flat_load_ubyte v10, v[0:1] offset:24 -; CHECK-NEXT: flat_load_ubyte v11, v[0:1] offset:23 -; CHECK-NEXT: flat_load_ubyte v12, v[0:1] offset:22 -; CHECK-NEXT: flat_load_ubyte v13, v[0:1] offset:21 -; CHECK-NEXT: flat_load_ubyte v14, v[0:1] offset:20 -; CHECK-NEXT: flat_load_ubyte v15, v[0:1] offset:19 -; CHECK-NEXT: flat_load_ubyte v16, v[0:1] offset:18 -; CHECK-NEXT: flat_load_ubyte v17, v[0:1] offset:17 -; CHECK-NEXT: flat_load_ubyte v18, v[0:1] offset:16 -; CHECK-NEXT: flat_load_ubyte v19, v[0:1] offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:30 -; CHECK-NEXT: flat_store_byte v[2:3], v5 offset:29 -; CHECK-NEXT: flat_store_byte v[2:3], v6 offset:28 -; CHECK-NEXT: flat_store_byte v[2:3], v7 offset:27 -; CHECK-NEXT: flat_store_byte v[2:3], v8 offset:26 -; CHECK-NEXT: flat_store_byte v[2:3], v9 offset:25 -; CHECK-NEXT: flat_store_byte v[2:3], v10 offset:24 -; CHECK-NEXT: flat_store_byte v[2:3], v11 offset:23 -; CHECK-NEXT: flat_store_byte v[2:3], v12 offset:22 -; CHECK-NEXT: flat_store_byte v[2:3], v13 offset:21 -; CHECK-NEXT: flat_store_byte v[2:3], v14 offset:20 -; CHECK-NEXT: flat_store_byte v[2:3], v15 offset:19 -; CHECK-NEXT: flat_store_byte v[2:3], v16 offset:18 -; CHECK-NEXT: flat_store_byte v[2:3], v17 offset:17 -; CHECK-NEXT: flat_store_byte v[2:3], v18 offset:16 -; CHECK-NEXT: flat_store_byte v[2:3], v19 
offset:15 -; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:46 -; CHECK-NEXT: flat_load_ubyte v5, v[0:1] offset:45 -; CHECK-NEXT: flat_load_ubyte v6, v[0:1] offset:44 -; CHECK-NEXT: flat_load_ubyte v7, v[0:1] offset:43 -; CHECK-NEXT: flat_load_ubyte v8, v[0:1] offset:42 -; CHECK-NEXT: flat_load_ubyte v9, v[0:1] offset:41 -; CHECK-NEXT: flat_load_ubyte v10, v[0:1] offset:40 -; CHECK-NEXT: flat_load_ubyte v11, v[0:1] offset:39 -; CHECK-NEXT: flat_load_ubyte v12, v[0:1] offset:38 -; CHECK-NEXT: flat_load_ubyte v13, v[0:1] offset:37 -; CHECK-NEXT: flat_load_ubyte v14, v[0:1] offset:36 -; CHECK-NEXT: flat_load_ubyte v15, v[0:1] offset:35 -; CHECK-NEXT: flat_load_ubyte v16, v[0:1] offset:34 -; CHECK-NEXT: flat_load_ubyte v17, v[0:1] offset:33 -; CHECK-NEXT: flat_load_ubyte v18, v[0:1] offset:32 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_load_ubyte v0, v[0:1] offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:46 -; CHECK-NEXT: flat_store_byte v[2:3], v5 offset:45 -; CHECK-NEXT: flat_store_byte v[2:3], v6 offset:44 -; CHECK-NEXT: flat_store_byte v[2:3], v7 offset:43 -; CHECK-NEXT: flat_store_byte v[2:3], v8 offset:42 -; CHECK-NEXT: flat_store_byte v[2:3], v9 offset:41 -; CHECK-NEXT: flat_store_byte v[2:3], v10 offset:40 -; CHECK-NEXT: flat_store_byte v[2:3], v11 offset:39 -; CHECK-NEXT: flat_store_byte v[2:3], v12 offset:38 -; CHECK-NEXT: flat_store_byte v[2:3], v13 offset:37 -; CHECK-NEXT: flat_store_byte v[2:3], v14 offset:36 -; CHECK-NEXT: flat_store_byte v[2:3], v15 offset:35 -; CHECK-NEXT: flat_store_byte v[2:3], v16 offset:34 -; CHECK-NEXT: flat_store_byte v[2:3], v17 offset:33 -; CHECK-NEXT: flat_store_byte v[2:3], v18 offset:32 -; CHECK-NEXT: flat_store_byte v[2:3], v0 offset:31 +; CHECK-NEXT: flat_store_byte v[11:12], v13 offset:46 +; CHECK-NEXT: flat_store_short v[11:12], v14 offset:44 +; CHECK-NEXT: flat_store_dwordx3 v[11:12], v[8:10] offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[0:3] offset:16 +; 
CHECK-NEXT: flat_store_dwordx4 v[11:12], v[4:7] ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false) @@ -185,375 +98,59 @@ define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr add ; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1] ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; CHECK-NEXT: s_load_dword s2, s[6:7], 0x0 -; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_mov_b32_e32 v24, 0 ; CHECK-NEXT: s_add_u32 s16, s16, s13 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:15 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:14 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:13 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:12 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:11 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:10 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:9 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:8 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:7 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:6 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:5 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:4 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:3 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:2 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:1 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:31 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:30 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:112 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:96 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v24, s[0:1] offset:80 +; CHECK-NEXT: global_load_dwordx4 v[12:15], v24, s[0:1] offset:64 +; CHECK-NEXT: global_load_dwordx4 v[16:19], v24, s[0:1] offset:48 +; CHECK-NEXT: global_load_dwordx4 v[20:23], v24, s[0:1] offset:32 ; CHECK-NEXT: s_addc_u32 s17, 
s17, 0 -; CHECK-NEXT: v_mov_b32_e32 v1, s2 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:10 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:23 +; CHECK-NEXT: v_mov_b32_e32 v25, s2 +; CHECK-NEXT: s_waitcnt vmcnt(5) +; CHECK-NEXT: buffer_store_dword v3, v25, s[16:19], 0 offen offset:124 +; CHECK-NEXT: buffer_store_dword v2, v25, s[16:19], 0 offen offset:120 +; CHECK-NEXT: buffer_store_dword v1, v25, s[16:19], 0 offen offset:116 +; CHECK-NEXT: buffer_store_dword v0, v25, s[16:19], 0 offen offset:112 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(9) +; CHECK-NEXT: buffer_store_dword v7, v25, s[16:19], 0 offen offset:108 +; CHECK-NEXT: buffer_store_dword v6, v25, s[16:19], 0 offen offset:104 +; CHECK-NEXT: buffer_store_dword v5, v25, s[16:19], 0 offen offset:100 +; CHECK-NEXT: buffer_store_dword v4, v25, s[16:19], 0 offen offset:96 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] +; CHECK-NEXT: s_waitcnt vmcnt(13) +; CHECK-NEXT: buffer_store_dword v11, v25, s[16:19], 0 offen offset:92 +; CHECK-NEXT: buffer_store_dword v10, v25, s[16:19], 0 offen offset:88 +; CHECK-NEXT: buffer_store_dword v9, v25, s[16:19], 0 offen offset:84 +; CHECK-NEXT: buffer_store_dword v8, v25, s[16:19], 0 offen offset:80 +; CHECK-NEXT: s_waitcnt vmcnt(16) +; CHECK-NEXT: buffer_store_dword v15, v25, s[16:19], 0 
offen offset:76 +; CHECK-NEXT: buffer_store_dword v14, v25, s[16:19], 0 offen offset:72 +; CHECK-NEXT: buffer_store_dword v13, v25, s[16:19], 0 offen offset:68 +; CHECK-NEXT: buffer_store_dword v12, v25, s[16:19], 0 offen offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:9 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:8 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:7 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:20 +; CHECK-NEXT: buffer_store_dword v19, v25, s[16:19], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dword v18, v25, s[16:19], 0 offen offset:56 +; CHECK-NEXT: buffer_store_dword v17, v25, s[16:19], 0 offen offset:52 +; CHECK-NEXT: buffer_store_dword v16, v25, s[16:19], 0 offen offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:6 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:5 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:2 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:47 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:4 -; CHECK-NEXT: 
global_load_ubyte v13, v0, s[0:1] offset:17 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:3 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:16 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:27 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:26 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:25 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:24 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:45 -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:44 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:43 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:23 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:36 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:22 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:35 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:21 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:34 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:20 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:33 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:19 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:32 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:28 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:29 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:42 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:18 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:63 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: 
buffer_store_byte v14, v1, s[16:19], 0 offen offset:16 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:61 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:27 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:40 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:26 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:39 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:25 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:38 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:24 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:37 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:44 -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:57 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:43 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:56 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:45 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:58 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:36 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:49 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:35 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:48 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:46 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:47 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:60 -; CHECK-NEXT: s_waitcnt vmcnt(33) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:34 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:79 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; 
CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:28 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:41 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:42 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:55 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:33 -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:32 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:77 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:61 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:74 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:40 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:53 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:39 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:52 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:38 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:51 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:37 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:50 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:57 -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:70 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:56 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:69 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:58 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:71 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:49 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v8, v1, 
s[16:19], 0 offen offset:48 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:93 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:46 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:59 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:60 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:73 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:41 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:54 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:55 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:68 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:74 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:87 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:53 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:66 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:52 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:65 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:51 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:64 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:62 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:63 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:76 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:50 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:95 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:77 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:90 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: 
buffer_store_byte v17, v1, s[16:19], 0 offen offset:71 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:83 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:70 -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:69 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:59 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:72 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:73 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:85 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:54 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:67 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:68 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:81 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:66 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:111 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:65 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:110 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:64 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:109 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:62 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:75 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:76 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:89 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:90 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:103 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 
0 offen offset:72 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:86 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:84 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:82 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:87 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:100 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:67 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:80 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:78 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:94 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:79 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:92 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:95 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:108 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:93 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:106 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:75 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:88 +; CHECK-NEXT: buffer_store_dword v23, v25, s[16:19], 0 offen offset:44 +; CHECK-NEXT: buffer_store_dword v22, v25, s[16:19], 0 offen offset:40 +; CHECK-NEXT: buffer_store_dword v21, v25, s[16:19], 0 offen offset:36 +; CHECK-NEXT: buffer_store_dword v20, v25, s[16:19], 0 offen offset:32 +; CHECK-NEXT: s_waitcnt vmcnt(21) +; CHECK-NEXT: buffer_store_dword v3, v25, s[16:19], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v2, v25, s[16:19], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v1, v25, s[16:19], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v0, v25, s[16:19], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:89 
-; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:102 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:78 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:91 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:94 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:107 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:92 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:105 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:88 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:101 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:91 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:104 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:86 -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:85 -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:84 -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:83 -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:82 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:96 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:97 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:98 -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:99 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:120 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:81 -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:80 -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:111 -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:110 -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:109 -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 
offen offset:108 -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:100 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:121 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:122 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:123 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:124 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:125 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:126 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:107 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:127 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:106 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:105 -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:103 -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:102 -; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:101 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:116 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:117 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:119 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:114 -; CHECK-NEXT: s_waitcnt vmcnt(34) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:104 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:118 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:115 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:113 -; CHECK-NEXT: global_load_ubyte v21, v0, s[0:1] offset:112 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:99 -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:98 -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:97 -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:96 -; CHECK-NEXT: s_waitcnt 
vmcnt(18) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:127 -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:126 -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:125 -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:124 -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:123 -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:122 -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:121 -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:120 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:119 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:118 -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:117 -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:116 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:115 -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:114 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:113 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v21, v1, s[16:19], 0 offen offset:112 +; CHECK-NEXT: buffer_store_dword v7, v25, s[16:19], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v6, v25, s[16:19], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v5, v25, s[16:19], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v4, v25, s[16:19], 0 offen ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false) @@ -569,363 +166,57 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) % ; CHECK-NEXT: s_add_u32 s16, s16, s13 ; CHECK-NEXT: s_addc_u32 s17, s17, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v2, s0 -; CHECK-NEXT: 
buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:2 +; CHECK-NEXT: v_mov_b32_e32 v26, s0 +; CHECK-NEXT: buffer_load_dword v3, v26, s[16:19], 0 offen offset:124 +; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:120 +; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:116 +; CHECK-NEXT: buffer_load_dword v0, v26, s[16:19], 0 offen offset:112 +; CHECK-NEXT: buffer_load_dword v7, v26, s[16:19], 0 offen offset:108 +; CHECK-NEXT: buffer_load_dword v6, v26, s[16:19], 0 offen offset:104 +; CHECK-NEXT: buffer_load_dword v5, v26, s[16:19], 0 offen offset:100 +; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen offset:96 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-NEXT: v_mov_b32_e32 v1, s1 -; CHECK-NEXT: 
s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:23 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:22 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:21 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:20 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:19 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:18 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:17 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:47 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v18 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:31 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:16 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:26 -; CHECK-NEXT: 
buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:45 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:23 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:37 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:22 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:36 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:21 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:35 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:20 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:34 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:19 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:33 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:18 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:32 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:29 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:30 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:44 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:17 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:63 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:28 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:42 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:26 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:40 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:25 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:39 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: 
flat_store_byte v[0:1], v8 offset:24 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:38 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:27 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:41 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:45 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:59 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:37 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:51 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:36 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:50 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:35 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:49 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:34 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:48 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:46 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:47 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:61 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:29 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:43 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:44 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:58 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:33 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:79 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:32 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:42 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:56 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:40 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen 
offset:54 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:39 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:53 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:38 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:52 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:41 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:55 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:59 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:73 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:51 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:65 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:50 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:64 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:62 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:63 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:77 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:46 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:60 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:61 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:75 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:43 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:57 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:58 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:72 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:49 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:95 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:48 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:56 -; CHECK-NEXT: 
buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:70 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:54 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:68 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:53 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:67 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:52 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:66 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:55 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:69 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:73 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:87 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:65 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:111 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:64 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:110 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:62 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:76 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:77 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:91 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:60 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:74 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:75 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:89 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:57 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:71 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:72 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:86 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: 
flat_store_byte v[0:1], v4 offset:70 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:84 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:68 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:83 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:67 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:81 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:66 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:80 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:78 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:79 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:93 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:69 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:82 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:87 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:101 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:76 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:90 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:91 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:105 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:74 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:88 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:89 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:103 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:71 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:85 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:86 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:100 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: 
flat_store_byte v[0:1], v14 offset:78 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:92 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:93 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:107 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:90 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:104 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:88 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:102 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:85 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:99 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:94 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:95 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:109 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:92 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:106 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:94 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:108 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:84 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:83 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:82 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:81 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:96 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:97 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:98 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:120 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:80 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:111 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:110 -; CHECK-NEXT: flat_store_byte 
v[0:1], v11 offset:109 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:99 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:121 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:122 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:123 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:124 +; CHECK-NEXT: buffer_load_dword v8, v26, s[16:19], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v9, v26, s[16:19], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v10, v26, s[16:19], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v11, v26, s[16:19], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v12, v26, s[16:19], 0 offen offset:32 +; CHECK-NEXT: buffer_load_dword v13, v26, s[16:19], 0 offen offset:36 +; CHECK-NEXT: buffer_load_dword v14, v26, s[16:19], 0 offen offset:40 +; CHECK-NEXT: buffer_load_dword v15, v26, s[16:19], 0 offen offset:44 +; CHECK-NEXT: buffer_load_dword v16, v26, s[16:19], 0 offen offset:48 +; CHECK-NEXT: buffer_load_dword v17, v26, s[16:19], 0 offen offset:52 +; CHECK-NEXT: buffer_load_dword v18, v26, s[16:19], 0 offen offset:56 +; CHECK-NEXT: buffer_load_dword v19, v26, s[16:19], 0 offen offset:60 +; CHECK-NEXT: buffer_load_dword v23, v26, s[16:19], 0 offen offset:92 +; CHECK-NEXT: buffer_load_dword v22, v26, s[16:19], 0 offen offset:88 +; CHECK-NEXT: buffer_load_dword v21, v26, s[16:19], 0 offen offset:84 +; CHECK-NEXT: buffer_load_dword v20, v26, s[16:19], 0 offen offset:80 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v25, s1 +; CHECK-NEXT: v_mov_b32_e32 v24, s0 +; CHECK-NEXT: s_waitcnt vmcnt(20) +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112 +; CHECK-NEXT: buffer_load_dword v3, v26, s[16:19], 0 offen offset:76 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:107 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:105 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:104 -; CHECK-NEXT: 
flat_store_byte v[0:1], v17 offset:103 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:106 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:102 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:101 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:100 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:126 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:116 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:117 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:118 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:119 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:127 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:114 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:115 +; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:72 +; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:68 +; CHECK-NEXT: buffer_load_dword v0, v26, s[16:19], 0 offen offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:108 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:125 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:113 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[16:19], 0 offen offset:112 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:98 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:97 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:96 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:127 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:126 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96 +; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: buffer_load_dword v5, v26, s[16:19], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v6, v26, s[16:19], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v26, s[16:19], 0 offen offset:12 +; 
CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15] offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:125 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:124 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:123 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:122 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:121 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:120 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:119 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:118 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:117 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:116 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:115 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:114 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:113 -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:112 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p0.p5.i64(ptr %generic, ptr addrspace(5) %src, i64 128, i1 false) @@ -972,279 +263,27 @@ define amdgpu_kernel void @memcpy_p0_p3_minsize(ptr %generic) #0 { ; CHECK-LABEL: memcpy_p0_p3_minsize: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CHECK-NEXT: v_mov_b32_e32 v2, 0 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:112 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:113 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:114 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:115 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:116 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:117 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:118 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:119 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-NEXT: v_mov_b32_e32 v1, s1 -; 
CHECK-NEXT: flat_store_byte v[0:1], v3 offset:112 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:113 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:114 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:115 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:116 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:117 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:118 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:119 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:120 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:121 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:122 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:123 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:124 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:125 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:126 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:127 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:120 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:121 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:122 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:123 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:124 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:125 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:126 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:127 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:96 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:97 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:98 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:99 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:100 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:101 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:102 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:103 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:96 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:97 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:98 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:99 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:100 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:101 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:102 -; 
CHECK-NEXT: flat_store_byte v[0:1], v10 offset:103 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:104 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:105 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:106 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:107 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:108 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:109 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:110 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:111 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:104 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:105 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:106 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:107 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:108 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:109 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:110 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:111 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:80 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:81 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:82 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:83 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:84 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:85 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:86 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:87 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:80 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:81 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:82 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:83 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:84 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:85 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:86 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:87 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:88 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:89 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:90 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:91 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:92 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:93 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:94 -; CHECK-NEXT: ds_read_u8 v10, v2 
offset:95 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:88 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:89 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:90 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:91 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:92 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:93 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:94 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:95 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:64 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:65 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:66 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:67 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:68 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:69 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:70 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:71 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:64 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:65 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:66 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:67 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:68 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:69 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:70 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:71 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:72 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:73 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:74 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:75 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:76 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:77 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:78 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:79 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:72 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:73 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:74 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:75 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:76 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:77 -; CHECK-NEXT: flat_store_byte v[0:1], v9 
offset:78 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:79 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:48 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:49 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:50 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:51 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:52 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:53 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:54 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:55 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:48 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:49 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:50 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:51 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:52 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:53 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:54 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:55 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:56 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:57 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:58 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:59 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:60 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:61 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:62 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:63 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:56 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:57 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:58 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:59 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:60 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:61 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:62 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:63 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:32 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:33 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:34 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:35 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:36 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:37 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:38 -; CHECK-NEXT: ds_read_u8 v10, v2 
offset:39 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:32 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:33 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:34 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:35 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:36 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:37 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:38 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:39 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:40 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:41 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:42 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:43 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:44 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:45 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:46 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:47 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:40 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:41 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:42 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:43 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:44 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:45 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:46 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:47 -; CHECK-NEXT: ds_read_u8 v3, v2 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:1 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:2 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:3 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:4 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:5 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:6 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:7 -; CHECK-NEXT: ds_read_u8 v11, v2 offset:8 -; CHECK-NEXT: ds_read_u8 v12, v2 offset:9 -; CHECK-NEXT: ds_read_u8 v13, v2 offset:10 -; CHECK-NEXT: ds_read_u8 v14, v2 offset:11 -; CHECK-NEXT: ds_read_u8 v15, v2 offset:12 -; CHECK-NEXT: ds_read_u8 v16, v2 offset:13 -; CHECK-NEXT: ds_read_u8 v17, v2 offset:14 -; CHECK-NEXT: ds_read_u8 v18, v2 offset:15 -; CHECK-NEXT: ds_read_u8 v19, v2 offset:16 -; CHECK-NEXT: 
ds_read_u8 v20, v2 offset:17 -; CHECK-NEXT: ds_read_u8 v21, v2 offset:18 -; CHECK-NEXT: ds_read_u8 v22, v2 offset:19 -; CHECK-NEXT: ds_read_u8 v23, v2 offset:20 -; CHECK-NEXT: ds_read_u8 v24, v2 offset:21 -; CHECK-NEXT: ds_read_u8 v25, v2 offset:22 -; CHECK-NEXT: ds_read_u8 v26, v2 offset:23 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:19 -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:20 -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:21 -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:23 -; CHECK-NEXT: ds_read_u8 v19, v2 offset:24 -; CHECK-NEXT: ds_read_u8 v20, v2 offset:25 -; CHECK-NEXT: ds_read_u8 v21, v2 offset:26 -; CHECK-NEXT: ds_read_u8 v22, v2 offset:27 -; CHECK-NEXT: ds_read_u8 v23, v2 offset:28 -; CHECK-NEXT: ds_read_u8 v24, v2 offset:29 -; CHECK-NEXT: ds_read_u8 v25, v2 offset:30 -; CHECK-NEXT: ds_read_u8 v2, v2 offset:31 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:24 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:25 -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:27 -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:28 -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:31 -; CHECK-NEXT: flat_store_byte v[0:1], v3 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:1 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:2 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:3 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:4 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:5 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:6 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:7 -; CHECK-NEXT: flat_store_byte 
v[0:1], v11 offset:8 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:9 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:10 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:11 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:12 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:13 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:14 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:15 +; CHECK-NEXT: v_mov_b32_e32 v16, 0 +; CHECK-NEXT: ds_read2_b64 v[0:3], v16 offset1:1 +; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:2 offset1:3 +; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:4 offset1:5 +; CHECK-NEXT: ds_read2_b64 v[12:15], v16 offset0:6 offset1:7 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v21, s1 +; CHECK-NEXT: v_mov_b32_e32 v20, s0 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[0:3] +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[4:7] offset:16 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[8:11] offset:32 +; CHECK-NEXT: ds_read2_b64 v[0:3], v16 offset0:8 offset1:9 +; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:10 offset1:11 +; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:12 offset1:13 +; CHECK-NEXT: ds_read2_b64 v[16:19], v16 offset0:14 offset1:15 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[12:15] offset:48 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[0:3] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[4:7] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[8:11] offset:96 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[16:19] offset:112 ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p0.p3.i64(ptr %generic, ptr addrspace(3) @shared, i64 128, i1 false) @@ -1256,108 +295,21 @@ define amdgpu_kernel void @memcpy_p0_p0_optsize(ptr %dest, ptr %src) #1 { ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 -; CHECK-NEXT: flat_load_ubyte v4, v[0:1] -; 
CHECK-NEXT: flat_load_ubyte v5, v[0:1] offset:1 -; CHECK-NEXT: flat_load_ubyte v6, v[0:1] offset:2 -; CHECK-NEXT: flat_load_ubyte v7, v[0:1] offset:3 -; CHECK-NEXT: flat_load_ubyte v8, v[0:1] offset:4 -; CHECK-NEXT: flat_load_ubyte v9, v[0:1] offset:5 -; CHECK-NEXT: flat_load_ubyte v10, v[0:1] offset:6 -; CHECK-NEXT: flat_load_ubyte v11, v[0:1] offset:7 -; CHECK-NEXT: flat_load_ubyte v12, v[0:1] offset:8 -; CHECK-NEXT: flat_load_ubyte v13, v[0:1] offset:9 -; CHECK-NEXT: flat_load_ubyte v14, v[0:1] offset:10 -; CHECK-NEXT: flat_load_ubyte v15, v[0:1] offset:11 -; CHECK-NEXT: flat_load_ubyte v16, v[0:1] offset:12 -; CHECK-NEXT: flat_load_ubyte v17, v[0:1] offset:13 -; CHECK-NEXT: flat_load_ubyte v18, v[0:1] offset:14 -; CHECK-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-NEXT: v_mov_b32_e32 v12, s3 +; CHECK-NEXT: v_mov_b32_e32 v11, s2 +; CHECK-NEXT: flat_load_ubyte v13, v[11:12] offset:46 +; CHECK-NEXT: flat_load_ushort v14, v[11:12] offset:44 +; CHECK-NEXT: flat_load_dwordx3 v[8:10], v[11:12] offset:32 +; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[11:12] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[11:12] +; CHECK-NEXT: v_mov_b32_e32 v12, s1 +; CHECK-NEXT: v_mov_b32_e32 v11, s0 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[2:3], v4 -; CHECK-NEXT: flat_store_byte v[2:3], v5 offset:1 -; CHECK-NEXT: flat_store_byte v[2:3], v6 offset:2 -; CHECK-NEXT: flat_store_byte v[2:3], v7 offset:3 -; CHECK-NEXT: flat_store_byte v[2:3], v8 offset:4 -; CHECK-NEXT: flat_store_byte v[2:3], v9 offset:5 -; CHECK-NEXT: flat_store_byte v[2:3], v10 offset:6 -; CHECK-NEXT: flat_store_byte v[2:3], v11 offset:7 -; CHECK-NEXT: flat_store_byte v[2:3], v12 offset:8 -; CHECK-NEXT: flat_store_byte v[2:3], v13 offset:9 -; CHECK-NEXT: flat_store_byte v[2:3], v14 offset:10 -; CHECK-NEXT: flat_store_byte v[2:3], v15 offset:11 -; CHECK-NEXT: flat_store_byte v[2:3], v16 offset:12 -; CHECK-NEXT: flat_store_byte v[2:3], v17 offset:13 -; 
CHECK-NEXT: flat_store_byte v[2:3], v18 offset:14 -; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:30 -; CHECK-NEXT: flat_load_ubyte v5, v[0:1] offset:29 -; CHECK-NEXT: flat_load_ubyte v6, v[0:1] offset:28 -; CHECK-NEXT: flat_load_ubyte v7, v[0:1] offset:27 -; CHECK-NEXT: flat_load_ubyte v8, v[0:1] offset:26 -; CHECK-NEXT: flat_load_ubyte v9, v[0:1] offset:25 -; CHECK-NEXT: flat_load_ubyte v10, v[0:1] offset:24 -; CHECK-NEXT: flat_load_ubyte v11, v[0:1] offset:23 -; CHECK-NEXT: flat_load_ubyte v12, v[0:1] offset:22 -; CHECK-NEXT: flat_load_ubyte v13, v[0:1] offset:21 -; CHECK-NEXT: flat_load_ubyte v14, v[0:1] offset:20 -; CHECK-NEXT: flat_load_ubyte v15, v[0:1] offset:19 -; CHECK-NEXT: flat_load_ubyte v16, v[0:1] offset:18 -; CHECK-NEXT: flat_load_ubyte v17, v[0:1] offset:17 -; CHECK-NEXT: flat_load_ubyte v18, v[0:1] offset:16 -; CHECK-NEXT: flat_load_ubyte v19, v[0:1] offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:30 -; CHECK-NEXT: flat_store_byte v[2:3], v5 offset:29 -; CHECK-NEXT: flat_store_byte v[2:3], v6 offset:28 -; CHECK-NEXT: flat_store_byte v[2:3], v7 offset:27 -; CHECK-NEXT: flat_store_byte v[2:3], v8 offset:26 -; CHECK-NEXT: flat_store_byte v[2:3], v9 offset:25 -; CHECK-NEXT: flat_store_byte v[2:3], v10 offset:24 -; CHECK-NEXT: flat_store_byte v[2:3], v11 offset:23 -; CHECK-NEXT: flat_store_byte v[2:3], v12 offset:22 -; CHECK-NEXT: flat_store_byte v[2:3], v13 offset:21 -; CHECK-NEXT: flat_store_byte v[2:3], v14 offset:20 -; CHECK-NEXT: flat_store_byte v[2:3], v15 offset:19 -; CHECK-NEXT: flat_store_byte v[2:3], v16 offset:18 -; CHECK-NEXT: flat_store_byte v[2:3], v17 offset:17 -; CHECK-NEXT: flat_store_byte v[2:3], v18 offset:16 -; CHECK-NEXT: flat_store_byte v[2:3], v19 offset:15 -; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:46 -; CHECK-NEXT: flat_load_ubyte v5, v[0:1] offset:45 -; CHECK-NEXT: flat_load_ubyte v6, v[0:1] offset:44 -; CHECK-NEXT: flat_load_ubyte v7, v[0:1] offset:43 -; 
CHECK-NEXT: flat_load_ubyte v8, v[0:1] offset:42 -; CHECK-NEXT: flat_load_ubyte v9, v[0:1] offset:41 -; CHECK-NEXT: flat_load_ubyte v10, v[0:1] offset:40 -; CHECK-NEXT: flat_load_ubyte v11, v[0:1] offset:39 -; CHECK-NEXT: flat_load_ubyte v12, v[0:1] offset:38 -; CHECK-NEXT: flat_load_ubyte v13, v[0:1] offset:37 -; CHECK-NEXT: flat_load_ubyte v14, v[0:1] offset:36 -; CHECK-NEXT: flat_load_ubyte v15, v[0:1] offset:35 -; CHECK-NEXT: flat_load_ubyte v16, v[0:1] offset:34 -; CHECK-NEXT: flat_load_ubyte v17, v[0:1] offset:33 -; CHECK-NEXT: flat_load_ubyte v18, v[0:1] offset:32 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_load_ubyte v0, v[0:1] offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:46 -; CHECK-NEXT: flat_store_byte v[2:3], v5 offset:45 -; CHECK-NEXT: flat_store_byte v[2:3], v6 offset:44 -; CHECK-NEXT: flat_store_byte v[2:3], v7 offset:43 -; CHECK-NEXT: flat_store_byte v[2:3], v8 offset:42 -; CHECK-NEXT: flat_store_byte v[2:3], v9 offset:41 -; CHECK-NEXT: flat_store_byte v[2:3], v10 offset:40 -; CHECK-NEXT: flat_store_byte v[2:3], v11 offset:39 -; CHECK-NEXT: flat_store_byte v[2:3], v12 offset:38 -; CHECK-NEXT: flat_store_byte v[2:3], v13 offset:37 -; CHECK-NEXT: flat_store_byte v[2:3], v14 offset:36 -; CHECK-NEXT: flat_store_byte v[2:3], v15 offset:35 -; CHECK-NEXT: flat_store_byte v[2:3], v16 offset:34 -; CHECK-NEXT: flat_store_byte v[2:3], v17 offset:33 -; CHECK-NEXT: flat_store_byte v[2:3], v18 offset:32 -; CHECK-NEXT: flat_store_byte v[2:3], v0 offset:31 +; CHECK-NEXT: flat_store_byte v[11:12], v13 offset:46 +; CHECK-NEXT: flat_store_short v[11:12], v14 offset:44 +; CHECK-NEXT: flat_store_dwordx3 v[11:12], v[8:10] offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[0:3] offset:16 +; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[4:7] ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false) @@ -1431,375 +383,59 @@ define amdgpu_kernel void 
@memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr add ; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1] ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; CHECK-NEXT: s_load_dword s2, s[6:7], 0x0 -; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_mov_b32_e32 v24, 0 ; CHECK-NEXT: s_add_u32 s16, s16, s13 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:15 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:14 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:13 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:12 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:11 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:10 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:9 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:8 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:7 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:6 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:5 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:4 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:3 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:2 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:1 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:31 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:30 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:112 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:96 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v24, s[0:1] offset:80 +; CHECK-NEXT: global_load_dwordx4 v[12:15], v24, s[0:1] offset:64 +; CHECK-NEXT: global_load_dwordx4 v[16:19], v24, s[0:1] offset:48 +; CHECK-NEXT: global_load_dwordx4 v[20:23], v24, s[0:1] offset:32 ; CHECK-NEXT: s_addc_u32 s17, s17, 0 -; CHECK-NEXT: v_mov_b32_e32 v1, s2 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen 
offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:10 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:23 +; CHECK-NEXT: v_mov_b32_e32 v25, s2 +; CHECK-NEXT: s_waitcnt vmcnt(5) +; CHECK-NEXT: buffer_store_dword v3, v25, s[16:19], 0 offen offset:124 +; CHECK-NEXT: buffer_store_dword v2, v25, s[16:19], 0 offen offset:120 +; CHECK-NEXT: buffer_store_dword v1, v25, s[16:19], 0 offen offset:116 +; CHECK-NEXT: buffer_store_dword v0, v25, s[16:19], 0 offen offset:112 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(9) +; CHECK-NEXT: buffer_store_dword v7, v25, s[16:19], 0 offen offset:108 +; CHECK-NEXT: buffer_store_dword v6, v25, s[16:19], 0 offen offset:104 +; CHECK-NEXT: buffer_store_dword v5, v25, s[16:19], 0 offen offset:100 +; CHECK-NEXT: buffer_store_dword v4, v25, s[16:19], 0 offen offset:96 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] +; CHECK-NEXT: s_waitcnt vmcnt(13) +; CHECK-NEXT: buffer_store_dword v11, v25, s[16:19], 0 offen offset:92 +; CHECK-NEXT: buffer_store_dword v10, v25, s[16:19], 0 offen offset:88 +; CHECK-NEXT: buffer_store_dword v9, v25, s[16:19], 0 offen offset:84 +; CHECK-NEXT: buffer_store_dword v8, v25, s[16:19], 0 offen offset:80 +; CHECK-NEXT: s_waitcnt vmcnt(16) +; CHECK-NEXT: buffer_store_dword v15, v25, s[16:19], 0 offen offset:76 +; CHECK-NEXT: buffer_store_dword v14, v25, s[16:19], 0 offen offset:72 +; CHECK-NEXT: buffer_store_dword v13, v25, s[16:19], 0 offen offset:68 +; CHECK-NEXT: buffer_store_dword v12, 
v25, s[16:19], 0 offen offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:9 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:8 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:7 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:20 +; CHECK-NEXT: buffer_store_dword v19, v25, s[16:19], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dword v18, v25, s[16:19], 0 offen offset:56 +; CHECK-NEXT: buffer_store_dword v17, v25, s[16:19], 0 offen offset:52 +; CHECK-NEXT: buffer_store_dword v16, v25, s[16:19], 0 offen offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:6 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:5 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:2 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:47 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:4 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:17 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:3 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:16 -; CHECK-NEXT: 
s_nop 0 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:27 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:26 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:25 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:24 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:45 -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:44 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:43 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:23 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:36 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:22 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:35 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:21 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:34 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:20 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:33 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:19 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:32 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:28 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:29 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:42 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:18 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:63 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:16 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:61 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen 
offset:27 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:40 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:26 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:39 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:25 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:38 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:24 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:37 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:44 -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:57 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:43 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:56 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:45 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:58 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:36 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:49 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:35 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:48 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:46 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:47 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:60 -; CHECK-NEXT: s_waitcnt vmcnt(33) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:34 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:79 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:28 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:41 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v20, v1, 
s[16:19], 0 offen offset:42 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:55 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:33 -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:32 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:77 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:61 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:74 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:40 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:53 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:39 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:52 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:38 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:51 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:37 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:50 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:57 -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:70 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:56 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:69 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:58 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:71 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:49 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:48 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:93 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:46 -; CHECK-NEXT: 
global_load_ubyte v16, v0, s[0:1] offset:59 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:60 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:73 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:41 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:54 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:55 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:68 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:74 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:87 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:53 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:66 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:52 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:65 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:51 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:64 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:62 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:63 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:76 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:50 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:95 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:77 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:90 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:71 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:83 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:70 -; 
CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:69 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:59 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:72 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:73 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:85 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:54 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:67 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:68 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:81 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:66 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:111 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:65 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:110 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:64 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:109 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:62 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:75 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:76 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:89 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:90 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:103 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:72 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:86 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:84 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] 
offset:82 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:87 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:100 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:67 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:80 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:78 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:94 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:79 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:92 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:95 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:108 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:93 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:106 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:75 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:88 +; CHECK-NEXT: buffer_store_dword v23, v25, s[16:19], 0 offen offset:44 +; CHECK-NEXT: buffer_store_dword v22, v25, s[16:19], 0 offen offset:40 +; CHECK-NEXT: buffer_store_dword v21, v25, s[16:19], 0 offen offset:36 +; CHECK-NEXT: buffer_store_dword v20, v25, s[16:19], 0 offen offset:32 +; CHECK-NEXT: s_waitcnt vmcnt(21) +; CHECK-NEXT: buffer_store_dword v3, v25, s[16:19], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v2, v25, s[16:19], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v1, v25, s[16:19], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v0, v25, s[16:19], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:89 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:102 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:78 -; CHECK-NEXT: global_load_ubyte v10, v0, 
s[0:1] offset:91 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:94 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:107 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:92 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:105 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:88 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:101 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:91 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:104 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:86 -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:85 -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:84 -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:83 -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:82 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:96 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:97 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:98 -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:99 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:120 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:81 -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:80 -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:111 -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:110 -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:109 -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:108 -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:100 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:121 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:122 -; 
CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:123 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:124 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:125 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:126 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:107 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:127 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:106 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:105 -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:103 -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:102 -; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:101 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:116 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:117 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:119 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:114 -; CHECK-NEXT: s_waitcnt vmcnt(34) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:104 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:118 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:115 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:113 -; CHECK-NEXT: global_load_ubyte v21, v0, s[0:1] offset:112 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:99 -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:98 -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:97 -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:96 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:127 -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:126 -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen 
offset:125 -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:124 -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:123 -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:122 -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:121 -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:120 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:119 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:118 -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:117 -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:116 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:115 -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:114 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:113 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v21, v1, s[16:19], 0 offen offset:112 +; CHECK-NEXT: buffer_store_dword v7, v25, s[16:19], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v6, v25, s[16:19], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v5, v25, s[16:19], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v4, v25, s[16:19], 0 offen ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false) @@ -1815,363 +451,57 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) % ; CHECK-NEXT: s_add_u32 s16, s16, s13 ; CHECK-NEXT: s_addc_u32 s17, s17, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v2, s0 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte 
v6, v2, s[16:19], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:2 +; CHECK-NEXT: v_mov_b32_e32 v26, s0 +; CHECK-NEXT: buffer_load_dword v3, v26, s[16:19], 0 offen offset:124 +; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:120 +; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:116 +; CHECK-NEXT: buffer_load_dword v0, v26, s[16:19], 0 offen offset:112 +; CHECK-NEXT: buffer_load_dword v7, v26, s[16:19], 0 offen offset:108 +; CHECK-NEXT: buffer_load_dword v6, v26, s[16:19], 0 offen offset:104 +; CHECK-NEXT: buffer_load_dword v5, v26, s[16:19], 0 offen offset:100 +; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen offset:96 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-NEXT: v_mov_b32_e32 v1, s1 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13 -; CHECK-NEXT: 
flat_store_byte v[0:1], v6 offset:12 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:23 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:22 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:21 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:20 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:19 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:18 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:17 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:47 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v18 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:31 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:16 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:45 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; 
CHECK-NEXT: flat_store_byte v[0:1], v9 offset:23 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:37 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:22 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:36 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:21 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:35 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:20 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:34 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:19 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:33 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:18 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:32 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:29 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:30 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:44 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:17 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:63 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:28 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:42 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:26 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:40 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:25 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:39 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:24 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:38 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:27 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen 
offset:41 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:45 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:59 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:37 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:51 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:36 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:50 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:35 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:49 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:34 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:48 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:46 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:47 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:61 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:29 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:43 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:44 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:58 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:33 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:79 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:32 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:42 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:56 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:40 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:54 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:39 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:53 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:38 -; CHECK-NEXT: 
buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:52 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:41 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:55 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:59 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:73 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:51 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:65 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:50 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:64 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:62 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:63 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:77 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:46 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:60 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:61 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:75 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:43 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:57 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:58 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:72 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:49 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:95 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:48 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:56 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:70 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:54 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:68 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: 
flat_store_byte v[0:1], v7 offset:53 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:67 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:52 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:66 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:55 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:69 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:73 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:87 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:65 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:111 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:64 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:110 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:62 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:76 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:77 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:91 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:60 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:74 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:75 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:89 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:57 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:71 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:72 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:86 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:70 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:84 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:68 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen 
offset:83 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:67 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:81 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:66 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:80 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:78 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:79 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:93 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:69 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:82 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:87 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:101 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:76 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:90 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:91 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:105 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:74 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:88 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:89 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:103 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:71 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:85 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:86 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:100 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:78 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:92 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:93 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen 
offset:107 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:90 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:104 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:88 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:102 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:85 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:99 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:94 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:95 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:109 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:92 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:106 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:94 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:108 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:84 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:83 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:82 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:81 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:96 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:97 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:98 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:120 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:80 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:111 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:110 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:109 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:99 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:121 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:122 -; 
CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:123 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:124 +; CHECK-NEXT: buffer_load_dword v8, v26, s[16:19], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v9, v26, s[16:19], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v10, v26, s[16:19], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v11, v26, s[16:19], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v12, v26, s[16:19], 0 offen offset:32 +; CHECK-NEXT: buffer_load_dword v13, v26, s[16:19], 0 offen offset:36 +; CHECK-NEXT: buffer_load_dword v14, v26, s[16:19], 0 offen offset:40 +; CHECK-NEXT: buffer_load_dword v15, v26, s[16:19], 0 offen offset:44 +; CHECK-NEXT: buffer_load_dword v16, v26, s[16:19], 0 offen offset:48 +; CHECK-NEXT: buffer_load_dword v17, v26, s[16:19], 0 offen offset:52 +; CHECK-NEXT: buffer_load_dword v18, v26, s[16:19], 0 offen offset:56 +; CHECK-NEXT: buffer_load_dword v19, v26, s[16:19], 0 offen offset:60 +; CHECK-NEXT: buffer_load_dword v23, v26, s[16:19], 0 offen offset:92 +; CHECK-NEXT: buffer_load_dword v22, v26, s[16:19], 0 offen offset:88 +; CHECK-NEXT: buffer_load_dword v21, v26, s[16:19], 0 offen offset:84 +; CHECK-NEXT: buffer_load_dword v20, v26, s[16:19], 0 offen offset:80 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v25, s1 +; CHECK-NEXT: v_mov_b32_e32 v24, s0 +; CHECK-NEXT: s_waitcnt vmcnt(20) +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112 +; CHECK-NEXT: buffer_load_dword v3, v26, s[16:19], 0 offen offset:76 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:107 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:105 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:104 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:103 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:106 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:102 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:101 -; CHECK-NEXT: flat_store_byte v[0:1], v20 
offset:100 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:126 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:116 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:117 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:118 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:119 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:127 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:114 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:115 +; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:72 +; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:68 +; CHECK-NEXT: buffer_load_dword v0, v26, s[16:19], 0 offen offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:108 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:125 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:113 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[16:19], 0 offen offset:112 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:98 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:97 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:96 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:127 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:126 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96 +; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: buffer_load_dword v5, v26, s[16:19], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v6, v26, s[16:19], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v26, s[16:19], 0 offen offset:12 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19] offset:48 +; CHECK-NEXT: flat_store_dwordx4 
v[24:25], v[12:15] offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:125 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:124 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:123 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:122 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:121 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:120 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:119 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:118 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:117 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:116 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:115 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:114 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:113 -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:112 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p0.p5.i64(ptr %generic, ptr addrspace(5) %src, i64 128, i1 false) @@ -2218,279 +548,27 @@ define amdgpu_kernel void @memcpy_p0_p3_optsize(ptr %generic) #1 { ; CHECK-LABEL: memcpy_p0_p3_optsize: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CHECK-NEXT: v_mov_b32_e32 v2, 0 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:112 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:113 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:114 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:115 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:116 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:117 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:118 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:119 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-NEXT: v_mov_b32_e32 v1, s1 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:112 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:113 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:114 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:115 -; CHECK-NEXT: flat_store_byte 
v[0:1], v7 offset:116 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:117 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:118 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:119 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:120 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:121 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:122 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:123 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:124 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:125 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:126 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:127 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:120 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:121 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:122 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:123 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:124 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:125 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:126 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:127 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:96 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:97 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:98 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:99 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:100 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:101 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:102 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:103 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:96 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:97 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:98 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:99 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:100 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:101 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:102 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:103 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:104 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:105 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:106 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:107 -; CHECK-NEXT: 
ds_read_u8 v7, v2 offset:108 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:109 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:110 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:111 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:104 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:105 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:106 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:107 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:108 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:109 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:110 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:111 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:80 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:81 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:82 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:83 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:84 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:85 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:86 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:87 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:80 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:81 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:82 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:83 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:84 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:85 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:86 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:87 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:88 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:89 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:90 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:91 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:92 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:93 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:94 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:95 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:88 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:89 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:90 -; CHECK-NEXT: flat_store_byte v[0:1], v6 
offset:91 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:92 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:93 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:94 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:95 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:64 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:65 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:66 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:67 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:68 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:69 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:70 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:71 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:64 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:65 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:66 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:67 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:68 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:69 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:70 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:71 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:72 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:73 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:74 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:75 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:76 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:77 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:78 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:79 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:72 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:73 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:74 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:75 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:76 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:77 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:78 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:79 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:48 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:49 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:50 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:51 -; 
CHECK-NEXT: ds_read_u8 v7, v2 offset:52 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:53 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:54 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:55 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:48 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:49 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:50 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:51 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:52 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:53 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:54 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:55 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:56 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:57 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:58 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:59 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:60 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:61 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:62 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:63 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:56 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:57 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:58 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:59 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:60 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:61 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:62 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:63 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:32 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:33 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:34 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:35 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:36 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:37 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:38 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:39 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:32 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:33 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:34 -; CHECK-NEXT: flat_store_byte v[0:1], v6 
offset:35 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:36 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:37 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:38 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:39 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:40 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:41 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:42 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:43 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:44 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:45 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:46 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:47 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:40 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:41 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:42 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:43 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:44 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:45 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:46 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:47 -; CHECK-NEXT: ds_read_u8 v3, v2 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:1 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:2 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:3 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:4 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:5 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:6 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:7 -; CHECK-NEXT: ds_read_u8 v11, v2 offset:8 -; CHECK-NEXT: ds_read_u8 v12, v2 offset:9 -; CHECK-NEXT: ds_read_u8 v13, v2 offset:10 -; CHECK-NEXT: ds_read_u8 v14, v2 offset:11 -; CHECK-NEXT: ds_read_u8 v15, v2 offset:12 -; CHECK-NEXT: ds_read_u8 v16, v2 offset:13 -; CHECK-NEXT: ds_read_u8 v17, v2 offset:14 -; CHECK-NEXT: ds_read_u8 v18, v2 offset:15 -; CHECK-NEXT: ds_read_u8 v19, v2 offset:16 -; CHECK-NEXT: ds_read_u8 v20, v2 offset:17 -; CHECK-NEXT: ds_read_u8 v21, v2 offset:18 -; CHECK-NEXT: ds_read_u8 v22, v2 offset:19 -; CHECK-NEXT: ds_read_u8 v23, v2 offset:20 -; CHECK-NEXT: ds_read_u8 v24, v2 offset:21 -; CHECK-NEXT: ds_read_u8 v25, v2 
offset:22 -; CHECK-NEXT: ds_read_u8 v26, v2 offset:23 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:19 -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:20 -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:21 -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:23 -; CHECK-NEXT: ds_read_u8 v19, v2 offset:24 -; CHECK-NEXT: ds_read_u8 v20, v2 offset:25 -; CHECK-NEXT: ds_read_u8 v21, v2 offset:26 -; CHECK-NEXT: ds_read_u8 v22, v2 offset:27 -; CHECK-NEXT: ds_read_u8 v23, v2 offset:28 -; CHECK-NEXT: ds_read_u8 v24, v2 offset:29 -; CHECK-NEXT: ds_read_u8 v25, v2 offset:30 -; CHECK-NEXT: ds_read_u8 v2, v2 offset:31 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:24 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:25 -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:27 -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:28 -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:31 -; CHECK-NEXT: flat_store_byte v[0:1], v3 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:1 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:2 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:3 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:4 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:5 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:6 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:7 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:9 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:10 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:11 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:12 -; 
CHECK-NEXT: flat_store_byte v[0:1], v16 offset:13 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:14 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:15 +; CHECK-NEXT: v_mov_b32_e32 v16, 0 +; CHECK-NEXT: ds_read2_b64 v[0:3], v16 offset1:1 +; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:2 offset1:3 +; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:4 offset1:5 +; CHECK-NEXT: ds_read2_b64 v[12:15], v16 offset0:6 offset1:7 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v21, s1 +; CHECK-NEXT: v_mov_b32_e32 v20, s0 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[0:3] +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[4:7] offset:16 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[8:11] offset:32 +; CHECK-NEXT: ds_read2_b64 v[0:3], v16 offset0:8 offset1:9 +; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:10 offset1:11 +; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:12 offset1:13 +; CHECK-NEXT: ds_read2_b64 v[16:19], v16 offset0:14 offset1:15 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[12:15] offset:48 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[0:3] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[4:7] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[8:11] offset:96 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[16:19] offset:112 ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p0.p3.i64(ptr %generic, ptr addrspace(3) @shared, i64 128, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll b/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll index 7575782c1b2acd..cadc3dadb0a1e9 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll @@ -13,55 +13,9 @@ define void @memcpy_p0_p0_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p0_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: 
flat_load_ubyte v4, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4 -; 
CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -73,101 +27,19 @@ define void @memcpy_p0_p0_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p0_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xe -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte 
v[0:1], v7 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v18 -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:17 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(15) 
lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:15 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:30 +; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[6:8], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3) +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 +; CHECK-NEXT: 
s_waitcnt vmcnt(1) lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -179,104 +51,13 @@ define void @memcpy_p0_p0_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p0_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: 
flat_store_byte v[0:1], v10 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v19 -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:31 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:17 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:30 -; CHECK-NEXT: 
s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:16 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -288,31 +69,9 @@ define void @memcpy_p0_p0_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p0_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v2, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -324,55 +83,19 @@ define void @memcpy_p0_p0_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p0_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ushort v9, 
v[2:3] offset:20 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ushort v11, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ushort v12, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v13, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v14, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ushort v15, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v16, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v17, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v18, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v2, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: 
flat_store_short v[0:1], v18 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:30 +; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[6:8], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3) +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -384,55 +107,13 @@ define void @memcpy_p0_p0_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p0_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ushort v11, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ushort v12, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v13, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v14, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ushort v15, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v16, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v17, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v18, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v2, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: 
flat_store_short v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -458,58 +139,13 @@ define void 
@memcpy_p0_p0_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p0_sz31_align_8_8: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x10 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:17 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:28 -; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v14 
offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(16) -; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:15 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:15 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -553,58 +189,13 @@ define void @memcpy_p0_p0_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr a ; CHECK-LABEL: memcpy_p0_p0_sz31_align_16_16: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x10 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:17 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:26 -; 
CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:28 -; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(16) -; 
CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:15 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:15 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -634,55 +225,9 @@ define void @memcpy_p0_p1_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p1_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:15 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12 -; 
CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -694,101 +239,19 @@ define void @memcpy_p0_p1_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p1_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xe -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:6 -; 
CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v18 -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:29 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:27 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off 
offset:25 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:23 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:21 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:19 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:17 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:30 +; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28 +; CHECK-NEXT: global_load_dwordx3 v[6:8], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v16 
offset:18 +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:17 +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:16 +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:15 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -800,104 +263,13 @@ define void @memcpy_p0_p1_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p1_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:15 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v19, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], 
v6 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v19 -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:31 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:29 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:27 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:25 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:23 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:21 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:19 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:18 -; 
CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:17 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:18 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:17 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:16 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -909,31 +281,9 @@ define void @memcpy_p0_p1_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p1_sz16_align_2_2: ; CHECK: ; 
%bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:2 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -945,55 +295,19 @@ define void @memcpy_p0_p1_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p1_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:22 -; CHECK-NEXT: 
global_load_ushort v9, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ushort v11, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ushort v12, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ushort v13, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v14, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v15, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v16, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ushort v17, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ushort v18, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:30 +; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28 +; CHECK-NEXT: global_load_dwordx3 v[6:8], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; 
CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1005,55 +319,13 @@ define void @memcpy_p0_p1_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p1_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ushort v11, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ushort v12, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ushort v13, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v14, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v15, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v16, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ushort v17, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ushort v18, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: 
s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1080,35 +352,12 @@ define void @memcpy_p0_p1_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr addr ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x1 -; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off -; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:15 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:15 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off ; 
CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v8 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:15 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v9 offset:21 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:19 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v10 offset:25 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:23 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v11 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 24, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 8, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 24, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 8, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 24, v10 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 8, v10 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 24, v11 -; CHECK-NEXT: v_lshrrev_b32_e32 v9, 8, v11 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:20 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:24 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:28 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1153,35 +402,12 @@ define void @memcpy_p0_p1_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr a ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x1 -; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off -; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:15 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:15 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off ; CHECK-NEXT: s_waitcnt 
vmcnt(1) -; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v8 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:15 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v9 offset:21 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:19 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v10 offset:25 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:23 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v11 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 24, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 8, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 24, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 8, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 24, v10 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 8, v10 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 24, v11 -; CHECK-NEXT: v_lshrrev_b32_e32 v9, 8, v11 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:20 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:24 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:28 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1211,54 +437,9 @@ define void @memcpy_p0_p3_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p3_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v3, v2 offset:15 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:14 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:13 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:12 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:11 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:10 -; CHECK-NEXT: 
ds_read_u8 v9, v2 offset:9 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:8 -; CHECK-NEXT: ds_read_u8 v11, v2 offset:7 -; CHECK-NEXT: ds_read_u8 v12, v2 offset:6 -; CHECK-NEXT: ds_read_u8 v13, v2 offset:5 -; CHECK-NEXT: ds_read_u8 v14, v2 offset:4 -; CHECK-NEXT: ds_read_u8 v15, v2 offset:3 -; CHECK-NEXT: ds_read_u8 v16, v2 offset:2 -; CHECK-NEXT: ds_read_u8 v17, v2 offset:1 -; CHECK-NEXT: ds_read_u8 v2, v2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:15 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: 
s_setpc_b64 s[30:31] entry: @@ -1270,96 +451,19 @@ define void @memcpy_p0_p3_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p3_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v3, v2 offset:14 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:13 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:12 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:11 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:10 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:9 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:8 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:7 -; CHECK-NEXT: ds_read_u8 v11, v2 offset:6 -; CHECK-NEXT: ds_read_u8 v12, v2 offset:5 -; CHECK-NEXT: ds_read_u8 v13, v2 offset:4 -; CHECK-NEXT: ds_read_u8 v14, v2 offset:3 -; CHECK-NEXT: ds_read_u8 v15, v2 offset:2 -; CHECK-NEXT: ds_read_u8 v16, v2 offset:1 -; CHECK-NEXT: ds_read_u8 v17, v2 -; CHECK-NEXT: ds_read_u8 v18, v2 offset:15 -; CHECK-NEXT: ds_read_u8 v19, v2 offset:16 -; CHECK-NEXT: ds_read_u8 v20, v2 offset:17 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:13 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:11 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:9 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:5 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:4 
-; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:3 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:1 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v17 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:30 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:29 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:28 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:27 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:26 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:25 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:24 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:23 -; CHECK-NEXT: ds_read_u8 v11, v2 offset:22 -; CHECK-NEXT: ds_read_u8 v12, v2 offset:21 -; CHECK-NEXT: ds_read_u8 v13, v2 offset:20 -; CHECK-NEXT: ds_read_u8 v14, v2 offset:19 -; CHECK-NEXT: ds_read_u8 v2, v2 offset:18 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:29 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:27 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:26 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:25 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:23 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:22 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:21 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:20 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:19 -; CHECK-NEXT: 
s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:15 +; CHECK-NEXT: ds_read_u8 v9, v2 offset:30 +; CHECK-NEXT: ds_read_b32 v8, v2 offset:24 +; CHECK-NEXT: ds_read_u16 v10, v2 offset:28 +; CHECK-NEXT: ds_read_b64 v[6:7], v2 offset:16 +; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(4) +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1371,100 +475,12 @@ define void @memcpy_p0_p3_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p3_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v3, v2 offset:15 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:14 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:13 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:12 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:11 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:10 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:9 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:8 -; CHECK-NEXT: ds_read_u8 v11, v2 offset:7 -; CHECK-NEXT: ds_read_u8 v12, v2 offset:6 -; CHECK-NEXT: ds_read_u8 v13, v2 offset:5 -; CHECK-NEXT: ds_read_u8 v14, v2 offset:4 -; CHECK-NEXT: ds_read_u8 v15, v2 offset:3 -; CHECK-NEXT: ds_read_u8 v16, v2 offset:2 -; CHECK-NEXT: ds_read_u8 v17, v2 offset:1 -; CHECK-NEXT: ds_read_u8 v18, v2 -; CHECK-NEXT: ds_read_u8 v19, v2 offset:16 -; CHECK-NEXT: ds_read_u8 v20, v2 offset:17 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:15 -; CHECK-NEXT: 
s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v18 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:31 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:30 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:29 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:28 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:27 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:26 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:25 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:24 -; CHECK-NEXT: ds_read_u8 v11, v2 offset:23 -; CHECK-NEXT: ds_read_u8 v12, v2 offset:22 -; CHECK-NEXT: ds_read_u8 v13, v2 offset:21 -; CHECK-NEXT: ds_read_u8 v14, v2 offset:20 -; CHECK-NEXT: ds_read_u8 v15, v2 offset:19 -; CHECK-NEXT: ds_read_u8 v2, v2 offset:18 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v3 
offset:31 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16 +; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset0:2 offset1:3 +; CHECK-NEXT: ds_read2_b64 v[7:10], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1476,30 +492,9 @@ define void @memcpy_p0_p3_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p3_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u16 v3, v2 offset:14 -; CHECK-NEXT: ds_read_u16 v4, v2 offset:12 -; CHECK-NEXT: ds_read_u16 v5, v2 
offset:10 -; CHECK-NEXT: ds_read_u16 v6, v2 offset:8 -; CHECK-NEXT: ds_read_u16 v7, v2 offset:6 -; CHECK-NEXT: ds_read_u16 v8, v2 offset:4 -; CHECK-NEXT: ds_read_u16 v9, v2 offset:2 -; CHECK-NEXT: ds_read_u16 v2, v2 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v3 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1511,54 +506,19 @@ define void @memcpy_p0_p3_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p3_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v3, v2 offset:30 -; CHECK-NEXT: ds_read_u16 v4, v2 offset:28 -; CHECK-NEXT: ds_read_u16 v5, v2 offset:26 -; CHECK-NEXT: ds_read_u16 v6, v2 offset:24 -; CHECK-NEXT: ds_read_u16 v7, v2 offset:22 -; CHECK-NEXT: ds_read_u16 v8, v2 offset:20 -; CHECK-NEXT: ds_read_u16 v9, v2 offset:18 -; CHECK-NEXT: ds_read_u16 v10, v2 offset:16 -; CHECK-NEXT: ds_read_u16 v11, v2 offset:14 -; CHECK-NEXT: ds_read_u16 v12, v2 offset:12 -; CHECK-NEXT: ds_read_u16 v13, v2 offset:10 -; CHECK-NEXT: ds_read_u16 v14, v2 offset:8 -; CHECK-NEXT: ds_read_u16 v15, v2 offset:6 -; CHECK-NEXT: ds_read_u16 v16, v2 offset:4 -; CHECK-NEXT: ds_read_u16 
v17, v2 offset:2 -; CHECK-NEXT: ds_read_u16 v2, v2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:26 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:22 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:20 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:18 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: ds_read_u8 v9, v2 offset:30 +; CHECK-NEXT: ds_read_b32 v8, v2 offset:24 +; CHECK-NEXT: ds_read_u16 v10, v2 offset:28 +; CHECK-NEXT: ds_read_b64 v[6:7], v2 offset:16 +; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(4) +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 +; 
CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1570,54 +530,12 @@ define void @memcpy_p0_p3_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p3_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u16 v3, v2 offset:30 -; CHECK-NEXT: ds_read_u16 v4, v2 offset:28 -; CHECK-NEXT: ds_read_u16 v5, v2 offset:26 -; CHECK-NEXT: ds_read_u16 v6, v2 offset:24 -; CHECK-NEXT: ds_read_u16 v7, v2 offset:22 -; CHECK-NEXT: ds_read_u16 v8, v2 offset:20 -; CHECK-NEXT: ds_read_u16 v9, v2 offset:18 -; CHECK-NEXT: ds_read_u16 v10, v2 offset:16 -; CHECK-NEXT: ds_read_u16 v11, v2 offset:14 -; CHECK-NEXT: ds_read_u16 v12, v2 offset:12 -; CHECK-NEXT: ds_read_u16 v13, v2 offset:10 -; CHECK-NEXT: ds_read_u16 v14, v2 offset:8 -; CHECK-NEXT: ds_read_u16 v15, v2 offset:6 -; CHECK-NEXT: ds_read_u16 v16, v2 offset:4 -; CHECK-NEXT: ds_read_u16 v17, v2 offset:2 -; CHECK-NEXT: ds_read_u16 v2, v2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v3 offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:26 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:22 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:20 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:18 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:12 -; 
CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset0:2 offset1:3 +; CHECK-NEXT: ds_read2_b64 v[7:10], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1643,35 +561,12 @@ define void @memcpy_p0_p3_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p3_sz31_align_8_8: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset1:1 -; CHECK-NEXT: ds_read_b128 v[7:10], v2 offset:15 +; CHECK-NEXT: ds_read_b128 v[3:6], v2 offset:15 +; CHECK-NEXT: ds_read2_b64 v[7:10], v2 offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:15 ; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v7 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:15 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v8 offset:21 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:19 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v9 offset:25 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:23 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v10 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 24, v7 -; 
CHECK-NEXT: v_lshrrev_b32_e32 v3, 8, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 24, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 8, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 24, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 8, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 24, v10 -; CHECK-NEXT: v_lshrrev_b32_e32 v9, 8, v10 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:20 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:24 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:28 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1714,35 +609,12 @@ define void @memcpy_p0_p3_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr a ; CHECK-LABEL: memcpy_p0_p3_sz31_align_16_16: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_b128 v[3:6], v2 -; CHECK-NEXT: ds_read_b128 v[7:10], v2 offset:15 +; CHECK-NEXT: ds_read_b128 v[3:6], v2 offset:15 +; CHECK-NEXT: ds_read_b128 v[7:10], v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:15 ; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v7 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:15 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v8 offset:21 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:19 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v9 offset:25 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:23 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v10 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 24, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 8, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 
v4, 24, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 8, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 24, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 8, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 24, v10 -; CHECK-NEXT: v_lshrrev_b32_e32 v9, 8, v10 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:20 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:24 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:28 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1771,55 +643,12 @@ define void @memcpy_p0_p4_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p4_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:15 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) 
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1 +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] +; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:8 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1831,100 +660,24 @@ define void @memcpy_p0_p4_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p4_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(0) -; 
CHECK-NEXT: flat_store_byte v[0:1], v4 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:1 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:2 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:3 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:4 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:5 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:6 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:7 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:8 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:9 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:10 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:11 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:12 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:13 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(0) 
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:29 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:27 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:25 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:23 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:21 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:19 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:17 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20 -; 
CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:16 +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:8 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:8 +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:16 +; CHECK-NEXT: global_load_dword v4, v[2:3], off offset:24 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dword v[0:1], v4 offset:24 +; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28 +; CHECK-NEXT: global_load_ubyte v2, v[2:3], off offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:15 +; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:30 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1936,104 +689,18 @@ define void @memcpy_p0_p4_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p4_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:15 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:10 -; CHECK-NEXT: 
global_load_ubyte v10, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v19, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v19 -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte 
v4, v[2:3], off offset:31 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:29 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:27 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:25 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:23 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:21 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:19 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:17 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(3) 
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:17 +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:16 +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:8 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:8 +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:16 +; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:24 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:24 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2045,30 +712,12 @@ define void @memcpy_p0_p4_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p4_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_load_ushort v4, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v4 -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:2 -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:4 -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:6 -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:8 -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:10 +; CHECK-NEXT: global_load_dwordx2 
v[4:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:10 -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:12 +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] +; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:12 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 offset:14 +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:8 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2080,55 +729,24 @@ define void @memcpy_p0_p4_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p4_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ushort v11, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ushort v12, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ushort v13, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v14, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v15, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v16, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ushort v17, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ushort v18, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: 
flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:8 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:8 +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:16 +; CHECK-NEXT: global_load_dword v4, v[2:3], off offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dword v[0:1], v4 offset:24 +; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28 +; CHECK-NEXT: global_load_ubyte 
v2, v[2:3], off offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:30 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2140,55 +758,18 @@ define void @memcpy_p0_p4_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p4_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ushort v11, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ushort v12, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ushort v13, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v14, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v15, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v16, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ushort v17, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ushort v18, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short 
v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:8 +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:16 +; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:24 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:24 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2219,30 +800,7 @@ define void @memcpy_p0_p4_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr addr ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v2 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:15 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v3 offset:21 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:19 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v4 offset:25 -; CHECK-NEXT: 
flat_store_byte v[0:1], v4 offset:23 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v5 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 24, v2 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 8, v2 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 24, v3 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 8, v3 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 24, v4 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v4 -; CHECK-NEXT: v_lshrrev_b32_e32 v9, 24, v5 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 8, v5 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:20 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:24 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:28 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] offset:15 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2290,30 +848,7 @@ define void @memcpy_p0_p4_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr a ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v2 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:15 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v3 offset:21 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:19 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v4 offset:25 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:23 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v5 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 24, v2 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 8, v2 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 24, v3 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 8, v3 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 24, v4 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, 
v4 -; CHECK-NEXT: v_lshrrev_b32_e32 v9, 24, v5 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 8, v5 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:20 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:24 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:28 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] offset:15 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2342,55 +877,13 @@ define void @memcpy_p0_p5_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p5_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; 
CHECK-NEXT: flat_store_byte v[0:1], v3 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2402,99 +895,23 @@ define void @memcpy_p0_p5_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p5_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x11 
-; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:6 -; CHECK-NEXT: 
s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v17 -; CHECK-NEXT: s_clause 0xc -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:26 -; CHECK-NEXT: s_waitcnt 
vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:24 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:23 +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:19 +; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:30 +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[7:9] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:15 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2506,103 +923,19 @@ define void @memcpy_p0_p5_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p5_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x11 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v4, v2, 
s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; 
CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v18 -; CHECK-NEXT: s_clause 0xd -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(8) 
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2614,31 +947,13 @@ define void @memcpy_p0_p5_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p5_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:10 -; 
CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v2, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v3 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:2 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2650,55 +965,23 @@ define void @memcpy_p0_p5_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p5_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen 
offset:22 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v12, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v13, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v14, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v15, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v16, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v17, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v2, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:12 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30 +; 
CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:10 +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:2 +; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:30 +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[7:9] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2710,55 +993,19 @@ define void @memcpy_p0_p5_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p5_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v12, v2, s[0:3], 0 offen offset:12 -; 
CHECK-NEXT: buffer_load_ushort v13, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v14, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v15, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v16, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v17, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v2, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v3 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:10 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v14 
offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2788,53 +1035,19 @@ define void @memcpy_p0_p5_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p5_sz31_align_8_8: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: s_clause 0x7 ; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v4, v2, 
s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:15 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:19 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:23 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:27 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:27 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] offset:15 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2888,53 +1101,19 @@ define void @memcpy_p0_p5_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr a ; CHECK-LABEL: memcpy_p0_p5_sz31_align_16_16: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: s_clause 0x7 ; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:15 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:19 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:20 -; CHECK-NEXT: 
s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:23 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:27 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:27 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] offset:15 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2970,41 +1149,8 @@ define void @memcpy_p1_p0_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p0_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:9 -; 
CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v3, v4, 8, v10 -; CHECK-NEXT: v_lshl_or_b32 v4, v6, 8, v5 -; CHECK-NEXT: v_lshl_or_b32 v5, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v6, v11, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v8, v12, 8, v13 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v7, v14, 8, v15 -; CHECK-NEXT: v_lshl_or_b32 v3, v4, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v9, v16, 8, v17 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v10, v18, 8, v2 -; CHECK-NEXT: v_lshl_or_b32 v2, v6, 16, v5 -; CHECK-NEXT: v_lshl_or_b32 v5, v8, 16, v7 -; CHECK-NEXT: v_lshl_or_b32 v4, v10, 16, v9 ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -3016,79 +1162,15 @@ define void @memcpy_p1_p0_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p0_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:23 -; 
CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:17 -; CHECK-NEXT: flat_load_ubyte v22, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v23, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v24, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ubyte v25, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v26, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v27, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v28, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v29, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v30, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v31, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v32, v[2:3] -; CHECK-NEXT: flat_load_ubyte v33, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25) -; CHECK-NEXT: v_lshl_or_b32 v3, v4, 8, v9 -; CHECK-NEXT: v_lshl_or_b32 v4, v6, 8, v5 -; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v6, v10, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v11, v11, 8, v12 -; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v13, v13, 8, v14 -; CHECK-NEXT: s_waitcnt vmcnt(19) lgkmcnt(19) -; CHECK-NEXT: v_lshl_or_b32 v5, v7, 8, v15 -; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v9, v15, 8, v16 -; CHECK-NEXT: v_lshl_or_b32 v7, v4, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v8, v17, 8, v18 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v15, v19, 8, v20 -; CHECK-NEXT: v_lshl_or_b32 v6, v6, 16, v5 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v10, v21, 8, v22 -; CHECK-NEXT: v_lshl_or_b32 v9, v9, 16, v8 -; 
CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v14, v23, 8, v24 -; CHECK-NEXT: v_lshl_or_b32 v5, v11, 16, v13 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v12, v25, 8, v26 -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v10 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v17, v27, 8, v28 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v16, v29, 8, v30 -; CHECK-NEXT: v_lshl_or_b32 v4, v14, 16, v12 +; CHECK-NEXT: s_clause 0x2 +; CHECK-NEXT: flat_load_dwordx2 v[6:7], v[2:3] offset:23 +; CHECK-NEXT: flat_load_dwordx2 v[8:9], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v18, v31, 8, v32 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v2, v33, 8, v2 -; CHECK-NEXT: v_lshl_or_b32 v3, v17, 16, v16 -; CHECK-NEXT: v_lshl_or_b32 v2, v2, 16, v18 ; CHECK-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:23 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) ; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -3100,79 +1182,13 @@ define void @memcpy_p1_p0_sz32_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p0_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:31 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] 
offset:15 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v22, v[2:3] offset:17 -; CHECK-NEXT: flat_load_ubyte v23, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v24, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v25, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ubyte v26, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v27, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v28, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v29, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v30, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v31, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v32, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v33, v[2:3] -; CHECK-NEXT: flat_load_ubyte v34, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25) -; CHECK-NEXT: v_lshl_or_b32 v3, v4, 8, v10 -; CHECK-NEXT: v_lshl_or_b32 v4, v6, 8, v5 -; CHECK-NEXT: v_lshl_or_b32 v6, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v7, v11, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v12, v12, 8, v13 -; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v14, v14, 8, v15 -; CHECK-NEXT: v_lshl_or_b32 v5, v4, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v9, v16, 8, v17 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v8, v18, 8, v19 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: 
v_lshl_or_b32 v11, v20, 8, v21 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v10, v22, 8, v23 -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v15, v24, 8, v25 -; CHECK-NEXT: v_lshl_or_b32 v9, v12, 16, v14 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v13, v26, 8, v27 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v17, v28, 8, v29 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v16, v30, 8, v31 -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v13 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v18, v32, 8, v33 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v19, v34, 8, v2 -; CHECK-NEXT: v_lshl_or_b32 v2, v11, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v16 -; CHECK-NEXT: v_lshl_or_b32 v6, v19, 16, v18 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 32, i1 false) @@ -3183,23 +1199,8 @@ define void @memcpy_p1_p0_sz16_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p0_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort 
v7, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v11, v[2:3] offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: v_lshl_or_b32 v3, v4, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v2, v9, 16, v5 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v5, v10, 16, v6 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v7 ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -3211,52 +1212,16 @@ define void @memcpy_p1_p0_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p0_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ushort v12, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ushort v13, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v14, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ushort v19, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ushort v20, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v21, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v22, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v23, v[2:3] 
offset:2 -; CHECK-NEXT: flat_load_ushort v2, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v8, v8, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v10, v10, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: v_lshl_or_b32 v7, v4, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v6, v12, 16, v5 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v5, v13, 16, v14 -; CHECK-NEXT: v_lshl_or_b32 v8, v8, 16, v10 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v13, v15, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v12, v17, 8, v18 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v4, v19, 16, v20 +; CHECK-NEXT: s_clause 0x2 +; CHECK-NEXT: flat_load_dwordx2 v[6:7], v[2:3] offset:23 +; CHECK-NEXT: flat_load_dwordx2 v[8:9], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v3, v21, 16, v22 -; CHECK-NEXT: v_lshl_or_b32 v9, v13, 16, v12 +; CHECK-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:23 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v2, v23, 16, v2 -; CHECK-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:23 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 31, i1 false) @@ -3267,39 +1232,13 @@ define void @memcpy_p1_p0_sz32_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p0_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ushort v11, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ushort v12, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ushort v13, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ushort v14, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ushort v15, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ushort v16, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v17, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v18, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v19, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v5, v4, 16, v5 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v4, v6, 16, v9 -; CHECK-NEXT: v_lshl_or_b32 v9, v7, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v3, v12, 16, v13 -; CHECK-NEXT: v_lshl_or_b32 v8, v10, 16, v11 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v2, v14, 16, v15 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v7, v16, 16, v17 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v6, v18, 16, v19 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off ; CHECK-NEXT: s_setpc_b64 
s[30:31] entry: tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 32, i1 false) @@ -3323,47 +1262,13 @@ define void @memcpy_p1_p0_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p0_sz31_align_8_8: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x10 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:17 -; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v7, v7, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v10, v9, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v11, v11, 8, v12 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v13, v13, 8, v14 -; CHECK-NEXT: v_lshl_or_b32 v8, v7, 16, v10 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v14, v15, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v12, v17, 8, v18 -; CHECK-NEXT: v_lshl_or_b32 v9, v11, 16, v13 -; CHECK-NEXT: s_waitcnt vmcnt(2) 
lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v15, v19, 8, v20 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:15 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v6, v6, 8, v21 -; CHECK-NEXT: v_lshl_or_b32 v7, v14, 16, v12 -; CHECK-NEXT: v_lshl_or_b32 v6, v6, 16, v15 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:15 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(0) noundef nonnull align 8 %src, i64 31, i1 false) @@ -3404,47 +1309,13 @@ define void @memcpy_p1_p0_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr a ; CHECK-LABEL: memcpy_p1_p0_sz31_align_16_16: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x10 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:17 -; CHECK-NEXT: flat_load_dwordx4 v[2:5], 
v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v7, v7, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v10, v9, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v11, v11, 8, v12 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v13, v13, 8, v14 -; CHECK-NEXT: v_lshl_or_b32 v8, v7, 16, v10 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v14, v15, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v12, v17, 8, v18 -; CHECK-NEXT: v_lshl_or_b32 v9, v11, 16, v13 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v15, v19, 8, v20 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:15 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v6, v6, 8, v21 -; CHECK-NEXT: v_lshl_or_b32 v7, v14, 16, v12 -; CHECK-NEXT: v_lshl_or_b32 v6, v6, 16, v15 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:15 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 31, i1 false) @@ -4042,44 +1913,13 @@ define void @memcpy_p1_p5_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p5_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: 
buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v4, v4, 8, v3 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v5, v6, 8, v5 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v7, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v8, v10, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v6, v12, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v9, v14, 8, v13 -; CHECK-NEXT: v_lshl_or_b32 v3, v8, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v10, v16, 8, v15 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v11, v2, 8, v17 -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: v_lshl_or_b32 v5, v9, 16, v6 -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v10 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], 
off +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 16, i1 false) @@ -4090,81 +1930,21 @@ define void @memcpy_p1_p5_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p5_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: 
buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: v_lshl_or_b32 v4, v4, 8, v3 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: v_lshl_or_b32 v5, v6, 8, v5 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: v_lshl_or_b32 v7, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: v_lshl_or_b32 v8, v10, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: v_lshl_or_b32 v10, v12, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: v_lshl_or_b32 v11, v14, 8, v13 -; CHECK-NEXT: v_lshl_or_b32 v3, v8, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: v_lshl_or_b32 v6, v16, 8, v15 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: v_lshl_or_b32 v9, v18, 8, v17 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v13, v21, 8, v20 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v15, v23, 8, v22 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v12, v19, 8, v25 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: v_lshl_or_b32 v14, v26, 8, v24 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: v_lshl_or_b32 v16, v28, 8, v27 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: v_lshl_or_b32 v17, v30, 8, v29 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: 
buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:27 +; CHECK-NEXT: s_waitcnt vmcnt(4) +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v18, v31, 8, v21 -; CHECK-NEXT: v_lshl_or_b32 v7, v13, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v19, v2, 8, v32 -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: v_lshl_or_b32 v5, v9, 16, v6 -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v6, v14, 16, v15 -; CHECK-NEXT: v_lshl_or_b32 v9, v17, 16, v16 -; CHECK-NEXT: v_lshl_or_b32 v8, v19, 16, v18 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 -; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:23 +; CHECK-NEXT: global_store_dwordx2 v[0:1], v[7:8], off offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx2 v[0:1], v[9:10], off offset:23 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false) @@ -4175,81 +1955,19 @@ define void @memcpy_p1_p5_sz32_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p5_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 
offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: v_lshl_or_b32 v4, v4, 8, v3 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: v_lshl_or_b32 v5, v6, 8, v5 
-; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: v_lshl_or_b32 v7, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v8, v10, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v10, v12, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v11, v14, 8, v13 -; CHECK-NEXT: v_lshl_or_b32 v3, v8, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v6, v16, 8, v15 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v9, v18, 8, v17 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: v_lshl_or_b32 v13, v21, 8, v20 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: v_lshl_or_b32 v15, v23, 8, v22 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v12, v19, 8, v25 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v14, v26, 8, v24 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v16, v28, 8, v27 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v17, v30, 8, v29 -; CHECK-NEXT: v_lshl_or_b32 v7, v13, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v18, v32, 8, v31 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v19, v2, 8, v33 -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: v_lshl_or_b32 v5, v9, 16, v6 -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v6, v14, 16, v15 -; CHECK-NEXT: v_lshl_or_b32 v9, v17, 16, v16 -; 
CHECK-NEXT: v_lshl_or_b32 v8, v19, 16, v18 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 32, i1 false) @@ -4260,24 +1978,13 @@ define void @memcpy_p1_p5_sz16_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p5_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v3, v6, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v5, v8, 16, v7 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v4, v10, 16, v9 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; CHECK-NEXT: global_store_dwordx4 v[0:1], 
v[3:6], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 16, i1 false) @@ -4288,52 +1995,21 @@ define void @memcpy_p1_p5_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p5_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v19, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v20, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v21, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v22, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v3, v6, 16, v3 -; CHECK-NEXT: s_waitcnt 
vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v4, v8, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v5, v10, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v10, v12, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v9, v14, 8, v13 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v11, v16, 8, v15 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:27 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v12, v18, 8, v17 -; CHECK-NEXT: v_lshl_or_b32 v9, v9, 16, v10 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v7, v20, 16, v19 +; CHECK-NEXT: global_store_dwordx2 v[0:1], v[7:8], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v6, v22, 16, v21 -; CHECK-NEXT: v_lshl_or_b32 v8, v12, 16, v11 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 -; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:23 +; CHECK-NEXT: global_store_dwordx2 v[0:1], v[9:10], off offset:23 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false) @@ -4344,41 +2020,19 @@ define void @memcpy_p1_p5_sz32_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p5_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry 
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v12, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v13, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v14, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v15, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v16, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v17, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v18, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v3, v6, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v4, v8, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v5, v10, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v6, v12, 16, v11 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: 
buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v7, v14, 16, v13 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v9, v16, 16, v15 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v8, v18, 16, v17 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 32, i1 false) @@ -4406,49 +2060,18 @@ define void @memcpy_p1_p5_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p5_sz31_align_8_8: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen 
offset:16 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:18 +; CHECK-NEXT: s_clause 0x7 ; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v7, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v2, v10, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v11, v12, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v12, v14, 8, v13 -; CHECK-NEXT: v_lshl_or_b32 v10, v2, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v8, v16, 8, v15 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v9, v18, 8, v17 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v13, v20, 8, v19 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:27 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v14, v22, 8, v21 -; CHECK-NEXT: v_lshl_or_b32 v9, v9, 16, v8 -; CHECK-NEXT: v_lshl_or_b32 v8, v12, 16, v11 -; CHECK-NEXT: v_lshl_or_b32 v7, v14, 16, v13 -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:15 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -4500,49 +2123,18 @@ define void @memcpy_p1_p5_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr a ; CHECK-LABEL: memcpy_p1_p5_sz31_align_16_16: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 
0x13 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:18 +; CHECK-NEXT: s_clause 0x7 ; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v7, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v2, v10, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v11, v12, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v12, v14, 8, v13 -; CHECK-NEXT: v_lshl_or_b32 v10, v2, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v8, v16, 8, v15 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v9, v18, 8, v17 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v13, v20, 8, v19 +; CHECK-NEXT: 
buffer_load_dword v7, v2, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:27 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v14, v22, 8, v21 -; CHECK-NEXT: v_lshl_or_b32 v9, v9, 16, v8 -; CHECK-NEXT: v_lshl_or_b32 v8, v12, 16, v11 -; CHECK-NEXT: v_lshl_or_b32 v7, v14, 16, v13 -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:15 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -4577,41 +2169,8 @@ define void @memcpy_p3_p0_sz16_align_1_1(ptr addrspace(3) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p0_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 8, v9 -; CHECK-NEXT: v_lshl_or_b32 v3, v5, 8, v4 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; 
CHECK-NEXT: v_lshl_or_b32 v5, v10, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v7, v11, 8, v12 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v6, v13, 8, v14 -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 8, v16 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v9, v17, 8, v1 -; CHECK-NEXT: v_lshl_or_b32 v1, v5, 16, v4 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6 -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8 ; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -4624,80 +2183,16 @@ define void @memcpy_p3_p0_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p0_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:17 -; CHECK-NEXT: flat_load_ubyte v21, v[1:2] 
offset:16 -; CHECK-NEXT: flat_load_ubyte v22, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v23, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ubyte v24, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v25, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v26, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v27, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v28, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v29, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v30, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v31, v[1:2] -; CHECK-NEXT: flat_load_ubyte v32, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 8, v8 -; CHECK-NEXT: v_lshl_or_b32 v3, v5, 8, v4 -; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v5, v9, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v10, v10, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v12, v12, 8, v13 -; CHECK-NEXT: s_waitcnt vmcnt(19) lgkmcnt(19) -; CHECK-NEXT: v_lshl_or_b32 v4, v6, 8, v14 -; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v7, v14, 8, v15 -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v6, v16, 8, v17 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v9, v18, 8, v19 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v8, v20, 8, v21 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v13, v22, 8, v23 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v11, v24, 8, v25 -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v15, v26, 8, v27 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v14, v28, 8, v29 +; CHECK-NEXT: s_clause 0x2 +; CHECK-NEXT: 
flat_load_dwordx2 v[5:6], v[1:2] offset:23 +; CHECK-NEXT: flat_load_dwordx2 v[7:8], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v16, v30, 8, v31 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v17, v32, 8, v1 -; CHECK-NEXT: v_lshl_or_b32 v1, v5, 16, v4 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6 -; CHECK-NEXT: v_lshl_or_b32 v6, v10, 16, v12 -; CHECK-NEXT: v_lshl_or_b32 v5, v13, 16, v11 -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v14 -; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v16 -; CHECK-NEXT: ds_write_b64 v0, v[1:2] offset:23 -; CHECK-NEXT: ds_write_b64 v0, v[3:4] offset:16 -; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[5:6] offset1:1 +; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:23 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(2) +; CHECK-NEXT: ds_write_b64 v0, v[7:8] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(2) +; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -4709,79 +2204,13 @@ define void @memcpy_p3_p0_sz32_align_1_1(ptr addrspace(3) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p0_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:31 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte 
v15, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v21, v[1:2] offset:17 -; CHECK-NEXT: flat_load_ubyte v22, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v23, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v24, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ubyte v25, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v26, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v27, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v28, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v29, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v30, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v31, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v32, v[1:2] -; CHECK-NEXT: flat_load_ubyte v33, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 8, v9 -; CHECK-NEXT: v_lshl_or_b32 v3, v5, 8, v4 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v5, v10, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v11, v11, 8, v12 -; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v10, v13, 8, v14 -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v7, v15, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v6, v17, 8, v18 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v9, v19, 8, v20 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v8, v21, 8, v22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v13, v23, 8, v24 -; CHECK-NEXT: s_waitcnt 
vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v12, v25, 8, v26 -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v15, v27, 8, v28 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v14, v29, 8, v30 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v16, v31, 8, v32 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v17, v33, 8, v1 -; CHECK-NEXT: v_lshl_or_b32 v1, v5, 16, v4 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6 -; CHECK-NEXT: v_lshl_or_b32 v6, v11, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v5, v13, 16, v12 -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v14 -; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v16 -; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[1:2] offset0:2 offset1:3 -; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[5:6] offset1:1 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset0:2 offset1:3 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[9:10] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -4793,23 +2222,8 @@ define void @memcpy_p3_p0_sz16_align_2_2(ptr addrspace(3) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p0_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:10 -; CHECK-NEXT: s_waitcnt 
vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v1, v8, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v4, v9, 16, v5 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v3, v10, 16, v6 ; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -4822,51 +2236,16 @@ define void @memcpy_p3_p0_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p0_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ushort v11, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v12, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ushort v17, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ushort v18, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v19, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v20, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v21, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v22, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v7, v7, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v9 -; 
CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v1, v10, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v12 -; CHECK-NEXT: v_lshl_or_b32 v11, v5, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v10, v13, 8, v14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v9, v15, 8, v16 -; CHECK-NEXT: v_lshl_or_b32 v7, v11, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v3, v17, 16, v18 +; CHECK-NEXT: s_clause 0x2 +; CHECK-NEXT: flat_load_dwordx2 v[5:6], v[1:2] offset:23 +; CHECK-NEXT: flat_load_dwordx2 v[7:8], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v6, v19, 16, v20 -; CHECK-NEXT: v_lshl_or_b32 v8, v10, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v5, v21, 16, v22 -; CHECK-NEXT: ds_write_b64 v0, v[1:2] offset:16 -; CHECK-NEXT: ds_write2_b64 v0, v[5:6], v[3:4] offset1:1 -; CHECK-NEXT: ds_write_b64 v0, v[7:8] offset:23 +; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:23 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(2) +; CHECK-NEXT: ds_write_b64 v0, v[7:8] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(2) +; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -4878,40 +2257,13 @@ define void @memcpy_p3_p0_sz32_align_2_2(ptr addrspace(3) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p0_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ushort v8, 
v[1:2] offset:10 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ushort v11, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ushort v12, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ushort v13, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ushort v14, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ushort v15, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v16, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v17, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v18, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: v_lshl_or_b32 v6, v6, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v1, v5, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v5, v8, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v3, v13, 16, v14 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v16 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v18 -; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[1:2] offset0:2 offset1:3 -; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[5:6] offset1:1 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset0:2 offset1:3 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[9:10] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -4937,47 +2289,13 @@ define void @memcpy_p3_p0_sz31_align_8_8(ptr addrspace(3) align 8 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p0_sz31_align_8_8: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; CHECK-NEXT: s_clause 0x10 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:17 -; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v6, v6, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v9, v8, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v10, v10, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v12, v12, 8, v13 -; CHECK-NEXT: v_lshl_or_b32 v7, v6, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v13, v14, 8, v15 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v11, v16, 8, v17 -; CHECK-NEXT: v_lshl_or_b32 v8, v10, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v14, v18, 8, v19 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v5, v5, 8, v20 -; CHECK-NEXT: v_lshl_or_b32 v6, v13, 16, v11 -; CHECK-NEXT: v_lshl_or_b32 v5, v5, 16, v14 -; CHECK-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 -; CHECK-NEXT: ds_write_b128 v0, v[5:8] offset:15 +; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset1:1 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: ds_write_b128 v0, v[7:10] offset:15 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5021,47 +2339,13 @@ define void @memcpy_p3_p0_sz31_align_16_16(ptr addrspace(3) align 16 %dst, ptr a ; CHECK-LABEL: memcpy_p3_p0_sz31_align_16_16: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x10 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:17 -; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v6, v6, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v9, v8, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v10, v10, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v12, v12, 8, v13 -; CHECK-NEXT: v_lshl_or_b32 v7, v6, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v13, 
v14, 8, v15 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v11, v16, 8, v17 -; CHECK-NEXT: v_lshl_or_b32 v8, v10, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v14, v18, 8, v19 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:15 +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v5, v5, 8, v20 -; CHECK-NEXT: v_lshl_or_b32 v6, v13, 16, v11 -; CHECK-NEXT: v_lshl_or_b32 v5, v5, 16, v14 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_write_b128 v0, v[1:4] -; CHECK-NEXT: ds_write_b128 v0, v[5:8] offset:15 +; CHECK-NEXT: ds_write_b128 v0, v[3:6] offset:15 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: ds_write_b128 v0, v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5701,44 +2985,13 @@ define void @memcpy_p3_p5_sz16_align_1_1(ptr addrspace(3) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p5_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v14, v1, 
s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v3, v3, 8, v2 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v4, v5, 8, v4 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v6, v7, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v7, v9, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v5, v11, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v8, v13, 8, v12 -; CHECK-NEXT: v_lshl_or_b32 v2, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v9, v15, 8, v14 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v10, v1, 8, v16 -; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; CHECK-NEXT: v_lshl_or_b32 v4, v8, 16, v5 -; CHECK-NEXT: v_lshl_or_b32 v3, v10, 16, v9 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5750,81 +3003,21 @@ define void @memcpy_p3_p5_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p5_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v5, v1, 
s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: v_lshl_or_b32 v3, v3, 8, v2 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: v_lshl_or_b32 v4, v5, 
8, v4 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: v_lshl_or_b32 v6, v7, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: v_lshl_or_b32 v7, v9, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: v_lshl_or_b32 v9, v11, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: v_lshl_or_b32 v10, v13, 8, v12 -; CHECK-NEXT: v_lshl_or_b32 v2, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: v_lshl_or_b32 v5, v15, 8, v14 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: v_lshl_or_b32 v8, v17, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v12, v20, 8, v19 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v14, v22, 8, v21 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v11, v18, 8, v24 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: v_lshl_or_b32 v13, v25, 8, v23 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: v_lshl_or_b32 v15, v27, 8, v26 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: v_lshl_or_b32 v16, v29, 8, v28 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:27 +; CHECK-NEXT: s_waitcnt vmcnt(4) +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v17, v30, 8, v20 -; CHECK-NEXT: v_lshl_or_b32 v6, v12, 16, v11 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v18, v1, 8, v31 -; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; CHECK-NEXT: v_lshl_or_b32 v4, v8, 16, v5 -; CHECK-NEXT: v_lshl_or_b32 v3, v10, 16, v9 -; 
CHECK-NEXT: v_lshl_or_b32 v5, v13, 16, v14 -; CHECK-NEXT: v_lshl_or_b32 v8, v16, 16, v15 -; CHECK-NEXT: v_lshl_or_b32 v7, v18, 16, v17 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 -; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:16 -; CHECK-NEXT: ds_write_b64 v0, v[7:8] offset:23 +; CHECK-NEXT: ds_write_b64 v0, v[6:7] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ds_write_b64 v0, v[8:9] offset:23 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5836,81 +3029,19 @@ define void @memcpy_p3_p5_sz32_align_1_1(ptr addrspace(3) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p5_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v20, v1, 
s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: v_lshl_or_b32 v3, v3, 8, v2 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: v_lshl_or_b32 v4, v5, 8, v4 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: v_lshl_or_b32 v6, v7, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v7, v9, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v9, v11, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v10, v13, 8, v12 -; CHECK-NEXT: v_lshl_or_b32 v2, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v5, v15, 8, v14 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v8, v17, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: v_lshl_or_b32 v12, v20, 8, v19 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: v_lshl_or_b32 v14, v22, 8, v21 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v11, v18, 8, v24 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v13, v25, 8, v23 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v15, v27, 8, v26 +; 
CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v16, v29, 8, v28 -; CHECK-NEXT: v_lshl_or_b32 v6, v12, 16, v11 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v17, v31, 8, v30 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v18, v1, 8, v32 -; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; CHECK-NEXT: v_lshl_or_b32 v4, v8, 16, v5 -; CHECK-NEXT: v_lshl_or_b32 v3, v10, 16, v9 -; CHECK-NEXT: v_lshl_or_b32 v5, v13, 16, v14 -; CHECK-NEXT: v_lshl_or_b32 v8, v16, 16, v15 -; CHECK-NEXT: v_lshl_or_b32 v7, v18, 16, v17 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 -; CHECK-NEXT: ds_write2_b64 v0, v[5:6], v[7:8] offset0:2 offset1:3 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ds_write2_b64 v0, v[6:7], v[8:9] offset0:2 offset1:3 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5922,24 +3053,13 @@ define void @memcpy_p3_p5_sz16_align_2_2(ptr addrspace(3) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p5_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: 
buffer_load_ushort v6, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5951,52 +3071,21 @@ define void @memcpy_p3_p5_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p5_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: 
buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v18, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v19, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v20, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v21, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v3, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v4, v9, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v9, v11, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v8, v13, 8, v12 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v10, v15, 8, v14 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:27 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v11, v17, 8, v16 -; CHECK-NEXT: v_lshl_or_b32 v8, v8, 16, v9 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; 
CHECK-NEXT: v_lshl_or_b32 v6, v19, 16, v18 +; CHECK-NEXT: ds_write_b64 v0, v[6:7] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v5, v21, 16, v20 -; CHECK-NEXT: v_lshl_or_b32 v7, v11, 16, v10 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 -; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:16 -; CHECK-NEXT: ds_write_b64 v0, v[7:8] offset:23 +; CHECK-NEXT: ds_write_b64 v0, v[8:9] offset:23 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -6008,41 +3097,19 @@ define void @memcpy_p3_p5_sz32_align_2_2(ptr addrspace(3) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p5_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v10, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v11, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v12, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v13, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v14, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v15, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v16, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v17, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v2 -; 
CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v3, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v4, v9, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v5, v11, 16, v10 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v6, v13, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v14 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v16 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 -; CHECK-NEXT: ds_write2_b64 v0, v[5:6], v[7:8] offset0:2 offset1:3 +; CHECK-NEXT: ds_write2_b64 v0, v[6:7], v[8:9] offset0:2 offset1:3 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -6072,50 +3139,19 @@ define void @memcpy_p3_p5_sz31_align_8_8(ptr addrspace(3) align 8 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p5_sz31_align_8_8: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 
offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 8, v2 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v1, v9, 8, v4 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v10, v11, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v11, v13, 8, v12 -; CHECK-NEXT: v_lshl_or_b32 v4, v1, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v3, v15, 8, v14 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v9, v17, 8, v16 -; CHECK-NEXT: v_lshl_or_b32 v2, v11, 16, v10 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v12, v19, 8, v18 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 
0 offen offset:19 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:27 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v13, v21, 8, v20 -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v3 -; CHECK-NEXT: v_lshl_or_b32 v1, v13, 16, v12 +; CHECK-NEXT: ds_write2_b64 v0, v[6:7], v[8:9] offset1:1 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: ds_write2_b64 v0, v[5:6], v[7:8] offset1:1 -; CHECK-NEXT: ds_write_b128 v0, v[1:4] offset:15 +; CHECK-NEXT: ds_write_b128 v0, v[2:5] offset:15 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -6169,49 +3205,18 @@ define void @memcpy_p3_p5_sz31_align_16_16(ptr addrspace(3) align 16 %dst, ptr a ; CHECK-LABEL: memcpy_p3_p5_sz31_align_16_16: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:18 +; 
CHECK-NEXT: s_clause 0x7 ; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v6, v7, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v1, v9, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v10, v11, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v11, v13, 8, v12 -; CHECK-NEXT: v_lshl_or_b32 v9, v1, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v7, v15, 8, v14 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v8, v17, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v12, v19, 8, v18 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:27 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v13, v21, 8, v20 -; CHECK-NEXT: v_lshl_or_b32 v8, v8, 16, v7 -; CHECK-NEXT: v_lshl_or_b32 v7, v11, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v6, v13, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: ds_write_b128 v0, v[2:5] +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: ds_write_b128 v0, v[6:9] offset:15 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -6248,55 +3253,12 @@ define void @memcpy_p5_p0_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p0_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] 
offset:13 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte 
v14, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 16, i1 false) @@ -6307,101 +3269,24 @@ define void @memcpy_p5_p0_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p0_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xe -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] 
-; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte 
v8, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:17 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:30 +; 
CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[5:7], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:18 +; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:17 +; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 31, i1 false) @@ -6412,104 +3297,19 @@ define void @memcpy_p5_p0_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p0_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ubyte v9, 
v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; 
CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:31 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:17 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; 
CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:18 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:17 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 32, i1 false) @@ -6520,31 +3320,12 @@ define void @memcpy_p5_p0_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p0_sz16_align_2_2: ; CHECK: ; %bb.0: ; 
%entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v1, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 16, i1 false) @@ -6555,55 +3336,24 @@ define void @memcpy_p5_p0_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; 
CHECK-LABEL: memcpy_p5_p0_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ushort v11, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v12, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ushort v13, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ushort v14, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v15, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v16, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v17, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v1, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, 
s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:30 +; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[5:7], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 31, i1 false) @@ -6614,55 +3364,19 @@ define void @memcpy_p5_p0_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p0_sz32_align_2_2: ; CHECK: ; 
%bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ushort v11, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v12, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ushort v13, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ushort v14, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v15, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v16, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v17, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v1, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt 
vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 32, i1 false) @@ -6689,61 +3403,19 @@ define void @memcpy_p5_p0_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p0_sz31_align_8_8: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x10 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:17 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:18 -; CHECK-NEXT: 
flat_load_ubyte v7, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:28 -; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v15, v0, 
s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:15 +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(0) noundef nonnull align 8 %src, i64 31, i1 false) @@ -6793,61 +3465,19 @@ define void @memcpy_p5_p0_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr a ; CHECK-LABEL: memcpy_p5_p0_sz31_align_16_16: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
CHECK-NEXT: s_clause 0x10 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:17 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:28 -; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; 
CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:15 +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 31, i1 false) @@ -6881,55 +3511,12 @@ define void @memcpy_p5_p1_sz16_align_1_1(ptr addrspace(5) align 1 
%dst, ptr addr ; CHECK-LABEL: memcpy_p5_p1_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:15 -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 
offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 16, i1 false) @@ -6940,207 +3527,47 @@ define void @memcpy_p5_p1_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p1_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:8 -; 
CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v18, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ubyte v19, v[1:2], off offset:29 -; CHECK-NEXT: global_load_ubyte v20, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ubyte v21, v[1:2], off offset:27 -; CHECK-NEXT: global_load_ubyte v22, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ubyte v23, v[1:2], off offset:25 -; CHECK-NEXT: global_load_ubyte v24, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ubyte v25, v[1:2], off offset:23 -; CHECK-NEXT: global_load_ubyte v26, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ubyte v27, v[1:2], off offset:21 -; CHECK-NEXT: global_load_ubyte v28, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ubyte v29, v[1:2], off offset:19 -; CHECK-NEXT: global_load_ubyte v30, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ubyte v31, v[1:2], off offset:17 -; CHECK-NEXT: global_load_ubyte v32, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 
offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(4) 
-; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16 +; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28 +; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:18 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:17 +; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 31, i1 false) ret void -} - -define void @memcpy_p5_p1_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(1) align 1 readonly %src) { -; CHECK-LABEL: memcpy_p5_p1_sz32_align_1_1: -; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:15 -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:13 -; 
CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v18, v[1:2], off -; CHECK-NEXT: global_load_ubyte v19, v[1:2], off offset:31 -; CHECK-NEXT: global_load_ubyte v20, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ubyte v21, v[1:2], off offset:29 -; CHECK-NEXT: global_load_ubyte v22, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ubyte v23, v[1:2], off offset:27 -; CHECK-NEXT: global_load_ubyte v24, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ubyte v25, v[1:2], off offset:25 -; CHECK-NEXT: global_load_ubyte v26, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ubyte v27, v[1:2], off offset:23 -; CHECK-NEXT: global_load_ubyte v28, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ubyte v29, v[1:2], off offset:21 -; CHECK-NEXT: global_load_ubyte v30, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ubyte v31, v[1:2], off offset:19 -; CHECK-NEXT: global_load_ubyte v32, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ubyte v33, v[1:2], off offset:17 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v5, v0, 
s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; 
CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:18 +} + +define void @memcpy_p5_p1_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(1) align 1 readonly %src) { +; CHECK-LABEL: memcpy_p5_p1_sz32_align_1_1: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off +; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v33, v0, s[0:3], 0 offen offset:17 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p1.i64(ptr 
addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 32, i1 false) @@ -7151,31 +3578,12 @@ define void @memcpy_p5_p1_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p1_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: global_load_ushort v3, v[1:2], off -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:14 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call 
void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 16, i1 false) @@ -7186,55 +3594,24 @@ define void @memcpy_p5_p1_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p1_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ushort v11, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ushort v12, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v13, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v14, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v15, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v16, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v17, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt 
vmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16 +; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28 +; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) 
noundef nonnull align 2 %src, i64 31, i1 false) @@ -7245,55 +3622,19 @@ define void @memcpy_p5_p1_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p1_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ushort v3, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ushort v11, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ushort v12, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v13, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v14, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v15, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v16, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v17, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: 
s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off +; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 32, i1 false) @@ -7329,30 +3670,10 @@ define void @memcpy_p5_p1_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr addr ; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen ; 
CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_store_byte_d16_hi v10, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v1, 24, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 8, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 24, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 24, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 8, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 24, v10 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 8, v10 -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:15 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(1) noundef nonnull align 8 %src, i64 31, i1 false) @@ -7411,30 +3732,10 @@ define void 
@memcpy_p5_p1_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr a ; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_store_byte_d16_hi v10, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v1, 24, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 8, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 24, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 24, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 8, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 24, v10 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 8, v10 -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:15 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail 
call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i64 31, i1 false) @@ -7468,54 +3769,12 @@ define void @memcpy_p5_p3_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p3_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v2, v1 offset:15 -; CHECK-NEXT: ds_read_u8 v3, v1 offset:14 -; CHECK-NEXT: ds_read_u8 v4, v1 offset:13 -; CHECK-NEXT: ds_read_u8 v5, v1 offset:12 -; CHECK-NEXT: ds_read_u8 v6, v1 offset:11 -; CHECK-NEXT: ds_read_u8 v7, v1 offset:10 -; CHECK-NEXT: ds_read_u8 v8, v1 offset:9 -; CHECK-NEXT: ds_read_u8 v9, v1 offset:8 -; CHECK-NEXT: ds_read_u8 v10, v1 offset:7 -; CHECK-NEXT: ds_read_u8 v11, v1 offset:6 -; CHECK-NEXT: ds_read_u8 v12, v1 offset:5 -; CHECK-NEXT: ds_read_u8 v13, v1 offset:4 -; CHECK-NEXT: ds_read_u8 v14, v1 offset:3 -; CHECK-NEXT: ds_read_u8 v15, v1 offset:2 -; CHECK-NEXT: ds_read_u8 v16, v1 offset:1 -; CHECK-NEXT: ds_read_u8 v1, v1 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: 
s_waitcnt lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: ds_read2_b64 v[1:4], v1 offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 16, i1 false) @@ -7526,85 +3785,25 @@ define void @memcpy_p5_p3_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p3_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v2, v1 -; CHECK-NEXT: ds_read_u8 v3, v1 offset:1 -; CHECK-NEXT: ds_read_u8 v4, v1 offset:2 -; CHECK-NEXT: ds_read_u8 v5, v1 offset:3 -; CHECK-NEXT: ds_read_u8 v6, v1 offset:4 -; CHECK-NEXT: ds_read_u8 v7, v1 offset:5 -; CHECK-NEXT: ds_read_u8 v8, v1 offset:6 -; CHECK-NEXT: ds_read_u8 v9, v1 offset:7 -; CHECK-NEXT: ds_read_u8 v10, v1 offset:8 -; CHECK-NEXT: ds_read_u8 v11, v1 offset:9 -; CHECK-NEXT: ds_read_u8 v12, v1 offset:10 -; CHECK-NEXT: ds_read_u8 v13, v1 offset:11 -; CHECK-NEXT: ds_read_u8 v14, v1 offset:12 -; CHECK-NEXT: ds_read_u8 
v15, v1 offset:13 -; CHECK-NEXT: ds_read_u8 v16, v1 offset:14 -; CHECK-NEXT: ds_read_u8 v17, v1 offset:15 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: ds_read_u8 v2, v1 offset:24 -; CHECK-NEXT: ds_read_u8 v3, v1 offset:25 -; CHECK-NEXT: ds_read_u8 v4, v1 offset:26 -; CHECK-NEXT: ds_read_u8 v18, v1 offset:27 -; CHECK-NEXT: ds_read_u8 v19, v1 offset:28 -; CHECK-NEXT: ds_read_u8 v20, v1 offset:29 -; CHECK-NEXT: ds_read_u8 v21, v1 offset:30 -; CHECK-NEXT: ds_read_u8 v22, v1 offset:16 -; CHECK-NEXT: ds_read_u8 v23, v1 offset:17 -; CHECK-NEXT: ds_read_u8 v24, v1 offset:18 -; CHECK-NEXT: ds_read_u8 v25, v1 offset:19 -; CHECK-NEXT: ds_read_u8 v26, v1 offset:20 -; CHECK-NEXT: ds_read_u8 v27, v1 offset:21 -; CHECK-NEXT: ds_read_u8 v28, v1 offset:22 -; CHECK-NEXT: ds_read_u8 v1, v1 offset:23 -; CHECK-NEXT: s_waitcnt lgkmcnt(27) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt lgkmcnt(26) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(25) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt lgkmcnt(24) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(23) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(22) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(21) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt lgkmcnt(20) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(19) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:11 -; 
CHECK-NEXT: s_waitcnt lgkmcnt(18) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt lgkmcnt(16) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:15 +; CHECK-NEXT: ds_read_b32 v8, v1 offset:24 +; CHECK-NEXT: ds_read_u16 v9, v1 offset:28 +; CHECK-NEXT: ds_read_u8 v10, v1 offset:30 +; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1 +; CHECK-NEXT: ds_read_b64 v[6:7], v1 offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(4) +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: s_waitcnt lgkmcnt(2) +; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen 
offset:30 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 31, i1 false) @@ -7615,79 +3814,18 @@ define void @memcpy_p5_p3_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p3_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v2, v1 offset:15 -; CHECK-NEXT: ds_read_u8 v3, v1 offset:14 -; CHECK-NEXT: ds_read_u8 v4, v1 offset:13 -; CHECK-NEXT: ds_read_u8 v5, v1 offset:12 -; CHECK-NEXT: ds_read_u8 v6, v1 offset:11 -; CHECK-NEXT: ds_read_u8 v7, v1 offset:8 -; CHECK-NEXT: ds_read_u8 v8, v1 offset:9 -; CHECK-NEXT: ds_read_u8 v9, v1 offset:10 -; CHECK-NEXT: ds_read_u8 v10, v1 -; CHECK-NEXT: ds_read_u8 v11, v1 offset:1 -; CHECK-NEXT: ds_read_u8 v12, v1 offset:2 -; CHECK-NEXT: ds_read_u8 v13, v1 offset:3 -; CHECK-NEXT: ds_read_u8 v14, v1 offset:4 -; CHECK-NEXT: ds_read_u8 v15, v1 offset:5 -; CHECK-NEXT: ds_read_u8 v16, v1 offset:6 -; CHECK-NEXT: ds_read_u8 v17, v1 offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:12 -; 
CHECK-NEXT: s_waitcnt lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: ds_read_u8 v2, v1 offset:24 -; CHECK-NEXT: ds_read_u8 v3, v1 offset:25 -; CHECK-NEXT: ds_read_u8 v4, v1 offset:26 -; CHECK-NEXT: ds_read_u8 v5, v1 offset:27 -; CHECK-NEXT: ds_read_u8 v6, v1 offset:28 -; CHECK-NEXT: ds_read_u8 v18, v1 offset:29 -; CHECK-NEXT: ds_read_u8 v19, v1 offset:30 -; CHECK-NEXT: ds_read_u8 v20, v1 offset:31 -; CHECK-NEXT: ds_read_u8 v21, v1 offset:16 -; CHECK-NEXT: ds_read_u8 v22, v1 offset:17 -; CHECK-NEXT: ds_read_u8 v23, v1 offset:18 -; CHECK-NEXT: ds_read_u8 v24, v1 offset:19 -; CHECK-NEXT: ds_read_u8 v25, v1 offset:20 -; CHECK-NEXT: ds_read_u8 v26, v1 offset:21 -; CHECK-NEXT: ds_read_u8 v27, v1 offset:22 -; CHECK-NEXT: ds_read_u8 v1, v1 offset:23 -; CHECK-NEXT: s_waitcnt lgkmcnt(24) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(16) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:31 -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: 
buffer_store_byte v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1 +; CHECK-NEXT: ds_read2_b64 v[6:9], v1 offset0:2 offset1:3 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 32, i1 false) @@ -7698,30 +3836,12 @@ define void @memcpy_p5_p3_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p3_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u16 v2, v1 -; CHECK-NEXT: ds_read_u16 v3, v1 offset:2 -; CHECK-NEXT: 
ds_read_u16 v4, v1 offset:4 -; CHECK-NEXT: ds_read_u16 v5, v1 offset:6 -; CHECK-NEXT: ds_read_u16 v6, v1 offset:8 -; CHECK-NEXT: ds_read_u16 v7, v1 offset:10 -; CHECK-NEXT: ds_read_u16 v8, v1 offset:12 -; CHECK-NEXT: ds_read_u16 v1, v1 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: ds_read2_b64 v[1:4], v1 offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:14 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 16, i1 false) @@ -7732,54 +3852,25 @@ define void @memcpy_p5_p3_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p3_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v2, v1 offset:30 -; CHECK-NEXT: ds_read_u16 v3, v1 offset:28 -; CHECK-NEXT: ds_read_u16 v4, v1 offset:26 -; CHECK-NEXT: ds_read_u16 v5, v1 offset:24 -; CHECK-NEXT: ds_read_u16 v6, 
v1 offset:22 -; CHECK-NEXT: ds_read_u16 v7, v1 offset:20 -; CHECK-NEXT: ds_read_u16 v8, v1 offset:18 -; CHECK-NEXT: ds_read_u16 v9, v1 offset:16 -; CHECK-NEXT: ds_read_u16 v10, v1 offset:14 -; CHECK-NEXT: ds_read_u16 v11, v1 offset:12 -; CHECK-NEXT: ds_read_u16 v12, v1 offset:10 -; CHECK-NEXT: ds_read_u16 v13, v1 offset:8 -; CHECK-NEXT: ds_read_u16 v14, v1 offset:6 -; CHECK-NEXT: ds_read_u16 v15, v1 offset:4 -; CHECK-NEXT: ds_read_u16 v16, v1 offset:2 -; CHECK-NEXT: ds_read_u16 v1, v1 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(14) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(11) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt lgkmcnt(10) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt lgkmcnt(9) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt lgkmcnt(8) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:10 +; CHECK-NEXT: ds_read_b32 v8, v1 offset:24 +; CHECK-NEXT: ds_read_u16 v9, v1 offset:28 +; CHECK-NEXT: ds_read_u8 v10, v1 offset:30 +; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1 +; CHECK-NEXT: ds_read_b64 v[6:7], v1 offset:16 ; CHECK-NEXT: s_waitcnt lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: 
buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 31, i1 false) @@ -7790,54 +3881,18 @@ define void @memcpy_p5_p3_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p3_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u16 v2, v1 offset:30 -; CHECK-NEXT: ds_read_u16 v3, v1 offset:28 -; CHECK-NEXT: ds_read_u16 v4, v1 offset:26 -; CHECK-NEXT: ds_read_u16 v5, v1 offset:24 -; CHECK-NEXT: ds_read_u16 v6, v1 offset:22 -; CHECK-NEXT: ds_read_u16 v7, v1 offset:20 -; CHECK-NEXT: ds_read_u16 v8, v1 offset:18 -; CHECK-NEXT: ds_read_u16 v9, v1 offset:16 -; CHECK-NEXT: ds_read_u16 v10, v1 offset:14 -; CHECK-NEXT: ds_read_u16 v11, v1 offset:12 -; CHECK-NEXT: ds_read_u16 v12, v1 offset:10 -; CHECK-NEXT: ds_read_u16 v13, v1 offset:8 -; CHECK-NEXT: ds_read_u16 v14, v1 offset:6 -; CHECK-NEXT: 
ds_read_u16 v15, v1 offset:4 -; CHECK-NEXT: ds_read_u16 v16, v1 offset:2 -; CHECK-NEXT: ds_read_u16 v1, v1 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(14) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(11) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt lgkmcnt(10) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt lgkmcnt(9) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt lgkmcnt(8) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1 +; CHECK-NEXT: ds_read2_b64 v[6:9], v1 offset0:2 offset1:3 ; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: 
buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 32, i1 false) @@ -7872,30 +3927,10 @@ define void @memcpy_p5_p3_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr addr ; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte_d16_hi v6, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v1, 24, v6 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 8, v6 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 24, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 24, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 8, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 24, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 8, v9 -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: 
buffer_store_byte v4, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:15 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(3) noundef nonnull align 8 %src, i64 31, i1 false) @@ -7952,30 +3987,10 @@ define void @memcpy_p5_p3_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr a ; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte_d16_hi v6, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v1, 24, v6 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 8, v6 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 24, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 24, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 8, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 24, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 8, v9 -; CHECK-NEXT: buffer_store_byte v1, v0, 
s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:15 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(3) noundef nonnull align 16 %src, i64 31, i1 false) @@ -8008,55 +4023,12 @@ define void @memcpy_p5_p4_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p4_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:15 -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:3 -; CHECK-NEXT: 
global_load_ubyte v16, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen 
offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 16, i1 false) @@ -8067,100 +4039,24 @@ define void @memcpy_p5_p4_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p4_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v18, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ubyte v19, v[1:2], off offset:29 -; CHECK-NEXT: global_load_ubyte v20, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ubyte v21, v[1:2], off offset:27 -; CHECK-NEXT: global_load_ubyte v22, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ubyte v23, v[1:2], off offset:25 -; CHECK-NEXT: global_load_ubyte v24, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ubyte v25, v[1:2], off offset:23 -; CHECK-NEXT: global_load_ubyte v26, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ubyte v27, v[1:2], off 
offset:21 -; CHECK-NEXT: global_load_ubyte v28, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ubyte v29, v[1:2], off offset:19 -; CHECK-NEXT: global_load_ubyte v30, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ubyte v31, v[1:2], off offset:17 -; CHECK-NEXT: global_load_ubyte v32, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:30 -; 
CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16 +; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28 +; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:18 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:17 +; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:15 +; 
CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 31, i1 false) @@ -8171,103 +4067,19 @@ define void @memcpy_p5_p4_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p4_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:15 -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v18, v[1:2], off -; CHECK-NEXT: global_load_ubyte v19, v[1:2], off offset:31 -; CHECK-NEXT: global_load_ubyte v20, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ubyte v21, v[1:2], off offset:29 -; CHECK-NEXT: global_load_ubyte v22, 
v[1:2], off offset:28 -; CHECK-NEXT: global_load_ubyte v23, v[1:2], off offset:27 -; CHECK-NEXT: global_load_ubyte v24, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ubyte v25, v[1:2], off offset:25 -; CHECK-NEXT: global_load_ubyte v26, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ubyte v27, v[1:2], off offset:23 -; CHECK-NEXT: global_load_ubyte v28, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ubyte v29, v[1:2], off offset:21 -; CHECK-NEXT: global_load_ubyte v30, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ubyte v31, v[1:2], off offset:19 -; CHECK-NEXT: global_load_ubyte v32, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ubyte v33, v[1:2], off offset:17 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(19) 
-; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:18 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off +; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: 
buffer_store_byte v33, v0, s[0:3], 0 offen offset:17 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 32, i1 false) @@ -8278,31 +4090,12 @@ define void @memcpy_p5_p4_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p4_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: global_load_ushort v3, v[1:2], off -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:6 -; 
CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:14 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 16, i1 false) @@ -8313,55 +4106,24 @@ define void @memcpy_p5_p4_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p4_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ushort v11, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ushort v12, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v13, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v14, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v15, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v16, v[1:2], 
off offset:4 -; CHECK-NEXT: global_load_ushort v17, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16 +; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28 +; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; 
CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 31, i1 false) @@ -8372,55 +4134,19 @@ define void @memcpy_p5_p4_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p4_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ushort v3, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ushort v11, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ushort v12, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v13, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v14, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v15, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v16, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v17, v[1:2], off offset:2 -; CHECK-NEXT: 
global_load_ushort v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off +; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: 
s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 32, i1 false) @@ -8456,30 +4182,10 @@ define void @memcpy_p5_p4_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr addr ; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_store_byte_d16_hi v10, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v1, 24, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 8, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 24, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 24, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 8, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 24, v10 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 8, v10 -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:20 -; 
CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:15 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(4) noundef nonnull align 8 %src, i64 31, i1 false) @@ -8538,30 +4244,10 @@ define void @memcpy_p5_p4_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr a ; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_store_byte_d16_hi v10, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v1, 24, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 8, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 24, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 24, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 8, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 24, v10 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 8, v10 -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: 
buffer_store_byte v2, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:15 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(4) noundef nonnull align 16 %src, i64 31, i1 false) @@ -8595,55 +4281,19 @@ define void @memcpy_p5_p5_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p5_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: 
buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:3 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 
offen offset:2 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 16, i1 false) @@ -8654,99 +4304,34 @@ define void @memcpy_p5_p5_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p5_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x11 -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: 
buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen -; CHECK-NEXT: s_clause 0xc -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:25 -; 
CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:26 +; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:25 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen 
offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:22 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:21 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false) @@ -8757,103 +4342,31 @@ define void @memcpy_p5_p5_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p5_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x11 -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte 
v6, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 
offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen -; CHECK-NEXT: s_clause 0xd -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:28 -; 
CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:26 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:25 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:22 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:21 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, 
s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 32, i1 false) @@ -8864,31 +4377,19 @@ define void @memcpy_p5_p5_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p5_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:8 
; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 16, i1 false) @@ -8899,55 +4400,34 @@ define void @memcpy_p5_p5_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p5_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v10, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v11, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v12, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v13, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v14, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v15, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v16, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v1, v1, 
s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:18 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:14 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:10 +; CHECK-NEXT: buffer_store_dword 
v5, v0, s[0:3], 0 offen offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false) @@ -8958,55 +4438,31 @@ define void @memcpy_p5_p5_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p5_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v10, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v11, v1, s[0:3], 0 offen offset:12 -; 
CHECK-NEXT: buffer_load_ushort v12, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v13, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v14, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v15, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v16, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:14 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: 
buffer_store_short v11, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:10 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 32, i1 false) @@ -9040,67 +4496,31 @@ define void @memcpy_p5_p5_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p5_sz31_align_8_8: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: 
buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_dword v16, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_dword v17, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_dword v18, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_dword v19, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: 
buffer_store_byte v12, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:29 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:19 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:23 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:27 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: 
buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(5) noundef nonnull align 8 %src, i64 31, i1 false) @@ -9169,67 +4589,31 @@ define void @memcpy_p5_p5_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr a ; CHECK-LABEL: memcpy_p5_p5_sz31_align_16_16: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_dword v16, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_dword v17, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_dword v18, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_dword v19, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: 
buffer_store_byte v3, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:29 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:19 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; 
CHECK-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:23 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:27 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(5) noundef nonnull align 16 %src, i64 31, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll b/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll index cc5256620bfe08..4e5688adcd6bbd 100644 --- a/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll +++ b/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll @@ -13,55 +13,9 @@ define void @memmove_p0_p0_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p0_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] 
offset:10 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1 -; 
CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -73,100 +27,19 @@ define void @memmove_p0_p0_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p0_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:17 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v22, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v23, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v24, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ubyte v25, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v26, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v27, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v28, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v29, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v30, v[2:3] offset:4 -; CHECK-NEXT: 
flat_load_ubyte v31, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v32, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ubyte v33, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(30) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:13 -; CHECK-NEXT: 
s_waitcnt vmcnt(12) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v33 offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:30 +; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[6:8], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3) +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -178,103 +51,13 @@ define void 
@memmove_p0_p0_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p0_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:31 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:17 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v22, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v23, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v24, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v25, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ubyte v26, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v27, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v28, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v29, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v30, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v31, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v32, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v33, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ubyte v34, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(31) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:31 -; 
CHECK-NEXT: s_waitcnt vmcnt(30) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:11 -; 
CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v33 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v34 offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -286,31 +69,9 @@ define void @memmove_p0_p0_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p0_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:6 -; 
CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v2, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -322,55 +83,19 @@ define void @memmove_p0_p0_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p0_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ushort v11, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ushort v12, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v13, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v14, v[2:3] offset:10 -; CHECK-NEXT: 
flat_load_ushort v15, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v16, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v17, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v18, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v2, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:30 +; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[6:8], 
v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3) +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -382,55 +107,13 @@ define void @memmove_p0_p0_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p0_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ushort v11, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ushort v12, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v13, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v14, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ushort v15, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v16, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v17, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v18, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v2, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: 
flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -559,55 +242,9 @@ define void @memmove_p0_p1_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p1_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:15 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off 
offset:14 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: 
flat_store_byte v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -619,100 +256,19 @@ define void @memmove_p0_p1_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p1_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:29 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:27 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:25 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:23 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:21 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:19 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:17 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ubyte v19, v[2:3], off offset:15 -; CHECK-NEXT: global_load_ubyte v20, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ubyte v21, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v22, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v23, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v24, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ubyte v25, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v26, v[2:3], 
off offset:8 -; CHECK-NEXT: global_load_ubyte v27, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v28, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v29, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v30, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ubyte v31, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v32, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v33, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) 
-; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:4 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:30 +; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28 +; CHECK-NEXT: global_load_dwordx3 v[6:8], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:3 +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:2 +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v33 offset:1 +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -724,103 +280,13 @@ define void @memmove_p0_p1_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p1_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: 
global_load_ubyte v4, v[2:3], off offset:31 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:29 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:27 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:25 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:23 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:21 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:19 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:17 -; CHECK-NEXT: global_load_ubyte v19, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ubyte v20, v[2:3], off offset:15 -; CHECK-NEXT: global_load_ubyte v21, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ubyte v22, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v23, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v24, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v25, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ubyte v26, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v27, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ubyte v28, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v29, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v30, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v31, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ubyte v32, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v33, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v34, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:31 -; CHECK-NEXT: s_waitcnt 
vmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:8 -; 
CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v33 offset:2 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v34 offset:1 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -832,31 +298,9 @@ define void @memmove_p0_p1_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p1_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:10 -; 
CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:2 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -868,55 +312,19 @@ define void @memmove_p0_p1_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p1_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ushort v11, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ushort v12, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ushort v13, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v14, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v15, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v16, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ushort v17, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ushort v18, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: 
s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:30 +; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28 +; CHECK-NEXT: global_load_dwordx3 v[6:8], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -928,55 +336,13 @@ define void @memmove_p0_p1_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p1_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ushort v11, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ushort v12, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ushort v13, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v14, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v15, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v16, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ushort v17, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ushort v18, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: 
flat_store_short v[0:1], v15 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1105,54 +471,9 @@ define void @memmove_p0_p3_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p3_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v3, v2 offset:15 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:14 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:13 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:12 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:11 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:10 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:9 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:8 -; CHECK-NEXT: ds_read_u8 v11, v2 offset:7 -; CHECK-NEXT: ds_read_u8 v12, v2 offset:6 -; CHECK-NEXT: ds_read_u8 v13, v2 offset:5 -; CHECK-NEXT: ds_read_u8 v14, v2 offset:4 -; CHECK-NEXT: ds_read_u8 v15, v2 offset:3 -; CHECK-NEXT: ds_read_u8 v16, v2 offset:2 -; CHECK-NEXT: ds_read_u8 v17, v2 offset:1 -; CHECK-NEXT: ds_read_u8 v2, v2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:15 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12 -; CHECK-NEXT: s_waitcnt 
lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1164,72 +485,19 @@ define void @memmove_p0_p3_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p3_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v3, v2 offset:24 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:25 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:26 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:27 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:28 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:29 ; CHECK-NEXT: ds_read_u8 v9, v2 offset:30 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:16 -; CHECK-NEXT: ds_read_u8 v11, v2 offset:17 -; CHECK-NEXT: ds_read_u8 v12, v2 offset:18 -; CHECK-NEXT: ds_read_u8 v13, v2 offset:19 -; CHECK-NEXT: ds_read_u8 v14, v2 offset:20 -; 
CHECK-NEXT: ds_read_u8 v15, v2 offset:21 -; CHECK-NEXT: ds_read_u8 v16, v2 offset:22 -; CHECK-NEXT: ds_read_u8 v17, v2 offset:23 -; CHECK-NEXT: ds_read_u8 v18, v2 offset:8 -; CHECK-NEXT: ds_read_u8 v19, v2 offset:9 -; CHECK-NEXT: ds_read_u8 v20, v2 offset:10 -; CHECK-NEXT: ds_read_u8 v21, v2 offset:11 -; CHECK-NEXT: ds_read_u8 v22, v2 offset:12 -; CHECK-NEXT: ds_read_u8 v23, v2 offset:13 -; CHECK-NEXT: ds_read_u8 v24, v2 offset:14 -; CHECK-NEXT: ds_read_u8 v25, v2 offset:15 -; CHECK-NEXT: ds_read_u8 v26, v2 -; CHECK-NEXT: ds_read_u8 v27, v2 offset:1 -; CHECK-NEXT: ds_read_u8 v28, v2 offset:2 -; CHECK-NEXT: ds_read_u8 v29, v2 offset:3 -; CHECK-NEXT: ds_read_u8 v30, v2 offset:4 -; CHECK-NEXT: ds_read_u8 v31, v2 offset:5 -; CHECK-NEXT: ds_read_u8 v32, v2 offset:6 -; CHECK-NEXT: ds_read_u8 v2, v2 offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(24) +; CHECK-NEXT: ds_read_b32 v8, v2 offset:24 +; CHECK-NEXT: ds_read_u16 v10, v2 offset:28 +; CHECK-NEXT: ds_read_b64 v[6:7], v2 offset:16 +; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(4) ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:28 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:27 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:25 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(23) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:23 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:21 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:19 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(23) -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:15 -; 
CHECK-NEXT: flat_store_byte v[0:1], v24 offset:14 -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:13 -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:12 -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:11 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:10 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:9 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(23) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:7 -; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:6 -; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:5 -; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:4 -; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:3 -; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:2 -; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:1 -; CHECK-NEXT: flat_store_byte v[0:1], v26 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1241,74 +509,12 @@ define void @memmove_p0_p3_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p3_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v3, v2 offset:24 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:25 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:26 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:27 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:28 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:29 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:30 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:31 -; CHECK-NEXT: ds_read_u8 v11, v2 offset:16 -; CHECK-NEXT: ds_read_u8 v12, v2 offset:17 -; CHECK-NEXT: ds_read_u8 v13, v2 offset:18 -; CHECK-NEXT: ds_read_u8 v14, v2 offset:19 -; CHECK-NEXT: ds_read_u8 v15, v2 offset:20 -; CHECK-NEXT: ds_read_u8 v16, v2 offset:21 -; 
CHECK-NEXT: ds_read_u8 v17, v2 offset:22 -; CHECK-NEXT: ds_read_u8 v18, v2 offset:23 -; CHECK-NEXT: ds_read_u8 v19, v2 offset:8 -; CHECK-NEXT: ds_read_u8 v20, v2 offset:9 -; CHECK-NEXT: ds_read_u8 v21, v2 offset:10 -; CHECK-NEXT: ds_read_u8 v22, v2 offset:11 -; CHECK-NEXT: ds_read_u8 v23, v2 offset:12 -; CHECK-NEXT: ds_read_u8 v24, v2 offset:13 -; CHECK-NEXT: ds_read_u8 v25, v2 offset:14 -; CHECK-NEXT: ds_read_u8 v26, v2 offset:15 -; CHECK-NEXT: ds_read_u8 v27, v2 -; CHECK-NEXT: ds_read_u8 v28, v2 offset:1 -; CHECK-NEXT: ds_read_u8 v29, v2 offset:2 -; CHECK-NEXT: ds_read_u8 v30, v2 offset:3 -; CHECK-NEXT: ds_read_u8 v31, v2 offset:4 -; CHECK-NEXT: ds_read_u8 v32, v2 offset:5 -; CHECK-NEXT: ds_read_u8 v33, v2 offset:6 -; CHECK-NEXT: ds_read_u8 v2, v2 offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(24) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:31 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:28 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:27 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:25 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(24) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:23 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:21 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:20 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:19 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(24) -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:15 -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:14 -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:13 -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:12 -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:11 -; 
CHECK-NEXT: flat_store_byte v[0:1], v21 offset:10 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:9 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(24) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:7 -; CHECK-NEXT: flat_store_byte v[0:1], v33 offset:6 -; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:5 -; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:4 -; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:3 -; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:2 -; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:1 -; CHECK-NEXT: flat_store_byte v[0:1], v27 +; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset0:2 offset1:3 +; CHECK-NEXT: ds_read2_b64 v[7:10], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1320,30 +526,9 @@ define void @memmove_p0_p3_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p3_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u16 v3, v2 offset:14 -; CHECK-NEXT: ds_read_u16 v4, v2 offset:12 -; CHECK-NEXT: ds_read_u16 v5, v2 offset:10 -; CHECK-NEXT: ds_read_u16 v6, v2 offset:8 -; CHECK-NEXT: ds_read_u16 v7, v2 offset:6 -; CHECK-NEXT: ds_read_u16 v8, v2 offset:4 -; CHECK-NEXT: ds_read_u16 v9, v2 offset:2 -; CHECK-NEXT: ds_read_u16 v2, v2 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v3 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:6 -; CHECK-NEXT: s_waitcnt 
lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1355,54 +540,19 @@ define void @memmove_p0_p3_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p3_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v3, v2 offset:30 -; CHECK-NEXT: ds_read_u16 v4, v2 offset:28 -; CHECK-NEXT: ds_read_u16 v5, v2 offset:26 -; CHECK-NEXT: ds_read_u16 v6, v2 offset:24 -; CHECK-NEXT: ds_read_u16 v7, v2 offset:22 -; CHECK-NEXT: ds_read_u16 v8, v2 offset:20 -; CHECK-NEXT: ds_read_u16 v9, v2 offset:18 -; CHECK-NEXT: ds_read_u16 v10, v2 offset:16 -; CHECK-NEXT: ds_read_u16 v11, v2 offset:14 -; CHECK-NEXT: ds_read_u16 v12, v2 offset:12 -; CHECK-NEXT: ds_read_u16 v13, v2 offset:10 -; CHECK-NEXT: ds_read_u16 v14, v2 offset:8 -; CHECK-NEXT: ds_read_u16 v15, v2 offset:6 -; CHECK-NEXT: ds_read_u16 v16, v2 offset:4 -; CHECK-NEXT: ds_read_u16 v17, v2 offset:2 -; CHECK-NEXT: ds_read_u16 v2, v2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:26 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:22 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:20 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:18 -; 
CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: ds_read_u8 v9, v2 offset:30 +; CHECK-NEXT: ds_read_b32 v8, v2 offset:24 +; CHECK-NEXT: ds_read_u16 v10, v2 offset:28 +; CHECK-NEXT: ds_read_b64 v[6:7], v2 offset:16 +; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(4) +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1414,54 +564,12 @@ define void @memmove_p0_p3_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p3_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u16 v3, v2 offset:30 -; CHECK-NEXT: ds_read_u16 v4, v2 offset:28 -; CHECK-NEXT: ds_read_u16 v5, v2 offset:26 -; CHECK-NEXT: ds_read_u16 v6, v2 offset:24 -; CHECK-NEXT: ds_read_u16 v7, v2 offset:22 -; CHECK-NEXT: ds_read_u16 v8, v2 offset:20 -; CHECK-NEXT: ds_read_u16 v9, v2 
offset:18 -; CHECK-NEXT: ds_read_u16 v10, v2 offset:16 -; CHECK-NEXT: ds_read_u16 v11, v2 offset:14 -; CHECK-NEXT: ds_read_u16 v12, v2 offset:12 -; CHECK-NEXT: ds_read_u16 v13, v2 offset:10 -; CHECK-NEXT: ds_read_u16 v14, v2 offset:8 -; CHECK-NEXT: ds_read_u16 v15, v2 offset:6 -; CHECK-NEXT: ds_read_u16 v16, v2 offset:4 -; CHECK-NEXT: ds_read_u16 v17, v2 offset:2 -; CHECK-NEXT: ds_read_u16 v2, v2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v3 offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:26 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:22 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:20 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:18 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset0:2 offset1:3 +; CHECK-NEXT: ds_read2_b64 v[7:10], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: 
flat_store_dwordx4 v[0:1], v[3:6] offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1583,55 +691,9 @@ define void @memmove_p0_p4_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p4_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:15 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v10 
offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1643,100 +705,19 @@ define void @memmove_p0_p4_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p4_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:29 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:27 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:25 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:23 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:21 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:19 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:18 -; 
CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:17 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ubyte v19, v[2:3], off offset:15 -; CHECK-NEXT: global_load_ubyte v20, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ubyte v21, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v22, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v23, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v24, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ubyte v25, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v26, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ubyte v27, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v28, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v29, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v30, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ubyte v31, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v32, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v33, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: flat_store_byte 
v[0:1], v14 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:4 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:30 +; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28 +; CHECK-NEXT: global_load_dwordx3 v[6:8], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:3 +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:2 +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; 
CHECK-NEXT: flat_store_byte v[0:1], v33 offset:1 +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1748,103 +729,13 @@ define void @memmove_p0_p4_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p4_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:31 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:29 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:27 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:25 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:23 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:21 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:19 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:17 -; CHECK-NEXT: global_load_ubyte v19, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ubyte v20, v[2:3], off offset:15 -; CHECK-NEXT: global_load_ubyte v21, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ubyte v22, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v23, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v24, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v25, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ubyte v26, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v27, v[2:3], off 
offset:8 -; CHECK-NEXT: global_load_ubyte v28, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v29, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v30, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v31, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ubyte v32, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v33, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v34, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; 
CHECK-NEXT: flat_store_byte v[0:1], v21 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v33 offset:2 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v34 offset:1 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1856,31 +747,9 @@ define void @memmove_p0_p4_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p4_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v6, 
v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:2 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1892,55 +761,19 @@ define void @memmove_p0_p4_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p4_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ushort v11, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ushort v12, v[2:3], off offset:14 -; CHECK-NEXT: 
global_load_ushort v13, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v14, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v15, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v16, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ushort v17, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ushort v18, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:30 +; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28 +; CHECK-NEXT: global_load_dwordx3 v[6:8], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; 
CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1952,55 +785,13 @@ define void @memmove_p0_p4_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p4_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ushort v11, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ushort v12, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ushort v13, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v14, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v15, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v16, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ushort v17, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ushort v18, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 
-; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2129,55 +920,13 @@ define void @memmove_p0_p5_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p5_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 
offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; 
CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2189,100 +938,23 @@ define void @memmove_p0_p5_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p5_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen 
offset:11 -; CHECK-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) 
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:6 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:5 +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: 
flat_store_byte v[0:1], v32 offset:1 +; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:30 +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[7:9] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2294,103 +966,19 @@ define void @memmove_p0_p5_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p5_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte 
v23, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:18 -; 
CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:5 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: 
flat_store_byte v[0:1], v33 offset:1 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2402,31 +990,13 @@ define void @memmove_p0_p5_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p5_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v2, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v3 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:2 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt 
vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2438,55 +1008,23 @@ define void @memmove_p0_p5_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p5_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v12, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v13, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v14, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v15, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v16, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v17, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v2, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; 
CHECK-NEXT: flat_store_short v[0:1], v8 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:12 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:10 +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:2 +; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:30 +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[7:9] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2498,55 +1036,19 @@ define void @memmove_p0_p5_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p5_sz32_align_2_2: ; CHECK: ; %bb.0: ; 
%entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v12, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v13, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v14, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v15, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v16, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v17, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v2, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v3 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; 
CHECK-NEXT: flat_store_short v[0:1], v12 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:10 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2698,41 +1200,8 @@ define void @memmove_p1_p0_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p1_p0_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v12, 
v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v3, v4, 8, v10 -; CHECK-NEXT: v_lshl_or_b32 v4, v6, 8, v5 -; CHECK-NEXT: v_lshl_or_b32 v5, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v6, v11, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v8, v12, 8, v13 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v7, v14, 8, v15 -; CHECK-NEXT: v_lshl_or_b32 v3, v4, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v9, v16, 8, v17 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v10, v18, 8, v2 -; CHECK-NEXT: v_lshl_or_b32 v2, v6, 16, v5 -; CHECK-NEXT: v_lshl_or_b32 v5, v8, 16, v7 -; CHECK-NEXT: v_lshl_or_b32 v4, v10, 16, v9 ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2744,79 +1213,18 @@ define void @memmove_p1_p0_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p1_p0_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] 
offset:20 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:17 -; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v22, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v23, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ubyte v24, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v25, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v26, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v27, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v28, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v29, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v30, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v31, v[2:3] -; CHECK-NEXT: flat_load_ubyte v32, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v33, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(29) lgkmcnt(29) -; CHECK-NEXT: v_lshl_or_b32 v4, v4, 8, v5 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25) -; CHECK-NEXT: v_lshl_or_b32 v9, v8, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(23) lgkmcnt(23) -; CHECK-NEXT: v_lshl_or_b32 v10, v10, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22) -; CHECK-NEXT: v_lshlrev_b16 v12, 8, v12 -; CHECK-NEXT: s_waitcnt vmcnt(19) lgkmcnt(19) -; CHECK-NEXT: v_lshl_or_b32 v14, v14, 8, v15 -; CHECK-NEXT: s_waitcnt vmcnt(17) lgkmcnt(17) -; CHECK-NEXT: v_lshl_or_b32 v3, v16, 8, v17 -; CHECK-NEXT: v_lshl_or_b32 v16, v6, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: v_lshl_or_b32 v15, v18, 8, v19 -; CHECK-NEXT: v_lshl_or_b32 v7, v9, 16, v10 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: 
v_lshl_or_b32 v11, v20, 8, v21 -; CHECK-NEXT: v_lshl_or_b32 v8, v14, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: v_lshl_or_b32 v18, v22, 8, v23 -; CHECK-NEXT: v_lshl_or_b32 v5, v4, 16, v16 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v17, v24, 8, v25 -; CHECK-NEXT: v_lshl_or_b32 v6, v15, 16, v11 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: v_lshl_or_b32 v20, v26, 8, v27 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: v_lshl_or_b32 v19, v28, 8, v29 -; CHECK-NEXT: v_lshl_or_b32 v4, v18, 16, v17 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:30 +; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[6:8], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: v_lshl_or_b32 v21, v30, 8, v31 +; CHECK-NEXT: global_store_byte v[0:1], v9, off offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) +; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v22, v32, 8, v33 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_or_b32_e32 v12, v12, v2 -; CHECK-NEXT: v_lshl_or_b32 v3, v20, 16, v19 -; CHECK-NEXT: v_lshl_or_b32 v2, v22, 16, v21 -; CHECK-NEXT: global_store_byte v[0:1], v13, off offset:30 -; CHECK-NEXT: global_store_short v[0:1], v12, off offset:28 ; CHECK-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2828,79 +1236,13 @@ define void @memmove_p1_p0_sz32_align_1_1(ptr addrspace(1) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p1_p0_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:29 -; CHECK-NEXT: 
flat_load_ubyte v5, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:31 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v22, v[2:3] offset:17 -; CHECK-NEXT: flat_load_ubyte v23, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v24, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v25, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ubyte v26, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v27, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v28, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v29, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v30, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v31, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v32, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v33, v[2:3] -; CHECK-NEXT: flat_load_ubyte v34, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25) -; CHECK-NEXT: v_lshl_or_b32 v3, v4, 8, v10 -; CHECK-NEXT: v_lshl_or_b32 v4, v6, 8, v5 -; CHECK-NEXT: v_lshl_or_b32 v6, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v7, v11, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v12, v12, 8, v13 -; CHECK-NEXT: s_waitcnt vmcnt(20) 
lgkmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v14, v14, 8, v15 -; CHECK-NEXT: v_lshl_or_b32 v5, v4, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v9, v16, 8, v17 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v8, v18, 8, v19 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v11, v20, 8, v21 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v10, v22, 8, v23 -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v15, v24, 8, v25 -; CHECK-NEXT: v_lshl_or_b32 v9, v12, 16, v14 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v13, v26, 8, v27 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v17, v28, 8, v29 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v16, v30, 8, v31 -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v13 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v18, v32, 8, v33 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v19, v34, 8, v2 -; CHECK-NEXT: v_lshl_or_b32 v2, v11, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v16 -; CHECK-NEXT: v_lshl_or_b32 v6, v19, 16, v18 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 32, i1 false) @@ -2911,23 +1253,8 @@ define void 
@memmove_p1_p0_sz16_align_2_2(ptr addrspace(1) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p1_p0_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v11, v[2:3] offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: v_lshl_or_b32 v3, v4, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v2, v9, 16, v5 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v5, v10, 16, v6 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v7 ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2939,41 +1266,18 @@ define void @memmove_p1_p0_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p1_p0_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v11, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ushort v12, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ushort v13, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ushort v14, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ushort v15, v[2:3] offset:16 -; CHECK-NEXT: 
flat_load_ushort v16, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v17, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v18, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v19, v[2:3] -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v5, v4, 16, v5 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v4, v9, 16, v10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v8, v6, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: v_lshl_or_b32 v7, v12, 16, v13 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: v_lshl_or_b32 v6, v14, 16, v15 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:30 +; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[6:8], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: v_lshl_or_b32 v3, v16, 16, v17 +; CHECK-NEXT: global_store_byte v[0:1], v9, off offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) +; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v2, v18, 16, v19 -; CHECK-NEXT: global_store_short v[0:1], v11, off offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_store_byte v[0:1], v20, off offset:30 ; CHECK-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2985,39 +1289,13 @@ define void @memmove_p1_p0_sz32_align_2_2(ptr addrspace(1) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p1_p0_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ushort 
v5, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ushort v11, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ushort v12, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ushort v13, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ushort v14, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ushort v15, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ushort v16, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v17, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v18, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v19, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v5, v4, 16, v5 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v4, v6, 16, v9 -; CHECK-NEXT: v_lshl_or_b32 v9, v7, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v3, v12, 16, v13 -; CHECK-NEXT: v_lshl_or_b32 v8, v10, 16, v11 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v2, v14, 16, v15 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v7, v16, 16, v17 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v6, v18, 16, v19 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 32, 
i1 false) @@ -3783,44 +2061,13 @@ define void @memmove_p1_p5_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p1_p5_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v4, v4, 8, v3 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v5, v6, 8, v5 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v7, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v8, v10, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v6, v12, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v9, v14, 8, v13 -; CHECK-NEXT: v_lshl_or_b32 v3, v8, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v10, v16, 8, v15 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v3, v2, 
s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v11, v2, 8, v17 -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: v_lshl_or_b32 v5, v9, 16, v6 -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v10 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 16, i1 false) @@ -3831,82 +2078,24 @@ define void @memmove_p1_p5_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p1_p5_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:9 -; 
CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: v_lshl_or_b32 v2, v4, 8, v3 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: v_lshl_or_b32 v3, v6, 8, v5 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: v_lshlrev_b16 v4, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v7, v9, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v9, v11, 8, v10 -; CHECK-NEXT: v_lshl_or_b32 v8, v3, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v5, v13, 8, v12 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v6, v15, 8, v14 -; CHECK-NEXT: v_lshl_or_b32 v2, v9, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v12, v17, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v13, v19, 8, v18 -; CHECK-NEXT: v_lshl_or_b32 v3, v6, 16, v5 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v10, 
v21, 8, v20 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v11, v23, 8, v22 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: s_waitcnt vmcnt(8) +; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: v_lshl_or_b32 v15, v26, 8, v25 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: v_lshl_or_b32 v14, v24, 8, v28 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v16, v29, 8, v27 -; CHECK-NEXT: v_lshl_or_b32 v5, v11, 16, v10 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v17, v31, 8, v30 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: v_or_b32_e32 v18, v4, v32 -; CHECK-NEXT: v_lshl_or_b32 v4, v13, 16, v12 -; CHECK-NEXT: v_lshl_or_b32 v7, v15, 16, v14 -; CHECK-NEXT: v_lshl_or_b32 v6, v16, 16, v17 +; CHECK-NEXT: global_store_byte v[0:1], v11, off offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(3) +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_byte v[0:1], v33, off offset:30 -; CHECK-NEXT: global_store_short v[0:1], v18, off offset:28 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:16 +; CHECK-NEXT: global_store_dwordx3 v[0:1], v[7:9], off offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 
1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false) @@ -3917,81 +2106,19 @@ define void @memmove_p1_p5_sz32_align_1_1(ptr addrspace(1) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p1_p5_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen 
offset:28 -; CHECK-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: v_lshl_or_b32 v4, v4, 8, v3 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: v_lshl_or_b32 v5, v6, 8, v5 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: v_lshl_or_b32 v7, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v8, v10, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v10, v12, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v11, v14, 8, v13 -; CHECK-NEXT: v_lshl_or_b32 v3, v8, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v6, v16, 8, v15 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v9, v18, 8, v17 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: v_lshl_or_b32 v13, v21, 8, v20 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: v_lshl_or_b32 v15, v23, 8, v22 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v12, v19, 8, v25 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v14, v26, 8, v24 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v16, v28, 8, v27 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: 
buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v17, v30, 8, v29 -; CHECK-NEXT: v_lshl_or_b32 v7, v13, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v18, v32, 8, v31 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v19, v2, 8, v33 -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: v_lshl_or_b32 v5, v9, 16, v6 -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v6, v14, 16, v15 -; CHECK-NEXT: v_lshl_or_b32 v9, v17, 16, v16 -; CHECK-NEXT: v_lshl_or_b32 v8, v19, 16, v18 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 32, i1 false) @@ -4002,24 +2129,13 @@ define void @memmove_p1_p5_sz16_align_2_2(ptr addrspace(1) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p1_p5_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: 
v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v3, v6, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v5, v8, 16, v7 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v4, v10, 16, v9 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 16, i1 false) @@ -4030,43 +2146,24 @@ define void @memmove_p1_p5_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p1_p5_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v12, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v13, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v14, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v15, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v16, v2, s[0:3], 0 offen offset:16 -; 
CHECK-NEXT: buffer_load_ushort v17, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v18, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v8, v4, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v2, v6, 16, v5 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v7 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v10 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v5, v13, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v7, v15, 16, v14 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v6, v17, 16, v16 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: global_store_short v[0:1], v18, off offset:28 +; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(7) +; CHECK-NEXT: global_store_byte v[0:1], v11, off offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(3) +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_byte v[0:1], v19, off offset:30 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:16 +; CHECK-NEXT: global_store_dwordx3 v[0:1], v[7:9], off 
offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false) @@ -4077,41 +2174,19 @@ define void @memmove_p1_p5_sz32_align_2_2(ptr addrspace(1) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p1_p5_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v12, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v13, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v14, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v15, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v16, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v17, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v18, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v3, v6, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v4, v8, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v5, v10, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v6, v12, 16, v11 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: 
buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v7, v14, 16, v13 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v9, v16, 16, v15 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v8, v18, 16, v17 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 32, i1 false) @@ -4258,41 +2333,8 @@ define void @memmove_p3_p0_sz16_align_1_1(ptr addrspace(3) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p3_p0_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] 
offset:13 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 8, v9 -; CHECK-NEXT: v_lshl_or_b32 v3, v5, 8, v4 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v5, v10, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v7, v11, 8, v12 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v6, v13, 8, v14 -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 8, v16 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v9, v17, 8, v1 -; CHECK-NEXT: v_lshl_or_b32 v1, v5, 16, v4 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6 -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8 ; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -4305,82 +2347,20 @@ define void @memmove_p3_p0_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p3_p0_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:29 -; CHECK-NEXT: 
flat_load_ubyte v12, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:17 -; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v21, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v22, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ubyte v23, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v24, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v25, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v26, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v27, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v28, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v29, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v30, v[1:2] -; CHECK-NEXT: flat_load_ubyte v31, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v32, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(29) lgkmcnt(29) -; CHECK-NEXT: v_lshl_or_b32 v3, v3, 8, v4 -; CHECK-NEXT: s_waitcnt vmcnt(27) lgkmcnt(27) -; CHECK-NEXT: v_lshl_or_b32 v5, v5, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25) -; CHECK-NEXT: v_lshl_or_b32 v7, v7, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(23) lgkmcnt(23) -; CHECK-NEXT: v_lshl_or_b32 v9, v9, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22) -; CHECK-NEXT: v_lshlrev_b16 v11, 8, v11 -; CHECK-NEXT: v_lshl_or_b32 v4, v3, 16, v5 -; CHECK-NEXT: s_waitcnt vmcnt(19) lgkmcnt(19) -; CHECK-NEXT: v_lshl_or_b32 v13, v13, 8, v14 -; CHECK-NEXT: s_waitcnt vmcnt(17) lgkmcnt(17) -; CHECK-NEXT: v_lshl_or_b32 v2, v15, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: v_lshl_or_b32 v10, v17, 8, v18 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: v_lshl_or_b32 v8, v19, 8, v20 -; 
CHECK-NEXT: v_lshl_or_b32 v13, v13, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: v_lshl_or_b32 v14, v21, 8, v22 -; CHECK-NEXT: v_lshl_or_b32 v2, v7, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v6, v23, 8, v24 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: v_lshl_or_b32 v16, v25, 8, v26 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: v_lshl_or_b32 v15, v27, 8, v28 -; CHECK-NEXT: v_lshl_or_b32 v3, v14, 16, v6 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:30 +; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[5:7], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: v_lshl_or_b32 v17, v29, 8, v30 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v18, v31, 8, v32 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_or_b32_e32 v11, v11, v1 -; CHECK-NEXT: v_lshl_or_b32 v1, v10, 16, v8 -; CHECK-NEXT: v_lshl_or_b32 v6, v16, 16, v15 -; CHECK-NEXT: v_lshl_or_b32 v5, v18, 16, v17 -; CHECK-NEXT: ds_write_b8 v0, v12 offset:30 -; CHECK-NEXT: ds_write_b32 v0, v13 offset:24 -; CHECK-NEXT: ds_write_b16 v0, v11 offset:28 -; CHECK-NEXT: ds_write_b64 v0, v[1:2] offset:16 -; CHECK-NEXT: ds_write2_b64 v0, v[5:6], v[3:4] offset1:1 +; CHECK-NEXT: ds_write_b8 v0, v8 offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3) +; CHECK-NEXT: ds_write_b16 v0, v9 offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3) +; CHECK-NEXT: ds_write_b32 v0, v7 offset:24 +; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(4) +; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -4392,79 +2372,13 @@ define void @memmove_p3_p0_sz32_align_1_1(ptr addrspace(3) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p3_p0_sz32_align_1_1: ; CHECK: ; 
%bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:31 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v21, v[1:2] offset:17 -; CHECK-NEXT: flat_load_ubyte v22, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v23, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v24, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ubyte v25, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v26, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v27, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v28, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v29, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v30, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v31, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v32, v[1:2] -; CHECK-NEXT: flat_load_ubyte v33, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 8, v9 -; CHECK-NEXT: v_lshl_or_b32 v3, v5, 8, v4 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(24) -; 
CHECK-NEXT: v_lshl_or_b32 v5, v10, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v11, v11, 8, v12 -; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v10, v13, 8, v14 -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v7, v15, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v6, v17, 8, v18 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v9, v19, 8, v20 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v8, v21, 8, v22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v13, v23, 8, v24 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v12, v25, 8, v26 -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v15, v27, 8, v28 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v14, v29, 8, v30 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v16, v31, 8, v32 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v17, v33, 8, v1 -; CHECK-NEXT: v_lshl_or_b32 v1, v5, 16, v4 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6 -; CHECK-NEXT: v_lshl_or_b32 v6, v11, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v5, v13, 16, v12 -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v14 -; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v16 -; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[1:2] offset0:2 offset1:3 -; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[5:6] offset1:1 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset0:2 offset1:3 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[9:10] offset1:1 ; CHECK-NEXT: 
s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -4476,23 +2390,8 @@ define void @memmove_p3_p0_sz16_align_2_2(ptr addrspace(3) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p3_p0_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v1, v8, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v4, v9, 16, v5 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v3, v10, 16, v6 ; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -4505,43 +2404,20 @@ define void @memmove_p3_p0_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p3_p0_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ushort v11, v[1:2] offset:20 
-; CHECK-NEXT: flat_load_ushort v12, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ushort v13, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ushort v14, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v15, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v16, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v17, v[1:2] -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v4, v3, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v3, v5, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v8, v8, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: v_lshl_or_b32 v2, v10, 16, v11 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: v_lshl_or_b32 v1, v12, 16, v13 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:30 +; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[5:7], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: v_lshl_or_b32 v6, v14, 16, v15 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v5, v16, 16, v17 -; CHECK-NEXT: ds_write_b16 v0, v7 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) -; CHECK-NEXT: ds_write_b8 v0, v18 offset:30 -; CHECK-NEXT: ds_write_b32 v0, v8 offset:24 -; CHECK-NEXT: ds_write_b64 v0, v[1:2] offset:16 -; CHECK-NEXT: ds_write2_b64 v0, v[5:6], v[3:4] offset1:1 +; CHECK-NEXT: ds_write_b8 v0, v8 offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3) +; CHECK-NEXT: ds_write_b16 v0, v9 offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3) +; CHECK-NEXT: ds_write_b32 v0, v7 offset:24 +; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(4) +; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -4553,40 +2429,13 
@@ define void @memmove_p3_p0_sz32_align_2_2(ptr addrspace(3) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p3_p0_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ushort v11, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ushort v12, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ushort v13, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ushort v14, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ushort v15, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v16, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v17, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v18, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: v_lshl_or_b32 v6, v6, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v1, v5, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v5, v8, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v3, v13, 16, v14 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v16 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v18 -; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[1:2] offset0:2 offset1:3 -; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[5:6] offset1:1 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:16 +; CHECK-NEXT: 
flat_load_dwordx4 v[7:10], v[1:2] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset0:2 offset1:3 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[9:10] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5398,44 +3247,13 @@ define void @memmove_p3_p5_sz16_align_1_1(ptr addrspace(3) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p3_p5_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v3, v3, 8, v2 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v4, v5, 8, v4 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v6, v7, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v7, v9, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; 
CHECK-NEXT: v_lshl_or_b32 v5, v11, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v8, v13, 8, v12 -; CHECK-NEXT: v_lshl_or_b32 v2, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v9, v15, 8, v14 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v10, v1, 8, v16 -; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; CHECK-NEXT: v_lshl_or_b32 v4, v8, 16, v5 -; CHECK-NEXT: v_lshl_or_b32 v3, v10, 16, v9 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5447,83 +3265,26 @@ define void @memmove_p3_p5_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p3_p5_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: 
buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: v_lshl_or_b32 v1, v3, 8, v2 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 8, v4 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: v_lshlrev_b16 v3, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v6, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v7, v10, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v4, v12, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v5, v14, 8, v13 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v10, v16, 8, v15 -; CHECK-NEXT: v_lshl_or_b32 v16, v2, 16, v1 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: 
v_lshl_or_b32 v11, v18, 8, v17 -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v8, v20, 8, v19 -; CHECK-NEXT: v_lshl_or_b32 v1, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v9, v22, 8, v21 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: s_waitcnt vmcnt(8) +; CHECK-NEXT: ds_write_b32 v0, v8 offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: v_lshl_or_b32 v13, v25, 8, v24 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: v_lshl_or_b32 v12, v23, 8, v27 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v14, v28, 8, v26 -; CHECK-NEXT: v_lshl_or_b32 v4, v9, 16, v8 +; CHECK-NEXT: ds_write_b16 v0, v9 offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(6) +; CHECK-NEXT: ds_write_b8 v0, v10 offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v15, v30, 8, v29 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: v_or_b32_e32 v17, v3, v31 -; CHECK-NEXT: v_lshl_or_b32 v3, v11, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v6, v13, 16, v12 -; CHECK-NEXT: v_lshl_or_b32 v5, v14, 16, v15 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: ds_write_b8 v0, v32 offset:30 -; CHECK-NEXT: ds_write_b32 v0, v16 offset:24 -; CHECK-NEXT: ds_write_b16 v0, v17 offset:28 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 -; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:16 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], 
v[4:5] offset1:1 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ds_write_b64 v0, v[6:7] offset:16 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5535,81 +3296,19 @@ define void @memmove_p3_p5_sz32_align_1_1(ptr addrspace(3) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p3_p5_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: 
buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: v_lshl_or_b32 v3, v3, 8, v2 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: v_lshl_or_b32 v4, v5, 8, v4 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: v_lshl_or_b32 v6, v7, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v7, v9, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v9, v11, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v10, v13, 8, v12 -; CHECK-NEXT: v_lshl_or_b32 v2, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v5, v15, 8, v14 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v8, v17, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: v_lshl_or_b32 v12, v20, 8, v19 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: v_lshl_or_b32 v14, v22, 8, v21 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v11, v18, 8, v24 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v13, v25, 8, v23 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v15, v27, 8, v26 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v6, v1, 
s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v16, v29, 8, v28 -; CHECK-NEXT: v_lshl_or_b32 v6, v12, 16, v11 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v17, v31, 8, v30 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v18, v1, 8, v32 -; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; CHECK-NEXT: v_lshl_or_b32 v4, v8, 16, v5 -; CHECK-NEXT: v_lshl_or_b32 v3, v10, 16, v9 -; CHECK-NEXT: v_lshl_or_b32 v5, v13, 16, v14 -; CHECK-NEXT: v_lshl_or_b32 v8, v16, 16, v15 -; CHECK-NEXT: v_lshl_or_b32 v7, v18, 16, v17 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 -; CHECK-NEXT: ds_write2_b64 v0, v[5:6], v[7:8] offset0:2 offset1:3 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ds_write2_b64 v0, v[6:7], v[8:9] offset0:2 offset1:3 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5621,24 +3320,13 @@ define void @memmove_p3_p5_sz16_align_2_2(ptr addrspace(3) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p3_p5_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 
v1, v4, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5650,44 +3338,26 @@ define void @memmove_p3_p5_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p3_p5_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v10, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v11, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v12, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v13, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v14, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v15, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v16, v1, s[0:3], 0 offen offset:28 -; 
CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v18, v3, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v1, v5, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v2, v7, 16, v6 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8 +; CHECK-NEXT: ds_write_b32 v0, v8 offset:24 +; CHECK-NEXT: s_waitcnt vmcnt(7) +; CHECK-NEXT: ds_write_b16 v0, v9 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v6, v13, 16, v12 +; CHECK-NEXT: ds_write_b8 v0, v10 offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v5, v15, 16, v14 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: ds_write_b16 v0, v16 offset:28 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: ds_write_b8 v0, v17 offset:30 -; CHECK-NEXT: ds_write_b32 v0, v18 offset:24 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 -; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:16 +; CHECK-NEXT: ds_write_b64 v0, v[6:7] offset:16 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5699,41 +3369,19 @@ define void @memmove_p3_p5_sz32_align_2_2(ptr addrspace(3) align 2 %dst, 
ptr add ; CHECK-LABEL: memmove_p3_p5_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v10, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v11, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v12, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v13, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v14, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v15, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v16, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v17, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v3, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v4, v9, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v5, v11, 16, v10 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:16 +; 
CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v6, v13, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v14 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v16 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 -; CHECK-NEXT: ds_write2_b64 v0, v[5:6], v[7:8] offset0:2 offset1:3 +; CHECK-NEXT: ds_write2_b64 v0, v[6:7], v[8:9] offset0:2 offset1:3 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5890,55 +3538,12 @@ define void @memmove_p5_p0_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p0_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) 
lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail 
call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 16, i1 false) @@ -5949,100 +3554,24 @@ define void @memmove_p5_p0_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p0_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:17 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v21, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v22, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v23, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ubyte v24, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v25, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v26, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v27, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v28, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v29, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v30, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v31, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ubyte v32, v[1:2] offset:1 -; CHECK-NEXT: 
flat_load_ubyte v1, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(30) lgkmcnt(30) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) lgkmcnt(29) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) lgkmcnt(28) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) lgkmcnt(27) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) lgkmcnt(26) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(24) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) lgkmcnt(23) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) lgkmcnt(21) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(20) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) lgkmcnt(19) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(18) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) lgkmcnt(17) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt 
vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:30 +; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[5:7], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:3 +; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 
0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 31, i1 false) @@ -6053,103 +3582,19 @@ define void @memmove_p5_p0_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p0_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:31 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:17 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v21, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v22, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v23, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v24, v[1:2] offset:10 -; 
CHECK-NEXT: flat_load_ubyte v25, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v26, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v27, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v28, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v29, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v30, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v31, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v32, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ubyte v33, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(31) lgkmcnt(31) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(30) lgkmcnt(30) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) lgkmcnt(29) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) lgkmcnt(28) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) lgkmcnt(27) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) lgkmcnt(26) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(24) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) lgkmcnt(23) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) lgkmcnt(21) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(20) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) lgkmcnt(19) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:19 -; 
CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(18) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) lgkmcnt(17) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], 
v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v33, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 32, i1 false) @@ -6160,31 +3605,12 @@ define void @memmove_p5_p0_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p0_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v1, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 
offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 16, i1 false) @@ -6195,55 +3621,24 @@ define void @memmove_p5_p0_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p0_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ushort v11, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v12, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ushort v13, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ushort v14, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort 
v15, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v16, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v17, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v1, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:30 +; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[5:7], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) 
lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 31, i1 false) @@ -6254,55 +3649,19 @@ define void @memmove_p5_p0_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p0_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ushort v11, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v12, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ushort v13, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ushort v14, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v15, v[1:2] offset:6 -; CHECK-NEXT: 
flat_load_ushort v16, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v17, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v1, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: 
buffer_store_dword v6, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 32, i1 false) @@ -6452,55 +3811,12 @@ define void @memmove_p5_p1_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p1_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:15 -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v1, 
v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void 
@llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 16, i1 false) @@ -6511,100 +3827,24 @@ define void @memmove_p5_p1_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p1_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:29 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:27 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:25 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:23 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:21 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:19 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:17 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ubyte v18, v[1:2], off offset:15 -; CHECK-NEXT: global_load_ubyte v19, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v20, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v21, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v22, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v23, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v24, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v25, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v26, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v27, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v28, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v29, v[1:2], off 
offset:4 -; CHECK-NEXT: global_load_ubyte v30, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v31, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v32, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: 
s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16 +; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28 +; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:3 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; 
CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 31, i1 false) @@ -6615,103 +3855,19 @@ define void @memmove_p5_p1_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p1_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:31 -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:29 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:27 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:25 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:23 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:21 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:19 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:17 -; CHECK-NEXT: global_load_ubyte v18, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ubyte v19, v[1:2], off offset:15 -; CHECK-NEXT: global_load_ubyte v20, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v21, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v22, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v23, v[1:2], off offset:11 -; CHECK-NEXT: 
global_load_ubyte v24, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v25, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v26, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v27, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v28, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v29, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v30, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v31, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v32, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v33, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; 
CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off +; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v33, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 
offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 32, i1 false) @@ -6722,31 +3878,12 @@ define void @memmove_p5_p1_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p1_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: global_load_ushort v3, v[1:2], off -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: 
s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:14 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 16, i1 false) @@ -6757,55 +3894,24 @@ define void @memmove_p5_p1_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p1_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ushort v11, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ushort v12, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v13, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v14, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v15, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v16, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v17, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v1, 
v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16 +; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28 +; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 
0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 31, i1 false) @@ -6816,55 +3922,19 @@ define void @memmove_p5_p1_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p1_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ushort v3, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ushort v11, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ushort v12, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v13, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v14, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v15, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v16, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v17, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 
offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off +; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v8, v0, 
s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 32, i1 false) @@ -7009,54 +4079,12 @@ define void @memmove_p5_p3_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p3_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v2, v1 offset:15 -; CHECK-NEXT: ds_read_u8 v3, v1 offset:14 -; CHECK-NEXT: ds_read_u8 v4, v1 offset:13 -; CHECK-NEXT: ds_read_u8 v5, v1 offset:12 -; CHECK-NEXT: ds_read_u8 v6, v1 offset:11 -; CHECK-NEXT: ds_read_u8 v7, v1 offset:10 -; CHECK-NEXT: ds_read_u8 v8, v1 offset:9 -; CHECK-NEXT: ds_read_u8 v9, v1 offset:8 -; CHECK-NEXT: ds_read_u8 v10, v1 offset:7 -; CHECK-NEXT: ds_read_u8 v11, v1 offset:6 -; CHECK-NEXT: ds_read_u8 v12, v1 offset:5 -; CHECK-NEXT: ds_read_u8 v13, v1 offset:4 -; CHECK-NEXT: ds_read_u8 v14, v1 offset:3 -; CHECK-NEXT: ds_read_u8 v15, v1 offset:2 -; CHECK-NEXT: ds_read_u8 v16, v1 offset:1 -; CHECK-NEXT: ds_read_u8 v1, v1 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(9) -; 
CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: ds_read2_b64 v[1:4], v1 offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 16, i1 false) @@ -7067,72 +4095,25 @@ define void @memmove_p5_p3_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p3_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v2, v1 offset:24 -; CHECK-NEXT: ds_read_u8 v3, v1 offset:25 -; CHECK-NEXT: ds_read_u8 v4, v1 offset:26 -; CHECK-NEXT: ds_read_u8 v5, v1 offset:27 -; CHECK-NEXT: ds_read_u8 v6, v1 offset:28 -; CHECK-NEXT: ds_read_u8 v7, v1 offset:29 -; CHECK-NEXT: ds_read_u8 v8, 
v1 offset:30 -; CHECK-NEXT: ds_read_u8 v9, v1 offset:16 -; CHECK-NEXT: ds_read_u8 v10, v1 offset:17 -; CHECK-NEXT: ds_read_u8 v11, v1 offset:18 -; CHECK-NEXT: ds_read_u8 v12, v1 offset:19 -; CHECK-NEXT: ds_read_u8 v13, v1 offset:20 -; CHECK-NEXT: ds_read_u8 v14, v1 offset:21 -; CHECK-NEXT: ds_read_u8 v15, v1 offset:22 -; CHECK-NEXT: ds_read_u8 v16, v1 offset:23 -; CHECK-NEXT: ds_read_u8 v17, v1 offset:8 -; CHECK-NEXT: ds_read_u8 v18, v1 offset:9 -; CHECK-NEXT: ds_read_u8 v19, v1 offset:10 -; CHECK-NEXT: ds_read_u8 v20, v1 offset:11 -; CHECK-NEXT: ds_read_u8 v21, v1 offset:12 -; CHECK-NEXT: ds_read_u8 v22, v1 offset:13 -; CHECK-NEXT: ds_read_u8 v23, v1 offset:14 -; CHECK-NEXT: ds_read_u8 v24, v1 offset:15 -; CHECK-NEXT: ds_read_u8 v25, v1 -; CHECK-NEXT: ds_read_u8 v26, v1 offset:1 -; CHECK-NEXT: ds_read_u8 v27, v1 offset:2 -; CHECK-NEXT: ds_read_u8 v28, v1 offset:3 -; CHECK-NEXT: ds_read_u8 v29, v1 offset:4 -; CHECK-NEXT: ds_read_u8 v30, v1 offset:5 -; CHECK-NEXT: ds_read_u8 v31, v1 offset:6 -; CHECK-NEXT: ds_read_u8 v1, v1 offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(24) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(16) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:18 
-; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen +; CHECK-NEXT: ds_read_b32 v8, v1 offset:24 +; CHECK-NEXT: ds_read_u16 v9, v1 offset:28 +; CHECK-NEXT: ds_read_u8 v10, v1 offset:30 +; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1 +; CHECK-NEXT: ds_read_b64 v[6:7], v1 offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(4) +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: s_waitcnt lgkmcnt(2) +; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:30 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; CHECK-NEXT: 
buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 31, i1 false) @@ -7143,74 +4124,18 @@ define void @memmove_p5_p3_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p3_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v2, v1 offset:24 -; CHECK-NEXT: ds_read_u8 v3, v1 offset:25 -; CHECK-NEXT: ds_read_u8 v4, v1 offset:26 -; CHECK-NEXT: ds_read_u8 v5, v1 offset:27 -; CHECK-NEXT: ds_read_u8 v6, v1 offset:28 -; CHECK-NEXT: ds_read_u8 v7, v1 offset:29 -; CHECK-NEXT: ds_read_u8 v8, v1 offset:30 -; CHECK-NEXT: ds_read_u8 v9, v1 offset:31 -; CHECK-NEXT: ds_read_u8 v10, v1 offset:16 -; CHECK-NEXT: ds_read_u8 v11, v1 offset:17 -; CHECK-NEXT: ds_read_u8 v12, v1 offset:18 -; CHECK-NEXT: ds_read_u8 v13, v1 offset:19 -; CHECK-NEXT: ds_read_u8 v14, v1 offset:20 -; CHECK-NEXT: ds_read_u8 v15, v1 offset:21 -; CHECK-NEXT: ds_read_u8 v16, v1 offset:22 -; CHECK-NEXT: ds_read_u8 v17, v1 offset:23 -; CHECK-NEXT: ds_read_u8 v18, v1 offset:8 -; CHECK-NEXT: ds_read_u8 v19, v1 offset:9 -; CHECK-NEXT: ds_read_u8 v20, v1 offset:10 -; CHECK-NEXT: ds_read_u8 v21, v1 offset:11 -; CHECK-NEXT: ds_read_u8 v22, v1 offset:12 -; CHECK-NEXT: ds_read_u8 v23, v1 offset:13 -; CHECK-NEXT: ds_read_u8 v24, v1 offset:14 -; CHECK-NEXT: ds_read_u8 v25, v1 offset:15 -; CHECK-NEXT: ds_read_u8 v26, v1 -; CHECK-NEXT: ds_read_u8 v27, v1 offset:1 -; CHECK-NEXT: ds_read_u8 v28, v1 offset:2 -; CHECK-NEXT: ds_read_u8 v29, v1 offset:3 -; CHECK-NEXT: ds_read_u8 v30, v1 offset:4 -; CHECK-NEXT: ds_read_u8 v31, 
v1 offset:5 -; CHECK-NEXT: ds_read_u8 v32, v1 offset:6 -; CHECK-NEXT: ds_read_u8 v1, v1 offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(24) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:31 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(16) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:6 
-; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen +; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1 +; CHECK-NEXT: ds_read2_b64 v[6:9], v1 offset0:2 offset1:3 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 32, i1 false) @@ -7221,30 +4146,12 @@ define void @memmove_p5_p3_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p3_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u16 v2, v1 -; CHECK-NEXT: ds_read_u16 v3, v1 offset:2 -; CHECK-NEXT: ds_read_u16 v4, v1 offset:4 -; CHECK-NEXT: ds_read_u16 v5, v1 offset:6 -; CHECK-NEXT: ds_read_u16 v6, v1 offset:8 -; CHECK-NEXT: ds_read_u16 v7, v1 offset:10 -; CHECK-NEXT: ds_read_u16 v8, v1 offset:12 -; CHECK-NEXT: ds_read_u16 v1, v1 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt lgkmcnt(6) -; 
CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: ds_read2_b64 v[1:4], v1 offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:14 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 16, i1 false) @@ -7255,54 +4162,25 @@ define void @memmove_p5_p3_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p3_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v2, v1 offset:30 -; CHECK-NEXT: ds_read_u16 v3, v1 offset:28 -; CHECK-NEXT: ds_read_u16 v4, v1 offset:26 -; CHECK-NEXT: ds_read_u16 v5, v1 offset:24 -; CHECK-NEXT: ds_read_u16 v6, v1 offset:22 -; CHECK-NEXT: ds_read_u16 v7, v1 offset:20 -; CHECK-NEXT: ds_read_u16 v8, v1 offset:18 -; CHECK-NEXT: ds_read_u16 v9, v1 offset:16 -; CHECK-NEXT: ds_read_u16 v10, v1 offset:14 -; CHECK-NEXT: ds_read_u16 v11, v1 offset:12 -; CHECK-NEXT: ds_read_u16 v12, v1 offset:10 -; CHECK-NEXT: ds_read_u16 v13, v1 offset:8 -; CHECK-NEXT: ds_read_u16 v14, v1 offset:6 -; 
CHECK-NEXT: ds_read_u16 v15, v1 offset:4 -; CHECK-NEXT: ds_read_u16 v16, v1 offset:2 -; CHECK-NEXT: ds_read_u16 v1, v1 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(14) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(11) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt lgkmcnt(10) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt lgkmcnt(9) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt lgkmcnt(8) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:10 +; CHECK-NEXT: ds_read_b32 v8, v1 offset:24 +; CHECK-NEXT: ds_read_u16 v9, v1 offset:28 +; CHECK-NEXT: ds_read_u8 v10, v1 offset:30 +; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1 +; CHECK-NEXT: ds_read_b64 v[6:7], v1 offset:16 ; CHECK-NEXT: s_waitcnt lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 
offen offset:30 ; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 31, i1 false) @@ -7313,54 +4191,18 @@ define void @memmove_p5_p3_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p3_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u16 v2, v1 offset:30 -; CHECK-NEXT: ds_read_u16 v3, v1 offset:28 -; CHECK-NEXT: ds_read_u16 v4, v1 offset:26 -; CHECK-NEXT: ds_read_u16 v5, v1 offset:24 -; CHECK-NEXT: ds_read_u16 v6, v1 offset:22 -; CHECK-NEXT: ds_read_u16 v7, v1 offset:20 -; CHECK-NEXT: ds_read_u16 v8, v1 offset:18 -; CHECK-NEXT: ds_read_u16 v9, v1 offset:16 -; CHECK-NEXT: ds_read_u16 v10, v1 offset:14 -; CHECK-NEXT: ds_read_u16 v11, v1 offset:12 -; CHECK-NEXT: ds_read_u16 v12, v1 offset:10 -; CHECK-NEXT: ds_read_u16 v13, v1 offset:8 -; CHECK-NEXT: ds_read_u16 v14, v1 offset:6 -; CHECK-NEXT: ds_read_u16 v15, v1 offset:4 -; CHECK-NEXT: ds_read_u16 v16, v1 offset:2 -; CHECK-NEXT: ds_read_u16 v1, v1 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(14) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: 
buffer_store_short v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(11) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt lgkmcnt(10) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt lgkmcnt(9) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt lgkmcnt(8) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1 +; CHECK-NEXT: ds_read2_b64 v[6:9], v1 offset0:2 offset1:3 ; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: 
buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 32, i1 false) @@ -7505,55 +4347,12 @@ define void @memmove_p5_p4_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p4_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:15 -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: 
buffer_store_byte v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 16, i1 false) @@ -7564,100 +4363,24 @@ define void @memmove_p5_p4_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p4_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:29 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:28 -; 
CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:27 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:25 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:23 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:21 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:19 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:17 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ubyte v18, v[1:2], off offset:15 -; CHECK-NEXT: global_load_ubyte v19, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v20, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v21, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v22, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v23, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v24, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v25, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v26, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v27, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v28, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v29, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v30, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v31, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v32, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; 
CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 
offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16 +; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28 +; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:3 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 31, i1 false) @@ -7668,103 +4391,19 @@ define void @memmove_p5_p4_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: 
memmove_p5_p4_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:31 -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:29 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:27 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:25 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:23 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:21 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:19 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:17 -; CHECK-NEXT: global_load_ubyte v18, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ubyte v19, v[1:2], off offset:15 -; CHECK-NEXT: global_load_ubyte v20, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v21, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v22, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v23, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v24, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v25, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v26, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v27, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v28, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v29, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v30, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v31, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v32, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v33, v[1:2], off offset:1 -; CHECK-NEXT: 
global_load_ubyte v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: 
s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off +; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v33, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 
s[30:31] entry: tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 32, i1 false) @@ -7775,31 +4414,12 @@ define void @memmove_p5_p4_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p4_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: global_load_ushort v3, v[1:2], off -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:14 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen 
offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 16, i1 false) @@ -7810,55 +4430,24 @@ define void @memmove_p5_p4_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p4_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ushort v11, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ushort v12, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v13, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v14, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v15, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v16, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v17, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short 
v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16 +; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28 +; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p4.i64(ptr 
addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 31, i1 false) @@ -7869,55 +4458,19 @@ define void @memmove_p5_p4_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p4_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ushort v3, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ushort v11, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ushort v12, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v13, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v14, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v15, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v16, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v17, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: 
buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off +; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 32, i1 false) @@ -8062,55 +4615,19 @@ define void @memmove_p5_p5_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p5_sz16_align_1_1: ; CHECK: ; 
%bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt 
vmcnt(7) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:3 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 16, i1 false) @@ -8121,100 +4638,34 @@ define void @memmove_p5_p5_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p5_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: 
buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt 
vmcnt(29) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v22, 
v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:9 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:7 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:5 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:3 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 
0 offen offset:1 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false) @@ -8225,103 +4676,31 @@ define void @memmove_p5_p5_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p5_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: 
buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt 
vmcnt(20) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:7 +; CHECK-NEXT: buffer_store_dword v2, 
v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:5 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:3 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 32, i1 false) @@ -8332,31 +4711,19 @@ define void @memmove_p5_p5_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p5_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: 
buffer_load_ushort v7, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 16, i1 false) @@ -8367,55 +4734,34 @@ define void @memmove_p5_p5_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p5_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: 
buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v10, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v11, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v12, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v13, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v14, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v15, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v16, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:18 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 
offen offset:16 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:14 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:10 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p5.i64(ptr 
addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false) @@ -8426,55 +4772,31 @@ define void @memmove_p5_p5_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p5_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v10, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v11, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v12, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v13, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v14, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v15, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v16, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) 
-; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:14 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:10 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen 
; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 32, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-commute.ll b/llvm/test/CodeGen/AMDGPU/sdwa-commute.ll index 94bc6d46b2395b..8ad6a4e534d232 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-commute.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-commute.ll @@ -19,8 +19,8 @@ define void @extracted_values(ptr %ret_struct, ptr addrspace(3) %arg0, ptr addrs ; CHECK-NEXT: v_sub_f16_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; CHECK-NEXT: v_add_f16_e32 v4, v6, v7 ; CHECK-NEXT: v_add_f16_e32 v2, v3, v2 -; CHECK-NEXT: flat_store_short v[0:1], v4 -; CHECK-NEXT: flat_store_short v[0:1], v2 offset:2 +; CHECK-NEXT: v_pack_b32_f16 v2, v4, v2 +; CHECK-NEXT: flat_store_dword v[0:1], v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll b/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll index 4e734d6e0884bc..fc33a274d7b11a 100644 --- a/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll +++ b/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll @@ -607,7 +607,14 @@ define amdgpu_kernel void @local_store_align1_v16i8(ptr addrspace(3) %out) #0 { ; MUBUF: buffer_load_ubyte ; MUBUF: buffer_load_ubyte ; MUBUF: buffer_load_ubyte -; FLATSCR: scratch_load_dwordx2 +; FLATSCR: scratch_load_ubyte +; FLATSCR: scratch_load_ubyte +; FLATSCR: scratch_load_ubyte +; FLATSCR: scratch_load_ubyte +; FLATSCR: scratch_load_ubyte +; FLATSCR: scratch_load_ubyte +; FLATSCR: scratch_load_ubyte +; FLATSCR: scratch_load_ubyte define double @private_load_align1_f64(ptr addrspace(5) %in) { %x = load double, ptr addrspace(5) %in, align 1 ret double %x @@ -622,7 +629,14 
@@ define double @private_load_align1_f64(ptr addrspace(5) %in) { ; MUBUF: buffer_store_byte ; MUBUF: buffer_store_byte ; MUBUF: buffer_store_byte -; FLATSCR: scratch_store_dwordx2 +; FLATSCR: scratch_store_byte +; FLATSCR: scratch_store_byte +; FLATSCR: scratch_store_byte +; FLATSCR: scratch_store_byte +; FLATSCR: scratch_store_byte +; FLATSCR: scratch_store_byte +; FLATSCR: scratch_store_byte +; FLATSCR: scratch_store_byte define void @private_store_align1_f64(ptr addrspace(5) %out, double %x) #0 { store double %x, ptr addrspace(5) %out, align 1 ret void @@ -651,7 +665,10 @@ define void @private_store_align4_f64(ptr addrspace(5) %out, double %x) #0 { ; MUBUF: buffer_load_ushort ; MUBUF: buffer_load_ushort ; MUBUF: buffer_load_ushort -; FLATSCR: scratch_load_dwordx2 +; FLATSCR: scratch_load_ushort +; FLATSCR: scratch_load_ushort +; FLATSCR: scratch_load_ushort +; FLATSCR: scratch_load_ushort define double @private_load_align2_f64(ptr addrspace(5) %in) { %x = load double, ptr addrspace(5) %in, align 2 ret double %x @@ -662,7 +679,10 @@ define double @private_load_align2_f64(ptr addrspace(5) %in) { ; MUBUF: buffer_store_short ; MUBUF: buffer_store_short ; MUBUF: buffer_store_short -; FLATSCR: scratch_store_dwordx2 +; FLATSCR: scratch_store_short +; FLATSCR: scratch_store_short +; FLATSCR: scratch_store_short +; FLATSCR: scratch_store_short define void @private_store_align2_f64(ptr addrspace(5) %out, double %x) #0 { store double %x, ptr addrspace(5) %out, align 2 ret void