From 32073b835674a9e7bc3e1ee9708efb7c58e7394f Mon Sep 17 00:00:00 2001 From: Changpeng Fang Date: Tue, 23 Jan 2024 10:05:32 -0800 Subject: [PATCH] AMDGPU: Do not generate non-temporal hint when Load_Tr intrinsic did not specify it (#79104) int_amdgcn_global_load_tr did not specify non-temporal load transpose, thus we should not generate the non-temporal hint for the load. We need to implement getTgtMemIntrinsic to create the corresponding MemSDNode. And we don't set the non-temporal flag because the intrinsic did not specify it. NOTE: We need to implement getTgtMemIntrinsic for any memory intrinsics. --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 9 +++++++ .../AMDGPU/llvm.amdgcn.global.load.tr-w32.ll | 24 +++++++------------ .../AMDGPU/llvm.amdgcn.global.load.tr-w64.ll | 24 +++++++------------ 3 files changed, 25 insertions(+), 32 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 073c8cc7211737..cf947dccafac55 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1348,6 +1348,14 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, MachineMemOperand::MOVolatile; return true; } + case Intrinsic::amdgcn_global_load_tr: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(CI.getType()); + Info.ptrVal = CI.getOperand(0); + Info.align.reset(); + Info.flags |= MachineMemOperand::MOLoad; + return true; + } case Intrinsic::amdgcn_ds_gws_init: case Intrinsic::amdgcn_ds_gws_barrier: case Intrinsic::amdgcn_ds_gws_sema_v: @@ -1407,6 +1415,7 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II, SmallVectorImpl &Ops, Type *&AccessTy) const { switch (II->getIntrinsicID()) { + case Intrinsic::amdgcn_global_load_tr: case Intrinsic::amdgcn_ds_ordered_add: case Intrinsic::amdgcn_ds_ordered_swap: case Intrinsic::amdgcn_ds_append: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll index 5382b56b92fb1d..8f1e6f3ac1a0c3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll @@ -13,9 +13,8 @@ define amdgpu_kernel void @global_load_tr_b64(ptr addrspace(1) %addr, ptr addrsp ; GFX12-SDAG-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-W32-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-W32-NEXT: global_load_tr_b64 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT +; GFX12-SDAG-W32-NEXT: global_load_tr_b64 v[0:1], v2, s[0:1] offset:32 ; GFX12-SDAG-W32-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-W32-NEXT: global_inv scope:SCOPE_SYS ; GFX12-SDAG-W32-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-SDAG-W32-NEXT: s_nop 0 ; GFX12-SDAG-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -26,9 +25,8 @@ define amdgpu_kernel void @global_load_tr_b64(ptr addrspace(1) %addr, ptr addrsp ; GFX12-GISEL-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-W32-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-W32-NEXT: global_load_tr_b64 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT +; GFX12-GISEL-W32-NEXT: global_load_tr_b64 v[0:1], v2, s[0:1] offset:32 ; GFX12-GISEL-W32-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-W32-NEXT: global_inv scope:SCOPE_SYS ; GFX12-GISEL-W32-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-W32-NEXT: s_nop 0 ; GFX12-GISEL-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -46,9 +44,8 @@ define amdgpu_kernel void @global_load_tr_b128_i16(ptr addrspace(1) %addr, ptr a ; GFX12-SDAG-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-W32-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-SDAG-W32-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 th:TH_LOAD_NT +; GFX12-SDAG-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 ; GFX12-SDAG-W32-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-W32-NEXT: global_inv 
scope:SCOPE_SYS ; GFX12-SDAG-W32-NEXT: global_store_b128 v4, v[0:3], s[2:3] ; GFX12-SDAG-W32-NEXT: s_nop 0 ; GFX12-SDAG-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -59,9 +56,8 @@ define amdgpu_kernel void @global_load_tr_b128_i16(ptr addrspace(1) %addr, ptr a ; GFX12-GISEL-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-W32-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-GISEL-W32-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 th:TH_LOAD_NT +; GFX12-GISEL-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 ; GFX12-GISEL-W32-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-W32-NEXT: global_inv scope:SCOPE_SYS ; GFX12-GISEL-W32-NEXT: global_store_b128 v4, v[0:3], s[2:3] ; GFX12-GISEL-W32-NEXT: s_nop 0 ; GFX12-GISEL-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -79,9 +75,8 @@ define amdgpu_kernel void @global_load_tr_b128_half(ptr addrspace(1) %addr, ptr ; GFX12-SDAG-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-W32-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-SDAG-W32-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 th:TH_LOAD_NT +; GFX12-SDAG-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 ; GFX12-SDAG-W32-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-W32-NEXT: global_inv scope:SCOPE_SYS ; GFX12-SDAG-W32-NEXT: global_store_b128 v4, v[0:3], s[2:3] ; GFX12-SDAG-W32-NEXT: s_nop 0 ; GFX12-SDAG-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -92,9 +87,8 @@ define amdgpu_kernel void @global_load_tr_b128_half(ptr addrspace(1) %addr, ptr ; GFX12-GISEL-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-W32-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-GISEL-W32-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 th:TH_LOAD_NT +; GFX12-GISEL-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 ; GFX12-GISEL-W32-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-W32-NEXT: global_inv scope:SCOPE_SYS ; GFX12-GISEL-W32-NEXT: global_store_b128 
v4, v[0:3], s[2:3] ; GFX12-GISEL-W32-NEXT: s_nop 0 ; GFX12-GISEL-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -112,9 +106,8 @@ define amdgpu_kernel void @global_load_tr_b128_bfloat(ptr addrspace(1) %addr, pt ; GFX12-SDAG-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-W32-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-SDAG-W32-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 th:TH_LOAD_NT +; GFX12-SDAG-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 ; GFX12-SDAG-W32-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-W32-NEXT: global_inv scope:SCOPE_SYS ; GFX12-SDAG-W32-NEXT: global_store_b128 v4, v[0:3], s[2:3] ; GFX12-SDAG-W32-NEXT: s_nop 0 ; GFX12-SDAG-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -125,9 +118,8 @@ define amdgpu_kernel void @global_load_tr_b128_bfloat(ptr addrspace(1) %addr, pt ; GFX12-GISEL-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-W32-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-GISEL-W32-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 th:TH_LOAD_NT +; GFX12-GISEL-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 ; GFX12-GISEL-W32-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-W32-NEXT: global_inv scope:SCOPE_SYS ; GFX12-GISEL-W32-NEXT: global_store_b128 v4, v[0:3], s[2:3] ; GFX12-GISEL-W32-NEXT: s_nop 0 ; GFX12-GISEL-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll index 0936d175636440..d5a45fb838fc7f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll @@ -13,9 +13,8 @@ define amdgpu_kernel void @global_load_tr_b64(ptr addrspace(1) %addr, ptr addrsp ; GFX12-SDAG-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-W64-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-W64-NEXT: global_load_tr_b64 
v1, v0, s[0:1] offset:32 th:TH_LOAD_NT +; GFX12-SDAG-W64-NEXT: global_load_tr_b64 v1, v0, s[0:1] offset:32 ; GFX12-SDAG-W64-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-W64-NEXT: global_inv scope:SCOPE_SYS ; GFX12-SDAG-W64-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX12-SDAG-W64-NEXT: s_nop 0 ; GFX12-SDAG-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -26,9 +25,8 @@ define amdgpu_kernel void @global_load_tr_b64(ptr addrspace(1) %addr, ptr addrsp ; GFX12-GISEL-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-GISEL-W64-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-W64-NEXT: global_load_tr_b64 v1, v0, s[0:1] offset:32 th:TH_LOAD_NT +; GFX12-GISEL-W64-NEXT: global_load_tr_b64 v1, v0, s[0:1] offset:32 ; GFX12-GISEL-W64-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-W64-NEXT: global_inv scope:SCOPE_SYS ; GFX12-GISEL-W64-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX12-GISEL-W64-NEXT: s_nop 0 ; GFX12-GISEL-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -46,9 +44,8 @@ define amdgpu_kernel void @global_load_tr_b128_i16(ptr addrspace(1) %addr, ptr a ; GFX12-SDAG-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-W64-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT +; GFX12-SDAG-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 ; GFX12-SDAG-W64-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-W64-NEXT: global_inv scope:SCOPE_SYS ; GFX12-SDAG-W64-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-SDAG-W64-NEXT: s_nop 0 ; GFX12-SDAG-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -59,9 +56,8 @@ define amdgpu_kernel void @global_load_tr_b128_i16(ptr addrspace(1) %addr, ptr a ; GFX12-GISEL-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-W64-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT +; GFX12-GISEL-W64-NEXT: global_load_tr_b128 
v[0:1], v2, s[0:1] offset:32 ; GFX12-GISEL-W64-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-W64-NEXT: global_inv scope:SCOPE_SYS ; GFX12-GISEL-W64-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-W64-NEXT: s_nop 0 ; GFX12-GISEL-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -79,9 +75,8 @@ define amdgpu_kernel void @global_load_tr_b128_half(ptr addrspace(1) %addr, ptr ; GFX12-SDAG-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-W64-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT +; GFX12-SDAG-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 ; GFX12-SDAG-W64-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-W64-NEXT: global_inv scope:SCOPE_SYS ; GFX12-SDAG-W64-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-SDAG-W64-NEXT: s_nop 0 ; GFX12-SDAG-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -92,9 +87,8 @@ define amdgpu_kernel void @global_load_tr_b128_half(ptr addrspace(1) %addr, ptr ; GFX12-GISEL-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-W64-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT +; GFX12-GISEL-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 ; GFX12-GISEL-W64-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-W64-NEXT: global_inv scope:SCOPE_SYS ; GFX12-GISEL-W64-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-W64-NEXT: s_nop 0 ; GFX12-GISEL-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -112,9 +106,8 @@ define amdgpu_kernel void @global_load_tr_b128_bfloat(ptr addrspace(1) %addr, pt ; GFX12-SDAG-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-W64-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT +; GFX12-SDAG-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 ; GFX12-SDAG-W64-NEXT: 
s_wait_loadcnt 0x0 -; GFX12-SDAG-W64-NEXT: global_inv scope:SCOPE_SYS ; GFX12-SDAG-W64-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-SDAG-W64-NEXT: s_nop 0 ; GFX12-SDAG-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -125,9 +118,8 @@ define amdgpu_kernel void @global_load_tr_b128_bfloat(ptr addrspace(1) %addr, pt ; GFX12-GISEL-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-W64-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT +; GFX12-GISEL-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 ; GFX12-GISEL-W64-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-W64-NEXT: global_inv scope:SCOPE_SYS ; GFX12-GISEL-W64-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-W64-NEXT: s_nop 0 ; GFX12-GISEL-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)