fixup! [AMDGPU] Enable unaligned scratch accesses
make flat scratch not imply that unaligned scratch accesses are valid
ritter-x2a committed Oct 1, 2024
1 parent 49300b3 commit 0d0c4e0
Showing 7 changed files with 7,570 additions and 766 deletions.
3 changes: 1 addition & 2 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -387,8 +387,7 @@ bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
   // them later if they may access private memory. We don't have enough context
   // here, and legalization can handle it.
   if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
-    return (Alignment >= 4 || ST->hasUnalignedScratchAccessEnabled() ||
-            ST->enableFlatScratch()) &&
+    return (Alignment >= 4 || ST->hasUnalignedScratchAccessEnabled()) &&
            ChainSizeInBytes <= ST->getMaxPrivateElementSize();
   }
   return true;
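For context, here is a minimal sketch (not part of the patch; the function and value names are hypothetical) of the kind of private-memory chain this predicate governs. Before this commit, merely compiling with flat scratch enabled made the load/store vectorizer treat such a misaligned chain as vectorizable; now the subtarget must actually report unaligned scratch accesses as enabled:

; Two adjacent align-2 loads from private (scratch) memory. Whether the
; vectorizer may merge them into one wide load is now decided by
; hasUnalignedScratchAccessEnabled() alone, not by enableFlatScratch().
define i32 @misaligned_private_chain(ptr addrspace(5) %p) {
  %lo = load i32, ptr addrspace(5) %p, align 2
  %gep = getelementptr inbounds i32, ptr addrspace(5) %p, i32 1
  %hi = load i32, ptr addrspace(5) %gep, align 2
  %sum = add i32 %lo, %hi
  ret i32 %sum
}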
3 changes: 1 addition & 2 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1841,8 +1841,7 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
     if (IsFast)
       *IsFast = AlignedBy4;
 
-    return AlignedBy4 || Subtarget->enableFlatScratch() ||
-           Subtarget->hasUnalignedScratchAccessEnabled();
+    return AlignedBy4 || Subtarget->hasUnalignedScratchAccessEnabled();
   }
 
   // So long as they are correct, wide global memory operations perform better
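The same reasoning applies during legalization: allowsMisalignedMemoryAccessesImpl no longer reports under-aligned scratch accesses as allowed just because flat scratch instructions are in use. A minimal repro sketch (hypothetical function name; the llc invocation is an assumption based on the usual AMDGPU feature spelling): on a subtarget with flat scratch but without unaligned scratch access enabled, this load must now be split into naturally aligned pieces, as the test updates below show.

; Assumed invocation: llc -mtriple=amdgcn -mattr=+enable-flat-scratch repro.ll
define double @load_f64_align2(ptr addrspace(5) %in) {
  %x = load double, ptr addrspace(5) %in, align 2
  ret double %x
}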
1,204 changes: 1,081 additions & 123 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll

Large diffs are not rendered by default.

3,222 changes: 2,943 additions & 279 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir

Large diffs are not rendered by default.

3,825 changes: 3,486 additions & 339 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir

Large diffs are not rendered by default.

51 changes: 34 additions & 17 deletions llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
@@ -475,8 +475,14 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    scratch_store_short off, v0, s0 offset:4
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; FLATSCR-NEXT:    scratch_load_dword v0, off, s0
-; FLATSCR-NEXT:    scratch_load_dword v1, off, s0 offset:2
+; FLATSCR-NEXT:    scratch_load_ushort v0, off, s0 offset:2
+; FLATSCR-NEXT:    scratch_load_ushort v3, off, s0
+; FLATSCR-NEXT:    s_waitcnt vmcnt(1)
+; FLATSCR-NEXT:    v_mov_b32_e32 v1, v0
+; FLATSCR-NEXT:    scratch_load_short_d16_hi v1, off, s0 offset:4
+; FLATSCR-NEXT:    s_mov_b32 s0, 0x5040100
+; FLATSCR-NEXT:    s_waitcnt vmcnt(1)
+; FLATSCR-NEXT:    v_perm_b32 v0, v0, v3, s0
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; FLATSCR-NEXT:    s_endpgm
@@ -537,8 +543,13 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i
 ; FLATSCR_GFX10-NEXT:    scratch_store_short off, v0, s0 offset:4
 ; FLATSCR_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; FLATSCR_GFX10-NEXT:    s_clause 0x1
-; FLATSCR_GFX10-NEXT:    scratch_load_dword v0, off, s0
-; FLATSCR_GFX10-NEXT:    scratch_load_dword v1, off, s0 offset:2
+; FLATSCR_GFX10-NEXT:    scratch_load_ushort v0, off, s0 offset:2
+; FLATSCR_GFX10-NEXT:    scratch_load_ushort v3, off, s0
+; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(1)
+; FLATSCR_GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR_GFX10-NEXT:    v_perm_b32 v0, v0, v3, 0x5040100
+; FLATSCR_GFX10-NEXT:    scratch_load_short_d16_hi v1, off, s0 offset:4
 ; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR_GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; FLATSCR_GFX10-NEXT:    s_endpgm
@@ -561,8 +572,13 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i
 ; GFX11-NEXT:    scratch_store_b16 off, v0, off offset:4 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_b32 v0, off, off
-; GFX11-NEXT:    scratch_load_b32 v1, off, off offset:2
+; GFX11-NEXT:    scratch_load_u16 v0, off, off offset:2
+; GFX11-NEXT:    scratch_load_u16 v3, off, off
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x5040100
+; GFX11-NEXT:    scratch_load_d16_hi_b16 v1, off, off offset:4
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX11-NEXT:    s_nop 0
@@ -703,12 +719,12 @@ define <2 x i16> @chain_hi_to_lo_private_other_dep(ptr addrspace(5) %ptr) {
 ; FLATSCR-LABEL: chain_hi_to_lo_private_other_dep:
 ; FLATSCR:       ; %bb.0: ; %bb
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; FLATSCR-NEXT:    scratch_load_dword v0, v0, off
-; FLATSCR-NEXT:    s_mov_b32 s0, 0x7060302
+; FLATSCR-NEXT:    scratch_load_short_d16_hi v1, v0, off
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; FLATSCR-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
 ; FLATSCR-NEXT:    v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0]
-; FLATSCR-NEXT:    v_perm_b32 v0, v1, v0, s0
+; FLATSCR-NEXT:    scratch_load_short_d16 v1, v0, off offset:2
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR-NEXT:    v_mov_b32_e32 v0, v1
 ; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10_DEFAULT-LABEL: chain_hi_to_lo_private_other_dep:
@@ -725,22 +741,23 @@ define <2 x i16> @chain_hi_to_lo_private_other_dep(ptr addrspace(5) %ptr) {
 ; FLATSCR_GFX10-LABEL: chain_hi_to_lo_private_other_dep:
 ; FLATSCR_GFX10:       ; %bb.0: ; %bb
 ; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; FLATSCR_GFX10-NEXT:    scratch_load_dword v0, v0, off
+; FLATSCR_GFX10-NEXT:    scratch_load_short_d16_hi v1, v0, off
 ; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0)
-; FLATSCR_GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
 ; FLATSCR_GFX10-NEXT:    v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0]
-; FLATSCR_GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
+; FLATSCR_GFX10-NEXT:    scratch_load_short_d16 v1, v0, off offset:2
+; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR_GFX10-NEXT:    v_mov_b32_e32 v0, v1
 ; FLATSCR_GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: chain_hi_to_lo_private_other_dep:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    scratch_load_b32 v0, v0, off
+; GFX11-NEXT:    scratch_load_d16_hi_b16 v1, v0, off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
+; GFX11-NEXT:    scratch_load_d16_b16 v1, v0, off offset:2
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %gep_lo = getelementptr inbounds i16, ptr addrspace(5) %ptr, i64 1
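The rewritten sequences lean on the d16 scratch loads, which write only one 16-bit half of a VGPR and leave the other half intact, so two separate halfword loads can be assembled into a <2 x i16> without a v_perm. Roughly, the IR shape being matched looks like the following (a hypothetical reduction, not the full test, which also adds a constant to the high element):

define <2 x i16> @hi_then_lo(ptr addrspace(5) %ptr) {
  %gep_lo = getelementptr inbounds i16, ptr addrspace(5) %ptr, i64 1
  %load_lo = load i16, ptr addrspace(5) %gep_lo, align 2
  %load_hi = load i16, ptr addrspace(5) %ptr, align 2
  %v0 = insertelement <2 x i16> poison, i16 %load_hi, i32 1
  %v1 = insertelement <2 x i16> %v0, i16 %load_lo, i32 0
  ret <2 x i16> %v1
}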
28 changes: 24 additions & 4 deletions llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll
@@ -607,7 +607,14 @@ define amdgpu_kernel void @local_store_align1_v16i8(ptr addrspace(3) %out) #0 {
 ; MUBUF: buffer_load_ubyte
 ; MUBUF: buffer_load_ubyte
 ; MUBUF: buffer_load_ubyte
-; FLATSCR: scratch_load_dwordx2
+; FLATSCR: scratch_load_ubyte
+; FLATSCR: scratch_load_ubyte
+; FLATSCR: scratch_load_ubyte
+; FLATSCR: scratch_load_ubyte
+; FLATSCR: scratch_load_ubyte
+; FLATSCR: scratch_load_ubyte
+; FLATSCR: scratch_load_ubyte
+; FLATSCR: scratch_load_ubyte
 define double @private_load_align1_f64(ptr addrspace(5) %in) {
   %x = load double, ptr addrspace(5) %in, align 1
   ret double %x
@@ -622,7 +629,14 @@ define double @private_load_align1_f64(ptr addrspace(5) %in) {
 ; MUBUF: buffer_store_byte
 ; MUBUF: buffer_store_byte
 ; MUBUF: buffer_store_byte
-; FLATSCR: scratch_store_dwordx2
+; FLATSCR: scratch_store_byte
+; FLATSCR: scratch_store_byte
+; FLATSCR: scratch_store_byte
+; FLATSCR: scratch_store_byte
+; FLATSCR: scratch_store_byte
+; FLATSCR: scratch_store_byte
+; FLATSCR: scratch_store_byte
+; FLATSCR: scratch_store_byte
 define void @private_store_align1_f64(ptr addrspace(5) %out, double %x) #0 {
   store double %x, ptr addrspace(5) %out, align 1
   ret void
@@ -651,7 +665,10 @@ define void @private_store_align4_f64(ptr addrspace(5) %out, double %x) #0 {
 ; MUBUF: buffer_load_ushort
 ; MUBUF: buffer_load_ushort
 ; MUBUF: buffer_load_ushort
-; FLATSCR: scratch_load_dwordx2
+; FLATSCR: scratch_load_ushort
+; FLATSCR: scratch_load_ushort
+; FLATSCR: scratch_load_ushort
+; FLATSCR: scratch_load_ushort
 define double @private_load_align2_f64(ptr addrspace(5) %in) {
   %x = load double, ptr addrspace(5) %in, align 2
   ret double %x
@@ -662,7 +679,10 @@ define double @private_load_align2_f64(ptr addrspace(5) %in) {
 ; MUBUF: buffer_store_short
 ; MUBUF: buffer_store_short
 ; MUBUF: buffer_store_short
-; FLATSCR: scratch_store_dwordx2
+; FLATSCR: scratch_store_short
+; FLATSCR: scratch_store_short
+; FLATSCR: scratch_store_short
+; FLATSCR: scratch_store_short
 define void @private_store_align2_f64(ptr addrspace(5) %out, double %x) #0 {
   store double %x, ptr addrspace(5) %out, align 2
   ret void
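Conversely, wide unaligned scratch accesses remain available on subtargets that support them when unaligned access mode is turned on explicitly. A sketch of opting a single function back in (the "+unaligned-access-mode" spelling is assumed from the AMDGPU target-feature name, and this only helps where the hardware supports unaligned scratch access):

define double @private_load_align1_fast(ptr addrspace(5) %in) #1 {
  %x = load double, ptr addrspace(5) %in, align 1
  ret double %x
}
attributes #1 = { "target-features"="+unaligned-access-mode" }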
