From c4c5fdd933fa2d1f7624d863d05a4fb982b4c074 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Thu, 22 Aug 2024 16:11:12 +0100 Subject: [PATCH] [AMDGPU] Generate checks for vector indexing. NFC. (#105668) This allows combining some test files that were only split because adding new RUN lines introduced too much churn in the checks. --- .../AMDGPU/indirect-addressing-si-gfx9.ll | 67 - .../AMDGPU/indirect-addressing-si-noopt.ll | 63 - .../AMDGPU/indirect-addressing-si-pregfx9.ll | 53 - .../CodeGen/AMDGPU/indirect-addressing-si.ll | 8379 ++++++++++++++++- 4 files changed, 8066 insertions(+), 496 deletions(-) delete mode 100644 llvm/test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll delete mode 100644 llvm/test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll index 31fa32b3475cb7..872a457a3b5c34 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll @@ -2,70 +2,6 @@ ; indexing of vectors. -; Subtest below moved from file test/CodeGen/AMDGPU/indirect-addressing-si.ll -; to avoid gfx9 scheduling induced issues. 
- - -; GCN-LABEL: {{^}}insert_vgpr_offset_multiple_in_block: -; GCN-DAG: s_load_dwordx16 s[[[S_ELT0:[0-9]+]]:[[S_ELT15:[0-9]+]]] -; GCN-DAG: {{buffer|flat|global}}_load_dword [[IDX0:v[0-9]+]] -; GCN-DAG: v_mov_b32 [[INS0:v[0-9]+]], 62 - -; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT15:[0-9]+]], s[[S_ELT15]] -; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT0:[0-9]+]], s[[S_ELT0]] - -; GCN: v_cmp_eq_u32_e32 -; GCN-COUNT-32: v_cndmask_b32 - -; GCN-COUNT-4: buffer_store_dwordx4 -define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in, <16 x i32> %vec0) #0 { -entry: - %id = call i32 @llvm.amdgcn.workitem.id.x() #1 - %id.ext = zext i32 %id to i64 - %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %id.ext - %idx0 = load volatile i32, ptr addrspace(1) %gep - %idx1 = add i32 %idx0, 1 - %live.out.val = call i32 asm sideeffect "v_mov_b32 $0, 62", "=v"() - %vec1 = insertelement <16 x i32> %vec0, i32 %live.out.val, i32 %idx0 - %vec2 = insertelement <16 x i32> %vec1, i32 63, i32 %idx1 - store volatile <16 x i32> %vec2, ptr addrspace(1) %out0 - %cmp = icmp eq i32 %id, 0 - br i1 %cmp, label %bb1, label %bb2 - -bb1: - store volatile i32 %live.out.val, ptr addrspace(1) undef - br label %bb2 - -bb2: - ret void -} - -; Avoid inserting extra v_mov from copies within the vgpr indexing sequence. The -; gpr_idx mode switching sequence is expanded late for this reason. 
- -; GCN-LABEL: {{^}}insert_w_offset_multiple_in_block - -; GCN: s_set_gpr_idx_on -; GCN-NEXT: v_mov_b32_e32 -; GCN-NEXT: s_set_gpr_idx_off - -; GCN: s_set_gpr_idx_on -; GCN-NEXT: v_mov_b32_e32 -; GCN-NOT: v_mov_b32_e32 -; GCN-NEXT: s_set_gpr_idx_off -define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %out1, i32 %in) #0 { -entry: - %add1 = add i32 %in, 1 - %ins1 = insertelement <16 x float> , float 17.0, i32 %add1 - %add2 = add i32 %in, 2 - %ins2 = insertelement <16 x float> %ins1, float 17.0, i32 %add2 - store <16 x float> %ins1, ptr addrspace(1) %out1 - %out2 = getelementptr <16 x float>, ptr addrspace(1) %out1, i32 1 - store <16 x float> %ins2, ptr addrspace(1) %out2 - - ret void -} - declare hidden void @foo() ; For functions with calls, we were not accounting for m0_lo16/m0_hi16 @@ -83,7 +19,4 @@ define amdgpu_kernel void @insertelement_with_call(ptr addrspace(1) %ptr, i32 %i ret void } -declare i32 @llvm.amdgcn.workitem.id.x() #1 -declare void @llvm.amdgcn.s.barrier() #2 - attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll deleted file mode 100644 index 1a72140963d696..00000000000000 --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll +++ /dev/null @@ -1,63 +0,0 @@ -; RUN: llc -O0 -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s - -; FIXME: Merge into indirect-addressing-si.ll - -; Make sure that TwoAddressInstructions keeps src0 as subregister sub0 -; of the tied implicit use and def of the super register. 
- -; CHECK-LABEL: {{^}}insert_wo_offset: -; CHECK: s_load_dword [[IN:s[0-9]+]] -; CHECK: s_mov_b32 m0, [[IN]] -; CHECK: v_movreld_b32_e32 v[[ELT0:[0-9]+]] -; CHECK: buffer_store_dwordx4 -; CHECK: buffer_store_dwordx4 -; CHECK: buffer_store_dwordx4 -; CHECK: buffer_store_dwordx4 -define amdgpu_kernel void @insert_wo_offset(ptr addrspace(1) %out, i32 %in) { -entry: - %ins = insertelement <16 x float> , float 17.0, i32 %in - store <16 x float> %ins, ptr addrspace(1) %out - ret void -} - -; Make sure we don't hit use of undefined register errors when expanding an -; extract with undef index. - -; CHECK-LABEL: {{^}}extract_adjacent_blocks: -; CHECK: s_load_dword [[ARG:s[0-9]+]] -; CHECK: s_cmp_lg_u32 -; CHECK: s_cbranch_scc1 [[BB4:.LBB[0-9]+_[0-9]+]] - -; CHECK: buffer_load_dwordx4 - -; CHECK: s_branch [[ENDBB:.LBB[0-9]+_[0-9]+]] - -; CHECK: [[BB4]]: -; CHECK: buffer_load_dwordx4 - -; CHECK: [[ENDBB]]: -; CHECK: buffer_store_dword -; CHECK: s_endpgm - -define amdgpu_kernel void @extract_adjacent_blocks(i32 %arg) #0 { -bb: - %tmp = icmp eq i32 %arg, 0 - br i1 %tmp, label %bb1, label %bb4 - -bb1: - %tmp2 = load volatile <4 x float>, ptr addrspace(1) undef - %tmp3 = extractelement <4 x float> %tmp2, i32 undef - call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp2) #0 ; Prevent block optimize out - br label %bb7 - -bb4: - %tmp5 = load volatile <4 x float>, ptr addrspace(1) undef - %tmp6 = extractelement <4 x float> %tmp5, i32 undef - call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp5) #0 ; Prevent block optimize out - br label %bb7 - -bb7: - %tmp8 = phi float [ %tmp3, %bb1 ], [ %tmp6, %bb4 ] - store volatile float %tmp8, ptr addrspace(1) undef - ret void -} diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll deleted file mode 100644 index cbb5d9e1692843..00000000000000 --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll +++ /dev/null @@ -1,53 +0,0 @@ -; RUN: 
llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-vgpr-index-mode -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s - -; Tests for indirect addressing on SI, which is implemented using dynamic -; indexing of vectors. - -; Subtest below moved from file test/CodeGen/AMDGPU/indirect-addressing-si.ll -; to avoid gfx9 scheduling induced issues. - - -; GCN-LABEL: {{^}}insert_vgpr_offset_multiple_in_block: -; GCN-DAG: s_load_dwordx16 s[[[S_ELT0:[0-9]+]]:[[S_ELT15:[0-9]+]]] -; GCN-DAG: {{buffer|flat|global}}_load_dword [[IDX0:v[0-9]+]] -; GCN-DAG: v_mov_b32 [[INS0:v[0-9]+]], 62 - -; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT15:[0-9]+]], s[[S_ELT15]] -; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT0:[0-9]+]], s[[S_ELT0]] - -; GCN-DAG: v_add_{{i32|u32}}_e32 [[IDX1:v[0-9]+]], vcc, 1, [[IDX0]] - -; GCN: v_cmp_eq_u32_e32 -; GCN-COUNT-32: v_cndmask_b32 - -; GCN-COUNT-4: buffer_store_dwordx4 -define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in, <16 x i32> %vec0) #0 { -entry: - %id = call i32 @llvm.amdgcn.workitem.id.x() #1 - %id.ext = zext i32 %id to i64 - %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %id.ext - %idx0 = load volatile i32, ptr addrspace(1) %gep - %idx1 = add i32 %idx0, 1 - %live.out.val = call i32 asm sideeffect "v_mov_b32 $0, 62", "=v"() - %vec1 = insertelement <16 x i32> %vec0, i32 %live.out.val, i32 %idx0 - %vec2 = insertelement <16 x i32> %vec1, i32 63, i32 %idx1 - store volatile <16 x i32> %vec2, ptr addrspace(1) %out0 - %cmp = icmp eq i32 %id, 0 - br i1 %cmp, label %bb1, label %bb2 - -bb1: - store volatile i32 %live.out.val, ptr 
addrspace(1) undef - br label %bb2 - -bb2: - ret void -} - -declare i32 @llvm.amdgcn.workitem.id.x() #1 -declare void @llvm.amdgcn.s.barrier() #2 - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } -attributes #2 = { nounwind convergent } diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll index f095aef7a0cc81..c130eb04d02370 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -1,26 +1,197 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-vgpr-index-mode -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,IDXMODE %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,IDXMODE %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=tahiti -O0 < %s | FileCheck -check-prefix=NOOPT %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI-MOVREL %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI,VI-MOVREL %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-vgpr-index-mode < %s | FileCheck -check-prefixes=VI,VI-IDXMODE %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9-IDXMODE %s ; Tests for indirect addressing on SI, which is implemented using dynamic ; indexing of vectors. 
- -; GCN-LABEL: {{^}}extract_w_offset: -; GCN-DAG: s_load_dword [[IN0:s[0-9]+]] -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 4.0 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40400000 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0 -; GCN-DAG: v_mov_b32_e32 [[BASEREG:v[0-9]+]], 1.0 -; GCN-DAG: s_add_i32 [[IN:s[0-9]+]], [[IN0]], 1 - -; MOVREL-DAG: s_mov_b32 m0, [[IN]] -; MOVREL: v_movrels_b32_e32 v{{[0-9]+}}, [[BASEREG]] - -; IDXMODE: s_set_gpr_idx_on [[IN]], gpr_idx(SRC0){{$}} -; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, [[BASEREG]] -; IDXMODE-NEXT: s_set_gpr_idx_off define amdgpu_kernel void @extract_w_offset(ptr addrspace(1) %out, i32 %in) { +; NOOPT-LABEL: extract_w_offset: +; NOOPT: ; %bb.0: ; %entry +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xb +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b32 s7, s1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s5, 0xf000 +; NOOPT-NEXT: s_mov_b32 s6, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s7 +; NOOPT-NEXT: s_mov_b32 s2, s6 +; NOOPT-NEXT: s_mov_b32 s3, s5 +; NOOPT-NEXT: s_mov_b32 s5, 1 +; NOOPT-NEXT: s_add_i32 s4, s4, s5 +; NOOPT-NEXT: s_mov_b32 s5, 0x41800000 +; NOOPT-NEXT: s_mov_b32 s6, 0x41700000 +; NOOPT-NEXT: s_mov_b32 s7, 0x41600000 +; NOOPT-NEXT: s_mov_b32 s8, 0x41500000 +; NOOPT-NEXT: s_mov_b32 s9, 0x41400000 +; NOOPT-NEXT: s_mov_b32 s10, 0x41300000 +; NOOPT-NEXT: s_mov_b32 s11, 0x41200000 +; NOOPT-NEXT: s_mov_b32 s12, 0x41100000 +; NOOPT-NEXT: s_mov_b32 s13, 0x41000000 +; NOOPT-NEXT: s_mov_b32 s14, 0x40e00000 +; NOOPT-NEXT: s_mov_b32 s15, 0x40c00000 +; NOOPT-NEXT: s_mov_b32 s16, 0x40a00000 +; NOOPT-NEXT: s_mov_b32 s17, 4.0 +; NOOPT-NEXT: s_mov_b32 s18, 0x40400000 +; NOOPT-NEXT: s_mov_b32 s19, 2.0 +; NOOPT-NEXT: s_mov_b32 s20, 1.0 +; NOOPT-NEXT: v_mov_b32_e32 v0, s20 +; NOOPT-NEXT: v_mov_b32_e32 v30, s19 +; NOOPT-NEXT: v_mov_b32_e32 v29, s18 +; NOOPT-NEXT: 
v_mov_b32_e32 v28, s17 +; NOOPT-NEXT: v_mov_b32_e32 v27, s16 +; NOOPT-NEXT: v_mov_b32_e32 v26, s15 +; NOOPT-NEXT: v_mov_b32_e32 v25, s14 +; NOOPT-NEXT: v_mov_b32_e32 v24, s13 +; NOOPT-NEXT: v_mov_b32_e32 v23, s12 +; NOOPT-NEXT: v_mov_b32_e32 v22, s11 +; NOOPT-NEXT: v_mov_b32_e32 v21, s10 +; NOOPT-NEXT: v_mov_b32_e32 v20, s9 +; NOOPT-NEXT: v_mov_b32_e32 v19, s8 +; NOOPT-NEXT: v_mov_b32_e32 v18, s7 +; NOOPT-NEXT: v_mov_b32_e32 v17, s6 +; NOOPT-NEXT: v_mov_b32_e32 v16, s5 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v30 +; NOOPT-NEXT: v_mov_b32_e32 v2, v29 +; NOOPT-NEXT: v_mov_b32_e32 v3, v28 +; NOOPT-NEXT: v_mov_b32_e32 v4, v27 +; NOOPT-NEXT: v_mov_b32_e32 v5, v26 +; NOOPT-NEXT: v_mov_b32_e32 v6, v25 +; NOOPT-NEXT: v_mov_b32_e32 v7, v24 +; NOOPT-NEXT: v_mov_b32_e32 v8, v23 +; NOOPT-NEXT: v_mov_b32_e32 v9, v22 +; NOOPT-NEXT: v_mov_b32_e32 v10, v21 +; NOOPT-NEXT: v_mov_b32_e32 v11, v20 +; NOOPT-NEXT: v_mov_b32_e32 v12, v19 +; NOOPT-NEXT: v_mov_b32_e32 v13, v18 +; NOOPT-NEXT: v_mov_b32_e32 v14, v17 +; NOOPT-NEXT: v_mov_b32_e32 v15, v16 +; NOOPT-NEXT: s_mov_b32 m0, s4 +; NOOPT-NEXT: v_movrels_b32_e32 v0, v0 +; NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: extract_w_offset: +; SI-MOVREL: ; %bb.0: ; %entry +; SI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0 +; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s2, -1 +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: s_add_i32 s4, s4, 1 +; SI-MOVREL-NEXT: s_mov_b32 m0, s4 +; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 +; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 +; SI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; 
SI-MOVREL-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v7, 0x41000000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v8, 0x41100000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v9, 0x41200000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v10, 0x41300000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v11, 0x41400000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v12, 0x41500000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v13, 0x41600000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000 +; SI-MOVREL-NEXT: v_movrels_b32_e32 v0, v0 +; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-MOVREL-NEXT: s_endpgm +; +; VI-MOVREL-LABEL: extract_w_offset: +; VI-MOVREL: ; %bb.0: ; %entry +; VI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 +; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; VI-MOVREL-NEXT: s_add_i32 s4, s4, 1 +; VI-MOVREL-NEXT: s_mov_b32 m0, s4 +; VI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v7, 0x41000000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v8, 0x41100000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v9, 0x41200000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v10, 0x41300000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v11, 0x41400000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v12, 0x41500000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, 0x41600000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000 +; VI-MOVREL-NEXT: v_movrels_b32_e32 v2, v0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s1 +; VI-MOVREL-NEXT: flat_store_dword v[0:1], v2 +; VI-MOVREL-NEXT: s_endpgm +; +; VI-IDXMODE-LABEL: extract_w_offset: +; VI-IDXMODE: ; %bb.0: ; %entry +; VI-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c +; 
VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000 +; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; VI-IDXMODE-NEXT: s_add_i32 s4, s4, 1 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, 0x41000000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, 0x41100000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, 0x41200000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, 0x41300000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, 0x41400000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, 0x41500000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, 0x41600000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000 +; VI-IDXMODE-NEXT: s_set_gpr_idx_on s4, gpr_idx(SRC0) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, v0 +; VI-IDXMODE-NEXT: s_set_gpr_idx_off +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s1 +; VI-IDXMODE-NEXT: flat_store_dword v[0:1], v2 +; VI-IDXMODE-NEXT: s_endpgm +; +; GFX9-IDXMODE-LABEL: extract_w_offset: +; GFX9-IDXMODE: ; %bb.0: ; %entry +; GFX9-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: s_add_i32 s4, s4, 1 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, 0x41000000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, 0x41100000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, 
0x41200000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, 0x41300000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, 0x41400000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, 0x41500000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, 0x41600000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s4, gpr_idx(SRC0) +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, v0 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off +; GFX9-IDXMODE-NEXT: global_store_dword v16, v0, s[0:1] +; GFX9-IDXMODE-NEXT: s_endpgm entry: %idx = add i32 %in, 1 %elt = extractelement <16 x float> , i32 %idx @@ -29,24 +200,291 @@ entry: } ; XXX: Could do v_or_b32 directly -; GCN-LABEL: {{^}}extract_w_offset_salu_use_vector: -; GCN-DAG: s_or_b32 -; GCN-DAG: s_or_b32 -; GCN-DAG: s_or_b32 -; GCN-DAG: s_or_b32 -; MOVREL: s_mov_b32 m0 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} - - -; MOVREL: v_movrels_b32_e32 - -; IDXMODE: s_set_gpr_idx_on s{{[0-9]+}}, gpr_idx(SRC0){{$}} -; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} -; IDXMODE-NEXT: s_set_gpr_idx_off define amdgpu_kernel void @extract_w_offset_salu_use_vector(ptr addrspace(1) %out, i32 %in, <16 x i32> %or.val) { +; NOOPT-LABEL: extract_w_offset_salu_use_vector: +; NOOPT: ; %bb.0: ; %entry +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xb +; NOOPT-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x19 +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b32 s7, s1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s5, 0xf000 +; NOOPT-NEXT: s_mov_b32 s6, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s7 +; NOOPT-NEXT: s_mov_b32 s2, s6 +; NOOPT-NEXT: s_mov_b32 s3, s5 +; NOOPT-NEXT: s_mov_b32 s21, 1 +; 
NOOPT-NEXT: s_add_i32 s4, s4, s21 +; NOOPT-NEXT: s_mov_b32 s5, s51 +; NOOPT-NEXT: s_mov_b32 s6, 16 +; NOOPT-NEXT: s_or_b32 s5, s5, s6 +; NOOPT-NEXT: s_mov_b32 s6, s50 +; NOOPT-NEXT: s_mov_b32 s7, 15 +; NOOPT-NEXT: s_or_b32 s6, s6, s7 +; NOOPT-NEXT: s_mov_b32 s7, s49 +; NOOPT-NEXT: s_mov_b32 s8, 14 +; NOOPT-NEXT: s_or_b32 s7, s7, s8 +; NOOPT-NEXT: s_mov_b32 s8, s48 +; NOOPT-NEXT: s_mov_b32 s9, 13 +; NOOPT-NEXT: s_or_b32 s8, s8, s9 +; NOOPT-NEXT: s_mov_b32 s9, s47 +; NOOPT-NEXT: s_mov_b32 s10, 12 +; NOOPT-NEXT: s_or_b32 s9, s9, s10 +; NOOPT-NEXT: s_mov_b32 s10, s46 +; NOOPT-NEXT: s_mov_b32 s11, 11 +; NOOPT-NEXT: s_or_b32 s10, s10, s11 +; NOOPT-NEXT: s_mov_b32 s11, s45 +; NOOPT-NEXT: s_mov_b32 s12, 10 +; NOOPT-NEXT: s_or_b32 s11, s11, s12 +; NOOPT-NEXT: s_mov_b32 s12, s44 +; NOOPT-NEXT: s_mov_b32 s13, 9 +; NOOPT-NEXT: s_or_b32 s12, s12, s13 +; NOOPT-NEXT: s_mov_b32 s13, s43 +; NOOPT-NEXT: s_mov_b32 s14, 8 +; NOOPT-NEXT: s_or_b32 s13, s13, s14 +; NOOPT-NEXT: s_mov_b32 s14, s42 +; NOOPT-NEXT: s_mov_b32 s15, 7 +; NOOPT-NEXT: s_or_b32 s14, s14, s15 +; NOOPT-NEXT: s_mov_b32 s15, s41 +; NOOPT-NEXT: s_mov_b32 s16, 6 +; NOOPT-NEXT: s_or_b32 s15, s15, s16 +; NOOPT-NEXT: s_mov_b32 s16, s40 +; NOOPT-NEXT: s_mov_b32 s17, 5 +; NOOPT-NEXT: s_or_b32 s16, s16, s17 +; NOOPT-NEXT: s_mov_b32 s17, s39 +; NOOPT-NEXT: s_mov_b32 s18, 4 +; NOOPT-NEXT: s_or_b32 s17, s17, s18 +; NOOPT-NEXT: s_mov_b32 s18, s38 +; NOOPT-NEXT: s_mov_b32 s19, 3 +; NOOPT-NEXT: s_or_b32 s18, s18, s19 +; NOOPT-NEXT: s_mov_b32 s19, s37 +; NOOPT-NEXT: s_mov_b32 s20, 2 +; NOOPT-NEXT: s_or_b32 s19, s19, s20 +; NOOPT-NEXT: s_mov_b32 s20, s36 +; NOOPT-NEXT: s_or_b32 s20, s20, s21 +; NOOPT-NEXT: v_mov_b32_e32 v0, s20 +; NOOPT-NEXT: v_mov_b32_e32 v30, s19 +; NOOPT-NEXT: v_mov_b32_e32 v29, s18 +; NOOPT-NEXT: v_mov_b32_e32 v28, s17 +; NOOPT-NEXT: v_mov_b32_e32 v27, s16 +; NOOPT-NEXT: v_mov_b32_e32 v26, s15 +; NOOPT-NEXT: v_mov_b32_e32 v25, s14 +; NOOPT-NEXT: v_mov_b32_e32 v24, s13 +; NOOPT-NEXT: v_mov_b32_e32 v23, s12 +; 
NOOPT-NEXT: v_mov_b32_e32 v22, s11 +; NOOPT-NEXT: v_mov_b32_e32 v21, s10 +; NOOPT-NEXT: v_mov_b32_e32 v20, s9 +; NOOPT-NEXT: v_mov_b32_e32 v19, s8 +; NOOPT-NEXT: v_mov_b32_e32 v18, s7 +; NOOPT-NEXT: v_mov_b32_e32 v17, s6 +; NOOPT-NEXT: v_mov_b32_e32 v16, s5 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v30 +; NOOPT-NEXT: v_mov_b32_e32 v2, v29 +; NOOPT-NEXT: v_mov_b32_e32 v3, v28 +; NOOPT-NEXT: v_mov_b32_e32 v4, v27 +; NOOPT-NEXT: v_mov_b32_e32 v5, v26 +; NOOPT-NEXT: v_mov_b32_e32 v6, v25 +; NOOPT-NEXT: v_mov_b32_e32 v7, v24 +; NOOPT-NEXT: v_mov_b32_e32 v8, v23 +; NOOPT-NEXT: v_mov_b32_e32 v9, v22 +; NOOPT-NEXT: v_mov_b32_e32 v10, v21 +; NOOPT-NEXT: v_mov_b32_e32 v11, v20 +; NOOPT-NEXT: v_mov_b32_e32 v12, v19 +; NOOPT-NEXT: v_mov_b32_e32 v13, v18 +; NOOPT-NEXT: v_mov_b32_e32 v14, v17 +; NOOPT-NEXT: v_mov_b32_e32 v15, v16 +; NOOPT-NEXT: s_mov_b32 m0, s4 +; NOOPT-NEXT: v_movrels_b32_e32 v0, v0 +; NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: extract_w_offset_salu_use_vector: +; SI-MOVREL: ; %bb.0: ; %entry +; SI-MOVREL-NEXT: s_load_dword s20, s[2:3], 0xb +; SI-MOVREL-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 +; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s2, -1 +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: s_add_i32 s20, s20, 1 +; SI-MOVREL-NEXT: s_or_b32 s4, s4, 1 +; SI-MOVREL-NEXT: s_or_b32 s19, s19, 16 +; SI-MOVREL-NEXT: s_or_b32 s18, s18, 15 +; SI-MOVREL-NEXT: s_or_b32 s17, s17, 14 +; SI-MOVREL-NEXT: s_or_b32 s16, s16, 13 +; SI-MOVREL-NEXT: s_or_b32 s15, s15, 12 +; SI-MOVREL-NEXT: s_or_b32 s14, s14, 11 +; SI-MOVREL-NEXT: s_or_b32 s13, s13, 10 +; SI-MOVREL-NEXT: s_or_b32 s12, s12, 9 +; SI-MOVREL-NEXT: s_or_b32 s11, s11, 8 +; SI-MOVREL-NEXT: s_or_b32 s10, s10, 7 +; 
SI-MOVREL-NEXT: s_or_b32 s9, s9, 6 +; SI-MOVREL-NEXT: s_or_b32 s8, s8, 5 +; SI-MOVREL-NEXT: s_or_b32 s7, s7, 4 +; SI-MOVREL-NEXT: s_or_b32 s6, s6, 3 +; SI-MOVREL-NEXT: s_or_b32 s5, s5, 2 +; SI-MOVREL-NEXT: v_mov_b32_e32 v0, s4 +; SI-MOVREL-NEXT: s_mov_b32 m0, s20 +; SI-MOVREL-NEXT: v_mov_b32_e32 v1, s5 +; SI-MOVREL-NEXT: v_mov_b32_e32 v2, s6 +; SI-MOVREL-NEXT: v_mov_b32_e32 v3, s7 +; SI-MOVREL-NEXT: v_mov_b32_e32 v4, s8 +; SI-MOVREL-NEXT: v_mov_b32_e32 v5, s9 +; SI-MOVREL-NEXT: v_mov_b32_e32 v6, s10 +; SI-MOVREL-NEXT: v_mov_b32_e32 v7, s11 +; SI-MOVREL-NEXT: v_mov_b32_e32 v8, s12 +; SI-MOVREL-NEXT: v_mov_b32_e32 v9, s13 +; SI-MOVREL-NEXT: v_mov_b32_e32 v10, s14 +; SI-MOVREL-NEXT: v_mov_b32_e32 v11, s15 +; SI-MOVREL-NEXT: v_mov_b32_e32 v12, s16 +; SI-MOVREL-NEXT: v_mov_b32_e32 v13, s17 +; SI-MOVREL-NEXT: v_mov_b32_e32 v14, s18 +; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s19 +; SI-MOVREL-NEXT: v_movrels_b32_e32 v0, v0 +; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-MOVREL-NEXT: s_endpgm +; +; VI-MOVREL-LABEL: extract_w_offset_salu_use_vector: +; VI-MOVREL: ; %bb.0: ; %entry +; VI-MOVREL-NEXT: s_load_dword s20, s[2:3], 0x2c +; VI-MOVREL-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 +; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; VI-MOVREL-NEXT: s_add_i32 s20, s20, 1 +; VI-MOVREL-NEXT: s_or_b32 s6, s6, 3 +; VI-MOVREL-NEXT: s_or_b32 s5, s5, 2 +; VI-MOVREL-NEXT: s_or_b32 s4, s4, 1 +; VI-MOVREL-NEXT: s_or_b32 s2, s19, 16 +; VI-MOVREL-NEXT: s_or_b32 s3, s18, 15 +; VI-MOVREL-NEXT: s_or_b32 s17, s17, 14 +; VI-MOVREL-NEXT: s_or_b32 s16, s16, 13 +; VI-MOVREL-NEXT: s_or_b32 s15, s15, 12 +; VI-MOVREL-NEXT: s_or_b32 s14, s14, 11 +; VI-MOVREL-NEXT: s_or_b32 s13, s13, 10 +; VI-MOVREL-NEXT: s_or_b32 s12, s12, 9 +; VI-MOVREL-NEXT: s_or_b32 s11, s11, 8 +; VI-MOVREL-NEXT: s_or_b32 s10, s10, 7 +; VI-MOVREL-NEXT: s_or_b32 s9, s9, 6 +; VI-MOVREL-NEXT: s_or_b32 s8, s8, 5 +; VI-MOVREL-NEXT: s_or_b32 s7, s7, 4 +; VI-MOVREL-NEXT: 
v_mov_b32_e32 v0, s4 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v2, s6 +; VI-MOVREL-NEXT: s_mov_b32 m0, s20 +; VI-MOVREL-NEXT: v_mov_b32_e32 v3, s7 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s8 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s9 +; VI-MOVREL-NEXT: v_mov_b32_e32 v6, s10 +; VI-MOVREL-NEXT: v_mov_b32_e32 v7, s11 +; VI-MOVREL-NEXT: v_mov_b32_e32 v8, s12 +; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s13 +; VI-MOVREL-NEXT: v_mov_b32_e32 v10, s14 +; VI-MOVREL-NEXT: v_mov_b32_e32 v11, s15 +; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s16 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s17 +; VI-MOVREL-NEXT: v_mov_b32_e32 v14, s3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v15, s2 +; VI-MOVREL-NEXT: v_movrels_b32_e32 v2, v0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s1 +; VI-MOVREL-NEXT: flat_store_dword v[0:1], v2 +; VI-MOVREL-NEXT: s_endpgm +; +; VI-IDXMODE-LABEL: extract_w_offset_salu_use_vector: +; VI-IDXMODE: ; %bb.0: ; %entry +; VI-IDXMODE-NEXT: s_load_dword s20, s[2:3], 0x2c +; VI-IDXMODE-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 +; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; VI-IDXMODE-NEXT: s_add_i32 s20, s20, 1 +; VI-IDXMODE-NEXT: s_or_b32 s6, s6, 3 +; VI-IDXMODE-NEXT: s_or_b32 s5, s5, 2 +; VI-IDXMODE-NEXT: s_or_b32 s4, s4, 1 +; VI-IDXMODE-NEXT: s_or_b32 s2, s19, 16 +; VI-IDXMODE-NEXT: s_or_b32 s3, s18, 15 +; VI-IDXMODE-NEXT: s_or_b32 s17, s17, 14 +; VI-IDXMODE-NEXT: s_or_b32 s16, s16, 13 +; VI-IDXMODE-NEXT: s_or_b32 s15, s15, 12 +; VI-IDXMODE-NEXT: s_or_b32 s14, s14, 11 +; VI-IDXMODE-NEXT: s_or_b32 s13, s13, 10 +; VI-IDXMODE-NEXT: s_or_b32 s12, s12, 9 +; VI-IDXMODE-NEXT: s_or_b32 s11, s11, 8 +; VI-IDXMODE-NEXT: s_or_b32 s10, s10, 7 +; VI-IDXMODE-NEXT: s_or_b32 s9, s9, 6 +; VI-IDXMODE-NEXT: s_or_b32 s8, s8, 5 +; VI-IDXMODE-NEXT: s_or_b32 s7, s7, 4 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s4 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s5 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, s6 +; 
VI-IDXMODE-NEXT: v_mov_b32_e32 v3, s7 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s8 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s9 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, s10 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, s11 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, s12 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, s13 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, s14 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, s15 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s16 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s17 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, s2 +; VI-IDXMODE-NEXT: s_set_gpr_idx_on s20, gpr_idx(SRC0) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, v0 +; VI-IDXMODE-NEXT: s_set_gpr_idx_off +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s1 +; VI-IDXMODE-NEXT: flat_store_dword v[0:1], v2 +; VI-IDXMODE-NEXT: s_endpgm +; +; GFX9-IDXMODE-LABEL: extract_w_offset_salu_use_vector: +; GFX9-IDXMODE: ; %bb.0: ; %entry +; GFX9-IDXMODE-NEXT: s_load_dword s20, s[2:3], 0x2c +; GFX9-IDXMODE-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0 +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: s_add_i32 s20, s20, 1 +; GFX9-IDXMODE-NEXT: s_or_b32 s4, s4, 1 +; GFX9-IDXMODE-NEXT: s_or_b32 s2, s19, 16 +; GFX9-IDXMODE-NEXT: s_or_b32 s3, s18, 15 +; GFX9-IDXMODE-NEXT: s_or_b32 s17, s17, 14 +; GFX9-IDXMODE-NEXT: s_or_b32 s16, s16, 13 +; GFX9-IDXMODE-NEXT: s_or_b32 s15, s15, 12 +; GFX9-IDXMODE-NEXT: s_or_b32 s14, s14, 11 +; GFX9-IDXMODE-NEXT: s_or_b32 s13, s13, 10 +; GFX9-IDXMODE-NEXT: s_or_b32 s12, s12, 9 +; GFX9-IDXMODE-NEXT: s_or_b32 s11, s11, 8 +; GFX9-IDXMODE-NEXT: s_or_b32 s10, s10, 7 +; GFX9-IDXMODE-NEXT: s_or_b32 s9, s9, 6 +; GFX9-IDXMODE-NEXT: s_or_b32 s8, s8, 5 +; GFX9-IDXMODE-NEXT: s_or_b32 s7, s7, 4 +; GFX9-IDXMODE-NEXT: s_or_b32 s6, s6, 3 +; GFX9-IDXMODE-NEXT: s_or_b32 s5, s5, 2 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 
s5 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, s13 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, s14 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, s15 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, s16 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, s17 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, s3 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, s2 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s20, gpr_idx(SRC0) +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, v0 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off +; GFX9-IDXMODE-NEXT: global_store_dword v16, v0, s[0:1] +; GFX9-IDXMODE-NEXT: s_endpgm entry: %idx = add i32 %in, 1 %vec = or <16 x i32> %or.val, @@ -55,38 +493,371 @@ entry: ret void } -; GCN-LABEL: {{^}}extract_wo_offset: -; GCN-DAG: s_load_dword [[IN:s[0-9]+]] -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 4.0 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40400000 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0 -; GCN-DAG: v_mov_b32_e32 [[BASEREG:v[0-9]+]], 1.0 - -; MOVREL-DAG: s_mov_b32 m0, [[IN]] -; MOVREL: v_movrels_b32_e32 v{{[0-9]+}}, [[BASEREG]] - -; IDXMODE: s_set_gpr_idx_on [[IN]], gpr_idx(SRC0){{$}} -; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, [[BASEREG]] -; IDXMODE-NEXT: s_set_gpr_idx_off define amdgpu_kernel void @extract_wo_offset(ptr addrspace(1) %out, i32 %in) { +; NOOPT-LABEL: extract_wo_offset: +; NOOPT: ; %bb.0: ; %entry +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xb +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b32 s7, s1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s5, 0xf000 +; NOOPT-NEXT: s_mov_b32 s6, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, 
s7 +; NOOPT-NEXT: s_mov_b32 s2, s6 +; NOOPT-NEXT: s_mov_b32 s3, s5 +; NOOPT-NEXT: s_mov_b32 s5, 0x41800000 +; NOOPT-NEXT: s_mov_b32 s6, 0x41700000 +; NOOPT-NEXT: s_mov_b32 s7, 0x41600000 +; NOOPT-NEXT: s_mov_b32 s8, 0x41500000 +; NOOPT-NEXT: s_mov_b32 s9, 0x41400000 +; NOOPT-NEXT: s_mov_b32 s10, 0x41300000 +; NOOPT-NEXT: s_mov_b32 s11, 0x41200000 +; NOOPT-NEXT: s_mov_b32 s12, 0x41100000 +; NOOPT-NEXT: s_mov_b32 s13, 0x41000000 +; NOOPT-NEXT: s_mov_b32 s14, 0x40e00000 +; NOOPT-NEXT: s_mov_b32 s15, 0x40c00000 +; NOOPT-NEXT: s_mov_b32 s16, 0x40a00000 +; NOOPT-NEXT: s_mov_b32 s17, 4.0 +; NOOPT-NEXT: s_mov_b32 s18, 0x40400000 +; NOOPT-NEXT: s_mov_b32 s19, 2.0 +; NOOPT-NEXT: s_mov_b32 s20, 1.0 +; NOOPT-NEXT: v_mov_b32_e32 v0, s20 +; NOOPT-NEXT: v_mov_b32_e32 v30, s19 +; NOOPT-NEXT: v_mov_b32_e32 v29, s18 +; NOOPT-NEXT: v_mov_b32_e32 v28, s17 +; NOOPT-NEXT: v_mov_b32_e32 v27, s16 +; NOOPT-NEXT: v_mov_b32_e32 v26, s15 +; NOOPT-NEXT: v_mov_b32_e32 v25, s14 +; NOOPT-NEXT: v_mov_b32_e32 v24, s13 +; NOOPT-NEXT: v_mov_b32_e32 v23, s12 +; NOOPT-NEXT: v_mov_b32_e32 v22, s11 +; NOOPT-NEXT: v_mov_b32_e32 v21, s10 +; NOOPT-NEXT: v_mov_b32_e32 v20, s9 +; NOOPT-NEXT: v_mov_b32_e32 v19, s8 +; NOOPT-NEXT: v_mov_b32_e32 v18, s7 +; NOOPT-NEXT: v_mov_b32_e32 v17, s6 +; NOOPT-NEXT: v_mov_b32_e32 v16, s5 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v30 +; NOOPT-NEXT: v_mov_b32_e32 v2, v29 +; NOOPT-NEXT: v_mov_b32_e32 v3, v28 +; NOOPT-NEXT: v_mov_b32_e32 v4, v27 +; NOOPT-NEXT: v_mov_b32_e32 v5, v26 +; NOOPT-NEXT: v_mov_b32_e32 v6, v25 +; NOOPT-NEXT: v_mov_b32_e32 v7, v24 +; NOOPT-NEXT: v_mov_b32_e32 v8, v23 +; NOOPT-NEXT: v_mov_b32_e32 v9, v22 +; NOOPT-NEXT: v_mov_b32_e32 v10, v21 +; NOOPT-NEXT: v_mov_b32_e32 v11, v20 +; NOOPT-NEXT: v_mov_b32_e32 v12, v19 +; NOOPT-NEXT: v_mov_b32_e32 v13, v18 +; NOOPT-NEXT: v_mov_b32_e32 v14, v17 +; 
NOOPT-NEXT: v_mov_b32_e32 v15, v16 +; NOOPT-NEXT: s_mov_b32 m0, s4 +; NOOPT-NEXT: v_movrels_b32_e32 v0, v0 +; NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: extract_wo_offset: +; SI-MOVREL: ; %bb.0: ; %entry +; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0 +; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s2, -1 +; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: s_mov_b32 m0, s4 +; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 +; SI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v7, 0x41000000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v8, 0x41100000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v9, 0x41200000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v10, 0x41300000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v11, 0x41400000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v12, 0x41500000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v13, 0x41600000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000 +; SI-MOVREL-NEXT: v_movrels_b32_e32 v0, v0 +; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-MOVREL-NEXT: s_endpgm +; +; VI-MOVREL-LABEL: extract_wo_offset: +; VI-MOVREL: ; %bb.0: ; %entry +; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-MOVREL-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 +; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; VI-MOVREL-NEXT: s_mov_b32 m0, s2 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v7, 
0x41000000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v8, 0x41100000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v9, 0x41200000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v10, 0x41300000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v11, 0x41400000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v12, 0x41500000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, 0x41600000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000 +; VI-MOVREL-NEXT: v_movrels_b32_e32 v2, v0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s1 +; VI-MOVREL-NEXT: flat_store_dword v[0:1], v2 +; VI-MOVREL-NEXT: s_endpgm +; +; VI-IDXMODE-LABEL: extract_wo_offset: +; VI-IDXMODE: ; %bb.0: ; %entry +; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-IDXMODE-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, 0x41000000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, 0x41100000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, 0x41200000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, 0x41300000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, 0x41400000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, 0x41500000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, 0x41600000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000 +; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; VI-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, v0 +; VI-IDXMODE-NEXT: s_set_gpr_idx_off +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s1 +; VI-IDXMODE-NEXT: flat_store_dword v[0:1], v2 +; VI-IDXMODE-NEXT: s_endpgm +; +; GFX9-IDXMODE-LABEL: extract_wo_offset: +; GFX9-IDXMODE: ; %bb.0: ; %entry +; GFX9-IDXMODE-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, 0x41000000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, 0x41100000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, 0x41200000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, 0x41300000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, 0x41400000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, 0x41500000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, 0x41600000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000 +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s4, gpr_idx(SRC0) +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, v0 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off +; GFX9-IDXMODE-NEXT: global_store_dword v16, v0, s[0:1] +; GFX9-IDXMODE-NEXT: s_endpgm entry: %elt = extractelement <16 x float> , i32 %in store float %elt, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}extract_neg_offset_sgpr: -; The offset depends on the register that holds the first element of the vector. 
-; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}} -; MOVREL: v_movrels_b32_e32 v{{[0-9]}}, v0 - -; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}} -; IDXMODE: v_mov_b32_e32 v14, 15 -; IDXMODE: v_mov_b32_e32 v15, 16 -; IDXMODE-NEXT: s_set_gpr_idx_on [[ADD_IDX]], gpr_idx(SRC0){{$}} -; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} -; IDXMODE-NEXT: s_set_gpr_idx_off define amdgpu_kernel void @extract_neg_offset_sgpr(ptr addrspace(1) %out, i32 %offset) { +; NOOPT-LABEL: extract_neg_offset_sgpr: +; NOOPT: ; %bb.0: ; %entry +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xb +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b32 s7, s1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s5, 0xf000 +; NOOPT-NEXT: s_mov_b32 s6, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s7 +; NOOPT-NEXT: s_mov_b32 s2, s6 +; NOOPT-NEXT: s_mov_b32 s3, s5 +; NOOPT-NEXT: s_mov_b32 s5, 16 +; NOOPT-NEXT: s_mov_b32 s6, 15 +; NOOPT-NEXT: s_mov_b32 s7, 14 +; NOOPT-NEXT: s_mov_b32 s8, 13 +; NOOPT-NEXT: s_mov_b32 s9, 12 +; NOOPT-NEXT: s_mov_b32 s10, 11 +; NOOPT-NEXT: s_mov_b32 s11, 10 +; NOOPT-NEXT: s_mov_b32 s12, 9 +; NOOPT-NEXT: s_mov_b32 s13, 8 +; NOOPT-NEXT: s_mov_b32 s14, 7 +; NOOPT-NEXT: s_mov_b32 s15, 6 +; NOOPT-NEXT: s_mov_b32 s16, 5 +; NOOPT-NEXT: s_mov_b32 s17, 3 +; NOOPT-NEXT: s_mov_b32 s18, 2 +; NOOPT-NEXT: s_mov_b32 s19, 1 +; NOOPT-NEXT: s_mov_b32 s20, 0 +; NOOPT-NEXT: v_mov_b32_e32 v0, s20 +; NOOPT-NEXT: v_mov_b32_e32 v30, s19 +; NOOPT-NEXT: v_mov_b32_e32 v29, s18 +; NOOPT-NEXT: v_mov_b32_e32 v28, s17 +; NOOPT-NEXT: v_mov_b32_e32 v27, s16 +; NOOPT-NEXT: v_mov_b32_e32 v26, s15 +; NOOPT-NEXT: v_mov_b32_e32 v25, s14 +; NOOPT-NEXT: v_mov_b32_e32 v24, s13 +; NOOPT-NEXT: v_mov_b32_e32 v23, s12 +; NOOPT-NEXT: v_mov_b32_e32 v22, s11 +; NOOPT-NEXT: v_mov_b32_e32 v21, s10 +; NOOPT-NEXT: v_mov_b32_e32 v20, s9 +; NOOPT-NEXT: 
v_mov_b32_e32 v19, s8 +; NOOPT-NEXT: v_mov_b32_e32 v18, s7 +; NOOPT-NEXT: v_mov_b32_e32 v17, s6 +; NOOPT-NEXT: v_mov_b32_e32 v16, s5 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v30 +; NOOPT-NEXT: v_mov_b32_e32 v2, v29 +; NOOPT-NEXT: v_mov_b32_e32 v3, v28 +; NOOPT-NEXT: v_mov_b32_e32 v4, v27 +; NOOPT-NEXT: v_mov_b32_e32 v5, v26 +; NOOPT-NEXT: v_mov_b32_e32 v6, v25 +; NOOPT-NEXT: v_mov_b32_e32 v7, v24 +; NOOPT-NEXT: v_mov_b32_e32 v8, v23 +; NOOPT-NEXT: v_mov_b32_e32 v9, v22 +; NOOPT-NEXT: v_mov_b32_e32 v10, v21 +; NOOPT-NEXT: v_mov_b32_e32 v11, v20 +; NOOPT-NEXT: v_mov_b32_e32 v12, v19 +; NOOPT-NEXT: v_mov_b32_e32 v13, v18 +; NOOPT-NEXT: v_mov_b32_e32 v14, v17 +; NOOPT-NEXT: v_mov_b32_e32 v15, v16 +; NOOPT-NEXT: s_add_i32 m0, s4, 0xfffffe00 +; NOOPT-NEXT: v_movrels_b32_e32 v0, v0 +; NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: extract_neg_offset_sgpr: +; SI-MOVREL: ; %bb.0: ; %entry +; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-MOVREL-NEXT: v_mov_b32_e32 v0, 0 +; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s2, -1 +; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 1 +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: s_add_i32 m0, s4, 0xfffffe00 +; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 2 +; SI-MOVREL-NEXT: v_mov_b32_e32 v3, 3 +; SI-MOVREL-NEXT: v_mov_b32_e32 v4, 5 +; SI-MOVREL-NEXT: v_mov_b32_e32 v5, 6 +; SI-MOVREL-NEXT: v_mov_b32_e32 v6, 7 +; SI-MOVREL-NEXT: v_mov_b32_e32 v7, 8 +; SI-MOVREL-NEXT: v_mov_b32_e32 v8, 9 +; SI-MOVREL-NEXT: v_mov_b32_e32 v9, 10 +; SI-MOVREL-NEXT: v_mov_b32_e32 v10, 11 +; SI-MOVREL-NEXT: v_mov_b32_e32 v11, 12 +; SI-MOVREL-NEXT: v_mov_b32_e32 v12, 13 +; SI-MOVREL-NEXT: v_mov_b32_e32 v13, 14 +; SI-MOVREL-NEXT: v_mov_b32_e32 v14, 15 +; SI-MOVREL-NEXT: v_mov_b32_e32 
v15, 16 +; SI-MOVREL-NEXT: v_movrels_b32_e32 v0, v0 +; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-MOVREL-NEXT: s_endpgm +; +; VI-MOVREL-LABEL: extract_neg_offset_sgpr: +; VI-MOVREL: ; %bb.0: ; %entry +; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-MOVREL-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 1 +; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 2 +; VI-MOVREL-NEXT: v_mov_b32_e32 v3, 3 +; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; VI-MOVREL-NEXT: s_add_i32 m0, s2, 0xfffffe00 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, 6 +; VI-MOVREL-NEXT: v_mov_b32_e32 v6, 7 +; VI-MOVREL-NEXT: v_mov_b32_e32 v7, 8 +; VI-MOVREL-NEXT: v_mov_b32_e32 v8, 9 +; VI-MOVREL-NEXT: v_mov_b32_e32 v9, 10 +; VI-MOVREL-NEXT: v_mov_b32_e32 v10, 11 +; VI-MOVREL-NEXT: v_mov_b32_e32 v11, 12 +; VI-MOVREL-NEXT: v_mov_b32_e32 v12, 13 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, 14 +; VI-MOVREL-NEXT: v_mov_b32_e32 v14, 15 +; VI-MOVREL-NEXT: v_mov_b32_e32 v15, 16 +; VI-MOVREL-NEXT: v_movrels_b32_e32 v2, v0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s1 +; VI-MOVREL-NEXT: flat_store_dword v[0:1], v2 +; VI-MOVREL-NEXT: s_endpgm +; +; VI-IDXMODE-LABEL: extract_neg_offset_sgpr: +; VI-IDXMODE: ; %bb.0: ; %entry +; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-IDXMODE-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, 1 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, 2 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, 3 +; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; VI-IDXMODE-NEXT: s_addk_i32 s2, 0xfe00 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, 5 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, 6 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, 7 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, 8 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, 9 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, 10 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, 11 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, 
12 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, 13 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, 14 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, 15 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, 16 +; VI-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, v0 +; VI-IDXMODE-NEXT: s_set_gpr_idx_off +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s1 +; VI-IDXMODE-NEXT: flat_store_dword v[0:1], v2 +; VI-IDXMODE-NEXT: s_endpgm +; +; GFX9-IDXMODE-LABEL: extract_neg_offset_sgpr: +; GFX9-IDXMODE: ; %bb.0: ; %entry +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 1 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, 2 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, 3 +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: s_addk_i32 s4, 0xfe00 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, 5 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, 6 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, 7 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, 8 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, 9 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, 10 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, 11 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, 12 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, 13 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, 14 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, 15 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, 16 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s4, gpr_idx(SRC0) +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off +; GFX9-IDXMODE-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-IDXMODE-NEXT: s_endpgm entry: %index = add i32 %offset, -512 %value = extractelement <16 x i32> , i32 %index @@ -94,32 +865,293 @@ entry: ret void } -; GCN-LABEL: {{^}}extract_neg_offset_sgpr_loaded: -; The offset depends on the register that holds the first element of the vector. 
-; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}} -; MOVREL: v_movrels_b32_e32 v{{[0-9]}}, v0 - -; IDXMODE-DAG: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}} -; IDXMODE-DAG: v_mov_b32_e32 v0, -; IDXMODE: v_mov_b32_e32 v1, -; IDXMODE: v_mov_b32_e32 v2, -; IDXMODE: v_mov_b32_e32 v3, -; IDXMODE: v_mov_b32_e32 v4, -; IDXMODE: v_mov_b32_e32 v5, -; IDXMODE: v_mov_b32_e32 v6, -; IDXMODE: v_mov_b32_e32 v7, -; IDXMODE: v_mov_b32_e32 v8, -; IDXMODE: v_mov_b32_e32 v9, -; IDXMODE: v_mov_b32_e32 v10, -; IDXMODE: v_mov_b32_e32 v11, -; IDXMODE: v_mov_b32_e32 v12, -; IDXMODE: v_mov_b32_e32 v13, -; IDXMODE: v_mov_b32_e32 v14, -; IDXMODE: v_mov_b32_e32 v15, -; IDXMODE-NEXT: s_set_gpr_idx_on [[ADD_IDX]], gpr_idx(SRC0){{$}} -; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} -; IDXMODE-NEXT: s_set_gpr_idx_off define amdgpu_kernel void @extract_neg_offset_sgpr_loaded(ptr addrspace(1) %out, <16 x i32> %vec0, <16 x i32> %vec1, i32 %offset) { +; NOOPT-LABEL: extract_neg_offset_sgpr_loaded: +; NOOPT: ; %bb.0: ; %entry +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; NOOPT-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x19 +; NOOPT-NEXT: s_load_dwordx16 s[52:67], s[2:3], 0x29 +; NOOPT-NEXT: s_load_dword s4, s[2:3], 0x39 +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b32 s7, s1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s5, 0xf000 +; NOOPT-NEXT: s_mov_b32 s6, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s7 +; NOOPT-NEXT: s_mov_b32 s2, s6 +; NOOPT-NEXT: s_mov_b32 s3, s5 +; NOOPT-NEXT: s_mov_b32 s6, s67 +; NOOPT-NEXT: s_mov_b32 s5, s51 +; NOOPT-NEXT: s_or_b32 s5, s5, s6 +; NOOPT-NEXT: s_mov_b32 s7, s66 +; NOOPT-NEXT: s_mov_b32 s6, s50 +; NOOPT-NEXT: s_or_b32 s6, s6, s7 +; NOOPT-NEXT: s_mov_b32 s8, s65 +; NOOPT-NEXT: s_mov_b32 s7, s49 +; NOOPT-NEXT: s_or_b32 s7, s7, s8 +; NOOPT-NEXT: s_mov_b32 s9, s64 +; NOOPT-NEXT: s_mov_b32 s8, s48 +; NOOPT-NEXT: s_or_b32 s8, 
s8, s9 +; NOOPT-NEXT: s_mov_b32 s10, s63 +; NOOPT-NEXT: s_mov_b32 s9, s47 +; NOOPT-NEXT: s_or_b32 s9, s9, s10 +; NOOPT-NEXT: s_mov_b32 s11, s62 +; NOOPT-NEXT: s_mov_b32 s10, s46 +; NOOPT-NEXT: s_or_b32 s10, s10, s11 +; NOOPT-NEXT: s_mov_b32 s12, s61 +; NOOPT-NEXT: s_mov_b32 s11, s45 +; NOOPT-NEXT: s_or_b32 s11, s11, s12 +; NOOPT-NEXT: s_mov_b32 s13, s60 +; NOOPT-NEXT: s_mov_b32 s12, s44 +; NOOPT-NEXT: s_or_b32 s12, s12, s13 +; NOOPT-NEXT: s_mov_b32 s14, s59 +; NOOPT-NEXT: s_mov_b32 s13, s43 +; NOOPT-NEXT: s_or_b32 s13, s13, s14 +; NOOPT-NEXT: s_mov_b32 s15, s58 +; NOOPT-NEXT: s_mov_b32 s14, s42 +; NOOPT-NEXT: s_or_b32 s14, s14, s15 +; NOOPT-NEXT: s_mov_b32 s16, s57 +; NOOPT-NEXT: s_mov_b32 s15, s41 +; NOOPT-NEXT: s_or_b32 s15, s15, s16 +; NOOPT-NEXT: s_mov_b32 s17, s56 +; NOOPT-NEXT: s_mov_b32 s16, s40 +; NOOPT-NEXT: s_or_b32 s16, s16, s17 +; NOOPT-NEXT: s_mov_b32 s18, s55 +; NOOPT-NEXT: s_mov_b32 s17, s39 +; NOOPT-NEXT: s_or_b32 s17, s17, s18 +; NOOPT-NEXT: s_mov_b32 s19, s54 +; NOOPT-NEXT: s_mov_b32 s18, s38 +; NOOPT-NEXT: s_or_b32 s18, s18, s19 +; NOOPT-NEXT: s_mov_b32 s20, s53 +; NOOPT-NEXT: s_mov_b32 s19, s37 +; NOOPT-NEXT: s_or_b32 s19, s19, s20 +; NOOPT-NEXT: s_mov_b32 s21, s52 +; NOOPT-NEXT: s_mov_b32 s20, s36 +; NOOPT-NEXT: s_or_b32 s20, s20, s21 +; NOOPT-NEXT: v_mov_b32_e32 v0, s20 +; NOOPT-NEXT: v_mov_b32_e32 v30, s19 +; NOOPT-NEXT: v_mov_b32_e32 v29, s18 +; NOOPT-NEXT: v_mov_b32_e32 v28, s17 +; NOOPT-NEXT: v_mov_b32_e32 v27, s16 +; NOOPT-NEXT: v_mov_b32_e32 v26, s15 +; NOOPT-NEXT: v_mov_b32_e32 v25, s14 +; NOOPT-NEXT: v_mov_b32_e32 v24, s13 +; NOOPT-NEXT: v_mov_b32_e32 v23, s12 +; NOOPT-NEXT: v_mov_b32_e32 v22, s11 +; NOOPT-NEXT: v_mov_b32_e32 v21, s10 +; NOOPT-NEXT: v_mov_b32_e32 v20, s9 +; NOOPT-NEXT: v_mov_b32_e32 v19, s8 +; NOOPT-NEXT: v_mov_b32_e32 v18, s7 +; NOOPT-NEXT: v_mov_b32_e32 v17, s6 +; NOOPT-NEXT: v_mov_b32_e32 v16, s5 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v30 +; NOOPT-NEXT: v_mov_b32_e32 v2, v29 +; NOOPT-NEXT: v_mov_b32_e32 v3, v28 +; NOOPT-NEXT: v_mov_b32_e32 v4, v27 +; NOOPT-NEXT: v_mov_b32_e32 v5, v26 +; NOOPT-NEXT: v_mov_b32_e32 v6, v25 +; NOOPT-NEXT: v_mov_b32_e32 v7, v24 +; NOOPT-NEXT: v_mov_b32_e32 v8, v23 +; NOOPT-NEXT: v_mov_b32_e32 v9, v22 +; NOOPT-NEXT: v_mov_b32_e32 v10, v21 +; NOOPT-NEXT: v_mov_b32_e32 v11, v20 +; NOOPT-NEXT: v_mov_b32_e32 v12, v19 +; NOOPT-NEXT: v_mov_b32_e32 v13, v18 +; NOOPT-NEXT: v_mov_b32_e32 v14, v17 +; NOOPT-NEXT: v_mov_b32_e32 v15, v16 +; NOOPT-NEXT: s_add_i32 m0, s4, 0xfffffe00 +; NOOPT-NEXT: v_movrels_b32_e32 v0, v0 +; NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: extract_neg_offset_sgpr_loaded: +; SI-MOVREL: ; %bb.0: ; %entry +; SI-MOVREL-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 +; SI-MOVREL-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x29 +; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-MOVREL-NEXT: s_load_dword s20, s[2:3], 0x39 +; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s2, -1 +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: s_or_b32 s4, s4, s36 +; SI-MOVREL-NEXT: s_or_b32 s19, s19, s51 +; SI-MOVREL-NEXT: s_or_b32 s18, s18, s50 +; SI-MOVREL-NEXT: s_or_b32 s17, s17, s49 +; SI-MOVREL-NEXT: s_or_b32 s16, s16, s48 +; SI-MOVREL-NEXT: s_or_b32 s15, s15, s47 +; SI-MOVREL-NEXT: s_or_b32 s14, s14, s46 +; SI-MOVREL-NEXT: s_or_b32 s13, s13, s45 +; SI-MOVREL-NEXT: s_or_b32 s12, s12, s44 +; SI-MOVREL-NEXT: s_or_b32 s11, s11, s43 +; SI-MOVREL-NEXT: s_or_b32 s10, s10, s42 +; SI-MOVREL-NEXT: s_or_b32 s9, s9, s41 +; SI-MOVREL-NEXT: s_or_b32 s8, s8, s40 +; SI-MOVREL-NEXT: s_or_b32 s7, s7, s39 +; SI-MOVREL-NEXT: s_or_b32 s6, s6, s38 +; SI-MOVREL-NEXT: s_or_b32 s5, s5, s37 +; SI-MOVREL-NEXT: v_mov_b32_e32 v0, s4 +; SI-MOVREL-NEXT: s_add_i32 m0, s20, 
0xfffffe00 +; SI-MOVREL-NEXT: v_mov_b32_e32 v1, s5 +; SI-MOVREL-NEXT: v_mov_b32_e32 v2, s6 +; SI-MOVREL-NEXT: v_mov_b32_e32 v3, s7 +; SI-MOVREL-NEXT: v_mov_b32_e32 v4, s8 +; SI-MOVREL-NEXT: v_mov_b32_e32 v5, s9 +; SI-MOVREL-NEXT: v_mov_b32_e32 v6, s10 +; SI-MOVREL-NEXT: v_mov_b32_e32 v7, s11 +; SI-MOVREL-NEXT: v_mov_b32_e32 v8, s12 +; SI-MOVREL-NEXT: v_mov_b32_e32 v9, s13 +; SI-MOVREL-NEXT: v_mov_b32_e32 v10, s14 +; SI-MOVREL-NEXT: v_mov_b32_e32 v11, s15 +; SI-MOVREL-NEXT: v_mov_b32_e32 v12, s16 +; SI-MOVREL-NEXT: v_mov_b32_e32 v13, s17 +; SI-MOVREL-NEXT: v_mov_b32_e32 v14, s18 +; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s19 +; SI-MOVREL-NEXT: v_movrels_b32_e32 v0, v0 +; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-MOVREL-NEXT: s_endpgm +; +; VI-MOVREL-LABEL: extract_neg_offset_sgpr_loaded: +; VI-MOVREL: ; %bb.0: ; %entry +; VI-MOVREL-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 +; VI-MOVREL-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0xa4 +; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-MOVREL-NEXT: s_load_dword s2, s[2:3], 0xe4 +; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; VI-MOVREL-NEXT: s_or_b32 s6, s6, s38 +; VI-MOVREL-NEXT: s_or_b32 s5, s5, s37 +; VI-MOVREL-NEXT: s_or_b32 s4, s4, s36 +; VI-MOVREL-NEXT: s_or_b32 s3, s19, s51 +; VI-MOVREL-NEXT: s_or_b32 s18, s18, s50 +; VI-MOVREL-NEXT: s_or_b32 s17, s17, s49 +; VI-MOVREL-NEXT: s_or_b32 s16, s16, s48 +; VI-MOVREL-NEXT: s_or_b32 s15, s15, s47 +; VI-MOVREL-NEXT: s_or_b32 s14, s14, s46 +; VI-MOVREL-NEXT: s_or_b32 s13, s13, s45 +; VI-MOVREL-NEXT: s_or_b32 s12, s12, s44 +; VI-MOVREL-NEXT: s_or_b32 s11, s11, s43 +; VI-MOVREL-NEXT: s_or_b32 s10, s10, s42 +; VI-MOVREL-NEXT: s_or_b32 s9, s9, s41 +; VI-MOVREL-NEXT: s_or_b32 s8, s8, s40 +; VI-MOVREL-NEXT: s_or_b32 s7, s7, s39 +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s4 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v2, s6 +; VI-MOVREL-NEXT: s_add_i32 m0, s2, 0xfffffe00 +; VI-MOVREL-NEXT: v_mov_b32_e32 v3, s7 +; VI-MOVREL-NEXT: 
v_mov_b32_e32 v4, s8 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s9 +; VI-MOVREL-NEXT: v_mov_b32_e32 v6, s10 +; VI-MOVREL-NEXT: v_mov_b32_e32 v7, s11 +; VI-MOVREL-NEXT: v_mov_b32_e32 v8, s12 +; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s13 +; VI-MOVREL-NEXT: v_mov_b32_e32 v10, s14 +; VI-MOVREL-NEXT: v_mov_b32_e32 v11, s15 +; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s16 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s17 +; VI-MOVREL-NEXT: v_mov_b32_e32 v14, s18 +; VI-MOVREL-NEXT: v_mov_b32_e32 v15, s3 +; VI-MOVREL-NEXT: v_movrels_b32_e32 v2, v0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s1 +; VI-MOVREL-NEXT: flat_store_dword v[0:1], v2 +; VI-MOVREL-NEXT: s_endpgm +; +; VI-IDXMODE-LABEL: extract_neg_offset_sgpr_loaded: +; VI-IDXMODE: ; %bb.0: ; %entry +; VI-IDXMODE-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 +; VI-IDXMODE-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0xa4 +; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-IDXMODE-NEXT: s_load_dword s2, s[2:3], 0xe4 +; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; VI-IDXMODE-NEXT: s_or_b32 s6, s6, s38 +; VI-IDXMODE-NEXT: s_or_b32 s5, s5, s37 +; VI-IDXMODE-NEXT: s_or_b32 s4, s4, s36 +; VI-IDXMODE-NEXT: s_or_b32 s3, s19, s51 +; VI-IDXMODE-NEXT: s_or_b32 s18, s18, s50 +; VI-IDXMODE-NEXT: s_or_b32 s17, s17, s49 +; VI-IDXMODE-NEXT: s_or_b32 s16, s16, s48 +; VI-IDXMODE-NEXT: s_or_b32 s15, s15, s47 +; VI-IDXMODE-NEXT: s_or_b32 s14, s14, s46 +; VI-IDXMODE-NEXT: s_or_b32 s13, s13, s45 +; VI-IDXMODE-NEXT: s_or_b32 s12, s12, s44 +; VI-IDXMODE-NEXT: s_or_b32 s11, s11, s43 +; VI-IDXMODE-NEXT: s_or_b32 s10, s10, s42 +; VI-IDXMODE-NEXT: s_or_b32 s9, s9, s41 +; VI-IDXMODE-NEXT: s_or_b32 s8, s8, s40 +; VI-IDXMODE-NEXT: s_or_b32 s7, s7, s39 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s4 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s5 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, s6 +; VI-IDXMODE-NEXT: s_addk_i32 s2, 0xfe00 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, s7 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s8 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s9 +; 
VI-IDXMODE-NEXT: v_mov_b32_e32 v6, s10 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, s11 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, s12 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, s13 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, s14 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, s15 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s16 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s17 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, s18 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, s3 +; VI-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, v0 +; VI-IDXMODE-NEXT: s_set_gpr_idx_off +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s1 +; VI-IDXMODE-NEXT: flat_store_dword v[0:1], v2 +; VI-IDXMODE-NEXT: s_endpgm +; +; GFX9-IDXMODE-LABEL: extract_neg_offset_sgpr_loaded: +; GFX9-IDXMODE: ; %bb.0: ; %entry +; GFX9-IDXMODE-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 +; GFX9-IDXMODE-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0xa4 +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: s_load_dword s20, s[2:3], 0xe4 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0 +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: s_or_b32 s4, s4, s36 +; GFX9-IDXMODE-NEXT: s_or_b32 s2, s19, s51 +; GFX9-IDXMODE-NEXT: s_or_b32 s3, s18, s50 +; GFX9-IDXMODE-NEXT: s_or_b32 s17, s17, s49 +; GFX9-IDXMODE-NEXT: s_or_b32 s16, s16, s48 +; GFX9-IDXMODE-NEXT: s_or_b32 s15, s15, s47 +; GFX9-IDXMODE-NEXT: s_or_b32 s14, s14, s46 +; GFX9-IDXMODE-NEXT: s_or_b32 s13, s13, s45 +; GFX9-IDXMODE-NEXT: s_or_b32 s12, s12, s44 +; GFX9-IDXMODE-NEXT: s_or_b32 s11, s11, s43 +; GFX9-IDXMODE-NEXT: s_or_b32 s10, s10, s42 +; GFX9-IDXMODE-NEXT: s_or_b32 s9, s9, s41 +; GFX9-IDXMODE-NEXT: s_or_b32 s8, s8, s40 +; GFX9-IDXMODE-NEXT: s_or_b32 s7, s7, s39 +; GFX9-IDXMODE-NEXT: s_or_b32 s6, s6, s38 +; GFX9-IDXMODE-NEXT: s_or_b32 s5, s5, s37 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-IDXMODE-NEXT: s_addk_i32 s20, 0xfe00 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 
v2, s6 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, s13 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, s14 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, s15 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, s16 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, s17 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, s3 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, s2 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s20, gpr_idx(SRC0) +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, v0 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off +; GFX9-IDXMODE-NEXT: global_store_dword v16, v0, s[0:1] +; GFX9-IDXMODE-NEXT: s_endpgm entry: %index = add i32 %offset, -512 %or = or <16 x i32> %vec0, %vec1 @@ -128,25 +1160,350 @@ entry: ret void } -; GCN-LABEL: {{^}}extract_neg_offset_vgpr: -; The offset depends on the register that holds the first element of the vector. 
- -; GCN: v_cmp_eq_u32_e32 -; GCN-COUNT-14: v_cndmask_b32 -; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], 16 -; GCN: buffer_store_dword [[RESULT]] define amdgpu_kernel void @extract_neg_offset_vgpr(ptr addrspace(1) %out) { +; NOOPT-LABEL: extract_neg_offset_vgpr: +; NOOPT: ; %bb.0: ; %entry +; NOOPT-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 +; NOOPT-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 +; NOOPT-NEXT: s_mov_b32 s22, -1 +; NOOPT-NEXT: s_mov_b32 s23, 0xe8f000 +; NOOPT-NEXT: s_add_u32 s20, s20, s9 +; NOOPT-NEXT: s_addc_u32 s21, s21, 0 +; NOOPT-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane +; NOOPT-NEXT: v_mov_b32_e32 v1, v0 +; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 +; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[16:17] +; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 offset:68 ; 4-byte Folded Spill +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b32 s6, s1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s4, 0xf000 +; NOOPT-NEXT: s_mov_b32 s5, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s6 +; NOOPT-NEXT: s_mov_b32 s2, s5 +; NOOPT-NEXT: s_mov_b32 s3, s4 +; NOOPT-NEXT: s_waitcnt vmcnt(1) +; NOOPT-NEXT: v_writelane_b32 v0, s0, 0 +; NOOPT-NEXT: v_writelane_b32 v0, s1, 1 +; NOOPT-NEXT: v_writelane_b32 v0, s2, 2 +; NOOPT-NEXT: v_writelane_b32 v0, s3, 3 +; NOOPT-NEXT: s_mov_b32 s0, 16 +; NOOPT-NEXT: s_mov_b32 s1, 15 +; NOOPT-NEXT: s_mov_b32 s2, 14 +; NOOPT-NEXT: s_mov_b32 s3, 13 +; NOOPT-NEXT: s_mov_b32 s4, 12 +; NOOPT-NEXT: s_mov_b32 s5, 11 +; NOOPT-NEXT: s_mov_b32 s6, 10 +; NOOPT-NEXT: s_mov_b32 s7, 9 +; NOOPT-NEXT: s_mov_b32 s8, 8 +; NOOPT-NEXT: s_mov_b32 s9, 7 +; NOOPT-NEXT: s_mov_b32 s10, 6 +; NOOPT-NEXT: s_mov_b32 s11, 5 +; NOOPT-NEXT: s_mov_b32 s12, 3 +; NOOPT-NEXT: s_mov_b32 s13, 2 +; NOOPT-NEXT: s_mov_b32 s14, 1 +; 
NOOPT-NEXT: s_mov_b32 s15, 0 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v1, s15 +; NOOPT-NEXT: v_mov_b32_e32 v31, s14 +; NOOPT-NEXT: v_mov_b32_e32 v30, s13 +; NOOPT-NEXT: v_mov_b32_e32 v29, s12 +; NOOPT-NEXT: v_mov_b32_e32 v28, s11 +; NOOPT-NEXT: v_mov_b32_e32 v27, s10 +; NOOPT-NEXT: v_mov_b32_e32 v26, s9 +; NOOPT-NEXT: v_mov_b32_e32 v25, s8 +; NOOPT-NEXT: v_mov_b32_e32 v24, s7 +; NOOPT-NEXT: v_mov_b32_e32 v23, s6 +; NOOPT-NEXT: v_mov_b32_e32 v22, s5 +; NOOPT-NEXT: v_mov_b32_e32 v21, s4 +; NOOPT-NEXT: v_mov_b32_e32 v20, s3 +; NOOPT-NEXT: v_mov_b32_e32 v19, s2 +; NOOPT-NEXT: v_mov_b32_e32 v18, s1 +; NOOPT-NEXT: v_mov_b32_e32 v17, s0 +; NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v2, v31 +; NOOPT-NEXT: v_mov_b32_e32 v3, v30 +; NOOPT-NEXT: v_mov_b32_e32 v4, v29 +; NOOPT-NEXT: v_mov_b32_e32 v5, v28 +; NOOPT-NEXT: v_mov_b32_e32 v6, v27 +; NOOPT-NEXT: v_mov_b32_e32 v7, v26 +; NOOPT-NEXT: v_mov_b32_e32 v8, v25 +; NOOPT-NEXT: v_mov_b32_e32 v9, v24 +; NOOPT-NEXT: v_mov_b32_e32 v10, v23 +; NOOPT-NEXT: v_mov_b32_e32 v11, v22 +; NOOPT-NEXT: v_mov_b32_e32 v12, v21 +; NOOPT-NEXT: v_mov_b32_e32 v13, v20 +; NOOPT-NEXT: v_mov_b32_e32 v14, v19 +; NOOPT-NEXT: v_mov_b32_e32 v15, v18 +; NOOPT-NEXT: v_mov_b32_e32 v16, v17 +; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 offset:4 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v2, off, s[20:23], 0 offset:8 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:12 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[20:23], 0 offset:16 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[20:23], 0 offset:20 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[20:23], 0 offset:24 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[20:23], 
0 offset:28 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[20:23], 0 offset:32 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[20:23], 0 offset:36 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[20:23], 0 offset:40 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[20:23], 0 offset:44 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[20:23], 0 offset:48 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[20:23], 0 offset:52 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[20:23], 0 offset:56 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[20:23], 0 offset:60 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v16, off, s[20:23], 0 offset:64 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 s[0:1], exec +; NOOPT-NEXT: v_writelane_b32 v0, s0, 4 +; NOOPT-NEXT: v_writelane_b32 v0, s1, 5 +; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 +; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 exec, s[16:17] +; NOOPT-NEXT: ; implicit-def: $vgpr0 +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 +; NOOPT-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 +; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[16:17] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 6 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 7 +; NOOPT-NEXT: buffer_load_dword v18, off, s[20:23], 0 offset:72 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v1, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v2, off, s[20:23], 0 offset:8 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v3, off, s[20:23], 0 offset:12 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v4, off, s[20:23], 0 offset:16 ; 4-byte Folded Reload +; 
NOOPT-NEXT: buffer_load_dword v5, off, s[20:23], 0 offset:20 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v6, off, s[20:23], 0 offset:24 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v7, off, s[20:23], 0 offset:28 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v8, off, s[20:23], 0 offset:32 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v9, off, s[20:23], 0 offset:36 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v10, off, s[20:23], 0 offset:40 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v11, off, s[20:23], 0 offset:44 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v12, off, s[20:23], 0 offset:48 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v13, off, s[20:23], 0 offset:52 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v14, off, s[20:23], 0 offset:56 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v15, off, s[20:23], 0 offset:60 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v16, off, s[20:23], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v17, off, s[20:23], 0 offset:68 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readfirstlane_b32 s2, v17 +; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v17 +; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; NOOPT-NEXT: s_add_i32 m0, s2, 0xfffffe00 +; NOOPT-NEXT: v_movrels_b32_e32 v1, v1 +; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 offset:76 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 offset:72 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 s[2:3], s[0:1] +; NOOPT-NEXT: v_writelane_b32 v0, s2, 6 +; NOOPT-NEXT: v_writelane_b32 v0, s3, 7 +; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 +; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 exec, s[16:17] +; NOOPT-NEXT: s_xor_b64 exec, exec, s[0:1] +; NOOPT-NEXT: s_cbranch_execnz .LBB5_1 +; NOOPT-NEXT: ; %bb.2: +; NOOPT-NEXT: s_or_saveexec_b64 
s[16:17], -1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[16:17] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 4 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 5 +; NOOPT-NEXT: s_mov_b64 exec, s[0:1] +; NOOPT-NEXT: ; %bb.3: +; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 +; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[16:17] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 0 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 1 +; NOOPT-NEXT: v_readlane_b32 s2, v0, 2 +; NOOPT-NEXT: v_readlane_b32 s3, v0, 3 +; NOOPT-NEXT: buffer_load_dword v1, off, s[20:23], 0 offset:76 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; NOOPT-NEXT: ; kill: killed $vgpr0 +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: extract_neg_offset_vgpr: +; SI-MOVREL: ; %bb.0: ; %entry +; SI-MOVREL-NEXT: v_add_i32_e32 v0, vcc, 0xfffffe00, v0 +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 2, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v1, 2, v1, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 3, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v1, 3, v1, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 4, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 5, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v1, 6, v1, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 6, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v1, 7, v1, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 7, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v1, 8, v1, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 8, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v1, 9, v1, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 9, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v1, 10, v1, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 
vcc, 10, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v1, 11, v1, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 11, v0 +; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v1, 12, v1, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 12, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v1, 13, v1, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 13, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v1, 14, v1, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 14, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v1, 15, v1, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 15, v0 +; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s2, -1 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v0, 16, v1, vcc +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-MOVREL-NEXT: s_endpgm +; +; VI-LABEL: extract_neg_offset_vgpr: +; VI: ; %bb.0: ; %entry +; VI-NEXT: v_add_u32_e32 v0, vcc, 0xfffffe00, v0 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 2, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, 2, v1, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 3, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, 3, v1, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 4, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 5, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, 6, v1, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 6, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, 7, v1, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 7, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, 8, v1, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 8, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, 9, v1, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 9, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, 10, v1, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 10, v0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: v_cndmask_b32_e32 v1, 11, v1, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 11, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, 12, v1, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 12, v0 +; VI-NEXT: v_cndmask_b32_e32 
v1, 13, v1, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 13, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, 14, v1, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 14, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, 15, v1, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 15, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, 16, v1, vcc +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-IDXMODE-LABEL: extract_neg_offset_vgpr: +; GFX9-IDXMODE: ; %bb.0: ; %entry +; GFX9-IDXMODE-NEXT: v_add_u32_e32 v0, 0xfffffe00, v0 +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 2, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, 2, v2, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 3, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, 3, v2, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 4, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 5, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, 6, v2, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 6, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, 7, v2, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 7, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, 8, v2, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 8, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, 9, v2, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 9, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, 10, v2, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 10, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, 11, v2, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 11, v0 +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, 12, v2, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 12, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, 13, v2, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 13, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, 
14, v2, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 14, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, 15, v2, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 15, v0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v0, 16, v2, vcc +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-IDXMODE-NEXT: s_endpgm entry: - %id = call i32 @llvm.amdgcn.workitem.id.x() #1 + %id = call i32 @llvm.amdgcn.workitem.id.x() %index = add i32 %id, -512 %value = extractelement <16 x i32> , i32 %index store i32 %value, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}extract_undef_offset_sgpr: ; undefined behavior, but shouldn't crash compiler define amdgpu_kernel void @extract_undef_offset_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; NOOPT-LABEL: extract_undef_offset_sgpr: +; NOOPT: ; %bb.0: ; %entry +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b32 s6, s1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s4, 0xf000 +; NOOPT-NEXT: s_mov_b32 s5, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s6 +; NOOPT-NEXT: s_mov_b32 s2, s5 +; NOOPT-NEXT: s_mov_b32 s3, s4 +; NOOPT-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: extract_undef_offset_sgpr: +; SI-MOVREL: ; %bb.0: ; %entry +; SI-MOVREL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-MOVREL-NEXT: s_mov_b32 s7, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s6, -1 +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: s_mov_b32 s4, s2 +; SI-MOVREL-NEXT: s_mov_b32 s5, s3 +; SI-MOVREL-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 glc +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: s_endpgm +; +; VI-LABEL: extract_undef_offset_sgpr: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], 
s[2:3], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_endpgm +; +; GFX9-IDXMODE-LABEL: extract_undef_offset_sgpr: +; GFX9-IDXMODE: ; %bb.0: ; %entry +; GFX9-IDXMODE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7] glc +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: s_endpgm entry: %ld = load volatile <4 x i32>, ptr addrspace(1) %in %value = extractelement <4 x i32> %ld, i32 undef @@ -154,9 +1511,23 @@ entry: ret void } -; GCN-LABEL: {{^}}insert_undef_offset_sgpr_vector_src: ; undefined behavior, but shouldn't crash compiler define amdgpu_kernel void @insert_undef_offset_sgpr_vector_src(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; NOOPT-LABEL: insert_undef_offset_sgpr_vector_src: +; NOOPT: ; %bb.0: ; %entry +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: insert_undef_offset_sgpr_vector_src: +; SI-MOVREL: ; %bb.0: ; %entry +; SI-MOVREL-NEXT: s_endpgm +; +; VI-LABEL: insert_undef_offset_sgpr_vector_src: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_endpgm +; +; GFX9-IDXMODE-LABEL: insert_undef_offset_sgpr_vector_src: +; GFX9-IDXMODE: ; %bb.0: ; %entry +; GFX9-IDXMODE-NEXT: s_endpgm entry: %ld = load <4 x i32>, ptr addrspace(1) %in %value = insertelement <4 x i32> %ld, i32 5, i32 undef @@ -164,20 +1535,276 @@ entry: ret void } -; GCN-LABEL: {{^}}insert_w_offset: -; GCN-DAG: s_load_dword [[IN0:s[0-9]+]] -; MOVREL-DAG: s_add_i32 [[IN:s[0-9]+]], [[IN0]], 1 -; MOVREL-DAG: s_mov_b32 m0, [[IN]] -; GCN-DAG: v_mov_b32_e32 v[[ELT0:[0-9]+]], 1.0 -; GCN-DAG: v_mov_b32_e32 v[[ELT1:[0-9]+]], 2.0 -; GCN-DAG: v_mov_b32_e32 v[[ELT2:[0-9]+]], 0x40400000 -; GCN-DAG: v_mov_b32_e32 v[[ELT3:[0-9]+]], 4.0 -; GCN-DAG: v_mov_b32_e32 v[[ELT15:[0-9]+]], 0x41800000 -; GCN-DAG: v_mov_b32_e32 
v[[INS:[0-9]+]], 0x41880000 - -; MOVREL: v_movreld_b32_e32 v[[ELT0]], v[[INS]] -; MOVREL: buffer_store_dwordx4 v[[[ELT0]]:[[ELT3]]] define amdgpu_kernel void @insert_w_offset(ptr addrspace(1) %out, i32 %in) { +; NOOPT-LABEL: insert_w_offset: +; NOOPT: ; %bb.0: ; %entry +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xb +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b32 s7, s1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s5, 0xf000 +; NOOPT-NEXT: s_mov_b32 s6, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s7 +; NOOPT-NEXT: s_mov_b32 s2, s6 +; NOOPT-NEXT: s_mov_b32 s3, s5 +; NOOPT-NEXT: s_mov_b32 s5, 1 +; NOOPT-NEXT: s_add_i32 s4, s4, s5 +; NOOPT-NEXT: s_mov_b32 s5, 0x41800000 +; NOOPT-NEXT: s_mov_b32 s6, 0x41700000 +; NOOPT-NEXT: s_mov_b32 s7, 0x41600000 +; NOOPT-NEXT: s_mov_b32 s8, 0x41500000 +; NOOPT-NEXT: s_mov_b32 s9, 0x41400000 +; NOOPT-NEXT: s_mov_b32 s10, 0x41300000 +; NOOPT-NEXT: s_mov_b32 s11, 0x41200000 +; NOOPT-NEXT: s_mov_b32 s12, 0x41100000 +; NOOPT-NEXT: s_mov_b32 s13, 0x41000000 +; NOOPT-NEXT: s_mov_b32 s14, 0x40e00000 +; NOOPT-NEXT: s_mov_b32 s15, 0x40c00000 +; NOOPT-NEXT: s_mov_b32 s16, 0x40a00000 +; NOOPT-NEXT: s_mov_b32 s17, 4.0 +; NOOPT-NEXT: s_mov_b32 s18, 0x40400000 +; NOOPT-NEXT: s_mov_b32 s19, 2.0 +; NOOPT-NEXT: s_mov_b32 s20, 1.0 +; NOOPT-NEXT: v_mov_b32_e32 v7, s20 +; NOOPT-NEXT: v_mov_b32_e32 v30, s19 +; NOOPT-NEXT: v_mov_b32_e32 v29, s18 +; NOOPT-NEXT: v_mov_b32_e32 v28, s17 +; NOOPT-NEXT: v_mov_b32_e32 v27, s16 +; NOOPT-NEXT: v_mov_b32_e32 v26, s15 +; NOOPT-NEXT: v_mov_b32_e32 v25, s14 +; NOOPT-NEXT: v_mov_b32_e32 v24, s13 +; NOOPT-NEXT: v_mov_b32_e32 v23, s12 +; NOOPT-NEXT: v_mov_b32_e32 v6, s11 +; NOOPT-NEXT: v_mov_b32_e32 v5, s10 +; NOOPT-NEXT: v_mov_b32_e32 v4, s9 +; NOOPT-NEXT: v_mov_b32_e32 v3, s8 +; NOOPT-NEXT: v_mov_b32_e32 v2, s7 +; NOOPT-NEXT: v_mov_b32_e32 v1, s6 +; 
NOOPT-NEXT: v_mov_b32_e32 v0, s5 +; NOOPT-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v8, v30 +; NOOPT-NEXT: v_mov_b32_e32 v9, v29 +; NOOPT-NEXT: v_mov_b32_e32 v10, v28 +; NOOPT-NEXT: v_mov_b32_e32 v11, v27 +; NOOPT-NEXT: v_mov_b32_e32 v12, v26 +; NOOPT-NEXT: v_mov_b32_e32 v13, v25 +; NOOPT-NEXT: v_mov_b32_e32 v14, v24 +; NOOPT-NEXT: v_mov_b32_e32 v15, v23 +; NOOPT-NEXT: v_mov_b32_e32 v16, v6 +; NOOPT-NEXT: v_mov_b32_e32 v17, v5 +; NOOPT-NEXT: v_mov_b32_e32 v18, v4 +; NOOPT-NEXT: v_mov_b32_e32 v19, v3 +; NOOPT-NEXT: v_mov_b32_e32 v20, v2 +; NOOPT-NEXT: v_mov_b32_e32 v21, v1 +; NOOPT-NEXT: v_mov_b32_e32 v22, v0 +; NOOPT-NEXT: v_mov_b32_e32 v0, 0x41880000 +; NOOPT-NEXT: s_mov_b32 m0, s4 +; NOOPT-NEXT: v_movreld_b32_e32 v7, v0 +; NOOPT-NEXT: v_mov_b32_e32 v4, v22 +; NOOPT-NEXT: v_mov_b32_e32 v5, v21 +; NOOPT-NEXT: v_mov_b32_e32 v6, v20 +; NOOPT-NEXT: v_mov_b32_e32 v0, v19 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; NOOPT-NEXT: v_mov_b32_e32 v4, v18 +; NOOPT-NEXT: v_mov_b32_e32 v5, v17 +; NOOPT-NEXT: v_mov_b32_e32 v6, v16 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, v15 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; NOOPT-NEXT: v_mov_b32_e32 v4, v14 +; NOOPT-NEXT: v_mov_b32_e32 v5, v13 +; NOOPT-NEXT: v_mov_b32_e32 v6, v12 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, v11 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def 
$vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; NOOPT-NEXT: v_mov_b32_e32 v4, v10 +; NOOPT-NEXT: v_mov_b32_e32 v5, v9 +; NOOPT-NEXT: v_mov_b32_e32 v6, v8 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, v7 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: insert_w_offset: +; SI-MOVREL: ; %bb.0: ; %entry +; SI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0 +; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 +; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: s_add_i32 s4, s4, 1 +; SI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 +; SI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v7, 0x41000000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v8, 0x41100000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v9, 0x41200000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v10, 0x41300000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v11, 0x41400000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v12, 0x41500000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v13, 0x41600000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x41880000 +; SI-MOVREL-NEXT: s_mov_b32 m0, s4 +; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s2, -1 +; SI-MOVREL-NEXT: v_movreld_b32_e32 v0, v16 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; SI-MOVREL-NEXT: 
buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-MOVREL-NEXT: s_endpgm +; +; VI-MOVREL-LABEL: insert_w_offset: +; VI-MOVREL: ; %bb.0: ; %entry +; VI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 +; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; VI-MOVREL-NEXT: s_add_i32 s4, s4, 1 +; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48 +; VI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v7, 0x41000000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v8, 0x41100000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v9, 0x41200000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v10, 0x41300000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v11, 0x41400000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v12, 0x41500000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, 0x41600000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x41880000 +; VI-MOVREL-NEXT: s_mov_b32 m0, s4 +; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 +; VI-MOVREL-NEXT: v_movreld_b32_e32 v0, v16 +; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s2 +; VI-MOVREL-NEXT: s_add_u32 s2, s0, 32 +; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; VI-MOVREL-NEXT: s_nop 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s2 +; VI-MOVREL-NEXT: s_add_u32 s2, s0, 16 +; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; VI-MOVREL-NEXT: s_nop 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s3 +; VI-MOVREL-NEXT: 
v_mov_b32_e32 v8, s2 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; VI-MOVREL-NEXT: s_nop 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-MOVREL-NEXT: s_endpgm +; +; VI-IDXMODE-LABEL: insert_w_offset: +; VI-IDXMODE: ; %bb.0: ; %entry +; VI-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000 +; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; VI-IDXMODE-NEXT: s_add_i32 s4, s4, 1 +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 48 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, 0x41000000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, 0x41100000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, 0x41200000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, 0x41300000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, 0x41400000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, 0x41500000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, 0x41600000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, 0x41880000 +; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 +; VI-IDXMODE-NEXT: s_set_gpr_idx_on s4, gpr_idx(DST) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, v16 +; VI-IDXMODE-NEXT: s_set_gpr_idx_off +; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, s2 +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 32 +; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; VI-IDXMODE-NEXT: s_nop 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s2 +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 16 +; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 +; VI-IDXMODE-NEXT: 
flat_store_dwordx4 v[12:13], v[8:11] +; VI-IDXMODE-NEXT: s_nop 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, s2 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; VI-IDXMODE-NEXT: s_nop 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-IDXMODE-NEXT: s_endpgm +; +; GFX9-IDXMODE-LABEL: insert_w_offset: +; GFX9-IDXMODE: ; %bb.0: ; %entry +; GFX9-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: s_add_i32 s4, s4, 1 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, 0x41000000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, 0x41100000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, 0x41200000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, 0x41300000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, 0x41400000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, 0x41500000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, 0x41600000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v17, 0x41880000 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s4, gpr_idx(DST) +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, v17 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; 
GFX9-IDXMODE-NEXT: s_endpgm entry: %add = add i32 %in, 1 %ins = insertelement <16 x float> , float 17.0, i32 %add @@ -185,19 +1812,276 @@ entry: ret void } -; GCN-LABEL: {{^}}insert_unsigned_base_plus_offset: -; GCN-DAG: s_load_dword [[IN:s[0-9]+]] -; GCN-DAG: v_mov_b32_e32 [[ELT0:v[0-9]+]], 1.0 -; GCN-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 2.0 -; GCN-DAG: s_and_b32 [[BASE:s[0-9]+]], [[IN]], 0xffff - -; MOVREL: s_mov_b32 m0, [[BASE]] -; MOVREL: v_movreld_b32_e32 [[ELT1]], v{{[0-9]+}} - -; IDXMODE: s_set_gpr_idx_on [[BASE]], gpr_idx(DST) -; IDXMODE-NEXT: v_mov_b32_e32 [[ELT1]], v{{[0-9]+}} -; IDXMODE-NEXT: s_set_gpr_idx_off define amdgpu_kernel void @insert_unsigned_base_plus_offset(ptr addrspace(1) %out, i16 %in) { +; NOOPT-LABEL: insert_unsigned_base_plus_offset: +; NOOPT: ; %bb.0: ; %entry +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xb +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b32 s7, s1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s5, 0xf000 +; NOOPT-NEXT: s_mov_b32 s6, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s7 +; NOOPT-NEXT: s_mov_b32 s2, s6 +; NOOPT-NEXT: s_mov_b32 s3, s5 +; NOOPT-NEXT: s_mov_b32 s5, 0xffff +; NOOPT-NEXT: s_and_b32 s4, s4, s5 +; NOOPT-NEXT: s_mov_b32 s5, 0x41800000 +; NOOPT-NEXT: s_mov_b32 s6, 0x41700000 +; NOOPT-NEXT: s_mov_b32 s7, 0x41600000 +; NOOPT-NEXT: s_mov_b32 s8, 0x41500000 +; NOOPT-NEXT: s_mov_b32 s9, 0x41400000 +; NOOPT-NEXT: s_mov_b32 s10, 0x41300000 +; NOOPT-NEXT: s_mov_b32 s11, 0x41200000 +; NOOPT-NEXT: s_mov_b32 s12, 0x41100000 +; NOOPT-NEXT: s_mov_b32 s13, 0x41000000 +; NOOPT-NEXT: s_mov_b32 s14, 0x40e00000 +; NOOPT-NEXT: s_mov_b32 s15, 0x40c00000 +; NOOPT-NEXT: s_mov_b32 s16, 0x40a00000 +; NOOPT-NEXT: s_mov_b32 s17, 4.0 +; NOOPT-NEXT: s_mov_b32 s18, 0x40400000 +; NOOPT-NEXT: s_mov_b32 s19, 2.0 +; NOOPT-NEXT: s_mov_b32 s20, 1.0 +; NOOPT-NEXT: 
v_mov_b32_e32 v7, s20 +; NOOPT-NEXT: v_mov_b32_e32 v30, s19 +; NOOPT-NEXT: v_mov_b32_e32 v29, s18 +; NOOPT-NEXT: v_mov_b32_e32 v28, s17 +; NOOPT-NEXT: v_mov_b32_e32 v27, s16 +; NOOPT-NEXT: v_mov_b32_e32 v26, s15 +; NOOPT-NEXT: v_mov_b32_e32 v25, s14 +; NOOPT-NEXT: v_mov_b32_e32 v24, s13 +; NOOPT-NEXT: v_mov_b32_e32 v23, s12 +; NOOPT-NEXT: v_mov_b32_e32 v6, s11 +; NOOPT-NEXT: v_mov_b32_e32 v5, s10 +; NOOPT-NEXT: v_mov_b32_e32 v4, s9 +; NOOPT-NEXT: v_mov_b32_e32 v3, s8 +; NOOPT-NEXT: v_mov_b32_e32 v2, s7 +; NOOPT-NEXT: v_mov_b32_e32 v1, s6 +; NOOPT-NEXT: v_mov_b32_e32 v0, s5 +; NOOPT-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v8, v30 +; NOOPT-NEXT: v_mov_b32_e32 v9, v29 +; NOOPT-NEXT: v_mov_b32_e32 v10, v28 +; NOOPT-NEXT: v_mov_b32_e32 v11, v27 +; NOOPT-NEXT: v_mov_b32_e32 v12, v26 +; NOOPT-NEXT: v_mov_b32_e32 v13, v25 +; NOOPT-NEXT: v_mov_b32_e32 v14, v24 +; NOOPT-NEXT: v_mov_b32_e32 v15, v23 +; NOOPT-NEXT: v_mov_b32_e32 v16, v6 +; NOOPT-NEXT: v_mov_b32_e32 v17, v5 +; NOOPT-NEXT: v_mov_b32_e32 v18, v4 +; NOOPT-NEXT: v_mov_b32_e32 v19, v3 +; NOOPT-NEXT: v_mov_b32_e32 v20, v2 +; NOOPT-NEXT: v_mov_b32_e32 v21, v1 +; NOOPT-NEXT: v_mov_b32_e32 v22, v0 +; NOOPT-NEXT: v_mov_b32_e32 v0, 0x41880000 +; NOOPT-NEXT: s_mov_b32 m0, s4 +; NOOPT-NEXT: v_movreld_b32_e32 v8, v0 +; NOOPT-NEXT: v_mov_b32_e32 v4, v22 +; NOOPT-NEXT: v_mov_b32_e32 v5, v21 +; NOOPT-NEXT: v_mov_b32_e32 v6, v20 +; NOOPT-NEXT: v_mov_b32_e32 v0, v19 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; NOOPT-NEXT: v_mov_b32_e32 v4, v18 +; NOOPT-NEXT: v_mov_b32_e32 v5, v17 +; NOOPT-NEXT: v_mov_b32_e32 v6, v16 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; 
NOOPT-NEXT: v_mov_b32_e32 v0, v15 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; NOOPT-NEXT: v_mov_b32_e32 v4, v14 +; NOOPT-NEXT: v_mov_b32_e32 v5, v13 +; NOOPT-NEXT: v_mov_b32_e32 v6, v12 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, v11 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; NOOPT-NEXT: v_mov_b32_e32 v4, v10 +; NOOPT-NEXT: v_mov_b32_e32 v5, v9 +; NOOPT-NEXT: v_mov_b32_e32 v6, v8 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, v7 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: insert_unsigned_base_plus_offset: +; SI-MOVREL: ; %bb.0: ; %entry +; SI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0 +; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 +; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: s_and_b32 s4, s4, 0xffff +; SI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 +; SI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v7, 0x41000000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v8, 0x41100000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v9, 0x41200000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v10, 0x41300000 +; 
SI-MOVREL-NEXT: v_mov_b32_e32 v11, 0x41400000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v12, 0x41500000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v13, 0x41600000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x41880000 +; SI-MOVREL-NEXT: s_mov_b32 m0, s4 +; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s2, -1 +; SI-MOVREL-NEXT: v_movreld_b32_e32 v1, v16 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-MOVREL-NEXT: s_endpgm +; +; VI-MOVREL-LABEL: insert_unsigned_base_plus_offset: +; VI-MOVREL: ; %bb.0: ; %entry +; VI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 +; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; VI-MOVREL-NEXT: s_and_b32 s2, s4, 0xffff +; VI-MOVREL-NEXT: s_mov_b32 m0, s2 +; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48 +; VI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v7, 0x41000000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v8, 0x41100000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v9, 0x41200000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v10, 0x41300000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v11, 0x41400000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v12, 0x41500000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, 0x41600000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x41880000 +; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 +; VI-MOVREL-NEXT: v_movreld_b32_e32 v1, 
v16 +; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s2 +; VI-MOVREL-NEXT: s_add_u32 s2, s0, 32 +; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; VI-MOVREL-NEXT: s_nop 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s2 +; VI-MOVREL-NEXT: s_add_u32 s2, s0, 16 +; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; VI-MOVREL-NEXT: s_nop 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v8, s2 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; VI-MOVREL-NEXT: s_nop 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-MOVREL-NEXT: s_endpgm +; +; VI-IDXMODE-LABEL: insert_unsigned_base_plus_offset: +; VI-IDXMODE: ; %bb.0: ; %entry +; VI-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000 +; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; VI-IDXMODE-NEXT: s_and_b32 s2, s4, 0xffff +; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, 0x41000000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, 0x41100000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, 0x41200000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, 0x41300000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, 0x41400000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, 0x41500000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, 0x41600000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, 0x41880000 +; VI-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) +; VI-IDXMODE-NEXT: 
v_mov_b32_e32 v1, v16 +; VI-IDXMODE-NEXT: s_set_gpr_idx_off +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 48 +; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, s2 +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 32 +; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; VI-IDXMODE-NEXT: s_nop 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s2 +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 16 +; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; VI-IDXMODE-NEXT: s_nop 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, s2 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; VI-IDXMODE-NEXT: s_nop 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-IDXMODE-NEXT: s_endpgm +; +; GFX9-IDXMODE-LABEL: insert_unsigned_base_plus_offset: +; GFX9-IDXMODE: ; %bb.0: ; %entry +; GFX9-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: s_and_b32 s2, s4, 0xffff +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, 0x41000000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, 0x41100000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, 0x41200000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, 0x41300000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, 0x41400000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, 0x41500000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, 
0x41600000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v17, 0x41880000 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, v17 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX9-IDXMODE-NEXT: s_endpgm entry: %base = zext i16 %in to i32 %add = add i32 %base, 1 @@ -206,21 +2090,281 @@ entry: ret void } -; GCN-LABEL: {{^}}insert_signed_base_plus_offset: -; GCN-DAG: s_load_dword [[IN:s[0-9]+]] -; GCN-DAG: v_mov_b32_e32 [[ELT0:v[0-9]+]], 1.0 -; GCN-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 2.0 - -; GCN-DAG: s_sext_i32_i16 [[BASE:s[0-9]+]], [[IN]] -; GCN-DAG: s_add_i32 [[BASE_PLUS_OFFSET:s[0-9]+]], [[BASE]], 1 - -; MOVREL: s_mov_b32 m0, [[BASE_PLUS_OFFSET]] -; MOVREL: v_movreld_b32_e32 [[ELT0]], v{{[0-9]+}} - -; IDXMODE: s_set_gpr_idx_on [[BASE_PLUS_OFFSET]], gpr_idx(DST) -; IDXMODE-NEXT: v_mov_b32_e32 [[ELT0]], v{{[0-9]+}} -; IDXMODE-NEXT: s_set_gpr_idx_off define amdgpu_kernel void @insert_signed_base_plus_offset(ptr addrspace(1) %out, i16 %in) { +; NOOPT-LABEL: insert_signed_base_plus_offset: +; NOOPT: ; %bb.0: ; %entry +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xb +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b32 s7, s1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s5, 0xf000 +; NOOPT-NEXT: s_mov_b32 s6, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s7 +; NOOPT-NEXT: s_mov_b32 s2, s6 +; NOOPT-NEXT: s_mov_b32 s3, s5 +; NOOPT-NEXT: s_sext_i32_i16 s4, s4 +; NOOPT-NEXT: s_mov_b32 s5, 1 +; 
NOOPT-NEXT: s_add_i32 s4, s4, s5 +; NOOPT-NEXT: s_mov_b32 s5, 0x41800000 +; NOOPT-NEXT: s_mov_b32 s6, 0x41700000 +; NOOPT-NEXT: s_mov_b32 s7, 0x41600000 +; NOOPT-NEXT: s_mov_b32 s8, 0x41500000 +; NOOPT-NEXT: s_mov_b32 s9, 0x41400000 +; NOOPT-NEXT: s_mov_b32 s10, 0x41300000 +; NOOPT-NEXT: s_mov_b32 s11, 0x41200000 +; NOOPT-NEXT: s_mov_b32 s12, 0x41100000 +; NOOPT-NEXT: s_mov_b32 s13, 0x41000000 +; NOOPT-NEXT: s_mov_b32 s14, 0x40e00000 +; NOOPT-NEXT: s_mov_b32 s15, 0x40c00000 +; NOOPT-NEXT: s_mov_b32 s16, 0x40a00000 +; NOOPT-NEXT: s_mov_b32 s17, 4.0 +; NOOPT-NEXT: s_mov_b32 s18, 0x40400000 +; NOOPT-NEXT: s_mov_b32 s19, 2.0 +; NOOPT-NEXT: s_mov_b32 s20, 1.0 +; NOOPT-NEXT: v_mov_b32_e32 v7, s20 +; NOOPT-NEXT: v_mov_b32_e32 v30, s19 +; NOOPT-NEXT: v_mov_b32_e32 v29, s18 +; NOOPT-NEXT: v_mov_b32_e32 v28, s17 +; NOOPT-NEXT: v_mov_b32_e32 v27, s16 +; NOOPT-NEXT: v_mov_b32_e32 v26, s15 +; NOOPT-NEXT: v_mov_b32_e32 v25, s14 +; NOOPT-NEXT: v_mov_b32_e32 v24, s13 +; NOOPT-NEXT: v_mov_b32_e32 v23, s12 +; NOOPT-NEXT: v_mov_b32_e32 v6, s11 +; NOOPT-NEXT: v_mov_b32_e32 v5, s10 +; NOOPT-NEXT: v_mov_b32_e32 v4, s9 +; NOOPT-NEXT: v_mov_b32_e32 v3, s8 +; NOOPT-NEXT: v_mov_b32_e32 v2, s7 +; NOOPT-NEXT: v_mov_b32_e32 v1, s6 +; NOOPT-NEXT: v_mov_b32_e32 v0, s5 +; NOOPT-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v8, v30 +; NOOPT-NEXT: v_mov_b32_e32 v9, v29 +; NOOPT-NEXT: v_mov_b32_e32 v10, v28 +; NOOPT-NEXT: v_mov_b32_e32 v11, v27 +; NOOPT-NEXT: v_mov_b32_e32 v12, v26 +; NOOPT-NEXT: v_mov_b32_e32 v13, v25 +; NOOPT-NEXT: v_mov_b32_e32 v14, v24 +; NOOPT-NEXT: v_mov_b32_e32 v15, v23 +; NOOPT-NEXT: v_mov_b32_e32 v16, v6 +; NOOPT-NEXT: v_mov_b32_e32 v17, v5 +; NOOPT-NEXT: v_mov_b32_e32 v18, v4 +; NOOPT-NEXT: v_mov_b32_e32 v19, v3 +; NOOPT-NEXT: v_mov_b32_e32 v20, v2 +; NOOPT-NEXT: v_mov_b32_e32 v21, v1 +; NOOPT-NEXT: v_mov_b32_e32 v22, v0 +; 
NOOPT-NEXT: v_mov_b32_e32 v0, 0x41880000 +; NOOPT-NEXT: s_mov_b32 m0, s4 +; NOOPT-NEXT: v_movreld_b32_e32 v7, v0 +; NOOPT-NEXT: v_mov_b32_e32 v4, v22 +; NOOPT-NEXT: v_mov_b32_e32 v5, v21 +; NOOPT-NEXT: v_mov_b32_e32 v6, v20 +; NOOPT-NEXT: v_mov_b32_e32 v0, v19 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; NOOPT-NEXT: v_mov_b32_e32 v4, v18 +; NOOPT-NEXT: v_mov_b32_e32 v5, v17 +; NOOPT-NEXT: v_mov_b32_e32 v6, v16 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, v15 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; NOOPT-NEXT: v_mov_b32_e32 v4, v14 +; NOOPT-NEXT: v_mov_b32_e32 v5, v13 +; NOOPT-NEXT: v_mov_b32_e32 v6, v12 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, v11 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; NOOPT-NEXT: v_mov_b32_e32 v4, v10 +; NOOPT-NEXT: v_mov_b32_e32 v5, v9 +; NOOPT-NEXT: v_mov_b32_e32 v6, v8 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, v7 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: insert_signed_base_plus_offset: +; SI-MOVREL: ; %bb.0: ; %entry +; SI-MOVREL-NEXT: 
s_load_dword s4, s[2:3], 0xb +; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0 +; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 +; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: s_sext_i32_i16 s4, s4 +; SI-MOVREL-NEXT: s_add_i32 s4, s4, 1 +; SI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 +; SI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v7, 0x41000000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v8, 0x41100000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v9, 0x41200000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v10, 0x41300000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v11, 0x41400000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v12, 0x41500000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v13, 0x41600000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x41880000 +; SI-MOVREL-NEXT: s_mov_b32 m0, s4 +; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s2, -1 +; SI-MOVREL-NEXT: v_movreld_b32_e32 v0, v16 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-MOVREL-NEXT: s_endpgm +; +; VI-MOVREL-LABEL: insert_signed_base_plus_offset: +; VI-MOVREL: ; %bb.0: ; %entry +; VI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 +; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; VI-MOVREL-NEXT: s_sext_i32_i16 s2, s4 +; VI-MOVREL-NEXT: s_add_i32 s2, s2, 1 +; VI-MOVREL-NEXT: s_mov_b32 m0, s2 +; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48 +; 
VI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v7, 0x41000000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v8, 0x41100000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v9, 0x41200000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v10, 0x41300000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v11, 0x41400000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v12, 0x41500000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, 0x41600000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x41880000 +; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 +; VI-MOVREL-NEXT: v_movreld_b32_e32 v0, v16 +; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s2 +; VI-MOVREL-NEXT: s_add_u32 s2, s0, 32 +; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; VI-MOVREL-NEXT: s_nop 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s2 +; VI-MOVREL-NEXT: s_add_u32 s2, s0, 16 +; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; VI-MOVREL-NEXT: s_nop 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v8, s2 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; VI-MOVREL-NEXT: s_nop 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-MOVREL-NEXT: s_endpgm +; +; VI-IDXMODE-LABEL: insert_signed_base_plus_offset: +; VI-IDXMODE: ; %bb.0: ; %entry +; VI-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000 +; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; VI-IDXMODE-NEXT: s_sext_i32_i16 s2, s4 +; VI-IDXMODE-NEXT: 
s_add_i32 s2, s2, 1 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, 0x41000000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, 0x41100000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, 0x41200000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, 0x41300000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, 0x41400000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, 0x41500000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, 0x41600000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, 0x41880000 +; VI-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, v16 +; VI-IDXMODE-NEXT: s_set_gpr_idx_off +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 48 +; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, s2 +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 32 +; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; VI-IDXMODE-NEXT: s_nop 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s2 +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 16 +; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; VI-IDXMODE-NEXT: s_nop 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, s2 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; VI-IDXMODE-NEXT: s_nop 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-IDXMODE-NEXT: s_endpgm +; +; GFX9-IDXMODE-LABEL: insert_signed_base_plus_offset: +; GFX9-IDXMODE: ; %bb.0: ; %entry +; GFX9-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0 
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: s_sext_i32_i16 s2, s4 +; GFX9-IDXMODE-NEXT: s_add_i32 s2, s2, 1 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, 0x41000000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, 0x41100000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, 0x41200000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, 0x41300000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, 0x41400000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, 0x41500000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, 0x41600000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v17, 0x41880000 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, v17 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX9-IDXMODE-NEXT: s_endpgm entry: %base = sext i16 %in to i32 %add = add i32 %base, 1 @@ -229,35 +2373,553 @@ entry: ret void } - -; GCN-LABEL: {{^}}insert_wo_offset: -; GCN: s_load_dword [[IN:s[0-9]+]] - -; MOVREL: s_mov_b32 m0, [[IN]] -; MOVREL: v_movreld_b32_e32 v[[ELT0:[0-9]+]] - -; IDXMODE: s_set_gpr_idx_on [[IN]], gpr_idx(DST) -; IDXMODE-NEXT: v_mov_b32_e32 v[[ELT0:[0-9]+]], v{{[0-9]+}} -; IDXMODE-NEXT: s_set_gpr_idx_off - -; GCN: buffer_store_dwordx4 v[[[ELT0]]: +; Make sure that TwoAddressInstructions keeps src0 as subregister sub0 +; of the tied implicit use and def of the 
super register. define amdgpu_kernel void @insert_wo_offset(ptr addrspace(1) %out, i32 %in) { +; NOOPT-LABEL: insert_wo_offset: +; NOOPT: ; %bb.0: ; %entry +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xb +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b32 s7, s1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s5, 0xf000 +; NOOPT-NEXT: s_mov_b32 s6, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s7 +; NOOPT-NEXT: s_mov_b32 s2, s6 +; NOOPT-NEXT: s_mov_b32 s3, s5 +; NOOPT-NEXT: s_mov_b32 s5, 0x41800000 +; NOOPT-NEXT: s_mov_b32 s6, 0x41700000 +; NOOPT-NEXT: s_mov_b32 s7, 0x41600000 +; NOOPT-NEXT: s_mov_b32 s8, 0x41500000 +; NOOPT-NEXT: s_mov_b32 s9, 0x41400000 +; NOOPT-NEXT: s_mov_b32 s10, 0x41300000 +; NOOPT-NEXT: s_mov_b32 s11, 0x41200000 +; NOOPT-NEXT: s_mov_b32 s12, 0x41100000 +; NOOPT-NEXT: s_mov_b32 s13, 0x41000000 +; NOOPT-NEXT: s_mov_b32 s14, 0x40e00000 +; NOOPT-NEXT: s_mov_b32 s15, 0x40c00000 +; NOOPT-NEXT: s_mov_b32 s16, 0x40a00000 +; NOOPT-NEXT: s_mov_b32 s17, 4.0 +; NOOPT-NEXT: s_mov_b32 s18, 0x40400000 +; NOOPT-NEXT: s_mov_b32 s19, 2.0 +; NOOPT-NEXT: s_mov_b32 s20, 1.0 +; NOOPT-NEXT: v_mov_b32_e32 v7, s20 +; NOOPT-NEXT: v_mov_b32_e32 v30, s19 +; NOOPT-NEXT: v_mov_b32_e32 v29, s18 +; NOOPT-NEXT: v_mov_b32_e32 v28, s17 +; NOOPT-NEXT: v_mov_b32_e32 v27, s16 +; NOOPT-NEXT: v_mov_b32_e32 v26, s15 +; NOOPT-NEXT: v_mov_b32_e32 v25, s14 +; NOOPT-NEXT: v_mov_b32_e32 v24, s13 +; NOOPT-NEXT: v_mov_b32_e32 v23, s12 +; NOOPT-NEXT: v_mov_b32_e32 v6, s11 +; NOOPT-NEXT: v_mov_b32_e32 v5, s10 +; NOOPT-NEXT: v_mov_b32_e32 v4, s9 +; NOOPT-NEXT: v_mov_b32_e32 v3, s8 +; NOOPT-NEXT: v_mov_b32_e32 v2, s7 +; NOOPT-NEXT: v_mov_b32_e32 v1, s6 +; NOOPT-NEXT: v_mov_b32_e32 v0, s5 +; NOOPT-NEXT: ; kill: def $vgpr7 killed $vgpr7 def 
$vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v8, v30 +; NOOPT-NEXT: v_mov_b32_e32 v9, v29 +; NOOPT-NEXT: v_mov_b32_e32 v10, v28 +; NOOPT-NEXT: v_mov_b32_e32 v11, v27 +; NOOPT-NEXT: v_mov_b32_e32 v12, v26 +; NOOPT-NEXT: v_mov_b32_e32 v13, v25 +; NOOPT-NEXT: v_mov_b32_e32 v14, v24 +; NOOPT-NEXT: v_mov_b32_e32 v15, v23 +; NOOPT-NEXT: v_mov_b32_e32 v16, v6 +; NOOPT-NEXT: v_mov_b32_e32 v17, v5 +; NOOPT-NEXT: v_mov_b32_e32 v18, v4 +; NOOPT-NEXT: v_mov_b32_e32 v19, v3 +; NOOPT-NEXT: v_mov_b32_e32 v20, v2 +; NOOPT-NEXT: v_mov_b32_e32 v21, v1 +; NOOPT-NEXT: v_mov_b32_e32 v22, v0 +; NOOPT-NEXT: v_mov_b32_e32 v0, 0x41880000 +; NOOPT-NEXT: s_mov_b32 m0, s4 +; NOOPT-NEXT: v_movreld_b32_e32 v7, v0 +; NOOPT-NEXT: v_mov_b32_e32 v4, v22 +; NOOPT-NEXT: v_mov_b32_e32 v5, v21 +; NOOPT-NEXT: v_mov_b32_e32 v6, v20 +; NOOPT-NEXT: v_mov_b32_e32 v0, v19 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; NOOPT-NEXT: v_mov_b32_e32 v4, v18 +; NOOPT-NEXT: v_mov_b32_e32 v5, v17 +; NOOPT-NEXT: v_mov_b32_e32 v6, v16 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, v15 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; NOOPT-NEXT: v_mov_b32_e32 v4, v14 +; NOOPT-NEXT: v_mov_b32_e32 v5, v13 +; NOOPT-NEXT: v_mov_b32_e32 v6, v12 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, v11 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: 
v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; NOOPT-NEXT: v_mov_b32_e32 v4, v10 +; NOOPT-NEXT: v_mov_b32_e32 v5, v9 +; NOOPT-NEXT: v_mov_b32_e32 v6, v8 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, v7 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: insert_wo_offset: +; SI-MOVREL: ; %bb.0: ; %entry +; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0 +; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 +; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 +; SI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v7, 0x41000000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v8, 0x41100000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v9, 0x41200000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v10, 0x41300000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v11, 0x41400000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v12, 0x41500000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v13, 0x41600000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x41880000 +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: s_mov_b32 m0, s4 +; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s2, -1 +; SI-MOVREL-NEXT: v_movreld_b32_e32 v0, v16 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; 
SI-MOVREL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-MOVREL-NEXT: s_endpgm +; +; VI-MOVREL-LABEL: insert_wo_offset: +; VI-MOVREL: ; %bb.0: ; %entry +; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-MOVREL-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 +; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; VI-MOVREL-NEXT: s_mov_b32 m0, s2 +; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v7, 0x41000000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v8, 0x41100000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v9, 0x41200000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v10, 0x41300000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v11, 0x41400000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v12, 0x41500000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, 0x41600000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x41880000 +; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 +; VI-MOVREL-NEXT: v_movreld_b32_e32 v0, v16 +; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s2 +; VI-MOVREL-NEXT: s_add_u32 s2, s0, 32 +; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; VI-MOVREL-NEXT: s_nop 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s2 +; VI-MOVREL-NEXT: s_add_u32 s2, s0, 16 +; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; VI-MOVREL-NEXT: s_nop 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v8, s2 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; VI-MOVREL-NEXT: s_nop 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0 
+; VI-MOVREL-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-MOVREL-NEXT: s_endpgm +; +; VI-IDXMODE-LABEL: insert_wo_offset: +; VI-IDXMODE: ; %bb.0: ; %entry +; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-IDXMODE-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, 0x41000000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, 0x41100000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, 0x41200000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, 0x41300000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, 0x41400000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, 0x41500000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, 0x41600000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, 0x41880000 +; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; VI-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, v16 +; VI-IDXMODE-NEXT: s_set_gpr_idx_off +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 48 +; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, s2 +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 32 +; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; VI-IDXMODE-NEXT: s_nop 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s2 +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 16 +; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; VI-IDXMODE-NEXT: s_nop 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, s2 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; VI-IDXMODE-NEXT: 
s_nop 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-IDXMODE-NEXT: s_endpgm +; +; GFX9-IDXMODE-LABEL: insert_wo_offset: +; GFX9-IDXMODE: ; %bb.0: ; %entry +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, 0x41000000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, 0x41100000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, 0x41200000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, 0x41300000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, 0x41400000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, 0x41500000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, 0x41600000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v17, 0x41880000 +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s4, gpr_idx(DST) +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, v17 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX9-IDXMODE-NEXT: s_endpgm entry: %ins = insertelement <16 x float> , float 17.0, i32 %in store <16 x float> %ins, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}insert_neg_offset_sgpr: -; The offset depends on the register that holds the first element of the 
vector. -; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}} -; MOVREL: v_movreld_b32_e32 v0, 16 - -; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}} -; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], gpr_idx(DST) -; IDXMODE-NEXT: v_mov_b32_e32 v0, 16 -; IDXMODE-NEXT: s_set_gpr_idx_off define amdgpu_kernel void @insert_neg_offset_sgpr(ptr addrspace(1) %in, ptr addrspace(1) %out, i32 %offset) { +; NOOPT-LABEL: insert_neg_offset_sgpr: +; NOOPT: ; %bb.0: ; %entry +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xd +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b32 s7, s1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s5, 0xf000 +; NOOPT-NEXT: s_mov_b32 s6, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s7 +; NOOPT-NEXT: s_mov_b32 s2, s6 +; NOOPT-NEXT: s_mov_b32 s3, s5 +; NOOPT-NEXT: s_mov_b32 s5, 15 +; NOOPT-NEXT: s_mov_b32 s6, 14 +; NOOPT-NEXT: s_mov_b32 s7, 13 +; NOOPT-NEXT: s_mov_b32 s8, 12 +; NOOPT-NEXT: s_mov_b32 s9, 11 +; NOOPT-NEXT: s_mov_b32 s10, 10 +; NOOPT-NEXT: s_mov_b32 s11, 9 +; NOOPT-NEXT: s_mov_b32 s12, 8 +; NOOPT-NEXT: s_mov_b32 s13, 7 +; NOOPT-NEXT: s_mov_b32 s14, 6 +; NOOPT-NEXT: s_mov_b32 s15, 5 +; NOOPT-NEXT: s_mov_b32 s16, 4 +; NOOPT-NEXT: s_mov_b32 s17, 3 +; NOOPT-NEXT: s_mov_b32 s18, 2 +; NOOPT-NEXT: s_mov_b32 s19, 1 +; NOOPT-NEXT: s_mov_b32 s20, 0 +; NOOPT-NEXT: v_mov_b32_e32 v15, s20 +; NOOPT-NEXT: v_mov_b32_e32 v14, s19 +; NOOPT-NEXT: v_mov_b32_e32 v13, s18 +; NOOPT-NEXT: v_mov_b32_e32 v12, s17 +; NOOPT-NEXT: v_mov_b32_e32 v11, s16 +; NOOPT-NEXT: v_mov_b32_e32 v10, s15 +; NOOPT-NEXT: v_mov_b32_e32 v9, s14 +; NOOPT-NEXT: v_mov_b32_e32 v8, s13 +; NOOPT-NEXT: v_mov_b32_e32 v7, s12 +; NOOPT-NEXT: v_mov_b32_e32 v6, s11 +; NOOPT-NEXT: v_mov_b32_e32 v5, s10 +; NOOPT-NEXT: v_mov_b32_e32 v4, s9 +; NOOPT-NEXT: v_mov_b32_e32 v3, s8 +; NOOPT-NEXT: v_mov_b32_e32 v2, s7 +; NOOPT-NEXT: 
v_mov_b32_e32 v1, s6 +; NOOPT-NEXT: v_mov_b32_e32 v0, s5 +; NOOPT-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v16, v14 +; NOOPT-NEXT: v_mov_b32_e32 v17, v13 +; NOOPT-NEXT: v_mov_b32_e32 v18, v12 +; NOOPT-NEXT: v_mov_b32_e32 v19, v11 +; NOOPT-NEXT: v_mov_b32_e32 v20, v10 +; NOOPT-NEXT: v_mov_b32_e32 v21, v9 +; NOOPT-NEXT: v_mov_b32_e32 v22, v8 +; NOOPT-NEXT: v_mov_b32_e32 v23, v7 +; NOOPT-NEXT: v_mov_b32_e32 v24, v6 +; NOOPT-NEXT: v_mov_b32_e32 v25, v5 +; NOOPT-NEXT: v_mov_b32_e32 v26, v4 +; NOOPT-NEXT: v_mov_b32_e32 v27, v3 +; NOOPT-NEXT: v_mov_b32_e32 v28, v2 +; NOOPT-NEXT: v_mov_b32_e32 v29, v1 +; NOOPT-NEXT: v_mov_b32_e32 v30, v0 +; NOOPT-NEXT: v_mov_b32_e32 v0, 16 +; NOOPT-NEXT: s_add_i32 m0, s4, 0xfffffe00 +; NOOPT-NEXT: v_movreld_b32_e32 v15, v0 +; NOOPT-NEXT: v_mov_b32_e32 v4, v18 +; NOOPT-NEXT: v_mov_b32_e32 v5, v17 +; NOOPT-NEXT: v_mov_b32_e32 v6, v16 +; NOOPT-NEXT: v_mov_b32_e32 v0, v15 +; NOOPT-NEXT: v_mov_b32_e32 v1, v22 +; NOOPT-NEXT: v_mov_b32_e32 v2, v21 +; NOOPT-NEXT: v_mov_b32_e32 v3, v20 +; NOOPT-NEXT: v_mov_b32_e32 v7, v19 +; NOOPT-NEXT: v_mov_b32_e32 v12, v26 +; NOOPT-NEXT: v_mov_b32_e32 v13, v25 +; NOOPT-NEXT: v_mov_b32_e32 v14, v24 +; NOOPT-NEXT: v_mov_b32_e32 v8, v23 +; NOOPT-NEXT: v_mov_b32_e32 v9, v30 +; NOOPT-NEXT: v_mov_b32_e32 v10, v29 +; NOOPT-NEXT: v_mov_b32_e32 v11, v28 +; NOOPT-NEXT: v_mov_b32_e32 v15, v27 +; NOOPT-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16_vgpr17_vgpr18 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v16, v11 +; NOOPT-NEXT: v_mov_b32_e32 v17, v10 +; NOOPT-NEXT: v_mov_b32_e32 v18, v9 +; NOOPT-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:48 +; NOOPT-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v9, v14 +; NOOPT-NEXT: v_mov_b32_e32 v10, v13 +; NOOPT-NEXT: v_mov_b32_e32 v11, 
v12 +; NOOPT-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; NOOPT-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8_vgpr9_vgpr10 killed $exec +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v8, v3 +; NOOPT-NEXT: v_mov_b32_e32 v9, v2 +; NOOPT-NEXT: v_mov_b32_e32 v10, v1 +; NOOPT-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:16 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: insert_neg_offset_sgpr: +; SI-MOVREL: ; %bb.0: ; %entry +; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0xd +; SI-MOVREL-NEXT: v_mov_b32_e32 v0, 0 +; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 1 +; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 2 +; SI-MOVREL-NEXT: v_mov_b32_e32 v3, 3 +; SI-MOVREL-NEXT: v_mov_b32_e32 v4, 4 +; SI-MOVREL-NEXT: v_mov_b32_e32 v5, 5 +; SI-MOVREL-NEXT: v_mov_b32_e32 v6, 6 +; SI-MOVREL-NEXT: v_mov_b32_e32 v7, 7 +; SI-MOVREL-NEXT: v_mov_b32_e32 v8, 8 +; SI-MOVREL-NEXT: v_mov_b32_e32 v9, 9 +; SI-MOVREL-NEXT: v_mov_b32_e32 v10, 10 +; SI-MOVREL-NEXT: v_mov_b32_e32 v11, 11 +; SI-MOVREL-NEXT: v_mov_b32_e32 v12, 12 +; SI-MOVREL-NEXT: v_mov_b32_e32 v13, 13 +; SI-MOVREL-NEXT: v_mov_b32_e32 v14, 14 +; SI-MOVREL-NEXT: v_mov_b32_e32 v15, 15 +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: s_add_i32 m0, s4, 0xfffffe00 +; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s2, -1 +; SI-MOVREL-NEXT: v_movreld_b32_e32 v0, 16 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-MOVREL-NEXT: s_endpgm 
+; +; VI-MOVREL-LABEL: insert_neg_offset_sgpr: +; VI-MOVREL: ; %bb.0: ; %entry +; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; VI-MOVREL-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 1 +; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 2 +; VI-MOVREL-NEXT: v_mov_b32_e32 v3, 3 +; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; VI-MOVREL-NEXT: s_add_i32 m0, s2, 0xfffffe00 +; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48 +; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 4 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, 5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v6, 6 +; VI-MOVREL-NEXT: v_mov_b32_e32 v7, 7 +; VI-MOVREL-NEXT: v_mov_b32_e32 v8, 8 +; VI-MOVREL-NEXT: v_mov_b32_e32 v9, 9 +; VI-MOVREL-NEXT: v_mov_b32_e32 v10, 10 +; VI-MOVREL-NEXT: v_mov_b32_e32 v11, 11 +; VI-MOVREL-NEXT: v_mov_b32_e32 v12, 12 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, 13 +; VI-MOVREL-NEXT: v_mov_b32_e32 v14, 14 +; VI-MOVREL-NEXT: v_mov_b32_e32 v15, 15 +; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s2 +; VI-MOVREL-NEXT: s_add_u32 s2, s0, 32 +; VI-MOVREL-NEXT: v_movreld_b32_e32 v0, 16 +; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; VI-MOVREL-NEXT: s_nop 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s2 +; VI-MOVREL-NEXT: s_add_u32 s2, s0, 16 +; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; VI-MOVREL-NEXT: s_nop 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v8, s2 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; VI-MOVREL-NEXT: s_nop 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-MOVREL-NEXT: s_endpgm +; +; VI-IDXMODE-LABEL: insert_neg_offset_sgpr: +; VI-IDXMODE: ; %bb.0: ; %entry +; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; VI-IDXMODE-NEXT: 
s_load_dword s2, s[2:3], 0x34 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, 1 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, 2 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, 3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, 4 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, 5 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, 6 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, 7 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, 8 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, 9 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, 10 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, 11 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, 12 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, 13 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, 14 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, 15 +; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; VI-IDXMODE-NEXT: s_addk_i32 s2, 0xfe00 +; VI-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 16 +; VI-IDXMODE-NEXT: s_set_gpr_idx_off +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 48 +; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, s2 +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 32 +; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; VI-IDXMODE-NEXT: s_nop 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s2 +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 16 +; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; VI-IDXMODE-NEXT: s_nop 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, s2 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; VI-IDXMODE-NEXT: s_nop 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-IDXMODE-NEXT: s_endpgm +; +; GFX9-IDXMODE-LABEL: insert_neg_offset_sgpr: +; GFX9-IDXMODE: ; %bb.0: ; %entry +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-IDXMODE-NEXT: s_load_dword s4, 
s[2:3], 0x34 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 1 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, 2 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, 3 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, 4 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, 5 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, 6 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, 7 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, 8 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, 9 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, 10 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, 11 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, 12 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, 13 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, 14 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, 15 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, v15 +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: s_addk_i32 s4, 0xfe00 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, v14 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, v13 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, v12 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, v11 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, v10 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, v9 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, v8 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, v7 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, v5 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, v4 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s4, gpr_idx(DST) +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 16 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v0, v[13:16], s[0:1] offset:48 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v0, v[9:12], s[0:1] offset:32 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v0, v[5:8], s[0:1] offset:16 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1] +; GFX9-IDXMODE-NEXT: s_endpgm entry: %index = add i32 %offset, -512 %value = insertelement 
<16 x i32> , i32 16, i32 %index @@ -267,17 +2929,239 @@ entry: ; The vector indexed into is originally loaded into an SGPR rather ; than built with a reg_sequence - -; GCN-LABEL: {{^}}insert_neg_offset_sgpr_loadreg: -; The offset depends on the register that holds the first element of the vector. -; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}} -; MOVREL: v_movreld_b32_e32 v0, 5 - -; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}} -; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], gpr_idx(DST) -; IDXMODE-NEXT: v_mov_b32_e32 v0, 5 -; IDXMODE-NEXT: s_set_gpr_idx_off define amdgpu_kernel void @insert_neg_offset_sgpr_loadreg(ptr addrspace(1) %in, ptr addrspace(1) %out, <16 x i32> %vec, i32 %offset) { +; NOOPT-LABEL: insert_neg_offset_sgpr_loadreg: +; NOOPT: ; %bb.0: ; %entry +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; NOOPT-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x19 +; NOOPT-NEXT: s_load_dword s4, s[2:3], 0x29 +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b32 s7, s1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s5, 0xf000 +; NOOPT-NEXT: s_mov_b32 s6, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s7 +; NOOPT-NEXT: s_mov_b32 s2, s6 +; NOOPT-NEXT: s_mov_b32 s3, s5 +; NOOPT-NEXT: v_mov_b32_e32 v0, 5 +; NOOPT-NEXT: v_mov_b32_e32 v30, s23 +; NOOPT-NEXT: v_mov_b32_e32 v29, s22 +; NOOPT-NEXT: v_mov_b32_e32 v28, s21 +; NOOPT-NEXT: v_mov_b32_e32 v27, s20 +; NOOPT-NEXT: v_mov_b32_e32 v26, s19 +; NOOPT-NEXT: v_mov_b32_e32 v25, s18 +; NOOPT-NEXT: v_mov_b32_e32 v24, s17 +; NOOPT-NEXT: v_mov_b32_e32 v23, s16 +; NOOPT-NEXT: v_mov_b32_e32 v22, s15 +; NOOPT-NEXT: v_mov_b32_e32 v21, s14 +; NOOPT-NEXT: v_mov_b32_e32 v20, s13 +; NOOPT-NEXT: v_mov_b32_e32 v19, s12 +; NOOPT-NEXT: v_mov_b32_e32 v18, s11 +; NOOPT-NEXT: v_mov_b32_e32 v17, s10 +; NOOPT-NEXT: v_mov_b32_e32 v16, s9 +; NOOPT-NEXT: v_mov_b32_e32 v15, s8 +; NOOPT-NEXT: s_add_i32 m0, s4, 0xfffffe00 
+; NOOPT-NEXT: v_movreld_b32_e32 v15, v0 +; NOOPT-NEXT: v_mov_b32_e32 v4, v18 +; NOOPT-NEXT: v_mov_b32_e32 v5, v17 +; NOOPT-NEXT: v_mov_b32_e32 v6, v16 +; NOOPT-NEXT: v_mov_b32_e32 v0, v15 +; NOOPT-NEXT: v_mov_b32_e32 v1, v22 +; NOOPT-NEXT: v_mov_b32_e32 v2, v21 +; NOOPT-NEXT: v_mov_b32_e32 v3, v20 +; NOOPT-NEXT: v_mov_b32_e32 v7, v19 +; NOOPT-NEXT: v_mov_b32_e32 v12, v26 +; NOOPT-NEXT: v_mov_b32_e32 v13, v25 +; NOOPT-NEXT: v_mov_b32_e32 v14, v24 +; NOOPT-NEXT: v_mov_b32_e32 v8, v23 +; NOOPT-NEXT: v_mov_b32_e32 v9, v30 +; NOOPT-NEXT: v_mov_b32_e32 v10, v29 +; NOOPT-NEXT: v_mov_b32_e32 v11, v28 +; NOOPT-NEXT: v_mov_b32_e32 v15, v27 +; NOOPT-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16_vgpr17_vgpr18 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v16, v11 +; NOOPT-NEXT: v_mov_b32_e32 v17, v10 +; NOOPT-NEXT: v_mov_b32_e32 v18, v9 +; NOOPT-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:48 +; NOOPT-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v9, v14 +; NOOPT-NEXT: v_mov_b32_e32 v10, v13 +; NOOPT-NEXT: v_mov_b32_e32 v11, v12 +; NOOPT-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; NOOPT-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8_vgpr9_vgpr10 killed $exec +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v8, v3 +; NOOPT-NEXT: v_mov_b32_e32 v9, v2 +; NOOPT-NEXT: v_mov_b32_e32 v10, v1 +; NOOPT-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:16 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: insert_neg_offset_sgpr_loadreg: +; SI-MOVREL: ; %bb.0: ; %entry +; SI-MOVREL-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 +; SI-MOVREL-NEXT: s_load_dword s0, s[2:3], 0x29 +; SI-MOVREL-NEXT: 
s_load_dwordx2 s[20:21], s[2:3], 0xb +; SI-MOVREL-NEXT: s_mov_b32 s23, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s22, -1 +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: v_mov_b32_e32 v0, s4 +; SI-MOVREL-NEXT: v_mov_b32_e32 v1, s5 +; SI-MOVREL-NEXT: v_mov_b32_e32 v2, s6 +; SI-MOVREL-NEXT: v_mov_b32_e32 v3, s7 +; SI-MOVREL-NEXT: v_mov_b32_e32 v4, s8 +; SI-MOVREL-NEXT: v_mov_b32_e32 v5, s9 +; SI-MOVREL-NEXT: v_mov_b32_e32 v6, s10 +; SI-MOVREL-NEXT: v_mov_b32_e32 v7, s11 +; SI-MOVREL-NEXT: v_mov_b32_e32 v8, s12 +; SI-MOVREL-NEXT: v_mov_b32_e32 v9, s13 +; SI-MOVREL-NEXT: v_mov_b32_e32 v10, s14 +; SI-MOVREL-NEXT: v_mov_b32_e32 v11, s15 +; SI-MOVREL-NEXT: v_mov_b32_e32 v12, s16 +; SI-MOVREL-NEXT: v_mov_b32_e32 v13, s17 +; SI-MOVREL-NEXT: v_mov_b32_e32 v14, s18 +; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s19 +; SI-MOVREL-NEXT: s_add_i32 m0, s0, 0xfffffe00 +; SI-MOVREL-NEXT: v_movreld_b32_e32 v0, 5 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[12:15], off, s[20:23], 0 offset:48 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[8:11], off, s[20:23], 0 offset:32 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[4:7], off, s[20:23], 0 offset:16 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 +; SI-MOVREL-NEXT: s_endpgm +; +; VI-MOVREL-LABEL: insert_neg_offset_sgpr_loadreg: +; VI-MOVREL: ; %bb.0: ; %entry +; VI-MOVREL-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 +; VI-MOVREL-NEXT: s_load_dword s20, s[2:3], 0xa4 +; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s4 +; VI-MOVREL-NEXT: s_add_i32 m0, s20, 0xfffffe00 +; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48 +; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v2, s6 +; VI-MOVREL-NEXT: v_mov_b32_e32 v3, s7 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s8 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s9 +; VI-MOVREL-NEXT: v_mov_b32_e32 v6, s10 +; VI-MOVREL-NEXT: v_mov_b32_e32 v7, 
s11 +; VI-MOVREL-NEXT: v_mov_b32_e32 v8, s12 +; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s13 +; VI-MOVREL-NEXT: v_mov_b32_e32 v10, s14 +; VI-MOVREL-NEXT: v_mov_b32_e32 v11, s15 +; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s16 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s17 +; VI-MOVREL-NEXT: v_mov_b32_e32 v14, s18 +; VI-MOVREL-NEXT: v_mov_b32_e32 v15, s19 +; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s2 +; VI-MOVREL-NEXT: s_add_u32 s2, s0, 32 +; VI-MOVREL-NEXT: v_movreld_b32_e32 v0, 5 +; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; VI-MOVREL-NEXT: s_nop 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s2 +; VI-MOVREL-NEXT: s_add_u32 s2, s0, 16 +; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; VI-MOVREL-NEXT: s_nop 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v8, s2 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; VI-MOVREL-NEXT: s_nop 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-MOVREL-NEXT: s_endpgm +; +; VI-IDXMODE-LABEL: insert_neg_offset_sgpr_loadreg: +; VI-IDXMODE: ; %bb.0: ; %entry +; VI-IDXMODE-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 +; VI-IDXMODE-NEXT: s_load_dword s20, s[2:3], 0xa4 +; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s4 +; VI-IDXMODE-NEXT: s_addk_i32 s20, 0xfe00 +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 48 +; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s5 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, s6 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, s7 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s8 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s9 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, s10 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, s11 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, s12 +; 
VI-IDXMODE-NEXT: v_mov_b32_e32 v9, s13 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, s14 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, s15 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s16 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s17 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, s18 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, s19 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, s2 +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 32 +; VI-IDXMODE-NEXT: s_set_gpr_idx_on s20, gpr_idx(DST) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 5 +; VI-IDXMODE-NEXT: s_set_gpr_idx_off +; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; VI-IDXMODE-NEXT: s_nop 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s2 +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 16 +; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; VI-IDXMODE-NEXT: s_nop 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, s2 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; VI-IDXMODE-NEXT: s_nop 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-IDXMODE-NEXT: s_endpgm +; +; GFX9-IDXMODE-LABEL: insert_neg_offset_sgpr_loadreg: +; GFX9-IDXMODE: ; %bb.0: ; %entry +; GFX9-IDXMODE-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 +; GFX9-IDXMODE-NEXT: s_load_dword s20, s[2:3], 0xa4 +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0 +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, 
s13 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, s14 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, s15 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, s16 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, s17 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, s18 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, s19 +; GFX9-IDXMODE-NEXT: s_addk_i32 s20, 0xfe00 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s20, gpr_idx(DST) +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 5 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX9-IDXMODE-NEXT: s_endpgm entry: %index = add i32 %offset, -512 %value = insertelement <16 x i32> %vec, i32 5, i32 %index @@ -285,29 +3169,885 @@ entry: ret void } -; GCN-LABEL: {{^}}insert_neg_offset_vgpr: -; The offset depends on the register that holds the first element of the vector. 
- -; GCN: v_cmp_eq_u32_e32 -; GCN-COUNT-16: v_cndmask_b32 -; GCN-COUNT-4: buffer_store_dwordx4 define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; NOOPT-LABEL: insert_neg_offset_vgpr: +; NOOPT: ; %bb.0: ; %entry +; NOOPT-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 +; NOOPT-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 +; NOOPT-NEXT: s_mov_b32 s22, -1 +; NOOPT-NEXT: s_mov_b32 s23, 0xe8f000 +; NOOPT-NEXT: s_add_u32 s20, s20, s9 +; NOOPT-NEXT: s_addc_u32 s21, s21, 0 +; NOOPT-NEXT: ; implicit-def: $vgpr16 : SGPR spill to VGPR lane +; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 offset:136 ; 4-byte Folded Spill +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b32 s6, s1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s4, 0xf000 +; NOOPT-NEXT: s_mov_b32 s5, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s6 +; NOOPT-NEXT: s_mov_b32 s2, s5 +; NOOPT-NEXT: s_mov_b32 s3, s4 +; NOOPT-NEXT: v_writelane_b32 v16, s0, 0 +; NOOPT-NEXT: v_writelane_b32 v16, s1, 1 +; NOOPT-NEXT: v_writelane_b32 v16, s2, 2 +; NOOPT-NEXT: v_writelane_b32 v16, s3, 3 +; NOOPT-NEXT: s_mov_b32 s0, 16 +; NOOPT-NEXT: s_mov_b32 s1, 15 +; NOOPT-NEXT: s_mov_b32 s2, 14 +; NOOPT-NEXT: s_mov_b32 s3, 13 +; NOOPT-NEXT: s_mov_b32 s4, 12 +; NOOPT-NEXT: s_mov_b32 s5, 11 +; NOOPT-NEXT: s_mov_b32 s6, 10 +; NOOPT-NEXT: s_mov_b32 s7, 9 +; NOOPT-NEXT: s_mov_b32 s8, 8 +; NOOPT-NEXT: s_mov_b32 s9, 7 +; NOOPT-NEXT: s_mov_b32 s10, 6 +; NOOPT-NEXT: s_mov_b32 s11, 5 +; NOOPT-NEXT: s_mov_b32 s12, 4 +; NOOPT-NEXT: s_mov_b32 s13, 3 +; NOOPT-NEXT: s_mov_b32 s14, 2 +; NOOPT-NEXT: s_mov_b32 s15, 1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, s15 +; NOOPT-NEXT: v_mov_b32_e32 v31, s14 +; NOOPT-NEXT: v_mov_b32_e32 v30, s13 +; NOOPT-NEXT: v_mov_b32_e32 v29, s12 +; NOOPT-NEXT: v_mov_b32_e32 v28, s11 +; 
NOOPT-NEXT: v_mov_b32_e32 v27, s10 +; NOOPT-NEXT: v_mov_b32_e32 v26, s9 +; NOOPT-NEXT: v_mov_b32_e32 v25, s8 +; NOOPT-NEXT: v_mov_b32_e32 v24, s7 +; NOOPT-NEXT: v_mov_b32_e32 v23, s6 +; NOOPT-NEXT: v_mov_b32_e32 v22, s5 +; NOOPT-NEXT: v_mov_b32_e32 v21, s4 +; NOOPT-NEXT: v_mov_b32_e32 v20, s3 +; NOOPT-NEXT: v_mov_b32_e32 v19, s2 +; NOOPT-NEXT: v_mov_b32_e32 v18, s1 +; NOOPT-NEXT: v_mov_b32_e32 v17, s0 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v31 +; NOOPT-NEXT: v_mov_b32_e32 v2, v30 +; NOOPT-NEXT: v_mov_b32_e32 v3, v29 +; NOOPT-NEXT: v_mov_b32_e32 v4, v28 +; NOOPT-NEXT: v_mov_b32_e32 v5, v27 +; NOOPT-NEXT: v_mov_b32_e32 v6, v26 +; NOOPT-NEXT: v_mov_b32_e32 v7, v25 +; NOOPT-NEXT: v_mov_b32_e32 v8, v24 +; NOOPT-NEXT: v_mov_b32_e32 v9, v23 +; NOOPT-NEXT: v_mov_b32_e32 v10, v22 +; NOOPT-NEXT: v_mov_b32_e32 v11, v21 +; NOOPT-NEXT: v_mov_b32_e32 v12, v20 +; NOOPT-NEXT: v_mov_b32_e32 v13, v19 +; NOOPT-NEXT: v_mov_b32_e32 v14, v18 +; NOOPT-NEXT: v_mov_b32_e32 v15, v17 +; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 offset:72 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 offset:76 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v2, off, s[20:23], 0 offset:80 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:84 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[20:23], 0 offset:88 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[20:23], 0 offset:92 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[20:23], 0 offset:96 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[20:23], 0 offset:100 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[20:23], 0 offset:104 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, 
s[20:23], 0 offset:108 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[20:23], 0 offset:112 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[20:23], 0 offset:116 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[20:23], 0 offset:120 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[20:23], 0 offset:124 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[20:23], 0 offset:128 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[20:23], 0 offset:132 ; 4-byte Folded Spill +; NOOPT-NEXT: v_mov_b32_e32 v17, 33 +; NOOPT-NEXT: buffer_store_dword v17, off, s[20:23], 0 offset:68 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 s[0:1], exec +; NOOPT-NEXT: v_writelane_b32 v16, s0, 4 +; NOOPT-NEXT: v_writelane_b32 v16, s1, 5 +; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 +; NOOPT-NEXT: buffer_store_dword v16, off, s[20:23], 0 offset:64 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 exec, s[16:17] +; NOOPT-NEXT: s_waitcnt expcnt(2) +; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 offset:4 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v2, off, s[20:23], 0 offset:8 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:12 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[20:23], 0 offset:16 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[20:23], 0 offset:20 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[20:23], 0 offset:24 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[20:23], 0 offset:28 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[20:23], 0 offset:32 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[20:23], 0 offset:36 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[20:23], 0 offset:40 ; 
4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[20:23], 0 offset:44 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[20:23], 0 offset:48 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[20:23], 0 offset:52 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[20:23], 0 offset:56 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[20:23], 0 offset:60 ; 4-byte Folded Spill +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 +; NOOPT-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[16:17] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 6 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 7 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v1, off, s[20:23], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v2, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v3, off, s[20:23], 0 offset:8 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v4, off, s[20:23], 0 offset:12 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v5, off, s[20:23], 0 offset:16 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v6, off, s[20:23], 0 offset:20 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v7, off, s[20:23], 0 offset:24 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v8, off, s[20:23], 0 offset:28 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v9, off, s[20:23], 0 offset:32 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v10, off, s[20:23], 0 offset:36 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v11, off, s[20:23], 0 offset:40 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v12, off, s[20:23], 0 offset:44 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v13, off, 
s[20:23], 0 offset:48 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v14, off, s[20:23], 0 offset:52 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v15, off, s[20:23], 0 offset:56 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v16, off, s[20:23], 0 offset:60 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v17, off, s[20:23], 0 offset:68 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v18, off, s[20:23], 0 offset:136 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readfirstlane_b32 s2, v18 +; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v18 +; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; NOOPT-NEXT: s_add_i32 m0, s2, 0xfffffe00 +; NOOPT-NEXT: v_movreld_b32_e32 v1, v17 +; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 offset:140 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v2, off, s[20:23], 0 offset:144 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:148 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[20:23], 0 offset:152 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[20:23], 0 offset:156 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[20:23], 0 offset:160 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[20:23], 0 offset:164 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[20:23], 0 offset:168 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[20:23], 0 offset:172 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[20:23], 0 offset:176 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[20:23], 0 offset:180 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[20:23], 0 offset:184 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[20:23], 0 offset:188 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[20:23], 0 offset:192 ; 
4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[20:23], 0 offset:196 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v16, off, s[20:23], 0 offset:200 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v2, off, s[20:23], 0 offset:4 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:8 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[20:23], 0 offset:12 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[20:23], 0 offset:16 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[20:23], 0 offset:20 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[20:23], 0 offset:24 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[20:23], 0 offset:28 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[20:23], 0 offset:32 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[20:23], 0 offset:36 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[20:23], 0 offset:40 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[20:23], 0 offset:44 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[20:23], 0 offset:48 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[20:23], 0 offset:52 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[20:23], 0 offset:56 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v16, off, s[20:23], 0 offset:60 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 s[2:3], s[0:1] +; NOOPT-NEXT: v_writelane_b32 v0, s2, 6 +; NOOPT-NEXT: v_writelane_b32 v0, s3, 7 +; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 +; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 offset:64 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 exec, s[16:17] +; NOOPT-NEXT: s_xor_b64 exec, exec, s[0:1] +; 
NOOPT-NEXT: s_cbranch_execnz .LBB14_1 +; NOOPT-NEXT: ; %bb.2: +; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[16:17] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 4 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 5 +; NOOPT-NEXT: s_mov_b64 exec, s[0:1] +; NOOPT-NEXT: ; %bb.3: +; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 +; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[16:17] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 0 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 1 +; NOOPT-NEXT: v_readlane_b32 s2, v0, 2 +; NOOPT-NEXT: v_readlane_b32 s3, v0, 3 +; NOOPT-NEXT: buffer_load_dword v16, off, s[20:23], 0 offset:140 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v17, off, s[20:23], 0 offset:144 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v18, off, s[20:23], 0 offset:148 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v19, off, s[20:23], 0 offset:152 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v20, off, s[20:23], 0 offset:156 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v21, off, s[20:23], 0 offset:160 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v22, off, s[20:23], 0 offset:164 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v23, off, s[20:23], 0 offset:168 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v24, off, s[20:23], 0 offset:172 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v25, off, s[20:23], 0 offset:176 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v26, off, s[20:23], 0 offset:180 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v27, off, s[20:23], 0 offset:184 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v28, off, s[20:23], 0 offset:188 ; 4-byte Folded Reload +; NOOPT-NEXT: 
buffer_load_dword v29, off, s[20:23], 0 offset:192 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v30, off, s[20:23], 0 offset:196 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v31, off, s[20:23], 0 offset:200 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v5, v19 +; NOOPT-NEXT: v_mov_b32_e32 v6, v18 +; NOOPT-NEXT: v_mov_b32_e32 v7, v17 +; NOOPT-NEXT: v_mov_b32_e32 v1, v16 +; NOOPT-NEXT: v_mov_b32_e32 v2, v23 +; NOOPT-NEXT: v_mov_b32_e32 v3, v22 +; NOOPT-NEXT: v_mov_b32_e32 v4, v21 +; NOOPT-NEXT: v_mov_b32_e32 v8, v20 +; NOOPT-NEXT: v_mov_b32_e32 v13, v27 +; NOOPT-NEXT: v_mov_b32_e32 v14, v26 +; NOOPT-NEXT: v_mov_b32_e32 v15, v25 +; NOOPT-NEXT: v_mov_b32_e32 v9, v24 +; NOOPT-NEXT: v_mov_b32_e32 v10, v31 +; NOOPT-NEXT: v_mov_b32_e32 v11, v30 +; NOOPT-NEXT: v_mov_b32_e32 v12, v29 +; NOOPT-NEXT: v_mov_b32_e32 v16, v28 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17_vgpr18_vgpr19 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v17, v12 +; NOOPT-NEXT: v_mov_b32_e32 v18, v11 +; NOOPT-NEXT: v_mov_b32_e32 v19, v10 +; NOOPT-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10_vgpr11_vgpr12 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v10, v15 +; NOOPT-NEXT: v_mov_b32_e32 v11, v14 +; NOOPT-NEXT: v_mov_b32_e32 v12, v13 +; NOOPT-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:32 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec +; 
NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v9, v4 +; NOOPT-NEXT: v_mov_b32_e32 v10, v3 +; NOOPT-NEXT: v_mov_b32_e32 v11, v2 +; NOOPT-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v2, v7 +; NOOPT-NEXT: v_mov_b32_e32 v3, v6 +; NOOPT-NEXT: v_mov_b32_e32 v4, v5 +; NOOPT-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0 +; NOOPT-NEXT: ; kill: killed $vgpr0 +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: insert_neg_offset_vgpr: +; SI-MOVREL: ; %bb.0: ; %entry +; SI-MOVREL-NEXT: v_add_i32_e32 v12, vcc, 0xfffffe00, v0 +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 3, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v3, 4, 33, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 2, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v2, 3, 33, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v1, 2, 33, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v0, 1, 33, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 7, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v7, 8, 33, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 6, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v6, 7, 33, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 5, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v5, 6, 33, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 4, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v4, 5, 33, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 11, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v11, 12, 33, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 10, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v10, 11, 33, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 9, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v9, 10, 33, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 8, v12 +; SI-MOVREL-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v8, 9, 33, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 15, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v15, 16, 33, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 14, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v14, 15, 33, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 13, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v13, 14, 33, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 12, v12 +; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s2, -1 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v12, 13, 33, vcc +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-MOVREL-NEXT: s_endpgm +; +; VI-LABEL: insert_neg_offset_vgpr: +; VI: ; %bb.0: ; %entry +; VI-NEXT: v_add_u32_e32 v12, vcc, 0xfffffe00, v0 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v12 +; VI-NEXT: v_cndmask_b32_e64 v3, 4, 33, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 2, v12 +; VI-NEXT: v_cndmask_b32_e64 v2, 3, 33, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 +; VI-NEXT: v_cndmask_b32_e64 v1, 2, 33, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12 +; VI-NEXT: v_cndmask_b32_e64 v0, 1, 33, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 7, v12 +; VI-NEXT: v_cndmask_b32_e64 v7, 8, 33, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 6, v12 +; VI-NEXT: v_cndmask_b32_e64 v6, 7, 33, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 5, v12 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; VI-NEXT: v_cndmask_b32_e64 v5, 6, 33, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 4, v12 +; VI-NEXT: v_cndmask_b32_e64 v4, 5, 33, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 11, v12 +; VI-NEXT: v_cndmask_b32_e64 v11, 12, 33, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 10, v12 +; VI-NEXT: v_cndmask_b32_e64 v10, 11, 33, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 9, v12 
+; VI-NEXT: v_cndmask_b32_e64 v9, 10, 33, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 8, v12 +; VI-NEXT: v_cndmask_b32_e64 v8, 9, 33, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 15, v12 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_u32 s2, s0, 48 +; VI-NEXT: v_cndmask_b32_e64 v15, 16, 33, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 14, v12 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: v_cndmask_b32_e64 v14, 15, 33, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 13, v12 +; VI-NEXT: v_mov_b32_e32 v17, s3 +; VI-NEXT: v_cndmask_b32_e64 v13, 14, 33, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 12, v12 +; VI-NEXT: v_mov_b32_e32 v16, s2 +; VI-NEXT: s_add_u32 s2, s0, 32 +; VI-NEXT: v_cndmask_b32_e64 v12, 13, 33, vcc +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; VI-NEXT: s_nop 0 +; VI-NEXT: v_mov_b32_e32 v13, s3 +; VI-NEXT: v_mov_b32_e32 v12, s2 +; VI-NEXT: s_add_u32 s2, s0, 16 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; VI-NEXT: s_nop 0 +; VI-NEXT: v_mov_b32_e32 v9, s3 +; VI-NEXT: v_mov_b32_e32 v8, s2 +; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; VI-NEXT: s_nop 0 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_endpgm +; +; GFX9-IDXMODE-LABEL: insert_neg_offset_vgpr: +; GFX9-IDXMODE: ; %bb.0: ; %entry +; GFX9-IDXMODE-NEXT: v_add_u32_e32 v12, 0xfffffe00, v0 +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 3, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v3, 4, 33, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 2, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v2, 3, 33, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v1, 2, 33, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v0, 1, 33, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 7, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v7, 8, 33, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 6, v12 +; 
GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v6, 7, 33, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 5, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v5, 6, 33, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 4, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v4, 5, 33, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 11, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v11, 12, 33, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 10, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v10, 11, 33, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 9, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v9, 10, 33, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 8, v12 +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v8, 9, 33, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 15, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v15, 16, 33, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 14, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v14, 15, 33, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 13, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v13, 14, 33, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 12, v12 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v12, 13, 33, vcc +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX9-IDXMODE-NEXT: s_endpgm entry: - %id = call i32 @llvm.amdgcn.workitem.id.x() #1 + %id = call i32 @llvm.amdgcn.workitem.id.x() %index = add i32 %id, -512 %value = insertelement <16 x i32> , i32 33, i32 %index store <16 x i32> %value, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}insert_neg_inline_offset_vgpr: - -; GCN: v_cmp_eq_u32_e32 -; GCN-COUNT-16: v_cndmask_b32 -; GCN-COUNT-4: buffer_store_dwordx4 define amdgpu_kernel 
void @insert_neg_inline_offset_vgpr(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; NOOPT-LABEL: insert_neg_inline_offset_vgpr: +; NOOPT: ; %bb.0: ; %entry +; NOOPT-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 +; NOOPT-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 +; NOOPT-NEXT: s_mov_b32 s22, -1 +; NOOPT-NEXT: s_mov_b32 s23, 0xe8f000 +; NOOPT-NEXT: s_add_u32 s20, s20, s9 +; NOOPT-NEXT: s_addc_u32 s21, s21, 0 +; NOOPT-NEXT: ; implicit-def: $vgpr16 : SGPR spill to VGPR lane +; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 offset:136 ; 4-byte Folded Spill +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b32 s6, s1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s4, 0xf000 +; NOOPT-NEXT: s_mov_b32 s5, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s6 +; NOOPT-NEXT: s_mov_b32 s2, s5 +; NOOPT-NEXT: s_mov_b32 s3, s4 +; NOOPT-NEXT: v_writelane_b32 v16, s0, 0 +; NOOPT-NEXT: v_writelane_b32 v16, s1, 1 +; NOOPT-NEXT: v_writelane_b32 v16, s2, 2 +; NOOPT-NEXT: v_writelane_b32 v16, s3, 3 +; NOOPT-NEXT: s_mov_b32 s0, 16 +; NOOPT-NEXT: s_mov_b32 s1, 15 +; NOOPT-NEXT: s_mov_b32 s2, 14 +; NOOPT-NEXT: s_mov_b32 s3, 13 +; NOOPT-NEXT: s_mov_b32 s4, 12 +; NOOPT-NEXT: s_mov_b32 s5, 11 +; NOOPT-NEXT: s_mov_b32 s6, 10 +; NOOPT-NEXT: s_mov_b32 s7, 9 +; NOOPT-NEXT: s_mov_b32 s8, 8 +; NOOPT-NEXT: s_mov_b32 s9, 7 +; NOOPT-NEXT: s_mov_b32 s10, 6 +; NOOPT-NEXT: s_mov_b32 s11, 5 +; NOOPT-NEXT: s_mov_b32 s12, 4 +; NOOPT-NEXT: s_mov_b32 s13, 3 +; NOOPT-NEXT: s_mov_b32 s14, 2 +; NOOPT-NEXT: s_mov_b32 s15, 1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, s15 +; NOOPT-NEXT: v_mov_b32_e32 v31, s14 +; NOOPT-NEXT: v_mov_b32_e32 v30, s13 +; NOOPT-NEXT: v_mov_b32_e32 v29, s12 +; NOOPT-NEXT: v_mov_b32_e32 v28, s11 +; NOOPT-NEXT: v_mov_b32_e32 v27, s10 +; NOOPT-NEXT: v_mov_b32_e32 v26, s9 +; NOOPT-NEXT: v_mov_b32_e32 v25, s8 
+; NOOPT-NEXT: v_mov_b32_e32 v24, s7 +; NOOPT-NEXT: v_mov_b32_e32 v23, s6 +; NOOPT-NEXT: v_mov_b32_e32 v22, s5 +; NOOPT-NEXT: v_mov_b32_e32 v21, s4 +; NOOPT-NEXT: v_mov_b32_e32 v20, s3 +; NOOPT-NEXT: v_mov_b32_e32 v19, s2 +; NOOPT-NEXT: v_mov_b32_e32 v18, s1 +; NOOPT-NEXT: v_mov_b32_e32 v17, s0 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v31 +; NOOPT-NEXT: v_mov_b32_e32 v2, v30 +; NOOPT-NEXT: v_mov_b32_e32 v3, v29 +; NOOPT-NEXT: v_mov_b32_e32 v4, v28 +; NOOPT-NEXT: v_mov_b32_e32 v5, v27 +; NOOPT-NEXT: v_mov_b32_e32 v6, v26 +; NOOPT-NEXT: v_mov_b32_e32 v7, v25 +; NOOPT-NEXT: v_mov_b32_e32 v8, v24 +; NOOPT-NEXT: v_mov_b32_e32 v9, v23 +; NOOPT-NEXT: v_mov_b32_e32 v10, v22 +; NOOPT-NEXT: v_mov_b32_e32 v11, v21 +; NOOPT-NEXT: v_mov_b32_e32 v12, v20 +; NOOPT-NEXT: v_mov_b32_e32 v13, v19 +; NOOPT-NEXT: v_mov_b32_e32 v14, v18 +; NOOPT-NEXT: v_mov_b32_e32 v15, v17 +; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 offset:72 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 offset:76 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v2, off, s[20:23], 0 offset:80 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:84 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[20:23], 0 offset:88 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[20:23], 0 offset:92 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[20:23], 0 offset:96 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[20:23], 0 offset:100 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[20:23], 0 offset:104 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[20:23], 0 offset:108 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[20:23], 0 offset:112 
; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[20:23], 0 offset:116 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[20:23], 0 offset:120 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[20:23], 0 offset:124 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[20:23], 0 offset:128 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[20:23], 0 offset:132 ; 4-byte Folded Spill +; NOOPT-NEXT: v_mov_b32_e32 v17, 0x1f4 +; NOOPT-NEXT: buffer_store_dword v17, off, s[20:23], 0 offset:68 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 s[0:1], exec +; NOOPT-NEXT: v_writelane_b32 v16, s0, 4 +; NOOPT-NEXT: v_writelane_b32 v16, s1, 5 +; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 +; NOOPT-NEXT: buffer_store_dword v16, off, s[20:23], 0 offset:64 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 exec, s[16:17] +; NOOPT-NEXT: s_waitcnt expcnt(2) +; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 offset:4 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v2, off, s[20:23], 0 offset:8 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:12 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[20:23], 0 offset:16 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[20:23], 0 offset:20 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[20:23], 0 offset:24 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[20:23], 0 offset:28 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[20:23], 0 offset:32 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[20:23], 0 offset:36 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[20:23], 0 offset:40 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[20:23], 0 offset:44 ; 4-byte Folded Spill +; 
NOOPT-NEXT: buffer_store_dword v12, off, s[20:23], 0 offset:48 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[20:23], 0 offset:52 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[20:23], 0 offset:56 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[20:23], 0 offset:60 ; 4-byte Folded Spill +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 +; NOOPT-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 +; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[16:17] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 6 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 7 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v1, off, s[20:23], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v2, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v3, off, s[20:23], 0 offset:8 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v4, off, s[20:23], 0 offset:12 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v5, off, s[20:23], 0 offset:16 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v6, off, s[20:23], 0 offset:20 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v7, off, s[20:23], 0 offset:24 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v8, off, s[20:23], 0 offset:28 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v9, off, s[20:23], 0 offset:32 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v10, off, s[20:23], 0 offset:36 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v11, off, s[20:23], 0 offset:40 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v12, off, s[20:23], 0 offset:44 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v13, off, s[20:23], 0 offset:48 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v14, off, s[20:23], 0 offset:52 
; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v15, off, s[20:23], 0 offset:56 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v16, off, s[20:23], 0 offset:60 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v17, off, s[20:23], 0 offset:68 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v18, off, s[20:23], 0 offset:136 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readfirstlane_b32 s2, v18 +; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v18 +; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; NOOPT-NEXT: s_add_i32 m0, s2, -16 +; NOOPT-NEXT: v_movreld_b32_e32 v1, v17 +; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 offset:140 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v2, off, s[20:23], 0 offset:144 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:148 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[20:23], 0 offset:152 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[20:23], 0 offset:156 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[20:23], 0 offset:160 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[20:23], 0 offset:164 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[20:23], 0 offset:168 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[20:23], 0 offset:172 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[20:23], 0 offset:176 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[20:23], 0 offset:180 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[20:23], 0 offset:184 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[20:23], 0 offset:188 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[20:23], 0 offset:192 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[20:23], 0 offset:196 ; 4-byte Folded Spill +; 
NOOPT-NEXT: buffer_store_dword v16, off, s[20:23], 0 offset:200 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v2, off, s[20:23], 0 offset:4 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:8 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[20:23], 0 offset:12 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[20:23], 0 offset:16 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[20:23], 0 offset:20 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[20:23], 0 offset:24 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[20:23], 0 offset:28 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[20:23], 0 offset:32 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[20:23], 0 offset:36 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[20:23], 0 offset:40 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[20:23], 0 offset:44 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[20:23], 0 offset:48 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[20:23], 0 offset:52 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[20:23], 0 offset:56 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v16, off, s[20:23], 0 offset:60 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 s[2:3], s[0:1] +; NOOPT-NEXT: v_writelane_b32 v0, s2, 6 +; NOOPT-NEXT: v_writelane_b32 v0, s3, 7 +; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 +; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 offset:64 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 exec, s[16:17] +; NOOPT-NEXT: s_xor_b64 exec, exec, s[0:1] +; NOOPT-NEXT: s_cbranch_execnz .LBB15_1 +; NOOPT-NEXT: ; %bb.2: +; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 +; 
NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[16:17] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 4 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 5 +; NOOPT-NEXT: s_mov_b64 exec, s[0:1] +; NOOPT-NEXT: ; %bb.3: +; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 +; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[16:17] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 0 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 1 +; NOOPT-NEXT: v_readlane_b32 s2, v0, 2 +; NOOPT-NEXT: v_readlane_b32 s3, v0, 3 +; NOOPT-NEXT: buffer_load_dword v16, off, s[20:23], 0 offset:140 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v17, off, s[20:23], 0 offset:144 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v18, off, s[20:23], 0 offset:148 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v19, off, s[20:23], 0 offset:152 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v20, off, s[20:23], 0 offset:156 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v21, off, s[20:23], 0 offset:160 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v22, off, s[20:23], 0 offset:164 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v23, off, s[20:23], 0 offset:168 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v24, off, s[20:23], 0 offset:172 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v25, off, s[20:23], 0 offset:176 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v26, off, s[20:23], 0 offset:180 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v27, off, s[20:23], 0 offset:184 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v28, off, s[20:23], 0 offset:188 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v29, off, s[20:23], 0 offset:192 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v30, off, 
s[20:23], 0 offset:196 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v31, off, s[20:23], 0 offset:200 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v5, v19 +; NOOPT-NEXT: v_mov_b32_e32 v6, v18 +; NOOPT-NEXT: v_mov_b32_e32 v7, v17 +; NOOPT-NEXT: v_mov_b32_e32 v1, v16 +; NOOPT-NEXT: v_mov_b32_e32 v2, v23 +; NOOPT-NEXT: v_mov_b32_e32 v3, v22 +; NOOPT-NEXT: v_mov_b32_e32 v4, v21 +; NOOPT-NEXT: v_mov_b32_e32 v8, v20 +; NOOPT-NEXT: v_mov_b32_e32 v13, v27 +; NOOPT-NEXT: v_mov_b32_e32 v14, v26 +; NOOPT-NEXT: v_mov_b32_e32 v15, v25 +; NOOPT-NEXT: v_mov_b32_e32 v9, v24 +; NOOPT-NEXT: v_mov_b32_e32 v10, v31 +; NOOPT-NEXT: v_mov_b32_e32 v11, v30 +; NOOPT-NEXT: v_mov_b32_e32 v12, v29 +; NOOPT-NEXT: v_mov_b32_e32 v16, v28 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17_vgpr18_vgpr19 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v17, v12 +; NOOPT-NEXT: v_mov_b32_e32 v18, v11 +; NOOPT-NEXT: v_mov_b32_e32 v19, v10 +; NOOPT-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10_vgpr11_vgpr12 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v10, v15 +; NOOPT-NEXT: v_mov_b32_e32 v11, v14 +; NOOPT-NEXT: v_mov_b32_e32 v12, v13 +; NOOPT-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:32 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v9, v4 +; NOOPT-NEXT: v_mov_b32_e32 v10, v3 +; NOOPT-NEXT: 
v_mov_b32_e32 v11, v2 +; NOOPT-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v2, v7 +; NOOPT-NEXT: v_mov_b32_e32 v3, v6 +; NOOPT-NEXT: v_mov_b32_e32 v4, v5 +; NOOPT-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0 +; NOOPT-NEXT: ; kill: killed $vgpr0 +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: insert_neg_inline_offset_vgpr: +; SI-MOVREL: ; %bb.0: ; %entry +; SI-MOVREL-NEXT: v_add_i32_e32 v12, vcc, -16, v0 +; SI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x1f4 +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 3, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v3, 4, v16, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 2, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v2, 3, v16, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v1, 2, v16, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v0, 1, v16, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 7, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v7, 8, v16, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 6, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v6, 7, v16, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 5, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v5, 6, v16, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 4, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v4, 5, v16, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 11, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v11, 12, v16, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 10, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v10, 11, v16, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 9, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v9, 10, v16, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 8, v12 +; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-MOVREL-NEXT: v_cndmask_b32_e32 
v8, 9, v16, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 15, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v15, 16, v16, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 14, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v14, 15, v16, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 13, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v13, 14, v16, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 12, v12 +; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s2, -1 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v12, 13, v16, vcc +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-MOVREL-NEXT: s_endpgm +; +; VI-LABEL: insert_neg_inline_offset_vgpr: +; VI: ; %bb.0: ; %entry +; VI-NEXT: v_add_u32_e32 v12, vcc, -16, v0 +; VI-NEXT: v_mov_b32_e32 v16, 0x1f4 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v12 +; VI-NEXT: v_cndmask_b32_e32 v3, 4, v16, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 2, v12 +; VI-NEXT: v_cndmask_b32_e32 v2, 3, v16, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 +; VI-NEXT: v_cndmask_b32_e32 v1, 2, v16, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12 +; VI-NEXT: v_cndmask_b32_e32 v0, 1, v16, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 7, v12 +; VI-NEXT: v_cndmask_b32_e32 v7, 8, v16, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 6, v12 +; VI-NEXT: v_cndmask_b32_e32 v6, 7, v16, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 5, v12 +; VI-NEXT: v_cndmask_b32_e32 v5, 6, v16, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 4, v12 +; VI-NEXT: v_cndmask_b32_e32 v4, 5, v16, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 11, v12 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; VI-NEXT: v_cndmask_b32_e32 v11, 12, v16, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 10, v12 +; VI-NEXT: v_cndmask_b32_e32 v10, 11, v16, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 9, v12 +; VI-NEXT: 
v_cndmask_b32_e32 v9, 10, v16, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 8, v12 +; VI-NEXT: v_cndmask_b32_e32 v8, 9, v16, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 15, v12 +; VI-NEXT: v_cndmask_b32_e32 v15, 16, v16, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 14, v12 +; VI-NEXT: v_cndmask_b32_e32 v14, 15, v16, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 13, v12 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_u32 s2, s0, 48 +; VI-NEXT: v_cndmask_b32_e32 v13, 14, v16, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 12, v12 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: v_cndmask_b32_e32 v12, 13, v16, vcc +; VI-NEXT: v_mov_b32_e32 v17, s3 +; VI-NEXT: v_mov_b32_e32 v16, s2 +; VI-NEXT: s_add_u32 s2, s0, 32 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; VI-NEXT: s_nop 0 +; VI-NEXT: v_mov_b32_e32 v13, s3 +; VI-NEXT: v_mov_b32_e32 v12, s2 +; VI-NEXT: s_add_u32 s2, s0, 16 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; VI-NEXT: s_nop 0 +; VI-NEXT: v_mov_b32_e32 v9, s3 +; VI-NEXT: v_mov_b32_e32 v8, s2 +; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; VI-NEXT: s_nop 0 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_endpgm +; +; GFX9-IDXMODE-LABEL: insert_neg_inline_offset_vgpr: +; GFX9-IDXMODE: ; %bb.0: ; %entry +; GFX9-IDXMODE-NEXT: v_add_u32_e32 v12, -16, v0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v17, 0x1f4 +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 3, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v3, 4, v17, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 2, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, 3, v17, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v1, 2, v17, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v0, 1, v17, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 7, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v7, 8, v17, vcc +; GFX9-IDXMODE-NEXT: 
v_cmp_eq_u32_e32 vcc, 6, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v6, 7, v17, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 5, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v5, 6, v17, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 4, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v4, 5, v17, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 11, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v11, 12, v17, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 10, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v10, 11, v17, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 9, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v9, 10, v17, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 8, v12 +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v8, 9, v17, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 15, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v15, 16, v17, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 14, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v14, 15, v17, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 13, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v13, 14, v17, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 12, v12 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v12, 13, v17, vcc +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX9-IDXMODE-NEXT: s_endpgm entry: - %id = call i32 @llvm.amdgcn.workitem.id.x() #1 + %id = call i32 @llvm.amdgcn.workitem.id.x() %index = add i32 %id, -16 %value = insertelement <16 x i32> , i32 500, i32 %index store <16 x i32> %value, ptr addrspace(1) %out @@ -316,19 +4056,646 @@ entry: ; When the block is split to insert the loop, make sure any other ; places that need to be expanded in 
the same block are also handled. - -; GCN-LABEL: {{^}}extract_vgpr_offset_multiple_in_block: - -; GCN-DAG: {{buffer|flat|global}}_load_dword [[IDX0:v[0-9]+]] -; GCN: v_cmp_eq_u32 -; GCN: v_cndmask_b32_e64 [[RESULT0:v[0-9]+]], 16, -; GCN: v_cndmask_b32_e64 [[RESULT1:v[0-9]+]], 16, - -; GCN: buffer_store_dword [[RESULT0]] -; GCN: buffer_store_dword [[RESULT1]] -define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in) #0 { +define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in) { +; NOOPT-LABEL: extract_vgpr_offset_multiple_in_block: +; NOOPT: ; %bb.0: ; %entry +; NOOPT-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; NOOPT-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; NOOPT-NEXT: s_mov_b32 s38, -1 +; NOOPT-NEXT: s_mov_b32 s39, 0xe8f000 +; NOOPT-NEXT: s_add_u32 s36, s36, s9 +; NOOPT-NEXT: s_addc_u32 s37, s37, 0 +; NOOPT-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane +; NOOPT-NEXT: s_mov_b64 s[0:1], s[2:3] +; NOOPT-NEXT: v_mov_b32_e32 v1, v0 +; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1 +; NOOPT-NEXT: buffer_load_dword v0, off, s[36:39], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[28:29] +; NOOPT-NEXT: buffer_store_dword v1, off, s[36:39], 0 offset:76 ; 4-byte Folded Spill +; NOOPT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b32 s8, s3 +; NOOPT-NEXT: s_mov_b32 s4, s2 +; NOOPT-NEXT: s_mov_b32 s2, 0xf000 +; NOOPT-NEXT: s_mov_b32 s3, -1 +; NOOPT-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; NOOPT-NEXT: s_mov_b32 s5, s8 +; NOOPT-NEXT: s_mov_b32 s6, s3 +; NOOPT-NEXT: s_mov_b32 s7, s2 +; NOOPT-NEXT: s_waitcnt vmcnt(1) +; NOOPT-NEXT: v_writelane_b32 v0, s4, 0 +; NOOPT-NEXT: v_writelane_b32 v0, s5, 1 +; NOOPT-NEXT: v_writelane_b32 v0, s6, 2 +; NOOPT-NEXT: v_writelane_b32 v0, s7, 3 
+; NOOPT-NEXT: s_mov_b32 s4, 0 +; NOOPT-NEXT: v_writelane_b32 v0, s4, 4 +; NOOPT-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; NOOPT-NEXT: s_mov_b32 s5, s2 +; NOOPT-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] +; NOOPT-NEXT: s_mov_b32 s4, 2 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_lshlrev_b32_e64 v1, s4, v1 +; NOOPT-NEXT: s_mov_b32 s4, 0 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: v_mov_b32_e32 v3, 0 +; NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v2, v3 +; NOOPT-NEXT: buffer_load_dword v1, v[1:2], s[0:3], 0 addr64 glc +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v1, off, s[36:39], 0 offset:72 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b32 s0, 1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_add_i32_e64 v1, s[0:1], v1, s0 +; NOOPT-NEXT: buffer_store_dword v1, off, s[36:39], 0 offset:68 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b32 s16, 16 +; NOOPT-NEXT: s_mov_b32 s17, 15 +; NOOPT-NEXT: s_mov_b32 s18, 14 +; NOOPT-NEXT: s_mov_b32 s20, 12 +; NOOPT-NEXT: s_mov_b32 s22, 10 +; NOOPT-NEXT: s_mov_b32 s24, 8 +; NOOPT-NEXT: s_mov_b32 s26, 6 +; NOOPT-NEXT: s_mov_b32 s27, 5 +; NOOPT-NEXT: s_mov_b32 s19, 13 +; NOOPT-NEXT: s_mov_b32 s21, 11 +; NOOPT-NEXT: s_mov_b32 s23, 9 +; NOOPT-NEXT: s_mov_b32 s25, 7 +; NOOPT-NEXT: s_mov_b32 s0, s25 +; NOOPT-NEXT: s_mov_b32 s1, s23 +; NOOPT-NEXT: s_mov_b32 s2, s21 +; NOOPT-NEXT: s_mov_b32 s3, s19 +; NOOPT-NEXT: s_mov_b32 s4, s27 +; NOOPT-NEXT: s_mov_b32 s5, s26 +; NOOPT-NEXT: s_mov_b32 s6, s25 +; NOOPT-NEXT: s_mov_b32 s7, s24 +; NOOPT-NEXT: s_mov_b32 s8, s23 +; NOOPT-NEXT: s_mov_b32 s9, s22 +; NOOPT-NEXT: s_mov_b32 s10, s21 +; NOOPT-NEXT: s_mov_b32 s11, s20 +; NOOPT-NEXT: s_mov_b32 s12, s19 +; NOOPT-NEXT: s_mov_b32 s13, s18 +; NOOPT-NEXT: s_mov_b32 s14, s17 +; NOOPT-NEXT: s_mov_b32 s15, s16 +; NOOPT-NEXT: v_writelane_b32 v0, s0, 5 +; NOOPT-NEXT: 
v_writelane_b32 v0, s1, 6 +; NOOPT-NEXT: v_writelane_b32 v0, s2, 7 +; NOOPT-NEXT: v_writelane_b32 v0, s3, 8 +; NOOPT-NEXT: v_writelane_b32 v0, s4, 9 +; NOOPT-NEXT: v_writelane_b32 v0, s5, 10 +; NOOPT-NEXT: v_writelane_b32 v0, s6, 11 +; NOOPT-NEXT: v_writelane_b32 v0, s7, 12 +; NOOPT-NEXT: v_writelane_b32 v0, s8, 13 +; NOOPT-NEXT: v_writelane_b32 v0, s9, 14 +; NOOPT-NEXT: v_writelane_b32 v0, s10, 15 +; NOOPT-NEXT: v_writelane_b32 v0, s11, 16 +; NOOPT-NEXT: v_writelane_b32 v0, s12, 17 +; NOOPT-NEXT: v_writelane_b32 v0, s13, 18 +; NOOPT-NEXT: v_writelane_b32 v0, s14, 19 +; NOOPT-NEXT: v_writelane_b32 v0, s15, 20 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v16, s15 +; NOOPT-NEXT: v_mov_b32_e32 v15, s14 +; NOOPT-NEXT: v_mov_b32_e32 v14, s13 +; NOOPT-NEXT: v_mov_b32_e32 v13, s12 +; NOOPT-NEXT: v_mov_b32_e32 v12, s11 +; NOOPT-NEXT: v_mov_b32_e32 v11, s10 +; NOOPT-NEXT: v_mov_b32_e32 v10, s9 +; NOOPT-NEXT: v_mov_b32_e32 v9, s8 +; NOOPT-NEXT: v_mov_b32_e32 v8, s7 +; NOOPT-NEXT: v_mov_b32_e32 v7, s6 +; NOOPT-NEXT: v_mov_b32_e32 v6, s5 +; NOOPT-NEXT: v_mov_b32_e32 v5, s4 +; NOOPT-NEXT: v_mov_b32_e32 v4, s3 +; NOOPT-NEXT: v_mov_b32_e32 v3, s2 +; NOOPT-NEXT: v_mov_b32_e32 v2, s1 +; NOOPT-NEXT: v_mov_b32_e32 v1, s0 +; NOOPT-NEXT: buffer_store_dword v1, off, s[36:39], 0 offset:4 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v2, off, s[36:39], 0 offset:8 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[36:39], 0 offset:12 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[36:39], 0 offset:16 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[36:39], 0 offset:20 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[36:39], 0 offset:24 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[36:39], 0 offset:28 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[36:39], 0 offset:32 ; 4-byte Folded Spill +; NOOPT-NEXT: 
buffer_store_dword v9, off, s[36:39], 0 offset:36 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[36:39], 0 offset:40 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[36:39], 0 offset:44 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[36:39], 0 offset:48 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[36:39], 0 offset:52 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[36:39], 0 offset:56 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[36:39], 0 offset:60 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v16, off, s[36:39], 0 offset:64 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 s[0:1], exec +; NOOPT-NEXT: v_writelane_b32 v0, s0, 21 +; NOOPT-NEXT: v_writelane_b32 v0, s1, 22 +; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1 +; NOOPT-NEXT: buffer_store_dword v0, off, s[36:39], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 exec, s[28:29] +; NOOPT-NEXT: ; implicit-def: $vgpr0 +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 +; NOOPT-NEXT: .LBB16_1: ; =>This Inner Loop Header: Depth=1 +; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[36:39], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[28:29] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 23 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 24 +; NOOPT-NEXT: buffer_load_dword v18, off, s[36:39], 0 offset:80 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:4 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v2, off, s[36:39], 0 offset:8 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v3, off, s[36:39], 0 offset:12 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v4, off, s[36:39], 0 offset:16 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v5, off, s[36:39], 0 offset:20 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v6, off, 
s[36:39], 0 offset:24 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v7, off, s[36:39], 0 offset:28 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v8, off, s[36:39], 0 offset:32 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v9, off, s[36:39], 0 offset:36 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v10, off, s[36:39], 0 offset:40 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v11, off, s[36:39], 0 offset:44 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v12, off, s[36:39], 0 offset:48 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v13, off, s[36:39], 0 offset:52 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v14, off, s[36:39], 0 offset:56 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v15, off, s[36:39], 0 offset:60 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v16, off, s[36:39], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v17, off, s[36:39], 0 offset:72 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readfirstlane_b32 s2, v17 +; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v17 +; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; NOOPT-NEXT: s_mov_b32 m0, s2 +; NOOPT-NEXT: v_movrels_b32_e32 v1, v1 +; NOOPT-NEXT: buffer_store_dword v1, off, s[36:39], 0 offset:84 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v1, off, s[36:39], 0 offset:80 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 s[2:3], s[0:1] +; NOOPT-NEXT: v_writelane_b32 v0, s2, 23 +; NOOPT-NEXT: v_writelane_b32 v0, s3, 24 +; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1 +; NOOPT-NEXT: buffer_store_dword v0, off, s[36:39], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 exec, s[28:29] +; NOOPT-NEXT: s_xor_b64 exec, exec, s[0:1] +; NOOPT-NEXT: s_cbranch_execnz .LBB16_1 +; NOOPT-NEXT: ; %bb.2: +; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[36:39], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: 
s_mov_b64 exec, s[28:29] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 21 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 22 +; NOOPT-NEXT: s_mov_b64 exec, s[0:1] +; NOOPT-NEXT: ; %bb.3: +; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1 +; NOOPT-NEXT: buffer_load_dword v0, off, s[36:39], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[28:29] +; NOOPT-NEXT: ;;#ASMSTART +; NOOPT-NEXT: s_mov_b32 s4, 17 +; NOOPT-NEXT: ;;#ASMEND +; NOOPT-NEXT: s_mov_b32 s16, s4 +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 5 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 6 +; NOOPT-NEXT: v_readlane_b32 s2, v0, 7 +; NOOPT-NEXT: v_readlane_b32 s3, v0, 8 +; NOOPT-NEXT: v_readlane_b32 s4, v0, 9 +; NOOPT-NEXT: v_readlane_b32 s5, v0, 10 +; NOOPT-NEXT: v_readlane_b32 s6, v0, 11 +; NOOPT-NEXT: v_readlane_b32 s7, v0, 12 +; NOOPT-NEXT: v_readlane_b32 s8, v0, 13 +; NOOPT-NEXT: v_readlane_b32 s9, v0, 14 +; NOOPT-NEXT: v_readlane_b32 s10, v0, 15 +; NOOPT-NEXT: v_readlane_b32 s11, v0, 16 +; NOOPT-NEXT: v_readlane_b32 s12, v0, 17 +; NOOPT-NEXT: v_readlane_b32 s13, v0, 18 +; NOOPT-NEXT: v_readlane_b32 s14, v0, 19 +; NOOPT-NEXT: v_readlane_b32 s15, v0, 20 +; NOOPT-NEXT: v_writelane_b32 v0, s16, 25 +; NOOPT-NEXT: v_mov_b32_e32 v16, s15 +; NOOPT-NEXT: v_mov_b32_e32 v15, s14 +; NOOPT-NEXT: v_mov_b32_e32 v14, s13 +; NOOPT-NEXT: v_mov_b32_e32 v13, s12 +; NOOPT-NEXT: v_mov_b32_e32 v12, s11 +; NOOPT-NEXT: v_mov_b32_e32 v11, s10 +; NOOPT-NEXT: v_mov_b32_e32 v10, s9 +; NOOPT-NEXT: v_mov_b32_e32 v9, s8 +; NOOPT-NEXT: v_mov_b32_e32 v8, s7 +; NOOPT-NEXT: v_mov_b32_e32 v7, s6 +; NOOPT-NEXT: v_mov_b32_e32 v6, s5 +; NOOPT-NEXT: v_mov_b32_e32 v5, s4 +; NOOPT-NEXT: v_mov_b32_e32 v4, s3 +; NOOPT-NEXT: v_mov_b32_e32 v3, s2 +; NOOPT-NEXT: v_mov_b32_e32 v2, s1 +; NOOPT-NEXT: v_mov_b32_e32 v1, s0 +; NOOPT-NEXT: buffer_store_dword v1, off, s[36:39], 0 offset:88 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v2, off, s[36:39], 0 offset:92 ; 
4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[36:39], 0 offset:96 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[36:39], 0 offset:100 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[36:39], 0 offset:104 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[36:39], 0 offset:108 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[36:39], 0 offset:112 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[36:39], 0 offset:116 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[36:39], 0 offset:120 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[36:39], 0 offset:124 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[36:39], 0 offset:128 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[36:39], 0 offset:132 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[36:39], 0 offset:136 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[36:39], 0 offset:140 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[36:39], 0 offset:144 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v16, off, s[36:39], 0 offset:148 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 s[0:1], exec +; NOOPT-NEXT: v_writelane_b32 v0, s0, 26 +; NOOPT-NEXT: v_writelane_b32 v0, s1, 27 +; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1 +; NOOPT-NEXT: buffer_store_dword v0, off, s[36:39], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 exec, s[28:29] +; NOOPT-NEXT: ; implicit-def: $vgpr0 +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 +; NOOPT-NEXT: .LBB16_4: ; =>This Inner Loop Header: Depth=1 +; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[36:39], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[28:29] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 28 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 29 
+; NOOPT-NEXT: buffer_load_dword v18, off, s[36:39], 0 offset:152 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:88 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v2, off, s[36:39], 0 offset:92 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v3, off, s[36:39], 0 offset:96 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v4, off, s[36:39], 0 offset:100 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v5, off, s[36:39], 0 offset:104 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v6, off, s[36:39], 0 offset:108 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v7, off, s[36:39], 0 offset:112 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v8, off, s[36:39], 0 offset:116 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v9, off, s[36:39], 0 offset:120 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v10, off, s[36:39], 0 offset:124 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v11, off, s[36:39], 0 offset:128 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v12, off, s[36:39], 0 offset:132 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v13, off, s[36:39], 0 offset:136 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v14, off, s[36:39], 0 offset:140 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v15, off, s[36:39], 0 offset:144 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v16, off, s[36:39], 0 offset:148 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v17, off, s[36:39], 0 offset:68 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readfirstlane_b32 s2, v17 +; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v17 +; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; NOOPT-NEXT: s_mov_b32 m0, s2 +; NOOPT-NEXT: v_movrels_b32_e32 v1, v1 +; NOOPT-NEXT: buffer_store_dword v1, off, s[36:39], 0 offset:156 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v1, off, s[36:39], 0 offset:152 ; 4-byte 
Folded Spill +; NOOPT-NEXT: s_mov_b64 s[2:3], s[0:1] +; NOOPT-NEXT: v_writelane_b32 v0, s2, 28 +; NOOPT-NEXT: v_writelane_b32 v0, s3, 29 +; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1 +; NOOPT-NEXT: buffer_store_dword v0, off, s[36:39], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 exec, s[28:29] +; NOOPT-NEXT: s_xor_b64 exec, exec, s[0:1] +; NOOPT-NEXT: s_cbranch_execnz .LBB16_4 +; NOOPT-NEXT: ; %bb.5: +; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[36:39], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[28:29] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 26 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 27 +; NOOPT-NEXT: s_mov_b64 exec, s[0:1] +; NOOPT-NEXT: ; %bb.6: +; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1 +; NOOPT-NEXT: buffer_load_dword v0, off, s[36:39], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[28:29] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 4 +; NOOPT-NEXT: v_readlane_b32 s4, v0, 0 +; NOOPT-NEXT: v_readlane_b32 s5, v0, 1 +; NOOPT-NEXT: v_readlane_b32 s6, v0, 2 +; NOOPT-NEXT: v_readlane_b32 s7, v0, 3 +; NOOPT-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:76 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v2, off, s[36:39], 0 offset:156 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v3, off, s[36:39], 0 offset:84 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v3, off, s[4:7], 0 +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v2, off, s[4:7], 0 +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_cmp_eq_u32_e64 s[2:3], v1, s0 +; NOOPT-NEXT: s_mov_b64 s[0:1], exec +; NOOPT-NEXT: v_writelane_b32 v0, s0, 30 +; NOOPT-NEXT: v_writelane_b32 v0, s1, 31 +; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1 +; NOOPT-NEXT: buffer_store_dword v0, off, s[36:39], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 exec, s[28:29] +; NOOPT-NEXT: 
s_and_b64 s[0:1], s[0:1], s[2:3] +; NOOPT-NEXT: s_mov_b64 exec, s[0:1] +; NOOPT-NEXT: s_cbranch_execz .LBB16_8 +; NOOPT-NEXT: ; %bb.7: ; %bb1 +; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[36:39], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[28:29] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s4, v0, 25 +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s7, s1 +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s5, 0xf000 +; NOOPT-NEXT: s_mov_b32 s6, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s7 +; NOOPT-NEXT: s_mov_b32 s2, s6 +; NOOPT-NEXT: s_mov_b32 s3, s5 +; NOOPT-NEXT: v_mov_b32_e32 v0, s4 +; NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: .LBB16_8: ; %bb2 +; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[36:39], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[28:29] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 30 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 31 +; NOOPT-NEXT: s_or_b64 exec, exec, s[0:1] +; NOOPT-NEXT: ; kill: killed $vgpr0 +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: extract_vgpr_offset_multiple_in_block: +; SI-MOVREL: ; %bb.0: ; %entry +; SI-MOVREL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-MOVREL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 +; SI-MOVREL-NEXT: s_mov_b32 s11, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s6, 0 +; SI-MOVREL-NEXT: s_mov_b32 s7, s11 +; SI-MOVREL-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0 +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: buffer_load_dword v1, v[1:2], s[4:7], 0 addr64 glc +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 +; SI-MOVREL-NEXT: s_mov_b32 s10, -1 +; SI-MOVREL-NEXT: ;;#ASMSTART +; SI-MOVREL-NEXT: s_mov_b32 s4, 17 +; SI-MOVREL-NEXT: ;;#ASMEND +; SI-MOVREL-NEXT: v_add_i32_e64 v0, s[0:1], 1, v1 +; SI-MOVREL-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v1 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v2, 7, 9, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 2, v1 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v2, 11, v2, s[0:1] +; SI-MOVREL-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v3, 7, 9, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 3, v1 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v2, 13, v2, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 2, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v3, 11, v3, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 4, v1 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v2, 5, v2, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 3, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v3, 13, v3, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 5, v1 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v2, 6, v2, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 4, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v3, 5, v3, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 6, v1 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v2, 7, v2, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 5, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v3, 6, v3, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 7, v1 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v2, 8, v2, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 6, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v3, 7, v3, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 8, v1 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v2, 9, v2, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 7, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v3, 8, v3, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 9, v1 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v2, 10, v2, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 8, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v3, 9, v3, s[0:1] 
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 10, v1 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v2, 11, v2, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 9, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v3, 10, v3, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 11, v1 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v2, 12, v2, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 10, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v3, 11, v3, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 12, v1 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v2, 13, v2, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 11, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v3, 12, v3, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 13, v1 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v2, 14, v2, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 12, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v3, 13, v3, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 14, v1 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v2, 15, v2, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 13, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v3, 14, v3, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 15, v1 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v1, 16, v2, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 14, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v2, 15, v3, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 15, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v0, 16, v2, s[0:1] +; SI-MOVREL-NEXT: buffer_store_dword v1, off, s[8:11], 0 +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; SI-MOVREL-NEXT: s_cbranch_execz .LBB16_2 +; SI-MOVREL-NEXT: ; %bb.1: ; %bb1 +; SI-MOVREL-NEXT: s_waitcnt expcnt(0) +; SI-MOVREL-NEXT: v_mov_b32_e32 v0, s4 +; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: .LBB16_2: ; %bb2 +; SI-MOVREL-NEXT: s_endpgm +; +; VI-LABEL: 
extract_vgpr_offset_multiple_in_block: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1 +; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; VI-NEXT: flat_load_dword v2, v[1:2] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: ;;#ASMSTART +; VI-NEXT: s_mov_b32 s4, 17 +; VI-NEXT: ;;#ASMEND +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e64 v3, s[0:1], 1, v2 +; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v2 +; VI-NEXT: v_cndmask_b32_e64 v4, 7, 9, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 2, v2 +; VI-NEXT: v_cndmask_b32_e64 v4, 11, v4, s[0:1] +; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v3 +; VI-NEXT: v_cndmask_b32_e64 v5, 7, 9, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 3, v2 +; VI-NEXT: v_cndmask_b32_e64 v4, 13, v4, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 2, v3 +; VI-NEXT: v_cndmask_b32_e64 v5, 11, v5, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 4, v2 +; VI-NEXT: v_cndmask_b32_e64 v4, 5, v4, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 3, v3 +; VI-NEXT: v_cndmask_b32_e64 v5, 13, v5, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 5, v2 +; VI-NEXT: v_cndmask_b32_e64 v4, 6, v4, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 4, v3 +; VI-NEXT: v_cndmask_b32_e64 v5, 5, v5, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 6, v2 +; VI-NEXT: v_cndmask_b32_e64 v4, 7, v4, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 5, v3 +; VI-NEXT: v_cndmask_b32_e64 v5, 6, v5, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 7, v2 +; VI-NEXT: v_cndmask_b32_e64 v4, 8, v4, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 6, v3 +; VI-NEXT: v_cndmask_b32_e64 v5, 7, v5, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 8, v2 +; VI-NEXT: v_cndmask_b32_e64 v4, 9, v4, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 
7, v3 +; VI-NEXT: v_cndmask_b32_e64 v5, 8, v5, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 9, v2 +; VI-NEXT: v_cndmask_b32_e64 v4, 10, v4, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 8, v3 +; VI-NEXT: v_cndmask_b32_e64 v5, 9, v5, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 10, v2 +; VI-NEXT: v_cndmask_b32_e64 v4, 11, v4, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 9, v3 +; VI-NEXT: v_cndmask_b32_e64 v5, 10, v5, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 11, v2 +; VI-NEXT: v_cndmask_b32_e64 v4, 12, v4, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 10, v3 +; VI-NEXT: v_cndmask_b32_e64 v5, 11, v5, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 12, v2 +; VI-NEXT: v_cndmask_b32_e64 v4, 13, v4, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 11, v3 +; VI-NEXT: v_cndmask_b32_e64 v5, 12, v5, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 13, v2 +; VI-NEXT: v_cndmask_b32_e64 v4, 14, v4, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 12, v3 +; VI-NEXT: v_cndmask_b32_e64 v5, 13, v5, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 14, v2 +; VI-NEXT: v_cndmask_b32_e64 v4, 15, v4, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 13, v3 +; VI-NEXT: v_cndmask_b32_e64 v5, 14, v5, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 15, v2 +; VI-NEXT: v_cndmask_b32_e64 v2, 16, v4, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 14, v3 +; VI-NEXT: v_cndmask_b32_e64 v4, 15, v5, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 15, v3 +; VI-NEXT: v_cndmask_b32_e64 v3, 16, v4, s[0:1] +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_store_dword v[0:1], v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_and_saveexec_b64 s[0:1], vcc +; VI-NEXT: s_cbranch_execz .LBB16_2 +; VI-NEXT: ; %bb.1: ; %bb1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: flat_store_dword v[0:1], v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: .LBB16_2: ; %bb2 +; VI-NEXT: s_endpgm +; +; GFX9-IDXMODE-LABEL: extract_vgpr_offset_multiple_in_block: +; GFX9-IDXMODE: ; %bb.0: ; %entry +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x34 +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: global_load_dword v2, v1, s[0:1] glc +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-IDXMODE-NEXT: ;;#ASMSTART +; GFX9-IDXMODE-NEXT: s_mov_b32 s4, 17 +; GFX9-IDXMODE-NEXT: ;;#ASMEND +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v2 +; GFX9-IDXMODE-NEXT: v_add_u32_e32 v0, 1, v2 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v3, 7, 9, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 2, v2 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v3, 11, v3, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v4, 7, 9, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 3, v2 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v3, 13, v3, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 2, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v4, 11, v4, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 4, v2 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v3, 5, v3, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 3, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v4, 13, v4, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 5, v2 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v3, 6, v3, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 4, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v4, 5, v4, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 6, v2 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v3, 7, v3, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 5, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v4, 6, v4, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 7, v2 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v3, 8, v3, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 6, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v4, 7, v4, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 
8, v2 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v3, 9, v3, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 7, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v4, 8, v4, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 9, v2 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v3, 10, v3, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 8, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v4, 9, v4, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 10, v2 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v3, 11, v3, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 9, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v4, 10, v4, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 11, v2 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v3, 12, v3, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 10, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v4, 11, v4, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 12, v2 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v3, 13, v3, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 11, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v4, 12, v4, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 13, v2 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v3, 14, v3, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 12, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v4, 13, v4, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 14, v2 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v3, 15, v3, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 13, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v4, 14, v4, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 15, v2 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v2, 16, v3, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 14, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v3, 15, v4, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 15, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v0, 16, v3, s[0:1] +; GFX9-IDXMODE-NEXT: global_store_dword v1, v2, s[6:7] +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: 
global_store_dword v1, v0, s[6:7] +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-IDXMODE-NEXT: s_cbranch_execz .LBB16_2 +; GFX9-IDXMODE-NEXT: ; %bb.1: ; %bb1 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-IDXMODE-NEXT: global_store_dword v[0:1], v0, off +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: .LBB16_2: ; %bb2 +; GFX9-IDXMODE-NEXT: s_endpgm entry: - %id = call i32 @llvm.amdgcn.workitem.id.x() #1 + %id = call i32 @llvm.amdgcn.workitem.id.x() %id.ext = zext i32 %id to i64 %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %id.ext %idx0 = load volatile i32, ptr addrspace(1) %gep @@ -349,62 +4716,1870 @@ bb2: ret void } -; Moved subtest for insert_vgpr_offset_multiple_in_block to separate file to -; avoid very different schedule induced isses with gfx9. -; test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll +define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in, <16 x i32> %vec0) { +; NOOPT-LABEL: insert_vgpr_offset_multiple_in_block: +; NOOPT: ; %bb.0: ; %entry +; NOOPT-NEXT: s_mov_b32 s28, SCRATCH_RSRC_DWORD0 +; NOOPT-NEXT: s_mov_b32 s29, SCRATCH_RSRC_DWORD1 +; NOOPT-NEXT: s_mov_b32 s30, -1 +; NOOPT-NEXT: s_mov_b32 s31, 0xe8f000 +; NOOPT-NEXT: s_add_u32 s28, s28, s9 +; NOOPT-NEXT: s_addc_u32 s29, s29, 0 +; NOOPT-NEXT: ; implicit-def: $vgpr16 : SGPR spill to VGPR lane +; NOOPT-NEXT: buffer_store_dword v0, off, s[28:31], 0 offset:84 ; 4-byte Folded Spill +; NOOPT-NEXT: s_load_dwordx2 s[18:19], s[2:3], 0x9 +; NOOPT-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0xd +; NOOPT-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x19 +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b32 s24, s19 +; NOOPT-NEXT: s_mov_b32 s20, s18 +; NOOPT-NEXT: s_mov_b32 s18, 0xf000 +; NOOPT-NEXT: s_mov_b32 s19, -1 +; NOOPT-NEXT: ; kill: def $sgpr20 killed $sgpr20 def $sgpr20_sgpr21_sgpr22_sgpr23 +; NOOPT-NEXT: s_mov_b32 s21, s24 +; 
NOOPT-NEXT: s_mov_b32 s22, s19 +; NOOPT-NEXT: s_mov_b32 s23, s18 +; NOOPT-NEXT: v_writelane_b32 v16, s20, 0 +; NOOPT-NEXT: v_writelane_b32 v16, s21, 1 +; NOOPT-NEXT: v_writelane_b32 v16, s22, 2 +; NOOPT-NEXT: v_writelane_b32 v16, s23, 3 +; NOOPT-NEXT: s_mov_b32 s20, 0 +; NOOPT-NEXT: v_writelane_b32 v16, s20, 4 +; NOOPT-NEXT: ; kill: def $sgpr20 killed $sgpr20 def $sgpr20_sgpr21 +; NOOPT-NEXT: s_mov_b32 s21, s18 +; NOOPT-NEXT: ; kill: def $sgpr16_sgpr17 killed $sgpr16_sgpr17 def $sgpr16_sgpr17_sgpr18_sgpr19 +; NOOPT-NEXT: s_mov_b64 s[18:19], s[20:21] +; NOOPT-NEXT: s_mov_b32 s20, 2 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_lshlrev_b32_e64 v0, s20, v0 +; NOOPT-NEXT: s_mov_b32 s20, 0 +; NOOPT-NEXT: ; implicit-def: $sgpr20 +; NOOPT-NEXT: v_mov_b32_e32 v2, 0 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v2 +; NOOPT-NEXT: buffer_load_dword v0, v[0:1], s[16:19], 0 addr64 glc +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v0, off, s[28:31], 0 offset:80 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b32 s16, 1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_add_i32_e64 v0, s[16:17], v0, s16 +; NOOPT-NEXT: buffer_store_dword v0, off, s[28:31], 0 offset:76 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: ;;#ASMSTART +; NOOPT-NEXT: v_mov_b32 v0, 62 +; NOOPT-NEXT: ;;#ASMEND +; NOOPT-NEXT: buffer_store_dword v0, off, s[28:31], 0 offset:72 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v0, off, s[28:31], 0 offset:68 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, s0 +; NOOPT-NEXT: v_mov_b32_e32 v1, s1 +; NOOPT-NEXT: v_mov_b32_e32 v2, s2 +; NOOPT-NEXT: v_mov_b32_e32 v3, s3 +; NOOPT-NEXT: v_mov_b32_e32 v4, s4 +; NOOPT-NEXT: v_mov_b32_e32 v5, s5 +; NOOPT-NEXT: v_mov_b32_e32 v6, s6 +; NOOPT-NEXT: v_mov_b32_e32 v7, s7 +; NOOPT-NEXT: v_mov_b32_e32 v8, s8 +; NOOPT-NEXT: v_mov_b32_e32 v9, s9 +; NOOPT-NEXT: v_mov_b32_e32 
v10, s10 +; NOOPT-NEXT: v_mov_b32_e32 v11, s11 +; NOOPT-NEXT: v_mov_b32_e32 v12, s12 +; NOOPT-NEXT: v_mov_b32_e32 v13, s13 +; NOOPT-NEXT: v_mov_b32_e32 v14, s14 +; NOOPT-NEXT: v_mov_b32_e32 v15, s15 +; NOOPT-NEXT: s_mov_b64 s[0:1], exec +; NOOPT-NEXT: v_writelane_b32 v16, s0, 5 +; NOOPT-NEXT: v_writelane_b32 v16, s1, 6 +; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1 +; NOOPT-NEXT: buffer_store_dword v16, off, s[28:31], 0 offset:64 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 exec, s[26:27] +; NOOPT-NEXT: buffer_store_dword v0, off, s[28:31], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v1, off, s[28:31], 0 offset:4 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v2, off, s[28:31], 0 offset:8 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[28:31], 0 offset:12 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[28:31], 0 offset:16 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[28:31], 0 offset:20 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[28:31], 0 offset:24 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[28:31], 0 offset:28 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[28:31], 0 offset:32 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[28:31], 0 offset:36 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[28:31], 0 offset:40 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[28:31], 0 offset:44 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[28:31], 0 offset:48 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[28:31], 0 offset:52 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[28:31], 0 offset:56 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[28:31], 0 offset:60 ; 4-byte Folded Spill +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 +; NOOPT-NEXT: .LBB17_1: ; =>This Inner 
Loop Header: Depth=1 +; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[26:27] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 7 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 8 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v1, off, s[28:31], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v2, off, s[28:31], 0 offset:4 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v3, off, s[28:31], 0 offset:8 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v4, off, s[28:31], 0 offset:12 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v5, off, s[28:31], 0 offset:16 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v6, off, s[28:31], 0 offset:20 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v7, off, s[28:31], 0 offset:24 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v8, off, s[28:31], 0 offset:28 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v9, off, s[28:31], 0 offset:32 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v10, off, s[28:31], 0 offset:36 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v11, off, s[28:31], 0 offset:40 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v12, off, s[28:31], 0 offset:44 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v13, off, s[28:31], 0 offset:48 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v14, off, s[28:31], 0 offset:52 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v15, off, s[28:31], 0 offset:56 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v16, off, s[28:31], 0 offset:60 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v17, off, s[28:31], 0 offset:72 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v18, off, s[28:31], 0 offset:80 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: 
v_readfirstlane_b32 s2, v18 +; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v18 +; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; NOOPT-NEXT: s_mov_b32 m0, s2 +; NOOPT-NEXT: v_movreld_b32_e32 v1, v17 +; NOOPT-NEXT: buffer_store_dword v1, off, s[28:31], 0 offset:88 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v2, off, s[28:31], 0 offset:92 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[28:31], 0 offset:96 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[28:31], 0 offset:100 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[28:31], 0 offset:104 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[28:31], 0 offset:108 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[28:31], 0 offset:112 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[28:31], 0 offset:116 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[28:31], 0 offset:120 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[28:31], 0 offset:124 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[28:31], 0 offset:128 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[28:31], 0 offset:132 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[28:31], 0 offset:136 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[28:31], 0 offset:140 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[28:31], 0 offset:144 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v16, off, s[28:31], 0 offset:148 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_store_dword v1, off, s[28:31], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v2, off, s[28:31], 0 offset:4 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[28:31], 0 offset:8 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, 
off, s[28:31], 0 offset:12 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[28:31], 0 offset:16 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[28:31], 0 offset:20 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[28:31], 0 offset:24 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[28:31], 0 offset:28 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[28:31], 0 offset:32 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[28:31], 0 offset:36 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[28:31], 0 offset:40 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[28:31], 0 offset:44 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[28:31], 0 offset:48 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[28:31], 0 offset:52 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[28:31], 0 offset:56 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v16, off, s[28:31], 0 offset:60 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 s[2:3], s[0:1] +; NOOPT-NEXT: v_writelane_b32 v0, s2, 7 +; NOOPT-NEXT: v_writelane_b32 v0, s3, 8 +; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1 +; NOOPT-NEXT: buffer_store_dword v0, off, s[28:31], 0 offset:64 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 exec, s[26:27] +; NOOPT-NEXT: s_xor_b64 exec, exec, s[0:1] +; NOOPT-NEXT: s_cbranch_execnz .LBB17_1 +; NOOPT-NEXT: ; %bb.2: +; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[26:27] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 5 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 6 +; NOOPT-NEXT: s_mov_b64 exec, s[0:1] +; NOOPT-NEXT: ; %bb.3: +; NOOPT-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:88 ; 4-byte Folded Reload +; NOOPT-NEXT: 
buffer_load_dword v1, off, s[28:31], 0 offset:92 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v2, off, s[28:31], 0 offset:96 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v3, off, s[28:31], 0 offset:100 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v4, off, s[28:31], 0 offset:104 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v5, off, s[28:31], 0 offset:108 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v6, off, s[28:31], 0 offset:112 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v7, off, s[28:31], 0 offset:116 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v8, off, s[28:31], 0 offset:120 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v9, off, s[28:31], 0 offset:124 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v10, off, s[28:31], 0 offset:128 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v11, off, s[28:31], 0 offset:132 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v12, off, s[28:31], 0 offset:136 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v13, off, s[28:31], 0 offset:140 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v14, off, s[28:31], 0 offset:144 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v15, off, s[28:31], 0 offset:148 ; 4-byte Folded Reload +; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1 +; NOOPT-NEXT: buffer_load_dword v16, off, s[28:31], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[26:27] +; NOOPT-NEXT: v_mov_b32_e32 v17, 63 +; NOOPT-NEXT: buffer_store_dword v17, off, s[28:31], 0 offset:216 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 s[0:1], exec +; NOOPT-NEXT: s_waitcnt vmcnt(1) +; NOOPT-NEXT: v_writelane_b32 v16, s0, 9 +; NOOPT-NEXT: v_writelane_b32 v16, s1, 10 +; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1 +; NOOPT-NEXT: buffer_store_dword v16, off, s[28:31], 0 offset:64 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 exec, s[26:27] +; NOOPT-NEXT: buffer_store_dword v0, off, s[28:31], 0 
offset:152 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v1, off, s[28:31], 0 offset:156 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v2, off, s[28:31], 0 offset:160 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[28:31], 0 offset:164 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[28:31], 0 offset:168 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[28:31], 0 offset:172 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[28:31], 0 offset:176 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[28:31], 0 offset:180 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[28:31], 0 offset:184 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[28:31], 0 offset:188 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[28:31], 0 offset:192 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[28:31], 0 offset:196 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[28:31], 0 offset:200 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[28:31], 0 offset:204 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[28:31], 0 offset:208 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[28:31], 0 offset:212 ; 4-byte Folded Spill +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 +; NOOPT-NEXT: .LBB17_4: ; =>This Inner Loop Header: Depth=1 +; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[26:27] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 11 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 12 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v1, off, s[28:31], 0 offset:152 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v2, off, s[28:31], 0 
offset:156 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v3, off, s[28:31], 0 offset:160 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v4, off, s[28:31], 0 offset:164 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v5, off, s[28:31], 0 offset:168 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v6, off, s[28:31], 0 offset:172 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v7, off, s[28:31], 0 offset:176 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v8, off, s[28:31], 0 offset:180 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v9, off, s[28:31], 0 offset:184 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v10, off, s[28:31], 0 offset:188 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v11, off, s[28:31], 0 offset:192 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v12, off, s[28:31], 0 offset:196 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v13, off, s[28:31], 0 offset:200 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v14, off, s[28:31], 0 offset:204 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v15, off, s[28:31], 0 offset:208 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v16, off, s[28:31], 0 offset:212 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v17, off, s[28:31], 0 offset:216 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v18, off, s[28:31], 0 offset:76 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readfirstlane_b32 s2, v18 +; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v18 +; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; NOOPT-NEXT: s_mov_b32 m0, s2 +; NOOPT-NEXT: v_movreld_b32_e32 v1, v17 +; NOOPT-NEXT: buffer_store_dword v1, off, s[28:31], 0 offset:220 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v2, off, s[28:31], 0 offset:224 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[28:31], 0 offset:228 ; 4-byte Folded Spill +; 
NOOPT-NEXT: buffer_store_dword v4, off, s[28:31], 0 offset:232 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[28:31], 0 offset:236 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[28:31], 0 offset:240 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[28:31], 0 offset:244 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[28:31], 0 offset:248 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[28:31], 0 offset:252 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[28:31], 0 offset:256 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[28:31], 0 offset:260 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[28:31], 0 offset:264 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[28:31], 0 offset:268 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[28:31], 0 offset:272 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[28:31], 0 offset:276 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v16, off, s[28:31], 0 offset:280 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_store_dword v1, off, s[28:31], 0 offset:152 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v2, off, s[28:31], 0 offset:156 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[28:31], 0 offset:160 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[28:31], 0 offset:164 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[28:31], 0 offset:168 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[28:31], 0 offset:172 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[28:31], 0 offset:176 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[28:31], 0 offset:180 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[28:31], 0 offset:184 ; 4-byte 
Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[28:31], 0 offset:188 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[28:31], 0 offset:192 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[28:31], 0 offset:196 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[28:31], 0 offset:200 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[28:31], 0 offset:204 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[28:31], 0 offset:208 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v16, off, s[28:31], 0 offset:212 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 s[2:3], s[0:1] +; NOOPT-NEXT: v_writelane_b32 v0, s2, 11 +; NOOPT-NEXT: v_writelane_b32 v0, s3, 12 +; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1 +; NOOPT-NEXT: buffer_store_dword v0, off, s[28:31], 0 offset:64 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 exec, s[26:27] +; NOOPT-NEXT: s_xor_b64 exec, exec, s[0:1] +; NOOPT-NEXT: s_cbranch_execnz .LBB17_4 +; NOOPT-NEXT: ; %bb.5: +; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[26:27] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 9 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 10 +; NOOPT-NEXT: s_mov_b64 exec, s[0:1] +; NOOPT-NEXT: ; %bb.6: +; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1 +; NOOPT-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[26:27] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 4 +; NOOPT-NEXT: v_readlane_b32 s4, v0, 0 +; NOOPT-NEXT: v_readlane_b32 s5, v0, 1 +; NOOPT-NEXT: v_readlane_b32 s6, v0, 2 +; NOOPT-NEXT: v_readlane_b32 s7, v0, 3 +; NOOPT-NEXT: buffer_load_dword v1, off, s[28:31], 0 offset:84 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v17, off, s[28:31], 0 offset:220 ; 4-byte 
Folded Reload +; NOOPT-NEXT: buffer_load_dword v18, off, s[28:31], 0 offset:224 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v19, off, s[28:31], 0 offset:228 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v20, off, s[28:31], 0 offset:232 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v21, off, s[28:31], 0 offset:236 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v22, off, s[28:31], 0 offset:240 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v23, off, s[28:31], 0 offset:244 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v24, off, s[28:31], 0 offset:248 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v25, off, s[28:31], 0 offset:252 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v26, off, s[28:31], 0 offset:256 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v27, off, s[28:31], 0 offset:260 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v28, off, s[28:31], 0 offset:264 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v29, off, s[28:31], 0 offset:268 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v30, off, s[28:31], 0 offset:272 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v31, off, s[28:31], 0 offset:276 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v32, off, s[28:31], 0 offset:280 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v6, v20 +; NOOPT-NEXT: v_mov_b32_e32 v7, v19 +; NOOPT-NEXT: v_mov_b32_e32 v8, v18 +; NOOPT-NEXT: v_mov_b32_e32 v2, v17 +; NOOPT-NEXT: v_mov_b32_e32 v3, v24 +; NOOPT-NEXT: v_mov_b32_e32 v4, v23 +; NOOPT-NEXT: v_mov_b32_e32 v5, v22 +; NOOPT-NEXT: v_mov_b32_e32 v9, v21 +; NOOPT-NEXT: v_mov_b32_e32 v14, v28 +; NOOPT-NEXT: v_mov_b32_e32 v15, v27 +; NOOPT-NEXT: v_mov_b32_e32 v16, v26 +; NOOPT-NEXT: v_mov_b32_e32 v10, v25 +; NOOPT-NEXT: v_mov_b32_e32 v11, v32 +; NOOPT-NEXT: v_mov_b32_e32 v12, v31 +; NOOPT-NEXT: v_mov_b32_e32 v13, v30 +; NOOPT-NEXT: v_mov_b32_e32 v17, v29 +; NOOPT-NEXT: ; 
implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18_vgpr19_vgpr20 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v18, v13 +; NOOPT-NEXT: v_mov_b32_e32 v19, v12 +; NOOPT-NEXT: v_mov_b32_e32 v20, v11 +; NOOPT-NEXT: buffer_store_dwordx4 v[17:20], off, s[4:7], 0 offset:48 +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11_vgpr12_vgpr13 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v11, v16 +; NOOPT-NEXT: v_mov_b32_e32 v12, v15 +; NOOPT-NEXT: v_mov_b32_e32 v13, v14 +; NOOPT-NEXT: buffer_store_dwordx4 v[10:13], off, s[4:7], 0 offset:32 +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10_vgpr11_vgpr12 killed $exec +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v10, v5 +; NOOPT-NEXT: v_mov_b32_e32 v11, v4 +; NOOPT-NEXT: v_mov_b32_e32 v12, v3 +; NOOPT-NEXT: buffer_store_dwordx4 v[9:12], off, s[4:7], 0 offset:16 +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v3, v8 +; NOOPT-NEXT: v_mov_b32_e32 v4, v7 +; NOOPT-NEXT: v_mov_b32_e32 v5, v6 +; NOOPT-NEXT: buffer_store_dwordx4 v[2:5], off, s[4:7], 0 +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_cmp_eq_u32_e64 s[2:3], v1, s0 +; NOOPT-NEXT: s_mov_b64 s[0:1], exec +; NOOPT-NEXT: v_writelane_b32 v0, s0, 13 +; NOOPT-NEXT: 
v_writelane_b32 v0, s1, 14 +; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1 +; NOOPT-NEXT: buffer_store_dword v0, off, s[28:31], 0 offset:64 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 exec, s[26:27] +; NOOPT-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; NOOPT-NEXT: s_mov_b64 exec, s[0:1] +; NOOPT-NEXT: s_cbranch_execz .LBB17_8 +; NOOPT-NEXT: ; %bb.7: ; %bb1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:68 ; 4-byte Folded Reload +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s6, s1 +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s4, 0xf000 +; NOOPT-NEXT: s_mov_b32 s5, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s6 +; NOOPT-NEXT: s_mov_b32 s2, s5 +; NOOPT-NEXT: s_mov_b32 s3, s4 +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: .LBB17_8: ; %bb2 +; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[26:27] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 13 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 14 +; NOOPT-NEXT: s_or_b64 exec, exec, s[0:1] +; NOOPT-NEXT: ; kill: killed $vgpr0 +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: insert_vgpr_offset_multiple_in_block: +; SI-MOVREL: ; %bb.0: ; %entry +; SI-MOVREL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-MOVREL-NEXT: s_mov_b32 s23, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s6, 0 +; SI-MOVREL-NEXT: s_mov_b32 s7, s23 +; SI-MOVREL-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0 +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: buffer_load_dword v2, v[1:2], s[4:7], 0 addr64 glc +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; 
SI-MOVREL-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 +; SI-MOVREL-NEXT: s_load_dwordx2 s[20:21], s[2:3], 0x9 +; SI-MOVREL-NEXT: ;;#ASMSTART +; SI-MOVREL-NEXT: v_mov_b32 v1, 62 +; SI-MOVREL-NEXT: ;;#ASMEND +; SI-MOVREL-NEXT: s_mov_b32 s22, -1 +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: v_mov_b32_e32 v3, s16 +; SI-MOVREL-NEXT: v_mov_b32_e32 v4, s17 +; SI-MOVREL-NEXT: v_mov_b32_e32 v5, s18 +; SI-MOVREL-NEXT: v_mov_b32_e32 v6, s19 +; SI-MOVREL-NEXT: v_mov_b32_e32 v7, s12 +; SI-MOVREL-NEXT: v_mov_b32_e32 v8, s13 +; SI-MOVREL-NEXT: v_mov_b32_e32 v9, s14 +; SI-MOVREL-NEXT: v_mov_b32_e32 v10, s15 +; SI-MOVREL-NEXT: v_mov_b32_e32 v11, s8 +; SI-MOVREL-NEXT: v_mov_b32_e32 v12, s9 +; SI-MOVREL-NEXT: v_mov_b32_e32 v13, s10 +; SI-MOVREL-NEXT: v_mov_b32_e32 v14, s11 +; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s4 +; SI-MOVREL-NEXT: v_mov_b32_e32 v16, s5 +; SI-MOVREL-NEXT: v_mov_b32_e32 v17, s6 +; SI-MOVREL-NEXT: v_mov_b32_e32 v18, s7 +; SI-MOVREL-NEXT: v_add_i32_e32 v19, vcc, 1, v2 +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 12, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v20, v3, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 13, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v21, v4, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 14, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v22, v5, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 15, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v23, v6, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 8, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v24, v7, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 9, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v25, v8, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 10, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v26, v9, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 11, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v10, v10, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v6, v11, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v7, v12, 
v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v8, v13, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v9, v14, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v11, v15, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v3, v16, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v4, v17, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v2, v18, v1, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 3, v19 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v5, 63, v2, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 2, v19 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v4, 63, v4, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 1, v19 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v3, 63, v3, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v2, 63, v11, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 7, v19 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v9, 63, v9, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 6, v19 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v8, 63, v8, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 5, v19 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v7, 63, v7, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 4, v19 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v6, 63, v6, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 11, v19 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v13, 63, v10, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 10, v19 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v12, 63, v26, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 9, v19 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v11, 63, v25, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 8, v19 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v10, 63, v24, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 15, v19 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v17, 63, v23, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 14, v19 +; 
SI-MOVREL-NEXT: v_cndmask_b32_e32 v16, 63, v22, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 13, v19 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v15, 63, v21, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 12, v19 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v14, 63, v20, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[14:17], off, s[20:23], 0 offset:48 +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[10:13], off, s[20:23], 0 offset:32 +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[6:9], off, s[20:23], 0 offset:16 +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[2:5], off, s[20:23], 0 +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; SI-MOVREL-NEXT: s_cbranch_execz .LBB17_2 +; SI-MOVREL-NEXT: ; %bb.1: ; %bb1 +; SI-MOVREL-NEXT: buffer_store_dword v1, off, s[20:23], 0 +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: .LBB17_2: ; %bb2 +; SI-MOVREL-NEXT: s_endpgm +; +; VI-LABEL: insert_vgpr_offset_multiple_in_block: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1 +; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; VI-NEXT: flat_load_dword v2, v[1:2] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: ;;#ASMSTART +; VI-NEXT: v_mov_b32 v1, 62 +; VI-NEXT: ;;#ASMEND +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, s16 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: v_mov_b32_e32 v5, s18 +; VI-NEXT: v_mov_b32_e32 v6, s19 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s13 +; VI-NEXT: v_mov_b32_e32 v9, s14 +; VI-NEXT: v_mov_b32_e32 v10, s15 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s9 +; VI-NEXT: 
v_mov_b32_e32 v13, s10 +; VI-NEXT: v_mov_b32_e32 v14, s11 +; VI-NEXT: v_mov_b32_e32 v15, s4 +; VI-NEXT: v_mov_b32_e32 v16, s5 +; VI-NEXT: v_mov_b32_e32 v17, s6 +; VI-NEXT: v_mov_b32_e32 v18, s7 +; VI-NEXT: s_add_u32 s2, s0, 48 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 1, v2 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 12, v2 +; VI-NEXT: v_cndmask_b32_e32 v20, v3, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 13, v2 +; VI-NEXT: v_cndmask_b32_e32 v21, v4, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 14, v2 +; VI-NEXT: v_cndmask_b32_e32 v22, v5, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 15, v2 +; VI-NEXT: v_cndmask_b32_e32 v23, v6, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 8, v2 +; VI-NEXT: v_cndmask_b32_e32 v24, v7, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 9, v2 +; VI-NEXT: v_cndmask_b32_e32 v25, v8, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 10, v2 +; VI-NEXT: v_cndmask_b32_e32 v26, v9, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 11, v2 +; VI-NEXT: v_cndmask_b32_e32 v10, v10, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 +; VI-NEXT: v_cndmask_b32_e32 v6, v11, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 +; VI-NEXT: v_cndmask_b32_e32 v7, v12, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2 +; VI-NEXT: v_cndmask_b32_e32 v8, v13, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2 +; VI-NEXT: v_cndmask_b32_e32 v9, v14, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; VI-NEXT: v_cndmask_b32_e32 v11, v15, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; VI-NEXT: v_cndmask_b32_e32 v3, v16, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 +; VI-NEXT: v_cndmask_b32_e32 v4, v17, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v18, v1, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 3, v19 +; VI-NEXT: v_cndmask_b32_e32 v5, 63, v2, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 2, v19 +; VI-NEXT: v_cndmask_b32_e32 v4, 63, v4, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v19 +; VI-NEXT: v_cndmask_b32_e32 v3, 63, v3, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, 
v19 +; VI-NEXT: v_cndmask_b32_e32 v2, 63, v11, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 7, v19 +; VI-NEXT: v_cndmask_b32_e32 v9, 63, v9, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 6, v19 +; VI-NEXT: v_cndmask_b32_e32 v8, 63, v8, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 5, v19 +; VI-NEXT: v_cndmask_b32_e32 v7, 63, v7, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 4, v19 +; VI-NEXT: v_cndmask_b32_e32 v6, 63, v6, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 11, v19 +; VI-NEXT: v_cndmask_b32_e32 v13, 63, v10, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 10, v19 +; VI-NEXT: v_cndmask_b32_e32 v12, 63, v26, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 9, v19 +; VI-NEXT: v_cndmask_b32_e32 v11, 63, v25, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 8, v19 +; VI-NEXT: v_cndmask_b32_e32 v10, 63, v24, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 15, v19 +; VI-NEXT: v_cndmask_b32_e32 v17, 63, v23, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 14, v19 +; VI-NEXT: v_cndmask_b32_e32 v16, 63, v22, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 13, v19 +; VI-NEXT: v_cndmask_b32_e32 v15, 63, v21, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 12, v19 +; VI-NEXT: v_mov_b32_e32 v19, s3 +; VI-NEXT: v_mov_b32_e32 v18, s2 +; VI-NEXT: s_add_u32 s2, s0, 32 +; VI-NEXT: v_cndmask_b32_e32 v14, 63, v20, vcc +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: flat_store_dwordx4 v[18:19], v[14:17] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v15, s3 +; VI-NEXT: v_mov_b32_e32 v14, s2 +; VI-NEXT: s_add_u32 s2, s0, 16 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: flat_store_dwordx4 v[14:15], v[10:13] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v11, s3 +; VI-NEXT: v_mov_b32_e32 v10, s2 +; VI-NEXT: flat_store_dwordx4 v[10:11], v[6:9] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v7, s1 +; VI-NEXT: v_mov_b32_e32 v6, s0 +; VI-NEXT: flat_store_dwordx4 v[6:7], v[2:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_and_saveexec_b64 s[0:1], vcc +; VI-NEXT: s_cbranch_execz .LBB17_2 +; VI-NEXT: ; %bb.1: ; %bb1 +; VI-NEXT: 
flat_store_dword v[0:1], v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: .LBB17_2: ; %bb2 +; VI-NEXT: s_endpgm +; +; GFX9-IDXMODE-LABEL: insert_vgpr_offset_multiple_in_block: +; GFX9-IDXMODE: ; %bb.0: ; %entry +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-IDXMODE-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: global_load_dword v3, v1, s[0:1] glc +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: ;;#ASMSTART +; GFX9-IDXMODE-NEXT: v_mov_b32 v1, 62 +; GFX9-IDXMODE-NEXT: ;;#ASMEND +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, s17 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, s18 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, s19 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, s13 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, s14 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, s15 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, s8 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, s9 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, s10 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, s11 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, s4 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v17, s5 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v18, s6 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v19, s7 +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 12, v3 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v21, v4, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 13, v3 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v22, v5, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 14, v3 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v23, v6, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 15, v3 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v24, v7, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 8, v3 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 
v25, v8, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 9, v3 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v26, v9, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 10, v3 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v27, v10, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 11, v3 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v11, v11, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 4, v3 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v7, v12, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 5, v3 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v8, v13, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 6, v3 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v9, v14, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 7, v3 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v10, v15, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v12, v16, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v4, v17, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 2, v3 +; GFX9-IDXMODE-NEXT: v_add_u32_e32 v20, 1, v3 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v5, v18, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 3, v3 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v3, v19, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 3, v20 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v6, 63, v3, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 2, v20 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v5, 63, v5, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 1, v20 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v4, 63, v4, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v3, 63, v12, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 7, v20 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v10, 63, v10, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 6, v20 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v9, 63, v9, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 5, v20 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v8, 63, v8, vcc 
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 4, v20 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v7, 63, v7, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 11, v20 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v14, 63, v11, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 10, v20 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v13, 63, v27, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 9, v20 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v12, 63, v26, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 8, v20 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v11, 63, v25, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 15, v20 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v18, 63, v24, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 14, v20 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v17, 63, v23, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 13, v20 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v16, 63, v22, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 12, v20 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v15, 63, v21, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v2, v[15:18], s[0:1] offset:48 +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v2, v[11:14], s[0:1] offset:32 +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v2, v[7:10], s[0:1] offset:16 +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v2, v[3:6], s[0:1] +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-IDXMODE-NEXT: s_cbranch_execz .LBB17_2 +; GFX9-IDXMODE-NEXT: ; %bb.1: ; %bb1 +; GFX9-IDXMODE-NEXT: global_store_dword v[0:1], v1, off +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: .LBB17_2: ; %bb2 +; GFX9-IDXMODE-NEXT: s_endpgm +entry: + %id = call i32 @llvm.amdgcn.workitem.id.x() + %id.ext = zext i32 %id to i64 + %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %id.ext + %idx0 = load volatile i32, ptr addrspace(1) %gep + 
%idx1 = add i32 %idx0, 1 + %live.out.val = call i32 asm sideeffect "v_mov_b32 $0, 62", "=v"() + %vec1 = insertelement <16 x i32> %vec0, i32 %live.out.val, i32 %idx0 + %vec2 = insertelement <16 x i32> %vec1, i32 63, i32 %idx1 + store volatile <16 x i32> %vec2, ptr addrspace(1) %out0 + %cmp = icmp eq i32 %id, 0 + br i1 %cmp, label %bb1, label %bb2 +bb1: + store volatile i32 %live.out.val, ptr addrspace(1) undef + br label %bb2 -; GCN-LABEL: {{^}}insert_adjacent_blocks: -define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) #0 { +bb2: + ret void +} + +; Avoid inserting extra v_mov from copies within the vgpr indexing sequence. The +; gpr_idx mode switching sequence is expanded late for this reason. +define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %out1, i32 %in) { +; NOOPT-LABEL: insert_w_offset_multiple_in_block: +; NOOPT: ; %bb.0: ; %entry +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xb +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b32 s7, s1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s5, 0xf000 +; NOOPT-NEXT: s_mov_b32 s6, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s7 +; NOOPT-NEXT: s_mov_b32 s2, s6 +; NOOPT-NEXT: s_mov_b32 s3, s5 +; NOOPT-NEXT: s_mov_b32 s5, 1 +; NOOPT-NEXT: s_add_i32 s5, s4, s5 +; NOOPT-NEXT: s_mov_b32 s6, 0x41800000 +; NOOPT-NEXT: s_mov_b32 s7, 0x41700000 +; NOOPT-NEXT: s_mov_b32 s8, 0x41600000 +; NOOPT-NEXT: s_mov_b32 s9, 0x41500000 +; NOOPT-NEXT: s_mov_b32 s10, 0x41400000 +; NOOPT-NEXT: s_mov_b32 s11, 0x41300000 +; NOOPT-NEXT: s_mov_b32 s12, 0x41200000 +; NOOPT-NEXT: s_mov_b32 s13, 0x41100000 +; NOOPT-NEXT: s_mov_b32 s14, 0x41000000 +; NOOPT-NEXT: s_mov_b32 s15, 0x40e00000 +; NOOPT-NEXT: s_mov_b32 s16, 0x40c00000 +; NOOPT-NEXT: s_mov_b32 s17, 0x40a00000 +; NOOPT-NEXT: s_mov_b32 s18, 4.0 +; NOOPT-NEXT: s_mov_b32 s19, 
0x40400000 +; NOOPT-NEXT: s_mov_b32 s20, 2.0 +; NOOPT-NEXT: s_mov_b32 s21, 1.0 +; NOOPT-NEXT: v_mov_b32_e32 v23, s21 +; NOOPT-NEXT: v_mov_b32_e32 v14, s20 +; NOOPT-NEXT: v_mov_b32_e32 v13, s19 +; NOOPT-NEXT: v_mov_b32_e32 v12, s18 +; NOOPT-NEXT: v_mov_b32_e32 v11, s17 +; NOOPT-NEXT: v_mov_b32_e32 v10, s16 +; NOOPT-NEXT: v_mov_b32_e32 v9, s15 +; NOOPT-NEXT: v_mov_b32_e32 v8, s14 +; NOOPT-NEXT: v_mov_b32_e32 v7, s13 +; NOOPT-NEXT: v_mov_b32_e32 v6, s12 +; NOOPT-NEXT: v_mov_b32_e32 v5, s11 +; NOOPT-NEXT: v_mov_b32_e32 v4, s10 +; NOOPT-NEXT: v_mov_b32_e32 v3, s9 +; NOOPT-NEXT: v_mov_b32_e32 v2, s8 +; NOOPT-NEXT: v_mov_b32_e32 v1, s7 +; NOOPT-NEXT: v_mov_b32_e32 v0, s6 +; NOOPT-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v24, v14 +; NOOPT-NEXT: v_mov_b32_e32 v25, v13 +; NOOPT-NEXT: v_mov_b32_e32 v26, v12 +; NOOPT-NEXT: v_mov_b32_e32 v27, v11 +; NOOPT-NEXT: v_mov_b32_e32 v28, v10 +; NOOPT-NEXT: v_mov_b32_e32 v29, v9 +; NOOPT-NEXT: v_mov_b32_e32 v30, v8 +; NOOPT-NEXT: v_mov_b32_e32 v31, v7 +; NOOPT-NEXT: v_mov_b32_e32 v32, v6 +; NOOPT-NEXT: v_mov_b32_e32 v33, v5 +; NOOPT-NEXT: v_mov_b32_e32 v34, v4 +; NOOPT-NEXT: v_mov_b32_e32 v35, v3 +; NOOPT-NEXT: v_mov_b32_e32 v36, v2 +; NOOPT-NEXT: v_mov_b32_e32 v37, v1 +; NOOPT-NEXT: v_mov_b32_e32 v38, v0 +; NOOPT-NEXT: v_mov_b32_e32 v0, 0x41880000 +; NOOPT-NEXT: s_mov_b32 m0, s5 +; NOOPT-NEXT: v_movreld_b32_e32 v23, v0 +; NOOPT-NEXT: s_mov_b32 s5, 2 +; NOOPT-NEXT: s_add_i32 s4, s4, s5 +; NOOPT-NEXT: s_mov_b32 m0, s4 +; NOOPT-NEXT: v_mov_b32_e32 v7, v23 +; NOOPT-NEXT: v_mov_b32_e32 v8, v24 +; NOOPT-NEXT: v_mov_b32_e32 v9, v25 +; NOOPT-NEXT: v_mov_b32_e32 v10, v26 +; NOOPT-NEXT: v_mov_b32_e32 v11, v27 +; NOOPT-NEXT: v_mov_b32_e32 v12, v28 +; NOOPT-NEXT: v_mov_b32_e32 v13, v29 +; NOOPT-NEXT: v_mov_b32_e32 v14, v30 +; NOOPT-NEXT: v_mov_b32_e32 v15, v31 +; NOOPT-NEXT: v_mov_b32_e32 
v16, v32 +; NOOPT-NEXT: v_mov_b32_e32 v17, v33 +; NOOPT-NEXT: v_mov_b32_e32 v18, v34 +; NOOPT-NEXT: v_mov_b32_e32 v19, v35 +; NOOPT-NEXT: v_mov_b32_e32 v20, v36 +; NOOPT-NEXT: v_mov_b32_e32 v21, v37 +; NOOPT-NEXT: v_mov_b32_e32 v22, v38 +; NOOPT-NEXT: v_movreld_b32_e32 v7, v0 +; NOOPT-NEXT: v_mov_b32_e32 v4, v38 +; NOOPT-NEXT: v_mov_b32_e32 v5, v37 +; NOOPT-NEXT: v_mov_b32_e32 v6, v36 +; NOOPT-NEXT: v_mov_b32_e32 v0, v35 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; NOOPT-NEXT: v_mov_b32_e32 v4, v34 +; NOOPT-NEXT: v_mov_b32_e32 v5, v33 +; NOOPT-NEXT: v_mov_b32_e32 v6, v32 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, v31 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; NOOPT-NEXT: v_mov_b32_e32 v4, v30 +; NOOPT-NEXT: v_mov_b32_e32 v5, v29 +; NOOPT-NEXT: v_mov_b32_e32 v6, v28 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, v27 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; NOOPT-NEXT: v_mov_b32_e32 v4, v26 +; NOOPT-NEXT: v_mov_b32_e32 v5, v25 +; NOOPT-NEXT: v_mov_b32_e32 v6, v24 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, v23 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: 
buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; NOOPT-NEXT: v_mov_b32_e32 v4, v22 +; NOOPT-NEXT: v_mov_b32_e32 v5, v21 +; NOOPT-NEXT: v_mov_b32_e32 v6, v20 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, v19 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 +; NOOPT-NEXT: v_mov_b32_e32 v4, v18 +; NOOPT-NEXT: v_mov_b32_e32 v5, v17 +; NOOPT-NEXT: v_mov_b32_e32 v6, v16 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, v15 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 +; NOOPT-NEXT: v_mov_b32_e32 v4, v14 +; NOOPT-NEXT: v_mov_b32_e32 v5, v13 +; NOOPT-NEXT: v_mov_b32_e32 v6, v12 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, v11 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; NOOPT-NEXT: v_mov_b32_e32 v4, v10 +; NOOPT-NEXT: v_mov_b32_e32 v5, v9 +; NOOPT-NEXT: v_mov_b32_e32 v6, v8 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, v7 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: insert_w_offset_multiple_in_block: +; SI-MOVREL: ; %bb.0: ; %entry +; SI-MOVREL-NEXT: s_load_dword s4, s[2:3], 
0xb +; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0 +; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 +; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: s_add_i32 s2, s4, 1 +; SI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 +; SI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v7, 0x41000000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v8, 0x41100000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v9, 0x41200000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v10, 0x41300000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v11, 0x41400000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v12, 0x41500000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v13, 0x41600000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v32, 0x41880000 +; SI-MOVREL-NEXT: s_mov_b32 m0, s2 +; SI-MOVREL-NEXT: v_movreld_b32_e32 v0, v32 +; SI-MOVREL-NEXT: s_add_i32 s4, s4, 2 +; SI-MOVREL-NEXT: v_mov_b32_e32 v31, v15 +; SI-MOVREL-NEXT: v_mov_b32_e32 v30, v14 +; SI-MOVREL-NEXT: v_mov_b32_e32 v29, v13 +; SI-MOVREL-NEXT: v_mov_b32_e32 v28, v12 +; SI-MOVREL-NEXT: v_mov_b32_e32 v27, v11 +; SI-MOVREL-NEXT: v_mov_b32_e32 v26, v10 +; SI-MOVREL-NEXT: v_mov_b32_e32 v25, v9 +; SI-MOVREL-NEXT: v_mov_b32_e32 v24, v8 +; SI-MOVREL-NEXT: v_mov_b32_e32 v23, v7 +; SI-MOVREL-NEXT: v_mov_b32_e32 v22, v6 +; SI-MOVREL-NEXT: v_mov_b32_e32 v21, v5 +; SI-MOVREL-NEXT: v_mov_b32_e32 v20, v4 +; SI-MOVREL-NEXT: v_mov_b32_e32 v19, v3 +; SI-MOVREL-NEXT: v_mov_b32_e32 v18, v2 +; SI-MOVREL-NEXT: v_mov_b32_e32 v17, v1 +; SI-MOVREL-NEXT: v_mov_b32_e32 v16, v0 +; SI-MOVREL-NEXT: s_mov_b32 m0, s4 +; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s2, -1 +; SI-MOVREL-NEXT: v_movreld_b32_e32 v16, v32 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; SI-MOVREL-NEXT: buffer_store_dwordx4 
v[8:11], off, s[0:3], 0 offset:32 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:112 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:96 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:64 +; SI-MOVREL-NEXT: s_endpgm +; +; VI-MOVREL-LABEL: insert_w_offset_multiple_in_block: +; VI-MOVREL: ; %bb.0: ; %entry +; VI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 +; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; VI-MOVREL-NEXT: s_add_i32 s2, s4, 1 +; VI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v7, 0x41000000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v8, 0x41100000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v9, 0x41200000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v10, 0x41300000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v11, 0x41400000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v12, 0x41500000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, 0x41600000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v32, 0x41880000 +; VI-MOVREL-NEXT: s_mov_b32 m0, s2 +; VI-MOVREL-NEXT: s_add_i32 s4, s4, 2 +; VI-MOVREL-NEXT: v_movreld_b32_e32 v0, v32 +; VI-MOVREL-NEXT: v_mov_b32_e32 v31, v15 +; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48 +; VI-MOVREL-NEXT: v_mov_b32_e32 v30, v14 +; VI-MOVREL-NEXT: v_mov_b32_e32 v29, v13 +; VI-MOVREL-NEXT: v_mov_b32_e32 v28, v12 +; VI-MOVREL-NEXT: v_mov_b32_e32 v27, v11 +; VI-MOVREL-NEXT: v_mov_b32_e32 v26, v10 +; 
VI-MOVREL-NEXT: v_mov_b32_e32 v25, v9 +; VI-MOVREL-NEXT: v_mov_b32_e32 v24, v8 +; VI-MOVREL-NEXT: v_mov_b32_e32 v23, v7 +; VI-MOVREL-NEXT: v_mov_b32_e32 v22, v6 +; VI-MOVREL-NEXT: v_mov_b32_e32 v21, v5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v20, v4 +; VI-MOVREL-NEXT: v_mov_b32_e32 v19, v3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v18, v2 +; VI-MOVREL-NEXT: v_mov_b32_e32 v17, v1 +; VI-MOVREL-NEXT: v_mov_b32_e32 v16, v0 +; VI-MOVREL-NEXT: s_mov_b32 m0, s4 +; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 +; VI-MOVREL-NEXT: v_movreld_b32_e32 v16, v32 +; VI-MOVREL-NEXT: v_mov_b32_e32 v33, s3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v32, s2 +; VI-MOVREL-NEXT: s_add_u32 s2, s0, 32 +; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[32:33], v[12:15] +; VI-MOVREL-NEXT: s_nop 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s2 +; VI-MOVREL-NEXT: s_add_u32 s2, s0, 16 +; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; VI-MOVREL-NEXT: s_nop 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v8, s2 +; VI-MOVREL-NEXT: s_add_u32 s2, s0, 64 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0 +; VI-MOVREL-NEXT: s_add_u32 s4, s0, 0x70 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-MOVREL-NEXT: s_addc_u32 s5, s1, 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s4 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s5 +; VI-MOVREL-NEXT: s_add_u32 s4, s0, 0x60 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[0:1], v[28:31] +; VI-MOVREL-NEXT: s_addc_u32 s5, s1, 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s4 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s5 +; VI-MOVREL-NEXT: s_add_u32 s0, s0, 0x50 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[0:1], v[24:27] +; VI-MOVREL-NEXT: s_addc_u32 s1, s1, 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s1 +; VI-MOVREL-NEXT: flat_store_dwordx4 
v[0:1], v[20:23] +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s2 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s3 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[0:1], v[16:19] +; VI-MOVREL-NEXT: s_endpgm +; +; VI-IDXMODE-LABEL: insert_w_offset_multiple_in_block: +; VI-IDXMODE: ; %bb.0: ; %entry +; VI-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0 +; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; VI-IDXMODE-NEXT: s_add_i32 s2, s4, 1 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, 0x41000000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, 0x41100000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, 0x41200000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, 0x41300000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, 0x41400000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, 0x41500000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, 0x41600000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000 +; VI-IDXMODE-NEXT: s_add_i32 s4, s4, 2 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v32, 0x41880000 +; VI-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, v32 +; VI-IDXMODE-NEXT: s_set_gpr_idx_off +; VI-IDXMODE-NEXT: v_mov_b32_e32 v31, v15 +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 48 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v30, v14 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v29, v13 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v28, v12 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v27, v11 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v26, v10 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v25, v9 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v24, v8 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v23, v7 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v22, v6 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v21, v5 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v20, 
v4 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v19, v3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v18, v2 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, v1 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, v0 +; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 +; VI-IDXMODE-NEXT: s_set_gpr_idx_on s4, gpr_idx(DST) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, v32 +; VI-IDXMODE-NEXT: s_set_gpr_idx_off +; VI-IDXMODE-NEXT: v_mov_b32_e32 v33, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v32, s2 +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 32 +; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[32:33], v[12:15] +; VI-IDXMODE-NEXT: s_nop 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s2 +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 16 +; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; VI-IDXMODE-NEXT: s_nop 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, s2 +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 64 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0 +; VI-IDXMODE-NEXT: s_add_u32 s4, s0, 0x70 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-IDXMODE-NEXT: s_addc_u32 s5, s1, 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s4 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s5 +; VI-IDXMODE-NEXT: s_add_u32 s4, s0, 0x60 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[0:1], v[28:31] +; VI-IDXMODE-NEXT: s_addc_u32 s5, s1, 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s4 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s5 +; VI-IDXMODE-NEXT: s_add_u32 s0, s0, 0x50 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[0:1], v[24:27] +; VI-IDXMODE-NEXT: s_addc_u32 s1, s1, 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s1 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[0:1], v[20:23] +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s2 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s3 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[0:1], 
v[16:19] +; VI-IDXMODE-NEXT: s_endpgm +; +; GFX9-IDXMODE-LABEL: insert_w_offset_multiple_in_block: +; GFX9-IDXMODE: ; %bb.0: ; %entry +; GFX9-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0 +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: s_add_i32 s2, s4, 1 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, 0x41000000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, 0x41100000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, 0x41200000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, 0x41300000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, 0x41400000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, 0x41500000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, 0x41600000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v32, 0x41880000 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, v32 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v31, v15 +; GFX9-IDXMODE-NEXT: s_add_i32 s4, s4, 2 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v30, v14 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v29, v13 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v28, v12 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v27, v11 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v26, v10 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v25, v9 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v24, v8 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v23, v7 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v22, v6 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v21, v5 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v20, v4 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v19, v3 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v18, v2 +; 
GFX9-IDXMODE-NEXT: v_mov_b32_e32 v17, v1 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, v0 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s4, gpr_idx(DST) +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, v32 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v32, 0 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 +; GFX9-IDXMODE-NEXT: s_endpgm +entry: + %add1 = add i32 %in, 1 + %ins1 = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %add1 + %add2 = add i32 %in, 2 + %ins2 = insertelement <16 x float> %ins1, float 17.0, i32 %add2 + store <16 x float> %ins1, ptr addrspace(1) %out1 + %out2 = getelementptr <16 x float>, ptr addrspace(1) %out1, i32 1 + store <16 x float> %ins2, ptr addrspace(1) %out2 + + ret void +} + +; Make sure we don't hit use of undefined register errors when expanding an +; extract with undef index. 
+define amdgpu_kernel void @extract_adjacent_blocks(i32 %arg) { +; NOOPT-LABEL: extract_adjacent_blocks: +; NOOPT: ; %bb.0: ; %bb +; NOOPT-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; NOOPT-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; NOOPT-NEXT: s_mov_b32 s14, -1 +; NOOPT-NEXT: s_mov_b32 s15, 0xe8f000 +; NOOPT-NEXT: s_add_u32 s12, s12, s9 +; NOOPT-NEXT: s_addc_u32 s13, s13, 0 +; NOOPT-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane +; NOOPT-NEXT: s_load_dword s2, s[2:3], 0x9 +; NOOPT-NEXT: s_mov_b64 s[0:1], -1 +; NOOPT-NEXT: ; implicit-def: $sgpr3 +; NOOPT-NEXT: s_mov_b32 s3, 0 +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_cmp_lg_u32 s2, s3 +; NOOPT-NEXT: v_writelane_b32 v0, s0, 0 +; NOOPT-NEXT: v_writelane_b32 v0, s1, 1 +; NOOPT-NEXT: s_mov_b64 s[8:9], exec +; NOOPT-NEXT: s_mov_b64 exec, -1 +; NOOPT-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 exec, s[8:9] +; NOOPT-NEXT: s_cbranch_scc1 .LBB19_3 +; NOOPT-NEXT: .LBB19_1: ; %Flow +; NOOPT-NEXT: s_or_saveexec_b64 s[8:9], -1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[8:9] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 0 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 1 +; NOOPT-NEXT: ; implicit-def: $sgpr2 +; NOOPT-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; NOOPT-NEXT: s_mov_b32 s0, 1 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: v_cmp_ne_u32_e64 s[0:1], v0, s0 +; NOOPT-NEXT: s_and_b64 vcc, exec, s[0:1] +; NOOPT-NEXT: s_cbranch_vccnz .LBB19_4 +; NOOPT-NEXT: ; %bb.2: ; %bb1 +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s6, s1 +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s4, 0xf000 +; NOOPT-NEXT: s_mov_b32 s5, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s6 +; 
NOOPT-NEXT: s_mov_b32 s2, s5 +; NOOPT-NEXT: s_mov_b32 s3, s4 +; NOOPT-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: ; implicit-def: $sgpr0 +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: ;;#ASMSTART +; NOOPT-NEXT: ; reg use v[0:3] +; NOOPT-NEXT: ;;#ASMEND +; NOOPT-NEXT: s_branch .LBB19_4 +; NOOPT-NEXT: .LBB19_3: ; %bb4 +; NOOPT-NEXT: s_or_saveexec_b64 s[8:9], -1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[8:9] +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s6, s1 +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s4, 0xf000 +; NOOPT-NEXT: s_mov_b32 s5, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s6 +; NOOPT-NEXT: s_mov_b32 s2, s5 +; NOOPT-NEXT: s_mov_b32 s3, s4 +; NOOPT-NEXT: buffer_load_dwordx4 v[1:4], off, s[0:3], 0 glc +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: ; implicit-def: $sgpr0 +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: ;;#ASMSTART +; NOOPT-NEXT: ; reg use v[1:4] +; NOOPT-NEXT: ;;#ASMEND +; NOOPT-NEXT: s_mov_b64 s[0:1], 0 +; NOOPT-NEXT: v_writelane_b32 v0, s0, 0 +; NOOPT-NEXT: v_writelane_b32 v0, s1, 1 +; NOOPT-NEXT: s_or_saveexec_b64 s[8:9], -1 +; NOOPT-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 exec, s[8:9] +; NOOPT-NEXT: s_branch .LBB19_1 +; NOOPT-NEXT: .LBB19_4: ; %bb7 +; NOOPT-NEXT: s_or_saveexec_b64 s[8:9], -1 +; NOOPT-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[8:9] +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s7, s1 +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: 
s_mov_b32 s5, 0xf000 +; NOOPT-NEXT: s_mov_b32 s6, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s7 +; NOOPT-NEXT: s_mov_b32 s2, s6 +; NOOPT-NEXT: s_mov_b32 s3, s5 +; NOOPT-NEXT: v_mov_b32_e32 v1, s4 +; NOOPT-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: ; kill: killed $vgpr0 +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: extract_adjacent_blocks: +; SI-MOVREL: ; %bb.0: ; %bb +; SI-MOVREL-NEXT: s_load_dword s0, s[2:3], 0x9 +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: s_cmp_lg_u32 s0, 0 +; SI-MOVREL-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-MOVREL-NEXT: ; %bb.1: ; %bb4 +; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s2, -1 +; SI-MOVREL-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: ;;#ASMSTART +; SI-MOVREL-NEXT: ; reg use v[0:3] +; SI-MOVREL-NEXT: ;;#ASMEND +; SI-MOVREL-NEXT: s_cbranch_execnz .LBB19_3 +; SI-MOVREL-NEXT: .LBB19_2: ; %bb1 +; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s2, -1 +; SI-MOVREL-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: ;;#ASMSTART +; SI-MOVREL-NEXT: ; reg use v[0:3] +; SI-MOVREL-NEXT: ;;#ASMEND +; SI-MOVREL-NEXT: .LBB19_3: ; %bb7 +; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s2, -1 +; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: s_endpgm +; SI-MOVREL-NEXT: .LBB19_4: +; SI-MOVREL-NEXT: s_branch .LBB19_2 +; +; VI-LABEL: extract_adjacent_blocks: +; VI: ; %bb.0: ; %bb +; VI-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s0, 0 +; VI-NEXT: s_cbranch_scc0 .LBB19_4 +; VI-NEXT: ; %bb.1: ; %bb4 +; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: ;;#ASMSTART +; VI-NEXT: ; reg use v[0:3] +; 
VI-NEXT: ;;#ASMEND +; VI-NEXT: s_cbranch_execnz .LBB19_3 +; VI-NEXT: .LBB19_2: ; %bb1 +; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: ;;#ASMSTART +; VI-NEXT: ; reg use v[0:3] +; VI-NEXT: ;;#ASMEND +; VI-NEXT: .LBB19_3: ; %bb7 +; VI-NEXT: flat_store_dword v[0:1], v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_endpgm +; VI-NEXT: .LBB19_4: +; VI-NEXT: s_branch .LBB19_2 +; +; GFX9-IDXMODE-LABEL: extract_adjacent_blocks: +; GFX9-IDXMODE: ; %bb.0: ; %bb +; GFX9-IDXMODE-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX9-IDXMODE-NEXT: s_cbranch_scc0 .LBB19_4 +; GFX9-IDXMODE-NEXT: ; %bb.1: ; %bb4 +; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[0:3], v[0:1], off glc +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: ;;#ASMSTART +; GFX9-IDXMODE-NEXT: ; reg use v[0:3] +; GFX9-IDXMODE-NEXT: ;;#ASMEND +; GFX9-IDXMODE-NEXT: s_cbranch_execnz .LBB19_3 +; GFX9-IDXMODE-NEXT: .LBB19_2: ; %bb1 +; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[0:3], v[0:1], off glc +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: ;;#ASMSTART +; GFX9-IDXMODE-NEXT: ; reg use v[0:3] +; GFX9-IDXMODE-NEXT: ;;#ASMEND +; GFX9-IDXMODE-NEXT: .LBB19_3: ; %bb7 +; GFX9-IDXMODE-NEXT: global_store_dword v[0:1], v0, off +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: s_endpgm +; GFX9-IDXMODE-NEXT: .LBB19_4: +; GFX9-IDXMODE-NEXT: s_branch .LBB19_2 bb: %tmp = icmp eq i32 %arg, 0 br i1 %tmp, label %bb1, label %bb4 -bb1: ; preds = %bb +bb1: + %tmp2 = load volatile <4 x float>, ptr addrspace(1) undef + %tmp3 = extractelement <4 x float> %tmp2, i32 undef + call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp2) ; Prevent block optimize out + br label %bb7 + +bb4: + %tmp5 = load volatile <4 x float>, ptr addrspace(1) undef + %tmp6 = extractelement <4 x float> %tmp5, i32 undef + call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp5) ; Prevent block optimize 
out + br label %bb7 + +bb7: + %tmp8 = phi float [ %tmp3, %bb1 ], [ %tmp6, %bb4 ] + store volatile float %tmp8, ptr addrspace(1) undef + ret void +} + +define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) { +; NOOPT-LABEL: insert_adjacent_blocks: +; NOOPT: ; %bb.0: ; %bb +; NOOPT-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; NOOPT-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; NOOPT-NEXT: s_mov_b32 s18, -1 +; NOOPT-NEXT: s_mov_b32 s19, 0xe8f000 +; NOOPT-NEXT: s_add_u32 s16, s16, s9 +; NOOPT-NEXT: s_addc_u32 s17, s17, 0 +; NOOPT-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane +; NOOPT-NEXT: s_mov_b64 s[0:1], s[2:3] +; NOOPT-NEXT: s_load_dword s2, s[0:1], 0x9 +; NOOPT-NEXT: s_load_dword s0, s[0:1], 0xa +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b64 s[0:1], -1 +; NOOPT-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; NOOPT-NEXT: s_mov_b32 s3, 0 +; NOOPT-NEXT: s_cmp_lg_u32 s2, s3 +; NOOPT-NEXT: v_writelane_b32 v0, s0, 0 +; NOOPT-NEXT: v_writelane_b32 v0, s1, 1 +; NOOPT-NEXT: s_mov_b64 s[12:13], exec +; NOOPT-NEXT: s_mov_b64 exec, -1 +; NOOPT-NEXT: buffer_store_dword v0, off, s[16:19], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 exec, s[12:13] +; NOOPT-NEXT: s_cbranch_scc1 .LBB20_3 +; NOOPT-NEXT: .LBB20_1: ; %Flow +; NOOPT-NEXT: s_or_saveexec_b64 s[12:13], -1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[12:13] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 0 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 1 +; NOOPT-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; NOOPT-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; NOOPT-NEXT: s_mov_b32 s0, 1 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: v_cmp_ne_u32_e64 s[0:1], v0, s0 +; NOOPT-NEXT: s_and_b64 vcc, exec, s[0:1] +; NOOPT-NEXT: s_cbranch_vccnz .LBB20_4 +; NOOPT-NEXT: ; %bb.2: ; %bb1 +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s6, s1 +; 
NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s4, 0xf000 +; NOOPT-NEXT: s_mov_b32 s5, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s6 +; NOOPT-NEXT: s_mov_b32 s2, s5 +; NOOPT-NEXT: s_mov_b32 s3, s4 +; NOOPT-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: ;;#ASMSTART +; NOOPT-NEXT: ; reg use v[0:3] +; NOOPT-NEXT: ;;#ASMEND +; NOOPT-NEXT: s_branch .LBB20_4 +; NOOPT-NEXT: .LBB20_3: ; %bb4 +; NOOPT-NEXT: s_or_saveexec_b64 s[12:13], -1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[12:13] +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s6, s1 +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s4, 0xf000 +; NOOPT-NEXT: s_mov_b32 s5, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s6 +; NOOPT-NEXT: s_mov_b32 s2, s5 +; NOOPT-NEXT: s_mov_b32 s3, s4 +; NOOPT-NEXT: buffer_load_dwordx4 v[1:4], off, s[0:3], 0 glc +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: ;;#ASMSTART +; NOOPT-NEXT: ; reg use v[1:4] +; NOOPT-NEXT: ;;#ASMEND +; NOOPT-NEXT: s_mov_b64 s[0:1], 0 +; NOOPT-NEXT: v_writelane_b32 v0, s0, 0 +; NOOPT-NEXT: v_writelane_b32 v0, s1, 1 +; NOOPT-NEXT: s_or_saveexec_b64 s[12:13], -1 +; NOOPT-NEXT: buffer_store_dword v0, off, s[16:19], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 exec, s[12:13] +; NOOPT-NEXT: s_branch 
.LBB20_1 +; NOOPT-NEXT: .LBB20_4: ; %bb7 +; NOOPT-NEXT: s_or_saveexec_b64 s[12:13], -1 +; NOOPT-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[12:13] +; NOOPT-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s10, s1 +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s8, 0xf000 +; NOOPT-NEXT: s_mov_b32 s9, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s10 +; NOOPT-NEXT: s_mov_b32 s2, s9 +; NOOPT-NEXT: s_mov_b32 s3, s8 +; NOOPT-NEXT: v_mov_b32_e32 v1, s4 +; NOOPT-NEXT: v_mov_b32_e32 v2, s5 +; NOOPT-NEXT: v_mov_b32_e32 v3, s6 +; NOOPT-NEXT: v_mov_b32_e32 v4, s7 +; NOOPT-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0 +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: ; kill: killed $vgpr0 +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: insert_adjacent_blocks: +; SI-MOVREL: ; %bb.0: ; %bb +; SI-MOVREL-NEXT: s_load_dword s0, s[2:3], 0x9 +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: s_cmp_lg_u32 s0, 0 +; SI-MOVREL-NEXT: s_cbranch_scc0 .LBB20_4 +; SI-MOVREL-NEXT: ; %bb.1: ; %bb4 +; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s2, -1 +; SI-MOVREL-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: ;;#ASMSTART +; SI-MOVREL-NEXT: ; reg use v[0:3] +; SI-MOVREL-NEXT: ;;#ASMEND +; SI-MOVREL-NEXT: s_cbranch_execnz .LBB20_3 +; SI-MOVREL-NEXT: .LBB20_2: ; %bb1 +; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s2, -1 +; SI-MOVREL-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: ;;#ASMSTART +; SI-MOVREL-NEXT: ; reg use v[0:3] +; SI-MOVREL-NEXT: ;;#ASMEND +; SI-MOVREL-NEXT: .LBB20_3: ; %bb7 +; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 +; SI-MOVREL-NEXT: 
s_mov_b32 s2, -1 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: s_endpgm +; SI-MOVREL-NEXT: .LBB20_4: +; SI-MOVREL-NEXT: s_branch .LBB20_2 +; +; VI-LABEL: insert_adjacent_blocks: +; VI: ; %bb.0: ; %bb +; VI-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s0, 0 +; VI-NEXT: s_cbranch_scc0 .LBB20_4 +; VI-NEXT: ; %bb.1: ; %bb4 +; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: ;;#ASMSTART +; VI-NEXT: ; reg use v[0:3] +; VI-NEXT: ;;#ASMEND +; VI-NEXT: s_cbranch_execnz .LBB20_3 +; VI-NEXT: .LBB20_2: ; %bb1 +; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: ;;#ASMSTART +; VI-NEXT: ; reg use v[0:3] +; VI-NEXT: ;;#ASMEND +; VI-NEXT: .LBB20_3: ; %bb7 +; VI-NEXT: flat_store_dwordx4 v[0:1], v[0:3] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_endpgm +; VI-NEXT: .LBB20_4: +; VI-NEXT: s_branch .LBB20_2 +; +; GFX9-IDXMODE-LABEL: insert_adjacent_blocks: +; GFX9-IDXMODE: ; %bb.0: ; %bb +; GFX9-IDXMODE-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX9-IDXMODE-NEXT: s_cbranch_scc0 .LBB20_4 +; GFX9-IDXMODE-NEXT: ; %bb.1: ; %bb4 +; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[0:3], v[0:1], off glc +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: ;;#ASMSTART +; GFX9-IDXMODE-NEXT: ; reg use v[0:3] +; GFX9-IDXMODE-NEXT: ;;#ASMEND +; GFX9-IDXMODE-NEXT: s_cbranch_execnz .LBB20_3 +; GFX9-IDXMODE-NEXT: .LBB20_2: ; %bb1 +; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[0:3], v[0:1], off glc +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: ;;#ASMSTART +; GFX9-IDXMODE-NEXT: ; reg use v[0:3] +; GFX9-IDXMODE-NEXT: ;;#ASMEND +; GFX9-IDXMODE-NEXT: .LBB20_3: ; %bb7 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: s_endpgm +; 
GFX9-IDXMODE-NEXT: .LBB20_4: +; GFX9-IDXMODE-NEXT: s_branch .LBB20_2 +bb: + %tmp = icmp eq i32 %arg, 0 + br i1 %tmp, label %bb1, label %bb4 + +bb1: %tmp2 = load volatile <4 x float>, ptr addrspace(1) undef %tmp3 = insertelement <4 x float> %tmp2, float %val0, i32 undef - call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp3) #0 ; Prevent block optimize out + call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp3) ; Prevent block optimize out br label %bb7 -bb4: ; preds = %bb +bb4: %tmp5 = load volatile <4 x float>, ptr addrspace(1) undef %tmp6 = insertelement <4 x float> %tmp5, float %val0, i32 undef - call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp6) #0 ; Prevent block optimize out + call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp6) ; Prevent block optimize out br label %bb7 -bb7: ; preds = %bb4, %bb1 +bb7: %tmp8 = phi <4 x float> [ %tmp3, %bb1 ], [ %tmp6, %bb4 ] store volatile <4 x float> %tmp8, ptr addrspace(1) undef ret void } ; FIXME: Should be able to fold zero input to movreld to inline imm? 
- -; GCN-LABEL: {{^}}multi_same_block: - -; GCN: s_load_dword [[ARG:s[0-9]+]] - -; MOVREL: v_mov_b32_e32 v{{[0-9]+}}, 0x41900000 -; MOVREL: v_mov_b32_e32 v{{[0-9]+}}, 0x41b0cccd -; MOVREL: s_waitcnt -; MOVREL: s_add_i32 m0, [[ARG]], -16 -; MOVREL: v_movreld_b32_e32 v{{[0-9]+}}, 4.0 -; MOVREL: v_movreld_b32_e32 v{{[0-9]+}}, -4.0 -; MOVREL: s_mov_b32 m0, -1 - - -; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 0x41900000 -; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 0x41b0cccd -; IDXMODE: s_waitcnt -; IDXMODE: s_add_i32 [[ARG]], [[ARG]], -16 -; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 4.0 -; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, -4.0 -; IDXMODE: s_set_gpr_idx_off - -; GCN: ds_write_b32 -; GCN: ds_write_b32 -; GCN: s_endpgm -define amdgpu_kernel void @multi_same_block(i32 %arg) #0 { +define amdgpu_kernel void @multi_same_block(i32 %arg) { +; NOOPT-LABEL: multi_same_block: +; NOOPT: ; %bb.0: ; %bb +; NOOPT-NEXT: s_load_dword s0, s[2:3], 0x9 +; NOOPT-NEXT: s_mov_b32 s8, 0x41900000 +; NOOPT-NEXT: ; implicit-def: $sgpr9 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr7 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr6 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr5 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr3 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr2 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr10 +; NOOPT-NEXT: v_mov_b32_e32 v12, s9 +; NOOPT-NEXT: v_mov_b32_e32 v7, s8 +; NOOPT-NEXT: v_mov_b32_e32 v6, s7 +; NOOPT-NEXT: v_mov_b32_e32 v5, s6 +; NOOPT-NEXT: v_mov_b32_e32 v4, s5 +; NOOPT-NEXT: v_mov_b32_e32 v3, s4 +; NOOPT-NEXT: v_mov_b32_e32 v2, s3 +; NOOPT-NEXT: v_mov_b32_e32 v1, s2 +; NOOPT-NEXT: v_mov_b32_e32 v0, s1 +; NOOPT-NEXT: ; kill: def $vgpr12 killed $vgpr12 def 
$vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v13, v7 +; NOOPT-NEXT: v_mov_b32_e32 v14, v6 +; NOOPT-NEXT: v_mov_b32_e32 v15, v5 +; NOOPT-NEXT: v_mov_b32_e32 v16, v4 +; NOOPT-NEXT: v_mov_b32_e32 v17, v3 +; NOOPT-NEXT: v_mov_b32_e32 v18, v2 +; NOOPT-NEXT: v_mov_b32_e32 v19, v1 +; NOOPT-NEXT: v_mov_b32_e32 v20, v0 +; NOOPT-NEXT: v_mov_b32_e32 v0, 4.0 +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_add_i32 m0, s0, -16 +; NOOPT-NEXT: v_movreld_b32_e32 v12, v0 +; NOOPT-NEXT: s_mov_b32 s4, 0x41b0cccd +; NOOPT-NEXT: ; implicit-def: $sgpr9 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr8 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr7 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr6 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr5 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr3 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr2 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr10 +; NOOPT-NEXT: v_mov_b32_e32 v3, s9 +; NOOPT-NEXT: v_mov_b32_e32 v25, s8 +; NOOPT-NEXT: v_mov_b32_e32 v24, s7 +; NOOPT-NEXT: v_mov_b32_e32 v23, s6 +; NOOPT-NEXT: v_mov_b32_e32 v22, s5 +; NOOPT-NEXT: v_mov_b32_e32 v21, s4 +; NOOPT-NEXT: v_mov_b32_e32 v2, s3 +; NOOPT-NEXT: v_mov_b32_e32 v1, s2 +; NOOPT-NEXT: v_mov_b32_e32 v0, s1 +; NOOPT-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v4, v25 +; NOOPT-NEXT: v_mov_b32_e32 v5, v24 +; NOOPT-NEXT: v_mov_b32_e32 v6, v23 +; NOOPT-NEXT: v_mov_b32_e32 v7, v22 +; NOOPT-NEXT: v_mov_b32_e32 v8, v21 +; NOOPT-NEXT: v_mov_b32_e32 v9, v2 +; NOOPT-NEXT: v_mov_b32_e32 v10, v1 +; NOOPT-NEXT: v_mov_b32_e32 v11, v0 +; NOOPT-NEXT: v_mov_b32_e32 v0, -4.0 +; NOOPT-NEXT: s_add_i32 m0, s0, -16 +; 
NOOPT-NEXT: v_movreld_b32_e32 v3, v0 +; NOOPT-NEXT: v_mov_b32_e32 v2, v13 +; NOOPT-NEXT: v_mov_b32_e32 v1, v8 +; NOOPT-NEXT: s_mov_b32 m0, -1 +; NOOPT-NEXT: ; implicit-def: $sgpr0 +; NOOPT-NEXT: v_mov_b32_e32 v0, s0 +; NOOPT-NEXT: ds_write_b32 v0, v2 +; NOOPT-NEXT: s_mov_b32 m0, -1 +; NOOPT-NEXT: ; implicit-def: $sgpr0 +; NOOPT-NEXT: v_mov_b32_e32 v0, s0 +; NOOPT-NEXT: ds_write_b32 v0, v1 +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: multi_same_block: +; SI-MOVREL: ; %bb.0: ; %bb +; SI-MOVREL-NEXT: s_load_dword s0, s[2:3], 0x9 +; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 0x41900000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v9, 0x41b0cccd +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: s_add_i32 m0, s0, -16 +; SI-MOVREL-NEXT: v_movreld_b32_e32 v0, 4.0 +; SI-MOVREL-NEXT: v_movreld_b32_e32 v4, -4.0 +; SI-MOVREL-NEXT: s_mov_b32 m0, -1 +; SI-MOVREL-NEXT: ds_write_b32 v0, v1 +; SI-MOVREL-NEXT: ds_write_b32 v0, v9 +; SI-MOVREL-NEXT: s_endpgm +; +; VI-MOVREL-LABEL: multi_same_block: +; VI-MOVREL: ; %bb.0: ; %bb +; VI-MOVREL-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 0x41900000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v9, 0x41b0cccd +; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; VI-MOVREL-NEXT: s_add_i32 m0, s0, -16 +; VI-MOVREL-NEXT: v_movreld_b32_e32 v0, 4.0 +; VI-MOVREL-NEXT: v_movreld_b32_e32 v4, -4.0 +; VI-MOVREL-NEXT: s_mov_b32 m0, -1 +; VI-MOVREL-NEXT: ds_write_b32 v0, v1 +; VI-MOVREL-NEXT: ds_write_b32 v0, v9 +; VI-MOVREL-NEXT: s_endpgm +; +; VI-IDXMODE-LABEL: multi_same_block: +; VI-IDXMODE: ; %bb.0: ; %bb +; VI-IDXMODE-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, 0x41900000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, 0x41b0cccd +; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; VI-IDXMODE-NEXT: s_add_i32 s0, s0, -16 +; VI-IDXMODE-NEXT: s_set_gpr_idx_on s0, gpr_idx(DST) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 4.0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, -4.0 +; VI-IDXMODE-NEXT: s_set_gpr_idx_off +; VI-IDXMODE-NEXT: s_mov_b32 m0, -1 +; 
VI-IDXMODE-NEXT: ds_write_b32 v0, v1 +; VI-IDXMODE-NEXT: ds_write_b32 v0, v9 +; VI-IDXMODE-NEXT: s_endpgm +; +; GFX9-IDXMODE-LABEL: multi_same_block: +; GFX9-IDXMODE: ; %bb.0: ; %bb +; GFX9-IDXMODE-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 0x41900000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, 0x41b0cccd +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: s_add_i32 s0, s0, -16 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s0, gpr_idx(DST) +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 4.0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, -4.0 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off +; GFX9-IDXMODE-NEXT: ds_write_b32 v0, v1 +; GFX9-IDXMODE-NEXT: ds_write_b32 v0, v9 +; GFX9-IDXMODE-NEXT: s_endpgm bb: %tmp1 = add i32 %arg, -16 %tmp2 = insertelement <9 x float> , float 4.000000e+00, i32 %tmp1 @@ -420,20 +6595,192 @@ bb: } ; offset puts outside of superregister boundaries, so clamp to 1st element. -; GCN-LABEL: {{^}}extract_largest_inbounds_offset: -; GCN-DAG: buffer_load_dwordx4 v[[[LO_ELT:[0-9]+]]:[[HI_ELT:[0-9]+]] -; GCN-DAG: s_load_dword [[IDX0:s[0-9]+]] -; GCN-DAG: s_add_i32 [[IDX:s[0-9]+]], [[IDX0]], 15 - -; MOVREL: s_mov_b32 m0, [[IDX]] -; MOVREL: v_movrels_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]] - -; IDXMODE: s_set_gpr_idx_on [[IDX]], gpr_idx(SRC0) -; IDXMODE: v_mov_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]] -; IDXMODE: s_set_gpr_idx_off - -; GCN: buffer_store_dword [[EXTRACT]] define amdgpu_kernel void @extract_largest_inbounds_offset(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx) { +; NOOPT-LABEL: extract_largest_inbounds_offset: +; NOOPT: ; %bb.0: ; %entry +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; NOOPT-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xb +; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xd +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b32 s7, s1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s5, 0xf000 +; NOOPT-NEXT: s_mov_b32 s6, -1 +; NOOPT-NEXT: ; kill: def $sgpr0
killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s7 +; NOOPT-NEXT: s_mov_b32 s2, s6 +; NOOPT-NEXT: s_mov_b32 s3, s5 +; NOOPT-NEXT: s_mov_b32 s7, s9 +; NOOPT-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 +; NOOPT-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 +; NOOPT-NEXT: s_mov_b32 s9, s7 +; NOOPT-NEXT: s_mov_b32 s10, s6 +; NOOPT-NEXT: s_mov_b32 s11, s5 +; NOOPT-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 glc +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 glc +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 glc +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 glc +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v16, v15 +; NOOPT-NEXT: v_mov_b32_e32 v17, v14 +; NOOPT-NEXT: v_mov_b32_e32 v18, v13 +; NOOPT-NEXT: v_mov_b32_e32 v19, v12 +; NOOPT-NEXT: v_mov_b32_e32 v20, v11 +; NOOPT-NEXT: v_mov_b32_e32 v21, v10 +; NOOPT-NEXT: v_mov_b32_e32 v22, v9 +; NOOPT-NEXT: v_mov_b32_e32 v23, v8 +; NOOPT-NEXT: v_mov_b32_e32 v24, v7 +; NOOPT-NEXT: v_mov_b32_e32 v25, v6 +; NOOPT-NEXT: v_mov_b32_e32 v26, v5 +; NOOPT-NEXT: v_mov_b32_e32 v27, v4 +; NOOPT-NEXT: v_mov_b32_e32 v28, v3 +; NOOPT-NEXT: v_mov_b32_e32 v29, v2 +; NOOPT-NEXT: v_mov_b32_e32 v30, v1 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v30 +; NOOPT-NEXT: v_mov_b32_e32 v2, v29 +; NOOPT-NEXT: v_mov_b32_e32 v3, v28 +; NOOPT-NEXT: v_mov_b32_e32 v4, v27 +; NOOPT-NEXT: v_mov_b32_e32 v5, v26 +; NOOPT-NEXT: v_mov_b32_e32 v6, v25 +; NOOPT-NEXT: v_mov_b32_e32 v7, v24 +; NOOPT-NEXT: v_mov_b32_e32 v8, v23 +; NOOPT-NEXT: v_mov_b32_e32 v9, v22 +; 
NOOPT-NEXT: v_mov_b32_e32 v10, v21 +; NOOPT-NEXT: v_mov_b32_e32 v11, v20 +; NOOPT-NEXT: v_mov_b32_e32 v12, v19 +; NOOPT-NEXT: v_mov_b32_e32 v13, v18 +; NOOPT-NEXT: v_mov_b32_e32 v14, v17 +; NOOPT-NEXT: v_mov_b32_e32 v15, v16 +; NOOPT-NEXT: s_mov_b32 s5, 15 +; NOOPT-NEXT: s_add_i32 s4, s4, s5 +; NOOPT-NEXT: s_mov_b32 m0, s4 +; NOOPT-NEXT: v_movrels_b32_e32 v0, v0 +; NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: extract_largest_inbounds_offset: +; SI-MOVREL: ; %bb.0: ; %entry +; SI-MOVREL-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; SI-MOVREL-NEXT: s_load_dword s12, s[2:3], 0xd +; SI-MOVREL-NEXT: s_mov_b32 s7, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s6, -1 +; SI-MOVREL-NEXT: s_mov_b32 s2, s6 +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: s_mov_b32 s0, s10 +; SI-MOVREL-NEXT: s_mov_b32 s1, s11 +; SI-MOVREL-NEXT: s_mov_b32 s3, s7 +; SI-MOVREL-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 glc +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 glc +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 glc +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: s_add_i32 s12, s12, 15 +; SI-MOVREL-NEXT: s_mov_b32 m0, s12 +; SI-MOVREL-NEXT: s_mov_b32 s4, s8 +; SI-MOVREL-NEXT: s_mov_b32 s5, s9 +; SI-MOVREL-NEXT: v_movrels_b32_e32 v0, v0 +; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-MOVREL-NEXT: s_endpgm +; +; VI-MOVREL-LABEL: extract_largest_inbounds_offset: +; VI-MOVREL: ; %bb.0: ; %entry +; VI-MOVREL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-MOVREL-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s6 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s7 +; VI-MOVREL-NEXT: flat_load_dwordx4 v[0:3], v[0:1] glc +; 
VI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; VI-MOVREL-NEXT: s_add_u32 s0, s6, 48 +; VI-MOVREL-NEXT: s_addc_u32 s1, s7, 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0 +; VI-MOVREL-NEXT: s_add_u32 s0, s6, 32 +; VI-MOVREL-NEXT: s_addc_u32 s1, s7, 0 +; VI-MOVREL-NEXT: flat_load_dwordx4 v[12:15], v[4:5] glc +; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0 +; VI-MOVREL-NEXT: s_add_u32 s0, s6, 16 +; VI-MOVREL-NEXT: s_addc_u32 s1, s7, 0 +; VI-MOVREL-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc +; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1 +; VI-MOVREL-NEXT: s_add_i32 s2, s2, 15 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0 +; VI-MOVREL-NEXT: s_mov_b32 m0, s2 +; VI-MOVREL-NEXT: flat_load_dwordx4 v[4:7], v[4:5] glc +; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s4 +; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s5 +; VI-MOVREL-NEXT: v_movrels_b32_e32 v0, v0 +; VI-MOVREL-NEXT: flat_store_dword v[16:17], v0 +; VI-MOVREL-NEXT: s_endpgm +; +; VI-IDXMODE-LABEL: extract_largest_inbounds_offset: +; VI-IDXMODE: ; %bb.0: ; %entry +; VI-IDXMODE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-IDXMODE-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; VI-IDXMODE-NEXT: s_add_u32 s0, s6, 48 +; VI-IDXMODE-NEXT: s_addc_u32 s1, s7, 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s6 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0 +; VI-IDXMODE-NEXT: s_add_u32 s0, s6, 32 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s7 +; VI-IDXMODE-NEXT: s_addc_u32 s1, s7, 0 +; VI-IDXMODE-NEXT: flat_load_dwordx4 v[0:3], v[0:1] glc +; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; VI-IDXMODE-NEXT: flat_load_dwordx4 v[12:15], v[4:5] glc +; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0 +; VI-IDXMODE-NEXT: s_add_u32 s0, s6, 16 +; VI-IDXMODE-NEXT: s_addc_u32 s1, s7, 
0 +; VI-IDXMODE-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc +; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0 +; VI-IDXMODE-NEXT: flat_load_dwordx4 v[4:7], v[4:5] glc +; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, s4 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s5 +; VI-IDXMODE-NEXT: s_add_i32 s2, s2, 15 +; VI-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, v0 +; VI-IDXMODE-NEXT: s_set_gpr_idx_off +; VI-IDXMODE-NEXT: flat_store_dword v[16:17], v0 +; VI-IDXMODE-NEXT: s_endpgm +; +; GFX9-IDXMODE-LABEL: extract_largest_inbounds_offset: +; GFX9-IDXMODE: ; %bb.0: ; %entry +; GFX9-IDXMODE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0 +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48 glc +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32 glc +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16 glc +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] glc +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: s_add_i32 s0, s0, 15 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s0, gpr_idx(SRC0) +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, v0 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off +; GFX9-IDXMODE-NEXT: global_store_dword v16, v0, s[4:5] +; GFX9-IDXMODE-NEXT: s_endpgm entry: %ld = load volatile <16 x i32>, ptr addrspace(1) %in %offset = add i32 %idx, 15 @@ -442,20 +6789,192 @@ entry: ret void } -; GCN-LABEL: {{^}}extract_out_of_bounds_offset: -; GCN-DAG: buffer_load_dwordx4 v[[[LO_ELT:[0-9]+]]:[[HI_ELT:[0-9]+]]] -; GCN-DAG: s_load_dword [[IDX:s[0-9]+]] -; GCN: s_add_i32 [[ADD_IDX:s[0-9]+]], [[IDX]], 16 - -; MOVREL: 
s_mov_b32 m0, [[ADD_IDX]] -; MOVREL: v_movrels_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]] - -; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], gpr_idx(SRC0) -; IDXMODE: v_mov_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]] -; IDXMODE: s_set_gpr_idx_off - -; GCN: buffer_store_dword [[EXTRACT]] define amdgpu_kernel void @extract_out_of_bounds_offset(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx) { +; NOOPT-LABEL: extract_out_of_bounds_offset: +; NOOPT: ; %bb.0: ; %entry +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; NOOPT-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xb +; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xd +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b32 s7, s1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s5, 0xf000 +; NOOPT-NEXT: s_mov_b32 s6, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s7 +; NOOPT-NEXT: s_mov_b32 s2, s6 +; NOOPT-NEXT: s_mov_b32 s3, s5 +; NOOPT-NEXT: s_mov_b32 s7, s9 +; NOOPT-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 +; NOOPT-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 +; NOOPT-NEXT: s_mov_b32 s9, s7 +; NOOPT-NEXT: s_mov_b32 s10, s6 +; NOOPT-NEXT: s_mov_b32 s11, s5 +; NOOPT-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 glc +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 glc +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 glc +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 glc +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v16, v15 +; NOOPT-NEXT: v_mov_b32_e32 v17, v14 +; NOOPT-NEXT: v_mov_b32_e32 v18, v13 +; NOOPT-NEXT: v_mov_b32_e32 v19, v12 +; NOOPT-NEXT: v_mov_b32_e32 v20, v11 +; NOOPT-NEXT: v_mov_b32_e32 v21, v10 +; NOOPT-NEXT: v_mov_b32_e32 v22, v9 +; NOOPT-NEXT: v_mov_b32_e32 v23, v8 +; NOOPT-NEXT: 
v_mov_b32_e32 v24, v7 +; NOOPT-NEXT: v_mov_b32_e32 v25, v6 +; NOOPT-NEXT: v_mov_b32_e32 v26, v5 +; NOOPT-NEXT: v_mov_b32_e32 v27, v4 +; NOOPT-NEXT: v_mov_b32_e32 v28, v3 +; NOOPT-NEXT: v_mov_b32_e32 v29, v2 +; NOOPT-NEXT: v_mov_b32_e32 v30, v1 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v30 +; NOOPT-NEXT: v_mov_b32_e32 v2, v29 +; NOOPT-NEXT: v_mov_b32_e32 v3, v28 +; NOOPT-NEXT: v_mov_b32_e32 v4, v27 +; NOOPT-NEXT: v_mov_b32_e32 v5, v26 +; NOOPT-NEXT: v_mov_b32_e32 v6, v25 +; NOOPT-NEXT: v_mov_b32_e32 v7, v24 +; NOOPT-NEXT: v_mov_b32_e32 v8, v23 +; NOOPT-NEXT: v_mov_b32_e32 v9, v22 +; NOOPT-NEXT: v_mov_b32_e32 v10, v21 +; NOOPT-NEXT: v_mov_b32_e32 v11, v20 +; NOOPT-NEXT: v_mov_b32_e32 v12, v19 +; NOOPT-NEXT: v_mov_b32_e32 v13, v18 +; NOOPT-NEXT: v_mov_b32_e32 v14, v17 +; NOOPT-NEXT: v_mov_b32_e32 v15, v16 +; NOOPT-NEXT: s_mov_b32 s5, 16 +; NOOPT-NEXT: s_add_i32 s4, s4, s5 +; NOOPT-NEXT: s_mov_b32 m0, s4 +; NOOPT-NEXT: v_movrels_b32_e32 v0, v0 +; NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: extract_out_of_bounds_offset: +; SI-MOVREL: ; %bb.0: ; %entry +; SI-MOVREL-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; SI-MOVREL-NEXT: s_load_dword s12, s[2:3], 0xd +; SI-MOVREL-NEXT: s_mov_b32 s7, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s6, -1 +; SI-MOVREL-NEXT: s_mov_b32 s2, s6 +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: s_mov_b32 s0, s10 +; SI-MOVREL-NEXT: s_mov_b32 s1, s11 +; SI-MOVREL-NEXT: s_mov_b32 s3, s7 +; SI-MOVREL-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 glc +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: buffer_load_dwordx4 v[8:11], 
off, s[0:3], 0 offset:32 glc +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 glc +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: s_add_i32 s12, s12, 16 +; SI-MOVREL-NEXT: s_mov_b32 m0, s12 +; SI-MOVREL-NEXT: s_mov_b32 s4, s8 +; SI-MOVREL-NEXT: s_mov_b32 s5, s9 +; SI-MOVREL-NEXT: v_movrels_b32_e32 v0, v0 +; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-MOVREL-NEXT: s_endpgm +; +; VI-MOVREL-LABEL: extract_out_of_bounds_offset: +; VI-MOVREL: ; %bb.0: ; %entry +; VI-MOVREL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-MOVREL-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s6 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s7 +; VI-MOVREL-NEXT: flat_load_dwordx4 v[0:3], v[0:1] glc +; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; VI-MOVREL-NEXT: s_add_u32 s0, s6, 48 +; VI-MOVREL-NEXT: s_addc_u32 s1, s7, 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0 +; VI-MOVREL-NEXT: s_add_u32 s0, s6, 32 +; VI-MOVREL-NEXT: s_addc_u32 s1, s7, 0 +; VI-MOVREL-NEXT: flat_load_dwordx4 v[12:15], v[4:5] glc +; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0 +; VI-MOVREL-NEXT: s_add_u32 s0, s6, 16 +; VI-MOVREL-NEXT: s_addc_u32 s1, s7, 0 +; VI-MOVREL-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc +; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1 +; VI-MOVREL-NEXT: s_add_i32 s2, s2, 16 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0 +; VI-MOVREL-NEXT: s_mov_b32 m0, s2 +; VI-MOVREL-NEXT: flat_load_dwordx4 v[4:7], v[4:5] glc +; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s4 +; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s5 +; VI-MOVREL-NEXT: v_movrels_b32_e32 v0, v0 +; VI-MOVREL-NEXT: flat_store_dword v[16:17], v0 +; VI-MOVREL-NEXT: s_endpgm +; +; VI-IDXMODE-LABEL: extract_out_of_bounds_offset: +; VI-IDXMODE: ; %bb.0: ; %entry +; 
VI-IDXMODE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-IDXMODE-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; VI-IDXMODE-NEXT: s_add_u32 s0, s6, 48 +; VI-IDXMODE-NEXT: s_addc_u32 s1, s7, 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s6 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0 +; VI-IDXMODE-NEXT: s_add_u32 s0, s6, 32 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s7 +; VI-IDXMODE-NEXT: s_addc_u32 s1, s7, 0 +; VI-IDXMODE-NEXT: flat_load_dwordx4 v[0:3], v[0:1] glc +; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; VI-IDXMODE-NEXT: flat_load_dwordx4 v[12:15], v[4:5] glc +; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0 +; VI-IDXMODE-NEXT: s_add_u32 s0, s6, 16 +; VI-IDXMODE-NEXT: s_addc_u32 s1, s7, 0 +; VI-IDXMODE-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc +; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0 +; VI-IDXMODE-NEXT: flat_load_dwordx4 v[4:7], v[4:5] glc +; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, s4 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s5 +; VI-IDXMODE-NEXT: s_add_i32 s2, s2, 16 +; VI-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, v0 +; VI-IDXMODE-NEXT: s_set_gpr_idx_off +; VI-IDXMODE-NEXT: flat_store_dword v[16:17], v0 +; VI-IDXMODE-NEXT: s_endpgm +; +; GFX9-IDXMODE-LABEL: extract_out_of_bounds_offset: +; GFX9-IDXMODE: ; %bb.0: ; %entry +; GFX9-IDXMODE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0 +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48 glc +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32 glc +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: 
global_load_dwordx4 v[4:7], v16, s[6:7] offset:16 glc +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] glc +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: s_add_i32 s0, s0, 16 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s0, gpr_idx(SRC0) +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, v0 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off +; GFX9-IDXMODE-NEXT: global_store_dword v16, v0, s[4:5] +; GFX9-IDXMODE-NEXT: s_endpgm entry: %ld = load volatile <16 x i32>, ptr addrspace(1) %in %offset = add i32 %idx, 16 @@ -464,17 +6983,192 @@ entry: ret void } -; GCN-LABEL: {{^}}extractelement_v16i32_or_index: -; GCN: s_load_dword [[IDX_IN:s[0-9]+]] -; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]] - -; MOVREL: s_mov_b32 m0, [[IDX_SHL]] -; MOVREL: v_movrels_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} - -; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], gpr_idx(SRC0) -; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} -; IDXMODE: s_set_gpr_idx_off define amdgpu_kernel void @extractelement_v16i32_or_index(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx.in) { +; NOOPT-LABEL: extractelement_v16i32_or_index: +; NOOPT: ; %bb.0: ; %entry +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; NOOPT-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xb +; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xd +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b32 s7, s1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s5, 0xf000 +; NOOPT-NEXT: s_mov_b32 s6, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s7 +; NOOPT-NEXT: s_mov_b32 s2, s6 +; NOOPT-NEXT: s_mov_b32 s3, s5 +; NOOPT-NEXT: s_mov_b32 s7, s9 +; NOOPT-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 +; NOOPT-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 +; NOOPT-NEXT: s_mov_b32 s9, s7 +; NOOPT-NEXT: s_mov_b32 s10, s6 +; NOOPT-NEXT: s_mov_b32 s11, s5 +; NOOPT-NEXT: 
buffer_load_dwordx4 v[0:3], off, s[8:11], 0 glc +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 glc +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 glc +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 glc +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v16, v15 +; NOOPT-NEXT: v_mov_b32_e32 v17, v14 +; NOOPT-NEXT: v_mov_b32_e32 v18, v13 +; NOOPT-NEXT: v_mov_b32_e32 v19, v12 +; NOOPT-NEXT: v_mov_b32_e32 v20, v11 +; NOOPT-NEXT: v_mov_b32_e32 v21, v10 +; NOOPT-NEXT: v_mov_b32_e32 v22, v9 +; NOOPT-NEXT: v_mov_b32_e32 v23, v8 +; NOOPT-NEXT: v_mov_b32_e32 v24, v7 +; NOOPT-NEXT: v_mov_b32_e32 v25, v6 +; NOOPT-NEXT: v_mov_b32_e32 v26, v5 +; NOOPT-NEXT: v_mov_b32_e32 v27, v4 +; NOOPT-NEXT: v_mov_b32_e32 v28, v3 +; NOOPT-NEXT: v_mov_b32_e32 v29, v2 +; NOOPT-NEXT: v_mov_b32_e32 v30, v1 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v30 +; NOOPT-NEXT: v_mov_b32_e32 v2, v29 +; NOOPT-NEXT: v_mov_b32_e32 v3, v28 +; NOOPT-NEXT: v_mov_b32_e32 v4, v27 +; NOOPT-NEXT: v_mov_b32_e32 v5, v26 +; NOOPT-NEXT: v_mov_b32_e32 v6, v25 +; NOOPT-NEXT: v_mov_b32_e32 v7, v24 +; NOOPT-NEXT: v_mov_b32_e32 v8, v23 +; NOOPT-NEXT: v_mov_b32_e32 v9, v22 +; NOOPT-NEXT: v_mov_b32_e32 v10, v21 +; NOOPT-NEXT: v_mov_b32_e32 v11, v20 +; NOOPT-NEXT: v_mov_b32_e32 v12, v19 +; NOOPT-NEXT: v_mov_b32_e32 v13, v18 +; NOOPT-NEXT: v_mov_b32_e32 v14, v17 +; NOOPT-NEXT: v_mov_b32_e32 v15, v16 +; NOOPT-NEXT: s_mov_b32 s5, 2 +; NOOPT-NEXT: s_lshl_b32 s4, s4, s5 +; NOOPT-NEXT: s_mov_b32 m0, s4 +; NOOPT-NEXT: v_movrels_b32_e32 v0, v1 +; NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; NOOPT-NEXT: 
s_endpgm +; +; SI-MOVREL-LABEL: extractelement_v16i32_or_index: +; SI-MOVREL: ; %bb.0: ; %entry +; SI-MOVREL-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; SI-MOVREL-NEXT: s_load_dword s12, s[2:3], 0xd +; SI-MOVREL-NEXT: s_mov_b32 s7, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s6, -1 +; SI-MOVREL-NEXT: s_mov_b32 s2, s6 +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: s_mov_b32 s0, s10 +; SI-MOVREL-NEXT: s_mov_b32 s1, s11 +; SI-MOVREL-NEXT: s_mov_b32 s3, s7 +; SI-MOVREL-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 glc +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 glc +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 glc +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: s_lshl_b32 s0, s12, 2 +; SI-MOVREL-NEXT: s_mov_b32 m0, s0 +; SI-MOVREL-NEXT: s_mov_b32 s4, s8 +; SI-MOVREL-NEXT: s_mov_b32 s5, s9 +; SI-MOVREL-NEXT: v_movrels_b32_e32 v0, v1 +; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-MOVREL-NEXT: s_endpgm +; +; VI-MOVREL-LABEL: extractelement_v16i32_or_index: +; VI-MOVREL: ; %bb.0: ; %entry +; VI-MOVREL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-MOVREL-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s6 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s7 +; VI-MOVREL-NEXT: flat_load_dwordx4 v[0:3], v[0:1] glc +; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; VI-MOVREL-NEXT: s_add_u32 s0, s6, 48 +; VI-MOVREL-NEXT: s_addc_u32 s1, s7, 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0 +; VI-MOVREL-NEXT: s_add_u32 s0, s6, 32 +; VI-MOVREL-NEXT: s_addc_u32 s1, s7, 0 +; VI-MOVREL-NEXT: flat_load_dwordx4 v[12:15], v[4:5] glc +; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1 +; VI-MOVREL-NEXT: 
v_mov_b32_e32 v4, s0 +; VI-MOVREL-NEXT: s_add_u32 s0, s6, 16 +; VI-MOVREL-NEXT: s_addc_u32 s1, s7, 0 +; VI-MOVREL-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc +; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0 +; VI-MOVREL-NEXT: s_lshl_b32 s0, s2, 2 +; VI-MOVREL-NEXT: s_mov_b32 m0, s0 +; VI-MOVREL-NEXT: flat_load_dwordx4 v[4:7], v[4:5] glc +; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s4 +; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s5 +; VI-MOVREL-NEXT: v_movrels_b32_e32 v0, v1 +; VI-MOVREL-NEXT: flat_store_dword v[16:17], v0 +; VI-MOVREL-NEXT: s_endpgm +; +; VI-IDXMODE-LABEL: extractelement_v16i32_or_index: +; VI-IDXMODE: ; %bb.0: ; %entry +; VI-IDXMODE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-IDXMODE-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; VI-IDXMODE-NEXT: s_add_u32 s0, s6, 48 +; VI-IDXMODE-NEXT: s_addc_u32 s1, s7, 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s6 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0 +; VI-IDXMODE-NEXT: s_add_u32 s0, s6, 32 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s7 +; VI-IDXMODE-NEXT: s_addc_u32 s1, s7, 0 +; VI-IDXMODE-NEXT: flat_load_dwordx4 v[0:3], v[0:1] glc +; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; VI-IDXMODE-NEXT: flat_load_dwordx4 v[12:15], v[4:5] glc +; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0 +; VI-IDXMODE-NEXT: s_add_u32 s0, s6, 16 +; VI-IDXMODE-NEXT: s_addc_u32 s1, s7, 0 +; VI-IDXMODE-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc +; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0 +; VI-IDXMODE-NEXT: flat_load_dwordx4 v[4:7], v[4:5] glc +; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, s4 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s5 +; VI-IDXMODE-NEXT: s_lshl_b32 s0, s2, 2 +; VI-IDXMODE-NEXT: 
s_set_gpr_idx_on s0, gpr_idx(SRC0) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, v1 +; VI-IDXMODE-NEXT: s_set_gpr_idx_off +; VI-IDXMODE-NEXT: flat_store_dword v[16:17], v0 +; VI-IDXMODE-NEXT: s_endpgm +; +; GFX9-IDXMODE-LABEL: extractelement_v16i32_or_index: +; GFX9-IDXMODE: ; %bb.0: ; %entry +; GFX9-IDXMODE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0 +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48 glc +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32 glc +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16 glc +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] glc +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: s_lshl_b32 s0, s0, 2 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s0, gpr_idx(SRC0) +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off +; GFX9-IDXMODE-NEXT: global_store_dword v16, v0, s[4:5] +; GFX9-IDXMODE-NEXT: s_endpgm entry: %ld = load volatile <16 x i32>, ptr addrspace(1) %in %idx.shl = shl i32 %idx.in, 2 @@ -484,17 +7178,249 @@ entry: ret void } -; GCN-LABEL: {{^}}insertelement_v16f32_or_index: -; GCN: s_load_dword [[IDX_IN:s[0-9]+]] -; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]] - -; MOVREL: s_mov_b32 m0, [[IDX_SHL]] -; MOVREL: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} - -; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], gpr_idx(DST) -; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} -; IDXMODE: s_set_gpr_idx_off define amdgpu_kernel void @insertelement_v16f32_or_index(ptr addrspace(1) %out, <16 x float> %a, i32 %idx.in) nounwind { +; NOOPT-LABEL: insertelement_v16f32_or_index: +; NOOPT: ; %bb.0: +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; NOOPT-NEXT: s_load_dwordx16 s[8:23], 
s[2:3], 0x19 +; NOOPT-NEXT: s_load_dword s4, s[2:3], 0x29 +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b32 s7, s1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s5, 0xf000 +; NOOPT-NEXT: s_mov_b32 s6, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s7 +; NOOPT-NEXT: s_mov_b32 s2, s6 +; NOOPT-NEXT: s_mov_b32 s3, s5 +; NOOPT-NEXT: s_mov_b32 s5, 2 +; NOOPT-NEXT: s_lshl_b32 s4, s4, s5 +; NOOPT-NEXT: v_mov_b32_e32 v0, 0x40a00000 +; NOOPT-NEXT: v_mov_b32_e32 v7, s8 +; NOOPT-NEXT: v_mov_b32_e32 v8, s9 +; NOOPT-NEXT: v_mov_b32_e32 v9, s10 +; NOOPT-NEXT: v_mov_b32_e32 v10, s11 +; NOOPT-NEXT: v_mov_b32_e32 v11, s12 +; NOOPT-NEXT: v_mov_b32_e32 v12, s13 +; NOOPT-NEXT: v_mov_b32_e32 v13, s14 +; NOOPT-NEXT: v_mov_b32_e32 v14, s15 +; NOOPT-NEXT: v_mov_b32_e32 v15, s16 +; NOOPT-NEXT: v_mov_b32_e32 v16, s17 +; NOOPT-NEXT: v_mov_b32_e32 v17, s18 +; NOOPT-NEXT: v_mov_b32_e32 v18, s19 +; NOOPT-NEXT: v_mov_b32_e32 v19, s20 +; NOOPT-NEXT: v_mov_b32_e32 v20, s21 +; NOOPT-NEXT: v_mov_b32_e32 v21, s22 +; NOOPT-NEXT: v_mov_b32_e32 v22, s23 +; NOOPT-NEXT: s_mov_b32 m0, s4 +; NOOPT-NEXT: v_movreld_b32_e32 v8, v0 +; NOOPT-NEXT: v_mov_b32_e32 v4, v22 +; NOOPT-NEXT: v_mov_b32_e32 v5, v21 +; NOOPT-NEXT: v_mov_b32_e32 v6, v20 +; NOOPT-NEXT: v_mov_b32_e32 v0, v19 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; NOOPT-NEXT: v_mov_b32_e32 v4, v18 +; NOOPT-NEXT: v_mov_b32_e32 v5, v17 +; NOOPT-NEXT: v_mov_b32_e32 v6, v16 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, v15 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: 
v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; NOOPT-NEXT: v_mov_b32_e32 v4, v14 +; NOOPT-NEXT: v_mov_b32_e32 v5, v13 +; NOOPT-NEXT: v_mov_b32_e32 v6, v12 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, v11 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; NOOPT-NEXT: v_mov_b32_e32 v4, v10 +; NOOPT-NEXT: v_mov_b32_e32 v5, v9 +; NOOPT-NEXT: v_mov_b32_e32 v6, v8 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, v7 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: insertelement_v16f32_or_index: +; SI-MOVREL: ; %bb.0: +; SI-MOVREL-NEXT: s_load_dword s0, s[2:3], 0x29 +; SI-MOVREL-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 +; SI-MOVREL-NEXT: s_load_dwordx2 s[20:21], s[2:3], 0x9 +; SI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x40a00000 +; SI-MOVREL-NEXT: s_mov_b32 s23, 0xf000 +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: s_lshl_b32 s0, s0, 2 +; SI-MOVREL-NEXT: v_mov_b32_e32 v0, s4 +; SI-MOVREL-NEXT: v_mov_b32_e32 v1, s5 +; SI-MOVREL-NEXT: v_mov_b32_e32 v2, s6 +; SI-MOVREL-NEXT: v_mov_b32_e32 v3, s7 +; SI-MOVREL-NEXT: v_mov_b32_e32 v4, s8 +; SI-MOVREL-NEXT: v_mov_b32_e32 v5, s9 +; SI-MOVREL-NEXT: v_mov_b32_e32 v6, s10 +; SI-MOVREL-NEXT: v_mov_b32_e32 v7, s11 +; SI-MOVREL-NEXT: v_mov_b32_e32 v8, s12 +; SI-MOVREL-NEXT: v_mov_b32_e32 v9, s13 +; SI-MOVREL-NEXT: v_mov_b32_e32 v10, s14 +; SI-MOVREL-NEXT: v_mov_b32_e32 v11, s15 +; SI-MOVREL-NEXT: v_mov_b32_e32 v12, s16 +; SI-MOVREL-NEXT: v_mov_b32_e32 v13, s17 +; 
SI-MOVREL-NEXT: v_mov_b32_e32 v14, s18 +; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s19 +; SI-MOVREL-NEXT: s_mov_b32 m0, s0 +; SI-MOVREL-NEXT: s_mov_b32 s22, -1 +; SI-MOVREL-NEXT: v_movreld_b32_e32 v1, v16 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[12:15], off, s[20:23], 0 offset:48 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[8:11], off, s[20:23], 0 offset:32 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[4:7], off, s[20:23], 0 offset:16 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 +; SI-MOVREL-NEXT: s_endpgm +; +; VI-MOVREL-LABEL: insertelement_v16f32_or_index: +; VI-MOVREL: ; %bb.0: +; VI-MOVREL-NEXT: s_load_dword s20, s[2:3], 0xa4 +; VI-MOVREL-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 +; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x40a00000 +; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; VI-MOVREL-NEXT: s_lshl_b32 s2, s20, 2 +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s4 +; VI-MOVREL-NEXT: s_mov_b32 m0, s2 +; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v2, s6 +; VI-MOVREL-NEXT: v_mov_b32_e32 v3, s7 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s8 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s9 +; VI-MOVREL-NEXT: v_mov_b32_e32 v6, s10 +; VI-MOVREL-NEXT: v_mov_b32_e32 v7, s11 +; VI-MOVREL-NEXT: v_mov_b32_e32 v8, s12 +; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s13 +; VI-MOVREL-NEXT: v_mov_b32_e32 v10, s14 +; VI-MOVREL-NEXT: v_mov_b32_e32 v11, s15 +; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s16 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s17 +; VI-MOVREL-NEXT: v_mov_b32_e32 v14, s18 +; VI-MOVREL-NEXT: v_mov_b32_e32 v15, s19 +; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 +; VI-MOVREL-NEXT: v_movreld_b32_e32 v1, v16 +; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s2 +; VI-MOVREL-NEXT: s_add_u32 s2, s0, 32 +; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; VI-MOVREL-NEXT: s_nop 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3 +; 
VI-MOVREL-NEXT: v_mov_b32_e32 v12, s2 +; VI-MOVREL-NEXT: s_add_u32 s2, s0, 16 +; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; VI-MOVREL-NEXT: s_nop 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v8, s2 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; VI-MOVREL-NEXT: s_nop 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-MOVREL-NEXT: s_endpgm +; +; VI-IDXMODE-LABEL: insertelement_v16f32_or_index: +; VI-IDXMODE: ; %bb.0: +; VI-IDXMODE-NEXT: s_load_dword s20, s[2:3], 0xa4 +; VI-IDXMODE-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 +; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, 0x40a00000 +; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; VI-IDXMODE-NEXT: s_lshl_b32 s3, s20, 2 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s4 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s5 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, s6 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, s7 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s8 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s9 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, s10 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, s11 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, s12 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, s13 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, s14 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, s15 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s16 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s17 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, s18 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, s19 +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 48 +; VI-IDXMODE-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, v16 +; VI-IDXMODE-NEXT: s_set_gpr_idx_off +; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, s2 +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 32 +; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 +; VI-IDXMODE-NEXT: flat_store_dwordx4 
v[16:17], v[12:15] +; VI-IDXMODE-NEXT: s_nop 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s2 +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 16 +; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; VI-IDXMODE-NEXT: s_nop 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, s2 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; VI-IDXMODE-NEXT: s_nop 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-IDXMODE-NEXT: s_endpgm +; +; GFX9-IDXMODE-LABEL: insertelement_v16f32_or_index: +; GFX9-IDXMODE: ; %bb.0: +; GFX9-IDXMODE-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 +; GFX9-IDXMODE-NEXT: s_load_dword s20, s[2:3], 0xa4 +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v17, 0x40a00000 +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, s13 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, s14 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, s15 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, s16 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, s17 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, s18 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, s19 +; GFX9-IDXMODE-NEXT: s_lshl_b32 s2, s20, 2 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, v17 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; 
GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX9-IDXMODE-NEXT: s_endpgm %idx.shl = shl i32 %idx.in, 2 %idx = or i32 %idx.shl, 1 %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %idx @@ -502,57 +7428,891 @@ define amdgpu_kernel void @insertelement_v16f32_or_index(ptr addrspace(1) %out, ret void } -; GCN-LABEL: {{^}}broken_phi_bb: -; GCN: v_mov_b32_e32 [[PHIREG:v[0-9]+]], 8 - -; GCN: {{.LBB[0-9]+_[0-9]+}}: -; GCN: [[BB2:.LBB[0-9]+_[0-9]+]]: -; GCN: v_cmp_le_i32_e32 vcc, s{{[0-9]+}}, [[PHIREG]] -; GCN: buffer_load_dword - -; GCN: [[REGLOOP:.LBB[0-9]+_[0-9]+]]: -; MOVREL: v_movreld_b32_e32 - -; IDXMODE: s_set_gpr_idx_on -; IDXMODE: v_mov_b32_e32 -; IDXMODE: s_set_gpr_idx_off - -; GCN: s_cbranch_execnz [[REGLOOP]] - -; GCN: {{^; %bb.[0-9]}}: -; GCN: s_mov_b64 exec, -; GCN: s_cbranch_execnz [[BB2]] - -define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) #0 { +define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) { +; NOOPT-LABEL: broken_phi_bb: +; NOOPT: ; %bb.0: ; %bb +; NOOPT-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0 +; NOOPT-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 +; NOOPT-NEXT: s_mov_b32 s26, -1 +; NOOPT-NEXT: s_mov_b32 s27, 0xe8f000 +; NOOPT-NEXT: s_add_u32 s24, s24, s9 +; NOOPT-NEXT: s_addc_u32 s25, s25, 0 +; NOOPT-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane +; NOOPT-NEXT: s_load_dword s1, s[2:3], 0x9 +; NOOPT-NEXT: s_load_dword s0, s[2:3], 0xa +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: v_writelane_b32 v0, s1, 0 +; NOOPT-NEXT: s_mov_b32 s1, 8 +; NOOPT-NEXT: v_writelane_b32 v0, s0, 1 +; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1 +; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 exec, s[20:21] +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, 8 +; NOOPT-NEXT: 
buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: .LBB26_1: ; %bb2 +; NOOPT-NEXT: ; =>This Loop Header: Depth=1 +; NOOPT-NEXT: ; Child Loop BB26_3 Depth 2 +; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[20:21] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s2, v0, 0 +; NOOPT-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 s[0:1], -1 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_cmp_ge_i32_e64 s[2:3], v1, s2 +; NOOPT-NEXT: v_mov_b32_e32 v1, s4 +; NOOPT-NEXT: s_and_b64 vcc, exec, s[2:3] +; NOOPT-NEXT: buffer_store_dword v1, off, s[24:27], 0 offset:8 ; 4-byte Folded Spill +; NOOPT-NEXT: v_writelane_b32 v0, s0, 2 +; NOOPT-NEXT: v_writelane_b32 v0, s1, 3 +; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1 +; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 exec, s[20:21] +; NOOPT-NEXT: s_cbranch_vccnz .LBB26_6 +; NOOPT-NEXT: ; %bb.2: ; %bb4 +; NOOPT-NEXT: ; in Loop: Header=BB26_1 Depth=1 +; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1 +; NOOPT-NEXT: buffer_load_dword v16, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[20:21] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v16, 1 +; NOOPT-NEXT: ; implicit-def: $sgpr2_sgpr3 +; NOOPT-NEXT: ; kill: def $sgpr3 killed $sgpr3 killed $sgpr2_sgpr3 +; NOOPT-NEXT: ; implicit-def: $sgpr4_sgpr5 +; NOOPT-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; NOOPT-NEXT: s_mov_b32 s1, 0xf000 +; NOOPT-NEXT: s_mov_b32 s2, -1 +; NOOPT-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; NOOPT-NEXT: s_mov_b32 s5, s3 +; NOOPT-NEXT: s_mov_b32 s6, s2 +; NOOPT-NEXT: s_mov_b32 s7, s1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: 
buffer_load_dword v0, off, s[4:7], 0 glc +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:80 ; 4-byte Folded Spill +; NOOPT-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, s4 +; NOOPT-NEXT: v_mov_b32_e32 v1, s5 +; NOOPT-NEXT: v_mov_b32_e32 v2, s6 +; NOOPT-NEXT: v_mov_b32_e32 v3, s7 +; NOOPT-NEXT: v_mov_b32_e32 v4, s8 +; NOOPT-NEXT: v_mov_b32_e32 v5, s9 +; NOOPT-NEXT: v_mov_b32_e32 v6, s10 +; NOOPT-NEXT: v_mov_b32_e32 v7, s11 +; NOOPT-NEXT: v_mov_b32_e32 v8, s12 +; NOOPT-NEXT: v_mov_b32_e32 v9, s13 +; NOOPT-NEXT: v_mov_b32_e32 v10, s14 +; NOOPT-NEXT: v_mov_b32_e32 v11, s15 +; NOOPT-NEXT: v_mov_b32_e32 v12, s16 +; NOOPT-NEXT: v_mov_b32_e32 v13, s17 +; NOOPT-NEXT: v_mov_b32_e32 v14, s18 +; NOOPT-NEXT: v_mov_b32_e32 v15, s19 +; NOOPT-NEXT: v_mov_b32_e32 v17, s0 +; NOOPT-NEXT: buffer_store_dword v17, off, s[24:27], 0 offset:76 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 s[0:1], exec +; NOOPT-NEXT: v_writelane_b32 v16, s0, 4 +; NOOPT-NEXT: v_writelane_b32 v16, s1, 5 +; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1 +; NOOPT-NEXT: buffer_store_dword v16, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 exec, s[20:21] +; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:12 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v1, off, s[24:27], 0 offset:16 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v2, off, s[24:27], 0 offset:20 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[24:27], 0 offset:24 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[24:27], 0 offset:28 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[24:27], 0 offset:32 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[24:27], 0 offset:36 ; 4-byte Folded Spill +; 
NOOPT-NEXT: buffer_store_dword v7, off, s[24:27], 0 offset:40 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[24:27], 0 offset:44 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[24:27], 0 offset:48 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[24:27], 0 offset:52 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[24:27], 0 offset:56 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[24:27], 0 offset:60 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[24:27], 0 offset:64 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[24:27], 0 offset:68 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[24:27], 0 offset:72 ; 4-byte Folded Spill +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 +; NOOPT-NEXT: .LBB26_3: ; Parent Loop BB26_1 Depth=1 +; NOOPT-NEXT: ; => This Inner Loop Header: Depth=2 +; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[20:21] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 6 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 7 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v1, off, s[24:27], 0 offset:12 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v2, off, s[24:27], 0 offset:16 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v3, off, s[24:27], 0 offset:20 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v4, off, s[24:27], 0 offset:24 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v5, off, s[24:27], 0 offset:28 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v6, off, s[24:27], 0 offset:32 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v7, off, s[24:27], 0 offset:36 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v8, off, s[24:27], 0 offset:40 ; 4-byte Folded Reload +; 
NOOPT-NEXT: buffer_load_dword v9, off, s[24:27], 0 offset:44 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v10, off, s[24:27], 0 offset:48 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v11, off, s[24:27], 0 offset:52 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v12, off, s[24:27], 0 offset:56 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v13, off, s[24:27], 0 offset:60 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v14, off, s[24:27], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v15, off, s[24:27], 0 offset:68 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v16, off, s[24:27], 0 offset:72 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v17, off, s[24:27], 0 offset:76 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v18, off, s[24:27], 0 offset:80 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readfirstlane_b32 s2, v18 +; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v18 +; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; NOOPT-NEXT: s_mov_b32 m0, s2 +; NOOPT-NEXT: v_movreld_b32_e32 v1, v17 +; NOOPT-NEXT: buffer_store_dword v1, off, s[24:27], 0 offset:84 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v2, off, s[24:27], 0 offset:88 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[24:27], 0 offset:92 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[24:27], 0 offset:96 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[24:27], 0 offset:100 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[24:27], 0 offset:104 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[24:27], 0 offset:108 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[24:27], 0 offset:112 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[24:27], 0 offset:116 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[24:27], 
0 offset:120 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[24:27], 0 offset:124 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[24:27], 0 offset:128 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[24:27], 0 offset:132 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[24:27], 0 offset:136 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[24:27], 0 offset:140 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v16, off, s[24:27], 0 offset:144 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_store_dword v1, off, s[24:27], 0 offset:12 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v2, off, s[24:27], 0 offset:16 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[24:27], 0 offset:20 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[24:27], 0 offset:24 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[24:27], 0 offset:28 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[24:27], 0 offset:32 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[24:27], 0 offset:36 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[24:27], 0 offset:40 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[24:27], 0 offset:44 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[24:27], 0 offset:48 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[24:27], 0 offset:52 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[24:27], 0 offset:56 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[24:27], 0 offset:60 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[24:27], 0 offset:64 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[24:27], 0 offset:68 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v16, off, 
s[24:27], 0 offset:72 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 s[2:3], s[0:1] +; NOOPT-NEXT: v_writelane_b32 v0, s2, 6 +; NOOPT-NEXT: v_writelane_b32 v0, s3, 7 +; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1 +; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 exec, s[20:21] +; NOOPT-NEXT: s_xor_b64 exec, exec, s[0:1] +; NOOPT-NEXT: s_cbranch_execnz .LBB26_3 +; NOOPT-NEXT: ; %bb.4: ; in Loop: Header=BB26_1 Depth=1 +; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[20:21] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 4 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 5 +; NOOPT-NEXT: s_mov_b64 exec, s[0:1] +; NOOPT-NEXT: ; %bb.5: ; in Loop: Header=BB26_1 Depth=1 +; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1 +; NOOPT-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[20:21] +; NOOPT-NEXT: buffer_load_dword v1, off, s[24:27], 0 offset:84 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v2, off, s[24:27], 0 offset:88 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v3, off, s[24:27], 0 offset:92 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v4, off, s[24:27], 0 offset:96 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v5, off, s[24:27], 0 offset:100 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v6, off, s[24:27], 0 offset:104 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v7, off, s[24:27], 0 offset:108 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v8, off, s[24:27], 0 offset:112 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v9, off, s[24:27], 0 offset:116 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v10, off, s[24:27], 0 offset:120 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v11, off, s[24:27], 0 
offset:124 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v12, off, s[24:27], 0 offset:128 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v13, off, s[24:27], 0 offset:132 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v14, off, s[24:27], 0 offset:136 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v15, off, s[24:27], 0 offset:140 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v16, off, s[24:27], 0 offset:144 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 s[0:1], 0 +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v1, off, s[24:27], 0 offset:8 ; 4-byte Folded Spill +; NOOPT-NEXT: v_writelane_b32 v0, s0, 2 +; NOOPT-NEXT: v_writelane_b32 v0, s1, 3 +; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1 +; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 exec, s[20:21] +; NOOPT-NEXT: .LBB26_6: ; %Flow +; NOOPT-NEXT: ; in Loop: Header=BB26_1 Depth=1 +; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1 +; NOOPT-NEXT: s_waitcnt expcnt(1) +; NOOPT-NEXT: buffer_load_dword v1, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[20:21] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v1, 2 +; NOOPT-NEXT: v_readlane_b32 s1, v1, 3 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:8 ; 4-byte Folded Reload +; NOOPT-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; NOOPT-NEXT: s_mov_b32 s0, 1 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: v_cmp_ne_u32_e64 s[0:1], v1, s0 +; NOOPT-NEXT: s_and_b64 vcc, exec, s[0:1] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: s_cbranch_vccnz .LBB26_1 +; NOOPT-NEXT: ; %bb.7: ; %bb8 +; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, 
s[20:21] +; NOOPT-NEXT: ; kill: killed $vgpr0 +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: broken_phi_bb: +; SI-MOVREL: ; %bb.0: ; %bb +; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-MOVREL-NEXT: v_mov_b32_e32 v0, 8 +; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s2, -1 +; SI-MOVREL-NEXT: s_branch .LBB26_2 +; SI-MOVREL-NEXT: .LBB26_1: +; SI-MOVREL-NEXT: ; implicit-def: $vgpr0 +; SI-MOVREL-NEXT: s_branch .LBB26_6 +; SI-MOVREL-NEXT: .LBB26_2: ; %bb2 +; SI-MOVREL-NEXT: ; =>This Loop Header: Depth=1 +; SI-MOVREL-NEXT: ; Child Loop BB26_4 Depth 2 +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: v_cmp_le_i32_e32 vcc, s0, v0 +; SI-MOVREL-NEXT: s_cbranch_vccnz .LBB26_1 +; SI-MOVREL-NEXT: ; %bb.3: ; %bb4 +; SI-MOVREL-NEXT: ; in Loop: Header=BB26_2 Depth=1 +; SI-MOVREL-NEXT: buffer_load_dword v16, off, s[0:3], 0 glc +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: v_mov_b32_e32 v17, s1 +; SI-MOVREL-NEXT: s_mov_b64 s[4:5], exec +; SI-MOVREL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-MOVREL-NEXT: .LBB26_4: ; Parent Loop BB26_2 Depth=1 +; SI-MOVREL-NEXT: ; => This Inner Loop Header: Depth=2 +; SI-MOVREL-NEXT: v_readfirstlane_b32 s6, v16 +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, s6, v16 +; SI-MOVREL-NEXT: s_and_saveexec_b64 vcc, vcc +; SI-MOVREL-NEXT: s_mov_b32 m0, s6 +; SI-MOVREL-NEXT: v_movreld_b32_e32 v0, v17 +; SI-MOVREL-NEXT: s_xor_b64 exec, exec, vcc +; SI-MOVREL-NEXT: s_cbranch_execnz .LBB26_4 +; SI-MOVREL-NEXT: ; %bb.5: ; in Loop: Header=BB26_2 Depth=1 +; SI-MOVREL-NEXT: s_mov_b64 exec, s[4:5] +; SI-MOVREL-NEXT: s_cbranch_execnz .LBB26_2 +; SI-MOVREL-NEXT: .LBB26_6: ; %bb8 +; SI-MOVREL-NEXT: s_endpgm +; +; VI-MOVREL-LABEL: broken_phi_bb: +; VI-MOVREL: ; %bb.0: ; %bb +; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, 8 +; VI-MOVREL-NEXT: s_branch .LBB26_2 +; VI-MOVREL-NEXT: .LBB26_1: +; 
VI-MOVREL-NEXT: ; implicit-def: $vgpr0 +; VI-MOVREL-NEXT: s_branch .LBB26_6 +; VI-MOVREL-NEXT: .LBB26_2: ; %bb2 +; VI-MOVREL-NEXT: ; =>This Loop Header: Depth=1 +; VI-MOVREL-NEXT: ; Child Loop BB26_4 Depth 2 +; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; VI-MOVREL-NEXT: v_cmp_le_i32_e32 vcc, s0, v0 +; VI-MOVREL-NEXT: s_cbranch_vccnz .LBB26_1 +; VI-MOVREL-NEXT: ; %bb.3: ; %bb4 +; VI-MOVREL-NEXT: ; in Loop: Header=BB26_2 Depth=1 +; VI-MOVREL-NEXT: flat_load_dword v16, v[0:1] glc +; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s1 +; VI-MOVREL-NEXT: s_mov_b64 s[2:3], exec +; VI-MOVREL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; VI-MOVREL-NEXT: .LBB26_4: ; Parent Loop BB26_2 Depth=1 +; VI-MOVREL-NEXT: ; => This Inner Loop Header: Depth=2 +; VI-MOVREL-NEXT: v_readfirstlane_b32 s4, v16 +; VI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, s4, v16 +; VI-MOVREL-NEXT: s_and_saveexec_b64 vcc, vcc +; VI-MOVREL-NEXT: s_mov_b32 m0, s4 +; VI-MOVREL-NEXT: v_movreld_b32_e32 v0, v17 +; VI-MOVREL-NEXT: s_xor_b64 exec, exec, vcc +; VI-MOVREL-NEXT: s_cbranch_execnz .LBB26_4 +; VI-MOVREL-NEXT: ; %bb.5: ; in Loop: Header=BB26_2 Depth=1 +; VI-MOVREL-NEXT: s_mov_b64 exec, s[2:3] +; VI-MOVREL-NEXT: s_cbranch_execnz .LBB26_2 +; VI-MOVREL-NEXT: .LBB26_6: ; %bb8 +; VI-MOVREL-NEXT: s_endpgm +; +; VI-IDXMODE-LABEL: broken_phi_bb: +; VI-IDXMODE: ; %bb.0: ; %bb +; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 8 +; VI-IDXMODE-NEXT: s_branch .LBB26_2 +; VI-IDXMODE-NEXT: .LBB26_1: +; VI-IDXMODE-NEXT: ; implicit-def: $vgpr0 +; VI-IDXMODE-NEXT: s_branch .LBB26_6 +; VI-IDXMODE-NEXT: .LBB26_2: ; %bb2 +; VI-IDXMODE-NEXT: ; =>This Loop Header: Depth=1 +; VI-IDXMODE-NEXT: ; Child Loop BB26_4 Depth 2 +; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; VI-IDXMODE-NEXT: v_cmp_le_i32_e32 vcc, s0, v0 +; VI-IDXMODE-NEXT: s_cbranch_vccnz .LBB26_1 +; VI-IDXMODE-NEXT: ; %bb.3: ; %bb4 +; 
VI-IDXMODE-NEXT: ; in Loop: Header=BB26_2 Depth=1 +; VI-IDXMODE-NEXT: flat_load_dword v16, v[0:1] glc +; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s1 +; VI-IDXMODE-NEXT: s_mov_b64 s[2:3], exec +; VI-IDXMODE-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; VI-IDXMODE-NEXT: .LBB26_4: ; Parent Loop BB26_2 Depth=1 +; VI-IDXMODE-NEXT: ; => This Inner Loop Header: Depth=2 +; VI-IDXMODE-NEXT: v_readfirstlane_b32 s4, v16 +; VI-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, s4, v16 +; VI-IDXMODE-NEXT: s_and_saveexec_b64 vcc, vcc +; VI-IDXMODE-NEXT: s_set_gpr_idx_on s4, gpr_idx(DST) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, v17 +; VI-IDXMODE-NEXT: s_set_gpr_idx_off +; VI-IDXMODE-NEXT: s_xor_b64 exec, exec, vcc +; VI-IDXMODE-NEXT: s_cbranch_execnz .LBB26_4 +; VI-IDXMODE-NEXT: ; %bb.5: ; in Loop: Header=BB26_2 Depth=1 +; VI-IDXMODE-NEXT: s_mov_b64 exec, s[2:3] +; VI-IDXMODE-NEXT: s_cbranch_execnz .LBB26_2 +; VI-IDXMODE-NEXT: .LBB26_6: ; %bb8 +; VI-IDXMODE-NEXT: s_endpgm +; +; GFX9-IDXMODE-LABEL: broken_phi_bb: +; GFX9-IDXMODE: ; %bb.0: ; %bb +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-IDXMODE-NEXT: s_branch .LBB26_2 +; GFX9-IDXMODE-NEXT: .LBB26_1: +; GFX9-IDXMODE-NEXT: ; implicit-def: $vgpr0 +; GFX9-IDXMODE-NEXT: s_branch .LBB26_6 +; GFX9-IDXMODE-NEXT: .LBB26_2: ; %bb2 +; GFX9-IDXMODE-NEXT: ; =>This Loop Header: Depth=1 +; GFX9-IDXMODE-NEXT: ; Child Loop BB26_4 Depth 2 +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: v_cmp_le_i32_e32 vcc, s0, v0 +; GFX9-IDXMODE-NEXT: s_cbranch_vccnz .LBB26_1 +; GFX9-IDXMODE-NEXT: ; %bb.3: ; %bb4 +; GFX9-IDXMODE-NEXT: ; in Loop: Header=BB26_2 Depth=1 +; GFX9-IDXMODE-NEXT: global_load_dword v16, v[0:1], off glc +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v17, s1 +; GFX9-IDXMODE-NEXT: s_mov_b64 s[2:3], exec +; GFX9-IDXMODE-NEXT: ; 
implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX9-IDXMODE-NEXT: .LBB26_4: ; Parent Loop BB26_2 Depth=1 +; GFX9-IDXMODE-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX9-IDXMODE-NEXT: v_readfirstlane_b32 s4, v16 +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, s4, v16 +; GFX9-IDXMODE-NEXT: s_and_saveexec_b64 vcc, vcc +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s4, gpr_idx(DST) +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, v17 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off +; GFX9-IDXMODE-NEXT: s_xor_b64 exec, exec, vcc +; GFX9-IDXMODE-NEXT: s_cbranch_execnz .LBB26_4 +; GFX9-IDXMODE-NEXT: ; %bb.5: ; in Loop: Header=BB26_2 Depth=1 +; GFX9-IDXMODE-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-IDXMODE-NEXT: s_cbranch_execnz .LBB26_2 +; GFX9-IDXMODE-NEXT: .LBB26_6: ; %bb8 +; GFX9-IDXMODE-NEXT: s_endpgm bb: br label %bb2 -bb2: ; preds = %bb4, %bb +bb2: %tmp = phi i32 [ 8, %bb ], [ %tmp7, %bb4 ] %tmp3 = icmp slt i32 %tmp, %arg br i1 %tmp3, label %bb4, label %bb8 -bb4: ; preds = %bb2 +bb4: %vgpr = load volatile i32, ptr addrspace(1) undef %tmp5 = insertelement <16 x i32> undef, i32 undef, i32 %vgpr %tmp6 = insertelement <16 x i32> %tmp5, i32 %arg1, i32 %vgpr %tmp7 = extractelement <16 x i32> %tmp6, i32 0 br label %bb2 -bb8: ; preds = %bb2 +bb8: ret void } -; GCN-LABEL: {{^}}insert_or_disj_index: -; GCN: v_mov_b32_e32 v[[#VIDX:]], 0 - -; MOVREL: s_mov_b32 m0, s{{[0-9]+}} -; MOVREL: v_movreld_b32_e32 v[[#VIDX + 1]], v{{[0-9]+}} - -; IDXMODE: s_set_gpr_idx_on s{{[0-9]+}}, gpr_idx(DST) -; IDXMODE: v_mov_b32_e32 v[[#VIDX + 1]], v{{[0-9]+}} -; IDXMODE: s_set_gpr_idx_off define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace(4) %in, i32 %val, <4 x i32> inreg %desc, i32 inreg %A) { +; NOOPT-LABEL: insert_or_disj_index: +; NOOPT: ; %bb.0: ; %entry +; NOOPT-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; NOOPT-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; NOOPT-NEXT: s_mov_b32 s18, -1 +; NOOPT-NEXT: s_mov_b32 s19, 
0xe8f000 +; NOOPT-NEXT: s_add_u32 s16, s16, s5 +; NOOPT-NEXT: s_addc_u32 s17, s17, 0 +; NOOPT-NEXT: ; implicit-def: $vgpr16 : SGPR spill to VGPR lane +; NOOPT-NEXT: v_writelane_b32 v16, s4, 0 +; NOOPT-NEXT: s_mov_b32 s4, s1 +; NOOPT-NEXT: v_readlane_b32 s1, v16, 0 +; NOOPT-NEXT: v_writelane_b32 v16, s4, 1 +; NOOPT-NEXT: s_mov_b32 s4, s0 +; NOOPT-NEXT: v_readlane_b32 s0, v16, 1 +; NOOPT-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:144 ; 4-byte Folded Spill +; NOOPT-NEXT: v_mov_b32_e32 v2, v1 +; NOOPT-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; NOOPT-NEXT: s_mov_b32 s5, s0 +; NOOPT-NEXT: s_mov_b32 s6, s2 +; NOOPT-NEXT: s_mov_b32 s7, s3 +; NOOPT-NEXT: ; implicit-def: $sgpr0 +; NOOPT-NEXT: ; implicit-def: $sgpr0 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v2 +; NOOPT-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:136 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:140 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b32 s8, 0xf000 +; NOOPT-NEXT: s_mov_b32 s0, 0 +; NOOPT-NEXT: v_writelane_b32 v16, s0, 2 +; NOOPT-NEXT: s_mov_b32 s2, s0 +; NOOPT-NEXT: s_mov_b32 s3, s8 +; NOOPT-NEXT: s_mov_b32 s8, s0 +; NOOPT-NEXT: s_mov_b32 s9, s0 +; NOOPT-NEXT: ; kill: def $sgpr8_sgpr9 killed $sgpr8_sgpr9 def $sgpr8_sgpr9_sgpr10_sgpr11 +; NOOPT-NEXT: s_mov_b64 s[10:11], s[2:3] +; NOOPT-NEXT: v_writelane_b32 v16, s8, 3 +; NOOPT-NEXT: v_writelane_b32 v16, s9, 4 +; NOOPT-NEXT: v_writelane_b32 v16, s10, 5 +; NOOPT-NEXT: v_writelane_b32 v16, s11, 6 +; NOOPT-NEXT: ; kill: def $sgpr8_sgpr9_sgpr10_sgpr11 killed $sgpr4_sgpr5_sgpr6_sgpr7 +; NOOPT-NEXT: ; implicit-def: $sgpr2_sgpr3 +; NOOPT-NEXT: s_waitcnt expcnt(1) +; NOOPT-NEXT: v_mov_b32_e32 v0, s1 +; NOOPT-NEXT: buffer_load_dword v0, v0, s[4:7], s0 offen +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:132 ; 4-byte Folded Spill 
+; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, s0 +; NOOPT-NEXT: v_mov_b32_e32 v31, s0 +; NOOPT-NEXT: v_mov_b32_e32 v30, s0 +; NOOPT-NEXT: v_mov_b32_e32 v29, s0 +; NOOPT-NEXT: v_mov_b32_e32 v28, s0 +; NOOPT-NEXT: v_mov_b32_e32 v27, s0 +; NOOPT-NEXT: v_mov_b32_e32 v26, s0 +; NOOPT-NEXT: v_mov_b32_e32 v25, s0 +; NOOPT-NEXT: v_mov_b32_e32 v24, s0 +; NOOPT-NEXT: v_mov_b32_e32 v23, s0 +; NOOPT-NEXT: v_mov_b32_e32 v22, s0 +; NOOPT-NEXT: v_mov_b32_e32 v21, s0 +; NOOPT-NEXT: v_mov_b32_e32 v20, s0 +; NOOPT-NEXT: v_mov_b32_e32 v19, s0 +; NOOPT-NEXT: v_mov_b32_e32 v18, s0 +; NOOPT-NEXT: v_mov_b32_e32 v17, s0 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v31 +; NOOPT-NEXT: v_mov_b32_e32 v2, v30 +; NOOPT-NEXT: v_mov_b32_e32 v3, v29 +; NOOPT-NEXT: v_mov_b32_e32 v4, v28 +; NOOPT-NEXT: v_mov_b32_e32 v5, v27 +; NOOPT-NEXT: v_mov_b32_e32 v6, v26 +; NOOPT-NEXT: v_mov_b32_e32 v7, v25 +; NOOPT-NEXT: v_mov_b32_e32 v8, v24 +; NOOPT-NEXT: v_mov_b32_e32 v9, v23 +; NOOPT-NEXT: v_mov_b32_e32 v10, v22 +; NOOPT-NEXT: v_mov_b32_e32 v11, v21 +; NOOPT-NEXT: v_mov_b32_e32 v12, v20 +; NOOPT-NEXT: v_mov_b32_e32 v13, v19 +; NOOPT-NEXT: v_mov_b32_e32 v14, v18 +; NOOPT-NEXT: v_mov_b32_e32 v15, v17 +; NOOPT-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:68 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:72 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:76 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:80 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:84 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:88 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[16:19], 0 offset:92 ; 4-byte Folded Spill 
+; NOOPT-NEXT: buffer_store_dword v7, off, s[16:19], 0 offset:96 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[16:19], 0 offset:100 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[16:19], 0 offset:104 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[16:19], 0 offset:108 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[16:19], 0 offset:112 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[16:19], 0 offset:116 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[16:19], 0 offset:120 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[16:19], 0 offset:124 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[16:19], 0 offset:128 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 s[0:1], exec +; NOOPT-NEXT: v_writelane_b32 v16, s0, 7 +; NOOPT-NEXT: v_writelane_b32 v16, s1, 8 +; NOOPT-NEXT: s_or_saveexec_b64 s[12:13], -1 +; NOOPT-NEXT: buffer_store_dword v16, off, s[16:19], 0 offset:64 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 exec, s[12:13] +; NOOPT-NEXT: s_waitcnt expcnt(1) +; NOOPT-NEXT: buffer_store_dword v0, off, s[16:19], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:20 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[16:19], 0 offset:24 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[16:19], 0 offset:28 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[16:19], 0 offset:32 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[16:19], 0 offset:36 ; 
4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[16:19], 0 offset:40 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[16:19], 0 offset:44 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[16:19], 0 offset:48 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[16:19], 0 offset:52 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[16:19], 0 offset:56 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[16:19], 0 offset:60 ; 4-byte Folded Spill +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 +; NOOPT-NEXT: .LBB27_1: ; =>This Inner Loop Header: Depth=1 +; NOOPT-NEXT: s_or_saveexec_b64 s[12:13], -1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[12:13] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 9 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 10 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:12 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v5, off, s[16:19], 0 offset:16 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v6, off, s[16:19], 0 offset:20 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v7, off, s[16:19], 0 offset:24 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v8, off, s[16:19], 0 offset:28 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v9, off, s[16:19], 0 offset:32 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v10, off, s[16:19], 0 offset:36 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v11, off, s[16:19], 0 offset:40 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v12, 
off, s[16:19], 0 offset:44 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v13, off, s[16:19], 0 offset:48 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v14, off, s[16:19], 0 offset:52 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v15, off, s[16:19], 0 offset:56 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v16, off, s[16:19], 0 offset:60 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v17, off, s[16:19], 0 offset:144 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v18, off, s[16:19], 0 offset:132 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readfirstlane_b32 s2, v18 +; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v18 +; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; NOOPT-NEXT: s_mov_b32 m0, s2 +; NOOPT-NEXT: v_movreld_b32_e32 v2, v17 +; NOOPT-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:148 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:152 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:156 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:160 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:164 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[16:19], 0 offset:168 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[16:19], 0 offset:172 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[16:19], 0 offset:176 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[16:19], 0 offset:180 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[16:19], 0 offset:184 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[16:19], 0 offset:188 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[16:19], 0 offset:192 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[16:19], 0 offset:196 ; 4-byte 
Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[16:19], 0 offset:200 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[16:19], 0 offset:204 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v16, off, s[16:19], 0 offset:208 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_store_dword v1, off, s[16:19], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[16:19], 0 offset:20 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[16:19], 0 offset:24 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[16:19], 0 offset:28 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[16:19], 0 offset:32 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[16:19], 0 offset:36 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[16:19], 0 offset:40 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[16:19], 0 offset:44 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[16:19], 0 offset:48 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[16:19], 0 offset:52 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[16:19], 0 offset:56 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v16, off, s[16:19], 0 offset:60 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 s[2:3], s[0:1] +; NOOPT-NEXT: v_writelane_b32 v0, s2, 9 +; NOOPT-NEXT: v_writelane_b32 v0, s3, 10 +; NOOPT-NEXT: s_or_saveexec_b64 s[12:13], -1 +; NOOPT-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:64 ; 4-byte Folded Spill +; 
NOOPT-NEXT: s_mov_b64 exec, s[12:13] +; NOOPT-NEXT: s_xor_b64 exec, exec, s[0:1] +; NOOPT-NEXT: s_cbranch_execnz .LBB27_1 +; NOOPT-NEXT: ; %bb.2: +; NOOPT-NEXT: s_or_saveexec_b64 s[12:13], -1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[12:13] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 7 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 8 +; NOOPT-NEXT: s_mov_b64 exec, s[0:1] +; NOOPT-NEXT: ; %bb.3: +; NOOPT-NEXT: s_or_saveexec_b64 s[12:13], -1 +; NOOPT-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[12:13] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 3 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 4 +; NOOPT-NEXT: v_readlane_b32 s2, v0, 5 +; NOOPT-NEXT: v_readlane_b32 s3, v0, 6 +; NOOPT-NEXT: buffer_load_dword v5, off, s[16:19], 0 offset:136 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v6, off, s[16:19], 0 offset:140 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v18, off, s[16:19], 0 offset:148 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v19, off, s[16:19], 0 offset:152 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v20, off, s[16:19], 0 offset:156 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v21, off, s[16:19], 0 offset:160 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v22, off, s[16:19], 0 offset:164 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v23, off, s[16:19], 0 offset:168 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v24, off, s[16:19], 0 offset:172 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v25, off, s[16:19], 0 offset:176 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v26, off, s[16:19], 0 offset:180 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v27, off, s[16:19], 0 offset:184 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword 
v28, off, s[16:19], 0 offset:188 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v29, off, s[16:19], 0 offset:192 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v30, off, s[16:19], 0 offset:196 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v31, off, s[16:19], 0 offset:200 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v32, off, s[16:19], 0 offset:204 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v33, off, s[16:19], 0 offset:208 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v7, v21 +; NOOPT-NEXT: v_mov_b32_e32 v8, v20 +; NOOPT-NEXT: v_mov_b32_e32 v9, v19 +; NOOPT-NEXT: v_mov_b32_e32 v1, v18 +; NOOPT-NEXT: v_mov_b32_e32 v2, v25 +; NOOPT-NEXT: v_mov_b32_e32 v3, v24 +; NOOPT-NEXT: v_mov_b32_e32 v4, v23 +; NOOPT-NEXT: v_mov_b32_e32 v10, v22 +; NOOPT-NEXT: v_mov_b32_e32 v15, v29 +; NOOPT-NEXT: v_mov_b32_e32 v16, v28 +; NOOPT-NEXT: v_mov_b32_e32 v17, v27 +; NOOPT-NEXT: v_mov_b32_e32 v11, v26 +; NOOPT-NEXT: v_mov_b32_e32 v12, v33 +; NOOPT-NEXT: v_mov_b32_e32 v13, v32 +; NOOPT-NEXT: v_mov_b32_e32 v14, v31 +; NOOPT-NEXT: v_mov_b32_e32 v18, v30 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19_vgpr20_vgpr21 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v19, v14 +; NOOPT-NEXT: v_mov_b32_e32 v20, v13 +; NOOPT-NEXT: v_mov_b32_e32 v21, v12 +; NOOPT-NEXT: v_mov_b32_e32 v13, v6 +; NOOPT-NEXT: v_mov_b32_e32 v12, v5 +; NOOPT-NEXT: buffer_store_dwordx4 v[18:21], v[12:13], s[0:3], 0 addr64 offset:48 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12_vgpr13_vgpr14 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v12, v17 +; NOOPT-NEXT: v_mov_b32_e32 v13, v16 +; NOOPT-NEXT: 
v_mov_b32_e32 v14, v15 +; NOOPT-NEXT: v_mov_b32_e32 v16, v6 +; NOOPT-NEXT: v_mov_b32_e32 v15, v5 +; NOOPT-NEXT: buffer_store_dwordx4 v[11:14], v[15:16], s[0:3], 0 addr64 offset:32 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11_vgpr12_vgpr13 killed $exec +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v11, v4 +; NOOPT-NEXT: v_mov_b32_e32 v12, v3 +; NOOPT-NEXT: v_mov_b32_e32 v13, v2 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v6 +; NOOPT-NEXT: buffer_store_dwordx4 v[10:13], v[2:3], s[0:3], 0 addr64 offset:16 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v2, v9 +; NOOPT-NEXT: v_mov_b32_e32 v3, v8 +; NOOPT-NEXT: v_mov_b32_e32 v4, v7 +; NOOPT-NEXT: buffer_store_dwordx4 v[1:4], v[5:6], s[0:3], 0 addr64 +; NOOPT-NEXT: ; kill: killed $vgpr0 +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: insert_or_disj_index: +; SI-MOVREL: ; %bb.0: ; %entry +; SI-MOVREL-NEXT: v_mov_b32_e32 v2, s4 +; SI-MOVREL-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen +; SI-MOVREL-NEXT: s_mov_b32 s2, 0 +; SI-MOVREL-NEXT: v_mov_b32_e32 v5, 0 +; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s0, s2 +; SI-MOVREL-NEXT: s_mov_b32 s1, s2 +; SI-MOVREL-NEXT: v_mov_b32_e32 v6, v5 +; SI-MOVREL-NEXT: v_mov_b32_e32 v7, v5 +; SI-MOVREL-NEXT: v_mov_b32_e32 v8, v5 +; SI-MOVREL-NEXT: v_mov_b32_e32 v9, v5 +; SI-MOVREL-NEXT: v_mov_b32_e32 v10, v5 +; SI-MOVREL-NEXT: v_mov_b32_e32 v11, v5 +; SI-MOVREL-NEXT: v_mov_b32_e32 v12, v5 +; SI-MOVREL-NEXT: v_mov_b32_e32 v13, v5 +; SI-MOVREL-NEXT: v_mov_b32_e32 v14, v5 +; SI-MOVREL-NEXT: v_mov_b32_e32 v15, v5 +; SI-MOVREL-NEXT: 
v_mov_b32_e32 v16, v5 +; SI-MOVREL-NEXT: v_mov_b32_e32 v17, v5 +; SI-MOVREL-NEXT: v_mov_b32_e32 v18, v5 +; SI-MOVREL-NEXT: v_mov_b32_e32 v19, v5 +; SI-MOVREL-NEXT: v_mov_b32_e32 v20, v5 +; SI-MOVREL-NEXT: s_mov_b64 s[4:5], exec +; SI-MOVREL-NEXT: .LBB27_1: ; =>This Inner Loop Header: Depth=1 +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: v_readfirstlane_b32 s6, v2 +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, s6, v2 +; SI-MOVREL-NEXT: s_and_saveexec_b64 vcc, vcc +; SI-MOVREL-NEXT: s_mov_b32 m0, s6 +; SI-MOVREL-NEXT: v_movreld_b32_e32 v6, v4 +; SI-MOVREL-NEXT: s_xor_b64 exec, exec, vcc +; SI-MOVREL-NEXT: s_cbranch_execnz .LBB27_1 +; SI-MOVREL-NEXT: ; %bb.2: +; SI-MOVREL-NEXT: s_mov_b64 exec, s[4:5] +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[17:20], v[0:1], s[0:3], 0 addr64 offset:48 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[13:16], v[0:1], s[0:3], 0 addr64 offset:32 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[9:12], v[0:1], s[0:3], 0 addr64 offset:16 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[5:8], v[0:1], s[0:3], 0 addr64 +; SI-MOVREL-NEXT: s_endpgm +; +; VI-MOVREL-LABEL: insert_or_disj_index: +; VI-MOVREL: ; %bb.0: ; %entry +; VI-MOVREL-NEXT: v_mov_b32_e32 v2, s4 +; VI-MOVREL-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v6, v5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v7, v5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v8, v5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v9, v5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v10, v5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v11, v5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v12, v5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, v5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v14, v5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v15, v5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v16, v5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v17, v5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v18, v5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v19, v5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v20, v5 +; VI-MOVREL-NEXT: s_mov_b64 s[0:1], exec +; VI-MOVREL-NEXT: .LBB27_1: ; =>This Inner Loop Header: 
Depth=1 +; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; VI-MOVREL-NEXT: v_readfirstlane_b32 s2, v2 +; VI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 +; VI-MOVREL-NEXT: s_and_saveexec_b64 vcc, vcc +; VI-MOVREL-NEXT: s_mov_b32 m0, s2 +; VI-MOVREL-NEXT: v_movreld_b32_e32 v6, v4 +; VI-MOVREL-NEXT: s_xor_b64 exec, exec, vcc +; VI-MOVREL-NEXT: s_cbranch_execnz .LBB27_1 +; VI-MOVREL-NEXT: ; %bb.2: +; VI-MOVREL-NEXT: s_mov_b64 exec, s[0:1] +; VI-MOVREL-NEXT: v_add_u32_e32 v2, vcc, 48, v0 +; VI-MOVREL-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-MOVREL-NEXT: flat_store_dwordx4 v[2:3], v[17:20] +; VI-MOVREL-NEXT: v_add_u32_e32 v2, vcc, 32, v0 +; VI-MOVREL-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-MOVREL-NEXT: flat_store_dwordx4 v[2:3], v[13:16] +; VI-MOVREL-NEXT: v_add_u32_e32 v2, vcc, 16, v0 +; VI-MOVREL-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-MOVREL-NEXT: flat_store_dwordx4 v[2:3], v[9:12] +; VI-MOVREL-NEXT: flat_store_dwordx4 v[0:1], v[5:8] +; VI-MOVREL-NEXT: s_endpgm +; +; VI-IDXMODE-LABEL: insert_or_disj_index: +; VI-IDXMODE: ; %bb.0: ; %entry +; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, s4 +; VI-IDXMODE-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, v5 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, v5 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, v5 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, v5 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, v5 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, v5 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, v5 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, v5 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, v5 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, v5 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, v5 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, v5 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v18, v5 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v19, v5 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v20, v5 +; VI-IDXMODE-NEXT: s_mov_b64 s[0:1], exec +; VI-IDXMODE-NEXT: .LBB27_1: ; =>This Inner Loop Header: Depth=1 +; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; VI-IDXMODE-NEXT: 
v_readfirstlane_b32 s2, v2 +; VI-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 +; VI-IDXMODE-NEXT: s_and_saveexec_b64 vcc, vcc +; VI-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, v4 +; VI-IDXMODE-NEXT: s_set_gpr_idx_off +; VI-IDXMODE-NEXT: s_xor_b64 exec, exec, vcc +; VI-IDXMODE-NEXT: s_cbranch_execnz .LBB27_1 +; VI-IDXMODE-NEXT: ; %bb.2: +; VI-IDXMODE-NEXT: s_mov_b64 exec, s[0:1] +; VI-IDXMODE-NEXT: v_add_u32_e32 v2, vcc, 48, v0 +; VI-IDXMODE-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[2:3], v[17:20] +; VI-IDXMODE-NEXT: v_add_u32_e32 v2, vcc, 32, v0 +; VI-IDXMODE-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[2:3], v[13:16] +; VI-IDXMODE-NEXT: v_add_u32_e32 v2, vcc, 16, v0 +; VI-IDXMODE-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[2:3], v[9:12] +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[0:1], v[5:8] +; VI-IDXMODE-NEXT: s_endpgm +; +; GFX9-IDXMODE-LABEL: insert_or_disj_index: +; GFX9-IDXMODE: ; %bb.0: ; %entry +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-IDXMODE-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, v5 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, v5 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, v5 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, v5 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, v5 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, v5 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, v5 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, v5 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v17, v5 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v18, v5 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v20, v5 +; GFX9-IDXMODE-NEXT: s_mov_b64 s[0:1], exec +; GFX9-IDXMODE-NEXT: .LBB27_1: ; =>This Inner Loop Header: Depth=1 +; 
GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 +; GFX9-IDXMODE-NEXT: s_and_saveexec_b64 vcc, vcc +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off +; GFX9-IDXMODE-NEXT: s_xor_b64 exec, exec, vcc +; GFX9-IDXMODE-NEXT: s_cbranch_execnz .LBB27_1 +; GFX9-IDXMODE-NEXT: ; %bb.2: +; GFX9-IDXMODE-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v[0:1], v[17:20], off offset:48 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v[0:1], v[13:16], off offset:32 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v[0:1], v[9:12], off offset:16 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v[0:1], v[5:8], off +; GFX9-IDXMODE-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %desc, i32 %A, i32 0, i32 0) %off = or disjoint i32 %idx, 1 @@ -560,10 +8320,3 @@ entry: store <16 x i32> %v, ptr addrspace(1) %out ret void } - -declare i32 @llvm.amdgcn.workitem.id.x() #1 -declare void @llvm.amdgcn.s.barrier() #2 - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } -attributes #2 = { nounwind convergent }