[AMDGPU] Fix an issue that wrong index is used in calculation of byte provider when the op is extract_vector_elt #91697

shiltian · 2024-05-10T05:12:45Z

Fixes: SWDEV-460097

llvmbot · 2024-05-10T05:13:15Z

@llvm/pr-subscribers-backend-amdgpu

Author: Shilei Tian (shiltian)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/91697.diff

4 Files Affected:

(modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+2-6)
(modified) llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll (+20-12)
(added) llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll (+126)
(modified) llvm/test/CodeGen/AMDGPU/permute_i8.ll (+14-11)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 33bdd6195a040..4d3662f672687 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -12071,12 +12071,8 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
       return std::nullopt;
     auto VecIdx = IdxOp->getZExtValue();
     auto ScalarSize = Op.getScalarValueSizeInBits();
-    if (ScalarSize != 32) {
-      Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
-    }
-
-    return calculateSrcByte(ScalarSize == 32 ? Op : Op.getOperand(0),
-                            StartingIndex, Index);
+    return calculateSrcByte(Op.getOperand(0), StartingIndex,
+                            ScalarSize / 8 * VecIdx + Index);
   }
 
   case AMDGPUISD::PERM: {
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index 1ba2491d2210e..5fd9b1d855095 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -2706,25 +2706,33 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
-; VI-NEXT:    v_mov_b32_e32 v12, 0x3020504
+; VI-NEXT:    v_mov_b32_e32 v10, 0x3020504
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v8
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v4, vcc, 16, v0
-; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v0, s3
+; VI-NEXT:    v_add_u32_e32 v4, vcc, s2, v8
+; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v0, vcc
+; VI-NEXT:    flat_load_dwordx4 v[0:3], v[4:5]
+; VI-NEXT:    v_add_u32_e32 v4, vcc, 16, v4
+; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
+; VI-NEXT:    s_mov_b32 s2, 0x3020504
 ; VI-NEXT:    v_mov_b32_e32 v9, s1
 ; VI-NEXT:    v_add_u32_e32 v8, vcc, s0, v8
 ; VI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
-; VI-NEXT:    v_add_u32_e32 v10, vcc, 16, v8
-; VI-NEXT:    v_addc_u32_e32 v11, vcc, 0, v9, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_perm_b32 v3, s4, v3, v12
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
+; VI-NEXT:    v_perm_b32 v3, s4, v3, v10
+; VI-NEXT:    v_perm_b32 v2, v2, v2, s2
+; VI-NEXT:    v_perm_b32 v1, v1, v1, s2
+; VI-NEXT:    v_perm_b32 v0, v0, v0, s2
 ; VI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_perm_b32 v7, v7, v7, s2
+; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v8
+; VI-NEXT:    v_perm_b32 v6, v6, v6, s2
+; VI-NEXT:    v_perm_b32 v5, v5, v5, s2
+; VI-NEXT:    v_perm_b32 v4, v4, v4, s2
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v9, vcc
+; VI-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
 ; VI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: v_insertelement_v16i16_6:
diff --git a/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll
new file mode 100644
index 0000000000000..486ddba799dfe
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll
@@ -0,0 +1,126 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
+
+define protected amdgpu_kernel void @test(ptr addrspace(1) %srcA, ptr addrspace(1) %srcB, ptr addrspace(1) %dst) {
+; GFX9-LABEL: test:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dword s7, s[4:5], 0x24
+; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x40
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x10
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_and_b32 s4, s7, 0xffff
+; GFX9-NEXT:    s_mul_i32 s6, s6, s4
+; GFX9-NEXT:    s_add_i32 s8, s8, s6
+; GFX9-NEXT:    v_add_u32_e32 v0, s8, v0
+; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX9-NEXT:    v_lshlrev_b64 v[4:5], 4, v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v5, vcc
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s0, 0x3020504
+; GFX9-NEXT:    v_mov_b32_e32 v6, s3
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, s2, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v6, v5, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v2, v2, v2, s0
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s0
+; GFX9-NEXT:    v_not_b32_e32 v3, v3
+; GFX9-NEXT:    v_not_b32_e32 v1, v1
+; GFX9-NEXT:    v_not_b32_e32 v2, v2
+; GFX9-NEXT:    v_not_b32_e32 v0, v0
+; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    s_load_dword s0, s[4:5], 0x24
+; GFX10-NEXT:    s_load_dword s2, s[4:5], 0x40
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_and_b32 s3, s0, 0xffff
+; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-NEXT:    s_mul_i32 s6, s6, s3
+; GFX10-NEXT:    v_add3_u32 v0, s2, s6, v0
+; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x10
+; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX10-NEXT:    v_lshlrev_b64 v[4:5], 4, v[0:1]
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, s0, v4
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s1, v5, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, s2, v4
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, s3, v5, vcc_lo
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v2, v2, v2, 0x3020504
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x3020504
+; GFX10-NEXT:    v_not_b32_e32 v3, v3
+; GFX10-NEXT:    v_not_b32_e32 v1, v1
+; GFX10-NEXT:    v_not_b32_e32 v2, v2
+; GFX10-NEXT:    v_not_b32_e32 v0, v0
+; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x24
+; GFX11-NEXT:    s_load_b32 s4, s[0:1], 0x40
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_and_b32 s5, s2, 0xffff
+; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-NEXT:    s_mul_i32 s15, s15, s5
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x10
+; GFX11-NEXT:    v_add3_u32 v0, s4, s15, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX11-NEXT:    v_lshlrev_b64 v[4:5], 4, v[0:1]
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, s2, v4
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s3, v5, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, s0, v4
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, s1, v5, vcc_lo
+; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_perm_b32 v2, v2, v2, 0x3020504
+; GFX11-NEXT:    v_perm_b32 v0, v0, v0, 0x3020504
+; GFX11-NEXT:    v_not_b32_e32 v3, v3
+; GFX11-NEXT:    v_not_b32_e32 v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_not_b32_e32 v2, v2
+; GFX11-NEXT:    v_not_b32_e32 v0, v0
+; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off
+; GFX11-NEXT:    s_nop 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+entry:
+  %test.kernarg.segment = call nonnull align 16 dereferenceable(280) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+  %srcA.kernarg.offset4 = bitcast ptr addrspace(4) %test.kernarg.segment to ptr addrspace(4)
+  %srcA.load = load ptr addrspace(1), ptr addrspace(4) %srcA.kernarg.offset4, align 16
+  %dst.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %test.kernarg.segment, i64 16
+  %dst.load = load ptr addrspace(1), ptr addrspace(4) %dst.kernarg.offset, align 16
+  %0 = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %1 = getelementptr inbounds i8, ptr addrspace(4) %0, i64 40
+  %2 = load i64, ptr addrspace(4) %1, align 8
+  %3 = tail call i32 @llvm.amdgcn.workgroup.id.x()
+  %4 = getelementptr inbounds i8, ptr addrspace(4) %0, i64 12
+  %5 = load i16, ptr addrspace(4) %4, align 4
+  %6 = zext i16 %5 to i32
+  %7 = mul i32 %3, %6
+  %8 = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %9 = add i32 %7, %8
+  %10 = zext i32 %9 to i64
+  %11 = add i64 %2, %10
+  %sext = shl i64 %11, 32
+  %idxprom = ashr exact i64 %sext, 32
+  %arrayidx = getelementptr inbounds <16 x i8>, ptr addrspace(1) %srcA.load, i64 %idxprom
+  %12 = load <16 x i8>, ptr addrspace(1) %arrayidx, align 16
+  %not = xor <16 x i8> %12, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+  %arrayidx2 = getelementptr inbounds <16 x i8>, ptr addrspace(1) %dst.load, i64 %idxprom
+  store <16 x i8> %not, ptr addrspace(1) %arrayidx2, align 16
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
index 8ac332197215f..7ca9ae359a499 100644
--- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
@@ -3816,13 +3816,15 @@ define hidden void @extract_v13i64(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 ; GFX10-LABEL: extract_v13i64:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off
-; GFX10-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:16
+; GFX10-NEXT:    s_clause 0x2
+; GFX10-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:48
+; GFX10-NEXT:    global_load_dwordx4 v[11:14], v[0:1], off
+; GFX10-NEXT:    global_load_dwordx4 v[14:17], v[0:1], off offset:64
+; GFX10-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_perm_b32 v0, v9, v8, 0x3020504
+; GFX10-NEXT:    v_perm_b32 v0, v12, v13, 0x1000504
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_perm_b32 v1, v11, v12, 0x1000706
+; GFX10-NEXT:    v_perm_b32 v1, v10, v14, 0x1000504
 ; GFX10-NEXT:    global_store_dword v[4:5], v0, off
 ; GFX10-NEXT:    global_store_dword v[6:7], v1, off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -3830,14 +3832,15 @@ define hidden void @extract_v13i64(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 ; GFX9-LABEL: extract_v13i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off
-; GFX9-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:16
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020504
-; GFX9-NEXT:    s_mov_b32 s5, 0x1000706
+; GFX9-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:48
+; GFX9-NEXT:    global_load_dwordx4 v[11:14], v[0:1], off
+; GFX9-NEXT:    global_load_dwordx4 v[14:17], v[0:1], off offset:64
+; GFX9-NEXT:    s_mov_b32 s4, 0x1000504
+; GFX9-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_perm_b32 v0, v9, v8, s4
+; GFX9-NEXT:    v_perm_b32 v0, v12, v13, s4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_perm_b32 v1, v11, v12, s5
+; GFX9-NEXT:    v_perm_b32 v1, v10, v14, s4
 ; GFX9-NEXT:    global_store_dword v[4:5], v0, off
 ; GFX9-NEXT:    global_store_dword v[6:7], v1, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)

shiltian · 2024-05-10T05:17:51Z

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

-    return calculateSrcByte(ScalarSize == 32 ? Op : Op.getOperand(0),
-                            StartingIndex, Index);
+    return calculateSrcByte(Op.getOperand(0), StartingIndex,
+                            ScalarSize / 8 * VecIdx + Index);


I feel I might miss some corner case here but I can't figure that out. However, this approach does look pretty straightforward.

jrbyrnes · 2024-05-10T05:43:22Z

I see a regression of class:

define hidden void @new(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
%vec0 = load <4 x i32>, ptr addrspace(1) %in0, align 4
%vec1 = load <4 x i32>, ptr addrspace(1) %in1, align 4
%v0e1 = extractelement <4 x i32> %vec0, i64 1
%v1e3 = extractelement <4 x i32> %vec1, i64 3
%bc0 = bitcast i32 %v0e1 to <4 x i8>
%bc1 = bitcast i32 %v1e3 to <4 x i8>
%shuffle0_0 = shufflevector <4 x i8> %bc0, <4 x i8> %bc1, <4 x i32> <i32 3, i32 7, i32 4, i32 4>
store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
ret void
}

llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll

shiltian · 2024-05-10T18:35:10Z

@jrbyrnes I tried your case but didn't find anything wrong. I set a breakpoint at SIISelLowering.cpp:12068, which is case ISD::EXTRACT_VECTOR_ELT. The break point was hit twice.

Here is when it was hit the first time. It was trying to match t64: i32 = or # D:1 t61, t63 and the index at this point is 3.

SelectionDAG has 45 nodes:
  t0: ch,glue = EntryToken
      t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %0
      t4: i32,ch = CopyFromReg # D:1 t0, Register:i32 %1
    t13: i64 = build_pair # D:1 t2, t4
  t18: v4i32,ch = load<(load (s128) from %ir.in0, align 4, addrspace 1)> # D:1 t0, t13, undef:i64
      t6: i32,ch = CopyFromReg # D:1 t0, Register:i32 %2
      t8: i32,ch = CopyFromReg # D:1 t0, Register:i32 %3
    t14: i64 = build_pair # D:1 t6, t8
  t19: v4i32,ch = load<(load (s128) from %ir.in1, align 4, addrspace 1)> # D:1 t0, t14, undef:i64
  t25: i32 = extract_vector_elt # D:1 t19, Constant:i32<3>
  t51: i16 = truncate # D:1 t25
      t29: ch = TokenFactor t18:1, t19:1
                  t22: i32 = extract_vector_elt # D:1 t18, Constant:i32<1>
                t36: i32 = srl # D:1 t22, Constant:i32<16>
              t37: i16 = truncate # D:1 t36
            t42: i16 = srl # D:1 t37, Constant:i16<8>
                t52: i32 = srl # D:1 t25, Constant:i32<16>
              t53: i16 = truncate # D:1 t52
            t89: i16 = and # D:1 t53, Constant:i16<-256>
          t74: i16 = or # D:1 t42, t89
        t61: i32 = zero_extend # D:1 t74
              t85: i16 = and # D:1 t51, Constant:i16<255>
              t82: i16 = shl # D:1 t51, Constant:i16<8>
            t83: i16 = or # D:1 t85, t82
          t62: i32 = any_extend # D:1 t83
        t63: i32 = shl # D:1 t62, Constant:i32<16>
      t64: i32 = or # D:1 t61, t63
        t10: i32,ch = CopyFromReg # D:1 t0, Register:i32 %4
        t12: i32,ch = CopyFromReg # D:1 t0, Register:i32 %5
      t15: i64 = build_pair # D:1 t10, t12
    t33: ch = store<(store (s32) into %ir.out0, addrspace 1)> # D:1 t29, t64, t15, undef:i64
  t31: ch = RET_GLUE t33

Here is the trace of the byte:

t64[3] -> t63[3] -> t62[1] -> t83[1] -> t82[1] -> t51[0] -> t25[0] -> t19[12]

This looks correct to me.

Here is when the break point was hit the second time. Still, it was trying to match t64: i32 = or # D:1 t61, t63, and the index is 3.

SelectionDAG has 45 nodes:
  t0: ch,glue = EntryToken
      t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %0
      t4: i32,ch = CopyFromReg # D:1 t0, Register:i32 %1
    t13: i64 = build_pair # D:1 t2, t4
  t18: v4i32,ch = load<(load (s128) from %ir.in0, align 4, addrspace 1)> # D:1 t0, t13, undef:i64
      t6: i32,ch = CopyFromReg # D:1 t0, Register:i32 %2
      t8: i32,ch = CopyFromReg # D:1 t0, Register:i32 %3
    t14: i64 = build_pair # D:1 t6, t8
  t19: v4i32,ch = load<(load (s128) from %ir.in1, align 4, addrspace 1)> # D:1 t0, t14, undef:i64
  t25: i32 = extract_vector_elt # D:1 t19, Constant:i32<3>
  t51: i16 = truncate # D:1 t25
      t29: ch = TokenFactor t18:1, t19:1
                t22: i32 = extract_vector_elt # D:1 t18, Constant:i32<1>
              t91: i32 = srl # D:1 t22, Constant:i32<24>
            t92: i16 = truncate # D:1 t91
                t52: i32 = srl # D:1 t25, Constant:i32<16>
              t53: i16 = truncate # D:1 t52
            t89: i16 = and # D:1 t53, Constant:i16<-256>
          t74: i16 = or # D:1 t92, t89
        t61: i32 = zero_extend # D:1 t74
              t85: i16 = and # D:1 t51, Constant:i16<255>
              t82: i16 = shl # D:1 t51, Constant:i16<8>
            t83: i16 = or # D:1 t85, t82
          t62: i32 = any_extend # D:1 t83
        t63: i32 = shl # D:1 t62, Constant:i32<16>
      t64: i32 = or # D:1 t61, t63
        t10: i32,ch = CopyFromReg # D:1 t0, Register:i32 %4
        t12: i32,ch = CopyFromReg # D:1 t0, Register:i32 %5
      t15: i64 = build_pair # D:1 t10, t12
    t33: ch = store<(store (s32) into %ir.out0, addrspace 1)> # D:1 t29, t64, t15, undef:i64
  t31: ch = RET_GLUE t33

The trace is still the same:

t64[3] -> t63[3] -> t62[1] -> t83[1] -> t82[1] -> t51[0] -> t25[0] -> t19[12]

arsenm · 2024-05-13T10:16:35Z

Should include the Fixes: SWDEV-... comment in description

arsenm · 2024-05-13T10:17:06Z

I see a regression of class:

define hidden void @new(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { %vec0 = load <4 x i32>, ptr addrspace(1) %in0, align 4 %vec1 = load <4 x i32>, ptr addrspace(1) %in1, align 4 %v0e1 = extractelement <4 x i32> %vec0, i64 1 %v1e3 = extractelement <4 x i32> %vec1, i64 3 %bc0 = bitcast i32 %v0e1 to <4 x i8> %bc1 = bitcast i32 %v1e3 to <4 x i8> %shuffle0_0 = shufflevector <4 x i8> %bc0, <4 x i8> %bc1, <4 x i32> <i32 3, i32 7, i32 4, i32 4> store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4 ret void }

Is this one of the tests in the file? What do you mean by regression?

shiltian · 2024-05-13T13:37:12Z

Should include the Fixes: SWDEV-... comment in description

Do we want to include that information in upstream PR?

arsenm · 2024-05-13T14:15:03Z

Should include the Fixes: SWDEV-... comment in description

Do we want to include that information in upstream PR?

Yes

shiltian · 2024-05-14T13:12:55Z

ping

shiltian · 2024-05-15T12:49:02Z

@jrbyrnes

shiltian · 2024-05-20T19:17:27Z

ping +1

arsenm · 2024-05-20T19:24:05Z

Should include the Fixes: SWDEV-... comment in description

Do we want to include that information in upstream PR?

Yes

Specifically, the format "Fixes: SWDEV-xxxxxx"

arsenm · 2024-05-20T19:24:50Z

Is this one of the tests in the file? What do you mean by regression?

What was this regression? Is this one of the existing tests?

jrbyrnes · 2024-05-20T20:18:40Z

Is this one of the tests in the file? What do you mean by regression?

What was this regression? Is this one of the existing tests?

Thanks for the ping -- this got buried in emails while I was away.

By regression, I mean quality of code, not correctness. Using the example provided

llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs test.ll

git fetch origin pull/91697/head:fix-cbp
git checkout fix-cbp

(with PR)

    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
    global_load_dword v6, v[2:3], off offset:12
    global_load_dword v7, v[0:1], off offset:4
    s_movk_i32 s4, 0xff00
    s_waitcnt vmcnt(1)
    v_lshlrev_b16_e32 v0, 8, v6
    v_and_b32_sdwa v1, v6, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
    v_or_b32_sdwa v0, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
    s_waitcnt vmcnt(0)
    v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
    global_store_dword v[4:5], v0, off
    s_waitcnt vmcnt(0)
    s_setpc_b64 s[30:31]

git revert bb779e6

(without PR)

    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
    global_load_dword v6, v[0:1], off offset:4
    global_load_dword v7, v[2:3], off offset:12
    s_movk_i32 s4, 0x307
    s_waitcnt vmcnt(0)
    v_perm_b32 v0, v6, v7, s4
    global_store_dword v[4:5], v0, off
    s_waitcnt vmcnt(0)
    s_setpc_b64 s[30:31]

I have seen a similar pattern in existing real code (CK). I don't think it is covered in existing lit tests.

SelectionDAG has 45 nodes:
  t0: ch,glue = EntryToken
      t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %0
      t4: i32,ch = CopyFromReg # D:1 t0, Register:i32 %1
    t13: i64 = build_pair # D:1 t2, t4
  t18: v4i32,ch = load<(load (s128) from %ir.in0, align 4, addrspace 1)> # D:1 t0, t13, undef:i64
      t6: i32,ch = CopyFromReg # D:1 t0, Register:i32 %2
      t8: i32,ch = CopyFromReg # D:1 t0, Register:i32 %3
    t14: i64 = build_pair # D:1 t6, t8
  t19: v4i32,ch = load<(load (s128) from %ir.in1, align 4, addrspace 1)> # D:1 t0, t14, undef:i64
  t25: i32 = extract_vector_elt # D:1 t19, Constant:i32<3>
  t51: i16 = truncate # D:1 t25
      t29: ch = TokenFactor t18:1, t19:1
                  t22: i32 = extract_vector_elt # D:1 t18, Constant:i32<1>
                t36: i32 = srl # D:1 t22, Constant:i32<16>
              t37: i16 = truncate # D:1 t36
            t42: i16 = srl # D:1 t37, Constant:i16<8>
                t52: i32 = srl # D:1 t25, Constant:i32<16>
              t53: i16 = truncate # D:1 t52
            t89: i16 = and # D:1 t53, Constant:i16<-256>
          t74: i16 = or # D:1 t42, t89
        t61: i32 = zero_extend # D:1 t74
              t85: i16 = and # D:1 t51, Constant:i16<255>
              t82: i16 = shl # D:1 t51, Constant:i16<8>
            t83: i16 = or # D:1 t85, t82
          t62: i32 = any_extend # D:1 t83
        t63: i32 = shl # D:1 t62, Constant:i32<16>
      t64: i32 = or # D:1 t61, t63
        t10: i32,ch = CopyFromReg # D:1 t0, Register:i32 %4
        t12: i32,ch = CopyFromReg # D:1 t0, Register:i32 %5
      t15: i64 = build_pair # D:1 t10, t12
    t33: ch = store<(store (s32) into %ir.out0, addrspace 1)> # D:1 t29, t64, t15, undef:i64
  t31: ch = RET_GLUE t33

With the PR, we have source bytes: t22, t25, t25, and t19. Without the PR, I see source bytes: t22, t25, t25, t25. The issue here seems to be too many source nodes, and trying to combine them is not profitable.

It seems the answer is that if we are going to look through extract_vector_elt in calculateByteProvider, we should also look them in calculateSrcByte. But, doing such may cause your crash to reappear (I don't understand how this is causing a crash?)

shiltian · 2024-05-20T21:00:02Z

It didn't cause a crash. It causes wrong results in just one OpenCL conformance test. That is because wrong register is used in v_perm_b32 instruction.

Correct asm:

...
	v_perm_b32 v2, v2, v2, 0x3020504
	v_perm_b32 v0, v0, v0, 0x3020504
...

Wrong asm:

...
	v_perm_b32 v2, v0, v2, 0x3020706
	v_perm_b32 v0, v0, v0, 0x3020504
...

I narrowed it down to this place and found out that the index was wrong when calculating the source byte.

With the PR, we have source bytes: t22, t25, t25, and t19. Without the PR, I see source bytes: t22, t25, t25, t25. The issue here seems to be too many source nodes, and trying to combine them is not profitable.

That might be beyond the scope of this PR. We can figure out when is better to look into extract_vector_elt and when is not in the future, but at least we need to make sure correct code is generated.

jrbyrnes · 2024-05-20T21:22:15Z

It didn't cause a crash. It causes wrong results in just one OpenCL conformance test. That is because wrong register is used in v_perm_b32 instruction.

Ah, I see the issue. Thanks.

It is because the current index calculation condition assumes less than 32 bit.

Can you try

    if (ScalarSize < 32) {
      Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
    }

    return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0),
                            StartingIndex, Index);

shiltian · 2024-05-20T21:50:43Z

It didn't cause a crash. It causes wrong results in just one OpenCL conformance test. That is because wrong register is used in v_perm_b32 instruction.

Ah, I see the issue. Thanks.

It is because the current index calculation condition assumes less than 32 bit.

Can you try
    if (ScalarSize < 32) {
      Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
    }

    return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0),
                            StartingIndex, Index);

It works. Why do we want to differentiate them here?

jrbyrnes · 2024-05-20T22:11:10Z

We don't gain anything from looking into the vectors for scalarsize >= 32. calculateSrcByte will just return the Op since vector handling hasn't been built yet, and then we will need to re-extract the dword when building the perm https://github.com/llvm/llvm-project/blob/main/llvm/lib/Target/AMDGPU/SIISelLowering.cpp#L12193 -- by disallowing them, we avoid some unecessary computation. Additionally, it does not introduce the regression mentioned above.

On the other hand, we want to continue looking through to the vector for 8 and 16 bit scalars, as the vector that we are extracting from is usually the common source of permutation (and not the extracted value itself).

… provider when the op is extract_vector_elt

shiltian · 2024-05-20T23:07:46Z

We don't gain anything from looking into the vectors for scalarsize >= 32. calculateSrcByte will just return the Op since vector handling hasn't been built yet, and then we will need to re-extract the dword when building the perm https://github.com/llvm/llvm-project/blob/main/llvm/lib/Target/AMDGPU/SIISelLowering.cpp#L12193 -- by disallowing them, we avoid some unecessary computation. Additionally, it does not introduce the regression mentioned above.

On the other hand, we want to continue looking through to the vector for 8 and 16 bit scalars, as the vector that we are extracting from is usually the common source of permutation (and not the extracted value itself).

Good to know. Thanks for your explanation. I have updated the patch.

jrbyrnes

Thanks, LGTM

… provider when the op is extract_vector_elt (llvm#91697) Fixes: SWDEV-460097 Change-Id: I9c86711bf403f617bc01b352f685ede98e91f4f9

… provider when the op is extract_vector_elt (llvm#91697) Fixes: SWDEV-460097 Change-Id: I441c8b595da4b71a1f84a3be43481a6847832b21

llvmbot added the backend:AMDGPU label May 10, 2024

shiltian requested review from jrbyrnes, piotrAMD and arsenm May 10, 2024 05:12

shiltian commented May 10, 2024

View reviewed changes

arsenm reviewed May 10, 2024

View reviewed changes

llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll Outdated Show resolved Hide resolved

llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll Outdated Show resolved Hide resolved

shiltian force-pushed the SWDEV-460097 branch from 07bf2d8 to 3ebc479 Compare May 10, 2024 18:26

shiltian force-pushed the SWDEV-460097 branch from 3ebc479 to bb779e6 Compare May 12, 2024 03:43

[AMDGPU] Fix an issue that wrong index is used in calculation of byte…

a4f8ac2

… provider when the op is extract_vector_elt

shiltian force-pushed the SWDEV-460097 branch from bb779e6 to a4f8ac2 Compare May 20, 2024 23:06

jrbyrnes approved these changes May 20, 2024

View reviewed changes

shiltian merged commit 45968da into llvm:main May 21, 2024
3 of 4 checks passed

shiltian deleted the SWDEV-460097 branch May 21, 2024 01:32

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[AMDGPU] Fix an issue that wrong index is used in calculation of byte provider when the op is extract_vector_elt #91697

[AMDGPU] Fix an issue that wrong index is used in calculation of byte provider when the op is extract_vector_elt #91697

shiltian commented May 10, 2024 •

edited by arsenm

Loading

llvmbot commented May 10, 2024

shiltian May 10, 2024

jrbyrnes commented May 10, 2024

shiltian commented May 10, 2024

arsenm commented May 13, 2024

arsenm commented May 13, 2024

shiltian commented May 13, 2024

arsenm commented May 13, 2024

shiltian commented May 14, 2024

shiltian commented May 15, 2024

shiltian commented May 20, 2024

arsenm commented May 20, 2024

arsenm commented May 20, 2024

jrbyrnes commented May 20, 2024 •

edited

Loading

shiltian commented May 20, 2024 •

edited

Loading

jrbyrnes commented May 20, 2024

shiltian commented May 20, 2024

jrbyrnes commented May 20, 2024 •

edited

Loading

shiltian commented May 20, 2024

jrbyrnes left a comment

[AMDGPU] Fix an issue that wrong index is used in calculation of byte provider when the op is extract_vector_elt #91697

[AMDGPU] Fix an issue that wrong index is used in calculation of byte provider when the op is extract_vector_elt #91697

Conversation

shiltian commented May 10, 2024 • edited by arsenm Loading

llvmbot commented May 10, 2024

shiltian May 10, 2024

Choose a reason for hiding this comment

jrbyrnes commented May 10, 2024

shiltian commented May 10, 2024

arsenm commented May 13, 2024

arsenm commented May 13, 2024

shiltian commented May 13, 2024

arsenm commented May 13, 2024

shiltian commented May 14, 2024

shiltian commented May 15, 2024

shiltian commented May 20, 2024

arsenm commented May 20, 2024

arsenm commented May 20, 2024

jrbyrnes commented May 20, 2024 • edited Loading

shiltian commented May 20, 2024 • edited Loading

jrbyrnes commented May 20, 2024

shiltian commented May 20, 2024

jrbyrnes commented May 20, 2024 • edited Loading

shiltian commented May 20, 2024

jrbyrnes left a comment

Choose a reason for hiding this comment

shiltian commented May 10, 2024 •

edited by arsenm

Loading

jrbyrnes commented May 20, 2024 •

edited

Loading

shiltian commented May 20, 2024 •

edited

Loading

jrbyrnes commented May 20, 2024 •

edited

Loading