[DAGCombine] Fold `ctlz_zero_undef(X << C) -> ctlz_zero_undef(X) - C` #100932

dtcxzyw · 2024-07-28T13:25:16Z

Alive2: https://alive2.llvm.org/ce/z/Gd37Tv

Fixes a codegen regression introduced by 69192e0.
Closes dtcxzyw/llvm-codegen-benchmark#75.

llvmbot · 2024-07-28T13:25:47Z

@llvm/pr-subscribers-llvm-selectiondag

@llvm/pr-subscribers-backend-amdgpu

Author: Yingwei Zheng (dtcxzyw)

Changes

Alive2: https://alive2.llvm.org/ce/z/Dv26cE

Fixes a codegen regression introduced by 69192e0.
Closes dtcxzyw/llvm-codegen-benchmark#75.

Full diff: https://github.com/llvm/llvm-project/pull/100932.diff

4 Files Affected:

(modified) llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (+12)
(modified) llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll (+10-12)
(modified) llvm/test/CodeGen/RISCV/rv32zbb.ll (+47)
(modified) llvm/test/CodeGen/RISCV/rv64zbb.ll (+47)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 060e66175d965..a446643f52052 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11133,6 +11133,18 @@ SDValue DAGCombiner::visitCTLZ_ZERO_UNDEF(SDNode *N) {
   if (SDValue C =
           DAG.FoldConstantArithmetic(ISD::CTLZ_ZERO_UNDEF, DL, VT, {N0}))
     return C;
+
+  // Fold ctlz_zero_undef(X << C) --> ctlz_zero_undef(X) - C
+  SDValue A;
+  APInt C;
+  if (sd_match(N0, m_Shl(m_Value(A), m_ConstInt(C))) &&
+      DAG.computeKnownBits(A).countMinLeadingZeros() >= C.getLimitedValue()) {
+    SDValue NegC =
+        DAG.getConstant(-C.zextOrTrunc(VT.getScalarSizeInBits()), DL, VT);
+    return DAG.getNode(ISD::ADD, DL, VT,
+                       DAG.getNode(ISD::CTLZ_ZERO_UNDEF, DL, VT, A), NegC);
+  }
+
   return SDValue();
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 2168e7fe1dd28..7b2e5c8f6cca4 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -620,7 +620,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @6
-; EG-NEXT:    ALU 16, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 15, @9, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    PAD
@@ -629,11 +629,10 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
 ; EG-NEXT:    ALU clause starting at 8:
 ; EG-NEXT:     MOV * T0.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 9:
-; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
-; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT:     FFBH_UINT T0.W, PV.W,
-; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
-; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
+; EG-NEXT:     ADD_INT T0.W, PV.W, literal.x,
+; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    -24(nan), 3(4.203895e-45)
 ; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
 ; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
 ; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
@@ -724,7 +723,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @6
-; EG-NEXT:    ALU 16, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 15, @9, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    PAD
@@ -733,11 +732,10 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
 ; EG-NEXT:    ALU clause starting at 8:
 ; EG-NEXT:     MOV * T0.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 9:
-; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     FFBH_UINT T0.W, PV.W,
-; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
-; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
+; EG-NEXT:     ADD_INT T0.W, PV.W, literal.x,
+; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    -16(nan), 3(4.203895e-45)
 ; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
 ; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
 ; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
diff --git a/llvm/test/CodeGen/RISCV/rv32zbb.ll b/llvm/test/CodeGen/RISCV/rv32zbb.ll
index cb9fc6c16333e..4478111c35f08 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbb.ll
@@ -1417,3 +1417,50 @@ define i64 @orc_b_i64(i64 %a) {
   %2 = mul nuw i64 %1, 255
   ret i64 %2
 }
+
+define i16 @count_activebits(i16 %x) nounwind {
+; RV32I-LABEL: count_activebits:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    andi a1, a0, 255
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    srli a0, a0, 25
+; RV32I-NEXT:    or a0, a1, a0
+; RV32I-NEXT:    srli a1, a0, 2
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    srli a1, a0, 4
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    srli a1, a0, 1
+; RV32I-NEXT:    lui a2, 5
+; RV32I-NEXT:    addi a2, a2, 1365
+; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    sub a0, a0, a1
+; RV32I-NEXT:    lui a1, 3
+; RV32I-NEXT:    addi a1, a1, 819
+; RV32I-NEXT:    and a2, a0, a1
+; RV32I-NEXT:    srli a0, a0, 2
+; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    add a0, a2, a0
+; RV32I-NEXT:    srli a1, a0, 4
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    andi a1, a0, 15
+; RV32I-NEXT:    slli a0, a0, 20
+; RV32I-NEXT:    srli a0, a0, 28
+; RV32I-NEXT:    li a2, 16
+; RV32I-NEXT:    sub a2, a2, a1
+; RV32I-NEXT:    sub a0, a2, a0
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: count_activebits:
+; RV32ZBB:       # %bb.0: # %entry
+; RV32ZBB-NEXT:    andi a0, a0, 255
+; RV32ZBB-NEXT:    clz a0, a0
+; RV32ZBB-NEXT:    li a1, 32
+; RV32ZBB-NEXT:    sub a0, a1, a0
+; RV32ZBB-NEXT:    ret
+entry:
+  %ext = and i16 %x, 255
+  %ctlz = call i16 @llvm.ctlz.i16(i16 %ext, i1 true)
+  %sub = sub nuw nsw i16 16, %ctlz
+  ret i16 %sub
+}
diff --git a/llvm/test/CodeGen/RISCV/rv64zbb.ll b/llvm/test/CodeGen/RISCV/rv64zbb.ll
index 6c354cc1b446b..3da9ee958221e 100644
--- a/llvm/test/CodeGen/RISCV/rv64zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbb.ll
@@ -1560,3 +1560,50 @@ define i64 @orc_b_i64(i64 %a) {
   %2 = mul nuw i64 %1, 255
   ret i64 %2
 }
+
+define i16 @count_activebits(i16 %x) nounwind {
+; RV64I-LABEL: count_activebits:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    andi a1, a0, 255
+; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srli a0, a0, 57
+; RV64I-NEXT:    or a0, a1, a0
+; RV64I-NEXT:    srli a1, a0, 2
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    srli a1, a0, 4
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    not a0, a0
+; RV64I-NEXT:    srli a1, a0, 1
+; RV64I-NEXT:    lui a2, 5
+; RV64I-NEXT:    addiw a2, a2, 1365
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    lui a1, 3
+; RV64I-NEXT:    addiw a1, a1, 819
+; RV64I-NEXT:    and a2, a0, a1
+; RV64I-NEXT:    srli a0, a0, 2
+; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    srli a1, a0, 4
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    andi a1, a0, 15
+; RV64I-NEXT:    slli a0, a0, 52
+; RV64I-NEXT:    srli a0, a0, 60
+; RV64I-NEXT:    li a2, 16
+; RV64I-NEXT:    sub a2, a2, a1
+; RV64I-NEXT:    sub a0, a2, a0
+; RV64I-NEXT:    ret
+;
+; RV64ZBB-LABEL: count_activebits:
+; RV64ZBB:       # %bb.0: # %entry
+; RV64ZBB-NEXT:    andi a0, a0, 255
+; RV64ZBB-NEXT:    clz a0, a0
+; RV64ZBB-NEXT:    li a1, 64
+; RV64ZBB-NEXT:    sub a0, a1, a0
+; RV64ZBB-NEXT:    ret
+entry:
+  %ext = and i16 %x, 255
+  %ctlz = call i16 @llvm.ctlz.i16(i16 %ext, i1 true)
+  %sub = sub nuw nsw i16 16, %ctlz
+  ret i16 %sub
+}

RKSimon · 2024-07-28T14:26:05Z

CodeGen/VE/Scalar/ctlz.ll failure?

arsenm · 2024-07-28T15:06:52Z

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

+  // Fold ctlz_zero_undef(X << C) --> ctlz_zero_undef(X) - C
+  SDValue A;
+  APInt C;
+  if (sd_match(N0, m_Shl(m_Value(A), m_ConstInt(C))) &&


alive link has nuw flag but this doesn't check that

It is checked by DAG.computeKnownBits(A).countMinLeadingZeros() >= C.getLimitedValue().

That check on the operand of the shift cannot see the flag on the shift itself

The proof has been updated: https://alive2.llvm.org/ce/z/Gd37Tv.

But can't you use the NUW in place of this check?

IIRC DAGCombine doesn't infer poison-generating flags for new SDNodes.

Optimized lowered selection DAG: %bb.0 'count_activebits:entry' SelectionDAG has 13 nodes: t0: ch,glue = EntryToken t2: i64,ch = CopyFromReg t0, Register:i64 %0 t3: i16 = truncate t2 t5: i16 = and t3, Constant:i16<255> t6: i16 = ctlz_zero_undef t5 t8: i16 = sub nuw nsw Constant:i16<16>, t6 t9: i64 = any_extend t8 t11: ch,glue = CopyToReg t0, Register:i64 $x10, t9 t12: ch = RISCVISD::RET_GLUE t11, Register:i64 $x10, t11:1

Type-legalized selection DAG: %bb.0 'count_activebits:entry' SelectionDAG has 13 nodes: t0: ch,glue = EntryToken t2: i64,ch = CopyFromReg t0, Register:i64 %0 t15: i64 = and t2, Constant:i64<255> t17: i64 = shl t15, Constant:i64<48> t18: i64 = ctlz_zero_undef t17 t19: i64 = sub Constant:i64<16>, t18 t11: ch,glue = CopyToReg t0, Register:i64 $x10, t19 t12: ch = RISCVISD::RET_GLUE t11, Register:i64 $x10, t11:1

Optimized type-legalized selection DAG: %bb.0 'count_activebits:entry' SelectionDAG has 13 nodes: t0: ch,glue = EntryToken t2: i64,ch = CopyFromReg t0, Register:i64 %0 t15: i64 = and t2, Constant:i64<255> t17: i64 = shl t15, Constant:i64<48> t18: i64 = ctlz_zero_undef t17 t19: i64 = sub Constant:i64<16>, t18 t11: ch,glue = CopyToReg t0, Register:i64 $x10, t19 t12: ch = RISCVISD::RET_GLUE t11, Register:i64 $x10, t11:1

arsenm · 2024-07-28T15:07:06Z

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

@@ -11133,6 +11133,18 @@ SDValue DAGCombiner::visitCTLZ_ZERO_UNDEF(SDNode *N) {
  if (SDValue C =
          DAG.FoldConstantArithmetic(ISD::CTLZ_ZERO_UNDEF, DL, VT, {N0}))
    return C;
+
+  // Fold ctlz_zero_undef(X << C) --> ctlz_zero_undef(X) - C


It's not obvious to me that it's better. it's just trading a shift for an add or sub?

Is it better to fold C1 - ctlz_zero_undef(X << C2) --> C1 + C2 - ctlz_zero_undef(X)?

You can do both. The win is probably that the parameter of the ctlz_zero_undef got simpler.

dtcxzyw added 2 commits July 28, 2024 20:54

[RISCV] Add pre-commit tests. NFC.

0cae58b

[DAGCombine] Fold ctlz(X << C) -> ctlz(X) - C

fc9ee38

dtcxzyw requested review from arsenm, RKSimon and topperc July 28, 2024 13:25

llvmbot added backend:AMDGPU llvm:SelectionDAG SelectionDAGISel as well labels Jul 28, 2024

arsenm reviewed Jul 28, 2024

View reviewed changes

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[DAGCombine] Fold `ctlz_zero_undef(X << C) -> ctlz_zero_undef(X) - C` #100932

[DAGCombine] Fold `ctlz_zero_undef(X << C) -> ctlz_zero_undef(X) - C` #100932

dtcxzyw commented Jul 28, 2024 •

edited

Loading

llvmbot commented Jul 28, 2024 •

edited

Loading

RKSimon commented Jul 28, 2024

arsenm Jul 28, 2024

dtcxzyw Jul 28, 2024

arsenm Jul 29, 2024

dtcxzyw Jul 29, 2024

arsenm Jul 29, 2024

dtcxzyw Jul 29, 2024

arsenm Jul 28, 2024

dtcxzyw Jul 28, 2024

tschuett Jul 28, 2024

[DAGCombine] Fold ctlz_zero_undef(X << C) -> ctlz_zero_undef(X) - C #100932

Are you sure you want to change the base?

[DAGCombine] Fold ctlz_zero_undef(X << C) -> ctlz_zero_undef(X) - C #100932

Conversation

dtcxzyw commented Jul 28, 2024 • edited Loading

llvmbot commented Jul 28, 2024 • edited Loading

RKSimon commented Jul 28, 2024

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

[DAGCombine] Fold `ctlz_zero_undef(X << C) -> ctlz_zero_undef(X) - C` #100932

[DAGCombine] Fold `ctlz_zero_undef(X << C) -> ctlz_zero_undef(X) - C` #100932

dtcxzyw commented Jul 28, 2024 •

edited

Loading

llvmbot commented Jul 28, 2024 •

edited

Loading