Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DAGCombine] Fold ctlz_zero_undef(X << C) -> ctlz_zero_undef(X) - C #100932

Open
wants to merge 2 commits into
base: main
Choose a base branch
from

Conversation

dtcxzyw
Copy link
Member

@dtcxzyw dtcxzyw commented Jul 28, 2024

Alive2: https://alive2.llvm.org/ce/z/Gd37Tv

Fixes a codegen regression introduced by 69192e0.
Closes dtcxzyw/llvm-codegen-benchmark#75.

@llvmbot llvmbot added backend:AMDGPU llvm:SelectionDAG SelectionDAGISel as well labels Jul 28, 2024
@llvmbot
Copy link
Collaborator

llvmbot commented Jul 28, 2024

@llvm/pr-subscribers-llvm-selectiondag

@llvm/pr-subscribers-backend-amdgpu

Author: Yingwei Zheng (dtcxzyw)

Changes

Alive2: https://alive2.llvm.org/ce/z/Dv26cE

Fixes a codegen regression introduced by 69192e0.
Closes dtcxzyw/llvm-codegen-benchmark#75.


Full diff: https://github.com/llvm/llvm-project/pull/100932.diff

4 Files Affected:

  • (modified) llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (+12)
  • (modified) llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll (+10-12)
  • (modified) llvm/test/CodeGen/RISCV/rv32zbb.ll (+47)
  • (modified) llvm/test/CodeGen/RISCV/rv64zbb.ll (+47)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 060e66175d965..a446643f52052 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11133,6 +11133,18 @@ SDValue DAGCombiner::visitCTLZ_ZERO_UNDEF(SDNode *N) {
   if (SDValue C =
           DAG.FoldConstantArithmetic(ISD::CTLZ_ZERO_UNDEF, DL, VT, {N0}))
     return C;
+
+  // Fold ctlz_zero_undef(X << C) --> ctlz_zero_undef(X) - C
+  SDValue A;
+  APInt C;
+  if (sd_match(N0, m_Shl(m_Value(A), m_ConstInt(C))) &&
+      DAG.computeKnownBits(A).countMinLeadingZeros() >= C.getLimitedValue()) {
+    SDValue NegC =
+        DAG.getConstant(-C.zextOrTrunc(VT.getScalarSizeInBits()), DL, VT);
+    return DAG.getNode(ISD::ADD, DL, VT,
+                       DAG.getNode(ISD::CTLZ_ZERO_UNDEF, DL, VT, A), NegC);
+  }
+
   return SDValue();
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 2168e7fe1dd28..7b2e5c8f6cca4 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -620,7 +620,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @6
-; EG-NEXT:    ALU 16, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 15, @9, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    PAD
@@ -629,11 +629,10 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
 ; EG-NEXT:    ALU clause starting at 8:
 ; EG-NEXT:     MOV * T0.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 9:
-; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
-; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT:     FFBH_UINT T0.W, PV.W,
-; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
-; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
+; EG-NEXT:     ADD_INT T0.W, PV.W, literal.x,
+; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    -24(nan), 3(4.203895e-45)
 ; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
 ; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
 ; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
@@ -724,7 +723,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @6
-; EG-NEXT:    ALU 16, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 15, @9, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    PAD
@@ -733,11 +732,10 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
 ; EG-NEXT:    ALU clause starting at 8:
 ; EG-NEXT:     MOV * T0.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 9:
-; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     FFBH_UINT T0.W, PV.W,
-; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
-; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
+; EG-NEXT:     ADD_INT T0.W, PV.W, literal.x,
+; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    -16(nan), 3(4.203895e-45)
 ; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
 ; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
 ; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
diff --git a/llvm/test/CodeGen/RISCV/rv32zbb.ll b/llvm/test/CodeGen/RISCV/rv32zbb.ll
index cb9fc6c16333e..4478111c35f08 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbb.ll
@@ -1417,3 +1417,50 @@ define i64 @orc_b_i64(i64 %a) {
   %2 = mul nuw i64 %1, 255
   ret i64 %2
 }
+
+define i16 @count_activebits(i16 %x) nounwind {
+; RV32I-LABEL: count_activebits:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    andi a1, a0, 255
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    srli a0, a0, 25
+; RV32I-NEXT:    or a0, a1, a0
+; RV32I-NEXT:    srli a1, a0, 2
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    srli a1, a0, 4
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    srli a1, a0, 1
+; RV32I-NEXT:    lui a2, 5
+; RV32I-NEXT:    addi a2, a2, 1365
+; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    sub a0, a0, a1
+; RV32I-NEXT:    lui a1, 3
+; RV32I-NEXT:    addi a1, a1, 819
+; RV32I-NEXT:    and a2, a0, a1
+; RV32I-NEXT:    srli a0, a0, 2
+; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    add a0, a2, a0
+; RV32I-NEXT:    srli a1, a0, 4
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    andi a1, a0, 15
+; RV32I-NEXT:    slli a0, a0, 20
+; RV32I-NEXT:    srli a0, a0, 28
+; RV32I-NEXT:    li a2, 16
+; RV32I-NEXT:    sub a2, a2, a1
+; RV32I-NEXT:    sub a0, a2, a0
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: count_activebits:
+; RV32ZBB:       # %bb.0: # %entry
+; RV32ZBB-NEXT:    andi a0, a0, 255
+; RV32ZBB-NEXT:    clz a0, a0
+; RV32ZBB-NEXT:    li a1, 32
+; RV32ZBB-NEXT:    sub a0, a1, a0
+; RV32ZBB-NEXT:    ret
+entry:
+  %ext = and i16 %x, 255
+  %ctlz = call i16 @llvm.ctlz.i16(i16 %ext, i1 true)
+  %sub = sub nuw nsw i16 16, %ctlz
+  ret i16 %sub
+}
diff --git a/llvm/test/CodeGen/RISCV/rv64zbb.ll b/llvm/test/CodeGen/RISCV/rv64zbb.ll
index 6c354cc1b446b..3da9ee958221e 100644
--- a/llvm/test/CodeGen/RISCV/rv64zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbb.ll
@@ -1560,3 +1560,50 @@ define i64 @orc_b_i64(i64 %a) {
   %2 = mul nuw i64 %1, 255
   ret i64 %2
 }
+
+define i16 @count_activebits(i16 %x) nounwind {
+; RV64I-LABEL: count_activebits:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    andi a1, a0, 255
+; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srli a0, a0, 57
+; RV64I-NEXT:    or a0, a1, a0
+; RV64I-NEXT:    srli a1, a0, 2
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    srli a1, a0, 4
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    not a0, a0
+; RV64I-NEXT:    srli a1, a0, 1
+; RV64I-NEXT:    lui a2, 5
+; RV64I-NEXT:    addiw a2, a2, 1365
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    lui a1, 3
+; RV64I-NEXT:    addiw a1, a1, 819
+; RV64I-NEXT:    and a2, a0, a1
+; RV64I-NEXT:    srli a0, a0, 2
+; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    srli a1, a0, 4
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    andi a1, a0, 15
+; RV64I-NEXT:    slli a0, a0, 52
+; RV64I-NEXT:    srli a0, a0, 60
+; RV64I-NEXT:    li a2, 16
+; RV64I-NEXT:    sub a2, a2, a1
+; RV64I-NEXT:    sub a0, a2, a0
+; RV64I-NEXT:    ret
+;
+; RV64ZBB-LABEL: count_activebits:
+; RV64ZBB:       # %bb.0: # %entry
+; RV64ZBB-NEXT:    andi a0, a0, 255
+; RV64ZBB-NEXT:    clz a0, a0
+; RV64ZBB-NEXT:    li a1, 64
+; RV64ZBB-NEXT:    sub a0, a1, a0
+; RV64ZBB-NEXT:    ret
+entry:
+  %ext = and i16 %x, 255
+  %ctlz = call i16 @llvm.ctlz.i16(i16 %ext, i1 true)
+  %sub = sub nuw nsw i16 16, %ctlz
+  ret i16 %sub
+}

@RKSimon
Copy link
Collaborator

RKSimon commented Jul 28, 2024

CodeGen/VE/Scalar/ctlz.ll failure?

// Fold ctlz_zero_undef(X << C) --> ctlz_zero_undef(X) - C
SDValue A;
APInt C;
if (sd_match(N0, m_Shl(m_Value(A), m_ConstInt(C))) &&
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

alive link has nuw flag but this doesn't check that

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is checked by DAG.computeKnownBits(A).countMinLeadingZeros() >= C.getLimitedValue().

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That check on the operand of the shift cannot see the flag on the shift itself

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The proof has been updated: https://alive2.llvm.org/ce/z/Gd37Tv.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But can't you use the NUW in place of this check?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIRC DAGCombine doesn't infer poison-generating flags for new SDNodes.

Optimized lowered selection DAG: %bb.0 'count_activebits:entry'
SelectionDAG has 13 nodes:
  t0: ch,glue = EntryToken
              t2: i64,ch = CopyFromReg t0, Register:i64 %0
            t3: i16 = truncate t2
          t5: i16 = and t3, Constant:i16<255>
        t6: i16 = ctlz_zero_undef t5
      t8: i16 = sub nuw nsw Constant:i16<16>, t6
    t9: i64 = any_extend t8
  t11: ch,glue = CopyToReg t0, Register:i64 $x10, t9
  t12: ch = RISCVISD::RET_GLUE t11, Register:i64 $x10, t11:1
Type-legalized selection DAG: %bb.0 'count_activebits:entry'
SelectionDAG has 13 nodes:
  t0: ch,glue = EntryToken
            t2: i64,ch = CopyFromReg t0, Register:i64 %0
          t15: i64 = and t2, Constant:i64<255>
        t17: i64 = shl t15, Constant:i64<48>
      t18: i64 = ctlz_zero_undef t17
    t19: i64 = sub Constant:i64<16>, t18
  t11: ch,glue = CopyToReg t0, Register:i64 $x10, t19
  t12: ch = RISCVISD::RET_GLUE t11, Register:i64 $x10, t11:1
Optimized type-legalized selection DAG: %bb.0 'count_activebits:entry'
SelectionDAG has 13 nodes:
  t0: ch,glue = EntryToken
            t2: i64,ch = CopyFromReg t0, Register:i64 %0
          t15: i64 = and t2, Constant:i64<255>
        t17: i64 = shl t15, Constant:i64<48>
      t18: i64 = ctlz_zero_undef t17
    t19: i64 = sub Constant:i64<16>, t18
  t11: ch,glue = CopyToReg t0, Register:i64 $x10, t19
  t12: ch = RISCVISD::RET_GLUE t11, Register:i64 $x10, t11:1

@@ -11133,6 +11133,18 @@ SDValue DAGCombiner::visitCTLZ_ZERO_UNDEF(SDNode *N) {
if (SDValue C =
DAG.FoldConstantArithmetic(ISD::CTLZ_ZERO_UNDEF, DL, VT, {N0}))
return C;

// Fold ctlz_zero_undef(X << C) --> ctlz_zero_undef(X) - C
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's not obvious to me that it's better. it's just trading a shift for an add or sub?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it better to fold C1 - ctlz_zero_undef(X << C2) --> C1 + C2 - ctlz_zero_undef(X)?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can do both. The win is probably that the parameter of the ctlz_zero_undef got simpler.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
backend:AMDGPU llvm:SelectionDAG SelectionDAGISel as well
Projects
None yet
Development

Successfully merging this pull request may close these issues.

Update diff July 8th 2024, 1:50:32 pm
5 participants