-
Notifications
You must be signed in to change notification settings - Fork 11.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[DAGCombine] Fold ctlz_zero_undef(X << C) -> ctlz_zero_undef(X) - C
#100932
base: main
Are you sure you want to change the base?
Conversation
@llvm/pr-subscribers-llvm-selectiondag @llvm/pr-subscribers-backend-amdgpu Author: Yingwei Zheng (dtcxzyw). Changes: Alive2 proof: https://alive2.llvm.org/ce/z/Dv26cE. Fixes a codegen regression introduced by 69192e0. Full diff: https://github.com/llvm/llvm-project/pull/100932.diff — 4 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 060e66175d965..a446643f52052 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11133,6 +11133,18 @@ SDValue DAGCombiner::visitCTLZ_ZERO_UNDEF(SDNode *N) {
if (SDValue C =
DAG.FoldConstantArithmetic(ISD::CTLZ_ZERO_UNDEF, DL, VT, {N0}))
return C;
+
+ // Fold ctlz_zero_undef(X << C) --> ctlz_zero_undef(X) - C
+ SDValue A;
+ APInt C;
+ if (sd_match(N0, m_Shl(m_Value(A), m_ConstInt(C))) &&
+ DAG.computeKnownBits(A).countMinLeadingZeros() >= C.getLimitedValue()) {
+ SDValue NegC =
+ DAG.getConstant(-C.zextOrTrunc(VT.getScalarSizeInBits()), DL, VT);
+ return DAG.getNode(ISD::ADD, DL, VT,
+ DAG.getNode(ISD::CTLZ_ZERO_UNDEF, DL, VT, A), NegC);
+ }
+
return SDValue();
}
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 2168e7fe1dd28..7b2e5c8f6cca4 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -620,7 +620,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 16, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 15, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -629,11 +629,10 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
-; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: FFBH_UINT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
-; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: FFBH_UINT * T0.W, T0.X,
+; EG-NEXT: ADD_INT T0.W, PV.W, literal.x,
+; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: -24(nan), 3(4.203895e-45)
; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
@@ -724,7 +723,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 16, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 15, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -733,11 +732,10 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: FFBH_UINT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
-; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: FFBH_UINT * T0.W, T0.X,
+; EG-NEXT: ADD_INT T0.W, PV.W, literal.x,
+; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: -16(nan), 3(4.203895e-45)
; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
diff --git a/llvm/test/CodeGen/RISCV/rv32zbb.ll b/llvm/test/CodeGen/RISCV/rv32zbb.ll
index cb9fc6c16333e..4478111c35f08 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbb.ll
@@ -1417,3 +1417,50 @@ define i64 @orc_b_i64(i64 %a) {
%2 = mul nuw i64 %1, 255
ret i64 %2
}
+
+define i16 @count_activebits(i16 %x) nounwind {
+; RV32I-LABEL: count_activebits:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: andi a1, a0, 255
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: srli a0, a0, 25
+; RV32I-NEXT: or a0, a1, a0
+; RV32I-NEXT: srli a1, a0, 2
+; RV32I-NEXT: or a0, a0, a1
+; RV32I-NEXT: srli a1, a0, 4
+; RV32I-NEXT: or a0, a0, a1
+; RV32I-NEXT: not a0, a0
+; RV32I-NEXT: srli a1, a0, 1
+; RV32I-NEXT: lui a2, 5
+; RV32I-NEXT: addi a2, a2, 1365
+; RV32I-NEXT: and a1, a1, a2
+; RV32I-NEXT: sub a0, a0, a1
+; RV32I-NEXT: lui a1, 3
+; RV32I-NEXT: addi a1, a1, 819
+; RV32I-NEXT: and a2, a0, a1
+; RV32I-NEXT: srli a0, a0, 2
+; RV32I-NEXT: and a0, a0, a1
+; RV32I-NEXT: add a0, a2, a0
+; RV32I-NEXT: srli a1, a0, 4
+; RV32I-NEXT: add a0, a0, a1
+; RV32I-NEXT: andi a1, a0, 15
+; RV32I-NEXT: slli a0, a0, 20
+; RV32I-NEXT: srli a0, a0, 28
+; RV32I-NEXT: li a2, 16
+; RV32I-NEXT: sub a2, a2, a1
+; RV32I-NEXT: sub a0, a2, a0
+; RV32I-NEXT: ret
+;
+; RV32ZBB-LABEL: count_activebits:
+; RV32ZBB: # %bb.0: # %entry
+; RV32ZBB-NEXT: andi a0, a0, 255
+; RV32ZBB-NEXT: clz a0, a0
+; RV32ZBB-NEXT: li a1, 32
+; RV32ZBB-NEXT: sub a0, a1, a0
+; RV32ZBB-NEXT: ret
+entry:
+ %ext = and i16 %x, 255
+ %ctlz = call i16 @llvm.ctlz.i16(i16 %ext, i1 true)
+ %sub = sub nuw nsw i16 16, %ctlz
+ ret i16 %sub
+}
diff --git a/llvm/test/CodeGen/RISCV/rv64zbb.ll b/llvm/test/CodeGen/RISCV/rv64zbb.ll
index 6c354cc1b446b..3da9ee958221e 100644
--- a/llvm/test/CodeGen/RISCV/rv64zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbb.ll
@@ -1560,3 +1560,50 @@ define i64 @orc_b_i64(i64 %a) {
%2 = mul nuw i64 %1, 255
ret i64 %2
}
+
+define i16 @count_activebits(i16 %x) nounwind {
+; RV64I-LABEL: count_activebits:
+; RV64I: # %bb.0: # %entry
+; RV64I-NEXT: andi a1, a0, 255
+; RV64I-NEXT: slli a0, a0, 56
+; RV64I-NEXT: srli a0, a0, 57
+; RV64I-NEXT: or a0, a1, a0
+; RV64I-NEXT: srli a1, a0, 2
+; RV64I-NEXT: or a0, a0, a1
+; RV64I-NEXT: srli a1, a0, 4
+; RV64I-NEXT: or a0, a0, a1
+; RV64I-NEXT: not a0, a0
+; RV64I-NEXT: srli a1, a0, 1
+; RV64I-NEXT: lui a2, 5
+; RV64I-NEXT: addiw a2, a2, 1365
+; RV64I-NEXT: and a1, a1, a2
+; RV64I-NEXT: sub a0, a0, a1
+; RV64I-NEXT: lui a1, 3
+; RV64I-NEXT: addiw a1, a1, 819
+; RV64I-NEXT: and a2, a0, a1
+; RV64I-NEXT: srli a0, a0, 2
+; RV64I-NEXT: and a0, a0, a1
+; RV64I-NEXT: add a0, a2, a0
+; RV64I-NEXT: srli a1, a0, 4
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: andi a1, a0, 15
+; RV64I-NEXT: slli a0, a0, 52
+; RV64I-NEXT: srli a0, a0, 60
+; RV64I-NEXT: li a2, 16
+; RV64I-NEXT: sub a2, a2, a1
+; RV64I-NEXT: sub a0, a2, a0
+; RV64I-NEXT: ret
+;
+; RV64ZBB-LABEL: count_activebits:
+; RV64ZBB: # %bb.0: # %entry
+; RV64ZBB-NEXT: andi a0, a0, 255
+; RV64ZBB-NEXT: clz a0, a0
+; RV64ZBB-NEXT: li a1, 64
+; RV64ZBB-NEXT: sub a0, a1, a0
+; RV64ZBB-NEXT: ret
+entry:
+ %ext = and i16 %x, 255
+ %ctlz = call i16 @llvm.ctlz.i16(i16 %ext, i1 true)
+ %sub = sub nuw nsw i16 16, %ctlz
+ ret i16 %sub
+}
CodeGen/VE/Scalar/ctlz.ll failure?
// Fold ctlz_zero_undef(X << C) --> ctlz_zero_undef(X) - C
SDValue A;
APInt C;
if (sd_match(N0, m_Shl(m_Value(A), m_ConstInt(C))) &&
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
alive link has nuw flag but this doesn't check that
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is checked by `DAG.computeKnownBits(A).countMinLeadingZeros() >= C.getLimitedValue()`.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That check on the operand of the shift cannot see the flag on the shift itself
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The proof has been updated: https://alive2.llvm.org/ce/z/Gd37Tv.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
But can't you use the NUW in place of this check?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
IIRC DAGCombine doesn't infer poison-generating flags for new SDNodes.
Optimized lowered selection DAG: %bb.0 'count_activebits:entry'
SelectionDAG has 13 nodes:
t0: ch,glue = EntryToken
t2: i64,ch = CopyFromReg t0, Register:i64 %0
t3: i16 = truncate t2
t5: i16 = and t3, Constant:i16<255>
t6: i16 = ctlz_zero_undef t5
t8: i16 = sub nuw nsw Constant:i16<16>, t6
t9: i64 = any_extend t8
t11: ch,glue = CopyToReg t0, Register:i64 $x10, t9
t12: ch = RISCVISD::RET_GLUE t11, Register:i64 $x10, t11:1
Type-legalized selection DAG: %bb.0 'count_activebits:entry'
SelectionDAG has 13 nodes:
t0: ch,glue = EntryToken
t2: i64,ch = CopyFromReg t0, Register:i64 %0
t15: i64 = and t2, Constant:i64<255>
t17: i64 = shl t15, Constant:i64<48>
t18: i64 = ctlz_zero_undef t17
t19: i64 = sub Constant:i64<16>, t18
t11: ch,glue = CopyToReg t0, Register:i64 $x10, t19
t12: ch = RISCVISD::RET_GLUE t11, Register:i64 $x10, t11:1
Optimized type-legalized selection DAG: %bb.0 'count_activebits:entry'
SelectionDAG has 13 nodes:
t0: ch,glue = EntryToken
t2: i64,ch = CopyFromReg t0, Register:i64 %0
t15: i64 = and t2, Constant:i64<255>
t17: i64 = shl t15, Constant:i64<48>
t18: i64 = ctlz_zero_undef t17
t19: i64 = sub Constant:i64<16>, t18
t11: ch,glue = CopyToReg t0, Register:i64 $x10, t19
t12: ch = RISCVISD::RET_GLUE t11, Register:i64 $x10, t11:1
@@ -11133,6 +11133,18 @@ SDValue DAGCombiner::visitCTLZ_ZERO_UNDEF(SDNode *N) {
   if (SDValue C =
           DAG.FoldConstantArithmetic(ISD::CTLZ_ZERO_UNDEF, DL, VT, {N0}))
     return C;

  // Fold ctlz_zero_undef(X << C) --> ctlz_zero_undef(X) - C
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's not obvious to me that it's better. it's just trading a shift for an add or sub?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is it better to fold `C1 - ctlz_zero_undef(X << C2) --> C1 + C2 - ctlz_zero_undef(X)`?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You can do both. The win is probably that the parameter of the ctlz_zero_undef
got simpler.
Alive2: https://alive2.llvm.org/ce/z/Gd37Tv
Fixes a codegen regression introduced by 69192e0.
Closes dtcxzyw/llvm-codegen-benchmark#75.