From cf128305bdada3ffb34054813a855d80b3948025 Mon Sep 17 00:00:00 2001
From: Yingwei Zheng
Date: Wed, 22 May 2024 20:20:33 +0800
Subject: [PATCH] [SDAG] Don't treat ISD::SHL as a uniform binary operator in
 `ShrinkDemandedOp` (#92753)

In `TargetLowering::ShrinkDemandedOp`, the types of the LHS and RHS may
differ before legalization. In the original case, `VT` is `i64` and
`SmallVT` is `i32`, but the type of the RHS is `i8`, so invalid truncate
nodes are created. See the description of ISD::SHL for further
information:

> After legalization, the type of the shift amount is known to be
> TLI.getShiftAmountTy(). Before legalization, the shift amount can be
> any type, but care must be taken to ensure it is large enough.

https://github.com/llvm/llvm-project/blob/605ae4e93be8976095c7eedf5c08bfdb9ff71257/llvm/include/llvm/CodeGen/ISDOpcodes.h#L691-L712

This patch stops handling ISD::SHL in `TargetLowering::ShrinkDemandedOp`
and instead duplicates the logic in `TargetLowering::SimplifyDemandedBits`.
It also adds checks such as `isNarrowingProfitable` and
`isTypeDesirableForOp` to improve codegen on AArch64.

Fixes https://github.com/llvm/llvm-project/issues/92720.
---
 .../CodeGen/SelectionDAG/TargetLowering.cpp  |  32 ++++-
 llvm/test/CodeGen/AArch64/bitfield-insert.ll |  11 +-
 llvm/test/CodeGen/AArch64/trunc-to-tbl.ll    | 118 +++++++++---------
 llvm/test/CodeGen/X86/pr92720.ll             |  15 +++
 4 files changed, 105 insertions(+), 71 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/pr92720.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index b6d4050236b761..87c4c62522c1bc 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -587,6 +587,10 @@ bool TargetLowering::ShrinkDemandedOp(SDValue Op, unsigned BitWidth,
   if (VT.isVector())
     return false;
+  assert(Op.getOperand(0).getValueType().getScalarSizeInBits() == BitWidth &&
+         Op.getOperand(1).getValueType().getScalarSizeInBits() == BitWidth &&
+         "ShrinkDemandedOp only supports operands that have the same size!");
+
   // Don't do this if the node has another user, which may require the
   // full value.
   if (!Op.getNode()->hasOneUse())
     return false;
@@ -1832,11 +1836,33 @@ bool TargetLowering::SimplifyDemandedBits(
         }
       }
 
+      // TODO: Can we merge this fold with the one below?
       // Try shrinking the operation as long as the shift amount will still be
       // in range.
-      if ((ShAmt < DemandedBits.getActiveBits()) &&
-          ShrinkDemandedOp(Op, BitWidth, DemandedBits, TLO))
-        return true;
+      if (ShAmt < DemandedBits.getActiveBits() && !VT.isVector() &&
+          Op.getNode()->hasOneUse()) {
+        // Search for the smallest integer type with free casts to and from
+        // Op's type. For expedience, just check power-of-2 integer types.
+        unsigned DemandedSize = DemandedBits.getActiveBits();
+        for (unsigned SmallVTBits = llvm::bit_ceil(DemandedSize);
+             SmallVTBits < BitWidth; SmallVTBits = NextPowerOf2(SmallVTBits)) {
+          EVT SmallVT = EVT::getIntegerVT(*TLO.DAG.getContext(), SmallVTBits);
+          if (isNarrowingProfitable(VT, SmallVT) &&
+              isTypeDesirableForOp(ISD::SHL, SmallVT) &&
+              isTruncateFree(VT, SmallVT) && isZExtFree(SmallVT, VT) &&
+              (!TLO.LegalOperations() || isOperationLegal(ISD::SHL, SmallVT))) {
+            assert(DemandedSize <= SmallVTBits &&
+                   "Narrowed below demanded bits?");
+            // We found a type with free casts.
+            SDValue NarrowShl = TLO.DAG.getNode(
+                ISD::SHL, dl, SmallVT,
+                TLO.DAG.getNode(ISD::TRUNCATE, dl, SmallVT, Op.getOperand(0)),
+                TLO.DAG.getShiftAmountConstant(ShAmt, SmallVT, dl));
+            return TLO.CombineTo(
+                Op, TLO.DAG.getNode(ISD::ANY_EXTEND, dl, VT, NarrowShl));
+          }
+        }
+      }
 
       // Narrow shift to lower half - similar to ShrinkDemandedOp.
       // (shl i64:x, K) -> (i64 zero_extend (shl (i32 (trunc i64:x)), K))
diff --git a/llvm/test/CodeGen/AArch64/bitfield-insert.ll b/llvm/test/CodeGen/AArch64/bitfield-insert.ll
index 30b5e86c1e6dc8..14a594e8028d5d 100644
--- a/llvm/test/CodeGen/AArch64/bitfield-insert.ll
+++ b/llvm/test/CodeGen/AArch64/bitfield-insert.ll
@@ -193,11 +193,10 @@ define void @test_64bit_badmask(ptr %existing, ptr %new) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr x8, [x0]
 ; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    mov w10, #135 // =0x87
-; CHECK-NEXT:    mov w11, #664 // =0x298
-; CHECK-NEXT:    lsl w9, w9, #3
-; CHECK-NEXT:    and x8, x8, x10
-; CHECK-NEXT:    and x9, x9, x11
+; CHECK-NEXT:    mov w10, #664 // =0x298
+; CHECK-NEXT:    mov w11, #135 // =0x87
+; CHECK-NEXT:    and x9, x10, x9, lsl #3
+; CHECK-NEXT:    and x8, x8, x11
 ; CHECK-NEXT:    orr x8, x8, x9
 ; CHECK-NEXT:    str x8, [x0]
 ; CHECK-NEXT:    ret
@@ -579,7 +578,6 @@ define <2 x i32> @test_complex_type(ptr %addr, i64 %in, ptr %bf ) {
 define i64 @test_truncated_shift(i64 %x, i64 %y) {
 ; CHECK-LABEL: test_truncated_shift:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $w1 killed $w1 killed $x1 def $x1
 ; CHECK-NEXT:    bfi x0, x1, #25, #5
 ; CHECK-NEXT:    ret
 entry:
@@ -593,7 +591,6 @@ entry:
 define i64 @test_and_extended_shift_with_imm(i64 %0) {
 ; CHECK-LABEL: test_and_extended_shift_with_imm:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0 def $x0
 ; CHECK-NEXT:    ubfiz x0, x0, #7, #8
 ; CHECK-NEXT:    ret
   %2 = shl i64 %0, 7
diff --git a/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll b/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
index 18cd4cc2111a42..c4a58ba12dc6be 100644
--- a/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
@@ -571,29 +571,27 @@ define void @trunc_v8i19_to_v8i8_in_loop(ptr %A, ptr %dst) {
 ; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:  LBB5_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldp x10, x9, [x0]
-; CHECK-NEXT:    ldrb w13, [x0, #18]
-; CHECK-NEXT:    ldrh w14, [x0, #16]
+; CHECK-NEXT:    ldp x9, x10, [x0]
+; CHECK-NEXT:    ldrb w14, [x0, #18]
+; CHECK-NEXT:    ldrh w15, [x0, #16]
 ; CHECK-NEXT:    add x0, x0, #32
-; CHECK-NEXT:    ubfx x12, x9, #12, #20
-; CHECK-NEXT:    fmov s0, w10
-; CHECK-NEXT:    lsr x11, x10, #19
-; CHECK-NEXT:    lsr x15, x9, #31
-; CHECK-NEXT:    fmov s1, w12
-; CHECK-NEXT:    lsr x12, x9, #50
-; CHECK-NEXT:    mov.s v0[1], w11
-; CHECK-NEXT:    orr w11, w14, w13, lsl #16
-; CHECK-NEXT:    lsr x13, x10, #38
-; CHECK-NEXT:    lsr x10, x10, #57
-; CHECK-NEXT:    mov.s v1[1], w15
-; CHECK-NEXT:    orr w12, w12, w11, lsl #14
-; CHECK-NEXT:    orr w9, w10, w9, lsl #7
-; CHECK-NEXT:    lsr w10, w11, #5
-; CHECK-NEXT:    mov.s v0[2], w13
+; CHECK-NEXT:    ubfx x12, x10, #12, #20
+; CHECK-NEXT:    fmov s1, w9
+; CHECK-NEXT:    lsr x11, x9, #19
+; CHECK-NEXT:    lsr x13, x10, #31
+; CHECK-NEXT:    fmov s0, w12
+; CHECK-NEXT:    lsr x12, x9, #38
+; CHECK-NEXT:    extr x9, x10, x9, #57
+; CHECK-NEXT:    mov.s v1[1], w11
+; CHECK-NEXT:    orr x11, x15, x14, lsl #16
+; CHECK-NEXT:    mov.s v0[1], w13
+; CHECK-NEXT:    extr x13, x11, x10, #50
+; CHECK-NEXT:    ubfx x10, x11, #5, #27
 ; CHECK-NEXT:    mov.s v1[2], w12
-; CHECK-NEXT:    mov.s v0[3], w9
-; CHECK-NEXT:    mov.s v1[3], w10
-; CHECK-NEXT:    uzp1.8h v0, v0, v1
+; CHECK-NEXT:    mov.s v0[2], w13
+; CHECK-NEXT:    mov.s v1[3], w9
+; CHECK-NEXT:    mov.s v0[3], w10
+; CHECK-NEXT:    uzp1.8h v0, v1, v0
 ; CHECK-NEXT:    xtn.8b v0, v0
 ; CHECK-NEXT:    str d0, [x1, x8, lsl #3]
 ; CHECK-NEXT:    add x8, x8, #1
@@ -608,35 +606,34 @@ define void @trunc_v8i19_to_v8i8_in_loop(ptr %A, ptr %dst) {
 ; CHECK-BE-NEXT:  .LBB5_1: // %loop
 ; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-BE-NEXT:    ldp x10, x9, [x0]
-; CHECK-BE-NEXT:    ldrb w16, [x0, #18]
-; CHECK-BE-NEXT:    lsr x11, x9, #40
-; CHECK-BE-NEXT:    ubfx x12, x9, #33, #7
-; CHECK-BE-NEXT:    lsr x15, x10, #45
-; CHECK-BE-NEXT:    lsr x13, x10, #40
-; CHECK-BE-NEXT:    ubfx x14, x10, #26, #14
-; CHECK-BE-NEXT:    orr w11, w12, w11, lsl #7
-; CHECK-BE-NEXT:    ldrh w12, [x0, #16]
-; CHECK-BE-NEXT:    fmov s0, w15
-; CHECK-BE-NEXT:    orr w13, w14, w13, lsl #14
-; CHECK-BE-NEXT:    ubfx x14, x9, #14, #18
+; CHECK-BE-NEXT:    ldrh w16, [x0, #16]
+; CHECK-BE-NEXT:    ldrb w17, [x0, #18]
 ; CHECK-BE-NEXT:    add x0, x0, #32
+; CHECK-BE-NEXT:    lsl x11, x9, #24
+; CHECK-BE-NEXT:    lsr x12, x9, #40
+; CHECK-BE-NEXT:    lsr x13, x10, #45
+; CHECK-BE-NEXT:    lsl x14, x10, #24
+; CHECK-BE-NEXT:    lsr x15, x10, #40
+; CHECK-BE-NEXT:    extr x12, x12, x11, #57
-; CHECK-BE-NEXT:    fmov s1, w11
-; CHECK-BE-NEXT:    orr w11, w16, w12, lsl #8
-; CHECK-BE-NEXT:    lsl x12, x9, #24
-; CHECK-BE-NEXT:    mov v0.s[1], w13
+; CHECK-BE-NEXT:    fmov s0, w13
 ; CHECK-BE-NEXT:    ubfx x13, x10, #7, #25
+; CHECK-BE-NEXT:    extr x14, x15, x14, #50
+; CHECK-BE-NEXT:    ubfx x15, x9, #14, #18
 ; CHECK-BE-NEXT:    extr x9, x10, x9, #40
-; CHECK-BE-NEXT:    orr w12, w11, w12
-; CHECK-BE-NEXT:    mov v1.s[1], w14
-; CHECK-BE-NEXT:    lsr w12, w12, #19
+; CHECK-BE-NEXT:    fmov s1, w12
+; CHECK-BE-NEXT:    orr w12, w17, w16, lsl #8
+; CHECK-BE-NEXT:    mov v0.s[1], w14
 ; CHECK-BE-NEXT:    ubfx x9, x9, #12, #20
+; CHECK-BE-NEXT:    orr w11, w12, w11
+; CHECK-BE-NEXT:    mov v1.s[1], w15
+; CHECK-BE-NEXT:    lsr w11, w11, #19
 ; CHECK-BE-NEXT:    mov v0.s[2], w13
-; CHECK-BE-NEXT:    mov v1.s[2], w12
+; CHECK-BE-NEXT:    mov v1.s[2], w11
 ; CHECK-BE-NEXT:    mov v0.s[3], w9
 ; CHECK-BE-NEXT:    add x9, x1, x8, lsl #3
 ; CHECK-BE-NEXT:    add x8, x8, #1
 ; CHECK-BE-NEXT:    cmp x8, #1000
-; CHECK-BE-NEXT:    mov v1.s[3], w11
+; CHECK-BE-NEXT:    mov v1.s[3], w12
 ; CHECK-BE-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
 ; CHECK-BE-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-BE-NEXT:    st1 { v0.8b }, [x9]
@@ -650,35 +647,34 @@ define void @trunc_v8i19_to_v8i8_in_loop(ptr %A, ptr %dst) {
 ; CHECK-DISABLE-NEXT:  .LBB5_1: // %loop
 ; CHECK-DISABLE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-DISABLE-NEXT:    ldp x10, x9, [x0]
-; CHECK-DISABLE-NEXT:    ldrb w16, [x0, #18]
-; CHECK-DISABLE-NEXT:    lsr x11, x9, #40
-; CHECK-DISABLE-NEXT:    ubfx x12, x9, #33, #7
-; CHECK-DISABLE-NEXT:    lsr x15, x10, #45
-; CHECK-DISABLE-NEXT:    lsr x13, x10, #40
-; CHECK-DISABLE-NEXT:    ubfx x14, x10, #26, #14
-; CHECK-DISABLE-NEXT:    orr w11, w12, w11, lsl #7
-; CHECK-DISABLE-NEXT:    ldrh w12, [x0, #16]
-; CHECK-DISABLE-NEXT:    fmov s0, w15
-; CHECK-DISABLE-NEXT:    orr w13, w14, w13, lsl #14
-; CHECK-DISABLE-NEXT:    ubfx x14, x9, #14, #18
+; CHECK-DISABLE-NEXT:    ldrh w16, [x0, #16]
+; CHECK-DISABLE-NEXT:    ldrb w17, [x0, #18]
 ; CHECK-DISABLE-NEXT:    add x0, x0, #32
+; CHECK-DISABLE-NEXT:    lsl x11, x9, #24
+; CHECK-DISABLE-NEXT:    lsr x12, x9, #40
+; CHECK-DISABLE-NEXT:    lsr x13, x10, #45
+; CHECK-DISABLE-NEXT:    lsl x14, x10, #24
+; CHECK-DISABLE-NEXT:    lsr x15, x10, #40
+; CHECK-DISABLE-NEXT:    extr x12, x12, x11, #57
-; CHECK-DISABLE-NEXT:    fmov s1, w11
-; CHECK-DISABLE-NEXT:    orr w11, w16, w12, lsl #8
-; CHECK-DISABLE-NEXT:    lsl x12, x9, #24
-; CHECK-DISABLE-NEXT:    mov v0.s[1], w13
+; CHECK-DISABLE-NEXT:    fmov s0, w13
 ; CHECK-DISABLE-NEXT:    ubfx x13, x10, #7, #25
+; CHECK-DISABLE-NEXT:    extr x14, x15, x14, #50
+; CHECK-DISABLE-NEXT:    ubfx x15, x9, #14, #18
 ; CHECK-DISABLE-NEXT:    extr x9, x10, x9, #40
-; CHECK-DISABLE-NEXT:    orr w12, w11, w12
-; CHECK-DISABLE-NEXT:    mov v1.s[1], w14
-; CHECK-DISABLE-NEXT:    lsr w12, w12, #19
+; CHECK-DISABLE-NEXT:    fmov s1, w12
+; CHECK-DISABLE-NEXT:    orr w12, w17, w16, lsl #8
+; CHECK-DISABLE-NEXT:    mov v0.s[1], w14
 ; CHECK-DISABLE-NEXT:    ubfx x9, x9, #12, #20
+; CHECK-DISABLE-NEXT:    orr w11, w12, w11
+; CHECK-DISABLE-NEXT:    mov v1.s[1], w15
+; CHECK-DISABLE-NEXT:    lsr w11, w11, #19
 ; CHECK-DISABLE-NEXT:    mov v0.s[2], w13
-; CHECK-DISABLE-NEXT:    mov v1.s[2], w12
+; CHECK-DISABLE-NEXT:    mov v1.s[2], w11
 ; CHECK-DISABLE-NEXT:    mov v0.s[3], w9
 ; CHECK-DISABLE-NEXT:    add x9, x1, x8, lsl #3
 ; CHECK-DISABLE-NEXT:    add x8, x8, #1
 ; CHECK-DISABLE-NEXT:    cmp x8, #1000
-; CHECK-DISABLE-NEXT:    mov v1.s[3], w11
+; CHECK-DISABLE-NEXT:    mov v1.s[3], w12
 ; CHECK-DISABLE-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
 ; CHECK-DISABLE-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-DISABLE-NEXT:    st1 { v0.8b }, [x9]
diff --git a/llvm/test/CodeGen/X86/pr92720.ll b/llvm/test/CodeGen/X86/pr92720.ll
new file mode 100644
index 00000000000000..b2543c08328c7a
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr92720.ll
@@ -0,0 +1,15 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s
+
+; Make sure we don't crash when shrinking the shift amount before legalization.
+define i64 @pr92720(i64 %x) {
+; CHECK-LABEL: pr92720:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movabsq $8589934592, %rax # imm = 0x200000000
+; CHECK-NEXT:    retq
+  %or = or i64 %x, 255
+  %sub = sub i64 0, %or
+  %shl = shl i64 1, %sub
+  %sext = shl i64 %shl, 32
+  ret i64 %sext
+}