[RISCV] Always expand zero strided vp.strided.load #98901
Conversation
This patch makes zero strided VP loads always be expanded to a scalar load and splat, even if +optimized-zero-stride-load is present. Expanding them allows more .vx splat patterns to be matched, which is needed to prevent regressions in llvm#98111. If the feature is present, RISCVISelDAGToDAG will combine the expansion back to a zero strided load. The RV32 test diff also shows how we need to emit a zero strided load either way after expanding an SEW=64 strided load. We could maybe fix this in a later patch by not doing the expand if SEW > XLEN.
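For reference, a minimal LLVM IR sketch of the expansion, under stated assumptions: the function names and types below are illustrative and not taken from the patch, and the actual pass also inspects the mask and EVL before expanding, which is omitted here.

declare <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i32(ptr, i32, <4 x i1>, i32)

; Before: a zero-strided vp.strided.load reads the same address for every
; active lane (hypothetical example, not from the patch's tests).
define <4 x i32> @zero_stride_before(ptr %p, i32 %evl) {
  %v = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i32(ptr %p, i32 0, <4 x i1> splat (i1 true), i32 %evl)
  ret <4 x i32> %v
}

; After: load the scalar once and splat it, which exposes .vx splat patterns
; to instruction selection.
define <4 x i32> @zero_stride_after(ptr %p) {
  %s = load i32, ptr %p
  %head = insertelement <4 x i32> poison, i32 %s, i64 0
  %splat = shufflevector <4 x i32> %head, <4 x i32> poison, <4 x i32> zeroinitializer
  ret <4 x i32> %splat
}

With +optimized-zero-stride-load, instruction selection can still fold the splat of the loaded scalar back into a zero strided vlse, so the expansion is not a pessimization on those subtargets.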
@llvm/pr-subscribers-backend-risc-v
Author: Luke Lau (lukel97)
Changes
This patch makes zero strided VP loads always be expanded to a scalar load and splat, even if +optimized-zero-stride-load is present. Expanding them allows more .vx splat patterns to be matched, which is needed to prevent regressions in #98111. If the feature is present, RISCVISelDAGToDAG will combine the expansion back to a zero strided load. The RV32 test diff also shows how we need to emit a zero strided load either way after expanding an SEW=64 strided load. We could maybe fix this in a later patch by not doing the expand if SEW > XLEN.
Full diff: https://github.com/llvm/llvm-project/pull/98901.diff
3 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
index 35c46157c2eb9..b3f3dc6e2256c 100644
--- a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
+++ b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
@@ -163,10 +163,10 @@ bool RISCVCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
return true;
}
+// Always expand zero strided loads so we match more .vx splat patterns, even if
+// we have +optimized-zero-stride-loads. RISCVDAGToDAGISel::Select will convert
+// it back to a strided load if it's optimized.
bool RISCVCodeGenPrepare::expandVPStrideLoad(IntrinsicInst &II) {
- if (ST->hasOptimizedZeroStrideLoad())
- return false;
-
Value *BasePtr, *VL;
using namespace PatternMatch;
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll
index 41c7d1f5fd64c..95f853b77f18b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll
@@ -638,7 +638,7 @@ declare <33 x double> @llvm.experimental.vp.strided.load.v33f64.p0.i64(ptr, i64,
define <4 x i8> @zero_strided_unmasked_vpload_4i8_i8(ptr %ptr) {
; CHECK-OPT-LABEL: zero_strided_unmasked_vpload_4i8_i8:
; CHECK-OPT: # %bb.0:
-; CHECK-OPT-NEXT: vsetivli zero, 3, e8, mf4, ta, ma
+; CHECK-OPT-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
; CHECK-OPT-NEXT: vlse8.v v8, (a0), zero
; CHECK-OPT-NEXT: ret
;
@@ -657,7 +657,7 @@ define <4 x i8> @zero_strided_unmasked_vpload_4i8_i8(ptr %ptr) {
define <4 x half> @zero_strided_unmasked_vpload_4f16(ptr %ptr) {
; CHECK-OPT-LABEL: zero_strided_unmasked_vpload_4f16:
; CHECK-OPT: # %bb.0:
-; CHECK-OPT-NEXT: vsetivli zero, 3, e16, mf2, ta, ma
+; CHECK-OPT-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-OPT-NEXT: vlse16.v v8, (a0), zero
; CHECK-OPT-NEXT: ret
;
@@ -670,3 +670,30 @@ define <4 x half> @zero_strided_unmasked_vpload_4f16(ptr %ptr) {
%load = call <4 x half> @llvm.experimental.vp.strided.load.4f16.p0.i32(ptr %ptr, i32 0, <4 x i1> splat (i1 true), i32 3)
ret <4 x half> %load
}
+
+define <4 x i64> @zero_strided_vadd.vx(<4 x i64> %v, ptr %ptr) {
+; CHECK-RV32-LABEL: zero_strided_vadd.vx:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: addi sp, sp, -16
+; CHECK-RV32-NEXT: .cfi_def_cfa_offset 16
+; CHECK-RV32-NEXT: lw a1, 4(a0)
+; CHECK-RV32-NEXT: lw a0, 0(a0)
+; CHECK-RV32-NEXT: sw a1, 12(sp)
+; CHECK-RV32-NEXT: sw a0, 8(sp)
+; CHECK-RV32-NEXT: addi a0, sp, 8
+; CHECK-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-RV32-NEXT: vlse64.v v10, (a0), zero
+; CHECK-RV32-NEXT: vadd.vv v8, v8, v10
+; CHECK-RV32-NEXT: addi sp, sp, 16
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64-LABEL: zero_strided_vadd.vx:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ld a0, 0(a0)
+; CHECK-RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-RV64-NEXT: vadd.vx v8, v8, a0
+; CHECK-RV64-NEXT: ret
+ %load = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i32(ptr %ptr, i32 0, <4 x i1> splat (i1 true), i32 4)
+ %w = add <4 x i64> %v, %load
+ ret <4 x i64> %w
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll
index 6b8ded4914226..563da270272c2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll
@@ -822,3 +822,31 @@ define <vscale x 1 x half> @zero_strided_unmasked_vpload_nxv1f16(ptr %ptr) {
%load = call <vscale x 1 x half> @llvm.experimental.vp.strided.load.nxv1f16.p0.i32(ptr %ptr, i32 0, <vscale x 1 x i1> splat (i1 true), i32 4)
ret <vscale x 1 x half> %load
}
+
+define <vscale x 1 x i64> @zero_strided_vadd.vx(<vscale x 1 x i64> %v, ptr %ptr) {
+; CHECK-RV32-LABEL: zero_strided_vadd.vx:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: addi sp, sp, -16
+; CHECK-RV32-NEXT: .cfi_def_cfa_offset 16
+; CHECK-RV32-NEXT: lw a1, 4(a0)
+; CHECK-RV32-NEXT: lw a0, 0(a0)
+; CHECK-RV32-NEXT: sw a1, 12(sp)
+; CHECK-RV32-NEXT: sw a0, 8(sp)
+; CHECK-RV32-NEXT: addi a0, sp, 8
+; CHECK-RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-RV32-NEXT: vlse64.v v9, (a0), zero
+; CHECK-RV32-NEXT: vadd.vv v8, v8, v9
+; CHECK-RV32-NEXT: addi sp, sp, 16
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64-LABEL: zero_strided_vadd.vx:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ld a0, 0(a0)
+; CHECK-RV64-NEXT: vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-RV64-NEXT: vadd.vx v8, v8, a0
+; CHECK-RV64-NEXT: ret
+ %vscale = call i32 @llvm.vscale()
+ %load = call <vscale x 1 x i64> @llvm.experimental.vp.strided.load.nxv1i64.p0.i32(ptr %ptr, i32 0, <vscale x 1 x i1> splat (i1 true), i32 %vscale)
+ %w = add <vscale x 1 x i64> %v, %load
+ ret <vscale x 1 x i64> %w
+}
LGTM
It's unfortunate we lose the smaller EVL here, but this looks like the right pragmatic tradeoff at the moment.
My thinking is that we only lose the smaller EVL when we have optimised zero stride loads, so a larger EVL shouldn't matter anyway. Maybe aside from vl toggles.