-
Notifications
You must be signed in to change notification settings - Fork 11.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AArch64] Add patterns for constructive splice. #113912
Conversation
This removes the RUN line for '-mattr=+sve -force-streaming-compatible' from sve-streaming-mode-fixed-length-int-div.ll, as there is no particular value in testing this and, with the changes to splice, keeping it would cause lots of extra CHECK lines for the SVE and SVE2 cases.
SVE2 added the constructive splice instruction, which takes a tuple. Even though the register allocator must ensure that the tuple uses consecutive registers, it's likely to be more efficient than using the destructive splice instruction when the first operand is reused.
@llvm/pr-subscribers-backend-aarch64 Author: Sander de Smalen (sdesmalen-arm) Changes: SVE2 adds the constructive splice instruction, which takes a tuple. Patch is 125.51 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/113912.diff 9 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index dc96b249c4e40c..65a5c2157ec498 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -3846,7 +3846,7 @@ let Predicates = [HasSVE2] in {
let Predicates = [HasSVE2orSME] in {
// SVE2 vector splice (constructive)
- defm SPLICE_ZPZZ : sve2_int_perm_splice_cons<"splice">;
+ defm SPLICE_ZPZZ : sve2_int_perm_splice_cons<"splice", AArch64splice>;
} // End HasSVE2orSME
let Predicates = [HasSVE2] in {
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 02ee0fe9244572..ea6c826382871e 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -7245,11 +7245,33 @@ class sve2_int_perm_splice_cons<bits<2> sz8_64, string asm,
let hasSideEffects = 0;
}
-multiclass sve2_int_perm_splice_cons<string asm> {
+multiclass sve2_int_perm_splice_cons<string asm, SDPatternOperator op> {
def _B : sve2_int_perm_splice_cons<0b00, asm, ZPR8, ZZ_b>;
def _H : sve2_int_perm_splice_cons<0b01, asm, ZPR16, ZZ_h>;
def _S : sve2_int_perm_splice_cons<0b10, asm, ZPR32, ZZ_s>;
def _D : sve2_int_perm_splice_cons<0b11, asm, ZPR64, ZZ_d>;
+
+ let AddedComplexity = 2 in {
+ foreach VT = [nxv16i8] in
+ def : Pat<(VT (op nxv16i1:$pred, VT:$zn1, VT:$zn2)),
+ (!cast<Instruction>(NAME # _B)
+ nxv16i1:$pred, (REG_SEQUENCE ZPR2, VT:$zn1, zsub0, VT:$zn2, zsub1))>;
+
+ foreach VT = [nxv8i16, nxv8f16, nxv8bf16] in
+ def : Pat<(VT (op nxv8i1:$pred, VT:$zn1, VT:$zn2)),
+ (!cast<Instruction>(NAME # _H)
+ nxv8i1:$pred, (REG_SEQUENCE ZPR2, VT:$zn1, zsub0, VT:$zn2, zsub1))>;
+
+ foreach VT = [nxv4i32, nxv4f16, nxv4f32, nxv4bf16] in
+ def : Pat<(VT (op nxv4i1:$pred, VT:$zn1, VT:$zn2)),
+ (!cast<Instruction>(NAME # _S)
+ nxv4i1:$pred, (REG_SEQUENCE ZPR2, VT:$zn1, zsub0, VT:$zn2, zsub1))>;
+
+ foreach VT = [nxv2i64, nxv2f16, nxv2f32, nxv2f64, nxv2bf16] in
+ def : Pat<(VT (op nxv2i1:$pred, VT:$zn1, VT:$zn2)),
+ (!cast<Instruction>(NAME # _D)
+ nxv2i1:$pred, (REG_SEQUENCE ZPR2, VT:$zn1, zsub0, VT:$zn2, zsub1))>;
+ }
}
class sve_int_perm_rev<bits<2> sz8_64, bits<2> opc, string asm,
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
index c1810c678ea522..6e2ecfca9e963e 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s
+; RUN: llc -mattr=+sve2 -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE2
+; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s --check-prefixes=CHECK,SME
; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
target triple = "aarch64-unknown-linux-gnu"
@@ -61,10 +61,10 @@ define <8 x i8> @concat_v8i8(<4 x i8> %op1, <4 x i8> %op2) {
define <16 x i8> @concat_v16i8(<8 x i8> %op1, <8 x i8> %op2) {
; CHECK-LABEL: concat_v16i8:
; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d1 killed $d1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: ptrue p0.b, vl8
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
-; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: splice z0.b, p0, { z0.b, z1.b }
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
;
@@ -172,10 +172,10 @@ define <4 x i16> @concat_v4i16(<2 x i16> %op1, <2 x i16> %op2) {
define <8 x i16> @concat_v8i16(<4 x i16> %op1, <4 x i16> %op2) {
; CHECK-LABEL: concat_v8i16:
; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d1 killed $d1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: ptrue p0.h, vl4
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
-; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: splice z0.h, p0, { z0.h, z1.h }
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
;
@@ -270,10 +270,10 @@ define <2 x i32> @concat_v2i32(<1 x i32> %op1, <1 x i32> %op2) {
define <4 x i32> @concat_v4i32(<2 x i32> %op1, <2 x i32> %op2) {
; CHECK-LABEL: concat_v4i32:
; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d1 killed $d1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: ptrue p0.s, vl2
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
-; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: splice z0.s, p0, { z0.s, z1.s }
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
;
@@ -340,10 +340,10 @@ define void @concat_v16i32(ptr %a, ptr %b, ptr %c) {
define <2 x i64> @concat_v2i64(<1 x i64> %op1, <1 x i64> %op2) {
; CHECK-LABEL: concat_v2i64:
; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d1 killed $d1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: ptrue p0.d, vl1
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
-; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: splice z0.d, p0, { z0.d, z1.d }
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
;
@@ -406,17 +406,33 @@ define void @concat_v8i64(ptr %a, ptr %b, ptr %c) {
;
define <4 x half> @concat_v4f16(<2 x half> %op1, <2 x half> %op2) {
-; CHECK-LABEL: concat_v4f16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT: mov z2.h, z1.h[1]
-; CHECK-NEXT: mov z3.h, z0.h[1]
-; CHECK-NEXT: zip1 z1.h, z1.h, z2.h
-; CHECK-NEXT: zip1 z0.h, z0.h, z3.h
-; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
-; CHECK-NEXT: ret
+; SVE2-LABEL: concat_v4f16:
+; SVE2: // %bb.0:
+; SVE2-NEXT: cnth x8
+; SVE2-NEXT: adrp x9, .LCPI15_0
+; SVE2-NEXT: adrp x10, .LCPI15_1
+; SVE2-NEXT: mov z2.h, w8
+; SVE2-NEXT: ldr q3, [x9, :lo12:.LCPI15_0]
+; SVE2-NEXT: ldr q4, [x10, :lo12:.LCPI15_1]
+; SVE2-NEXT: ptrue p0.h, vl8
+; SVE2-NEXT: // kill: def $d1 killed $d1 killed $z0_z1 def $z0_z1
+; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0_z1 def $z0_z1
+; SVE2-NEXT: mad z2.h, p0/m, z3.h, z4.h
+; SVE2-NEXT: tbl z0.h, { z0.h, z1.h }, z2.h
+; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0
+; SVE2-NEXT: ret
+;
+; SME-LABEL: concat_v4f16:
+; SME: // %bb.0:
+; SME-NEXT: // kill: def $d1 killed $d1 def $z1
+; SME-NEXT: // kill: def $d0 killed $d0 def $z0
+; SME-NEXT: mov z2.h, z1.h[1]
+; SME-NEXT: mov z3.h, z0.h[1]
+; SME-NEXT: zip1 z1.h, z1.h, z2.h
+; SME-NEXT: zip1 z0.h, z0.h, z3.h
+; SME-NEXT: zip1 z0.s, z0.s, z1.s
+; SME-NEXT: // kill: def $d0 killed $d0 killed $z0
+; SME-NEXT: ret
;
; NONEON-NOSVE-LABEL: concat_v4f16:
; NONEON-NOSVE: // %bb.0:
@@ -436,10 +452,10 @@ define <4 x half> @concat_v4f16(<2 x half> %op1, <2 x half> %op2) {
define <8 x half> @concat_v8f16(<4 x half> %op1, <4 x half> %op2) {
; CHECK-LABEL: concat_v8f16:
; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d1 killed $d1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: ptrue p0.h, vl4
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
-; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: splice z0.h, p0, { z0.h, z1.h }
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
;
@@ -534,10 +550,10 @@ define <2 x float> @concat_v2f32(<1 x float> %op1, <1 x float> %op2) {
define <4 x float> @concat_v4f32(<2 x float> %op1, <2 x float> %op2) {
; CHECK-LABEL: concat_v4f32:
; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d1 killed $d1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: ptrue p0.s, vl2
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
-; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: splice z0.s, p0, { z0.s, z1.s }
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
;
@@ -604,10 +620,10 @@ define void @concat_v16f32(ptr %a, ptr %b, ptr %c) {
define <2 x double> @concat_v2f64(<1 x double> %op1, <1 x double> %op2) {
; CHECK-LABEL: concat_v2f64:
; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d1 killed $d1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: ptrue p0.d, vl1
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
-; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: splice z0.d, p0, { z0.d, z1.d }
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
index f1771a753826cc..2282e74af5d006 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE
-; RUN: llc -mattr=+sve2 -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE2
-; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s --check-prefixes=CHECK,SVE2
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s --check-prefixes=SVE
+; RUN: llc -mattr=+sve2 -force-streaming-compatible < %s | FileCheck %s --check-prefixes=SVE2
+; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s --check-prefixes=SVE2
; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
@@ -842,16 +842,16 @@ define void @test_copysign_v4f32_v4f64(ptr %ap, ptr %bp) {
;
; SVE2-LABEL: test_copysign_v4f32_v4f64:
; SVE2: // %bb.0:
-; SVE2-NEXT: ldp q0, q1, [x1]
+; SVE2-NEXT: ldp q1, q0, [x1]
; SVE2-NEXT: ptrue p0.d
-; SVE2-NEXT: ldr q2, [x0]
-; SVE2-NEXT: fcvt z1.s, p0/m, z1.d
; SVE2-NEXT: fcvt z0.s, p0/m, z0.d
+; SVE2-NEXT: fcvt z1.s, p0/m, z1.d
; SVE2-NEXT: ptrue p0.s, vl2
-; SVE2-NEXT: uzp1 z1.s, z1.s, z1.s
-; SVE2-NEXT: uzp1 z0.s, z0.s, z0.s
-; SVE2-NEXT: splice z0.s, p0, z0.s, z1.s
+; SVE2-NEXT: uzp1 z3.s, z0.s, z0.s
+; SVE2-NEXT: uzp1 z2.s, z1.s, z1.s
; SVE2-NEXT: mov z1.s, #0x7fffffff
+; SVE2-NEXT: splice z0.s, p0, { z2.s, z3.s }
+; SVE2-NEXT: ldr q2, [x0]
; SVE2-NEXT: bsl z2.d, z2.d, z0.d, z1.d
; SVE2-NEXT: str q2, [x0]
; SVE2-NEXT: ret
@@ -1237,16 +1237,16 @@ define void @test_copysign_v8f16_v8f32(ptr %ap, ptr %bp) {
;
; SVE2-LABEL: test_copysign_v8f16_v8f32:
; SVE2: // %bb.0:
-; SVE2-NEXT: ldp q0, q1, [x1]
+; SVE2-NEXT: ldp q1, q0, [x1]
; SVE2-NEXT: ptrue p0.s
-; SVE2-NEXT: ldr q2, [x0]
-; SVE2-NEXT: fcvt z1.h, p0/m, z1.s
; SVE2-NEXT: fcvt z0.h, p0/m, z0.s
+; SVE2-NEXT: fcvt z1.h, p0/m, z1.s
; SVE2-NEXT: ptrue p0.h, vl4
-; SVE2-NEXT: uzp1 z1.h, z1.h, z1.h
-; SVE2-NEXT: uzp1 z0.h, z0.h, z0.h
-; SVE2-NEXT: splice z0.h, p0, z0.h, z1.h
+; SVE2-NEXT: uzp1 z3.h, z0.h, z0.h
+; SVE2-NEXT: uzp1 z2.h, z1.h, z1.h
; SVE2-NEXT: mov z1.h, #32767 // =0x7fff
+; SVE2-NEXT: splice z0.h, p0, { z2.h, z3.h }
+; SVE2-NEXT: ldr q2, [x0]
; SVE2-NEXT: bsl z2.d, z2.d, z0.d, z1.d
; SVE2-NEXT: str q2, [x0]
; SVE2-NEXT: ret
@@ -1349,5 +1349,3 @@ declare <8 x float> @llvm.copysign.v8f32(<8 x float> %a, <8 x float> %b) #0
declare <2 x double> @llvm.copysign.v2f64(<2 x double> %a, <2 x double> %b) #0
declare <4 x double> @llvm.copysign.v4f64(<4 x double> %a, <4 x double> %b) #0
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
index 516772b8ca6640..1fdcd4f8268708 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
@@ -1,7 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE
-; RUN: llc -mattr=+sve2 -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE2
-; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s --check-prefixes=CHECK,SVE2
+; RUN: llc -mattr=+sve2 -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK
+; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s --check-prefixes=CHECK
; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
target triple = "aarch64-unknown-linux-gnu"
@@ -26,19 +25,6 @@ define <4 x i8> @sdiv_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: sdiv_v4i8:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: shl v0.4h, v0.4h, #8
-; NEON-NOSVE-NEXT: shl v1.4h, v1.4h, #8
-; NEON-NOSVE-NEXT: ptrue p0.s, vl4
-; NEON-NOSVE-NEXT: sshr v0.4h, v0.4h, #8
-; NEON-NOSVE-NEXT: sshr v1.4h, v1.4h, #8
-; NEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0
-; NEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0
-; NEON-NOSVE-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
-; NEON-NOSVE-NEXT: xtn v0.4h, v0.4s
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: sdiv_v4i8:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: sub sp, sp, #32
@@ -85,27 +71,12 @@ define <8 x i8> @sdiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: ptrue p0.h, vl4
; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h
-; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT: splice z1.h, p0, z1.h, z0.h
-; CHECK-NEXT: uzp1 z0.b, z1.b, z1.b
+; CHECK-NEXT: uzp1 z2.h, z0.h, z0.h
+; CHECK-NEXT: splice z0.h, p0, { z1.h, z2.h }
+; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: sdiv_v8i8:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: sshll v1.8h, v1.8b, #0
-; NEON-NOSVE-NEXT: sshll v0.8h, v0.8b, #0
-; NEON-NOSVE-NEXT: ptrue p0.s, vl4
-; NEON-NOSVE-NEXT: sshll2 v2.4s, v1.8h, #0
-; NEON-NOSVE-NEXT: sshll2 v3.4s, v0.8h, #0
-; NEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0
-; NEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0
-; NEON-NOSVE-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
-; NEON-NOSVE-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
-; NEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v2.8h
-; NEON-NOSVE-NEXT: xtn v0.8b, v0.8h
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: sdiv_v8i8:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: sub sp, sp, #32
@@ -177,45 +148,21 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8
; CHECK-NEXT: sunpklo z1.s, z1.h
; CHECK-NEXT: sdivr z3.s, p0/m, z3.s, z5.s
-; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h
+; CHECK-NEXT: uzp1 z5.h, z2.h, z2.h
; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: uzp1 z1.h, z4.h, z4.h
; CHECK-NEXT: ptrue p0.h, vl4
-; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h
-; CHECK-NEXT: splice z1.h, p0, z1.h, z2.h
-; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT: splice z3.h, p0, z3.h, z0.h
-; CHECK-NEXT: uzp1 z0.b, z1.b, z1.b
+; CHECK-NEXT: uzp1 z1.h, z3.h, z3.h
+; CHECK-NEXT: uzp1 z2.h, z0.h, z0.h
+; CHECK-NEXT: splice z0.h, p0, { z4.h, z5.h }
+; CHECK-NEXT: splice z1.h, p0, { z1.h, z2.h }
; CHECK-NEXT: ptrue p0.b, vl8
-; CHECK-NEXT: uzp1 z1.b, z3.b, z3.b
-; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b
+; CHECK-NEXT: uzp1 z2.b, z0.b, z0.b
+; CHECK-NEXT: uzp1 z3.b, z1.b, z1.b
+; CHECK-NEXT: splice z0.b, p0, { z2.b, z3.b }
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: sdiv_v16i8:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: sshll2 v2.8h, v1.16b, #0
-; NEON-NOSVE-NEXT: sshll2 v3.8h, v0.16b, #0
-; NEON-NOSVE-NEXT: sshll v1.8h, v1.8b, #0
-; NEON-NOSVE-NEXT: sshll v0.8h, v0.8b, #0
-; NEON-NOSVE-NEXT: ptrue p0.s, vl4
-; NEON-NOSVE-NEXT: sshll2 v4.4s, v2.8h, #0
-; NEON-NOSVE-NEXT: sshll2 v5.4s, v3.8h, #0
-; NEON-NOSVE-NEXT: sshll v2.4s, v2.4h, #0
-; NEON-NOSVE-NEXT: sshll v3.4s, v3.4h, #0
-; NEON-NOSVE-NEXT: sdivr z4.s, p0/m, z4.s, z5.s
-; NEON-NOSVE-NEXT: sshll2 v5.4s, v0.8h, #0
-; NEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0
-; NEON-NOSVE-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
-; NEON-NOSVE-NEXT: sshll2 v3.4s, v1.8h, #0
-; NEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0
-; NEON-NOSVE-NEXT: sdivr z3.s, p0/m, z3.s, z5.s
-; NEON-NOSVE-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
-; NEON-NOSVE-NEXT: uzp1 v1.8h, v2.8h, v4.8h
-; NEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v3.8h
-; NEON-NOSVE-NEXT: uzp1 v0.16b, v0.16b, v1.16b
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: sdiv_v16i8:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]!
@@ -319,7 +266,6 @@ define void @sdiv_v32i8(ptr %a, ptr %b) {
; CHECK-NEXT: sunpklo z4.h, z2.b
; CHECK-NEXT: sunpklo z2.s, z3.h
; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8
-; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT: sunpklo z5.s, z4.h
; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8
; CHECK-NEXT: sunpklo z3.s, z3.h
@@ -328,7 +274,6 @@ define void @sdiv_v32i8(ptr %a, ptr %b) {
; CHECK-NEXT: ldr q5, [x0]
; CHECK-NEXT: sunpklo z16.h, z5.b
; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8
-; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h
; CHECK-NEXT: sunpklo z5.h, z5.b
; CHECK-NEXT: sunpklo z18.s, z16.h
; CHECK-NEXT: ext z16.b, z16.b, z16.b, #8
@@ -337,81 +282,36 @@ define void @sdiv_v32i8(ptr %a, ptr %b) {
; CHECK-NEXT: sunpklo z18.s, z5.h
; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8
; CHECK-NEXT: sunpklo z5.s, z5.h
-; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT: sdivr z7.s, p0/m, z7.s, z16.s
; CHECK-NEXT: sunpklo z16.s, z6.h
; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8
; CHECK-NEXT: sunpklo z6.s, z6.h
+; CHECK-NEXT: uzp1 z20.h, z17.h, z17.h
; CHECK-NEXT: sdivr z16.s, p0/m, z16.s, z18.s
+; CHECK-NEXT: uzp1 z18.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z19.h, z1.h, z1.h
+; CHECK-NEXT: uzp1 z21.h, z7.h, z7.h
; CHECK-NEXT: sdiv z5.s, p0/m, z5.s, z6.s
-; CHECK-NEXT: uzp1 z6.h, z7.h, z7.h
-; CHECK-NEXT: uzp1 z7.h, z16.h, z16.h
+; CHECK-NEXT: uzp1 z0.h, z16.h, z16.h
; CHECK-NEXT: sdivr z3.s, p0/m, z3.s, z4.s
-; CHECK-NEXT: uzp1 z4.h, z17.h, z17.h
; CHECK-NEXT: ptrue p0.h, vl4
-; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h
-; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h
-; CHECK-NEXT: splice z4.h, p0, z4.h, z6.h
-; CHECK-NEXT: splice z7.h, p0, z7.h, z5.h
-; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
-; CHECK-NEXT: uzp1 z1.b, z4.b, z4.b
-; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h
-; CHECK-NEXT: splice z2.h, p0, z2.h, z3.h
-; CHECK-NEXT: uzp1 z3.b, z7.b, z7.b
+; CHECK-NEXT: uzp1 z1.h, z5.h, z5.h
+; CHECK-NEXT: uzp1 z4.h, z2.h, z2.h
+; CHECK-NEXT: splice z2.h, p0, { z20.h, z21.h }
+; CHECK-NEXT: splice z0.h, p0, { z0.h, z1.h }
+; CHECK-NEXT: uzp1 z5.h, z3...
[truncated]
|
According to this: https://developer.arm.com/documentation/ddi0602/2024-09/SVE-Instructions/SPLICE--Splice-two-vectors-under-predicate-control- the splice instructions require both SVE2 and SME, it looks like the splice definition currently is guarded under "SVE2OrSME". |
The specification says:
If neither feature is implemented then it's a decode error. This implies either of the features is sufficient to enable the instruction (along with a streaming mode check when targeting SME without SVE). |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, you're right.
; SVE2-NEXT: ptrue p0.h, vl8 | ||
; SVE2-NEXT: // kill: def $d1 killed $d1 killed $z0_z1 def $z0_z1 | ||
; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0_z1 def $z0_z1 | ||
; SVE2-NEXT: mad z2.h, p0/m, z3.h, z4.h | ||
; SVE2-NEXT: tbl z0.h, { z0.h, z1.h }, z2.h | ||
; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This change (regression?) looks strange. Not necessarily blocking this but do you know why we see a difference here between SVE2 and SME?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for pointing that out; it's due to changing the RUN line from +sve to +sve2, not the code change for splice.
It seems the lowering of shufflevector goes through the generic expansion code for +sve (which ends up resulting in the relatively neat zips), but chooses to use the TBL when it is available (i.e. with +sve2), which unfortunately doesn't end up being as nice.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ok, can be a follow up fix then.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
(Not for this PR) but `ext` can also have a pattern added to use the constructive variant?
LLVM Buildbot has detected a new failure on a builder (full details are available at: https://lab.llvm.org/buildbot/#/builders/52/builds/3334). Here is the relevant piece of the build log for reference:
|
SVE2 adds the constructive splice instruction, which takes a tuple.
Even though the register allocator must ensure that the tuple uses
consecutive registers, it's likely to be more efficient
than using the destructive splice instruction when the first operand
is reused.