Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[AArch64] Add patterns for constructive splice. #113912

Merged
merged 2 commits into from
Oct 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -3846,7 +3846,7 @@ let Predicates = [HasSVE2] in {

let Predicates = [HasSVE2orSME] in {
// SVE2 vector splice (constructive)
defm SPLICE_ZPZZ : sve2_int_perm_splice_cons<"splice">;
defm SPLICE_ZPZZ : sve2_int_perm_splice_cons<"splice", AArch64splice>;
} // End HasSVE2orSME

let Predicates = [HasSVE2] in {
Expand Down
24 changes: 23 additions & 1 deletion llvm/lib/Target/AArch64/SVEInstrFormats.td
Original file line number Diff line number Diff line change
Expand Up @@ -7245,11 +7245,33 @@ class sve2_int_perm_splice_cons<bits<2> sz8_64, string asm,
let hasSideEffects = 0;
}

multiclass sve2_int_perm_splice_cons<string asm> {
// Defines the constructive SPLICE instruction for each element size
// (_B, _H, _S, _D) and the selection patterns that map `op` onto them.
//   asm - assembly mnemonic forwarded to every instruction definition.
//   op  - pattern operator matched in the form (op pred, zn1, zn2).
multiclass sve2_int_perm_splice_cons<string asm, SDPatternOperator op> {
// One instruction per element size; the 2-bit value is the size field
// (0b00 = byte ... 0b11 = doubleword), paired with the matching ZPR register
// operand and ZZ_* two-register list operand.
def _B : sve2_int_perm_splice_cons<0b00, asm, ZPR8, ZZ_b>;
def _H : sve2_int_perm_splice_cons<0b01, asm, ZPR16, ZZ_h>;
def _S : sve2_int_perm_splice_cons<0b10, asm, ZPR32, ZZ_s>;
def _D : sve2_int_perm_splice_cons<0b11, asm, ZPR64, ZZ_d>;

// NOTE(review): AddedComplexity presumably makes these patterns win over
// other patterns that match the same `op` (e.g. the destructive SPLICE
// form) when this instruction is available -- confirm against the other
// users of `op`.
let AddedComplexity = 2 in {
// In every pattern below the two source vectors are combined into a single
// ZPR2 register tuple with REG_SEQUENCE ($zn1 in zsub0, $zn2 in zsub1),
// because the instruction takes its sources as a register list.

// Byte elements: nxv16i1 predicate, selects the _B instruction.
foreach VT = [nxv16i8] in
def : Pat<(VT (op nxv16i1:$pred, VT:$zn1, VT:$zn2)),
(!cast<Instruction>(NAME # _B)
nxv16i1:$pred, (REG_SEQUENCE ZPR2, VT:$zn1, zsub0, VT:$zn2, zsub1))>;

// Vector types governed by an nxv8i1 predicate: selects the _H instruction.
foreach VT = [nxv8i16, nxv8f16, nxv8bf16] in
def : Pat<(VT (op nxv8i1:$pred, VT:$zn1, VT:$zn2)),
(!cast<Instruction>(NAME # _H)
nxv8i1:$pred, (REG_SEQUENCE ZPR2, VT:$zn1, zsub0, VT:$zn2, zsub1))>;

// Vector types governed by an nxv4i1 predicate (including the nxv4f16 and
// nxv4bf16 types): selects the _S instruction.
foreach VT = [nxv4i32, nxv4f16, nxv4f32, nxv4bf16] in
def : Pat<(VT (op nxv4i1:$pred, VT:$zn1, VT:$zn2)),
(!cast<Instruction>(NAME # _S)
nxv4i1:$pred, (REG_SEQUENCE ZPR2, VT:$zn1, zsub0, VT:$zn2, zsub1))>;

// Vector types governed by an nxv2i1 predicate (including the nxv2f16,
// nxv2f32 and nxv2bf16 types): selects the _D instruction.
foreach VT = [nxv2i64, nxv2f16, nxv2f32, nxv2f64, nxv2bf16] in
def : Pat<(VT (op nxv2i1:$pred, VT:$zn1, VT:$zn2)),
(!cast<Instruction>(NAME # _D)
nxv2i1:$pred, (REG_SEQUENCE ZPR2, VT:$zn1, zsub0, VT:$zn2, zsub1))>;
}
}

class sve_int_perm_rev<bits<2> sz8_64, bits<2> opc, string asm,
Expand Down
84 changes: 50 additions & 34 deletions llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s
; RUN: llc -mattr=+sve2 -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE2
; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s --check-prefixes=CHECK,SME
; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE

target triple = "aarch64-unknown-linux-gnu"
Expand Down Expand Up @@ -61,10 +61,10 @@ define <8 x i8> @concat_v8i8(<4 x i8> %op1, <4 x i8> %op2) {
define <16 x i8> @concat_v16i8(<8 x i8> %op1, <8 x i8> %op2) {
; CHECK-LABEL: concat_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: ptrue p0.b, vl8
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: splice z0.b, p0, { z0.b, z1.b }
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
;
Expand Down Expand Up @@ -172,10 +172,10 @@ define <4 x i16> @concat_v4i16(<2 x i16> %op1, <2 x i16> %op2) {
define <8 x i16> @concat_v8i16(<4 x i16> %op1, <4 x i16> %op2) {
; CHECK-LABEL: concat_v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: ptrue p0.h, vl4
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: splice z0.h, p0, { z0.h, z1.h }
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
;
Expand Down Expand Up @@ -270,10 +270,10 @@ define <2 x i32> @concat_v2i32(<1 x i32> %op1, <1 x i32> %op2) {
define <4 x i32> @concat_v4i32(<2 x i32> %op1, <2 x i32> %op2) {
; CHECK-LABEL: concat_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: ptrue p0.s, vl2
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: splice z0.s, p0, { z0.s, z1.s }
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
;
Expand Down Expand Up @@ -340,10 +340,10 @@ define void @concat_v16i32(ptr %a, ptr %b, ptr %c) {
define <2 x i64> @concat_v2i64(<1 x i64> %op1, <1 x i64> %op2) {
; CHECK-LABEL: concat_v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: ptrue p0.d, vl1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: splice z0.d, p0, { z0.d, z1.d }
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
;
Expand Down Expand Up @@ -406,17 +406,33 @@ define void @concat_v8i64(ptr %a, ptr %b, ptr %c) {
;

define <4 x half> @concat_v4f16(<2 x half> %op1, <2 x half> %op2) {
; CHECK-LABEL: concat_v4f16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: mov z2.h, z1.h[1]
; CHECK-NEXT: mov z3.h, z0.h[1]
; CHECK-NEXT: zip1 z1.h, z1.h, z2.h
; CHECK-NEXT: zip1 z0.h, z0.h, z3.h
; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
; SVE2-LABEL: concat_v4f16:
; SVE2: // %bb.0:
; SVE2-NEXT: cnth x8
; SVE2-NEXT: adrp x9, .LCPI15_0
; SVE2-NEXT: adrp x10, .LCPI15_1
; SVE2-NEXT: mov z2.h, w8
; SVE2-NEXT: ldr q3, [x9, :lo12:.LCPI15_0]
; SVE2-NEXT: ldr q4, [x10, :lo12:.LCPI15_1]
; SVE2-NEXT: ptrue p0.h, vl8
; SVE2-NEXT: // kill: def $d1 killed $d1 killed $z0_z1 def $z0_z1
; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0_z1 def $z0_z1
; SVE2-NEXT: mad z2.h, p0/m, z3.h, z4.h
; SVE2-NEXT: tbl z0.h, { z0.h, z1.h }, z2.h
; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0
Comment on lines +417 to +422
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This change (a regression?) looks strange. It does not necessarily block this patch, but do you know why we see a difference here between SVE2 and SME?

Copy link
Collaborator Author

@sdesmalen-arm sdesmalen-arm Oct 29, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for pointing this out; that's due to changing the RUN line from +sve to +sve2, not the code change for splice.

It seems that with +sve the lowering of shufflevector goes through the generic expansion code (which ends up producing the relatively neat zips), whereas it chooses to use TBL when that is available (i.e. with +sve2), which unfortunately doesn't end up being as nice.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, that can be a follow-up fix then.

; SVE2-NEXT: ret
;
; SME-LABEL: concat_v4f16:
; SME: // %bb.0:
; SME-NEXT: // kill: def $d1 killed $d1 def $z1
; SME-NEXT: // kill: def $d0 killed $d0 def $z0
; SME-NEXT: mov z2.h, z1.h[1]
; SME-NEXT: mov z3.h, z0.h[1]
; SME-NEXT: zip1 z1.h, z1.h, z2.h
; SME-NEXT: zip1 z0.h, z0.h, z3.h
; SME-NEXT: zip1 z0.s, z0.s, z1.s
; SME-NEXT: // kill: def $d0 killed $d0 killed $z0
; SME-NEXT: ret
;
; NONEON-NOSVE-LABEL: concat_v4f16:
; NONEON-NOSVE: // %bb.0:
Expand All @@ -436,10 +452,10 @@ define <4 x half> @concat_v4f16(<2 x half> %op1, <2 x half> %op2) {
define <8 x half> @concat_v8f16(<4 x half> %op1, <4 x half> %op2) {
; CHECK-LABEL: concat_v8f16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: ptrue p0.h, vl4
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: splice z0.h, p0, { z0.h, z1.h }
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
;
Expand Down Expand Up @@ -534,10 +550,10 @@ define <2 x float> @concat_v2f32(<1 x float> %op1, <1 x float> %op2) {
define <4 x float> @concat_v4f32(<2 x float> %op1, <2 x float> %op2) {
; CHECK-LABEL: concat_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: ptrue p0.s, vl2
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: splice z0.s, p0, { z0.s, z1.s }
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
;
Expand Down Expand Up @@ -604,10 +620,10 @@ define void @concat_v16f32(ptr %a, ptr %b, ptr %c) {
define <2 x double> @concat_v2f64(<1 x double> %op1, <1 x double> %op2) {
; CHECK-LABEL: concat_v2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: ptrue p0.d, vl1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: splice z0.d, p0, { z0.d, z1.d }
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE
; RUN: llc -mattr=+sve2 -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE2
; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s --check-prefixes=CHECK,SVE2
; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s --check-prefixes=SVE
; RUN: llc -mattr=+sve2 -force-streaming-compatible < %s | FileCheck %s --check-prefixes=SVE2
; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s --check-prefixes=SVE2
; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE

target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
Expand Down Expand Up @@ -842,16 +842,16 @@ define void @test_copysign_v4f32_v4f64(ptr %ap, ptr %bp) {
;
; SVE2-LABEL: test_copysign_v4f32_v4f64:
; SVE2: // %bb.0:
; SVE2-NEXT: ldp q0, q1, [x1]
; SVE2-NEXT: ldp q1, q0, [x1]
; SVE2-NEXT: ptrue p0.d
; SVE2-NEXT: ldr q2, [x0]
; SVE2-NEXT: fcvt z1.s, p0/m, z1.d
; SVE2-NEXT: fcvt z0.s, p0/m, z0.d
; SVE2-NEXT: fcvt z1.s, p0/m, z1.d
; SVE2-NEXT: ptrue p0.s, vl2
; SVE2-NEXT: uzp1 z1.s, z1.s, z1.s
; SVE2-NEXT: uzp1 z0.s, z0.s, z0.s
; SVE2-NEXT: splice z0.s, p0, z0.s, z1.s
; SVE2-NEXT: uzp1 z3.s, z0.s, z0.s
; SVE2-NEXT: uzp1 z2.s, z1.s, z1.s
; SVE2-NEXT: mov z1.s, #0x7fffffff
; SVE2-NEXT: splice z0.s, p0, { z2.s, z3.s }
; SVE2-NEXT: ldr q2, [x0]
; SVE2-NEXT: bsl z2.d, z2.d, z0.d, z1.d
; SVE2-NEXT: str q2, [x0]
; SVE2-NEXT: ret
Expand Down Expand Up @@ -1237,16 +1237,16 @@ define void @test_copysign_v8f16_v8f32(ptr %ap, ptr %bp) {
;
; SVE2-LABEL: test_copysign_v8f16_v8f32:
; SVE2: // %bb.0:
; SVE2-NEXT: ldp q0, q1, [x1]
; SVE2-NEXT: ldp q1, q0, [x1]
; SVE2-NEXT: ptrue p0.s
; SVE2-NEXT: ldr q2, [x0]
; SVE2-NEXT: fcvt z1.h, p0/m, z1.s
; SVE2-NEXT: fcvt z0.h, p0/m, z0.s
; SVE2-NEXT: fcvt z1.h, p0/m, z1.s
; SVE2-NEXT: ptrue p0.h, vl4
; SVE2-NEXT: uzp1 z1.h, z1.h, z1.h
; SVE2-NEXT: uzp1 z0.h, z0.h, z0.h
; SVE2-NEXT: splice z0.h, p0, z0.h, z1.h
; SVE2-NEXT: uzp1 z3.h, z0.h, z0.h
; SVE2-NEXT: uzp1 z2.h, z1.h, z1.h
; SVE2-NEXT: mov z1.h, #32767 // =0x7fff
; SVE2-NEXT: splice z0.h, p0, { z2.h, z3.h }
; SVE2-NEXT: ldr q2, [x0]
; SVE2-NEXT: bsl z2.d, z2.d, z0.d, z1.d
; SVE2-NEXT: str q2, [x0]
; SVE2-NEXT: ret
Expand Down Expand Up @@ -1349,5 +1349,3 @@ declare <8 x float> @llvm.copysign.v8f32(<8 x float> %a, <8 x float> %b) #0

declare <2 x double> @llvm.copysign.v2f64(<2 x double> %a, <2 x double> %b) #0
declare <4 x double> @llvm.copysign.v4f64(<4 x double> %a, <4 x double> %b) #0
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; CHECK: {{.*}}
Loading
Loading