From 7ed4e9f01cf72fa4a4bb8a07a8f755d4e36d6399 Mon Sep 17 00:00:00 2001
From: Rose
Date: Fri, 3 May 2024 20:50:49 -0400
Subject: [PATCH] [CodeGen] Pre-commit tests (NFC)

---
 .../AArch64/GlobalISel/combine-udiv.ll        |  55 +++++
 .../AArch64/GlobalISel/combine-udiv.mir       | 123 ++++++++++
 llvm/test/CodeGen/X86/udiv-exact.ll           | 225 ++++++++++++++++++
 3 files changed, 403 insertions(+)
 create mode 100644 llvm/test/CodeGen/X86/udiv-exact.ll

diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll
index c97a00ccdd4557..42fc70ffee0359 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll
@@ -269,3 +269,58 @@ define i32 @udiv_div_by_180(i32 %x)
   %udiv = udiv i32 %truncate, 180
   ret i32 %udiv
 }
+
+define i32 @udiv_div_by_180_exact(i32 %x)
+; SDAG-LABEL: udiv_div_by_180_exact:
+; SDAG:       // %bb.0:
+; SDAG-NEXT:    lsr w8, w0, #2
+; SDAG-NEXT:    mov w9, #27671 // =0x6c17
+; SDAG-NEXT:    movk w9, #5825, lsl #16
+; SDAG-NEXT:    umull x8, w8, w9
+; SDAG-NEXT:    lsr x0, x8, #34
+; SDAG-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: udiv_div_by_180_exact:
+; GISEL:       // %bb.0:
+; GISEL-NEXT:    lsr w8, w0, #2
+; GISEL-NEXT:    mov w9, #27671 // =0x6c17
+; GISEL-NEXT:    movk w9, #5825, lsl #16
+; GISEL-NEXT:    umull x8, w8, w9
+; GISEL-NEXT:    lsr x8, x8, #32
+; GISEL-NEXT:    lsr w0, w8, #2
+; GISEL-NEXT:    ret
+{
+  %udiv = udiv exact i32 %x, 180
+  ret i32 %udiv
+}
+
+define <4 x i32> @udiv_div_by_104_exact(<4 x i32> %x)
+; SDAG-LABEL: udiv_div_by_104_exact:
+; SDAG:       // %bb.0:
+; SDAG-NEXT:    adrp x8, .LCPI8_0
+; SDAG-NEXT:    ldr q1, [x8, :lo12:.LCPI8_0]
+; SDAG-NEXT:    adrp x8, .LCPI8_1
+; SDAG-NEXT:    umull2 v2.2d, v0.4s, v1.4s
+; SDAG-NEXT:    umull v0.2d, v0.2s, v1.2s
+; SDAG-NEXT:    ldr q1, [x8, :lo12:.LCPI8_1]
+; SDAG-NEXT:    uzp2 v0.4s, v0.4s, v2.4s
+; SDAG-NEXT:    ushl v0.4s, v0.4s, v1.4s
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: udiv_div_by_104_exact:
+; GISEL:       // %bb.0:
+; GISEL-NEXT:    adrp x8, .LCPI8_1
+; GISEL-NEXT:    ldr q1, [x8, :lo12:.LCPI8_1]
+; GISEL-NEXT:    adrp x8, .LCPI8_0
+; GISEL-NEXT:    umull2 v2.2d, v0.4s, v1.4s
+; GISEL-NEXT:    umull v0.2d, v0.2s, v1.2s
+; GISEL-NEXT:    ldr q1, [x8, :lo12:.LCPI8_0]
+; GISEL-NEXT:    neg v1.4s, v1.4s
+; GISEL-NEXT:    uzp2 v0.4s, v0.4s, v2.4s
+; GISEL-NEXT:    ushl v0.4s, v0.4s, v1.4s
+; GISEL-NEXT:    ret
+{
+  %udiv = udiv exact <4 x i32> %x, <i32 104, i32 72, i32 104, i32 72>
+  ret <4 x i32> %udiv
+}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.mir
index 02233b9f498bd8..539152417e01f3 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.mir
@@ -304,5 +304,128 @@ body: |
     %10:_(<8 x s16>) = G_UDIV %0, %1
     $q0 = COPY %10(<8 x s16>)
     RET_ReallyLR implicit $q0
+...
+---
+name:            udiv_exact
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: udiv_exact
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1321528399
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
+    ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[COPY]], [[C]]
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UMULH]], [[C1]](s32)
+    ; CHECK-NEXT: $w0 = COPY [[LSHR]](s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %0:_(s32) = COPY $w0
+    %1:_(s32) = G_CONSTANT i32 104
+    %2:_(s32) = exact G_UDIV %0, %1
+    $w0 = COPY %2(s32)
+    RET_ReallyLR implicit $w0
+
+...
+---
+name:            udiv_noexact
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: udiv_noexact
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1321528399
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
+    ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[COPY]], [[C]]
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UMULH]], [[C1]](s32)
+    ; CHECK-NEXT: $w0 = COPY [[LSHR]](s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %0:_(s32) = COPY $w0
+    %1:_(s32) = G_CONSTANT i32 104
+    %2:_(s32) = G_UDIV %0, %1
+    $w0 = COPY %2(s32)
+    RET_ReallyLR implicit $w0
+
+...
+---
+name:            udiv_exact_minsize
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: udiv_exact_minsize
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1321528399
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
+    ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[COPY]], [[C]]
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UMULH]], [[C1]](s32)
+    ; CHECK-NEXT: $w0 = COPY [[LSHR]](s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %0:_(s32) = COPY $w0
+    %1:_(s32) = G_CONSTANT i32 104
+    %2:_(s32) = exact G_UDIV %0, %1
+    $w0 = COPY %2(s32)
+    RET_ReallyLR implicit $w0
+
+...
+---
+name:            div_v4s32
+body:             |
+  bb.1:
+    liveins: $q0
+
+    ; CHECK-LABEL: name: div_v4s32
+    ; CHECK: liveins: $q0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1321528399
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 954437177
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C2]](s32), [[C]](s32), [[C2]](s32)
+    ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C1]](s32), [[C3]](s32), [[C1]](s32), [[C3]](s32)
+    ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(<4 x s32>) = G_UMULH [[COPY]], [[BUILD_VECTOR]]
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<4 x s32>) = G_LSHR [[UMULH]], [[BUILD_VECTOR1]](<4 x s32>)
+    ; CHECK-NEXT: $q0 = COPY [[LSHR]](<4 x s32>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
+    %0:_(<4 x s32>) = COPY $q0
+    %c1:_(s32) = G_CONSTANT i32 104
+    %c2:_(s32) = G_CONSTANT i32 72
+    %1:_(<4 x s32>) = G_BUILD_VECTOR %c1(s32), %c2(s32), %c1(s32), %c2(s32)
+    %3:_(<4 x s32>) = exact G_UDIV %0, %1
+    $q0 = COPY %3(<4 x s32>)
+    RET_ReallyLR implicit $q0
+
+...
+---
+name:            div_v4s32_splat
+body:             |
+  bb.1:
+    liveins: $q0
+
+    ; CHECK-LABEL: name: div_v4s32_splat
+    ; CHECK: liveins: $q0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1321528399
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32)
+    ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C1]](s32), [[C1]](s32), [[C1]](s32), [[C1]](s32)
+    ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(<4 x s32>) = G_UMULH [[COPY]], [[BUILD_VECTOR]]
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<4 x s32>) = G_LSHR [[UMULH]], [[BUILD_VECTOR1]](<4 x s32>)
+    ; CHECK-NEXT: $q0 = COPY [[LSHR]](<4 x s32>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
+    %0:_(<4 x s32>) = COPY $q0
+    %c1:_(s32) = G_CONSTANT i32 104
+    %1:_(<4 x s32>) = G_BUILD_VECTOR %c1(s32), %c1(s32), %c1(s32), %c1(s32)
+    %3:_(<4 x s32>) = exact G_UDIV %0, %1
+    $q0 = COPY %3(<4 x s32>)
+    RET_ReallyLR implicit $q0
 ...
diff --git a/llvm/test/CodeGen/X86/udiv-exact.ll b/llvm/test/CodeGen/X86/udiv-exact.ll
new file mode 100644
index 00000000000000..0a835e07107888
--- /dev/null
+++ b/llvm/test/CodeGen/X86/udiv-exact.ll
@@ -0,0 +1,225 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64
+
+define i32 @test1(i32 %x) {
+; X86-LABEL: test1:
+; X86:       # %bb.0:
+; X86-NEXT:    movl $1374389535, %eax # imm = 0x51EB851F
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    shrl $3, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test1:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    imulq $1374389535, %rax, %rax # imm = 0x51EB851F
+; X64-NEXT:    shrq $35, %rax
+; X64-NEXT:    # kill: def $eax killed $eax killed $rax
+; X64-NEXT:    retq
+  %div = udiv exact i32 %x, 25
+  ret i32 %div
+}
+
+define i32 @test2(i32 %x) {
+; X86-LABEL: test2:
+; X86:       # %bb.0:
+; X86-NEXT:    movl $-1431655765, %eax # imm = 0xAAAAAAAB
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    shrl $4, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test2:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %ecx
+; X64-NEXT:    movl $2863311531, %eax # imm = 0xAAAAAAAB
+; X64-NEXT:    imulq %rcx, %rax
+; X64-NEXT:    shrq $36, %rax
+; X64-NEXT:    # kill: def $eax killed $eax killed $rax
+; X64-NEXT:    retq
+  %div = udiv exact i32 %x, 24
+  ret i32 %div
+}
+
+define <4 x i32> @test3(<4 x i32> %x) {
+; X86-LABEL: test3:
+; X86:       # %bb.0:
+; X86-NEXT:    movdqa {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531]
+; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X86-NEXT:    pmuludq %xmm1, %xmm0
+; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X86-NEXT:    pmuludq %xmm1, %xmm2
+; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; X86-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT:    psrld $4, %xmm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: test3:
+; X64:       # %bb.0:
+; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; X64-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531]
+; X64-NEXT:    vpmuludq %xmm2, %xmm1, %xmm1
+; X64-NEXT:    vpmuludq %xmm2, %xmm0, %xmm0
+; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X64-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; X64-NEXT:    vpsrld $4, %xmm0, %xmm0
+; X64-NEXT:    retq
+  %div = udiv exact <4 x i32> %x, <i32 24, i32 24, i32 24, i32 24>
+  ret <4 x i32> %div
+}
+
+define <4 x i32> @test4(<4 x i32> %x) {
+; X86-LABEL: test4:
+; X86:       # %bb.0:
+; X86-NEXT:    movdqa {{.*#+}} xmm1 = [1374389535,1374389535,1374389535,1374389535]
+; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X86-NEXT:    pmuludq %xmm1, %xmm0
+; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X86-NEXT:    pmuludq %xmm1, %xmm2
+; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; X86-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT:    psrld $3, %xmm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: test4:
+; X64:       # %bb.0:
+; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; X64-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
+; X64-NEXT:    vpmuludq %xmm2, %xmm1, %xmm1
+; X64-NEXT:    vpmuludq %xmm2, %xmm0, %xmm0
+; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X64-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; X64-NEXT:    vpsrld $3, %xmm0, %xmm0
+; X64-NEXT:    retq
+  %div = udiv exact <4 x i32> %x, <i32 25, i32 25, i32 25, i32 25>
+  ret <4 x i32> %div
+}
+
+define <4 x i32> @test5(<4 x i32> %x) {
+; X86-LABEL: test5:
+; X86:       # %bb.0:
+; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; X86-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X86-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; X86-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT:    movdqa %xmm0, %xmm1
+; X86-NEXT:    psrld $4, %xmm1
+; X86-NEXT:    psrld $3, %xmm0
+; X86-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X86-NEXT:    retl
+;
+; X64-LABEL: test5:
+; X64:       # %bb.0:
+; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; X64-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; X64-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X64-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; X64-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT:    retq
+  %div = udiv exact <4 x i32> %x, <i32 24, i32 24, i32 25, i32 25>
+  ret <4 x i32> %div
+}
+
+define <4 x i32> @test6(<4 x i32> %x) {
+; X86-LABEL: test6:
+; X86:       # %bb.0:
+; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; X86-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X86-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; X86-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT:    movdqa %xmm0, %xmm1
+; X86-NEXT:    psrld $4, %xmm1
+; X86-NEXT:    psrld $3, %xmm0
+; X86-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X86-NEXT:    retl
+;
+; X64-LABEL: test6:
+; X64:       # %bb.0:
+; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; X64-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; X64-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X64-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; X64-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT:    retq
+  %div = udiv exact <4 x i32> %x, <i32 24, i32 24, i32 26, i32 26>
+  ret <4 x i32> %div
+}
+
+define <4 x i32> @test7(<4 x i32> %x) {
+; X86-LABEL: test7:
+; X86:       # %bb.0:
+; X86-NEXT:    movdqa {{.*#+}} xmm1 = [1374389535,1374389535,795364315,795364315]
+; X86-NEXT:    pmuludq %xmm0, %xmm1
+; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X86-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
+; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; X86-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X86-NEXT:    psubd %xmm1, %xmm0
+; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X86-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X86-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
+; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; X86-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X86-NEXT:    paddd %xmm1, %xmm0
+; X86-NEXT:    movdqa %xmm0, %xmm1
+; X86-NEXT:    psrld $3, %xmm1
+; X86-NEXT:    psrld $4, %xmm0
+; X86-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X86-NEXT:    retl
+;
+; X64-LABEL: test7:
+; X64:       # %bb.0:
+; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; X64-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; X64-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; X64-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X64-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
+; X64-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
+; X64-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X64-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; X64-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X64-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; X64-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; X64-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT:    retq
+  %div = udiv exact <4 x i32> %x, <i32 25, i32 25, i32 27, i32 27>
+  ret <4 x i32> %div
+}
+
+define <4 x i32> @test8(<4 x i32> %x) {
+; X86-LABEL: test8:
+; X86:       # %bb.0:
+; X86-NEXT:    movdqa {{.*#+}} xmm1 = [u,u,2863311531,2863311531]
+; X86-NEXT:    movdqa %xmm0, %xmm2
+; X86-NEXT:    pmuludq %xmm1, %xmm2
+; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; X86-NEXT:    pmuludq %xmm1, %xmm3
+; X86-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; X86-NEXT:    psrld $4, %xmm2
+; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
+; X86-NEXT:    retl
+;
+; X64-LABEL: test8:
+; X64:       # %bb.0:
+; X64-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531]
+; X64-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
+; X64-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; X64-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
+; X64-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X64-NEXT:    vpsrld $4, %xmm1, %xmm1
+; X64-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; X64-NEXT:    retq
+  %div = udiv exact <4 x i32> %x, <i32 1, i32 1, i32 24, i32 24>
+  ret <4 x i32> %div
+}