From ead07427834c82aac105d36b8671abbe915c441c Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Mon, 5 Feb 2024 07:38:40 +0100 Subject: [PATCH] Use dedicated instructions for zip / unzip on arm64 --- include/xsimd/arch/xsimd_neon64.hpp | 83 +++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/include/xsimd/arch/xsimd_neon64.hpp b/include/xsimd/arch/xsimd_neon64.hpp index bc982c7ce..c6ecb423f 100644 --- a/include/xsimd/arch/xsimd_neon64.hpp +++ b/include/xsimd/arch/xsimd_neon64.hpp @@ -952,6 +952,41 @@ namespace xsimd /********** * zip_lo * **********/ + template = 0> + inline batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return vzip1q_u8(lhs, rhs); + } + + template = 0> + inline batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return vzip1q_s8(lhs, rhs); + } + + template = 0> + inline batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return vzip1q_u16(lhs, rhs); + } + + template = 0> + inline batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return vzip1q_s16(lhs, rhs); + } + + template = 0> + inline batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return vzip1q_u32(lhs, rhs); + } + + template = 0> + inline batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return vzip1q_s32(lhs, rhs); + } template = 0> inline batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept @@ -965,6 +1000,12 @@ namespace xsimd return vzip1q_s64(lhs, rhs); } + template + inline batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return vzip1q_f32(lhs, rhs); + } + template inline batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept { @@ -975,6 +1016,42 @@ namespace xsimd * zip_hi * **********/ + template = 0> + inline batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return vzip2q_u8(lhs, rhs); + } + + template = 0> + inline batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return vzip2q_s8(lhs, rhs); + } + + template = 0> + inline batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return vzip2q_u16(lhs, rhs); + } + + template = 0> + inline batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return vzip2q_s16(lhs, rhs); + } + + template = 0> + inline batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return vzip2q_u32(lhs, rhs); + } + + template = 0> + inline batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return vzip2q_s32(lhs, rhs); + } + template = 0> inline batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept { @@ -987,6 +1064,12 @@ namespace xsimd return vzip2q_s64(lhs, rhs); } + template + inline batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return vzip2q_f32(lhs, rhs); + } + template inline batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept {