Skip to content

Commit

Permalink
Use dedicated instructions for zip / unzip on arm64
Browse files Browse the repository at this point in the history
  • Loading branch information
serge-sans-paille committed Feb 5, 2024
1 parent b6868c2 commit ead0742
Showing 1 changed file with 83 additions and 0 deletions.
83 changes: 83 additions & 0 deletions include/xsimd/arch/xsimd_neon64.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -952,6 +952,41 @@ namespace xsimd
/**********
* zip_lo *
**********/
// zip_lo overload gated by detail::enable_sized_unsigned_t<T, 1>
// (presumably: unsigned element types that are 1 byte wide).
// Forwards both operands to the AArch64 intrinsic vzip1q_u8 (ZIP1 in the
// Arm ACLE: interleaves the lower-half lanes of the two vectors).
template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
inline batch<T, A> zip_lo(batch<T, A> const& lhs,
                          batch<T, A> const& rhs,
                          requires_arch<neon64>) noexcept
{
    const auto interleaved = vzip1q_u8(lhs, rhs);
    return interleaved;
}

// zip_lo overload gated by detail::enable_sized_signed_t<T, 1>
// (presumably: signed element types that are 1 byte wide).
// Forwards both operands to the AArch64 intrinsic vzip1q_s8 (ZIP1 in the
// Arm ACLE: interleaves the lower-half lanes of the two vectors).
template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
inline batch<T, A> zip_lo(batch<T, A> const& lhs,
                          batch<T, A> const& rhs,
                          requires_arch<neon64>) noexcept
{
    const auto interleaved = vzip1q_s8(lhs, rhs);
    return interleaved;
}

// zip_lo overload gated by detail::enable_sized_unsigned_t<T, 2>
// (presumably: unsigned element types that are 2 bytes wide).
// Forwards both operands to the AArch64 intrinsic vzip1q_u16 (ZIP1 in the
// Arm ACLE: interleaves the lower-half lanes of the two vectors).
template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
inline batch<T, A> zip_lo(batch<T, A> const& lhs,
                          batch<T, A> const& rhs,
                          requires_arch<neon64>) noexcept
{
    const auto interleaved = vzip1q_u16(lhs, rhs);
    return interleaved;
}

// zip_lo overload gated by detail::enable_sized_signed_t<T, 2>
// (presumably: signed element types that are 2 bytes wide).
// Forwards both operands to the AArch64 intrinsic vzip1q_s16 (ZIP1 in the
// Arm ACLE: interleaves the lower-half lanes of the two vectors).
template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
inline batch<T, A> zip_lo(batch<T, A> const& lhs,
                          batch<T, A> const& rhs,
                          requires_arch<neon64>) noexcept
{
    const auto interleaved = vzip1q_s16(lhs, rhs);
    return interleaved;
}

// zip_lo overload gated by detail::enable_sized_unsigned_t<T, 4>
// (presumably: unsigned element types that are 4 bytes wide).
// Forwards both operands to the AArch64 intrinsic vzip1q_u32 (ZIP1 in the
// Arm ACLE: interleaves the lower-half lanes of the two vectors).
template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
inline batch<T, A> zip_lo(batch<T, A> const& lhs,
                          batch<T, A> const& rhs,
                          requires_arch<neon64>) noexcept
{
    const auto interleaved = vzip1q_u32(lhs, rhs);
    return interleaved;
}

// zip_lo overload gated by detail::enable_sized_signed_t<T, 4>
// (presumably: signed element types that are 4 bytes wide).
// Forwards both operands to the AArch64 intrinsic vzip1q_s32 (ZIP1 in the
// Arm ACLE: interleaves the lower-half lanes of the two vectors).
template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
inline batch<T, A> zip_lo(batch<T, A> const& lhs,
                          batch<T, A> const& rhs,
                          requires_arch<neon64>) noexcept
{
    const auto interleaved = vzip1q_s32(lhs, rhs);
    return interleaved;
}

template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
Expand All @@ -965,6 +1000,12 @@ namespace xsimd
return vzip1q_s64(lhs, rhs);
}

// zip_lo for single-precision float batches.
// Forwards both operands to the AArch64 intrinsic vzip1q_f32 (ZIP1 in the
// Arm ACLE: interleaves the lower-half lanes of the two vectors).
template <class A>
inline batch<float, A> zip_lo(batch<float, A> const& lhs,
                              batch<float, A> const& rhs,
                              requires_arch<neon64>) noexcept
{
    const auto interleaved = vzip1q_f32(lhs, rhs);
    return interleaved;
}

template <class A>
inline batch<double, A> zip_lo(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
{
Expand All @@ -975,6 +1016,42 @@ namespace xsimd
* zip_hi *
**********/

// zip_hi overload gated by detail::enable_sized_unsigned_t<T, 1>
// (presumably: unsigned element types that are 1 byte wide).
// Forwards both operands to the AArch64 intrinsic vzip2q_u8 (ZIP2 in the
// Arm ACLE: interleaves the upper-half lanes of the two vectors).
template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
inline batch<T, A> zip_hi(batch<T, A> const& lhs,
                          batch<T, A> const& rhs,
                          requires_arch<neon64>) noexcept
{
    const auto interleaved = vzip2q_u8(lhs, rhs);
    return interleaved;
}

// zip_hi overload gated by detail::enable_sized_signed_t<T, 1>
// (presumably: signed element types that are 1 byte wide).
// Forwards both operands to the AArch64 intrinsic vzip2q_s8 (ZIP2 in the
// Arm ACLE: interleaves the upper-half lanes of the two vectors).
template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
inline batch<T, A> zip_hi(batch<T, A> const& lhs,
                          batch<T, A> const& rhs,
                          requires_arch<neon64>) noexcept
{
    const auto interleaved = vzip2q_s8(lhs, rhs);
    return interleaved;
}

// zip_hi overload gated by detail::enable_sized_unsigned_t<T, 2>
// (presumably: unsigned element types that are 2 bytes wide).
// Forwards both operands to the AArch64 intrinsic vzip2q_u16 (ZIP2 in the
// Arm ACLE: interleaves the upper-half lanes of the two vectors).
template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
inline batch<T, A> zip_hi(batch<T, A> const& lhs,
                          batch<T, A> const& rhs,
                          requires_arch<neon64>) noexcept
{
    const auto interleaved = vzip2q_u16(lhs, rhs);
    return interleaved;
}

// zip_hi overload gated by detail::enable_sized_signed_t<T, 2>
// (presumably: signed element types that are 2 bytes wide).
// Forwards both operands to the AArch64 intrinsic vzip2q_s16 (ZIP2 in the
// Arm ACLE: interleaves the upper-half lanes of the two vectors).
template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
inline batch<T, A> zip_hi(batch<T, A> const& lhs,
                          batch<T, A> const& rhs,
                          requires_arch<neon64>) noexcept
{
    const auto interleaved = vzip2q_s16(lhs, rhs);
    return interleaved;
}

// zip_hi overload gated by detail::enable_sized_unsigned_t<T, 4>
// (presumably: unsigned element types that are 4 bytes wide).
// Forwards both operands to the AArch64 intrinsic vzip2q_u32 (ZIP2 in the
// Arm ACLE: interleaves the upper-half lanes of the two vectors).
template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
inline batch<T, A> zip_hi(batch<T, A> const& lhs,
                          batch<T, A> const& rhs,
                          requires_arch<neon64>) noexcept
{
    const auto interleaved = vzip2q_u32(lhs, rhs);
    return interleaved;
}

// zip_hi overload gated by detail::enable_sized_signed_t<T, 4>
// (presumably: signed element types that are 4 bytes wide).
// Forwards both operands to the AArch64 intrinsic vzip2q_s32 (ZIP2 in the
// Arm ACLE: interleaves the upper-half lanes of the two vectors).
template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
inline batch<T, A> zip_hi(batch<T, A> const& lhs,
                          batch<T, A> const& rhs,
                          requires_arch<neon64>) noexcept
{
    const auto interleaved = vzip2q_s32(lhs, rhs);
    return interleaved;
}

template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
{
Expand All @@ -987,6 +1064,12 @@ namespace xsimd
return vzip2q_s64(lhs, rhs);
}

// zip_hi for single-precision float batches.
// Forwards both operands to the AArch64 intrinsic vzip2q_f32 (ZIP2 in the
// Arm ACLE: interleaves the upper-half lanes of the two vectors).
template <class A>
inline batch<float, A> zip_hi(batch<float, A> const& lhs,
                              batch<float, A> const& rhs,
                              requires_arch<neon64>) noexcept
{
    const auto interleaved = vzip2q_f32(lhs, rhs);
    return interleaved;
}

template <class A>
inline batch<double, A> zip_hi(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
{
Expand Down

0 comments on commit ead0742

Please sign in to comment.