Skip to content

Commit

Permalink
Fix dispatching mechanism
Browse files Browse the repository at this point in the history
Traverse required arch in the order provided by the user instead of
trying to guess the best one.

It is actually impossible to define the notion of a best architectures
as intel instruction set have a tree structure and not a linear
structure : there are multiple leaves and none of them can be considered
the "best".
  • Loading branch information
serge-sans-paille committed Dec 22, 2023
1 parent a48ab43 commit 0e7ea7e
Show file tree
Hide file tree
Showing 6 changed files with 56 additions and 98 deletions.
9 changes: 3 additions & 6 deletions include/xsimd/config/xsimd_arch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -187,9 +187,6 @@ namespace xsimd
};
} // namespace detail

struct unsupported
{
};
using all_x86_architectures = arch_list<
avx512vnni<avx512vbmi>, avx512vbmi, avx512ifma, avx512pf, avx512vnni<avx512bw>, avx512bw, avx512er, avx512dq, avx512cd, avx512f,
avxvnni, fma3<avx2>, avx2, fma3<avx>, avx, fma4, fma3<sse4_2>,
Expand Down Expand Up @@ -221,7 +218,7 @@ namespace xsimd
class dispatcher
{

const unsigned best_arch_found;
const decltype(available_architectures()) availables_archs;
F functor;

template <class Arch, class... Tys>
Expand All @@ -234,15 +231,15 @@ namespace xsimd
template <class Arch, class ArchNext, class... Archs, class... Tys>
inline auto walk_archs(arch_list<Arch, ArchNext, Archs...>, Tys&&... args) noexcept -> decltype(functor(Arch {}, std::forward<Tys>(args)...))
{
if (Arch::version() <= best_arch_found)
if (availables_archs.has(Arch {}))
return functor(Arch {}, std::forward<Tys>(args)...);
else
return walk_archs(arch_list<ArchNext, Archs...> {}, std::forward<Tys>(args)...);
}

public:
inline dispatcher(F f) noexcept
: best_arch_found(available_architectures().best)
: availables_archs(available_architectures())
, functor(f)
{
}
Expand Down
128 changes: 45 additions & 83 deletions include/xsimd/config/xsimd_cpuid.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,37 +33,45 @@ namespace xsimd
{
struct supported_arch
{
unsigned sse2 : 1;
unsigned sse3 : 1;
unsigned ssse3 : 1;
unsigned sse4_1 : 1;
unsigned sse4_2 : 1;
unsigned sse4a : 1;
unsigned fma3_sse : 1;
unsigned fma4 : 1;
unsigned xop : 1;
unsigned avx : 1;
unsigned fma3_avx : 1;
unsigned avx2 : 1;
unsigned avxvnni : 1;
unsigned fma3_avx2 : 1;
unsigned avx512f : 1;
unsigned avx512cd : 1;
unsigned avx512dq : 1;
unsigned avx512bw : 1;
unsigned avx512er : 1;
unsigned avx512pf : 1;
unsigned avx512ifma : 1;
unsigned avx512vbmi : 1;
unsigned avx512vnni_bw : 1;
unsigned avx512vnni_vbmi : 1;
unsigned neon : 1;
unsigned neon64 : 1;
unsigned sve : 1;
unsigned rvv : 1;

// version number of the best arch available
unsigned best;

#define ARCH_FIELD_EX(arch, field_name) \
unsigned field_name; \
bool has(::xsimd::arch) const { return this->field_name; }
#define ARCH_FIELD(name) ARCH_FIELD_EX(name, name)

ARCH_FIELD(sse2)
ARCH_FIELD(sse3)

ARCH_FIELD(ssse3)
ARCH_FIELD(sse4_1)
ARCH_FIELD(sse4_2)
// ARCH_FIELD(sse4a)
ARCH_FIELD_EX(fma3<::xsimd::sse4_2>, fma3_sse42)
ARCH_FIELD(fma4)
// ARCH_FIELD(xop)
ARCH_FIELD(avx)
ARCH_FIELD_EX(fma3<::xsimd::avx>, fma3_avx)
ARCH_FIELD(avx2)
ARCH_FIELD(avxvnni)
ARCH_FIELD_EX(fma3<::xsimd::avx2>, fma3_avx2)
ARCH_FIELD(avx512f)
ARCH_FIELD(avx512cd)
ARCH_FIELD(avx512dq)
ARCH_FIELD(avx512bw)
ARCH_FIELD(avx512er)
ARCH_FIELD(avx512pf)
ARCH_FIELD(avx512ifma)
ARCH_FIELD(avx512vbmi)
ARCH_FIELD_EX(avx512vnni<::xsimd::avx512bw>, avx512vnni_bw)
ARCH_FIELD_EX(avx512vnni<::xsimd::avx512vbmi>, avx512vnni_vbmi)
ARCH_FIELD(neon)
ARCH_FIELD(neon64)
ARCH_FIELD(sve)
#if XSIMD_WITH_RVV
ARCH_FIELD(rvv)
#endif

#undef ARCH_FIELD

inline supported_arch() noexcept
{
Expand All @@ -72,7 +80,6 @@ namespace xsimd
#if defined(__aarch64__) || defined(_M_ARM64)
neon = 1;
neon64 = 1;
best = neon64::version();
#elif defined(__ARM_NEON) || defined(_M_ARM)

#if defined(__linux__) && (!defined(__ANDROID_API__) || __ANDROID_API__ >= 18)
Expand All @@ -82,7 +89,6 @@ namespace xsimd
neon = 0;
#endif
neon64 = 0;
best = neon::version() * neon;

#elif defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS > 0

Expand All @@ -91,7 +97,6 @@ namespace xsimd
#else
sve = 0;
#endif
best = sve::version() * sve;

#elif defined(__riscv_vector) && defined(__riscv_v_fixed_vlen) && __riscv_v_fixed_vlen > 0

Expand All @@ -104,7 +109,6 @@ namespace xsimd
rvv = 0;
#endif

best = ::xsimd::rvv::version() * rvv;
#elif defined(__x86_64__) || defined(__i386__) || defined(_M_AMD64) || defined(_M_IX86)
auto get_cpuid = [](int reg[4], int level, int count = 0) noexcept
{
Expand All @@ -122,14 +126,12 @@ namespace xsimd
__asm__("xchg{l}\t{%%}ebx, %1\n\t"
"cpuid\n\t"
"xchg{l}\t{%%}ebx, %1\n\t"
: "=a"(reg[0]), "=r"(reg[1]), "=c"(reg[2]),
"=d"(reg[3])
: "=a"(reg[0]), "=r"(reg[1]), "=c"(reg[2]), "=d"(reg[3])
: "0"(level), "2"(count));

#else
__asm__("cpuid\n\t"
: "=a"(reg[0]), "=b"(reg[1]), "=c"(reg[2]),
"=d"(reg[3])
: "=a"(reg[0]), "=b"(reg[1]), "=c"(reg[2]), "=d"(reg[3])
: "0"(level), "2"(count));
#endif

Expand All @@ -143,87 +145,47 @@ namespace xsimd
get_cpuid(regs1, 0x1);

sse2 = regs1[3] >> 26 & 1;
best = std::max(best, sse2::version() * sse2);

sse3 = regs1[2] >> 0 & 1;
best = std::max(best, sse3::version() * sse3);

ssse3 = regs1[2] >> 9 & 1;
best = std::max(best, ssse3::version() * ssse3);

sse4_1 = regs1[2] >> 19 & 1;
best = std::max(best, sse4_1::version() * sse4_1);

sse4_2 = regs1[2] >> 20 & 1;
best = std::max(best, sse4_2::version() * sse4_2);

fma3_sse = regs1[2] >> 12 & 1;
if (sse4_2)
best = std::max(best, fma3<xsimd::sse4_2>::version() * fma3_sse);
fma3_sse42 = regs1[2] >> 12 & 1;

avx = regs1[2] >> 28 & 1;
best = std::max(best, avx::version() * avx);

fma3_avx = avx && fma3_sse;
best = std::max(best, fma3<xsimd::avx>::version() * fma3_avx);
fma3_avx = avx && fma3_sse42;

int regs8[4];
get_cpuid(regs8, 0x80000001);
fma4 = regs8[2] >> 16 & 1;
best = std::max(best, fma4::version() * fma4);

// sse4a = regs[2] >> 6 & 1;
// best = std::max(best, XSIMD_X86_AMD_SSE4A_VERSION * sse4a);

// xop = regs[2] >> 11 & 1;
// best = std::max(best, XSIMD_X86_AMD_XOP_VERSION * xop);

int regs7[4];
get_cpuid(regs7, 0x7);
avx2 = regs7[1] >> 5 & 1;
best = std::max(best, avx2::version() * avx2);

int regs7a[4];
get_cpuid(regs7a, 0x7, 0x1);
avxvnni = regs7a[0] >> 4 & 1;
best = std::max(best, avxvnni::version() * avxvnni * avx2);

fma3_avx2 = avx2 && fma3_sse;
best = std::max(best, fma3<xsimd::avx2>::version() * fma3_avx2);
fma3_avx2 = avx2 && fma3_sse42;

avx512f = regs7[1] >> 16 & 1;
best = std::max(best, avx512f::version() * avx512f);

avx512cd = regs7[1] >> 28 & 1;
best = std::max(best, avx512cd::version() * avx512cd * avx512f);

avx512dq = regs7[1] >> 17 & 1;
best = std::max(best, avx512dq::version() * avx512dq * avx512cd * avx512f);

avx512bw = regs7[1] >> 30 & 1;
best = std::max(best, avx512bw::version() * avx512bw * avx512dq * avx512cd * avx512f);

avx512er = regs7[1] >> 27 & 1;
best = std::max(best, avx512er::version() * avx512er * avx512cd * avx512f);

avx512pf = regs7[1] >> 26 & 1;
best = std::max(best, avx512pf::version() * avx512pf * avx512er * avx512cd * avx512f);

avx512ifma = regs7[1] >> 21 & 1;
best = std::max(best, avx512ifma::version() * avx512ifma * avx512bw * avx512dq * avx512cd * avx512f);

avx512vbmi = regs7[2] >> 1 & 1;
best = std::max(best, avx512vbmi::version() * avx512vbmi * avx512ifma * avx512bw * avx512dq * avx512cd * avx512f);

avx512vnni_bw = regs7[2] >> 11 & 1;
best = std::max(best, avx512vnni<xsimd::avx512bw>::version() * avx512vnni_bw * avx512bw * avx512dq * avx512cd * avx512f);

avx512vnni_vbmi = avx512vbmi && avx512vnni_bw;
best = std::max(best, avx512vnni<xsimd::avx512vbmi>::version() * avx512vnni_vbmi);
#endif
}
};
}
} // namespace detail

inline detail::supported_arch available_architectures() noexcept
{
Expand Down
4 changes: 4 additions & 0 deletions include/xsimd/types/xsimd_generic_arch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,10 @@ namespace xsimd
protected:
static constexpr unsigned version(unsigned major, unsigned minor, unsigned patch, unsigned multiplier = 100u) noexcept { return major * multiplier * multiplier + minor * multiplier + patch; }
};

struct unsupported
{
};
}

#endif
2 changes: 2 additions & 0 deletions include/xsimd/types/xsimd_rvv_register.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -411,6 +411,8 @@ namespace xsimd
using type = detail::rvv_bool_simd_register<T>;
};
} // namespace types
#else
using rvv = detail::rvv<0xFFFFFFFF>;
#endif
} // namespace xsimd

Expand Down
2 changes: 2 additions & 0 deletions include/xsimd/types/xsimd_sve_register.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,8 @@ namespace xsimd
using type = detail::sve_bool_simd_register;
};
} // namespace types
#else
using sve = detail::sve<0xFFFFFFFF>;
#endif
} // namespace xsimd

Expand Down
9 changes: 0 additions & 9 deletions test/test_arch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -124,15 +124,6 @@ TEST_CASE("[multi arch support]")
float res = dispatched(data, 17);
CHECK_EQ(ref, res);
}

// check that we pick the most appropriate version
{
auto dispatched = xsimd::dispatch<xsimd::arch_list<xsimd::sse3, xsimd::sse2, xsimd::generic>>(get_arch_version {});
unsigned expected = xsimd::available_architectures().best >= xsimd::sse3::version()
? xsimd::sse3::version()
: xsimd::sse2::version();
CHECK_EQ(expected, dispatched());
}
#endif
}

Expand Down

0 comments on commit 0e7ea7e

Please sign in to comment.