From 0e7ea7e27b29d8c923863fc90fdb78f76581af96 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Fri, 22 Dec 2023 11:11:26 +0100 Subject: [PATCH] Fix dispatching mechanism Traverse required arch in the order provided by the user instead of trying to guess the best one. It is actually impossible to define the notion of a best architectures as intel instruction set have a tree structure and not a linear structure : there are multiple leaves and none of them can be considered the "best". --- include/xsimd/config/xsimd_arch.hpp | 9 +- include/xsimd/config/xsimd_cpuid.hpp | 128 ++++++++------------- include/xsimd/types/xsimd_generic_arch.hpp | 4 + include/xsimd/types/xsimd_rvv_register.hpp | 2 + include/xsimd/types/xsimd_sve_register.hpp | 2 + test/test_arch.cpp | 9 -- 6 files changed, 56 insertions(+), 98 deletions(-) diff --git a/include/xsimd/config/xsimd_arch.hpp b/include/xsimd/config/xsimd_arch.hpp index fe8c54166..575459a00 100644 --- a/include/xsimd/config/xsimd_arch.hpp +++ b/include/xsimd/config/xsimd_arch.hpp @@ -187,9 +187,6 @@ namespace xsimd }; } // namespace detail - struct unsupported - { - }; using all_x86_architectures = arch_list< avx512vnni, avx512vbmi, avx512ifma, avx512pf, avx512vnni, avx512bw, avx512er, avx512dq, avx512cd, avx512f, avxvnni, fma3, avx2, fma3, avx, fma4, fma3, @@ -221,7 +218,7 @@ namespace xsimd class dispatcher { - const unsigned best_arch_found; + const decltype(available_architectures()) availables_archs; F functor; template @@ -234,7 +231,7 @@ namespace xsimd template inline auto walk_archs(arch_list, Tys&&... args) noexcept -> decltype(functor(Arch {}, std::forward(args)...)) { - if (Arch::version() <= best_arch_found) + if (availables_archs.has(Arch {})) return functor(Arch {}, std::forward(args)...); else return walk_archs(arch_list {}, std::forward(args)...); @@ -242,7 +239,7 @@ namespace xsimd public: inline dispatcher(F f) noexcept - : best_arch_found(available_architectures().best) + : availables_archs(available_architectures()) , functor(f) { } diff --git a/include/xsimd/config/xsimd_cpuid.hpp b/include/xsimd/config/xsimd_cpuid.hpp index 62aca6132..487d71b47 100644 --- a/include/xsimd/config/xsimd_cpuid.hpp +++ b/include/xsimd/config/xsimd_cpuid.hpp @@ -33,37 +33,45 @@ namespace xsimd { struct supported_arch { - unsigned sse2 : 1; - unsigned sse3 : 1; - unsigned ssse3 : 1; - unsigned sse4_1 : 1; - unsigned sse4_2 : 1; - unsigned sse4a : 1; - unsigned fma3_sse : 1; - unsigned fma4 : 1; - unsigned xop : 1; - unsigned avx : 1; - unsigned fma3_avx : 1; - unsigned avx2 : 1; - unsigned avxvnni : 1; - unsigned fma3_avx2 : 1; - unsigned avx512f : 1; - unsigned avx512cd : 1; - unsigned avx512dq : 1; - unsigned avx512bw : 1; - unsigned avx512er : 1; - unsigned avx512pf : 1; - unsigned avx512ifma : 1; - unsigned avx512vbmi : 1; - unsigned avx512vnni_bw : 1; - unsigned avx512vnni_vbmi : 1; - unsigned neon : 1; - unsigned neon64 : 1; - unsigned sve : 1; - unsigned rvv : 1; - - // version number of the best arch available - unsigned best; + +#define ARCH_FIELD_EX(arch, field_name) \ + unsigned field_name; \ + bool has(::xsimd::arch) const { return this->field_name; } +#define ARCH_FIELD(name) ARCH_FIELD_EX(name, name) + + ARCH_FIELD(sse2) + ARCH_FIELD(sse3) + + ARCH_FIELD(ssse3) + ARCH_FIELD(sse4_1) + ARCH_FIELD(sse4_2) + // ARCH_FIELD(sse4a) + ARCH_FIELD_EX(fma3<::xsimd::sse4_2>, fma3_sse42) + ARCH_FIELD(fma4) + // ARCH_FIELD(xop) + ARCH_FIELD(avx) + ARCH_FIELD_EX(fma3<::xsimd::avx>, fma3_avx) + ARCH_FIELD(avx2) + ARCH_FIELD(avxvnni) + ARCH_FIELD_EX(fma3<::xsimd::avx2>, fma3_avx2) + ARCH_FIELD(avx512f) + ARCH_FIELD(avx512cd) + ARCH_FIELD(avx512dq) + ARCH_FIELD(avx512bw) + ARCH_FIELD(avx512er) + ARCH_FIELD(avx512pf) + ARCH_FIELD(avx512ifma) + ARCH_FIELD(avx512vbmi) + ARCH_FIELD_EX(avx512vnni<::xsimd::avx512bw>, avx512vnni_bw) + ARCH_FIELD_EX(avx512vnni<::xsimd::avx512vbmi>, avx512vnni_vbmi) + ARCH_FIELD(neon) + ARCH_FIELD(neon64) + ARCH_FIELD(sve) +#if XSIMD_WITH_RVV + ARCH_FIELD(rvv) +#endif + +#undef ARCH_FIELD inline supported_arch() noexcept { @@ -72,7 +80,6 @@ namespace xsimd #if defined(__aarch64__) || defined(_M_ARM64) neon = 1; neon64 = 1; - best = neon64::version(); #elif defined(__ARM_NEON) || defined(_M_ARM) #if defined(__linux__) && (!defined(__ANDROID_API__) || __ANDROID_API__ >= 18) @@ -82,7 +89,6 @@ namespace xsimd neon = 0; #endif neon64 = 0; - best = neon::version() * neon; #elif defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS > 0 @@ -91,7 +97,6 @@ namespace xsimd #else sve = 0; #endif - best = sve::version() * sve; #elif defined(__riscv_vector) && defined(__riscv_v_fixed_vlen) && __riscv_v_fixed_vlen > 0 @@ -104,7 +109,6 @@ namespace xsimd rvv = 0; #endif - best = ::xsimd::rvv::version() * rvv; #elif defined(__x86_64__) || defined(__i386__) || defined(_M_AMD64) || defined(_M_IX86) auto get_cpuid = [](int reg[4], int level, int count = 0) noexcept { @@ -122,14 +126,12 @@ namespace xsimd __asm__("xchg{l}\t{%%}ebx, %1\n\t" "cpuid\n\t" "xchg{l}\t{%%}ebx, %1\n\t" - : "=a"(reg[0]), "=r"(reg[1]), "=c"(reg[2]), - "=d"(reg[3]) + : "=a"(reg[0]), "=r"(reg[1]), "=c"(reg[2]), "=d"(reg[3]) : "0"(level), "2"(count)); #else __asm__("cpuid\n\t" - : "=a"(reg[0]), "=b"(reg[1]), "=c"(reg[2]), - "=d"(reg[3]) + : "=a"(reg[0]), "=b"(reg[1]), "=c"(reg[2]), "=d"(reg[3]) : "0"(level), "2"(count)); #endif @@ -143,87 +145,47 @@ namespace xsimd get_cpuid(regs1, 0x1); sse2 = regs1[3] >> 26 & 1; - best = std::max(best, sse2::version() * sse2); - sse3 = regs1[2] >> 0 & 1; - best = std::max(best, sse3::version() * sse3); - ssse3 = regs1[2] >> 9 & 1; - best = std::max(best, ssse3::version() * ssse3); - sse4_1 = regs1[2] >> 19 & 1; - best = std::max(best, sse4_1::version() * sse4_1); - sse4_2 = regs1[2] >> 20 & 1; - best = std::max(best, sse4_2::version() * sse4_2); - - fma3_sse = regs1[2] >> 12 & 1; - if (sse4_2) - best = std::max(best, fma3::version() * fma3_sse); + fma3_sse42 = regs1[2] >> 12 & 1; avx = regs1[2] >> 28 & 1; - best = std::max(best, avx::version() * avx); - - fma3_avx = avx && fma3_sse; - best = std::max(best, fma3::version() * fma3_avx); + fma3_avx = avx && fma3_sse42; int regs8[4]; get_cpuid(regs8, 0x80000001); fma4 = regs8[2] >> 16 & 1; - best = std::max(best, fma4::version() * fma4); // sse4a = regs[2] >> 6 & 1; - // best = std::max(best, XSIMD_X86_AMD_SSE4A_VERSION * sse4a); // xop = regs[2] >> 11 & 1; - // best = std::max(best, XSIMD_X86_AMD_XOP_VERSION * xop); int regs7[4]; get_cpuid(regs7, 0x7); avx2 = regs7[1] >> 5 & 1; - best = std::max(best, avx2::version() * avx2); int regs7a[4]; get_cpuid(regs7a, 0x7, 0x1); avxvnni = regs7a[0] >> 4 & 1; - best = std::max(best, avxvnni::version() * avxvnni * avx2); - fma3_avx2 = avx2 && fma3_sse; - best = std::max(best, fma3::version() * fma3_avx2); + fma3_avx2 = avx2 && fma3_sse42; avx512f = regs7[1] >> 16 & 1; - best = std::max(best, avx512f::version() * avx512f); - avx512cd = regs7[1] >> 28 & 1; - best = std::max(best, avx512cd::version() * avx512cd * avx512f); - avx512dq = regs7[1] >> 17 & 1; - best = std::max(best, avx512dq::version() * avx512dq * avx512cd * avx512f); - avx512bw = regs7[1] >> 30 & 1; - best = std::max(best, avx512bw::version() * avx512bw * avx512dq * avx512cd * avx512f); - avx512er = regs7[1] >> 27 & 1; - best = std::max(best, avx512er::version() * avx512er * avx512cd * avx512f); - avx512pf = regs7[1] >> 26 & 1; - best = std::max(best, avx512pf::version() * avx512pf * avx512er * avx512cd * avx512f); - avx512ifma = regs7[1] >> 21 & 1; - best = std::max(best, avx512ifma::version() * avx512ifma * avx512bw * avx512dq * avx512cd * avx512f); - avx512vbmi = regs7[2] >> 1 & 1; - best = std::max(best, avx512vbmi::version() * avx512vbmi * avx512ifma * avx512bw * avx512dq * avx512cd * avx512f); - avx512vnni_bw = regs7[2] >> 11 & 1; - best = std::max(best, avx512vnni::version() * avx512vnni_bw * avx512bw * avx512dq * avx512cd * avx512f); - avx512vnni_vbmi = avx512vbmi && avx512vnni_bw; - best = std::max(best, avx512vnni::version() * avx512vnni_vbmi); #endif } }; - } + } // namespace detail inline detail::supported_arch available_architectures() noexcept { diff --git a/include/xsimd/types/xsimd_generic_arch.hpp b/include/xsimd/types/xsimd_generic_arch.hpp index 2aa25419c..f4a2ca6aa 100644 --- a/include/xsimd/types/xsimd_generic_arch.hpp +++ b/include/xsimd/types/xsimd_generic_arch.hpp @@ -43,6 +43,10 @@ namespace xsimd protected: static constexpr unsigned version(unsigned major, unsigned minor, unsigned patch, unsigned multiplier = 100u) noexcept { return major * multiplier * multiplier + minor * multiplier + patch; } }; + + struct unsupported + { + }; } #endif diff --git a/include/xsimd/types/xsimd_rvv_register.hpp b/include/xsimd/types/xsimd_rvv_register.hpp index 1b3daf459..bdc0ef3b8 100644 --- a/include/xsimd/types/xsimd_rvv_register.hpp +++ b/include/xsimd/types/xsimd_rvv_register.hpp @@ -411,6 +411,8 @@ namespace xsimd using type = detail::rvv_bool_simd_register; }; } // namespace types +#else + using rvv = detail::rvv<0xFFFFFFFF>; #endif } // namespace xsimd diff --git a/include/xsimd/types/xsimd_sve_register.hpp b/include/xsimd/types/xsimd_sve_register.hpp index 27b241980..4f75c607e 100644 --- a/include/xsimd/types/xsimd_sve_register.hpp +++ b/include/xsimd/types/xsimd_sve_register.hpp @@ -149,6 +149,8 @@ namespace xsimd using type = detail::sve_bool_simd_register; }; } // namespace types +#else + using sve = detail::sve<0xFFFFFFFF>; #endif } // namespace xsimd diff --git a/test/test_arch.cpp b/test/test_arch.cpp index 8d877a695..0ea7c57c9 100644 --- a/test/test_arch.cpp +++ b/test/test_arch.cpp @@ -124,15 +124,6 @@ TEST_CASE("[multi arch support]") float res = dispatched(data, 17); CHECK_EQ(ref, res); } - - // check that we pick the most appropriate version - { - auto dispatched = xsimd::dispatch>(get_arch_version {}); - unsigned expected = xsimd::available_architectures().best >= xsimd::sse3::version() - ? xsimd::sse3::version() - : xsimd::sse2::version(); - CHECK_EQ(expected, dispatched()); - } #endif }