From 889435ade1b4976f38040cd5d4facad82c90fe32 Mon Sep 17 00:00:00 2001 From: RealTimeChris <40668522+RealTimeChris@users.noreply.github.com> Date: Tue, 19 Sep 2023 20:20:57 -0400 Subject: [PATCH] Enhancement: Improving the efficiency of the AVX-implementations, as well as encapsulating some of their functions. Can't believe I didn't notice this before. --- include/dpp/isa/avx.h | 110 ++++++++++++++-------------------- include/dpp/isa/avx2.h | 120 ++++++++++++++----------------------- include/dpp/isa/avx512.h | 97 ++++++++++++++---------------- include/dpp/isa/fallback.h | 7 +-- 4 files changed, 138 insertions(+), 196 deletions(-) diff --git a/include/dpp/isa/avx.h b/include/dpp/isa/avx.h index 9f1d8b025e..9dc66b0d2b 100644 --- a/include/dpp/isa/avx.h +++ b/include/dpp/isa/avx.h @@ -23,80 +23,22 @@ #if defined _MSC_VER || defined __GNUC__ || defined __clang__ #include - -#ifdef max - #undef max -#endif -#ifdef min - #undef min -#endif +#include namespace dpp { using avx_float = __m128; - using avx_int = __m128i; - - /* - * @brief Extracts a 32-bit integer from a 128-bit AVX register. - * @param value The AVX register containing packed 32-bit integers. - * @param index The index of the 32-bit integer to extract (0-3). - * @return The extracted 32-bit integer. - */ - inline int32_t extract_int32_from_avx(const avx_int& value, int64_t index) { - switch (index) { - case 0: { - return _mm_extract_epi32(value, 0); - } - case 1: { - return _mm_extract_epi32(value, 1); - } - case 2: { - return _mm_extract_epi32(value, 2); - } - case 3: { - return _mm_extract_epi32(value, 3); - } - default: { - return _mm_extract_epi32(value, 0); - } - } - } - + /** * @brief A class for audio mixing operations using AVX instructions. */ class audio_mixer { public: - /* - * @brief The number of 32-bit values per CPU register. - */ - inline static constexpr int32_t byte_blocks_per_register{ 4 }; - - /* - * @brief Stores values from a 128-bit AVX vector to a storage location. 
- * @tparam value_type The target value type for storage. - * @param values_to_store The 128-bit AVX vector containing values to store. - * @param storage_location Pointer to the storage location. - */ - template inline static void store_values(const avx_int& values_to_store, value_type* storage_location) { - for (int64_t x = 0; x < byte_blocks_per_register; ++x) { - storage_location[x] = static_cast(extract_int32_from_avx(values_to_store, x)); - } - } /** - * @brief Specialization for gathering non-float values into an AVX register. - * @tparam value_type The type of values being gathered. - * @tparam Indices Parameter pack of indices for gathering values. - * @return An AVX register containing gathered values. + * @brief The number of 32-bit values per CPU register. */ - template inline static avx_float gather_values(value_type* values) { - alignas(16) float new_array[byte_blocks_per_register]{}; - for (size_t x = 0; x < byte_blocks_per_register; ++x) { - new_array[x] = static_cast(values[x]); - } - return _mm_load_ps(new_array); - } + inline static constexpr int32_t byte_blocks_per_register{ 4 }; /** * @brief Collect a single register worth of data from data_in, apply gain and increment, and store the result in data_out. @@ -115,7 +57,7 @@ namespace dpp { _mm_min_ps(current_samples_new, _mm_set1_ps(static_cast(std::numeric_limits::max()))), _mm_cmp_ps(current_samples_new, _mm_set1_ps(0.0f), _CMP_GE_OQ)); - store_values(_mm_cvtps_epi32(current_samples_new), data_out); + store_values(current_samples_new, data_out); } /** @@ -126,9 +68,49 @@ namespace dpp { * @param decoded_data Pointer to the array of int16_t values. 
*/ inline static void combine_samples(int32_t* up_sampled_vector, const int16_t* decoded_data) { - auto newValues{ _mm_cvtps_epi32(_mm_add_ps(gather_values(up_sampled_vector), gather_values(decoded_data))) }; + auto newValues{ _mm_add_ps(gather_values(up_sampled_vector), gather_values(decoded_data)) }; store_values(newValues, up_sampled_vector); } + + protected: + + /** + * @brief Stores values from a 128-bit AVX vector to a storage location. + * @tparam value_type The target value type for storage. + * @param values_to_store The 128-bit AVX vector containing values to store. + * @param storage_location Pointer to the storage location. + */ + template inline static void store_values(const avx_float& values_to_store, value_type* storage_location) { + for (int64_t x = 0; x < byte_blocks_per_register; ++x) { + storage_location[x] = static_cast(extract_float_from_avx(values_to_store, x)); + } + } + + /** + * @brief Specialization for gathering non-float values into an AVX register. + * @tparam value_type The type of values being gathered. + * @tparam Indices Parameter pack of indices for gathering values. + * @return An AVX register containing gathered values. + */ + template inline static avx_float gather_values(value_type* values) { + alignas(16) float new_array[byte_blocks_per_register]{}; + for (uint64_t x = 0; x < byte_blocks_per_register; ++x) { + new_array[x] = static_cast(values[x]); + } + return _mm_load_ps(new_array); + } + + /** + * @brief Extracts a 32-bit float from a 128-bit AVX register. + * @param value The AVX register containing packed 32-bit floats. + * @param index The index of the 32-bit float to extract (0-3). + * @return The extracted 32-bit float. 
+ */ + inline static float extract_float_from_avx(const avx_float& value, int64_t index) { + alignas(16) float new_array[4]{}; + _mm_store_ps(new_array, value); + return new_array[index]; + } }; } // namespace dpp diff --git a/include/dpp/isa/avx2.h b/include/dpp/isa/avx2.h index 8f89cb9509..de53274293 100644 --- a/include/dpp/isa/avx2.h +++ b/include/dpp/isa/avx2.h @@ -23,92 +23,22 @@ #if defined _MSC_VER || defined __GNUC__ || defined __clang__ #include - -#ifdef max - #undef max -#endif -#ifdef min - #undef min -#endif +#include namespace dpp { using avx_2_float = __m256; - using avx_2_int = __m256i; - - /* - * @brief Extracts a 32-bit integer from a 256-bit AVX2 register. - * @param value The AVX2 register containing packed 32-bit integers. - * @param index The index of the 32bit integer to extract (0-7). - * @return The extracted 32-bit integer. - */ - inline int32_t extract_int32_from_avx2(const avx_2_int& value, int64_t index) { - switch (index) { - case 0: { - return _mm256_extract_epi32(value, 0); - } - case 1: { - return _mm256_extract_epi32(value, 1); - } - case 2: { - return _mm256_extract_epi32(value, 2); - } - case 3: { - return _mm256_extract_epi32(value, 3); - } - case 4: { - return _mm256_extract_epi32(value, 4); - } - case 5: { - return _mm256_extract_epi32(value, 5); - } - case 6: { - return _mm256_extract_epi32(value, 6); - } - case 7: { - return _mm256_extract_epi32(value, 7); - } - default: { - return _mm256_extract_epi32(value, 0); - } - } - } /** * @brief A class for audio mixing operations using AVX2 instructions. */ class audio_mixer { public: - /* - * @brief The number of 32-bit values per CPU register. - */ - inline static constexpr int32_t byte_blocks_per_register{ 8 }; - - /* - * @brief Stores values from a 256-bit AVX2 vector to a storage location. - * @tparam value_type The target value type for storage. - * @param values_to_store The 256-bit AVX2 vector containing values to store. 
- * @param storage_location Pointer to the storage location. - */ - template inline static void store_values(const avx_2_int& values_to_store, value_type* storage_location) { - for (int64_t x = 0; x < byte_blocks_per_register; ++x) { - storage_location[x] = static_cast(extract_int32_from_avx2(values_to_store, x)); - } - } /** - * @brief Specialization for gathering non-float values into an AVX2 register. - * @tparam value_type The type of values being gathered. - * @tparam Indices Parameter pack of indices for gathering values. - * @return An AVX2 register containing gathered values. + * @brief The number of 32-bit values per CPU register. */ - template inline static avx_2_float gather_values(value_type* values) { - alignas(32) float new_array[byte_blocks_per_register]{}; - for (size_t x = 0; x < byte_blocks_per_register; ++x) { - new_array[x] = static_cast(values[x]); - } - return _mm256_load_ps(new_array); - } + inline static constexpr int32_t byte_blocks_per_register{ 8 }; /** * @brief Collect a single register worth of data from data_in, apply gain and increment, and store the result in data_out. @@ -129,7 +59,7 @@ namespace dpp { _mm256_min_ps(current_samples_new, _mm256_set1_ps(static_cast(std::numeric_limits::max()))), _mm256_cmp_ps(current_samples_new, _mm256_set1_ps(0.0f), _CMP_GE_OQ)); - store_values(_mm256_cvtps_epi32(current_samples_new), data_out); + store_values(current_samples_new, data_out); } /** @@ -141,9 +71,49 @@ namespace dpp { * @param x Index to select a specific set of elements to combine. */ inline static void combine_samples(int32_t* up_sampled_vector, const int16_t* decoded_data) { - auto newValues{ _mm256_cvtps_epi32(_mm256_add_ps(gather_values(up_sampled_vector), gather_values(decoded_data))) }; + auto newValues{ _mm256_add_ps(gather_values(up_sampled_vector), gather_values(decoded_data)) }; store_values(newValues, up_sampled_vector); } + + protected: + + /** + * @brief Stores values from a 256-bit AVX2 vector to a storage location. 
+ * @tparam value_type The target value type for storage. + * @param values_to_store The 256-bit AVX2 vector containing values to store. + * @param storage_location Pointer to the storage location. + */ + template inline static void store_values(const avx_2_float& values_to_store, value_type* storage_location) { + for (int64_t x = 0; x < byte_blocks_per_register; ++x) { + storage_location[x] = static_cast(extract_float_from_avx_2(values_to_store, x)); + } + } + + /** + * @brief Specialization for gathering non-float values into an AVX2 register. + * @tparam value_type The type of values being gathered. + * @tparam Indices Parameter pack of indices for gathering values. + * @return An AVX2 register containing gathered values. + */ + template inline static avx_2_float gather_values(value_type* values) { + alignas(32) float new_array[byte_blocks_per_register]{}; + for (uint64_t x = 0; x < byte_blocks_per_register; ++x) { + new_array[x] = static_cast(values[x]); + } + return _mm256_load_ps(new_array); + } + + /** + * @brief Extracts a 32-bit float from a 256-bit AVX2 register. + * @param value The AVX2 register containing packed 32-bit floats. + * @param index The index of the 32-bit float to extract (0-7). + * @return The extracted 32-bit float. 
+ */ + inline static float extract_float_from_avx_2(const avx_2_float& value, int64_t index) { + alignas(32) float new_array[byte_blocks_per_register]{}; + _mm256_store_ps(new_array, value); + return new_array[index]; + } }; } // namespace dpp diff --git a/include/dpp/isa/avx512.h b/include/dpp/isa/avx512.h index b691242b9f..3fa9b31096 100644 --- a/include/dpp/isa/avx512.h +++ b/include/dpp/isa/avx512.h @@ -23,67 +23,22 @@ #if defined _MSC_VER || defined __GNUC__ || defined __clang__ #include - -#ifdef max - #undef max -#endif -#ifdef min - #undef min -#endif - +#include namespace dpp { using avx_512_float = __m512; - using avx_512_int = __m512i; - - /* - * @brief Extracts a 32-bit integer from a 512-bit AVX-512 register. - * @param value The AVX-512 register containing packed 32-bit integers. - * @param index The index of the 32-bit integer to extract (0-15). - * @return The extracted 32-bit integer. - */ - inline int32_t extract_int32_from_avx512(const avx_512_int& value, int64_t index) { - alignas(64) int32_t result[32]; - _mm512_store_si512(result, value); - return result[index]; - } - /** * @brief A class for audio mixing operations using AVX512 instructions. */ class audio_mixer { public: - /* - * @brief The number of 32-bit values per CPU register. - */ - inline static constexpr int32_t byte_blocks_per_register{ 16 }; - - /* - * @brief Stores values from a 512-bit AVX512 vector to a storage location. - * @tparam value_type The target value type for storage. - * @param values_to_store The 512-bit AVX512 vector containing values to store. - * @param storage_location Pointer to the storage location. - */ - template inline static void store_values(const avx_512_int& values_to_store, value_type* storage_location) { - for (int64_t x = 0; x < byte_blocks_per_register; ++x) { - storage_location[x] = static_cast(extract_int32_from_avx512(values_to_store, x)); - } - } /** - * @brief Specialization for gathering non-float values into an AVX512 register. 
- * @tparam value_type The type of values being gathered. - * @tparam Indices Parameter pack of indices for gathering values. - * @return An AVX512 register containing gathered values. + * @brief The number of 32-bit values per CPU register. */ - template inline static avx_512_float gather_values(value_type* values) { - alignas(64) float new_array[byte_blocks_per_register]{}; - for (size_t x = 0; x < byte_blocks_per_register; ++x) { - new_array[x] = static_cast(values[x]); - } - return _mm512_load_ps(new_array); - } + inline static constexpr int32_t byte_blocks_per_register{ 16 }; /** * @brief Collect a single register worth of data from data_in, apply gain and increment, and store the result in data_out. @@ -108,7 +63,7 @@ namespace dpp { current_samples_new = _mm512_mask_max_ps(current_samples_new, mask_ge, current_samples_new, lower_limit); current_samples_new = _mm512_mask_min_ps(current_samples_new, ~mask_ge, current_samples_new, upper_limit); - store_values(_mm512_cvtps_epi32(current_samples_new), data_out); + store_values(current_samples_new, data_out); } /** @@ -119,9 +74,49 @@ namespace dpp { * @param decoded_data Pointer to the array of int16_t values. */ inline static void combine_samples(int32_t* up_sampled_vector, const int16_t* decoded_data) { - auto newValues{ _mm512_cvtps_epi32(_mm512_add_ps(gather_values(up_sampled_vector), gather_values(decoded_data))) }; + auto newValues{ _mm512_add_ps(gather_values(up_sampled_vector), gather_values(decoded_data)) }; store_values(newValues, up_sampled_vector); } + + protected: + + /** + * @brief Stores values from a 512-bit AVX512 vector to a storage location. + * @tparam value_type The target value type for storage. + * @param values_to_store The 512-bit AVX512 vector containing values to store. + * @param storage_location Pointer to the storage location. 
+ */ + template inline static void store_values(const avx_512_float& values_to_store, value_type* storage_location) { + for (int64_t x = 0; x < byte_blocks_per_register; ++x) { + storage_location[x] = static_cast(extract_float_from_avx_512(values_to_store, x)); + } + } + + /** + * @brief Specialization for gathering non-float values into an AVX512 register. + * @tparam value_type The type of values being gathered. + * @tparam Indices Parameter pack of indices for gathering values. + * @return An AVX512 register containing gathered values. + */ + template inline static avx_512_float gather_values(value_type* values) { + alignas(64) float new_array[byte_blocks_per_register]{}; + for (uint64_t x = 0; x < byte_blocks_per_register; ++x) { + new_array[x] = static_cast(values[x]); + } + return _mm512_load_ps(new_array); + } + + /** + * @brief Extracts a 32-bit float from a 512-bit AVX512 register. + * @param value The AVX512 register containing packed 32-bit floats. + * @param index The index of the 32-bit float to extract (0-15). + * @return The extracted 32-bit float. + */ + inline static float extract_float_from_avx_512(const avx_512_float& value, int64_t index) { + alignas(64) float new_array[byte_blocks_per_register]{}; + _mm512_store_ps(new_array, value); + return new_array[index]; + } }; } // namespace dpp diff --git a/include/dpp/isa/fallback.h b/include/dpp/isa/fallback.h index 2ce44c4464..147ff51d0a 100644 --- a/include/dpp/isa/fallback.h +++ b/include/dpp/isa/fallback.h @@ -20,12 +20,7 @@ ************************************************************************************/ #pragma once -#ifdef max - #undef max -#endif -#ifdef min - #undef min -#endif +#include namespace dpp {