-
-
Notifications
You must be signed in to change notification settings - Fork 165
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Adding AVX implementation for mixing audio.
This should significantly increase efficiency on supported platforms. The CMake variable AVX_TYPE is set to one of T_AVX512, T_AVX2, T_AVX, or T_AVX_FALLBACK to select the corresponding instruction set.
- Loading branch information
1 parent
b68a006
commit 08284e4
Showing
8 changed files
with
370 additions
and
8 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
include(CheckCXXSourceRuns) | ||
|
||
# Probe whether one SIMD instruction set can be compiled AND executed on this host.
#
# Arguments:
#   INSTRUCTION_SET_NAME      - name of the result variable for the probe (e.g. T_AVX2);
#                               also reported through AVX_TYPE / AVX_NAME on success.
#   INSTRUCTION_SET_FLAG      - ;-list of compiler flags that enable the instruction set.
#   INSTRUCTION_SET_INTRINSIC - C++ statement(s) placed in main() of the probe program.
#
# On success AVX_TYPE, AVX_FLAG and AVX_NAME are set in the caller's scope.
# On failure they are left untouched, so whatever an earlier probe selected stays active.
function(check_instruction_set INSTRUCTION_SET_NAME INSTRUCTION_SET_FLAG INSTRUCTION_SET_INTRINSIC)

  set(INSTRUCTION_SET_CODE "
#include <immintrin.h>
#include <stdint.h>
int main()
{
${INSTRUCTION_SET_INTRINSIC};
return 0;
}
")

  # CMAKE_REQUIRED_FLAGS must be a space-delimited string; a ;-list is not supported
  # by the Check* modules. AVX_FLAG below intentionally stays a ;-list.
  # The variable is function-local here, so the caller's value is not modified.
  string(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${INSTRUCTION_SET_FLAG}")
  check_cxx_source_runs("${INSTRUCTION_SET_CODE}" "${INSTRUCTION_SET_NAME}")
  if(${INSTRUCTION_SET_NAME})
    set(AVX_TYPE "${INSTRUCTION_SET_NAME}" PARENT_SCOPE)
    set(AVX_FLAG "${INSTRUCTION_SET_FLAG}" PARENT_SCOPE)
    set(AVX_NAME "${INSTRUCTION_SET_NAME}" PARENT_SCOPE)
  else()
    message(STATUS "Instruction set ${INSTRUCTION_SET_NAME} not supported. Falling back to the previous instruction set.")
  endif()
endfunction()
|
||
# Table of instruction sets to probe, ordered from least to most capable so that the
# last successful probe wins. Each entry is "NAME?FLAGS?STATEMENTS" where:
#   '?' separates the three fields, and
#   '.' separates individual flags / individual C++ statements inside a field
#       (both are turned into ';' below).
if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
  set(INSTRUCTION_SETS
    "T_AVX?/arch:AVX?auto result = _mm_testz_ps(__m128{}, __m128{})"
    "T_AVX2?/arch:AVX2?auto result = _mm256_extract_epi64(__m256i{}, 0)"
    # _mm512_add_ps takes __m512 operands; integer vectors do not convert implicitly.
    "T_AVX512?/arch:AVX512?auto result = _mm512_add_ps(__m512{}, __m512{}).auto result2 = _mm512_cmplt_epu8_mask(__m512i{}, __m512i{})"
  )
else()
  set(INSTRUCTION_SETS
    "T_AVX?-mavx.-mpclmul.-mbmi?auto result = _mm_testz_ps(__m128{}, __m128{})"
    "T_AVX2?-mavx2.-mavx.-mpclmul.-mbmi?auto result = _mm256_extract_epi64(__m256i{}, 0)"
    # _mm512_add_ps takes __m512 operands; integer vectors do not convert implicitly.
    "T_AVX512?-mavx512bw.-mavx512f.-mavx2.-mavx.-mpclmul.-mbmi?auto result = _mm512_add_ps(__m512{}, __m512{}).auto result2 = _mm512_cmplt_epu8_mask(__m512i{}, __m512i{})"
  )
endif()

# Preserve the caller's CMAKE_REQUIRED_FLAGS across the probes.
set(CMAKE_REQUIRED_FLAGS_SAVE "${CMAKE_REQUIRED_FLAGS}")

# Reported name when none of the probes below succeeds.
set(AVX_NAME "T_Fallback")

foreach(INSTRUCTION_SET IN LISTS INSTRUCTION_SETS)
  # Split "NAME?FLAGS?STATEMENTS" into its three fields.
  string(REPLACE "?" ";" CURRENT_LIST "${INSTRUCTION_SET}")
  list(GET CURRENT_LIST 0 INSTRUCTION_SET_NAME)
  list(GET CURRENT_LIST 1 INSTRUCTION_SET_FLAG)
  list(GET CURRENT_LIST 2 INSTRUCTION_SET_INTRINSIC)
  # '.' -> ';' turns the flags into a list and the intrinsic text into separate statements.
  string(REPLACE "." ";" INSTRUCTION_SET_FLAG "${INSTRUCTION_SET_FLAG}")
  string(REPLACE "." ";" INSTRUCTION_SET_INTRINSIC "${INSTRUCTION_SET_INTRINSIC}")
  check_instruction_set("${INSTRUCTION_SET_NAME}" "${INSTRUCTION_SET_FLAG}" "${INSTRUCTION_SET_INTRINSIC}")
endforeach()

message(STATUS "Detected CPU Architecture: ${AVX_NAME}")
set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS_SAVE}")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,249 @@ | ||
/************************************************************************************ | ||
* | ||
* D++, A Lightweight C++ library for Discord | ||
* | ||
* Copyright 2021 Craig Edwards and D++ contributors | ||
* (https://github.com/brainboxdotcc/DPP/graphs/contributors) | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
* | ||
************************************************************************************/ | ||
#pragma once | ||
|
||
#include <immintrin.h> | ||
#include <opus/opus.h> | ||
#include <limits> | ||
|
||
#ifdef max | ||
#undef max | ||
#endif | ||
|
||
#ifdef min | ||
#undef min | ||
#endif | ||
|
||
namespace dpp { | ||
|
||
#ifdef T_AVX512 | ||
|
||
/** | ||
* @brief Collect sixteen elements from data_in, apply gain and increment, and store the result in data_out. | ||
* This version uses AVX-512 instructions. | ||
* | ||
* @param data_in Pointer to the input array of opus_int32 values. | ||
* @param data_out Pointer to the output array of opus_int16 values. | ||
* @param current_gain The gain to be applied to the elements. | ||
* @param increment The increment value to be added to each element. | ||
*/ | ||
inline void collect_sixteen_elements(opus_int32* data_in, opus_int16* data_out, float current_gain, float increment) { | ||
auto original_values = _mm512_load_si512(reinterpret_cast<const __m512i*>(data_in)); | ||
auto original_float_values = _mm512_cvtepi32_ps(original_values); | ||
auto original_increment = _mm512_mul_ps(_mm512_set1_ps(increment), | ||
_mm512_set_ps(0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f)); | ||
auto original_gain = _mm512_set1_ps(current_gain); | ||
auto final_gain = _mm512_add_ps(original_gain, original_increment); | ||
__m512 current_samples_new{ _mm512_mul_ps(original_float_values, final_gain) }; | ||
auto selector_values{ _mm512_cmp_ps_mask(current_samples_new, _mm512_set1_ps(0.0f), _CMP_GE_OQ) }; | ||
auto max_values{ _mm512_max_ps(current_samples_new, _mm512_set1_ps(static_cast<float>(std::numeric_limits<opus_int16>::min()))) }; | ||
auto min_values{ _mm512_min_ps(current_samples_new, _mm512_set1_ps(static_cast<float>(std::numeric_limits<opus_int16>::max()))) }; | ||
|
||
__m512 final_values{ _mm512_mask_blend_ps(selector_values, max_values, min_values) }; | ||
|
||
__m512i current_samples_newer{ _mm512_cvtps_epi32(final_values) }; | ||
|
||
__m256i pack256{ _mm256_packs_epi32(_mm512_extracti64x4_epi64(current_samples_newer, 0), | ||
_mm512_extracti64x4_epi64(current_samples_newer, 1)) }; | ||
|
||
_mm256_storeu_si256(reinterpret_cast<__m256i*>(data_out), pack256); | ||
} | ||
|
||
/** | ||
* @brief Combine 16 elements from up_sampled_vector with 16 elements from decoded_data and store the result in up_sampled_vector. | ||
* This version uses AVX-512 instructions. | ||
* | ||
* @param up_sampled_vector Pointer to the array of opus_int32 values. | ||
* @param decoded_data Pointer to the array of opus_int16 values. | ||
* @param x Index to select a specific set of elements to combine. | ||
*/ | ||
inline void combine_samples(opus_int32* up_sampled_vector, const opus_int16* decoded_data, size_t x) { | ||
__m512i* up_sampled_ptr = reinterpret_cast<__m512i*>(up_sampled_vector + (x * 16)); | ||
const __m256i* decoded_ptr = reinterpret_cast<const __m256i*>(decoded_data + (x * 16)); | ||
|
||
__m512i current_up_sampled = _mm512_loadu_si512(up_sampled_ptr); | ||
__m256i current_decoded256 = _mm256_loadu_si256(decoded_ptr); | ||
|
||
__m512i upSampledAdded512 = _mm512_add_epi32(current_up_sampled, _mm512_cvtepi16_epi32(current_decoded256)); | ||
|
||
_mm512_storeu_si512(up_sampled_ptr, upSampledAdded512); | ||
} | ||
|
||
#elif T_AVX2 | ||
|
||
/** | ||
* @brief Collect sixteen elements from data_in, apply gain and increment, and store the result in data_out. | ||
* This version uses AVX2 instructions. | ||
* | ||
* @param data_in Pointer to the input array of opus_int32 values. | ||
* @param data_out Pointer to the output array of opus_int16 values. | ||
* @param current_gain The gain to be applied to the elements. | ||
* @param increment The increment value to be added to each element. | ||
*/ | ||
inline void collect_sixteen_elements(opus_int32* data_in, opus_int16* data_out, float current_gain, float increment) { | ||
for (size_t x = 0; x < 2; ++x) { | ||
auto original_values = _mm256_load_si256(reinterpret_cast<const __m256i*>(data_in) + x); | ||
auto original_float_values = _mm256_cvtepi32_ps(original_values); | ||
auto original_increment = _mm256_mul_ps(_mm256_set1_ps(increment), | ||
_mm256_set_ps(0.0f * static_cast<float>(x), 1.0f * static_cast<float>(x), 2.0f * static_cast<float>(x), 3.0f * static_cast<float>(x), | ||
4.0f * static_cast<float>(x), 5.0f * static_cast<float>(x), 6.0f * static_cast<float>(x), 7.0f * static_cast<float>(x))); | ||
auto original_gain = _mm256_set1_ps(current_gain); | ||
auto final_gain = _mm256_add_ps(original_gain, original_increment); | ||
__m256 current_samples_new{ _mm256_mul_ps(original_float_values, final_gain) }; | ||
auto selector_values{ _mm256_cmp_ps(current_samples_new, _mm256_set1_ps(0.0f), _CMP_GE_OQ) }; | ||
auto max_values{ _mm256_max_ps(current_samples_new, _mm256_set1_ps(static_cast<float>(std::numeric_limits<opus_int16>::min()))) }; | ||
auto min_values{ _mm256_min_ps(current_samples_new, _mm256_set1_ps(static_cast<float>(std::numeric_limits<opus_int16>::max()))) }; | ||
|
||
__m256 final_values{ _mm256_blendv_ps(max_values, min_values, selector_values) }; | ||
|
||
__m256i current_samples_newer{ _mm256_cvtps_epi32(final_values) }; | ||
|
||
__m128i pack128{ _mm_packs_epi32(_mm256_extractf128_si256(current_samples_newer, 0), | ||
_mm256_extractf128_si256(current_samples_newer, 1)) }; | ||
|
||
_mm_storeu_si128(reinterpret_cast<__m128i*>(data_out) + x, pack128); | ||
} | ||
} | ||
|
||
/** | ||
* @brief Combine 8 elements from up_sampled_vector with 8 elements from decoded_data and store the result in up_sampled_vector. | ||
* This version uses AVX2 instructions. | ||
* | ||
* @param up_sampled_vector Pointer to the array of opus_int32 values. | ||
* @param decoded_data Pointer to the array of opus_int16 values. | ||
* @param x Index to select a specific set of elements to combine. | ||
*/ | ||
inline void combine_samples(opus_int32* up_sampled_vector, const opus_int16* decoded_data, size_t x) { | ||
for (size_t y = 0; y < 2; ++y) { | ||
__m256i* up_sampled_ptr = reinterpret_cast<__m256i*>(up_sampled_vector + (x * 8 * y)); | ||
const __m128i* decoded_ptr = reinterpret_cast<const __m128i*>(decoded_data + (x * 8 * y)); | ||
|
||
__m256i current_up_sampled = _mm256_loadu_si256(up_sampled_ptr); | ||
__m128i current_decoded128 = _mm_loadu_si128(decoded_ptr); | ||
|
||
__m256i up_sampled_added = _mm256_add_epi32(current_up_sampled, _mm256_cvtepi16_epi32(current_decoded128)); | ||
|
||
_mm256_storeu_si256(up_sampled_ptr, up_sampled_added); | ||
} | ||
} | ||
|
||
#elif T_AVX | ||
|
||
/** | ||
* @brief Collect sixteen elements from data_in, apply gain and increment, and store the result in data_out. | ||
* This version uses AVX instructions. | ||
* | ||
* @param data_in Pointer to the input array of opus_int32 values. | ||
* @param data_out Pointer to the output array of opus_int16 values. | ||
* @param current_gain The gain to be applied to the elements. | ||
* @param increment The increment value to be added to each element. | ||
*/ | ||
inline void collect_sixteen_elements(opus_int32* data_in, opus_int16* data_out, float current_gain, float increment) { | ||
const __m128 current_gain128 = _mm_set1_ps(current_gain); | ||
const __m128 increment128 = _mm_set1_ps(increment); | ||
|
||
for (int x = 0; x < 4; ++x) { | ||
auto original_values = _mm_load_si128(reinterpret_cast<const __m128i*>(data_in) + x); | ||
auto original_float_values = _mm_cvtepi32_ps(original_values); | ||
auto original_increment = _mm_mul_ps(_mm_set1_ps(increment), | ||
_mm_set_ps(0.0f * static_cast<float>(x), 1.0f * static_cast<float>(x), 2.0f * static_cast<float>(x), 3.0f * static_cast<float>(x))); | ||
auto original_gain = _mm_set1_ps(current_gain); | ||
auto final_gain = _mm_add_ps(original_gain, original_increment); | ||
__m128 currentSamplesNew128{ _mm_mul_ps(original_float_values, final_gain) }; | ||
auto selector_values{ _mm_cmp_ps(currentSamplesNew128, _mm_set1_ps(0.0f), _CMP_GE_OQ) }; | ||
auto max_values{ _mm_max_ps(currentSamplesNew128, _mm_set1_ps(static_cast<float>(std::numeric_limits<opus_int16>::min()))) }; | ||
auto min_values{ _mm_min_ps(currentSamplesNew128, _mm_set1_ps(static_cast<float>(std::numeric_limits<opus_int16>::max()))) }; | ||
|
||
__m128 final_values{ _mm_blendv_ps(max_values, min_values, selector_values) }; | ||
|
||
__m128i current_samples_newer{ _mm_cvtps_epi32(final_values) }; | ||
__m128i pack128{ _mm_packus_epi32(_mm_extracti_si64(current_samples_newer, 0, 64), _mm_extracti_si64(current_samples_newer, 63, 64)) }; | ||
|
||
_mm_storeu_si128(reinterpret_cast<__m128i*>(data_out) + x, pack128); | ||
} | ||
} | ||
|
||
/** | ||
* @brief Combine 4 elements from up_sampled_vector with 4 elements from decoded_data and store the result in up_sampled_vector. | ||
* This version uses AVX instructions. | ||
* | ||
* @param up_sampled_vector Pointer to the array of opus_int32 values. | ||
* @param decoded_data Pointer to the array of opus_int16 values. | ||
* @param x Index to select a specific set of elements to combine. | ||
*/ | ||
inline void combine_samples(opus_int32* up_sampled_vector, const opus_int16* decoded_data, size_t x) { | ||
for (size_t y = 0; y < 4; ++y) { | ||
__m128i* up_sampled_ptr = reinterpret_cast<__m128i*>(up_sampled_vector + (x * 8 * y)); | ||
const __m128i* decoded_ptr = reinterpret_cast<const __m128i*>(decoded_data + (x * 8 * y)); | ||
|
||
__m128i current_up_sampled = _mm_loadu_si128(up_sampled_ptr); | ||
__m128i current_decoded128 = _mm_loadu_si128(decoded_ptr); | ||
|
||
__m128i up_sampled_added = _mm_add_epi32(current_up_sampled, _mm_cvtepi16_epi32(current_decoded128)); | ||
|
||
_mm_storeu_si128(up_sampled_ptr, up_sampled_added); | ||
} | ||
} | ||
|
||
#else | ||
|
||
/** | ||
* @brief Collect sixteen elements from data_in, apply gain and increment, and store the result in data_out. | ||
* This version uses scalar operations (no SIMD instructions). | ||
* | ||
* @param data_in Pointer to the input array of opus_int32 values. | ||
* @param data_out Pointer to the output array of opus_int16 values. | ||
* @param current_gainFloat The gain to be applied to the elements. | ||
* @param incrementFloat The increment value to be added to each element. | ||
*/ | ||
inline void collect_sixteen_elements(opus_int32* data_in, opus_int16* data_out, float current_gainFloat, float incrementFloat) { | ||
for (int i = 0; i < 16; ++i) { | ||
float current_sample_float = static_cast<float>(data_in[i]); | ||
current_sample_float = current_sample_float * current_gainFloat + incrementFloat * i; | ||
|
||
if (current_sample_float > static_cast<float>(std::numeric_limits<opus_int16>::max())) { | ||
current_sample_float = static_cast<float>(std::numeric_limits<opus_int16>::max()); | ||
} | ||
else if (current_sample_float < static_cast<float>(std::numeric_limits<opus_int16>::min())) { | ||
current_sample_float = static_cast<float>(std::numeric_limits<opus_int16>::min()); | ||
} | ||
|
||
data_out[i] = static_cast<opus_int16>(current_sample_float); | ||
} | ||
} | ||
|
||
/** | ||
* @brief Combine 1 element from up_sampled_vector with 1 element from decoded_data and store the result in up_sampled_vector. | ||
* This version uses scalar operations (no SIMD instructions). | ||
* | ||
* @param up_sampled_vector Pointer to the array of opus_int32 values. | ||
* @param decoded_data Pointer to the array of opus_int16 values. | ||
* @param x Index to select a specific element to combine. | ||
*/ | ||
inline void combine_samples(opus_int32* up_sampled_vector, const opus_int16* decoded_data, size_t x) { | ||
for (size_t i = 0; i < 16; ++i) { | ||
size_t index = (x * 16) + i; | ||
up_sampled_vector[index] += static_cast<int32_t>(decoded_data[index]); | ||
} | ||
} | ||
|
||
#endif | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.