Skip to content

Commit

Permalink
Adding AVX implementation for mixing audio.
Browse files Browse the repository at this point in the history
Should significantly increase efficiency on supported platforms. Specify a CMake variable called AVX_TYPE that is set to either T_AVX512, T_AVX2, T_AVX, or T_AVX_FALLBACK to specify each of the respective architectures.
  • Loading branch information
RealTimeChris committed Aug 1, 2023
1 parent b68a006 commit 08284e4
Show file tree
Hide file tree
Showing 8 changed files with 370 additions and 8 deletions.
56 changes: 56 additions & 0 deletions cmake/DetectArchitecture.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
include(CheckCXXSourceRuns)

# check_instruction_set(<name> <flags> <intrinsic>)
#
# Builds and runs a minimal C++ program that executes <intrinsic> with <flags>
# passed to the compiler. The outcome is stored by check_cxx_source_runs() in
# the cache variable called <name>.
#
#   INSTRUCTION_SET_NAME       Result variable name (e.g. T_AVX2); also the value
#                              written to AVX_TYPE / AVX_NAME on success.
#   INSTRUCTION_SET_FLAG       ;-list of compiler flags enabling the instruction set.
#   INSTRUCTION_SET_INTRINSIC  ;-separated C++ statements placed inside main().
#
# On success AVX_TYPE, AVX_FLAG and AVX_NAME are set in the caller's scope.
# On failure they are left untouched, so an earlier successful result stays in effect.
function(check_instruction_set INSTRUCTION_SET_NAME INSTRUCTION_SET_FLAG INSTRUCTION_SET_INTRINSIC)

    set(INSTRUCTION_SET_CODE "
#include <immintrin.h>
#include <stdint.h>
int main()
{
${INSTRUCTION_SET_INTRINSIC};
return 0;
}
")

    # CMAKE_REQUIRED_FLAGS has to be a space-delimited string; handing it a
    # ;-list lets only the first flag reach the test compile reliably.
    # AVX_FLAG below keeps the list form because target_compile_options() wants a list.
    string(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${INSTRUCTION_SET_FLAG}")
    check_cxx_source_runs("${INSTRUCTION_SET_CODE}" "${INSTRUCTION_SET_NAME}")

    if(${INSTRUCTION_SET_NAME})
        set(AVX_TYPE "${INSTRUCTION_SET_NAME}" PARENT_SCOPE)
        set(AVX_FLAG "${INSTRUCTION_SET_FLAG}" PARENT_SCOPE)
        set(AVX_NAME "${INSTRUCTION_SET_NAME}" PARENT_SCOPE)
    else()
        message(STATUS "Instruction set ${INSTRUCTION_SET_NAME} not supported. Falling back to the previous instruction set.")
    endif()
endfunction()

# Candidate instruction sets, ordered from least to most capable so that the
# last successful probe wins. Each entry has the form
#   "<name>?<compiler flags>?<probe statements>"
# where "." separates several flags / several statements inside one field.
if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
    set(INSTRUCTION_SETS
        "T_AVX?/arch:AVX?auto result = _mm_testz_ps(__m128{}, __m128{})"
        "T_AVX2?/arch:AVX2?auto result = _mm256_extract_epi64(__m256i{}, 0)"
        # _mm512_add_ps takes __m512 operands; passing __m512i makes the probe
        # fail to compile, which would report AVX-512 as unsupported everywhere.
        "T_AVX512?/arch:AVX512?auto result = _mm512_add_ps(__m512{}, __m512{}).auto result2 = _mm512_cmplt_epu8_mask(__m512i{}, __m512i{})"
    )
else()
    set(INSTRUCTION_SETS
        "T_AVX?-mavx.-mpclmul.-mbmi?auto result = _mm_testz_ps(__m128{}, __m128{})"
        "T_AVX2?-mavx2.-mavx.-mpclmul.-mbmi?auto result = _mm256_extract_epi64(__m256i{}, 0)"
        # Same operand-type fix as in the MSVC list above.
        "T_AVX512?-mavx512bw.-mavx512f.-mavx2.-mavx.-mpclmul.-mbmi?auto result = _mm512_add_ps(__m512{}, __m512{}).auto result2 = _mm512_cmplt_epu8_mask(__m512i{}, __m512i{})"
    )
endif()

# Preserve whatever the including project had in CMAKE_REQUIRED_FLAGS.
set(CMAKE_REQUIRED_FLAGS_SAVE "${CMAKE_REQUIRED_FLAGS}")

# Reported name when none of the probes succeeds.
set(AVX_NAME "T_Fallback")

foreach(INSTRUCTION_SET IN LISTS INSTRUCTION_SETS)
    # Split "<name>?<flags>?<code>" into its three fields.
    string(REPLACE "?" ";" CURRENT_LIST "${INSTRUCTION_SET}")
    list(GET CURRENT_LIST 0 INSTRUCTION_SET_NAME)
    list(GET CURRENT_LIST 1 INSTRUCTION_SET_FLAG)
    list(GET CURRENT_LIST 2 INSTRUCTION_SET_INTRINSIC)
    # "." -> ";" turns the flags into a list and the probe into separate statements.
    string(REPLACE "." ";" INSTRUCTION_SET_FLAG "${INSTRUCTION_SET_FLAG}")
    string(REPLACE "." ";" INSTRUCTION_SET_INTRINSIC "${INSTRUCTION_SET_INTRINSIC}")
    check_instruction_set("${INSTRUCTION_SET_NAME}" "${INSTRUCTION_SET_FLAG}" "${INSTRUCTION_SET_INTRINSIC}")
endforeach()

message(STATUS "Detected CPU Architecture: ${AVX_NAME}")
set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS_SAVE}")
38 changes: 38 additions & 0 deletions include/dpp/discordvoiceclient.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
#include <dpp/dispatcher.h>
#include <dpp/cluster.h>
#include <dpp/discordevents.h>
#include <dpp/isa_detection.hpp>
#include <dpp/socket.h>
#include <queue>
#include <thread>
Expand All @@ -58,6 +59,23 @@ namespace dpp {

using json = nlohmann::json;

/**
* @brief For holding a moving average of the number of current voice users, for applying a smooth gain ramp.
*/
struct DPP_EXPORT moving_averager {
/**
* @brief Default constructor. Leaves the value queue empty and the collection count at zero.
*/
moving_averager() = default;

/**
* @brief Construct from a collection count.
* NOTE(review): the definition is not in this header; presumably the argument
* initialises collectionCount (the size of the averaging window) - confirm.
*
* @param collectionCountNew Collection count to use.
*/
moving_averager(uint64_t collectionCountNew);

/**
* @brief Feed a value into the averager.
* NOTE(review): the result is returned by value (a copy), not by reference.
*
* @param value Value to add.
* @return moving_averager Copy of the averager.
*/
moving_averager operator+=(int64_t value);

/**
* @brief Implicit conversion to float.
* NOTE(review): presumably yields the current average of the stored values - confirm against the definition.
*/
operator float();

protected:
/** @brief Stored values. */
std::deque<int64_t> values{};
/** @brief Collection count; NOTE(review): presumably the maximum number of stored values - confirm. */
uint64_t collectionCount{};
};

// Forward declaration
class cluster;

Expand Down Expand Up @@ -473,6 +491,21 @@ class DPP_EXPORT discord_voice_client : public websocket_client
*/
bool terminating;

/**
* @brief The gain value for the end of the current voice iteration.
*/
float endGain;

/**
* @brief The gain value for the current voice iteration.
*/
float current_gain;

/**
* @brief The amount to increment each successive sample for, for the current voice iteration.
*/
float increment;

/**
* @brief Heartbeat interval for sending heartbeat keepalive
*/
Expand Down Expand Up @@ -503,6 +536,11 @@ class DPP_EXPORT discord_voice_client : public websocket_client
*/
snowflake server_id;

/**
* @brief Moving averager.
*/
moving_averager moving_average;

/**
* @brief Channel ID
*/
Expand Down
249 changes: 249 additions & 0 deletions include/dpp/isa_detection.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,249 @@
/************************************************************************************
*
* D++, A Lightweight C++ library for Discord
*
* Copyright 2021 Craig Edwards and D++ contributors
* (https://github.com/brainboxdotcc/DPP/graphs/contributors)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
************************************************************************************/
#pragma once

#include <immintrin.h>
#include <opus/opus.h>
#include <limits>

#ifdef max
#undef max
#endif

#ifdef min
#undef min
#endif

namespace dpp {

#ifdef T_AVX512

/**
 * @brief Collect sixteen elements from data_in, apply a linearly ramped gain, clamp to the
 * opus_int16 range and store the result in data_out.
 * This version uses AVX-512 instructions.
 *
 * Sample i (0..15) is multiplied by (current_gain + increment * i).
 *
 * @param data_in Pointer to sixteen opus_int32 input values. No alignment is required.
 * @param data_out Pointer to sixteen opus_int16 output values. No alignment is required.
 * @param current_gain The gain applied to the first element.
 * @param increment The amount the gain grows by for each successive element.
 */
inline void collect_sixteen_elements(opus_int32* data_in, opus_int16* data_out, float current_gain, float increment) {
	// Unaligned load: nothing guarantees that data_in sits on a 64-byte boundary.
	const __m512i original_values = _mm512_loadu_si512(data_in);
	const __m512 original_float_values = _mm512_cvtepi32_ps(original_values);

	// _mm512_setr_ps lists lanes low-to-high, so lane i really holds i.
	// (_mm512_set_ps takes them high-to-low, which would run the ramp backwards.)
	const __m512 lane_index = _mm512_setr_ps(0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f,
		8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f);
	const __m512 final_gain = _mm512_add_ps(_mm512_set1_ps(current_gain), _mm512_mul_ps(_mm512_set1_ps(increment), lane_index));
	const __m512 scaled_samples = _mm512_mul_ps(original_float_values, final_gain);

	// Clamp while still in float so the float -> int32 conversion below cannot overflow.
	const __m512 lower_bound = _mm512_set1_ps(static_cast<float>(std::numeric_limits<opus_int16>::min()));
	const __m512 upper_bound = _mm512_set1_ps(static_cast<float>(std::numeric_limits<opus_int16>::max()));
	const __m512 clamped_samples = _mm512_min_ps(_mm512_max_ps(scaled_samples, lower_bound), upper_bound);

	const __m512i rounded_samples = _mm512_cvtps_epi32(clamped_samples);

	// Saturating 32 -> 16 bit narrowing that keeps the element order.
	// _mm256_packs_epi32 packs inside each 128-bit lane and would interleave the samples.
	const __m256i packed_samples = _mm512_cvtsepi32_epi16(rounded_samples);

	_mm256_storeu_si256(reinterpret_cast<__m256i*>(data_out), packed_samples);
}

/**
 * @brief Add sixteen decoded opus_int16 samples onto the matching sixteen opus_int32
 * accumulator values, writing the sums back into up_sampled_vector.
 * This version uses AVX-512 instructions.
 *
 * @param up_sampled_vector Pointer to the array of opus_int32 values (read and written).
 * @param decoded_data Pointer to the array of opus_int16 values (read only).
 * @param x Index of the sixteen-element group to process; elements [x * 16, x * 16 + 15] are touched.
 */
inline void combine_samples(opus_int32* up_sampled_vector, const opus_int16* decoded_data, size_t x) {
	const size_t first_element = x * 16;
	__m512i* const mix_block = reinterpret_cast<__m512i*>(up_sampled_vector + first_element);

	// Sixteen 16-bit samples, sign-extended to sixteen 32-bit lanes.
	const __m256i narrow_samples = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(decoded_data + first_element));
	const __m512i wide_samples = _mm512_cvtepi16_epi32(narrow_samples);

	const __m512i mixed = _mm512_add_epi32(_mm512_loadu_si512(mix_block), wide_samples);
	_mm512_storeu_si512(mix_block, mixed);
}

#elif T_AVX2

/**
 * @brief Collect sixteen elements from data_in, apply a linearly ramped gain, clamp to the
 * opus_int16 range and store the result in data_out.
 * This version uses AVX2 instructions and works on two groups of eight elements.
 *
 * Sample i (0..15) is multiplied by (current_gain + increment * i).
 *
 * @param data_in Pointer to sixteen opus_int32 input values. No alignment is required.
 * @param data_out Pointer to sixteen opus_int16 output values. No alignment is required.
 * @param current_gain The gain applied to the first element.
 * @param increment The amount the gain grows by for each successive element.
 */
inline void collect_sixteen_elements(opus_int32* data_in, opus_int16* data_out, float current_gain, float increment) {
	const __m256 lower_bound = _mm256_set1_ps(static_cast<float>(std::numeric_limits<opus_int16>::min()));
	const __m256 upper_bound = _mm256_set1_ps(static_cast<float>(std::numeric_limits<opus_int16>::max()));
	// _mm256_setr_ps lists lanes low-to-high, so lane i holds i.
	const __m256 lane_index = _mm256_setr_ps(0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f);

	for (size_t x = 0; x < 2; ++x) {
		const size_t first_element = x * 8;

		// Unaligned load: nothing guarantees that data_in sits on a 32-byte boundary.
		const __m256i original_values = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(data_in + first_element));
		const __m256 original_float_values = _mm256_cvtepi32_ps(original_values);

		// Position of each lane within the sixteen-sample block: first_element + lane.
		const __m256 sample_position = _mm256_add_ps(lane_index, _mm256_set1_ps(static_cast<float>(first_element)));
		const __m256 final_gain = _mm256_add_ps(_mm256_set1_ps(current_gain), _mm256_mul_ps(_mm256_set1_ps(increment), sample_position));
		const __m256 scaled_samples = _mm256_mul_ps(original_float_values, final_gain);

		// Clamp while still in float so the float -> int32 conversion cannot overflow.
		const __m256 clamped_samples = _mm256_min_ps(_mm256_max_ps(scaled_samples, lower_bound), upper_bound);
		const __m256i rounded_samples = _mm256_cvtps_epi32(clamped_samples);

		// 128-bit pack of the low and high halves keeps the eight samples in order.
		const __m128i packed_samples = _mm_packs_epi32(_mm256_extractf128_si256(rounded_samples, 0),
			_mm256_extractf128_si256(rounded_samples, 1));

		_mm_storeu_si128(reinterpret_cast<__m128i*>(data_out + first_element), packed_samples);
	}
}

/**
 * @brief Add sixteen decoded opus_int16 samples onto the matching sixteen opus_int32
 * accumulator values, writing the sums back into up_sampled_vector.
 * This version uses AVX2 instructions and works on two groups of eight elements.
 *
 * @param up_sampled_vector Pointer to the array of opus_int32 values (read and written).
 * @param decoded_data Pointer to the array of opus_int16 values (read only).
 * @param x Index of the sixteen-element group to process; elements [x * 16, x * 16 + 15] are touched.
 */
inline void combine_samples(opus_int32* up_sampled_vector, const opus_int16* decoded_data, size_t x) {
	for (size_t y = 0; y < 2; ++y) {
		// Group x starts at x * 16; each pass covers eight elements of it.
		// (x * 8 * y would hit offset 0 on the first pass regardless of x.)
		const size_t first_element = (x * 16) + (y * 8);

		__m256i* up_sampled_ptr = reinterpret_cast<__m256i*>(up_sampled_vector + first_element);
		const __m128i* decoded_ptr = reinterpret_cast<const __m128i*>(decoded_data + first_element);

		const __m256i current_up_sampled = _mm256_loadu_si256(up_sampled_ptr);
		const __m128i current_decoded128 = _mm_loadu_si128(decoded_ptr);

		// Sign-extend eight 16-bit samples to 32 bits and accumulate.
		const __m256i up_sampled_added = _mm256_add_epi32(current_up_sampled, _mm256_cvtepi16_epi32(current_decoded128));

		_mm256_storeu_si256(up_sampled_ptr, up_sampled_added);
	}
}

#elif T_AVX

/**
 * @brief Collect sixteen elements from data_in, apply a linearly ramped gain, clamp to the
 * opus_int16 range and store the result in data_out.
 * This version uses 128-bit vector instructions and works on four groups of four elements.
 *
 * Sample i (0..15) is multiplied by (current_gain + increment * i).
 *
 * @param data_in Pointer to sixteen opus_int32 input values. No alignment is required.
 * @param data_out Pointer to sixteen opus_int16 output values. No alignment is required.
 * @param current_gain The gain applied to the first element.
 * @param increment The amount the gain grows by for each successive element.
 */
inline void collect_sixteen_elements(opus_int32* data_in, opus_int16* data_out, float current_gain, float increment) {
	const __m128 current_gain128 = _mm_set1_ps(current_gain);
	const __m128 increment128 = _mm_set1_ps(increment);
	const __m128 lower_bound = _mm_set1_ps(static_cast<float>(std::numeric_limits<opus_int16>::min()));
	const __m128 upper_bound = _mm_set1_ps(static_cast<float>(std::numeric_limits<opus_int16>::max()));
	// _mm_setr_ps lists lanes low-to-high, so lane i holds i.
	const __m128 lane_index = _mm_setr_ps(0.0f, 1.0f, 2.0f, 3.0f);

	for (int x = 0; x < 4; ++x) {
		const int first_element = x * 4;

		// Unaligned load: nothing guarantees that data_in sits on a 16-byte boundary.
		const __m128i original_values = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data_in + first_element));
		const __m128 original_float_values = _mm_cvtepi32_ps(original_values);

		// Position of each lane within the sixteen-sample block: first_element + lane.
		const __m128 sample_position = _mm_add_ps(lane_index, _mm_set1_ps(static_cast<float>(first_element)));
		const __m128 final_gain = _mm_add_ps(current_gain128, _mm_mul_ps(increment128, sample_position));
		const __m128 scaled_samples = _mm_mul_ps(original_float_values, final_gain);

		// Clamp while still in float so the float -> int32 conversion cannot overflow.
		const __m128 clamped_samples = _mm_min_ps(_mm_max_ps(scaled_samples, lower_bound), upper_bound);
		const __m128i rounded_samples = _mm_cvtps_epi32(clamped_samples);

		// Signed saturating pack (the unsigned variant would zero every negative sample).
		// Both inputs are the same register, so the low 64 bits hold our four samples.
		const __m128i packed_samples = _mm_packs_epi32(rounded_samples, rounded_samples);

		// Only four 16-bit samples are produced per pass: store exactly 64 bits so the
		// last passes do not write beyond the sixteen-element output block.
		_mm_storel_epi64(reinterpret_cast<__m128i*>(data_out + first_element), packed_samples);
	}
}

/**
 * @brief Add sixteen decoded opus_int16 samples onto the matching sixteen opus_int32
 * accumulator values, writing the sums back into up_sampled_vector.
 * This version uses 128-bit vector instructions and works on four groups of four elements.
 *
 * @param up_sampled_vector Pointer to the array of opus_int32 values (read and written).
 * @param decoded_data Pointer to the array of opus_int16 values (read only).
 * @param x Index of the sixteen-element group to process; elements [x * 16, x * 16 + 15] are touched.
 */
inline void combine_samples(opus_int32* up_sampled_vector, const opus_int16* decoded_data, size_t x) {
	for (size_t y = 0; y < 4; ++y) {
		// Group x starts at x * 16; each pass covers four elements of it.
		// (x * 8 * y would hit offset 0 on the first pass regardless of x.)
		const size_t first_element = (x * 16) + (y * 4);

		__m128i* up_sampled_ptr = reinterpret_cast<__m128i*>(up_sampled_vector + first_element);
		const __m128i* decoded_ptr = reinterpret_cast<const __m128i*>(decoded_data + first_element);

		const __m128i current_up_sampled = _mm_loadu_si128(up_sampled_ptr);
		// Only four 16-bit samples (64 bits) are consumed, so load just those;
		// a full 128-bit load would read past the end of the sixteen-element group.
		const __m128i current_decoded64 = _mm_loadl_epi64(decoded_ptr);

		// Sign-extend the four low 16-bit samples to 32 bits and accumulate.
		const __m128i up_sampled_added = _mm_add_epi32(current_up_sampled, _mm_cvtepi16_epi32(current_decoded64));

		_mm_storeu_si128(up_sampled_ptr, up_sampled_added);
	}
}

#else

/**
 * @brief Collect sixteen elements from data_in, apply a linearly ramped gain, clamp to the
 * opus_int16 range and store the result in data_out.
 * This version uses scalar operations (no SIMD instructions).
 *
 * Sample i (0..15) is multiplied by (current_gainFloat + incrementFloat * i), which is the
 * same formula the SIMD variants of this function use.
 *
 * @param data_in Pointer to sixteen opus_int32 input values.
 * @param data_out Pointer to sixteen opus_int16 output values.
 * @param current_gainFloat The gain applied to the first element.
 * @param incrementFloat The amount the gain grows by for each successive element.
 */
inline void collect_sixteen_elements(opus_int32* data_in, opus_int16* data_out, float current_gainFloat, float incrementFloat) {
	const float lower_bound = static_cast<float>(std::numeric_limits<opus_int16>::min());
	const float upper_bound = static_cast<float>(std::numeric_limits<opus_int16>::max());

	for (int i = 0; i < 16; ++i) {
		// The increment ramps the gain; it is not an offset added to the sample itself.
		const float sample_gain = current_gainFloat + incrementFloat * static_cast<float>(i);
		float current_sample_float = static_cast<float>(data_in[i]) * sample_gain;

		if (current_sample_float > upper_bound) {
			current_sample_float = upper_bound;
		}
		else if (current_sample_float < lower_bound) {
			current_sample_float = lower_bound;
		}

		data_out[i] = static_cast<opus_int16>(current_sample_float);
	}
}

/**
 * @brief Add sixteen decoded opus_int16 samples onto the matching sixteen opus_int32
 * accumulator values, writing the sums back into up_sampled_vector.
 * This version uses scalar operations (no SIMD instructions).
 *
 * @param up_sampled_vector Pointer to the array of opus_int32 values (read and written).
 * @param decoded_data Pointer to the array of opus_int16 values (read only).
 * @param x Index of the sixteen-element group to process; elements [x * 16, x * 16 + 15] are touched.
 */
inline void combine_samples(opus_int32* up_sampled_vector, const opus_int16* decoded_data, size_t x) {
	opus_int32* mix_cursor = up_sampled_vector + (x * 16);
	const opus_int16* voice_cursor = decoded_data + (x * 16);
	const opus_int16* const voice_end = voice_cursor + 16;

	// Walk both buffers in lock-step across the sixteen-element group.
	while (voice_cursor != voice_end) {
		*mix_cursor += static_cast<int32_t>(*voice_cursor);
		++mix_cursor;
		++voice_cursor;
	}
}

#endif
}
6 changes: 6 additions & 0 deletions library-vcpkg/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,18 +11,24 @@ endif()

add_library("${PROJECT_NAME}::${LIB_NAME}" ALIAS "${LIB_NAME}")

# Auto-detect the instruction set only when the user has not chosen one by
# defining AVX_TYPE themselves.
# NOTE(review): when AVX_TYPE is supplied by hand the detection module is skipped,
# so AVX_FLAG is only populated if the caller also provides it - confirm this is intended.
if(NOT DEFINED AVX_TYPE)
include("${CMAKE_CURRENT_SOURCE_DIR}/../cmake/DetectArchitecture.cmake")
endif()

# AVX_TYPE becomes a preprocessor definition; PUBLIC propagates it to consumers of the library as well.
target_compile_definitions(
"${LIB_NAME}" PUBLIC
"DPP_BUILD"
"$<$<PLATFORM_ID:Windows>:$<$<CONFIG:Debug>:/sdl;/std:c++17;/Od;/DEBUG;/sdl;/MP;/DFD_SETSIZE=1024;/Zc:preprocessor>>"
"$<$<PLATFORM_ID:Windows>:$<$<CONFIG:Release>:/std:c++17;/O2;/Oi;/Oy;/GL;/Gy;/sdl;/MP;/DFD_SETSIZE=1024;/Zc:preprocessor>>"
"${AVX_TYPE}"
)

# AVX_FLAG carries the compiler switch(es) enabling the selected instruction set;
# PUBLIC propagates them to consumers of the library as well.
target_compile_options(
"${LIB_NAME}" PUBLIC
"$<$<PLATFORM_ID:Windows>:/bigobj>"
"$<$<PLATFORM_ID:Linux>:$<$<CONFIG:Debug>:-std=c++17;-Wall;-Wempty-body;-Wno-psabi;-Wunknown-pragmas;-Wignored-qualifiers;-Wimplicit-fallthrough;-Wmissing-field-initializers;-Wsign-compare;-Wtype-limits;-Wuninitialized;-Wshift-negative-value;-pthread;-g;-Og;-fPIC>>"
"$<$<PLATFORM_ID:Linux>:$<$<CONFIG:Release>:-std=c++17;-Wall;-Wempty-body;-Wno-psabi;-Wunknown-pragmas;-Wignored-qualifiers;-Wimplicit-fallthrough;-Wmissing-field-initializers;-Wsign-compare;-Wtype-limits;-Wuninitialized;-Wshift-negative-value;-pthread;-O3;-fPIC>>"
"${AVX_FLAG}"
)

target_compile_features(
Expand Down
Loading

0 comments on commit 08284e4

Please sign in to comment.