Adding AVX implementation for mixing audio.

Should significantly increase efficiency on supported platforms. To choose an implementation explicitly, set the CMake variable AVX_TYPE to one of T_AVX512, T_AVX2, T_AVX, or T_AVX_FALLBACK; each value selects the respective architecture.
brainboxdotcc · Aug 1, 2023 · 08284e4 · 08284e4
1 parent b68a006
commit 08284e4
Show file tree

Hide file tree

Showing 8 changed files with 370 additions and 8 deletions.
diff --git a/cmake/DetectArchitecture.cmake b/cmake/DetectArchitecture.cmake
@@ -0,0 +1,56 @@
+include(CheckCXXSourceRuns)
+
+# Probe whether one instruction set can be compiled and executed on the build host.
+#
+# Arguments:
+#   INSTRUCTION_SET_NAME      - name of the result variable for the check (for example T_AVX2).
+#   INSTRUCTION_SET_FLAG      - ;-list of compiler flags that enable the instruction set.
+#   INSTRUCTION_SET_INTRINSIC - ;-list of C++ statements placed in main() of the probe program.
+#
+# On success AVX_TYPE, AVX_FLAG and AVX_NAME are set in the caller's scope.
+# On failure they are left untouched, so the result of an earlier successful call remains in effect.
+function(check_instruction_set INSTRUCTION_SET_NAME INSTRUCTION_SET_FLAG INSTRUCTION_SET_INTRINSIC)
+
+    set(INSTRUCTION_SET_CODE "
+        #include <immintrin.h>
+        #include <stdint.h>
+        int main()
+        {
+            ${INSTRUCTION_SET_INTRINSIC};
+            return 0;
+        }
+    ")
+
+    # CMAKE_REQUIRED_FLAGS is documented as a space-delimited string, while the flags arrive
+    # here as a ;-list. Passing the list unchanged would split it into separate arguments
+    # inside the check module, so join it with spaces first. This assignment is local to the
+    # function scope and does not leak to the caller.
+    string(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${INSTRUCTION_SET_FLAG}")
+    check_cxx_source_runs("${INSTRUCTION_SET_CODE}" "${INSTRUCTION_SET_NAME}")
+    if(${INSTRUCTION_SET_NAME})
+        set(AVX_TYPE "${INSTRUCTION_SET_NAME}" PARENT_SCOPE)
+        # AVX_FLAG stays a ;-list: it is consumed as a list of compile options.
+        set(AVX_FLAG "${INSTRUCTION_SET_FLAG}" PARENT_SCOPE)
+        set(AVX_NAME "${INSTRUCTION_SET_NAME}" PARENT_SCOPE)
+    else()
+        message(STATUS "Instruction set ${INSTRUCTION_SET_NAME} not supported. Falling back to the previous instruction set.")
+        return()
+    endif()
+endfunction()
+
+# Table of instruction sets to probe, ordered from least to most capable so that the last
+# successful probe wins. Each entry is encoded as "<name>?<flags>?<statements>", where
+# "?" separates the three fields and "." separates the items inside the flag and statement
+# fields (both are converted to ;-lists below).
+#
+# The AVX-512 probe passes __m512 operands to _mm512_add_ps: that intrinsic takes float
+# vectors, and handing it __m512i values makes the probe fail to compile, which would
+# prevent AVX-512 from ever being detected.
+if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
+    set(INSTRUCTION_SETS
+        "T_AVX?/arch:AVX?auto result = _mm_testz_ps(__m128{}, __m128{})"
+        "T_AVX2?/arch:AVX2?auto result = _mm256_extract_epi64(__m256i{}, 0)"
+        "T_AVX512?/arch:AVX512?auto result = _mm512_add_ps(__m512{}, __m512{}).auto result2 = _mm512_cmplt_epu8_mask(__m512i{}, __m512i{})"
+    )
+else()
+    set(INSTRUCTION_SETS
+        "T_AVX?-mavx.-mpclmul.-mbmi?auto result = _mm_testz_ps(__m128{}, __m128{})"
+        "T_AVX2?-mavx2.-mavx.-mpclmul.-mbmi?auto result = _mm256_extract_epi64(__m256i{}, 0)"
+        "T_AVX512?-mavx512bw.-mavx512f.-mavx2.-mavx.-mpclmul.-mbmi?auto result = _mm512_add_ps(__m512{}, __m512{}).auto result2 = _mm512_cmplt_epu8_mask(__m512i{}, __m512i{})"
+    )
+endif()
+
+# Remember the caller's required flags so they can be restored once probing is done.
+set(CMAKE_REQUIRED_FLAGS_SAVE "${CMAKE_REQUIRED_FLAGS}")
+
+# Name reported when none of the probes below succeeds.
+set(AVX_NAME "T_Fallback")
+
+foreach(INSTRUCTION_SET IN LISTS INSTRUCTION_SETS)
+    # Split the entry into name / flags / statements.
+    string(REPLACE "?" ";" CURRENT_LIST "${INSTRUCTION_SET}")
+    list(GET CURRENT_LIST 0 INSTRUCTION_SET_NAME)
+    list(GET CURRENT_LIST 1 INSTRUCTION_SET_FLAG)
+    string(REPLACE "." ";" INSTRUCTION_SET_FLAG "${INSTRUCTION_SET_FLAG}")
+    list(GET CURRENT_LIST 2 INSTRUCTION_SET_INTRINSIC)
+    string(REPLACE "." ";" INSTRUCTION_SET_INTRINSIC "${INSTRUCTION_SET_INTRINSIC}")
+    check_instruction_set("${INSTRUCTION_SET_NAME}" "${INSTRUCTION_SET_FLAG}" "${INSTRUCTION_SET_INTRINSIC}")
+endforeach()
+
+message(STATUS "Detected CPU Architecture: ${AVX_NAME}")
+set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS_SAVE}")
diff --git a/include/dpp/discordvoiceclient.h b/include/dpp/discordvoiceclient.h
@@ -37,6 +37,7 @@
 #include <dpp/dispatcher.h>
 #include <dpp/cluster.h>
 #include <dpp/discordevents.h>
+#include <dpp/isa_detection.hpp>
 #include <dpp/socket.h>
 #include <queue>
 #include <thread>
@@ -58,6 +59,23 @@ namespace dpp {
 
 using json = nlohmann::json;
 
+/**
+* @brief Holds a moving average of the number of current voice users, used for applying a smooth gain ramp.
+*/
+struct DPP_EXPORT moving_averager {
+	moving_averager() = default; // Both members start value-initialised via their `{}` initialisers below.
+
+	moving_averager(uint64_t collectionCountNew); // Defined elsewhere; presumably stores the window size in collectionCount - confirm.
+
+	moving_averager operator+=(int64_t value); // Defined elsewhere. NOTE(review): returns a copy, not a reference.
+
+	operator float(); // Implicit, non-const conversion; presumably yields the current average - confirm against the definition.
+
+protected:
+	std::deque<int64_t> values{}; // Stored samples. NOTE(review): <deque> is not visibly included in this hunk - confirm.
+	uint64_t collectionCount{}; // Presumably the maximum number of samples kept in `values` - confirm.
+};
+
 // Forward declaration
 class cluster;
 
@@ -473,6 +491,21 @@ class DPP_EXPORT discord_voice_client : public websocket_client
 	 */
 	bool terminating;
 
+	/**
+	 * @brief The gain value for the end of the current voice iteration.
+	 */
+	float endGain;
+
+	/**
+	 * @brief The gain value for the current voice iteration.
+	 */
+	float current_gain;
+
+	/**
+	 * @brief The amount to increment each successive sample for, for the current voice iteration.
+	 */
+	float increment;
+
 	/**
 	 * @brief Heartbeat interval for sending heartbeat keepalive
 	 */
@@ -503,6 +536,11 @@ class DPP_EXPORT discord_voice_client : public websocket_client
 	 */
 	snowflake server_id;
 
+	/**
+	 * @brief Moving averager.
+	 */
+	moving_averager moving_average;
+
 	/**
 	 * @brief Channel ID
 	 */

diff --git a/include/dpp/isa_detection.hpp b/include/dpp/isa_detection.hpp
@@ -0,0 +1,249 @@
+/************************************************************************************
+ *
+ * D++, A Lightweight C++ library for Discord
+ *
+ * Copyright 2021 Craig Edwards and D++ contributors
+ * (https://github.com/brainboxdotcc/DPP/graphs/contributors)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ ************************************************************************************/
+#pragma once
+
+#include <immintrin.h>
+#include <opus/opus.h>
+#include <limits>
+
+#ifdef max
+	#undef max
+#endif
+
+#ifdef min
+	#undef min
+#endif
+
+namespace dpp {
+
+#ifdef T_AVX512
+
+	/**
+	 * @brief Scale sixteen 32-bit samples by a linearly ramping gain, clamp them to the
+	 * opus_int16 range and store them as sixteen 16-bit samples, in input order.
+	 * This version uses AVX-512 instructions.
+	 *
+	 * Sample i (0..15) is multiplied by (current_gain + increment * i).
+	 *
+	 * @param data_in Pointer to sixteen input opus_int32 values. No particular alignment is required.
+	 * @param data_out Pointer to storage for sixteen output opus_int16 values.
+	 * @param current_gain The gain applied to the first sample.
+	 * @param increment The amount by which the gain grows for each successive sample.
+	 */
+	inline void collect_sixteen_elements(opus_int32* data_in, opus_int16* data_out, float current_gain, float increment) {
+		/* Unaligned load: nothing guarantees that data_in is 64-byte aligned. */
+		const __m512i input_samples = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(data_in));
+
+		/* _mm512_set_ps lists lanes from highest to lowest, so lane 0 receives index 0. */
+		const __m512 lane_index = _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f,
+			7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f);
+		const __m512 gain = _mm512_add_ps(_mm512_set1_ps(current_gain),
+			_mm512_mul_ps(_mm512_set1_ps(increment), lane_index));
+
+		const __m512 scaled = _mm512_mul_ps(_mm512_cvtepi32_ps(input_samples), gain);
+
+		/* Clamp while still in floating point so the conversion to int32 cannot overflow. */
+		const __m512 lower_bound = _mm512_set1_ps(static_cast<float>(std::numeric_limits<opus_int16>::min()));
+		const __m512 upper_bound = _mm512_set1_ps(static_cast<float>(std::numeric_limits<opus_int16>::max()));
+		const __m512 clamped = _mm512_min_ps(_mm512_max_ps(scaled, lower_bound), upper_bound);
+
+		/*
+		 * Narrow with signed saturation in one step. A 256-bit _mm256_packs_epi32 works on each
+		 * 128-bit lane separately and would emit the samples as 0-3, 8-11, 4-7, 12-15.
+		 */
+		const __m256i packed = _mm512_cvtsepi32_epi16(_mm512_cvtps_epi32(clamped));
+
+		_mm256_storeu_si256(reinterpret_cast<__m256i*>(data_out), packed);
+	}
+
+	/**
+	 * @brief Add one group of sixteen decoded 16-bit samples onto the matching sixteen 32-bit
+	 *        accumulators, writing the sums back in place.
+	 *        This version uses AVX-512 instructions.
+	 *
+	 * @param up_sampled_vector Pointer to the array of opus_int32 accumulators (read and written).
+	 * @param decoded_data Pointer to the array of opus_int16 samples to add.
+	 * @param x Group index; elements [x * 16, x * 16 + 16) of both arrays are processed.
+	 */
+	inline void combine_samples(opus_int32* up_sampled_vector, const opus_int16* decoded_data, size_t x) {
+		const size_t first_sample = x * 16;
+		__m512i* const accumulator_slot = reinterpret_cast<__m512i*>(up_sampled_vector + first_sample);
+		const __m256i* const decoded_slot = reinterpret_cast<const __m256i*>(decoded_data + first_sample);
+
+		const __m512i accumulated = _mm512_loadu_si512(accumulator_slot);
+		/* Sign-extend the sixteen 16-bit samples to 32 bits before adding. */
+		const __m512i widened = _mm512_cvtepi16_epi32(_mm256_loadu_si256(decoded_slot));
+
+		_mm512_storeu_si512(accumulator_slot, _mm512_add_epi32(accumulated, widened));
+	}
+
+#elif T_AVX2
+
+	/**
+	 * @brief Scale sixteen 32-bit samples by a linearly ramping gain, clamp them to the
+	 *        opus_int16 range and store them as sixteen 16-bit samples, in input order.
+	 *        This version uses AVX2 instructions and handles the samples as two groups of eight.
+	 *
+	 * Sample i (0..15) is multiplied by (current_gain + increment * i).
+	 *
+	 * @param data_in Pointer to sixteen input opus_int32 values. No particular alignment is required.
+	 * @param data_out Pointer to storage for sixteen output opus_int16 values.
+	 * @param current_gain The gain applied to the first sample.
+	 * @param increment The amount by which the gain grows for each successive sample.
+	 */
+	inline void collect_sixteen_elements(opus_int32* data_in, opus_int16* data_out, float current_gain, float increment) {
+		const __m256 gain_base = _mm256_set1_ps(current_gain);
+		const __m256 gain_step = _mm256_set1_ps(increment);
+		const __m256 lower_bound = _mm256_set1_ps(static_cast<float>(std::numeric_limits<opus_int16>::min()));
+		const __m256 upper_bound = _mm256_set1_ps(static_cast<float>(std::numeric_limits<opus_int16>::max()));
+
+		for (size_t x = 0; x < 2; ++x) {
+			/* Unaligned load: nothing guarantees that data_in is 32-byte aligned. */
+			const __m256i input_samples = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(data_in) + x);
+
+			/*
+			 * Index of each sample within the sixteen: the group offset (0 or 8) plus the lane.
+			 * _mm256_set_ps lists lanes from highest to lowest, so lane 0 receives `base`.
+			 */
+			const float base = static_cast<float>(x * 8);
+			const __m256 lane_index = _mm256_set_ps(base + 7.0f, base + 6.0f, base + 5.0f, base + 4.0f,
+				base + 3.0f, base + 2.0f, base + 1.0f, base);
+			const __m256 gain = _mm256_add_ps(gain_base, _mm256_mul_ps(gain_step, lane_index));
+
+			const __m256 scaled = _mm256_mul_ps(_mm256_cvtepi32_ps(input_samples), gain);
+
+			/* Clamp while still in floating point so the conversion to int32 cannot overflow. */
+			const __m256 clamped = _mm256_min_ps(_mm256_max_ps(scaled, lower_bound), upper_bound);
+			const __m256i as_int32 = _mm256_cvtps_epi32(clamped);
+
+			/* Packing the two 128-bit halves with the 128-bit instruction keeps the samples in order. */
+			const __m128i packed = _mm_packs_epi32(_mm256_extractf128_si256(as_int32, 0),
+				_mm256_extractf128_si256(as_int32, 1));
+
+			_mm_storeu_si128(reinterpret_cast<__m128i*>(data_out) + x, packed);
+		}
+	}
+
+	/**
+	 * @brief Add one group of sixteen decoded 16-bit samples onto the matching sixteen 32-bit
+	 *        accumulators, writing the sums back in place.
+	 *        This version uses AVX2 instructions and handles the group as two halves of eight.
+	 *
+	 * @param up_sampled_vector Pointer to the array of opus_int32 accumulators (read and written).
+	 * @param decoded_data Pointer to the array of opus_int16 samples to add.
+	 * @param x Group index; elements [x * 16, x * 16 + 16) of both arrays are processed.
+	 */
+	inline void combine_samples(opus_int32* up_sampled_vector, const opus_int16* decoded_data, size_t x) {
+		for (size_t y = 0; y < 2; ++y) {
+			/* Start of this half: sixteen elements per group plus eight per half. */
+			const size_t offset = (x * 16) + (y * 8);
+			__m256i* up_sampled_ptr = reinterpret_cast<__m256i*>(up_sampled_vector + offset);
+			const __m128i* decoded_ptr = reinterpret_cast<const __m128i*>(decoded_data + offset);
+
+			const __m256i current_up_sampled = _mm256_loadu_si256(up_sampled_ptr);
+			/* Sign-extend the eight 16-bit samples to 32 bits before adding. */
+			const __m256i widened = _mm256_cvtepi16_epi32(_mm_loadu_si128(decoded_ptr));
+
+			_mm256_storeu_si256(up_sampled_ptr, _mm256_add_epi32(current_up_sampled, widened));
+		}
+	}
+
+#elif T_AVX
+
+	/**
+	 * @brief Scale sixteen 32-bit samples by a linearly ramping gain, clamp them to the
+	 *        opus_int16 range and store them as sixteen 16-bit samples, in input order.
+	 *        This version uses 128-bit SSE/AVX instructions and handles four samples at a time.
+	 *
+	 * Sample i (0..15) is multiplied by (current_gain + increment * i).
+	 *
+	 * @param data_in Pointer to sixteen input opus_int32 values. No particular alignment is required.
+	 * @param data_out Pointer to storage for sixteen output opus_int16 values.
+	 * @param current_gain The gain applied to the first sample.
+	 * @param increment The amount by which the gain grows for each successive sample.
+	 */
+	inline void collect_sixteen_elements(opus_int32* data_in, opus_int16* data_out, float current_gain, float increment) {
+		const __m128 gain_base = _mm_set1_ps(current_gain);
+		const __m128 gain_step = _mm_set1_ps(increment);
+		const __m128 lower_bound = _mm_set1_ps(static_cast<float>(std::numeric_limits<opus_int16>::min()));
+		const __m128 upper_bound = _mm_set1_ps(static_cast<float>(std::numeric_limits<opus_int16>::max()));
+
+		/* Scale and clamp the four samples starting at index `first`; the result is four int32 values. */
+		const auto scale_four = [&](size_t first) -> __m128i {
+			/* _mm_set_ps lists lanes from highest to lowest, so lane 0 receives `base`. */
+			const float base = static_cast<float>(first);
+			const __m128 lane_index = _mm_set_ps(base + 3.0f, base + 2.0f, base + 1.0f, base);
+			const __m128 gain = _mm_add_ps(gain_base, _mm_mul_ps(gain_step, lane_index));
+
+			/* Unaligned load: nothing guarantees that data_in is 16-byte aligned. */
+			const __m128 samples = _mm_cvtepi32_ps(_mm_loadu_si128(reinterpret_cast<const __m128i*>(data_in + first)));
+
+			/* Clamp while still in floating point so the conversion to int32 cannot overflow. */
+			const __m128 clamped = _mm_min_ps(_mm_max_ps(_mm_mul_ps(samples, gain), lower_bound), upper_bound);
+			return _mm_cvtps_epi32(clamped);
+		};
+
+		/*
+		 * Each 128-bit store holds eight 16-bit samples, so exactly two stores fill the sixteen
+		 * outputs. The pack uses signed saturation so that negative samples keep their value.
+		 */
+		for (size_t x = 0; x < 2; ++x) {
+			const size_t first = x * 8;
+			const __m128i packed = _mm_packs_epi32(scale_four(first), scale_four(first + 4));
+
+			_mm_storeu_si128(reinterpret_cast<__m128i*>(data_out) + x, packed);
+		}
+	}
+
+	/**
+	 * @brief Add one group of sixteen decoded 16-bit samples onto the matching sixteen 32-bit
+	 *        accumulators, writing the sums back in place.
+	 *        This version uses 128-bit SSE/AVX instructions and handles the group as four quarters of four.
+	 *
+	 * @param up_sampled_vector Pointer to the array of opus_int32 accumulators (read and written).
+	 * @param decoded_data Pointer to the array of opus_int16 samples to add.
+	 * @param x Group index; elements [x * 16, x * 16 + 16) of both arrays are processed.
+	 */
+	inline void combine_samples(opus_int32* up_sampled_vector, const opus_int16* decoded_data, size_t x) {
+		for (size_t y = 0; y < 4; ++y) {
+			/* Start of this quarter: sixteen elements per group plus four per quarter. */
+			const size_t offset = (x * 16) + (y * 4);
+			__m128i* up_sampled_ptr = reinterpret_cast<__m128i*>(up_sampled_vector + offset);
+
+			const __m128i current_up_sampled = _mm_loadu_si128(up_sampled_ptr);
+			/* Only four 16-bit samples are consumed, so load just those 64 bits rather than 128. */
+			const __m128i current_decoded = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(decoded_data + offset));
+
+			/* Sign-extend the four 16-bit samples to 32 bits before adding. */
+			const __m128i up_sampled_added = _mm_add_epi32(current_up_sampled, _mm_cvtepi16_epi32(current_decoded));
+
+			_mm_storeu_si128(up_sampled_ptr, up_sampled_added);
+		}
+	}
+
+#else
+
+	/**
+	 * @brief Scale sixteen 32-bit samples by a linearly ramping gain, clamp them to the
+	 *        opus_int16 range and store them as sixteen 16-bit samples.
+	 *        This version uses scalar operations (no SIMD instructions).
+	 *
+	 * Sample i (0..15) is multiplied by (current_gainFloat + incrementFloat * i), matching the
+	 * gain ramp computed by the SIMD variants of this function.
+	 *
+	 * @param data_in Pointer to sixteen input opus_int32 values.
+	 * @param data_out Pointer to storage for sixteen output opus_int16 values.
+	 * @param current_gainFloat The gain applied to the first sample.
+	 * @param incrementFloat The amount by which the gain grows for each successive sample.
+	 */
+	inline void collect_sixteen_elements(opus_int32* data_in, opus_int16* data_out, float current_gainFloat, float incrementFloat) {
+		const float upper_bound = static_cast<float>(std::numeric_limits<opus_int16>::max());
+		const float lower_bound = static_cast<float>(std::numeric_limits<opus_int16>::min());
+
+		for (int i = 0; i < 16; ++i) {
+			/* The increment ramps the gain; it must not be added to the sample value itself. */
+			const float gain = current_gainFloat + incrementFloat * static_cast<float>(i);
+			float current_sample_float = static_cast<float>(data_in[i]) * gain;
+
+			if (current_sample_float > upper_bound) {
+				current_sample_float = upper_bound;
+			}
+			else if (current_sample_float < lower_bound) {
+				current_sample_float = lower_bound;
+			}
+
+			data_out[i] = static_cast<opus_int16>(current_sample_float);
+		}
+	}
+
+	/**
+	 * @brief Add one group of sixteen decoded 16-bit samples onto the matching sixteen 32-bit
+	 *        accumulators, writing the sums back in place.
+	 *        This version uses scalar operations (no SIMD instructions).
+	 *
+	 * @param up_sampled_vector Pointer to the array of opus_int32 accumulators (read and written).
+	 * @param decoded_data Pointer to the array of opus_int16 samples to add.
+	 * @param x Group index; elements [x * 16, x * 16 + 16) of both arrays are processed.
+	 */
+	inline void combine_samples(opus_int32* up_sampled_vector, const opus_int16* decoded_data, size_t x) {
+		const size_t group_start = x * 16;
+		opus_int32* const accumulators = up_sampled_vector + group_start;
+		const opus_int16* const samples = decoded_data + group_start;
+
+		for (size_t lane = 0; lane != 16; ++lane) {
+			accumulators[lane] += static_cast<int32_t>(samples[lane]);
+		}
+	}
+
+#endif
+}
diff --git a/library-vcpkg/CMakeLists.txt b/library-vcpkg/CMakeLists.txt
@@ -11,18 +11,24 @@ endif()
 
 add_library("${PROJECT_NAME}::${LIB_NAME}" ALIAS "${LIB_NAME}")
 
+if(NOT DEFINED AVX_TYPE) # Run detection only when AVX_TYPE was not supplied by the user.
+	include("${CMAKE_CURRENT_SOURCE_DIR}/../cmake/DetectArchitecture.cmake") # NOTE(review): AVX_FLAG is only set by this script, so it stays unset when AVX_TYPE is supplied by hand - confirm this is intended.
+endif()
+
 target_compile_definitions(
 	"${LIB_NAME}" PUBLIC
 	"DPP_BUILD"
 	"$<$<PLATFORM_ID:Windows>:$<$<CONFIG:Debug>:/sdl;/std:c++17;/Od;/DEBUG;/sdl;/MP;/DFD_SETSIZE=1024;/Zc:preprocessor>>"
 	"$<$<PLATFORM_ID:Windows>:$<$<CONFIG:Release>:/std:c++17;/O2;/Oi;/Oy;/GL;/Gy;/sdl;/MP;/DFD_SETSIZE=1024;/Zc:preprocessor>>"
+	"${AVX_TYPE}" # Defines the selected instruction-set macro (e.g. T_AVX2), which picks the matching #ifdef branch in isa_detection.hpp; expands to an empty string when AVX_TYPE is not set.
 )
 
 target_compile_options(
 	"${LIB_NAME}" PUBLIC
 	"$<$<PLATFORM_ID:Windows>:/bigobj>"
 	"$<$<PLATFORM_ID:Linux>:$<$<CONFIG:Debug>:-std=c++17;-Wall;-Wempty-body;-Wno-psabi;-Wunknown-pragmas;-Wignored-qualifiers;-Wimplicit-fallthrough;-Wmissing-field-initializers;-Wsign-compare;-Wtype-limits;-Wuninitialized;-Wshift-negative-value;-pthread;-g;-Og;-fPIC>>"
 	"$<$<PLATFORM_ID:Linux>:$<$<CONFIG:Release>:-std=c++17;-Wall;-Wempty-body;-Wno-psabi;-Wunknown-pragmas;-Wignored-qualifiers;-Wimplicit-fallthrough;-Wmissing-field-initializers;-Wsign-compare;-Wtype-limits;-Wuninitialized;-Wshift-negative-value;-pthread;-O3;-fPIC>>"
+	"${AVX_FLAG}" # ;-list of flags enabling the detected instruction set; PUBLIC, so targets linking this library inherit them too.
 )
 
 target_compile_features(