Skip to content

Commit

Permalink
refactor: Improve AVX, make detection a little nicer (#865)
Browse files Browse the repository at this point in the history
  • Loading branch information
RealTimeChris authored Sep 19, 2023
2 parents c79da97 + cb2c445 commit 2e75d3d
Show file tree
Hide file tree
Showing 11 changed files with 583 additions and 406 deletions.
14 changes: 7 additions & 7 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ jobs:
run: sudo sed -i 's/azure\.//' /etc/apt/sources.list && sudo apt update && sudo apt install ${{ matrix.cfg.cpp-version }} libsodium-dev libopus-dev zlib1g-dev rpm

- name: Generate CMake
run: mkdir build && cd build && cmake -DDPP_NO_VCPKG=ON -DAVX_TYPE=T_fallback -DCMAKE_BUILD_TYPE=Release ..
run: mkdir build && cd build && cmake -DDPP_NO_VCPKG=ON -DAVX_TYPE=AVX0 -DCMAKE_BUILD_TYPE=Release ..
env:
CXX: ${{matrix.cfg.cpp-version}}

Expand Down Expand Up @@ -90,7 +90,7 @@ jobs:
run: sudo sed -i 's/azure\.//' /etc/apt/sources.list && sudo apt update && sudo apt install ${{ matrix.cfg.cpp-version }} libsodium-dev libopus-dev zlib1g-dev rpm

- name: Generate CMake
run: mkdir build && cd build && cmake -DDPP_NO_VCPKG=ON -DAVX_TYPE=T_fallback -DCMAKE_BUILD_TYPE=Release ${{matrix.cfg.cmake-flags}} ..
run: mkdir build && cd build && cmake -DDPP_NO_VCPKG=ON -DAVX_TYPE=AVX0 -DCMAKE_BUILD_TYPE=Release ${{matrix.cfg.cmake-flags}} ..
env:
CXX: ${{matrix.cfg.cpp-version}}

Expand Down Expand Up @@ -131,7 +131,7 @@ jobs:
run: brew install cmake make libsodium opus openssl

- name: Generate CMake
run: mkdir build && cd build && cmake -DDPP_NO_VCPKG=ON -DCMAKE_BUILD_TYPE=Release -DDPP_CORO=ON -DAVX_TYPE=T_fallback ..
run: mkdir build && cd build && cmake -DDPP_NO_VCPKG=ON -DCMAKE_BUILD_TYPE=Release -DDPP_CORO=ON -DAVX_TYPE=AVX0 ..
env:
DONT_RUN_VCPKG: true

Expand Down Expand Up @@ -175,13 +175,13 @@ jobs:

- name: Generate CMake (x64)
if: ${{ matrix.cfg.arch == 'x64' }}
run: mkdir main/build && cd main/build && cmake -G "Visual Studio ${{matrix.cfg.vsv}} ${{matrix.cfg.vs}}" -DDPP_NO_VCPKG=ON -DAVX_TYPE=T_fallback ..
run: mkdir main/build && cd main/build && cmake -G "Visual Studio ${{matrix.cfg.vsv}} ${{matrix.cfg.vs}}" -DDPP_NO_VCPKG=ON -DAVX_TYPE=AVX0 ..
env:
DONT_RUN_VCPKG: true

- name: Generate CMake (x86)
if: ${{ matrix.cfg.arch == 'x86' }}
run: mkdir main/build && cd main/build && cmake -DCMAKE_TOOLCHAIN_FILE="cmake\Win32Toolchain.cmake" -DDPP_NO_VCPKG=ON -DAVX_TYPE=T_fallback -G "Visual Studio ${{matrix.cfg.vsv}} ${{matrix.cfg.vs}}" -A Win32 -T host=x86 ..
run: mkdir main/build && cd main/build && cmake -DCMAKE_TOOLCHAIN_FILE="cmake\Win32Toolchain.cmake" -DDPP_NO_VCPKG=ON -DAVX_TYPE=AVX0 -G "Visual Studio ${{matrix.cfg.vsv}} ${{matrix.cfg.vs}}" -A Win32 -T host=x86 ..
env:
DONT_RUN_VCPKG: true

Expand Down Expand Up @@ -230,7 +230,7 @@ jobs:
run: sudo sed -i 's/azure\.//' /etc/apt/sources.list && sudo apt update && sudo apt install cmake rpm

- name: Generate CMakeFiles
run: mkdir build && cd build && sudo cmake ${{matrix.cfg.cmake-options}} -DDPP_NO_VCPKG=ON -DCMAKE_BUILD_TYPE=Release -DAVX_TYPE=T_fallback ..
run: mkdir build && cd build && sudo cmake ${{matrix.cfg.cmake-options}} -DDPP_NO_VCPKG=ON -DCMAKE_BUILD_TYPE=Release -DAVX_TYPE=AVX0 ..

- name: Compile Source
run: cd build && sudo make -j2
Expand Down Expand Up @@ -266,7 +266,7 @@ jobs:
# ls -lah
# mkdir build
# cd build
# cmake -DAVX_TYPE=T_fallback -DDPP_NO_VCPKG=ON -DCMAKE_BUILD_TYPE=Release ..
# cmake -DAVX_TYPE=AVX0 -DDPP_NO_VCPKG=ON -DCMAKE_BUILD_TYPE=Release ..
# make -j2
# make install
# cpack --verbose
Expand Down
30 changes: 17 additions & 13 deletions cmake/DetectArchitecture.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -17,29 +17,30 @@ function(check_instruction_set INSTRUCTION_SET_NAME INSTRUCTION_SET_FLAG INSTRUC
if(${INSTRUCTION_SET_NAME})
set(AVX_TYPE "${INSTRUCTION_SET_NAME}" PARENT_SCOPE)
set(AVX_FLAG "${INSTRUCTION_SET_FLAG}" PARENT_SCOPE)
set(AVX_NAME "${INSTRUCTION_SET_NAME}" PARENT_SCOPE)
else()
return()
endif()
endfunction()

if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
set(INSTRUCTION_SETS
"T_AVX?/arch:AVX?__m128i value{}#auto result = _mm_extract_epi32(value, 0)"
"T_AVX2?/arch:AVX2?__m256i value{}#auto result = _mm256_extract_epi32(value, 0)"
"T_AVX512?/arch:AVX512?int32_t result[16]#const _mm512i& value{}#_mm512_store_si512(result, value)"
"AVX1?/arch:AVX?__m128i value{}#auto result = _mm_extract_epi32(value, 0)"
"AVX2?/arch:AVX2?__m256i value{}#auto result = _mm256_extract_epi32(value, 0)"
"AVX512?/arch:AVX512?int32_t result[16]#const _mm512i& value{}#_mm512_store_si512(result, value)"
)
else()
set(INSTRUCTION_SETS
"T_AVX?-mavx?__m128i value{}#auto result = _mm_extract_epi32(value, 0)"
"T_AVX2?-mavx2?__m256i value{}#auto result = _mm256_extract_epi32(value, 0)"
"T_AVX512?-mavx512f?int32_t result[16]#const _mm512i& value{}#_mm512_store_si512(result, value)"
"AVX1?-mavx?__m128i value{}#auto result = _mm_extract_epi32(value, 0)"
"AVX2?-mavx2?__m256i value{}#auto result = _mm256_extract_epi32(value, 0)"
"AVX512?-mavx512f?int32_t result[16]#const _mm512i& value{}#_mm512_store_si512(result, value)"
)
endif()

set(CMAKE_REQUIRED_FLAGS_SAVE "${CMAKE_REQUIRED_FLAGS}")

set(AVX_NAME "T_fallback")
set(AVX_TYPE "AVX0")
set(AVX_TYPE "AVX0" PARENT_SCOPE)
set(AVX_FLAGS "" PARENT_SCOPE)

# This is only supported on x86/x64, it is completely skipped and forced to T_fallback anywhere else
if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "i386") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "AMD64"))
Expand All @@ -54,11 +55,14 @@ if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64") OR (${CMAKE_SYSTEM_PROCESSOR} M
check_instruction_set("${INSTRUCTION_SET_NAME}" "${INSTRUCTION_SET_FLAG}" "${INSTRUCTION_SET_INTRINSIC}")
endforeach()

string(REPLACE "T_" "" AVX_DISPLAY ${AVX_NAME})
message(STATUS "Detected ${CMAKE_SYSTEM_PROCESSOR} SSE type: ${AVX_DISPLAY}")
message(STATUS "Detected ${CMAKE_SYSTEM_PROCESSOR} AVX type: ${AVX_TYPE} (FLAGS: ${AVX_FLAG})")
set(AVX_TYPE ${AVX_TYPE})
set(AVX_TYPE ${AVX_TYPE} PARENT_SCOPE)
set(AVX_FLAG ${AVX_FLAG} PARENT_SCOPE)
set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS_SAVE}")
else()
message(STATUS "SSE not supported by architecture ${CMAKE_SYSTEM_PROCESSOR} ${AVX_NAME}")
set(AVX_NAME "T_fallback")
set(AVX_TYPE "T_fallback")
message(STATUS "AVX not supported by architecture ${CMAKE_SYSTEM_PROCESSOR} ${AVX_TYPE}")
set(AVX_TYPE "AVX0")
set(AVX_FLAG "" PARENT_SCOPE)
set(AVX_TYPE "AVX0" PARENT_SCOPE)
endif()
136 changes: 136 additions & 0 deletions include/dpp/isa/avx.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
/************************************************************************************
*
* D++, A Lightweight C++ library for Discord
*
* Copyright 2021 Craig Edwards and D++ contributors
* (https://github.com/brainboxdotcc/DPP/graphs/contributors)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
************************************************************************************/
#pragma once

#if defined _MSC_VER || defined __GNUC__ || defined __clang__

#include <immintrin.h>

#ifdef max
#undef max
#endif
#ifdef min
#undef min
#endif

namespace dpp {

using avx_float = __m128;
using avx_int = __m128i;

/*
* @brief Extracts a 32-bit integer from a 128-bit AVX register.
* @param value The AVX register containing packed 32-bit integers.
* @param index The index of the 32-bit integer to extract (0-3).
* @return The extracted 32-bit integer.
*/
inline int32_t extract_int32_from_avx(const avx_int& value, int64_t index) {
switch (index) {
case 0: {
return _mm_extract_epi32(value, 0);
}
case 1: {
return _mm_extract_epi32(value, 1);
}
case 2: {
return _mm_extract_epi32(value, 2);
}
case 3: {
return _mm_extract_epi32(value, 3);
}
default: {
return _mm_extract_epi32(value, 0);
}
}
}

/**
* @brief A class for audio mixing operations using AVX instructions.
*/
class audio_mixer {
public:
/*
* @brief The number of 32-bit values per CPU register.
*/
inline static constexpr int32_t byte_blocks_per_register{ 4 };

/*
* @brief Stores values from a 128-bit AVX vector to a storage location.
* @tparam value_type The target value type for storage.
* @param values_to_store The 128-bit AVX vector containing values to store.
* @param storage_location Pointer to the storage location.
*/
template<typename value_type> inline static void store_values(const avx_int& values_to_store, value_type* storage_location) {
for (int64_t x = 0; x < byte_blocks_per_register; ++x) {
storage_location[x] = static_cast<value_type>(extract_int32_from_avx(values_to_store, x));
}
}

/**
* @brief Specialization for gathering non-float values into an AVX register.
* @tparam value_type The type of values being gathered.
* @tparam Indices Parameter pack of indices for gathering values.
* @return An AVX register containing gathered values.
*/
template<typename value_type> inline static avx_float gather_values(value_type* values) {
alignas(16) float new_array[byte_blocks_per_register]{};
for (size_t x = 0; x < byte_blocks_per_register; ++x) {
new_array[x] = static_cast<float>(values[x]);
}
return _mm_load_ps(new_array);
}

/**
* @brief Collect a single register worth of data from data_in, apply gain and increment, and store the result in data_out.
* This version uses AVX instructions.
*
* @param data_in Pointer to the input array of int32_t values.
* @param data_out Pointer to the output array of int16_t values.
* @param current_gain The gain to be applied to the elements.
* @param increment The increment value to be added to each element.
*/
inline static void collect_single_register(int32_t* data_in, int16_t* data_out, float current_gain, float increment) {
avx_float current_samples_new{ _mm_mul_ps(gather_values(data_in),
_mm_add_ps(_mm_set1_ps(current_gain), _mm_mul_ps(_mm_set1_ps(increment), _mm_set_ps(0.0f, 1.0f, 2.0f, 3.0f)))) };

current_samples_new = _mm_blendv_ps(_mm_max_ps(current_samples_new, _mm_set1_ps(static_cast<float>(std::numeric_limits<int16_t>::min()))),
_mm_min_ps(current_samples_new, _mm_set1_ps(static_cast<float>(std::numeric_limits<int16_t>::max()))),
_mm_cmp_ps(current_samples_new, _mm_set1_ps(0.0f), _CMP_GE_OQ));

store_values(_mm_cvtps_epi32(current_samples_new), data_out);
}

/**
* @brief Combine a register worth of elements from decoded_data and store the result in up_sampled_vector.
* This version uses AVX instructions.
*
* @param up_sampled_vector Pointer to the array of int32_t values.
* @param decoded_data Pointer to the array of int16_t values.
*/
inline static void combine_samples(int32_t* up_sampled_vector, const int16_t* decoded_data) {
auto newValues{ _mm_cvtps_epi32(_mm_add_ps(gather_values(up_sampled_vector), gather_values(decoded_data))) };
store_values(newValues, up_sampled_vector);
}
};

} // namespace dpp

#endif
Loading

0 comments on commit 2e75d3d

Please sign in to comment.