Skip to content

Commit

Permalink
Updating to fix the clamping behavior of the AVX-512 implementation.
Browse files Browse the repository at this point in the history
I realized the clamping in the previous implementation was wrong, so this commit corrects it. It also renames a variable and switches the loads from unaligned to aligned loads.
  • Loading branch information
RealTimeChris committed Sep 17, 2023
1 parent a7dbdff commit 593a149
Showing 1 changed file with 15 additions and 11 deletions.
26 changes: 15 additions & 11 deletions include/dpp/isa_detection.h
Original file line number Diff line number Diff line change
Expand Up @@ -155,11 +155,11 @@ namespace dpp {
* @return An AVX512 register containing gathered values.
*/
template<typename value_type> inline static avx_512_float gather_values(value_type* values) {
// Converts byte_blocks_per_register elements of `values` to float in a local
// staging array, then loads that array into a 512-bit register.
// The loop reads values[0] .. values[byte_blocks_per_register - 1], so the
// caller must supply at least that many elements.
//
// NOTE: this is a diff rendering. The two declarations below are the removed
// and added forms of the same line: the added one gives the staging array
// 64-byte alignment.
float new_array[byte_blocks_per_register]{};
alignas(64) float new_array[byte_blocks_per_register]{};
for (size_t x = 0; x < byte_blocks_per_register; ++x) {
// Element-wise conversion from value_type to float.
new_array[x] = static_cast<float>(values[x]);
}
// Removed/added pair: the unaligned load (loadu) is replaced by the aligned
// load, which relies on the alignas(64) staging array declared above.
return _mm512_loadu_ps(new_array);
return _mm512_load_ps(new_array);
}

/**
Expand All @@ -177,9 +177,13 @@ namespace dpp {
_mm512_mul_ps(_mm512_set1_ps(increment),
_mm512_set_ps(0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f)))) };

current_samples_new = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(current_samples_new, _mm512_set1_ps(0.0f), _CMP_GE_OQ),
_mm512_max_ps(current_samples_new, _mm512_set1_ps(static_cast<float>(std::numeric_limits<int16_t>::min()))),
_mm512_min_ps(current_samples_new, _mm512_set1_ps(static_cast<float>(std::numeric_limits<int16_t>::max()))));
__m512 lower_limit = _mm512_set1_ps(static_cast<float>(std::numeric_limits<opus_int16>::min()));
__m512 upper_limit = _mm512_set1_ps(static_cast<float>(std::numeric_limits<opus_int16>::max()));

__mmask16 mask_ge = _mm512_cmp_ps_mask(current_samples_new, _mm512_set1_ps(0.0f), _CMP_GE_OQ);

current_samples_new = _mm512_mask_max_ps(current_samples_new, mask_ge, current_samples_new, lower_limit);
current_samples_new = _mm512_mask_min_ps(current_samples_new, ~mask_ge, current_samples_new, upper_limit);

store_values(_mm512_cvtps_epi32(current_samples_new), data_out);
}
Expand Down Expand Up @@ -228,11 +232,11 @@ namespace dpp {
* @return An AVX2 register containing gathered values.
*/
template<typename value_type> inline static avx_2_float gather_values(value_type* values) {
// Converts byte_blocks_per_register elements of `values` to float in a local
// staging array, then loads that array into a 256-bit register.
// The loop reads values[0] .. values[byte_blocks_per_register - 1], so the
// caller must supply at least that many elements.
//
// NOTE: this is a diff rendering. The two declarations below are the removed
// and added forms of the same line: the added one gives the staging array
// 32-byte alignment.
float new_array[byte_blocks_per_register]{};
alignas(32) float new_array[byte_blocks_per_register]{};
for (size_t x = 0; x < byte_blocks_per_register; ++x) {
// Element-wise conversion from value_type to float.
new_array[x] = static_cast<float>(values[x]);
}
// Removed/added pair: the unaligned load (loadu) is replaced by the aligned
// load, which relies on the alignas(32) staging array declared above.
return _mm256_loadu_ps(new_array);
return _mm256_load_ps(new_array);
}

/**
Expand Down Expand Up @@ -302,11 +306,11 @@ namespace dpp {
* @return An AVX register containing gathered values.
*/
template<typename value_type> inline static avx_float gather_values(value_type* values) {
// Converts byte_blocks_per_register elements of `values` to float in a local
// staging array, then loads that array into a 128-bit register.
// The loop reads values[0] .. values[byte_blocks_per_register - 1], so the
// caller must supply at least that many elements.
//
// NOTE: this is a diff rendering. The two declarations below are the removed
// and added forms of the same line: the added one gives the staging array
// 16-byte alignment.
float new_array[byte_blocks_per_register]{};
alignas(16) float new_array[byte_blocks_per_register]{};
for (size_t x = 0; x < byte_blocks_per_register; ++x) {
// Element-wise conversion from value_type to float.
new_array[x] = static_cast<float>(values[x]);
}
// Removed/added pair: the unaligned load (loadu) is replaced by the aligned
// load, which relies on the alignas(16) staging array declared above.
return _mm_loadu_ps(new_array);
return _mm_load_ps(new_array);
}

/**
Expand Down Expand Up @@ -365,8 +369,8 @@ namespace dpp {
*/
inline static void collect_single_register(int32_t* data_in, int16_t* data_out, float current_gain, float increment) {
for (uint64_t x = 0; x < byte_blocks_per_register; ++x) {
auto increment_neww = increment * x;
auto current_gain_new = current_gain + increment_neww;
auto increment_new = increment * x;
auto current_gain_new = current_gain + increment_new;
auto current_sample_new = data_in[x] * current_gain_new;
if (current_sample_new >= std::numeric_limits<int16_t>::max()) {
current_sample_new = std::numeric_limits<int16_t>::max();
Expand Down

0 comments on commit 593a149

Please sign in to comment.