From 454b6de0d3a75a549d5cc56095f34bdeeab28a99 Mon Sep 17 00:00:00 2001 From: Pete Harris Date: Thu, 15 Aug 2024 10:02:31 +0100 Subject: [PATCH] Improve SIMD vtable API (#492) --- Source/UnitTest/test_simd.cpp | 183 ++++++++++++------ Source/astcenc_decompress_symbolic.cpp | 27 +-- .../astcenc_ideal_endpoints_and_weights.cpp | 19 +- Source/astcenc_vecmathlib.h | 20 ++ Source/astcenc_vecmathlib_avx2_8.h | 130 ++++++++----- Source/astcenc_vecmathlib_neon_4.h | 108 ++++++----- Source/astcenc_vecmathlib_none_4.h | 120 ++++++------ Source/astcenc_vecmathlib_sse_4.h | 171 +++++++++------- Source/astcenc_vecmathlib_sve_8.h | 109 ++++++----- Test/astc_profile_valgrind.py | 2 +- 10 files changed, 544 insertions(+), 345 deletions(-) diff --git a/Source/UnitTest/test_simd.cpp b/Source/UnitTest/test_simd.cpp index f857c3550..24fb63f53 100644 --- a/Source/UnitTest/test_simd.cpp +++ b/Source/UnitTest/test_simd.cpp @@ -1947,43 +1947,78 @@ TEST(vmask4, not) } /** @brief Test vint4 table permute. */ -TEST(vint4, vtable_8bt_32bi_32entry) +TEST(vint4, vtable4_16x8) { - vint4 table0(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f); - vint4 table1(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f); + uint8_t data[16] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f + }; - vint4 table0p, table1p; - vtable_prepare(table0, table1, table0p, table1p); + vtable4_16x8 table; + vtable_prepare(table, data); - vint4 index(0, 7, 4, 31); + vint4 index(0, 7, 4, 15); - vint4 result = vtable_8bt_32bi(table0p, table1p, index); + vint4 result = vtable_lookup_32bit(table, index); - EXPECT_EQ(result.lane<0>(), 3); - EXPECT_EQ(result.lane<1>(), 4); - EXPECT_EQ(result.lane<2>(), 7); - EXPECT_EQ(result.lane<3>(), 28); + EXPECT_EQ(result.lane<0>(), 0); + EXPECT_EQ(result.lane<1>(), 7); + EXPECT_EQ(result.lane<2>(), 4); + EXPECT_EQ(result.lane<3>(), 15); } /** @brief Test vint4 table permute. */ -TEST(vint4, vtable_8bt_32bi_64entry) +TEST(vint4, vtable4_32x8) { - vint4 table0(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f); - vint4 table1(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f); - vint4 table2(0x20212223, 0x24252627, 0x28292a2b, 0x2c2d2e2f); - vint4 table3(0x30313233, 0x34353637, 0x38393a3b, 0x3c3d3e3f); + uint8_t data[32] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f + }; + + vtable4_32x8 table; + vtable_prepare(table, data); + + vint4 index(0, 7, 4, 31); + + vint4 result = vtable_lookup_32bit(table, index); - vint4 table0p, table1p, table2p, table3p; - vtable_prepare(table0, table1, table2, table3, table0p, table1p, table2p, table3p); + EXPECT_EQ(result.lane<0>(), 0); + EXPECT_EQ(result.lane<1>(), 7); + EXPECT_EQ(result.lane<2>(), 4); + EXPECT_EQ(result.lane<3>(), 31); +} + +/** @brief Test vint4 table permute. 
*/
+TEST(vint4, vtable4_64x8)
+{
+	uint8_t data[64] = {
+		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+		0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+		0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+		0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+		0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+		0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+		0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f
+	};
+
+	vtable4_64x8 table;
+	vtable_prepare(table, data);
 
 	vint4 index(0, 7, 38, 63);
 
-	vint4 result = vtable_8bt_32bi(table0p, table1p, table2p, table3p, index);
+	vint4 result = vtable_lookup_32bit(table, index);
 
-	EXPECT_EQ(result.lane<0>(), 3);
-	EXPECT_EQ(result.lane<1>(), 4);
-	EXPECT_EQ(result.lane<2>(), 37);
-	EXPECT_EQ(result.lane<3>(), 60);
+	EXPECT_EQ(result.lane<0>(), 0);
+	EXPECT_EQ(result.lane<1>(), 7);
+	EXPECT_EQ(result.lane<2>(), 38);
+	EXPECT_EQ(result.lane<3>(), 63);
 }
 
 /** @brief Test vint4 rgba byte interleave. */
@@ -3657,57 +3692,95 @@ TEST(vmask8, not)
 }
 
 /** @brief Test vint8 table permute. */
-TEST(vint8, vtable_8bt_32bi_32entry)
+TEST(vint8, vtable8_16x8)
 {
-	vint4 table0(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
-	vint4 table1(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f);
+	uint8_t data[16] = {
+		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+	};
 
-	vint8 table0p, table1p;
-	vtable_prepare(table0, table1, table0p, table1p);
+	vtable8_16x8 table;
+	vtable_prepare(table, data);
 
-	vint8 index = vint8_lit(0, 7, 4, 15, 16, 20, 23, 31);
+	vint8 index = vint8_lit(0, 7, 4, 15, 1, 2, 14, 4);
 
-	vint8 result = vtable_8bt_32bi(table0p, table1p, index);
+	vint8 result = vtable_lookup_32bit(table, index);
 
 	alignas(32) int ra[8];
 	store(result, ra);
 
-	EXPECT_EQ(ra[0], 3);
-	EXPECT_EQ(ra[1], 4);
-	EXPECT_EQ(ra[2], 7);
-	EXPECT_EQ(ra[3], 12);
-	EXPECT_EQ(ra[4], 19);
-	EXPECT_EQ(ra[5], 23);
-	EXPECT_EQ(ra[6], 20);
-	EXPECT_EQ(ra[7], 28);
+	EXPECT_EQ(ra[0], 0);
+	EXPECT_EQ(ra[1], 7);
+	EXPECT_EQ(ra[2], 4);
+	EXPECT_EQ(ra[3], 15);
+	EXPECT_EQ(ra[4], 1);
+	EXPECT_EQ(ra[5], 2);
+	EXPECT_EQ(ra[6], 14);
+	EXPECT_EQ(ra[7], 4);
 }
 
-/** @brief Test vint4 table permute. */
-TEST(vint8, vtable_8bt_32bi_64entry)
+/** @brief Test vint8 table permute. */
+TEST(vint8, vtable8_32x8)
 {
-	vint4 table0(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
-	vint4 table1(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f);
-	vint4 table2(0x20212223, 0x24252627, 0x28292a2b, 0x2c2d2e2f);
-	vint4 table3(0x30313233, 0x34353637, 0x38393a3b, 0x3c3d3e3f);
+	uint8_t data[32] = {
+		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+		0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+		0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
+	};
+
+	vtable8_32x8 table;
+	vtable_prepare(table, data);
+
+	vint8 index = vint8_lit(0, 7, 4, 15, 16, 20, 23, 31);
 
-	vint8 table0p, table1p, table2p, table3p;
-	vtable_prepare(table0, table1, table2, table3, table0p, table1p, table2p, table3p);
+	vint8 result = vtable_lookup_32bit(table, index);
+
+	alignas(32) int ra[8];
+	store(result, ra);
+
+	EXPECT_EQ(ra[0], 0);
+	EXPECT_EQ(ra[1], 7);
+	EXPECT_EQ(ra[2], 4);
+	EXPECT_EQ(ra[3], 15);
+	EXPECT_EQ(ra[4], 16);
+	EXPECT_EQ(ra[5], 20);
+	EXPECT_EQ(ra[6], 23);
+	EXPECT_EQ(ra[7], 31);
+}
+
+/** @brief Test vint8 table permute. 
*/ +TEST(vint8, vtable8_64x8) +{ + uint8_t data[64] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f + }; + + vtable8_64x8 table; + vtable_prepare(table, data); vint8 index = vint8_lit(0, 7, 4, 15, 16, 20, 38, 63); - vint8 result = vtable_8bt_32bi(table0p, table1p, table2p, table3p, index); + vint8 result = vtable_lookup_32bit(table, index); alignas(32) int ra[8]; store(result, ra); - EXPECT_EQ(ra[0], 3); - EXPECT_EQ(ra[1], 4); - EXPECT_EQ(ra[2], 7); - EXPECT_EQ(ra[3], 12); - EXPECT_EQ(ra[4], 19); - EXPECT_EQ(ra[5], 23); - EXPECT_EQ(ra[6], 37); - EXPECT_EQ(ra[7], 60); + EXPECT_EQ(ra[0], 0); + EXPECT_EQ(ra[1], 7); + EXPECT_EQ(ra[2], 4); + EXPECT_EQ(ra[3], 15); + EXPECT_EQ(ra[4], 16); + EXPECT_EQ(ra[5], 20); + EXPECT_EQ(ra[6], 38); + EXPECT_EQ(ra[7], 63); } #endif diff --git a/Source/astcenc_decompress_symbolic.cpp b/Source/astcenc_decompress_symbolic.cpp index 902a3f3e9..e7791eef6 100644 --- a/Source/astcenc_decompress_symbolic.cpp +++ b/Source/astcenc_decompress_symbolic.cpp @@ -98,13 +98,8 @@ void unpack_weights( if (!is_dual_plane) { // Build full 64-entry weight lookup table - vint4 tab0 = vint4::load(scb.weights + 0); - vint4 tab1 = vint4::load(scb.weights + 16); - vint4 tab2 = vint4::load(scb.weights + 32); - vint4 tab3 = vint4::load(scb.weights + 48); - - vint tab0p, tab1p, tab2p, tab3p; - vtable_prepare(tab0, tab1, tab2, tab3, tab0p, tab1p, tab2p, tab3p); + vtable_64x8 table; + vtable_prepare(table, scb.weights); for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH) { @@ -118,7 +113,7 @@ void unpack_weights( vint texel_weights(di.texel_weights_tr[j] + i); vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i); - summed_value += vtable_8bt_32bi(tab0p, tab1p, tab2p, tab3p, texel_weights) * texel_weights_int; + summed_value += vtable_lookup_32bit(table, texel_weights) * texel_weights_int; } store(lsr<4>(summed_value), weights_plane1 + i); @@ -128,16 +123,12 @@ void unpack_weights( { // Build a 32-entry weight lookup table per plane // Plane 1 - vint4 tab0_plane1 = vint4::load(scb.weights + 0); - vint4 tab1_plane1 = vint4::load(scb.weights + 16); - vint tab0_plane1p, tab1_plane1p; - vtable_prepare(tab0_plane1, tab1_plane1, tab0_plane1p, tab1_plane1p); + vtable_32x8 tab_plane1; + vtable_prepare(tab_plane1, scb.weights); // Plane 2 - vint4 tab0_plane2 = vint4::load(scb.weights + 32); - vint4 tab1_plane2 = vint4::load(scb.weights + 48); - vint tab0_plane2p, tab1_plane2p; - vtable_prepare(tab0_plane2, tab1_plane2, tab0_plane2p, tab1_plane2p); + vtable_32x8 tab_plane2; + vtable_prepare(tab_plane2, scb.weights + 32); for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH) { @@ -153,8 +144,8 @@ void unpack_weights( vint texel_weights(di.texel_weights_tr[j] + i); vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i); - sum_plane1 += vtable_8bt_32bi(tab0_plane1p, tab1_plane1p, texel_weights) * texel_weights_int; - sum_plane2 += vtable_8bt_32bi(tab0_plane2p, tab1_plane2p, texel_weights) * texel_weights_int; + sum_plane1 += vtable_lookup_32bit(tab_plane1, texel_weights) * texel_weights_int; + sum_plane2 += vtable_lookup_32bit(tab_plane2, texel_weights) * texel_weights_int; } store(lsr<4>(sum_plane1), 
weights_plane1 + i); diff --git a/Source/astcenc_ideal_endpoints_and_weights.cpp b/Source/astcenc_ideal_endpoints_and_weights.cpp index 3442464d5..ec680dd5e 100644 --- a/Source/astcenc_ideal_endpoints_and_weights.cpp +++ b/Source/astcenc_ideal_endpoints_and_weights.cpp @@ -1023,9 +1023,8 @@ void compute_quantized_weights_for_decimation( // safe data in compute_ideal_weights_for_decimation and arrays are always 64 elements if (get_quant_level(quant_level) <= 16) { - vint4 tab0 = vint4::load(qat.quant_to_unquant); - vint tab0p; - vtable_prepare(tab0, tab0p); + vtable_16x8 table; + vtable_prepare(table, qat.quant_to_unquant); for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH) { @@ -1038,8 +1037,8 @@ void compute_quantized_weights_for_decimation( vint weightl = float_to_int(ix1); vint weighth = min(weightl + vint(1), steps_m1); - vint ixli = vtable_8bt_32bi(tab0p, weightl); - vint ixhi = vtable_8bt_32bi(tab0p, weighth); + vint ixli = vtable_lookup_32bit(table, weightl); + vint ixhi = vtable_lookup_32bit(table, weighth); vfloat ixl = int_to_float(ixli); vfloat ixh = int_to_float(ixhi); @@ -1055,10 +1054,8 @@ void compute_quantized_weights_for_decimation( } else { - vint4 tab0 = vint4::load(qat.quant_to_unquant + 0); - vint4 tab1 = vint4::load(qat.quant_to_unquant + 16); - vint tab0p, tab1p; - vtable_prepare(tab0, tab1, tab0p, tab1p); + vtable_32x8 table; + vtable_prepare(table, qat.quant_to_unquant); for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH) { @@ -1071,8 +1068,8 @@ void compute_quantized_weights_for_decimation( vint weightl = float_to_int(ix1); vint weighth = min(weightl + vint(1), steps_m1); - vint ixli = vtable_8bt_32bi(tab0p, tab1p, weightl); - vint ixhi = vtable_8bt_32bi(tab0p, tab1p, weighth); + vint ixli = vtable_lookup_32bit(table, weightl); + vint ixhi = vtable_lookup_32bit(table, weighth); vfloat ixl = int_to_float(ixli); vfloat ixh = int_to_float(ixhi); diff --git a/Source/astcenc_vecmathlib.h b/Source/astcenc_vecmathlib.h index 628755619..b41f6fa3a 100644 --- a/Source/astcenc_vecmathlib.h +++ b/Source/astcenc_vecmathlib.h @@ -96,6 +96,10 @@ using vint = vint8; using vmask = vmask8; + using vtable_16x8 = vtable8_16x8; + using vtable_32x8 = vtable8_32x8; + using vtable_64x8 = vtable8_64x8; + constexpr auto loada = vfloat8::loada; constexpr auto load1 = vfloat8::load1; @@ -111,6 +115,10 @@ using vint = vint4; using vmask = vmask4; + using vtable_16x8 = vtable4_16x8; + using vtable_32x8 = vtable4_32x8; + using vtable_64x8 = vtable4_64x8; + constexpr auto loada = vfloat4::loada; constexpr auto load1 = vfloat4::load1; @@ -138,6 +146,10 @@ using vint = vint8; using vmask = vmask8; + using vtable_16x8 = vtable8_16x8; + using vtable_32x8 = vtable8_32x8; + using vtable_64x8 = vtable8_64x8; + constexpr auto loada = vfloat8::loada; constexpr auto load1 = vfloat8::load1; @@ -153,6 +165,10 @@ using vint = vint4; using vmask = vmask4; + using vtable_16x8 = vtable4_16x8; + using vtable_32x8 = vtable4_32x8; + using vtable_64x8 = vtable4_64x8; + constexpr auto loada = vfloat4::loada; constexpr auto load1 = vfloat4::load1; @@ -185,6 +201,10 @@ using vint = vint4; using vmask = vmask4; + using vtable_16x8 = vtable4_16x8; + using vtable_32x8 = vtable4_32x8; + using vtable_64x8 = vtable4_64x8; + constexpr auto loada = vfloat4::loada; constexpr auto load1 = vfloat4::load1; #endif diff --git a/Source/astcenc_vecmathlib_avx2_8.h b/Source/astcenc_vecmathlib_avx2_8.h index 347abf83c..7c75818ae 100644 --- a/Source/astcenc_vecmathlib_avx2_8.h +++ b/Source/astcenc_vecmathlib_avx2_8.h @@ 
-971,98 +971,140 @@ ASTCENC_SIMD_INLINE vfloat8 int_as_float(vint8 a) return vfloat8(_mm256_castsi256_ps(a.m)); } +/* + * Table structure for a 16x 8-bit entry table. + */ +struct vtable8_16x8 { + vint8 t0; +}; + +/* + * Table structure for a 32x 8-bit entry table. + */ +struct vtable8_32x8 { + vint8 t0; + vint8 t1; +}; + +/* + * Table structure for a 64x 8-bit entry table. + */ +struct vtable8_64x8 { + vint8 t0; + vint8 t1; + vint8 t2; + vint8 t3; +}; + /** - * @brief Prepare a vtable lookup table for use with the native SIMD size. + * @brief Prepare a vtable lookup table for 16x 8-bit entry table. */ -ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint8& t0p) -{ - // AVX2 duplicates the table within each 128-bit lane - __m128i t0n = t0.m; - t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n)); +ASTCENC_SIMD_INLINE void vtable_prepare( + vtable8_16x8& table, + const uint8_t* data +) { + // AVX2 tables duplicate table entries in each 128-bit half-register + vint4 d0 = vint4::load(data); + + table.t0 = vint8(astcenc_mm256_set_m128i(d0.m, d0.m)); } /** - * @brief Prepare a vtable lookup table for use with the native SIMD size. + * @brief Prepare a vtable lookup table for 32x 8-bit entry table. */ -ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint8& t0p, vint8& t1p) -{ - // AVX2 duplicates the table within each 128-bit lane - __m128i t0n = t0.m; - t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n)); +ASTCENC_SIMD_INLINE void vtable_prepare( + vtable8_32x8& table, + const uint8_t* data +) { + // AVX2 tables duplicate table entries in each 128-bit half-register + vint4 d0 = vint4::load(data); + vint4 d1 = vint4::load(data + 16); + + table.t0 = vint8(astcenc_mm256_set_m128i(d0.m, d0.m)); + table.t1 = vint8(astcenc_mm256_set_m128i(d1.m, d1.m)); - __m128i t1n = _mm_xor_si128(t0.m, t1.m); - t1p = vint8(astcenc_mm256_set_m128i(t1n, t1n)); + // XOR chain the high rows to allow table emulation + table.t1 = table.t1 ^ table.t0; } /** - * @brief Prepare a vtable lookup table for use with the native SIMD size. + * @brief Prepare a vtable lookup table 64x 8-bit entry table. */ ASTCENC_SIMD_INLINE void vtable_prepare( - vint4 t0, vint4 t1, vint4 t2, vint4 t3, - vint8& t0p, vint8& t1p, vint8& t2p, vint8& t3p) -{ - // AVX2 duplicates the table within each 128-bit lane - __m128i t0n = t0.m; - t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n)); - - __m128i t1n = _mm_xor_si128(t0.m, t1.m); - t1p = vint8(astcenc_mm256_set_m128i(t1n, t1n)); + vtable8_64x8& table, + const uint8_t* data +) { + // AVX2 tables duplicate table entries in each 128-bit half-register + vint4 d0 = vint4::load(data); + vint4 d1 = vint4::load(data + 16); + vint4 d2 = vint4::load(data + 32); + vint4 d3 = vint4::load(data + 48); - __m128i t2n = _mm_xor_si128(t1.m, t2.m); - t2p = vint8(astcenc_mm256_set_m128i(t2n, t2n)); + table.t0 = vint8(astcenc_mm256_set_m128i(d0.m, d0.m)); + table.t1 = vint8(astcenc_mm256_set_m128i(d1.m, d1.m)); + table.t2 = vint8(astcenc_mm256_set_m128i(d2.m, d2.m)); + table.t3 = vint8(astcenc_mm256_set_m128i(d3.m, d3.m)); - __m128i t3n = _mm_xor_si128(t2.m, t3.m); - t3p = vint8(astcenc_mm256_set_m128i(t3n, t3n)); + // XOR chain the high rows to allow table emulation + table.t3 = table.t3 ^ table.t2; + table.t2 = table.t2 ^ table.t1; + table.t1 = table.t1 ^ table.t0; } /** - * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes. + * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices. 
 */
-ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 idx)
-{
+ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit(
+	const vtable8_16x8& tbl,
+	vint8 idx
+) {
 	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
 	__m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00)));
 
-	__m256i result = _mm256_shuffle_epi8(t0.m, idxx);
+	__m256i result = _mm256_shuffle_epi8(tbl.t0.m, idxx);
 	return vint8(result);
 }
 
 /**
- * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes.
+ * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices.
  */
-ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 idx)
-{
+ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit(
+	const vtable8_32x8& tbl,
+	vint8 idx
+) {
 	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
 	__m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00)));
 
-	__m256i result = _mm256_shuffle_epi8(t0.m, idxx);
+	__m256i result = _mm256_shuffle_epi8(tbl.t0.m, idxx);
 	idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));
 
-	__m256i result2 = _mm256_shuffle_epi8(t1.m, idxx);
+	__m256i result2 = _mm256_shuffle_epi8(tbl.t1.m, idxx);
 	result = _mm256_xor_si256(result, result2);
 	return vint8(result);
 }
 
 /**
- * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes.
+ * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices.
  */
-ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 t2, vint8 t3, vint8 idx)
-{
+ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit(
+	const vtable8_64x8& tbl,
+	vint8 idx
+) {
 	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
 	__m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00)));
 
-	__m256i result = _mm256_shuffle_epi8(t0.m, idxx);
+	__m256i result = _mm256_shuffle_epi8(tbl.t0.m, idxx);
 	idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));
 
-	__m256i result2 = _mm256_shuffle_epi8(t1.m, idxx);
+	__m256i result2 = _mm256_shuffle_epi8(tbl.t1.m, idxx);
 	result = _mm256_xor_si256(result, result2);
 	idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));
 
-	result2 = _mm256_shuffle_epi8(t2.m, idxx);
+	result2 = _mm256_shuffle_epi8(tbl.t2.m, idxx);
 	result = _mm256_xor_si256(result, result2);
 	idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));
 
-	result2 = _mm256_shuffle_epi8(t3.m, idxx);
+	result2 = _mm256_shuffle_epi8(tbl.t3.m, idxx);
 	result = _mm256_xor_si256(result, result2);
 
 	return vint8(result);
diff --git a/Source/astcenc_vecmathlib_neon_4.h b/Source/astcenc_vecmathlib_neon_4.h
index e15d4f9ef..c7ff01289 100644
--- a/Source/astcenc_vecmathlib_neon_4.h
+++ b/Source/astcenc_vecmathlib_neon_4.h
@@ -939,87 +939,105 @@ ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 v)
 	return vfloat4(vreinterpretq_f32_s32(v.m));
 }
 
-/**
- * @brief Prepare a vtable lookup table for use with the native SIMD size.
+/*
+ * Table structure for a 16x 8-bit entry table.
  */
-ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4& t0p)
-{
-	t0p = t0;
-}
+struct vtable4_16x8 {
+	uint8x16_t t0;
+};
+
+/*
+ * Table structure for a 32x 8-bit entry table.
+ */
+struct vtable4_32x8 {
+	uint8x16x2_t t01;
+};
+
+/*
+ * Table structure for a 64x 8-bit entry table.
+ */
+struct vtable4_64x8 {
+	uint8x16x4_t t0123;
+};
 
 /**
- * @brief Prepare a vtable lookup table for use with the native SIMD size.
+ * @brief Prepare a vtable lookup table for 16x 8-bit entry table. 
*/ -ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint4& t0p, vint4& t1p) -{ - t0p = t0; - t1p = t1; +ASTCENC_SIMD_INLINE void vtable_prepare( + vtable4_16x8& table, + const uint8_t* data +) { + table.t0 = vld1q_u8(data); } /** - * @brief Prepare a vtable lookup table for use with the native SIMD size. + * @brief Prepare a vtable lookup table for 32x 8-bit entry table. */ ASTCENC_SIMD_INLINE void vtable_prepare( - vint4 t0, vint4 t1, vint4 t2, vint4 t3, - vint4& t0p, vint4& t1p, vint4& t2p, vint4& t3p) -{ - t0p = t0; - t1p = t1; - t2p = t2; - t3p = t3; + vtable4_32x8& table, + const uint8_t* data +) { + table.t01 = uint8x16x2_t { + vld1q_u8(data), + vld1q_u8(data + 16) + }; } /** - * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes. + * @brief Prepare a vtable lookup table 64x 8-bit entry table. */ -ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx) -{ - int8x16_t table { - vreinterpretq_s8_s32(t0.m) +ASTCENC_SIMD_INLINE void vtable_prepare( + vtable4_64x8& table, + const uint8_t* data +) { + table.t0123 = uint8x16x4_t { + vld1q_u8(data), + vld1q_u8(data + 16), + vld1q_u8(data + 32), + vld1q_u8(data + 48) }; +} +/** + * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices. + */ +ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit( + const vtable4_16x8& tbl, + vint4 idx +) { // Set index byte above max index for unused bytes so table lookup returns zero int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00)); uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked); - return vint4(vreinterpretq_s32_s8(vqtbl1q_s8(table, idx_bytes))); + return vint4(vreinterpretq_s32_u8(vqtbl1q_u8(tbl.t0, idx_bytes))); } /** - * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes. + * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx) -{ - int8x16x2_t table { - vreinterpretq_s8_s32(t0.m), - vreinterpretq_s8_s32(t1.m) - }; - +ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit( + const vtable4_32x8& tbl, + vint4 idx +) { // Set index byte above max index for unused bytes so table lookup returns zero int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00)); uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked); - return vint4(vreinterpretq_s32_s8(vqtbl2q_s8(table, idx_bytes))); + return vint4(vreinterpretq_s32_u8(vqtbl2q_u8(tbl.t01, idx_bytes))); } /** - * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes. + * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices. 
*/ -ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx) -{ - int8x16x4_t table { - vreinterpretq_s8_s32(t0.m), - vreinterpretq_s8_s32(t1.m), - vreinterpretq_s8_s32(t2.m), - vreinterpretq_s8_s32(t3.m) - }; - +ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit( + const vtable4_64x8& tbl, + vint4 idx +) { // Set index byte above max index for unused bytes so table lookup returns zero int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00)); uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked); - return vint4(vreinterpretq_s32_s8(vqtbl4q_s8(table, idx_bytes))); + return vint4(vreinterpretq_s32_u8(vqtbl4q_u8(tbl.t0123, idx_bytes))); } /** diff --git a/Source/astcenc_vecmathlib_none_4.h b/Source/astcenc_vecmathlib_none_4.h index 55981e4c3..4646e84ad 100644 --- a/Source/astcenc_vecmathlib_none_4.h +++ b/Source/astcenc_vecmathlib_none_4.h @@ -1067,84 +1067,94 @@ ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 a) return r; } +/* + * Table structure for a 16x 8-bit entry table. + */ +struct vtable4_16x8 { + const uint8_t* data; +}; + +/* + * Table structure for a 32x 8-bit entry table. + */ +struct vtable4_32x8 { + const uint8_t* data; +}; + +/* + * Table structure for a 64x 8-bit entry table. + */ +struct vtable4_64x8 { + const uint8_t* data; +}; + /** - * @brief Prepare a vtable lookup table for use with the native SIMD size. + * @brief Prepare a vtable lookup table for 16x 8-bit entry table. */ -ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4& t0p) -{ - t0p = t0; +ASTCENC_SIMD_INLINE void vtable_prepare( + vtable4_16x8& table, + const uint8_t* data +) { + table.data = data; } /** - * @brief Prepare a vtable lookup table for use with the native SIMD size. + * @brief Prepare a vtable lookup table for 32x 8-bit entry table. */ -ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint4& t0p, vint4& t1p) -{ - t0p = t0; - t1p = t1; +ASTCENC_SIMD_INLINE void vtable_prepare( + vtable4_32x8& table, + const uint8_t* data +) { + table.data = data; } /** - * @brief Prepare a vtable lookup table for use with the native SIMD size. + * @brief Prepare a vtable lookup table 64x 8-bit entry table. */ ASTCENC_SIMD_INLINE void vtable_prepare( - vint4 t0, vint4 t1, vint4 t2, vint4 t3, - vint4& t0p, vint4& t1p, vint4& t2p, vint4& t3p) -{ - t0p = t0; - t1p = t1; - t2p = t2; - t3p = t3; + vtable4_64x8& table, + const uint8_t* data +) { + table.data = data; } /** - * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes. + * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx) -{ - uint8_t table[16]; - - std::memcpy(table + 0, t0.m, 4 * sizeof(int)); - - return vint4(table[idx.lane<0>()], - table[idx.lane<1>()], - table[idx.lane<2>()], - table[idx.lane<3>()]); +ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit( + const vtable4_16x8& table, + vint4 idx +) { + return vint4(table.data[idx.lane<0>()], + table.data[idx.lane<1>()], + table.data[idx.lane<2>()], + table.data[idx.lane<3>()]); } - /** - * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes. + * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices. 
*/ -ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx) -{ - uint8_t table[32]; - - std::memcpy(table + 0, t0.m, 4 * sizeof(int)); - std::memcpy(table + 16, t1.m, 4 * sizeof(int)); - - return vint4(table[idx.lane<0>()], - table[idx.lane<1>()], - table[idx.lane<2>()], - table[idx.lane<3>()]); +ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit( + const vtable4_32x8& table, + vint4 idx +) { + return vint4(table.data[idx.lane<0>()], + table.data[idx.lane<1>()], + table.data[idx.lane<2>()], + table.data[idx.lane<3>()]); } /** - * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes. + * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx) -{ - uint8_t table[64]; - - std::memcpy(table + 0, t0.m, 4 * sizeof(int)); - std::memcpy(table + 16, t1.m, 4 * sizeof(int)); - std::memcpy(table + 32, t2.m, 4 * sizeof(int)); - std::memcpy(table + 48, t3.m, 4 * sizeof(int)); - - return vint4(table[idx.lane<0>()], - table[idx.lane<1>()], - table[idx.lane<2>()], - table[idx.lane<3>()]); +ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit( + const vtable4_64x8& table, + vint4 idx +) { + return vint4(table.data[idx.lane<0>()], + table.data[idx.lane<1>()], + table.data[idx.lane<2>()], + table.data[idx.lane<3>()]); } /** diff --git a/Source/astcenc_vecmathlib_sse_4.h b/Source/astcenc_vecmathlib_sse_4.h index 111d9b9e5..5c726b6ab 100644 --- a/Source/astcenc_vecmathlib_sse_4.h +++ b/Source/astcenc_vecmathlib_sse_4.h @@ -1037,136 +1037,173 @@ ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 v) return vfloat4(_mm_castsi128_ps(v.m)); } +/* + * Table structure for a 16x 8-bit entry table. + */ +struct vtable4_16x8 { +#if ASTCENC_SSE >= 41 + vint4 t0; +#else + const uint8_t* data; +#endif +}; + +/* + * Table structure for a 32x 8-bit entry table. + */ +struct vtable4_32x8 { +#if ASTCENC_SSE >= 41 + vint4 t0; + vint4 t1; +#else + const uint8_t* data; +#endif +}; + +/* + * Table structure for a 64x 8-bit entry table. + */ +struct vtable4_64x8 { +#if ASTCENC_SSE >= 41 + vint4 t0; + vint4 t1; + vint4 t2; + vint4 t3; +#else + const uint8_t* data; +#endif +}; + /** - * @brief Prepare a vtable lookup table for use with the native SIMD size. + * @brief Prepare a vtable lookup table for 16x 8-bit entry table. */ -ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4& t0p) -{ - t0p = t0; +ASTCENC_SIMD_INLINE void vtable_prepare( + vtable4_16x8& table, + const uint8_t* data +) { +#if ASTCENC_SSE >= 41 + table.t0 = vint4::load(data); +#else + table.data = data; +#endif } /** - * @brief Prepare a vtable lookup table for use with the native SIMD size. + * @brief Prepare a vtable lookup table for 32x 8-bit entry table. */ -ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint4& t0p, vint4& t1p) -{ +ASTCENC_SIMD_INLINE void vtable_prepare( + vtable4_32x8& table, + const uint8_t* data +) { #if ASTCENC_SSE >= 41 - t0p = t0; - t1p = t0 ^ t1; + table.t0 = vint4::load(data); + table.t1 = vint4::load(data + 16); + + table.t1 = table.t1 ^ table.t0; #else - t0p = t0; - t1p = t1; + table.data = data; #endif } /** - * @brief Prepare a vtable lookup table for use with the native SIMD size. + * @brief Prepare a vtable lookup table 64x 8-bit entry table. 
 */
 ASTCENC_SIMD_INLINE void vtable_prepare(
-	vint4 t0, vint4 t1, vint4 t2, vint4 t3,
-	vint4& t0p, vint4& t1p, vint4& t2p, vint4& t3p)
-{
+	vtable4_64x8& table,
+	const uint8_t* data
+) {
 #if ASTCENC_SSE >= 41
-	t0p = t0;
-	t1p = t0 ^ t1;
-	t2p = t1 ^ t2;
-	t3p = t2 ^ t3;
+	table.t0 = vint4::load(data);
+	table.t1 = vint4::load(data + 16);
+	table.t2 = vint4::load(data + 32);
+	table.t3 = vint4::load(data + 48);
+
+	table.t3 = table.t3 ^ table.t2;
+	table.t2 = table.t2 ^ table.t1;
+	table.t1 = table.t1 ^ table.t0;
 #else
-	t0p = t0;
-	t1p = t1;
-	t2p = t2;
-	t3p = t3;
+	table.data = data;
 #endif
 }
 
 /**
- * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes.
+ * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices.
  */
-ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx)
-{
+ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
+	const vtable4_16x8& tbl,
+	vint4 idx
+) {
#if ASTCENC_SSE >= 41
 	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
 	__m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast<int>(0xFFFFFF00)));
 
-	__m128i result = _mm_shuffle_epi8(t0.m, idxx);
+	__m128i result = _mm_shuffle_epi8(tbl.t0.m, idxx);
 	return vint4(result);
 #else
-	uint8_t table[16];
-
-	std::memcpy(table + 0, &t0.m, 4 * sizeof(int));
-
-	return vint4(table[idx.lane<0>()],
-	             table[idx.lane<1>()],
-	             table[idx.lane<2>()],
-	             table[idx.lane<3>()]);
+	return vint4(tbl.data[idx.lane<0>()],
+	             tbl.data[idx.lane<1>()],
+	             tbl.data[idx.lane<2>()],
+	             tbl.data[idx.lane<3>()]);
 #endif
 }
 
 /**
- * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes.
+ * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices.
  */
-ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx)
-{
+ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
+	const vtable4_32x8& tbl,
+	vint4 idx
+) {
 #if ASTCENC_SSE >= 41
 	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
 	__m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast<int>(0xFFFFFF00)));
 
-	__m128i result = _mm_shuffle_epi8(t0.m, idxx);
+	__m128i result = _mm_shuffle_epi8(tbl.t0.m, idxx);
 	idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16));
 
-	__m128i result2 = _mm_shuffle_epi8(t1.m, idxx);
+	__m128i result2 = _mm_shuffle_epi8(tbl.t1.m, idxx);
 	result = _mm_xor_si128(result, result2);
 	return vint4(result);
 #else
-	uint8_t table[32];
-
-	std::memcpy(table + 0, &t0.m, 4 * sizeof(int));
-	std::memcpy(table + 16, &t1.m, 4 * sizeof(int));
-
-	return vint4(table[idx.lane<0>()],
-	             table[idx.lane<1>()],
-	             table[idx.lane<2>()],
-	             table[idx.lane<3>()]);
+	return vint4(tbl.data[idx.lane<0>()],
+	             tbl.data[idx.lane<1>()],
+	             tbl.data[idx.lane<2>()],
+	             tbl.data[idx.lane<3>()]);
 #endif
 }
 
 /**
- * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes.
+ * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices. 
 */
-ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx)
-{
+ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
+	const vtable4_64x8& tbl,
+	vint4 idx
+) {
 #if ASTCENC_SSE >= 41
 	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
 	__m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast<int>(0xFFFFFF00)));
 
-	__m128i result = _mm_shuffle_epi8(t0.m, idxx);
+	__m128i result = _mm_shuffle_epi8(tbl.t0.m, idxx);
 	idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16));
 
-	__m128i result2 = _mm_shuffle_epi8(t1.m, idxx);
+	__m128i result2 = _mm_shuffle_epi8(tbl.t1.m, idxx);
 	result = _mm_xor_si128(result, result2);
 	idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16));
 
-	result2 = _mm_shuffle_epi8(t2.m, idxx);
+	result2 = _mm_shuffle_epi8(tbl.t2.m, idxx);
 	result = _mm_xor_si128(result, result2);
 	idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16));
 
-	result2 = _mm_shuffle_epi8(t3.m, idxx);
+	result2 = _mm_shuffle_epi8(tbl.t3.m, idxx);
 	result = _mm_xor_si128(result, result2);
 	return vint4(result);
 #else
-	uint8_t table[64];
-
-	std::memcpy(table + 0, &t0.m, 4 * sizeof(int));
-	std::memcpy(table + 16, &t1.m, 4 * sizeof(int));
-	std::memcpy(table + 32, &t2.m, 4 * sizeof(int));
-	std::memcpy(table + 48, &t3.m, 4 * sizeof(int));
-
-	return vint4(table[idx.lane<0>()],
-	             table[idx.lane<1>()],
-	             table[idx.lane<2>()],
-	             table[idx.lane<3>()]);
+	return vint4(tbl.data[idx.lane<0>()],
+	             tbl.data[idx.lane<1>()],
+	             tbl.data[idx.lane<2>()],
+	             tbl.data[idx.lane<3>()]);
 #endif
 }
diff --git a/Source/astcenc_vecmathlib_sve_8.h b/Source/astcenc_vecmathlib_sve_8.h
index 169f28cb1..9b48411b3 100644
--- a/Source/astcenc_vecmathlib_sve_8.h
+++ b/Source/astcenc_vecmathlib_sve_8.h
@@ -906,90 +906,101 @@ ASTCENC_SIMD_INLINE vfloat8 int_as_float(vint8 a)
 	return vfloat8(svreinterpret_f32_s32(a.m));
 }
 
-/**
- * @brief Prepare a vtable lookup table for use with the native SIMD size.
+/*
+ * Table structure for a 16x 8-bit entry table.
  */
-ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint8& t0p)
-{
-	t0p = vint8(svdup_neonq_f32(t0.m));
-}
+struct vtable8_16x8 {
+	vint8 t0;
+};
 
-/**
- * @brief Prepare a vtable lookup table for use with the native SIMD size.
+/*
+ * Table structure for a 32x 8-bit entry table.
  */
-ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint8& t0p, vint8& t1p)
-{
-	// 8-wide SVE uses a single table register, so t1 is unused
-	(void)t1p;
+struct vtable8_32x8 {
+	vint8 t0;
+};
 
-	svfloat32_8_t t0v = svdup_neonq_f32(t0.m);
-	svfloat32_8_t t1v = svdup_neonq_f32(t1.m);
+/*
+ * Table structure for a 64x 8-bit entry table.
+ */
+struct vtable8_64x8 {
+	vint8 t0;
+	vint8 t1;
+};
 
-	t0p = vint8(svext_f32(t0v, t1v, 4));
+/**
+ * @brief Prepare a vtable lookup table for 16x 8-bit entry table.
+ */
+ASTCENC_SIMD_INLINE void vtable_prepare(
+	vtable8_16x8& table,
+	const uint8_t* data
+) {
+	// Top half of register will be zeros
+	table.t0 = vint8(svld1_u8(svptrue_pat_b8(SV_VL16), data));
 }
 
 /**
- * @brief Prepare a vtable lookup table for use with the native SIMD size.
+ * @brief Prepare a vtable lookup table for 32x 8-bit entry table. 
*/ ASTCENC_SIMD_INLINE void vtable_prepare( - vint4 t0, vint4 t1, vint4 t2, vint4 t3, - vint8& t0p, vint8& t1p, vint8& t2p, vint8& t3p) -{ - // 8-wide SVE uses a two table registers, so t2 and t3 are unused - (void)t2p; - (void)t3p; - - svfloat32_8_t t0v = svdup_neonq_f32(t0.m); - svfloat32_8_t t1v = svdup_neonq_f32(t1.m); - svfloat32_8_t t2v = svdup_neonq_f32(t2.m); - svfloat32_8_t t3v = svdup_neonq_f32(t3.m); + vtable8_32x8& table, + const uint8_t* data +) { + table.t0 = vint8(svld1_u8(svptrue_b8(), data)); +} - t0p = vint8(svext_f32(t0v, t1v, 4)); - t1p = vint8(svext_f32(t2v, t3v, 4)); +/** + * @brief Prepare a vtable lookup table 64x 8-bit entry table. + */ +ASTCENC_SIMD_INLINE void vtable_prepare( + vtable8_64x8& table, + const uint8_t* data +) { + table.t0 = vint8(svld1_u8(svptrue_b8(), data)); + table.t1 = vint8(svld1_u8(svptrue_b8(), data + 32)); } /** - * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes. + * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 idx) -{ +ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit( + const vtable8_16x8& tbl, + vint8 idx +) { // Set index byte above max index for unused bytes so table lookup returns zero svint32_8_t idx_masked = svorr_s32_x(svptrue_b32(), idx.m, svdup_s32(0xFFFFFF00)); svuint8_8_t idx_bytes = svreinterpret_u8_s32(idx_masked); - svuint8_8_t tbl_bytes = svreinterpret_u8_s32(t0.m); + svuint8_8_t tbl_bytes = svreinterpret_u8_s32(tbl.t0.m); svuint8_8_t result = svtbl_u8(tbl_bytes, idx_bytes); return vint8(svreinterpret_s32_u8(result)); } /** - * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes. + * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 idx) -{ - // 8-wide SVE uses a single table register, so t1 is unused - (void)t1; - +ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit( + const vtable8_32x8& tbl, + vint8 idx +) { // Set index byte above max index for unused bytes so table lookup returns zero svint32_8_t idx_masked = svorr_s32_x(svptrue_b32(), idx.m, svdup_s32(0xFFFFFF00)); svuint8_8_t idx_bytes = svreinterpret_u8_s32(idx_masked); - svuint8_8_t tbl_bytes = svreinterpret_u8_s32(t0.m); + svuint8_8_t tbl_bytes = svreinterpret_u8_s32(tbl.t0.m); svuint8_8_t result = svtbl_u8(tbl_bytes, idx_bytes); return vint8(svreinterpret_s32_u8(result)); } /** - * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes. + * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices. 
*/ -ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 t2, vint8 t3, vint8 idx) -{ - // 8-wide SVE uses a two table registers, so t2 and t3 are unused - (void)t2; - (void)t3; - +ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit( + const vtable8_64x8& tbl, + vint8 idx +) { // Set index byte above max index for unused bytes so table lookup returns zero svint32_8_t literal32 = svdup_s32(32); svbool_8_t idx_lo_select = svcmplt(svptrue_b32(), idx.m, literal32); @@ -999,8 +1010,8 @@ ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 t2, vint8 t3 svuint8_8_t idx_lo_bytes = svreinterpret_u8_s32(idx_lo_masked); svuint8_8_t idx_hi_bytes = svreinterpret_u8_s32(idx_hi_masked); - svuint8_8_t tbl0_bytes = svreinterpret_u8_s32(t0.m); - svuint8_8_t tbl1_bytes = svreinterpret_u8_s32(t1.m); + svuint8_8_t tbl0_bytes = svreinterpret_u8_s32(tbl.t0.m); + svuint8_8_t tbl1_bytes = svreinterpret_u8_s32(tbl.t1.m); svint32_8_t t0_lookup = svreinterpret_s32_u8(svtbl_u8(tbl0_bytes, idx_lo_bytes)); svint32_8_t t1_lookup = svreinterpret_s32_u8(svtbl_u8(tbl1_bytes, idx_hi_bytes)); diff --git a/Test/astc_profile_valgrind.py b/Test/astc_profile_valgrind.py index eb9a82a45..3cd08445b 100644 --- a/Test/astc_profile_valgrind.py +++ b/Test/astc_profile_valgrind.py @@ -125,7 +125,7 @@ def run_pass(image, noStartup, encoder, blocksize, quality): if noStartup: args = ["gprof2dot", "--format=callgrind", "--output=out.dot", "callgrind.txt", - "-s", "-z", "compress_block(astcenc_contexti const&, image_block const&, physical_compressed_block&, compression_working_buffers&)"] + "-s", "-z", "compress_block(astcenc_contexti const&, image_block const&, unsigned char*, compression_working_buffers&)"] else: args = ["gprof2dot", "--format=callgrind", "--output=out.dot", "callgrind.txt", "-s", "-z", "main"]
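
For orientation, the reworked API is a two-step pattern: build a backend-specific table object from a plain byte array with vtable_prepare(), then gather one byte per lane with vtable_lookup_32bit(). Below is a minimal sketch of a caller, illustrative only; it assumes the astcenc_vecmathlib.h from this patch is on the include path, the helper name remap_bytes and its parameters are hypothetical, indices are in the range 0..31, and count is a multiple of ASTCENC_SIMD_WIDTH.

#include <cstdint>

#include "astcenc_vecmathlib.h"

// Hypothetical helper: remap 32-bit indices through a 32-entry byte table
// at the native SIMD width (vint and vtable_32x8 resolve per backend).
static void remap_bytes(
	const uint8_t table_data[32],
	const int* indices,
	int* out,
	unsigned int count
) {
	// Prepare once; each backend stores the table however its lookup
	// needs it (XOR-chained SSE/AVX2 registers, NEON/SVE table registers,
	// or just the data pointer in the scalar reference path)
	vtable_32x8 table;
	vtable_prepare(table, table_data);

	for (unsigned int i = 0; i < count; i += ASTCENC_SIMD_WIDTH)
	{
		// Load native-width 32-bit indices, look up one byte per lane,
		// and store the zero-extended results
		vint idx(indices + i);
		vint values = vtable_lookup_32bit(table, idx);
		store(values, out + i);
	}
}

The same pattern covers the 16- and 64-entry cases via vtable_16x8 and vtable_64x8, which is what the call sites in astcenc_decompress_symbolic.cpp and astcenc_ideal_endpoints_and_weights.cpp above switch to.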