From 454b6de0d3a75a549d5cc56095f34bdeeab28a99 Mon Sep 17 00:00:00 2001 From: Pete Harris Date: Thu, 15 Aug 2024 10:02:31 +0100 Subject: [PATCH] Improve SIMD vtable API (#492) --- Source/UnitTest/test_simd.cpp | 183 ++++++++++++------ Source/astcenc_decompress_symbolic.cpp | 27 +-- .../astcenc_ideal_endpoints_and_weights.cpp | 19 +- Source/astcenc_vecmathlib.h | 20 ++ Source/astcenc_vecmathlib_avx2_8.h | 130 ++++++++----- Source/astcenc_vecmathlib_neon_4.h | 108 ++++++----- Source/astcenc_vecmathlib_none_4.h | 120 ++++++------ Source/astcenc_vecmathlib_sse_4.h | 171 +++++++++------- Source/astcenc_vecmathlib_sve_8.h | 109 ++++++----- Test/astc_profile_valgrind.py | 2 +- 10 files changed, 544 insertions(+), 345 deletions(-) diff --git a/Source/UnitTest/test_simd.cpp b/Source/UnitTest/test_simd.cpp index f857c3550..24fb63f53 100644 --- a/Source/UnitTest/test_simd.cpp +++ b/Source/UnitTest/test_simd.cpp @@ -1947,43 +1947,78 @@ TEST(vmask4, not) } /** @brief Test vint4 table permute. */ -TEST(vint4, vtable_8bt_32bi_32entry) +TEST(vint4, vtable4_16x8) { - vint4 table0(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f); - vint4 table1(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f); + uint8_t data[16] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f + }; - vint4 table0p, table1p; - vtable_prepare(table0, table1, table0p, table1p); + vtable4_16x8 table; + vtable_prepare(table, data); - vint4 index(0, 7, 4, 31); + vint4 index(0, 7, 4, 15); - vint4 result = vtable_8bt_32bi(table0p, table1p, index); + vint4 result = vtable_lookup_32bit(table, index); - EXPECT_EQ(result.lane<0>(), 3); - EXPECT_EQ(result.lane<1>(), 4); - EXPECT_EQ(result.lane<2>(), 7); - EXPECT_EQ(result.lane<3>(), 28); + EXPECT_EQ(result.lane<0>(), 0); + EXPECT_EQ(result.lane<1>(), 7); + EXPECT_EQ(result.lane<2>(), 4); + EXPECT_EQ(result.lane<3>(), 15); } /** @brief Test vint4 table permute. */ -TEST(vint4, vtable_8bt_32bi_64entry) +TEST(vint4, vtable4_32x8) { - vint4 table0(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f); - vint4 table1(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f); - vint4 table2(0x20212223, 0x24252627, 0x28292a2b, 0x2c2d2e2f); - vint4 table3(0x30313233, 0x34353637, 0x38393a3b, 0x3c3d3e3f); + uint8_t data[32] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f + }; + + vtable4_32x8 table; + vtable_prepare(table, data); + + vint4 index(0, 7, 4, 31); + + vint4 result = vtable_lookup_32bit(table, index); - vint4 table0p, table1p, table2p, table3p; - vtable_prepare(table0, table1, table2, table3, table0p, table1p, table2p, table3p); + EXPECT_EQ(result.lane<0>(), 0); + EXPECT_EQ(result.lane<1>(), 7); + EXPECT_EQ(result.lane<2>(), 4); + EXPECT_EQ(result.lane<3>(), 31); +} + +/** @brief Test vint4 table permute. 
*/
+TEST(vint4, vtable4_64x8)
+{
+	uint8_t data[64] = {
+		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+		0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+		0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+		0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+		0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+		0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+		0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f
+	};
+
+	vtable4_64x8 table;
+	vtable_prepare(table, data);
 
 	vint4 index(0, 7, 38, 63);
 
-	vint4 result = vtable_8bt_32bi(table0p, table1p, table2p, table3p, index);
+	vint4 result = vtable_lookup_32bit(table, index);
 
-	EXPECT_EQ(result.lane<0>(), 3);
-	EXPECT_EQ(result.lane<1>(), 4);
-	EXPECT_EQ(result.lane<2>(), 37);
-	EXPECT_EQ(result.lane<3>(), 60);
+	EXPECT_EQ(result.lane<0>(), 0);
+	EXPECT_EQ(result.lane<1>(), 7);
+	EXPECT_EQ(result.lane<2>(), 38);
+	EXPECT_EQ(result.lane<3>(), 63);
 }
 
 /** @brief Test vint4 rgba byte interleave. */
@@ -3657,57 +3692,95 @@ TEST(vmask8, not)
 }
 
 /** @brief Test vint8 table permute. */
-TEST(vint8, vtable_8bt_32bi_32entry)
+TEST(vint8, vtable8_16x8)
 {
-	vint4 table0(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
-	vint4 table1(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f);
+	uint8_t data[16] = {
+		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+	};
 
-	vint8 table0p, table1p;
-	vtable_prepare(table0, table1, table0p, table1p);
+	vtable8_16x8 table;
+	vtable_prepare(table, data);
 
-	vint8 index = vint8_lit(0, 7, 4, 15, 16, 20, 23, 31);
+	vint8 index = vint8_lit(0, 7, 4, 15, 1, 2, 14, 4);
 
-	vint8 result = vtable_8bt_32bi(table0p, table1p, index);
+	vint8 result = vtable_lookup_32bit(table, index);
 
 	alignas(32) int ra[8];
 	store(result, ra);
 
-	EXPECT_EQ(ra[0], 3);
-	EXPECT_EQ(ra[1], 4);
-	EXPECT_EQ(ra[2], 7);
-	EXPECT_EQ(ra[3], 12);
-	EXPECT_EQ(ra[4], 19);
-	EXPECT_EQ(ra[5], 23);
-	EXPECT_EQ(ra[6], 20);
-	EXPECT_EQ(ra[7], 28);
+	EXPECT_EQ(ra[0], 0);
+	EXPECT_EQ(ra[1], 7);
+	EXPECT_EQ(ra[2], 4);
+	EXPECT_EQ(ra[3], 15);
+	EXPECT_EQ(ra[4], 1);
+	EXPECT_EQ(ra[5], 2);
+	EXPECT_EQ(ra[6], 14);
+	EXPECT_EQ(ra[7], 4);
 }
 
-/** @brief Test vint4 table permute. */
-TEST(vint8, vtable_8bt_32bi_64entry)
+/** @brief Test vint8 table permute. */
+TEST(vint8, vtable8_32x8)
 {
-	vint4 table0(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
-	vint4 table1(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f);
-	vint4 table2(0x20212223, 0x24252627, 0x28292a2b, 0x2c2d2e2f);
-	vint4 table3(0x30313233, 0x34353637, 0x38393a3b, 0x3c3d3e3f);
+	uint8_t data[32] = {
+		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+		0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+		0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
+	};
+
+	vtable8_32x8 table;
+	vtable_prepare(table, data);
+
+	vint8 index = vint8_lit(0, 7, 4, 15, 16, 20, 23, 31);
 
-	vint8 table0p, table1p, table2p, table3p;
-	vtable_prepare(table0, table1, table2, table3, table0p, table1p, table2p, table3p);
+	vint8 result = vtable_lookup_32bit(table, index);
+
+	alignas(32) int ra[8];
+	store(result, ra);
+
+	EXPECT_EQ(ra[0], 0);
+	EXPECT_EQ(ra[1], 7);
+	EXPECT_EQ(ra[2], 4);
+	EXPECT_EQ(ra[3], 15);
+	EXPECT_EQ(ra[4], 16);
+	EXPECT_EQ(ra[5], 20);
+	EXPECT_EQ(ra[6], 23);
+	EXPECT_EQ(ra[7], 31);
+}
+
+/** @brief Test vint8 table permute. 
*/ +TEST(vint8, vtable8_64x8) +{ + uint8_t data[64] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f + }; + + vtable8_64x8 table; + vtable_prepare(table, data); vint8 index = vint8_lit(0, 7, 4, 15, 16, 20, 38, 63); - vint8 result = vtable_8bt_32bi(table0p, table1p, table2p, table3p, index); + vint8 result = vtable_lookup_32bit(table, index); alignas(32) int ra[8]; store(result, ra); - EXPECT_EQ(ra[0], 3); - EXPECT_EQ(ra[1], 4); - EXPECT_EQ(ra[2], 7); - EXPECT_EQ(ra[3], 12); - EXPECT_EQ(ra[4], 19); - EXPECT_EQ(ra[5], 23); - EXPECT_EQ(ra[6], 37); - EXPECT_EQ(ra[7], 60); + EXPECT_EQ(ra[0], 0); + EXPECT_EQ(ra[1], 7); + EXPECT_EQ(ra[2], 4); + EXPECT_EQ(ra[3], 15); + EXPECT_EQ(ra[4], 16); + EXPECT_EQ(ra[5], 20); + EXPECT_EQ(ra[6], 38); + EXPECT_EQ(ra[7], 63); } #endif diff --git a/Source/astcenc_decompress_symbolic.cpp b/Source/astcenc_decompress_symbolic.cpp index 902a3f3e9..e7791eef6 100644 --- a/Source/astcenc_decompress_symbolic.cpp +++ b/Source/astcenc_decompress_symbolic.cpp @@ -98,13 +98,8 @@ void unpack_weights( if (!is_dual_plane) { // Build full 64-entry weight lookup table - vint4 tab0 = vint4::load(scb.weights + 0); - vint4 tab1 = vint4::load(scb.weights + 16); - vint4 tab2 = vint4::load(scb.weights + 32); - vint4 tab3 = vint4::load(scb.weights + 48); - - vint tab0p, tab1p, tab2p, tab3p; - vtable_prepare(tab0, tab1, tab2, tab3, tab0p, tab1p, tab2p, tab3p); + vtable_64x8 table; + vtable_prepare(table, scb.weights); for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH) { @@ -118,7 +113,7 @@ void unpack_weights( vint texel_weights(di.texel_weights_tr[j] + i); vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i); - summed_value += vtable_8bt_32bi(tab0p, tab1p, tab2p, tab3p, texel_weights) * texel_weights_int; + summed_value += vtable_lookup_32bit(table, texel_weights) * texel_weights_int; } store(lsr<4>(summed_value), weights_plane1 + i); @@ -128,16 +123,12 @@ void unpack_weights( { // Build a 32-entry weight lookup table per plane // Plane 1 - vint4 tab0_plane1 = vint4::load(scb.weights + 0); - vint4 tab1_plane1 = vint4::load(scb.weights + 16); - vint tab0_plane1p, tab1_plane1p; - vtable_prepare(tab0_plane1, tab1_plane1, tab0_plane1p, tab1_plane1p); + vtable_32x8 tab_plane1; + vtable_prepare(tab_plane1, scb.weights); // Plane 2 - vint4 tab0_plane2 = vint4::load(scb.weights + 32); - vint4 tab1_plane2 = vint4::load(scb.weights + 48); - vint tab0_plane2p, tab1_plane2p; - vtable_prepare(tab0_plane2, tab1_plane2, tab0_plane2p, tab1_plane2p); + vtable_32x8 tab_plane2; + vtable_prepare(tab_plane2, scb.weights + 32); for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH) { @@ -153,8 +144,8 @@ void unpack_weights( vint texel_weights(di.texel_weights_tr[j] + i); vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i); - sum_plane1 += vtable_8bt_32bi(tab0_plane1p, tab1_plane1p, texel_weights) * texel_weights_int; - sum_plane2 += vtable_8bt_32bi(tab0_plane2p, tab1_plane2p, texel_weights) * texel_weights_int; + sum_plane1 += vtable_lookup_32bit(tab_plane1, texel_weights) * texel_weights_int; + sum_plane2 += vtable_lookup_32bit(tab_plane2, texel_weights) * texel_weights_int; } store(lsr<4>(sum_plane1), 
weights_plane1 + i); diff --git a/Source/astcenc_ideal_endpoints_and_weights.cpp b/Source/astcenc_ideal_endpoints_and_weights.cpp index 3442464d5..ec680dd5e 100644 --- a/Source/astcenc_ideal_endpoints_and_weights.cpp +++ b/Source/astcenc_ideal_endpoints_and_weights.cpp @@ -1023,9 +1023,8 @@ void compute_quantized_weights_for_decimation( // safe data in compute_ideal_weights_for_decimation and arrays are always 64 elements if (get_quant_level(quant_level) <= 16) { - vint4 tab0 = vint4::load(qat.quant_to_unquant); - vint tab0p; - vtable_prepare(tab0, tab0p); + vtable_16x8 table; + vtable_prepare(table, qat.quant_to_unquant); for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH) { @@ -1038,8 +1037,8 @@ void compute_quantized_weights_for_decimation( vint weightl = float_to_int(ix1); vint weighth = min(weightl + vint(1), steps_m1); - vint ixli = vtable_8bt_32bi(tab0p, weightl); - vint ixhi = vtable_8bt_32bi(tab0p, weighth); + vint ixli = vtable_lookup_32bit(table, weightl); + vint ixhi = vtable_lookup_32bit(table, weighth); vfloat ixl = int_to_float(ixli); vfloat ixh = int_to_float(ixhi); @@ -1055,10 +1054,8 @@ void compute_quantized_weights_for_decimation( } else { - vint4 tab0 = vint4::load(qat.quant_to_unquant + 0); - vint4 tab1 = vint4::load(qat.quant_to_unquant + 16); - vint tab0p, tab1p; - vtable_prepare(tab0, tab1, tab0p, tab1p); + vtable_32x8 table; + vtable_prepare(table, qat.quant_to_unquant); for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH) { @@ -1071,8 +1068,8 @@ void compute_quantized_weights_for_decimation( vint weightl = float_to_int(ix1); vint weighth = min(weightl + vint(1), steps_m1); - vint ixli = vtable_8bt_32bi(tab0p, tab1p, weightl); - vint ixhi = vtable_8bt_32bi(tab0p, tab1p, weighth); + vint ixli = vtable_lookup_32bit(table, weightl); + vint ixhi = vtable_lookup_32bit(table, weighth); vfloat ixl = int_to_float(ixli); vfloat ixh = int_to_float(ixhi); diff --git a/Source/astcenc_vecmathlib.h b/Source/astcenc_vecmathlib.h index 628755619..b41f6fa3a 100644 --- a/Source/astcenc_vecmathlib.h +++ b/Source/astcenc_vecmathlib.h @@ -96,6 +96,10 @@ using vint = vint8; using vmask = vmask8; + using vtable_16x8 = vtable8_16x8; + using vtable_32x8 = vtable8_32x8; + using vtable_64x8 = vtable8_64x8; + constexpr auto loada = vfloat8::loada; constexpr auto load1 = vfloat8::load1; @@ -111,6 +115,10 @@ using vint = vint4; using vmask = vmask4; + using vtable_16x8 = vtable4_16x8; + using vtable_32x8 = vtable4_32x8; + using vtable_64x8 = vtable4_64x8; + constexpr auto loada = vfloat4::loada; constexpr auto load1 = vfloat4::load1; @@ -138,6 +146,10 @@ using vint = vint8; using vmask = vmask8; + using vtable_16x8 = vtable8_16x8; + using vtable_32x8 = vtable8_32x8; + using vtable_64x8 = vtable8_64x8; + constexpr auto loada = vfloat8::loada; constexpr auto load1 = vfloat8::load1; @@ -153,6 +165,10 @@ using vint = vint4; using vmask = vmask4; + using vtable_16x8 = vtable4_16x8; + using vtable_32x8 = vtable4_32x8; + using vtable_64x8 = vtable4_64x8; + constexpr auto loada = vfloat4::loada; constexpr auto load1 = vfloat4::load1; @@ -185,6 +201,10 @@ using vint = vint4; using vmask = vmask4; + using vtable_16x8 = vtable4_16x8; + using vtable_32x8 = vtable4_32x8; + using vtable_64x8 = vtable4_64x8; + constexpr auto loada = vfloat4::loada; constexpr auto load1 = vfloat4::load1; #endif diff --git a/Source/astcenc_vecmathlib_avx2_8.h b/Source/astcenc_vecmathlib_avx2_8.h index 347abf83c..7c75818ae 100644 --- a/Source/astcenc_vecmathlib_avx2_8.h +++ b/Source/astcenc_vecmathlib_avx2_8.h @@ 
-971,98 +971,140 @@ ASTCENC_SIMD_INLINE vfloat8 int_as_float(vint8 a) return vfloat8(_mm256_castsi256_ps(a.m)); } +/* + * Table structure for a 16x 8-bit entry table. + */ +struct vtable8_16x8 { + vint8 t0; +}; + +/* + * Table structure for a 32x 8-bit entry table. + */ +struct vtable8_32x8 { + vint8 t0; + vint8 t1; +}; + +/* + * Table structure for a 64x 8-bit entry table. + */ +struct vtable8_64x8 { + vint8 t0; + vint8 t1; + vint8 t2; + vint8 t3; +}; + /** - * @brief Prepare a vtable lookup table for use with the native SIMD size. + * @brief Prepare a vtable lookup table for 16x 8-bit entry table. */ -ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint8& t0p) -{ - // AVX2 duplicates the table within each 128-bit lane - __m128i t0n = t0.m; - t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n)); +ASTCENC_SIMD_INLINE void vtable_prepare( + vtable8_16x8& table, + const uint8_t* data +) { + // AVX2 tables duplicate table entries in each 128-bit half-register + vint4 d0 = vint4::load(data); + + table.t0 = vint8(astcenc_mm256_set_m128i(d0.m, d0.m)); } /** - * @brief Prepare a vtable lookup table for use with the native SIMD size. + * @brief Prepare a vtable lookup table for 32x 8-bit entry table. */ -ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint8& t0p, vint8& t1p) -{ - // AVX2 duplicates the table within each 128-bit lane - __m128i t0n = t0.m; - t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n)); +ASTCENC_SIMD_INLINE void vtable_prepare( + vtable8_32x8& table, + const uint8_t* data +) { + // AVX2 tables duplicate table entries in each 128-bit half-register + vint4 d0 = vint4::load(data); + vint4 d1 = vint4::load(data + 16); + + table.t0 = vint8(astcenc_mm256_set_m128i(d0.m, d0.m)); + table.t1 = vint8(astcenc_mm256_set_m128i(d1.m, d1.m)); - __m128i t1n = _mm_xor_si128(t0.m, t1.m); - t1p = vint8(astcenc_mm256_set_m128i(t1n, t1n)); + // XOR chain the high rows to allow table emulation + table.t1 = table.t1 ^ table.t0; } /** - * @brief Prepare a vtable lookup table for use with the native SIMD size. + * @brief Prepare a vtable lookup table 64x 8-bit entry table. */ ASTCENC_SIMD_INLINE void vtable_prepare( - vint4 t0, vint4 t1, vint4 t2, vint4 t3, - vint8& t0p, vint8& t1p, vint8& t2p, vint8& t3p) -{ - // AVX2 duplicates the table within each 128-bit lane - __m128i t0n = t0.m; - t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n)); - - __m128i t1n = _mm_xor_si128(t0.m, t1.m); - t1p = vint8(astcenc_mm256_set_m128i(t1n, t1n)); + vtable8_64x8& table, + const uint8_t* data +) { + // AVX2 tables duplicate table entries in each 128-bit half-register + vint4 d0 = vint4::load(data); + vint4 d1 = vint4::load(data + 16); + vint4 d2 = vint4::load(data + 32); + vint4 d3 = vint4::load(data + 48); - __m128i t2n = _mm_xor_si128(t1.m, t2.m); - t2p = vint8(astcenc_mm256_set_m128i(t2n, t2n)); + table.t0 = vint8(astcenc_mm256_set_m128i(d0.m, d0.m)); + table.t1 = vint8(astcenc_mm256_set_m128i(d1.m, d1.m)); + table.t2 = vint8(astcenc_mm256_set_m128i(d2.m, d2.m)); + table.t3 = vint8(astcenc_mm256_set_m128i(d3.m, d3.m)); - __m128i t3n = _mm_xor_si128(t2.m, t3.m); - t3p = vint8(astcenc_mm256_set_m128i(t3n, t3n)); + // XOR chain the high rows to allow table emulation + table.t3 = table.t3 ^ table.t2; + table.t2 = table.t2 ^ table.t1; + table.t1 = table.t1 ^ table.t0; } /** - * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes. + * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices. 
 */
-ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 idx)
-{
+ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit(
+	const vtable8_16x8& tbl,
+	vint8 idx
+) {
 	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
 	__m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00)));
 
-	__m256i result = _mm256_shuffle_epi8(t0.m, idxx);
+	__m256i result = _mm256_shuffle_epi8(tbl.t0.m, idxx);
 	return vint8(result);
 }
 
 /**
- * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes.
+ * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices.
  */
-ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 idx)
-{
+ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit(
+	const vtable8_32x8& tbl,
+	vint8 idx
+) {
 	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
 	__m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00)));
 
-	__m256i result = _mm256_shuffle_epi8(t0.m, idxx);
+	__m256i result = _mm256_shuffle_epi8(tbl.t0.m, idxx);
 	idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));
 
-	__m256i result2 = _mm256_shuffle_epi8(t1.m, idxx);
+	__m256i result2 = _mm256_shuffle_epi8(tbl.t1.m, idxx);
 	result = _mm256_xor_si256(result, result2);
 	return vint8(result);
 }
 
 /**
- * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes.
+ * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices.
  */
-ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 t2, vint8 t3, vint8 idx)
-{
+ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit(
+	const vtable8_64x8& tbl,
+	vint8 idx
+) {
 	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
 	__m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00)));
 
-	__m256i result = _mm256_shuffle_epi8(t0.m, idxx);
+	__m256i result = _mm256_shuffle_epi8(tbl.t0.m, idxx);
 	idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));
 
-	__m256i result2 = _mm256_shuffle_epi8(t1.m, idxx);
+	__m256i result2 = _mm256_shuffle_epi8(tbl.t1.m, idxx);
 	result = _mm256_xor_si256(result, result2);
 	idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));
 
-	result2 = _mm256_shuffle_epi8(t2.m, idxx);
+	result2 = _mm256_shuffle_epi8(tbl.t2.m, idxx);
 	result = _mm256_xor_si256(result, result2);
 	idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));
 
-	result2 = _mm256_shuffle_epi8(t3.m, idxx);
+	result2 = _mm256_shuffle_epi8(tbl.t3.m, idxx);
 	result = _mm256_xor_si256(result, result2);
 
 	return vint8(result);
diff --git a/Source/astcenc_vecmathlib_neon_4.h b/Source/astcenc_vecmathlib_neon_4.h
index e15d4f9ef..c7ff01289 100644
--- a/Source/astcenc_vecmathlib_neon_4.h
+++ b/Source/astcenc_vecmathlib_neon_4.h
@@ -939,87 +939,105 @@ ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 v)
 	return vfloat4(vreinterpretq_f32_s32(v.m));
 }
 
-/**
- * @brief Prepare a vtable lookup table for use with the native SIMD size.
+/*
+ * Table structure for a 16x 8-bit entry table.
  */
-ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4& t0p)
-{
-	t0p = t0;
-}
+struct vtable4_16x8 {
+	uint8x16_t t0;
+};
+
+/*
+ * Table structure for a 32x 8-bit entry table.
+ */
+struct vtable4_32x8 {
+	uint8x16x2_t t01;
+};
+
+/*
+ * Table structure for a 64x 8-bit entry table.
+ */
+struct vtable4_64x8 {
+	uint8x16x4_t t0123;
+};
 
 /**
- * @brief Prepare a vtable lookup table for use with the native SIMD size.
+ * @brief Prepare a vtable lookup table for 16x 8-bit entry table. 
*/ -ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint4& t0p, vint4& t1p) -{ - t0p = t0; - t1p = t1; +ASTCENC_SIMD_INLINE void vtable_prepare( + vtable4_16x8& table, + const uint8_t* data +) { + table.t0 = vld1q_u8(data); } /** - * @brief Prepare a vtable lookup table for use with the native SIMD size. + * @brief Prepare a vtable lookup table for 32x 8-bit entry table. */ ASTCENC_SIMD_INLINE void vtable_prepare( - vint4 t0, vint4 t1, vint4 t2, vint4 t3, - vint4& t0p, vint4& t1p, vint4& t2p, vint4& t3p) -{ - t0p = t0; - t1p = t1; - t2p = t2; - t3p = t3; + vtable4_32x8& table, + const uint8_t* data +) { + table.t01 = uint8x16x2_t { + vld1q_u8(data), + vld1q_u8(data + 16) + }; } /** - * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes. + * @brief Prepare a vtable lookup table 64x 8-bit entry table. */ -ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx) -{ - int8x16_t table { - vreinterpretq_s8_s32(t0.m) +ASTCENC_SIMD_INLINE void vtable_prepare( + vtable4_64x8& table, + const uint8_t* data +) { + table.t0123 = uint8x16x4_t { + vld1q_u8(data), + vld1q_u8(data + 16), + vld1q_u8(data + 32), + vld1q_u8(data + 48) }; +} +/** + * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices. + */ +ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit( + const vtable4_16x8& tbl, + vint4 idx +) { // Set index byte above max index for unused bytes so table lookup returns zero int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00)); uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked); - return vint4(vreinterpretq_s32_s8(vqtbl1q_s8(table, idx_bytes))); + return vint4(vreinterpretq_s32_u8(vqtbl1q_u8(tbl.t0, idx_bytes))); } /** - * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes. + * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx) -{ - int8x16x2_t table { - vreinterpretq_s8_s32(t0.m), - vreinterpretq_s8_s32(t1.m) - }; - +ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit( + const vtable4_32x8& tbl, + vint4 idx +) { // Set index byte above max index for unused bytes so table lookup returns zero int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00)); uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked); - return vint4(vreinterpretq_s32_s8(vqtbl2q_s8(table, idx_bytes))); + return vint4(vreinterpretq_s32_u8(vqtbl2q_u8(tbl.t01, idx_bytes))); } /** - * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes. + * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices. 
*/ -ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx) -{ - int8x16x4_t table { - vreinterpretq_s8_s32(t0.m), - vreinterpretq_s8_s32(t1.m), - vreinterpretq_s8_s32(t2.m), - vreinterpretq_s8_s32(t3.m) - }; - +ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit( + const vtable4_64x8& tbl, + vint4 idx +) { // Set index byte above max index for unused bytes so table lookup returns zero int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00)); uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked); - return vint4(vreinterpretq_s32_s8(vqtbl4q_s8(table, idx_bytes))); + return vint4(vreinterpretq_s32_u8(vqtbl4q_u8(tbl.t0123, idx_bytes))); } /** diff --git a/Source/astcenc_vecmathlib_none_4.h b/Source/astcenc_vecmathlib_none_4.h index 55981e4c3..4646e84ad 100644 --- a/Source/astcenc_vecmathlib_none_4.h +++ b/Source/astcenc_vecmathlib_none_4.h @@ -1067,84 +1067,94 @@ ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 a) return r; } +/* + * Table structure for a 16x 8-bit entry table. + */ +struct vtable4_16x8 { + const uint8_t* data; +}; + +/* + * Table structure for a 32x 8-bit entry table. + */ +struct vtable4_32x8 { + const uint8_t* data; +}; + +/* + * Table structure for a 64x 8-bit entry table. + */ +struct vtable4_64x8 { + const uint8_t* data; +}; + /** - * @brief Prepare a vtable lookup table for use with the native SIMD size. + * @brief Prepare a vtable lookup table for 16x 8-bit entry table. */ -ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4& t0p) -{ - t0p = t0; +ASTCENC_SIMD_INLINE void vtable_prepare( + vtable4_16x8& table, + const uint8_t* data +) { + table.data = data; } /** - * @brief Prepare a vtable lookup table for use with the native SIMD size. + * @brief Prepare a vtable lookup table for 32x 8-bit entry table. */ -ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint4& t0p, vint4& t1p) -{ - t0p = t0; - t1p = t1; +ASTCENC_SIMD_INLINE void vtable_prepare( + vtable4_32x8& table, + const uint8_t* data +) { + table.data = data; } /** - * @brief Prepare a vtable lookup table for use with the native SIMD size. + * @brief Prepare a vtable lookup table 64x 8-bit entry table. */ ASTCENC_SIMD_INLINE void vtable_prepare( - vint4 t0, vint4 t1, vint4 t2, vint4 t3, - vint4& t0p, vint4& t1p, vint4& t2p, vint4& t3p) -{ - t0p = t0; - t1p = t1; - t2p = t2; - t3p = t3; + vtable4_64x8& table, + const uint8_t* data +) { + table.data = data; } /** - * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes. + * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx) -{ - uint8_t table[16]; - - std::memcpy(table + 0, t0.m, 4 * sizeof(int)); - - return vint4(table[idx.lane<0>()], - table[idx.lane<1>()], - table[idx.lane<2>()], - table[idx.lane<3>()]); +ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit( + const vtable4_16x8& table, + vint4 idx +) { + return vint4(table.data[idx.lane<0>()], + table.data[idx.lane<1>()], + table.data[idx.lane<2>()], + table.data[idx.lane<3>()]); } - /** - * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes. + * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices. 
*/ -ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx) -{ - uint8_t table[32]; - - std::memcpy(table + 0, t0.m, 4 * sizeof(int)); - std::memcpy(table + 16, t1.m, 4 * sizeof(int)); - - return vint4(table[idx.lane<0>()], - table[idx.lane<1>()], - table[idx.lane<2>()], - table[idx.lane<3>()]); +ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit( + const vtable4_32x8& table, + vint4 idx +) { + return vint4(table.data[idx.lane<0>()], + table.data[idx.lane<1>()], + table.data[idx.lane<2>()], + table.data[idx.lane<3>()]); } /** - * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes. + * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx) -{ - uint8_t table[64]; - - std::memcpy(table + 0, t0.m, 4 * sizeof(int)); - std::memcpy(table + 16, t1.m, 4 * sizeof(int)); - std::memcpy(table + 32, t2.m, 4 * sizeof(int)); - std::memcpy(table + 48, t3.m, 4 * sizeof(int)); - - return vint4(table[idx.lane<0>()], - table[idx.lane<1>()], - table[idx.lane<2>()], - table[idx.lane<3>()]); +ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit( + const vtable4_64x8& table, + vint4 idx +) { + return vint4(table.data[idx.lane<0>()], + table.data[idx.lane<1>()], + table.data[idx.lane<2>()], + table.data[idx.lane<3>()]); } /** diff --git a/Source/astcenc_vecmathlib_sse_4.h b/Source/astcenc_vecmathlib_sse_4.h index 111d9b9e5..5c726b6ab 100644 --- a/Source/astcenc_vecmathlib_sse_4.h +++ b/Source/astcenc_vecmathlib_sse_4.h @@ -1037,136 +1037,173 @@ ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 v) return vfloat4(_mm_castsi128_ps(v.m)); } +/* + * Table structure for a 16x 8-bit entry table. + */ +struct vtable4_16x8 { +#if ASTCENC_SSE >= 41 + vint4 t0; +#else + const uint8_t* data; +#endif +}; + +/* + * Table structure for a 32x 8-bit entry table. + */ +struct vtable4_32x8 { +#if ASTCENC_SSE >= 41 + vint4 t0; + vint4 t1; +#else + const uint8_t* data; +#endif +}; + +/* + * Table structure for a 64x 8-bit entry table. + */ +struct vtable4_64x8 { +#if ASTCENC_SSE >= 41 + vint4 t0; + vint4 t1; + vint4 t2; + vint4 t3; +#else + const uint8_t* data; +#endif +}; + /** - * @brief Prepare a vtable lookup table for use with the native SIMD size. + * @brief Prepare a vtable lookup table for 16x 8-bit entry table. */ -ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4& t0p) -{ - t0p = t0; +ASTCENC_SIMD_INLINE void vtable_prepare( + vtable4_16x8& table, + const uint8_t* data +) { +#if ASTCENC_SSE >= 41 + table.t0 = vint4::load(data); +#else + table.data = data; +#endif } /** - * @brief Prepare a vtable lookup table for use with the native SIMD size. + * @brief Prepare a vtable lookup table for 32x 8-bit entry table. */ -ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint4& t0p, vint4& t1p) -{ +ASTCENC_SIMD_INLINE void vtable_prepare( + vtable4_32x8& table, + const uint8_t* data +) { #if ASTCENC_SSE >= 41 - t0p = t0; - t1p = t0 ^ t1; + table.t0 = vint4::load(data); + table.t1 = vint4::load(data + 16); + + table.t1 = table.t1 ^ table.t0; #else - t0p = t0; - t1p = t1; + table.data = data; #endif } /** - * @brief Prepare a vtable lookup table for use with the native SIMD size. + * @brief Prepare a vtable lookup table 64x 8-bit entry table. 
 */
 ASTCENC_SIMD_INLINE void vtable_prepare(
-	vint4 t0, vint4 t1, vint4 t2, vint4 t3,
-	vint4& t0p, vint4& t1p, vint4& t2p, vint4& t3p)
-{
+	vtable4_64x8& table,
+	const uint8_t* data
+) {
 #if ASTCENC_SSE >= 41
-	t0p = t0;
-	t1p = t0 ^ t1;
-	t2p = t1 ^ t2;
-	t3p = t2 ^ t3;
+	table.t0 = vint4::load(data);
+	table.t1 = vint4::load(data + 16);
+	table.t2 = vint4::load(data + 32);
+	table.t3 = vint4::load(data + 48);
+
+	table.t3 = table.t3 ^ table.t2;
+	table.t2 = table.t2 ^ table.t1;
+	table.t1 = table.t1 ^ table.t0;
 #else
-	t0p = t0;
-	t1p = t1;
-	t2p = t2;
-	t3p = t3;
+	table.data = data;
 #endif
 }
 
 /**
- * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes.
+ * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices.
  */
-ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx)
-{
+ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
+	const vtable4_16x8& tbl,
+	vint4 idx
+) {
#if ASTCENC_SSE >= 41
 	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
 	__m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast<int>(0xFFFFFF00)));
 
-	__m128i result = _mm_shuffle_epi8(t0.m, idxx);
+	__m128i result = _mm_shuffle_epi8(tbl.t0.m, idxx);
 	return vint4(result);
 #else
-	uint8_t table[16];
-
-	std::memcpy(table + 0, &t0.m, 4 * sizeof(int));
-
-	return vint4(table[idx.lane<0>()],
-	             table[idx.lane<1>()],
-	             table[idx.lane<2>()],
-	             table[idx.lane<3>()]);
+	return vint4(tbl.data[idx.lane<0>()],
+	             tbl.data[idx.lane<1>()],
+	             tbl.data[idx.lane<2>()],
+	             tbl.data[idx.lane<3>()]);
 #endif
 }
 
 /**
- * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes.
+ * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices.
  */
-ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx)
-{
+ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
+	const vtable4_32x8& tbl,
+	vint4 idx
+) {
 #if ASTCENC_SSE >= 41
 	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
 	__m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast<int>(0xFFFFFF00)));
 
-	__m128i result = _mm_shuffle_epi8(t0.m, idxx);
+	__m128i result = _mm_shuffle_epi8(tbl.t0.m, idxx);
 	idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16));
 
-	__m128i result2 = _mm_shuffle_epi8(t1.m, idxx);
+	__m128i result2 = _mm_shuffle_epi8(tbl.t1.m, idxx);
 	result = _mm_xor_si128(result, result2);
 	return vint4(result);
 #else
-	uint8_t table[32];
-
-	std::memcpy(table + 0, &t0.m, 4 * sizeof(int));
-	std::memcpy(table + 16, &t1.m, 4 * sizeof(int));
-
-	return vint4(table[idx.lane<0>()],
-	             table[idx.lane<1>()],
-	             table[idx.lane<2>()],
-	             table[idx.lane<3>()]);
+	return vint4(tbl.data[idx.lane<0>()],
+	             tbl.data[idx.lane<1>()],
+	             tbl.data[idx.lane<2>()],
+	             tbl.data[idx.lane<3>()]);
 #endif
 }
 
 /**
- * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes.
+ * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices. 
 */
-ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx)
-{
+ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
+	const vtable4_64x8& tbl,
+	vint4 idx
+) {
 #if ASTCENC_SSE >= 41
 	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
 	__m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast<int>(0xFFFFFF00)));
 
-	__m128i result = _mm_shuffle_epi8(t0.m, idxx);
+	__m128i result = _mm_shuffle_epi8(tbl.t0.m, idxx);
 	idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16));
 
-	__m128i result2 = _mm_shuffle_epi8(t1.m, idxx);
+	__m128i result2 = _mm_shuffle_epi8(tbl.t1.m, idxx);
 	result = _mm_xor_si128(result, result2);
 	idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16));
 
-	result2 = _mm_shuffle_epi8(t2.m, idxx);
+	result2 = _mm_shuffle_epi8(tbl.t2.m, idxx);
 	result = _mm_xor_si128(result, result2);
 	idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16));
 
-	result2 = _mm_shuffle_epi8(t3.m, idxx);
+	result2 = _mm_shuffle_epi8(tbl.t3.m, idxx);
 	result = _mm_xor_si128(result, result2);
 	return vint4(result);
 #else
-	uint8_t table[64];
-
-	std::memcpy(table + 0, &t0.m, 4 * sizeof(int));
-	std::memcpy(table + 16, &t1.m, 4 * sizeof(int));
-	std::memcpy(table + 32, &t2.m, 4 * sizeof(int));
-	std::memcpy(table + 48, &t3.m, 4 * sizeof(int));
-
-	return vint4(table[idx.lane<0>()],
-	             table[idx.lane<1>()],
-	             table[idx.lane<2>()],
-	             table[idx.lane<3>()]);
+	return vint4(tbl.data[idx.lane<0>()],
+	             tbl.data[idx.lane<1>()],
+	             tbl.data[idx.lane<2>()],
+	             tbl.data[idx.lane<3>()]);
 #endif
 }
diff --git a/Source/astcenc_vecmathlib_sve_8.h b/Source/astcenc_vecmathlib_sve_8.h
index 169f28cb1..9b48411b3 100644
--- a/Source/astcenc_vecmathlib_sve_8.h
+++ b/Source/astcenc_vecmathlib_sve_8.h
@@ -906,90 +906,101 @@ ASTCENC_SIMD_INLINE vfloat8 int_as_float(vint8 a)
 	return vfloat8(svreinterpret_f32_s32(a.m));
 }
 
-/**
- * @brief Prepare a vtable lookup table for use with the native SIMD size.
+/*
+ * Table structure for a 16x 8-bit entry table.
  */
-ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint8& t0p)
-{
-	t0p = vint8(svdup_neonq_f32(t0.m));
-}
+struct vtable8_16x8 {
+	vint8 t0;
+};
 
-/**
- * @brief Prepare a vtable lookup table for use with the native SIMD size.
+/*
+ * Table structure for a 32x 8-bit entry table.
  */
-ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint8& t0p, vint8& t1p)
-{
-	// 8-wide SVE uses a single table register, so t1 is unused
-	(void)t1p;
+struct vtable8_32x8 {
+	vint8 t0;
+};
 
-	svfloat32_8_t t0v = svdup_neonq_f32(t0.m);
-	svfloat32_8_t t1v = svdup_neonq_f32(t1.m);
+/*
+ * Table structure for a 64x 8-bit entry table.
+ */
+struct vtable8_64x8 {
+	vint8 t0;
+	vint8 t1;
+};
 
-	t0p = vint8(svext_f32(t0v, t1v, 4));
+/**
+ * @brief Prepare a vtable lookup table for 16x 8-bit entry table.
+ */
+ASTCENC_SIMD_INLINE void vtable_prepare(
+	vtable8_16x8& table,
+	const uint8_t* data
+) {
+	// Top half of register will be zeros
+	table.t0 = vint8(svld1_u8(svptrue_pat_b8(SV_VL16), data));
 }
 
 /**
- * @brief Prepare a vtable lookup table for use with the native SIMD size.
+ * @brief Prepare a vtable lookup table for 32x 8-bit entry table. 
*/ ASTCENC_SIMD_INLINE void vtable_prepare( - vint4 t0, vint4 t1, vint4 t2, vint4 t3, - vint8& t0p, vint8& t1p, vint8& t2p, vint8& t3p) -{ - // 8-wide SVE uses a two table registers, so t2 and t3 are unused - (void)t2p; - (void)t3p; - - svfloat32_8_t t0v = svdup_neonq_f32(t0.m); - svfloat32_8_t t1v = svdup_neonq_f32(t1.m); - svfloat32_8_t t2v = svdup_neonq_f32(t2.m); - svfloat32_8_t t3v = svdup_neonq_f32(t3.m); + vtable8_32x8& table, + const uint8_t* data +) { + table.t0 = vint8(svld1_u8(svptrue_b8(), data)); +} - t0p = vint8(svext_f32(t0v, t1v, 4)); - t1p = vint8(svext_f32(t2v, t3v, 4)); +/** + * @brief Prepare a vtable lookup table 64x 8-bit entry table. + */ +ASTCENC_SIMD_INLINE void vtable_prepare( + vtable8_64x8& table, + const uint8_t* data +) { + table.t0 = vint8(svld1_u8(svptrue_b8(), data)); + table.t1 = vint8(svld1_u8(svptrue_b8(), data + 32)); } /** - * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes. + * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 idx) -{ +ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit( + const vtable8_16x8& tbl, + vint8 idx +) { // Set index byte above max index for unused bytes so table lookup returns zero svint32_8_t idx_masked = svorr_s32_x(svptrue_b32(), idx.m, svdup_s32(0xFFFFFF00)); svuint8_8_t idx_bytes = svreinterpret_u8_s32(idx_masked); - svuint8_8_t tbl_bytes = svreinterpret_u8_s32(t0.m); + svuint8_8_t tbl_bytes = svreinterpret_u8_s32(tbl.t0.m); svuint8_8_t result = svtbl_u8(tbl_bytes, idx_bytes); return vint8(svreinterpret_s32_u8(result)); } /** - * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes. + * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 idx) -{ - // 8-wide SVE uses a single table register, so t1 is unused - (void)t1; - +ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit( + const vtable8_32x8& tbl, + vint8 idx +) { // Set index byte above max index for unused bytes so table lookup returns zero svint32_8_t idx_masked = svorr_s32_x(svptrue_b32(), idx.m, svdup_s32(0xFFFFFF00)); svuint8_8_t idx_bytes = svreinterpret_u8_s32(idx_masked); - svuint8_8_t tbl_bytes = svreinterpret_u8_s32(t0.m); + svuint8_8_t tbl_bytes = svreinterpret_u8_s32(tbl.t0.m); svuint8_8_t result = svtbl_u8(tbl_bytes, idx_bytes); return vint8(svreinterpret_s32_u8(result)); } /** - * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes. + * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices. 
*/ -ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 t2, vint8 t3, vint8 idx) -{ - // 8-wide SVE uses a two table registers, so t2 and t3 are unused - (void)t2; - (void)t3; - +ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit( + const vtable8_64x8& tbl, + vint8 idx +) { // Set index byte above max index for unused bytes so table lookup returns zero svint32_8_t literal32 = svdup_s32(32); svbool_8_t idx_lo_select = svcmplt(svptrue_b32(), idx.m, literal32); @@ -999,8 +1010,8 @@ ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 t2, vint8 t3 svuint8_8_t idx_lo_bytes = svreinterpret_u8_s32(idx_lo_masked); svuint8_8_t idx_hi_bytes = svreinterpret_u8_s32(idx_hi_masked); - svuint8_8_t tbl0_bytes = svreinterpret_u8_s32(t0.m); - svuint8_8_t tbl1_bytes = svreinterpret_u8_s32(t1.m); + svuint8_8_t tbl0_bytes = svreinterpret_u8_s32(tbl.t0.m); + svuint8_8_t tbl1_bytes = svreinterpret_u8_s32(tbl.t1.m); svint32_8_t t0_lookup = svreinterpret_s32_u8(svtbl_u8(tbl0_bytes, idx_lo_bytes)); svint32_8_t t1_lookup = svreinterpret_s32_u8(svtbl_u8(tbl1_bytes, idx_hi_bytes)); diff --git a/Test/astc_profile_valgrind.py b/Test/astc_profile_valgrind.py index eb9a82a45..3cd08445b 100644 --- a/Test/astc_profile_valgrind.py +++ b/Test/astc_profile_valgrind.py @@ -125,7 +125,7 @@ def run_pass(image, noStartup, encoder, blocksize, quality): if noStartup: args = ["gprof2dot", "--format=callgrind", "--output=out.dot", "callgrind.txt", - "-s", "-z", "compress_block(astcenc_contexti const&, image_block const&, physical_compressed_block&, compression_working_buffers&)"] + "-s", "-z", "compress_block(astcenc_contexti const&, image_block const&, unsigned char*, compression_working_buffers&)"] else: args = ["gprof2dot", "--format=callgrind", "--output=out.dot", "callgrind.txt", "-s", "-z", "main"]
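
For orientation, the reworked API is a two-step pattern: build a backend-specific table object from a plain byte array with vtable_prepare(), then gather one byte per lane with vtable_lookup_32bit(). Below is a minimal sketch of a caller, illustrative only; it assumes the astcenc_vecmathlib.h from this patch is on the include path, the helper name remap_bytes and its parameters are hypothetical, indices are in the range 0..31, and count is a multiple of ASTCENC_SIMD_WIDTH.

#include <cstdint>

#include "astcenc_vecmathlib.h"

// Hypothetical helper: remap 32-bit indices through a 32-entry byte table
// at the native SIMD width (vint and vtable_32x8 resolve per backend).
static void remap_bytes(
	const uint8_t table_data[32],
	const int* indices,
	int* out,
	unsigned int count
) {
	// Prepare once; each backend stores the table however its lookup
	// needs it (XOR-chained SSE/AVX2 registers, NEON/SVE table registers,
	// or just the data pointer in the scalar reference path)
	vtable_32x8 table;
	vtable_prepare(table, table_data);

	for (unsigned int i = 0; i < count; i += ASTCENC_SIMD_WIDTH)
	{
		// Load native-width 32-bit indices, look up one byte per lane,
		// and store the zero-extended results
		vint idx(indices + i);
		vint values = vtable_lookup_32bit(table, idx);
		store(values, out + i);
	}
}

The same pattern covers the 16- and 64-entry cases via vtable_16x8 and vtable_64x8, which is what the call sites in astcenc_decompress_symbolic.cpp and astcenc_ideal_endpoints_and_weights.cpp above switch to.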