Fix self-review
solidpixel committed Aug 9, 2024
1 parent 50dd7a4 commit cd25173
Showing 3 changed files with 63 additions and 68 deletions.
12 changes: 7 additions & 5 deletions Source/astcenc_vecmathlib_avx2_8.h
@@ -1005,6 +1005,7 @@ ASTCENC_SIMD_INLINE void vtable_prepare(
) {
// AVX2 tables duplicate table entries in each 128-bit half-register
vint4 d0 = vint4::load(data);

table.t0 = vint8(astcenc_mm256_set_m128i(d0.m, d0.m));
}
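
For context on why the 16-byte table is written into both halves of the 256-bit register: _mm256_shuffle_epi8 performs two independent 16-byte shuffles, one per 128-bit lane, so each lane of the index vector can only see the table bytes stored in its own half. A minimal standalone sketch of the duplicate-and-lookup pattern follows; it uses raw Intel intrinsics rather than the project's vint4/vint8 wrappers, and the function name is illustrative only.

#include <immintrin.h>
#include <stdint.h>

// Broadcast a 16-entry byte table into both 128-bit halves of a YMM
// register, then look up eight 32-bit lane indices (index in the low byte).
static inline __m256i table16_lookup_avx2(const uint8_t* table, __m256i idx)
{
	// Duplicate the 16-byte table into both halves of the register
	__m128i t0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(table));
	__m256i t = _mm256_broadcastsi128_si256(t0);

	// Force the three unused bytes of each 32-bit lane to select zero;
	// the shuffle returns 0 for any index byte with its top bit set
	__m256i idx_masked = _mm256_or_si256(idx, _mm256_set1_epi32(0xFFFFFF00));

	// Each 128-bit lane shuffles independently, hence the duplication
	return _mm256_shuffle_epi8(t, idx_masked);
}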

@@ -1016,14 +1017,14 @@ ASTCENC_SIMD_INLINE void vtable_prepare(
const uint8_t* data
) {
// AVX2 tables duplicate table entries in each 128-bit half-register
// Direct lookup for first row
vint4 d0 = vint4::load(data);
table.t0 = vint8(astcenc_mm256_set_m128i(d0.m, d0.m));

// XOR with previous rows for subsequent rows
vint4 d1 = vint4::load(data + 16);
d1 = d1 ^ d0;

table.t0 = vint8(astcenc_mm256_set_m128i(d0.m, d0.m));
table.t1 = vint8(astcenc_mm256_set_m128i(d1.m, d1.m));

// XOR chain the high rows to allow table emulation
table.t1 = table.t1 ^ table.t0;
}

/**
@@ -1044,6 +1045,7 @@ ASTCENC_SIMD_INLINE void vtable_prepare(
table.t2 = vint8(astcenc_mm256_set_m128i(d2.m, d2.m));
table.t3 = vint8(astcenc_mm256_set_m128i(d3.m, d3.m));

// XOR chain the high rows to allow table emulation
table.t3 = table.t3 ^ table.t2;
table.t2 = table.t2 ^ table.t1;
table.t1 = table.t1 ^ table.t0;
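The "XOR chain the high rows" comments refer to the trick that lets a 16-byte-per-step byte shuffle emulate a wider table: row 0 is stored raw and each higher row is stored XORed with the row below it. The lookup then does one shuffle per row, rebasing the index by 16 each step so out-of-range lanes return zero, and XORs the partial results together; the aliased contribution from the lower row cancels out. A self-contained SSE sketch of the two-row (32-entry) case, written with raw intrinsics and an illustrative function name rather than the library's vint4 wrapper, is shown here.

#include <immintrin.h>
#include <stdint.h>

// 32-entry byte table lookup built from two 16-byte shuffles (SSSE3+).
// t0 holds table rows 0..15 raw; t1_xored holds rows 16..31 XORed with
// rows 0..15, i.e. the XOR-chained form produced by vtable_prepare.
static inline __m128i table32_lookup_sse(__m128i t0, __m128i t1_xored, __m128i idx)
{
	// Keep only the low byte of each 32-bit index; a set top bit makes
	// _mm_shuffle_epi8 return zero for that result byte
	__m128i idxx = _mm_or_si128(idx, _mm_set1_epi32(0xFFFFFF00));

	// Indices 0..15 read row 0 directly; indices 16..31 alias back into
	// row 0 because the shuffle only uses the low four index bits
	__m128i result = _mm_shuffle_epi8(t0, idxx);

	// Rebase by 16: indices 0..15 go negative (top bit set) and fetch
	// zero, while indices 16..31 now address the XOR-chained row
	idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16));
	__m128i result2 = _mm_shuffle_epi8(t1_xored, idxx);

	// XOR merge: low indices add nothing; for high indices the aliased
	// row 0 byte cancels against the row 0 term folded into t1_xored
	return _mm_xor_si128(result, result2);
}

The 64-entry case repeats the rebase-and-XOR step twice more, which is why vtable_prepare chains t3^t2, t2^t1, and t1^t0.
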
55 changes: 22 additions & 33 deletions Source/astcenc_vecmathlib_neon_4.h
@@ -943,25 +943,21 @@ ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 v)
* Table structure for a 16x 8-bit entry table.
*/
struct vtable4_16x8 {
vint4 t0;
int8x16_t t0;
};

/*
* Table structure for a 32x 8-bit entry table.
*/
struct vtable4_32x8 {
vint4 t0;
vint4 t1;
int8x16x2_t t01;
};

/*
* Table structure for a 64x 8-bit entry table.
*/
struct vtable4_64x8 {
vint4 t0;
vint4 t1;
vint4 t2;
vint4 t3;
int8x16x4_t t0123;
};

/**
@@ -971,7 +967,8 @@ ASTCENC_SIMD_INLINE void vtable_prepare(
vtable4_16x8& table,
const uint8_t* data
) {
table.t0 = vint4::load(data);
vint4 t0 = vint4::load(data);
table.t0 = vreinterpretq_s8_s32(t0.m);
}

/**
@@ -981,8 +978,11 @@ ASTCENC_SIMD_INLINE void vtable_prepare(
vtable4_32x8& table,
const uint8_t* data
) {
table.t0 = vint4::load(data);
table.t1 = vint4::load(data + 16);
vint4 t0 = vint4::load(data);
vint4 t1 = vint4::load(data + 16);

table.t01.val[0] = vreinterpretq_s8_s32(t0.m);
table.t01.val[1] = vreinterpretq_s8_s32(t1.m);
}

/**
@@ -992,10 +992,15 @@ ASTCENC_SIMD_INLINE void vtable_prepare(
vtable4_64x8& table,
const uint8_t* data
) {
table.t0 = vint4::load(data);
table.t1 = vint4::load(data + 16);
table.t2 = vint4::load(data + 32);
table.t3 = vint4::load(data + 48);
vint4 t0 = vint4::load(data);
vint4 t1 = vint4::load(data + 16);
vint4 t2 = vint4::load(data + 32);
vint4 t3 = vint4::load(data + 48);

table.t0123.val[0] = vreinterpretq_s8_s32(t0.m);
table.t0123.val[1] = vreinterpretq_s8_s32(t1.m);
table.t0123.val[2] = vreinterpretq_s8_s32(t2.m);
table.t0123.val[3] = vreinterpretq_s8_s32(t3.m);
}

/**
@@ -1005,15 +1010,11 @@ ASTCENC_SIMD_INLINE vint4 vtable_lookup(
const vtable4_16x8& tbl,
vint4 idx
) {
int8x16_t table {
vreinterpretq_s8_s32(tbl.t0.m)
};

// Set index byte above max index for unused bytes so table lookup returns zero
int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00));
uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked);

return vint4(vreinterpretq_s32_s8(vqtbl1q_s8(table, idx_bytes)));
return vint4(vreinterpretq_s32_s8(vqtbl1q_s8(tbl.t0, idx_bytes)));
}

/**
@@ -1023,16 +1024,11 @@ ASTCENC_SIMD_INLINE vint4 vtable_lookup(
const vtable4_32x8& tbl,
vint4 idx
) {
int8x16x2_t table {
vreinterpretq_s8_s32(tbl.t0.m),
vreinterpretq_s8_s32(tbl.t1.m)
};

// Set index byte above max index for unused bytes so table lookup returns zero
int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00));
uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked);

return vint4(vreinterpretq_s32_s8(vqtbl2q_s8(table, idx_bytes)));
return vint4(vreinterpretq_s32_s8(vqtbl2q_s8(tbl.t01, idx_bytes)));
}

/**
@@ -1042,18 +1038,11 @@ ASTCENC_SIMD_INLINE vint4 vtable_lookup(
const vtable4_64x8& tbl,
vint4 idx
) {
int8x16x4_t table {
vreinterpretq_s8_s32(tbl.t0.m),
vreinterpretq_s8_s32(tbl.t1.m),
vreinterpretq_s8_s32(tbl.t2.m),
vreinterpretq_s8_s32(tbl.t3.m)
};

// Set index byte above max index for unused bytes so table lookup returns zero
int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00));
uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked);

return vint4(vreinterpretq_s32_s8(vqtbl4q_s8(table, idx_bytes)));
return vint4(vreinterpretq_s32_s8(vqtbl4q_s8(tbl.t0123, idx_bytes)));
}

/**
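The NEON change moves the vreinterpret work into vtable_prepare so each vtable_lookup can feed a pre-built int8x16xN_t straight into a single TBL instruction (vqtbl1q_s8 / vqtbl2q_s8 / vqtbl4q_s8), instead of reassembling the structure from vint4 members on every call. TBL returns zero for any out-of-range index byte, which is why the three unused bytes of each 32-bit lane are forced to 0xFF. A standalone AArch64 sketch of the two-register case, with illustrative names outside the library's vint4 wrapper, is given below.

#include <arm_neon.h>
#include <stdint.h>

// 32-entry byte table: prepared once, then looked up with one TBL per call.
struct table32_neon { int8x16x2_t rows; };

static inline void table32_prepare(table32_neon& t, const uint8_t* data)
{
	t.rows.val[0] = vreinterpretq_s8_u8(vld1q_u8(data));
	t.rows.val[1] = vreinterpretq_s8_u8(vld1q_u8(data + 16));
}

static inline int32x4_t table32_lookup(const table32_neon& t, int32x4_t idx)
{
	// Force the three unused bytes of each 32-bit lane above the table
	// size; vqtbl2q_s8 returns zero for out-of-range index bytes
	int32x4_t idx_masked = vorrq_s32(idx, vdupq_n_s32(0xFFFFFF00));
	uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked);

	// One TBL across both 16-byte rows; each 32-bit lane ends up holding
	// the looked-up byte zero-extended, because the high bytes fetch zero
	return vreinterpretq_s32_s8(vqtbl2q_s8(t.rows, idx_bytes));
}
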
64 changes: 34 additions & 30 deletions Source/astcenc_vecmathlib_sse_4.h
@@ -1041,25 +1041,37 @@ ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 v)
* Table structure for a 16x 8-bit entry table.
*/
struct vtable4_16x8 {
#if ASTCENC_SSE >= 41
vint4 t0;
#else
const uint8_t* data;
#endif
};

/*
* Table structure for a 32x 8-bit entry table.
*/
struct vtable4_32x8 {
#if ASTCENC_SSE >= 41
vint4 t0;
vint4 t1;
#else
const uint8_t* data;
#endif
};

/*
* Table structure for a 64x 8-bit entry table.
*/
struct vtable4_64x8 {
#if ASTCENC_SSE >= 41
vint4 t0;
vint4 t1;
vint4 t2;
vint4 t3;
#else
const uint8_t* data;
#endif
};

/**
@@ -1069,7 +1081,11 @@ ASTCENC_SIMD_INLINE void vtable_prepare(
vtable4_16x8& table,
const uint8_t* data
) {
#if ASTCENC_SSE >= 41
table.t0 = vint4::load(data);
#else
table.data = data;
#endif
}

/**
@@ -1079,11 +1095,13 @@ ASTCENC_SIMD_INLINE void vtable_prepare(
vtable4_32x8& table,
const uint8_t* data
) {
#if ASTCENC_SSE >= 41
table.t0 = vint4::load(data);
table.t1 = vint4::load(data + 16);

#if ASTCENC_SSE >= 41
table.t1 = table.t1 ^ table.t0;
#else
table.data = data;
#endif
}

@@ -1094,15 +1112,17 @@ ASTCENC_SIMD_INLINE void vtable_prepare(
vtable4_64x8& table,
const uint8_t* data
) {
#if ASTCENC_SSE >= 41
table.t0 = vint4::load(data);
table.t1 = vint4::load(data + 16);
table.t2 = vint4::load(data + 32);
table.t3 = vint4::load(data + 48);

#if ASTCENC_SSE >= 41
table.t3 = table.t3 ^ table.t2;
table.t2 = table.t2 ^ table.t1;
table.t1 = table.t1 ^ table.t0;
#else
table.data = data;
#endif
}

@@ -1120,14 +1140,10 @@ ASTCENC_SIMD_INLINE vint4 vtable_lookup(
__m128i result = _mm_shuffle_epi8(tbl.t0.m, idxx);
return vint4(result);
#else
uint8_t table[16];

std::memcpy(table + 0, &tbl.t0.m, 4 * sizeof(int));

return vint4(table[idx.lane<0>()],
table[idx.lane<1>()],
table[idx.lane<2>()],
table[idx.lane<3>()]);
return vint4(tbl.data[idx.lane<0>()],
tbl.data[idx.lane<1>()],
tbl.data[idx.lane<2>()],
tbl.data[idx.lane<3>()]);
#endif
}

@@ -1150,15 +1166,10 @@ ASTCENC_SIMD_INLINE vint4 vtable_lookup(

return vint4(result);
#else
uint8_t table[32];

std::memcpy(table + 0, &tbl.t0.m, 4 * sizeof(int));
std::memcpy(table + 16, &tbl.t1.m, 4 * sizeof(int));

return vint4(table[idx.lane<0>()],
table[idx.lane<1>()],
table[idx.lane<2>()],
table[idx.lane<3>()]);
return vint4(tbl.data[idx.lane<0>()],
tbl.data[idx.lane<1>()],
tbl.data[idx.lane<2>()],
tbl.data[idx.lane<3>()]);
#endif
}

@@ -1189,17 +1200,10 @@ ASTCENC_SIMD_INLINE vint4 vtable_lookup(

return vint4(result);
#else
uint8_t table[64];

std::memcpy(table + 0, &tbl.t0.m, 4 * sizeof(int));
std::memcpy(table + 16, &tbl.t1.m, 4 * sizeof(int));
std::memcpy(table + 32, &tbl.t2.m, 4 * sizeof(int));
std::memcpy(table + 48, &tbl.t3.m, 4 * sizeof(int));

return vint4(table[idx.lane<0>()],
table[idx.lane<1>()],
table[idx.lane<2>()],
table[idx.lane<3>()]);
return vint4(tbl.data[idx.lane<0>()],
tbl.data[idx.lane<1>()],
tbl.data[idx.lane<2>()],
tbl.data[idx.lane<3>()]);
#endif
}

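For the pre-SSE4.1 fallback the table struct now just keeps the caller's data pointer, so the scalar path indexes the original bytes directly instead of memcpying the vector registers back out to a stack array on every lookup. Across all backends the observable contract is the same, and a tiny scalar reference model (an illustrative helper, not part of the library) can be used to cross-check any of the SIMD paths:

#include <stdint.h>

// Scalar reference for the 4-wide vtable lookup contract: lane i of the
// result holds table[idx[i]] zero-extended to 32 bits. Indices are assumed
// to be in range, as they are for the SIMD callers.
static inline void vtable_lookup_ref(
	const uint8_t* table,
	const int32_t idx[4],
	int32_t out[4]
) {
	for (int i = 0; i < 4; i++)
	{
		out[i] = table[idx[i]];
	}
}
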
