diff --git a/src/_imaging.c b/src/_imaging.c
index 281f3a4d2e6..4afc42c5a4f 100644
--- a/src/_imaging.c
+++ b/src/_imaging.c
@@ -798,7 +798,7 @@ _prepare_lut_table(PyObject *table, Py_ssize_t table_size) {
     }
 
     /* malloc check ok, max is 2 * 4 * 65**3 = 2197000 */
-    prepared = (INT16 *)malloc(sizeof(INT16) * table_size);
+    prepared = (INT16 *)malloc(sizeof(INT16) * (table_size + 2));
     if (!prepared) {
         if (free_table_data) {
             free(table_data);
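The +2 pads the table for the unaligned 16-byte loads introduced below: mm_lut_pixel() fetches each (left, right) cell pair with a single _mm_loadu_si128, i.e. 8 INT16s, of which only 6 are meaningful for a 3-channel table. A quick standalone check of the worst-case bound (my arithmetic, not part of the patch; it assumes the maximal reachable base cell is (size1D - 2, size2D - 1, size3D - 1), which follows from the "we never hit the upper cells" comment further down):

    #include <stdio.h>

    int main(void) {
        /* Worst case for the 8 x INT16 pair-loads in mm_lut_pixel():
           base cell (size1D - 2, size2D - 1, size3D - 1), 3-channel table
           (per-dimension indices never exceed size - 2; the +size2D_shift
           and +size3D_shift load offsets add the +1 on the outer dims). */
        int size1D = 65, size2D = 65, size3D = 65, channels = 3;
        int table_size = size1D * size2D * size3D * channels;
        int last_load = channels * ((size1D - 2)
                      + (size2D - 1) * size1D
                      + (size3D - 1) * size1D * size2D);
        printf("last element touched: %d, last valid element: %d\n",
               last_load + 7, table_size - 1);
        /* prints 823876 vs 823874: the load runs two INT16 past the table,
           hence malloc(sizeof(INT16) * (table_size + 2)) */
        return 0;
    }

With table_channels == 4 the pair is exactly 8 INT16s and the same worst-case load ends on the last valid element, so the padding only matters for 3-channel tables.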
diff --git a/src/libImaging/ColorLUT.c b/src/libImaging/ColorLUT.c
index aee7cda067d..a9887dc58a2 100644
--- a/src/libImaging/ColorLUT.c
+++ b/src/libImaging/ColorLUT.c
@@ -7,38 +7,119 @@
 #define PRECISION_BITS (16 - 8 - 2)
 #define PRECISION_ROUNDING (1 << (PRECISION_BITS - 1))
 
-/* 8 - scales are multiplied on byte.
+/* 16 - UINT16 capacity
    6 - max index in the table
-       (max size is 65, but index 64 is not reachable) */
-#define SCALE_BITS (32 - 8 - 6)
+       (max size is 65, but index 64 is not reachable)
+   7 - that many bits we can save if we divide the value by 255. */
+#define SCALE_BITS (16 - 6 + 7)
 #define SCALE_MASK ((1 << SCALE_BITS) - 1)
 
-#define SHIFT_BITS (16 - 1)
+/* Number of bits in the index tail which is used as a shift
+   between two table values. Should be >= 9, <= 14. */
+#define SHIFT_BITS (16 - 7)
 
-static inline UINT8
-clip8(int in) {
-    return clip8_lookups[(in + PRECISION_ROUNDING) >> PRECISION_BITS];
-}
 
-static inline void
-interpolate3(INT16 out[3], const INT16 a[3], const INT16 b[3], INT16 shift) {
-    out[0] = (a[0] * ((1 << SHIFT_BITS) - shift) + b[0] * shift) >> SHIFT_BITS;
-    out[1] = (a[1] * ((1 << SHIFT_BITS) - shift) + b[1] * shift) >> SHIFT_BITS;
-    out[2] = (a[2] * ((1 << SHIFT_BITS) - shift) + b[2] * shift) >> SHIFT_BITS;
-}
+static inline __m128i
+mm_lut_pixel(INT16* table, int size2D_shift, int size3D_shift,
+             int idx, __m128i shift, __m128i shuffle)
+{
+    __m128i leftleft, leftright, rightleft, rightright;
+    __m128i source, left, right;
+    __m128i shift1D = _mm_shuffle_epi32(shift, 0x00);
+    __m128i shift2D = _mm_shuffle_epi32(shift, 0x55);
+    __m128i shift3D = _mm_shuffle_epi32(shift, 0xaa);
+
+    source = _mm_shuffle_epi8(
+        _mm_loadu_si128((__m128i *) &table[idx + 0]), shuffle);
+    leftleft = _mm_srai_epi32(_mm_madd_epi16(
+        source, shift1D), SHIFT_BITS);
+
+    source = _mm_shuffle_epi8(
+        _mm_loadu_si128((__m128i *) &table[idx + size2D_shift]), shuffle);
+    leftright = _mm_slli_epi32(_mm_madd_epi16(
+        source, shift1D), 16 - SHIFT_BITS);
+
+    source = _mm_shuffle_epi8(
+        _mm_loadu_si128((__m128i *) &table[idx + size3D_shift]), shuffle);
+    rightleft = _mm_srai_epi32(_mm_madd_epi16(
+        source, shift1D), SHIFT_BITS);
+
+    source = _mm_shuffle_epi8(
+        _mm_loadu_si128((__m128i *) &table[idx + size3D_shift + size2D_shift]), shuffle);
+    rightright = _mm_slli_epi32(_mm_madd_epi16(
+        source, shift1D), 16 - SHIFT_BITS);
+
+    left = _mm_srai_epi32(_mm_madd_epi16(
+        _mm_blend_epi16(leftleft, leftright, 0xaa), shift2D),
+        SHIFT_BITS);
 
-static inline void
-interpolate4(INT16 out[4], const INT16 a[4], const INT16 b[4], INT16 shift) {
-    out[0] = (a[0] * ((1 << SHIFT_BITS) - shift) + b[0] * shift) >> SHIFT_BITS;
-    out[1] = (a[1] * ((1 << SHIFT_BITS) - shift) + b[1] * shift) >> SHIFT_BITS;
-    out[2] = (a[2] * ((1 << SHIFT_BITS) - shift) + b[2] * shift) >> SHIFT_BITS;
-    out[3] = (a[3] * ((1 << SHIFT_BITS) - shift) + b[3] * shift) >> SHIFT_BITS;
+    right = _mm_slli_epi32(_mm_madd_epi16(
+        _mm_blend_epi16(rightleft, rightright, 0xaa), shift2D),
+        16 - SHIFT_BITS);
+
+    return _mm_srai_epi32(_mm_madd_epi16(
+        _mm_blend_epi16(left, right, 0xaa), shift3D),
+        SHIFT_BITS);
 }
 
-static inline int
-table_index3D(int index1D, int index2D, int index3D, int size1D, int size1D_2D) {
-    return index1D + index2D * size1D + index3D * size1D_2D;
+
+#if defined(__AVX2__)
+static inline __m256i
+mm256_lut_pixel(INT16* table, int size2D_shift, int size3D_shift,
+                int idx1, int idx2, __m256i shift, __m256i shuffle)
+{
+    __m256i leftleft, leftright, rightleft, rightright;
+    __m256i source, left, right;
+    __m256i shift1D = _mm256_shuffle_epi32(shift, 0x00);
+    __m256i shift2D = _mm256_shuffle_epi32(shift, 0x55);
+    __m256i shift3D = _mm256_shuffle_epi32(shift, 0xaa);
+
+    source = _mm256_shuffle_epi8(
+        _mm256_inserti128_si256(_mm256_castsi128_si256(
+            _mm_loadu_si128((__m128i *) &table[idx1 + 0])),
+            _mm_loadu_si128((__m128i *) &table[idx2 + 0]), 1),
+        shuffle);
+    leftleft = _mm256_srai_epi32(_mm256_madd_epi16(
+        source, shift1D), SHIFT_BITS);
+
+    source = _mm256_shuffle_epi8(
+        _mm256_inserti128_si256(_mm256_castsi128_si256(
+            _mm_loadu_si128((__m128i *) &table[idx1 + size2D_shift])),
+            _mm_loadu_si128((__m128i *) &table[idx2 + size2D_shift]), 1),
+        shuffle);
+    leftright = _mm256_slli_epi32(_mm256_madd_epi16(
+        source, shift1D), 16 - SHIFT_BITS);
+
+    source = _mm256_shuffle_epi8(
+        _mm256_inserti128_si256(_mm256_castsi128_si256(
+            _mm_loadu_si128((__m128i *) &table[idx1 + size3D_shift])),
+            _mm_loadu_si128((__m128i *) &table[idx2 + size3D_shift]), 1),
+        shuffle);
+    rightleft = _mm256_srai_epi32(_mm256_madd_epi16(
+        source, shift1D), SHIFT_BITS);
+
+    source = _mm256_shuffle_epi8(
+        _mm256_inserti128_si256(_mm256_castsi128_si256(
+            _mm_loadu_si128((__m128i *) &table[idx1 + size3D_shift + size2D_shift])),
+            _mm_loadu_si128((__m128i *) &table[idx2 + size3D_shift + size2D_shift]), 1),
+        shuffle);
+    rightright = _mm256_slli_epi32(_mm256_madd_epi16(
+        source, shift1D), 16 - SHIFT_BITS);
+
+    left = _mm256_srai_epi32(_mm256_madd_epi16(
+        _mm256_blend_epi16(leftleft, leftright, 0xaa), shift2D),
+        SHIFT_BITS);
+
+    right = _mm256_slli_epi32(_mm256_madd_epi16(
+        _mm256_blend_epi16(rightleft, rightright, 0xaa), shift2D),
+        16 - SHIFT_BITS);
+
+    return _mm256_srai_epi32(_mm256_madd_epi16(
+        _mm256_blend_epi16(left, right, 0xaa), shift3D),
+        SHIFT_BITS);
 }
+#endif
+
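Both helpers build on one trick: after the shuffle, every 32-bit lane of source holds a (left, right) pair of 16-bit table values, so a single _mm_madd_epi16 against (inv_shift, shift) pairs evaluates the lerp left * ((1 << SHIFT_BITS) - shift) + right * shift that the removed interpolate3()/interpolate4() computed with two scalar multiplies per channel. A minimal standalone sketch of just that step (SSE2 is enough here; the sample values and the weight packing are illustrative, not lifted from the patch):

    #include <emmintrin.h>
    #include <stdio.h>

    #define SHIFT_BITS (16 - 7)

    int main(void) {
        /* four channel pairs: left value a[i], right value b[i] */
        short a[4] = {1000, 2000, 3000, 4000};
        short b[4] = {2000, 4000, 6000, 8000};
        short s = 256;  /* lerp weight: 256 of 512, i.e. halfway */

        /* [a0,b0, a1,b1, a2,b2, a3,b3], the layout shuffle3/shuffle4 produce */
        __m128i pairs = _mm_set_epi16(b[3], a[3], b[2], a[2],
                                      b[1], a[1], b[0], a[0]);
        /* each 32-bit lane: low half (1 << SHIFT_BITS) - s, high half s */
        __m128i weights = _mm_set1_epi32((s << 16) | ((1 << SHIFT_BITS) - s));

        /* madd: a*inv + b*s per 32-bit lane, then drop the weight scale */
        __m128i lerp = _mm_srai_epi32(_mm_madd_epi16(pairs, weights), SHIFT_BITS);

        int out[4];
        _mm_storeu_si128((__m128i *)out, lerp);
        printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);
        /* prints: 1500 3000 4500 6000 */
        return 0;
    }

In the real helpers, srai keeps the reduced value in the low half of each 32-bit lane while slli by 16 - SHIFT_BITS parks the same reduction in the high half, so _mm_blend_epi16(..., 0xaa) can interleave them back into (left, right) pairs for the next madd level; the sketch keeps only the plain srai form.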
 
 /* Transforms colors of imIn using provided 3D lookup table
@@ -74,12 +155,56 @@ ImagingColorLUT3D_linear(
        +1 cells will be outside of the table.
        With this compensation we never hit the upper cells
        but this also doesn't introduce any noticeable difference. */
-    UINT32 scale1D = (size1D - 1) / 255.0 * (1 << SCALE_BITS);
-    UINT32 scale2D = (size2D - 1) / 255.0 * (1 << SCALE_BITS);
-    UINT32 scale3D = (size3D - 1) / 255.0 * (1 << SCALE_BITS);
-    int size1D_2D = size1D * size2D;
-    int x, y;
+    int y, size1D_2D = size1D * size2D;
+    int size2D_shift = size1D * table_channels;
+    int size3D_shift = size1D_2D * table_channels;
     ImagingSectionCookie cookie;
+    __m128i scale = _mm_set_epi16(
+        0,
+        (int) ((size3D - 1) / 255.0 * (1<<SCALE_BITS)),
+        (int) ((size2D - 1) / 255.0 * (1<<SCALE_BITS)),
+        (int) ((size1D - 1) / 255.0 * (1<<SCALE_BITS)),
+        0,
+        (int) ((size3D - 1) / 255.0 * (1<<SCALE_BITS)),
+        (int) ((size2D - 1) / 255.0 * (1<<SCALE_BITS)),
+        (int) ((size1D - 1) / 255.0 * (1<<SCALE_BITS)));
+    __m128i scale_mask = _mm_set1_epi16(SCALE_MASK >> 8);
+    __m128i index_mul = _mm_set_epi16(
+        0, size1D_2D*table_channels, size1D*table_channels, table_channels,
+        0, size1D_2D*table_channels, size1D*table_channels, table_channels);
+    __m128i shuffle3 = _mm_set_epi8(-1,-1, -1,-1, 11,10, 5,4, 9,8, 3,2, 7,6, 1,0);
+    __m128i shuffle4 = _mm_set_epi8(15,14, 7,6, 13,12, 5,4, 11,10, 3,2, 9,8, 1,0);
+#if defined(__AVX2__)
+    __m256i scale256 = _mm256_set_epi16(
+        0,
+        (int) ((size3D - 1) / 255.0 * (1<<SCALE_BITS)),
+        (int) ((size2D - 1) / 255.0 * (1<<SCALE_BITS)),
+        (int) ((size1D - 1) / 255.0 * (1<<SCALE_BITS)),
+        0,
+        (int) ((size3D - 1) / 255.0 * (1<<SCALE_BITS)),
+        (int) ((size2D - 1) / 255.0 * (1<<SCALE_BITS)),
+        (int) ((size1D - 1) / 255.0 * (1<<SCALE_BITS)),
+        0,
+        (int) ((size3D - 1) / 255.0 * (1<<SCALE_BITS)),
+        (int) ((size2D - 1) / 255.0 * (1<<SCALE_BITS)),
+        (int) ((size1D - 1) / 255.0 * (1<<SCALE_BITS)),
+        0,
+        (int) ((size3D - 1) / 255.0 * (1<<SCALE_BITS)),
+        (int) ((size2D - 1) / 255.0 * (1<<SCALE_BITS)),
+        (int) ((size1D - 1) / 255.0 * (1<<SCALE_BITS)));
+    __m256i scale_mask256 = _mm256_set1_epi16(SCALE_MASK >> 8);
+    __m256i index_mul256 = _mm256_set_epi16(
+        0, size1D_2D*table_channels, size1D*table_channels, table_channels,
+        0, size1D_2D*table_channels, size1D*table_channels, table_channels,
+        0, size1D_2D*table_channels, size1D*table_channels, table_channels,
+        0, size1D_2D*table_channels, size1D*table_channels, table_channels);
+    __m256i shuffle3_256 = _mm256_set_epi8(
+        -1,-1, -1,-1, 11,10, 5,4, 9,8, 3,2, 7,6, 1,0,
+        -1,-1, -1,-1, 11,10, 5,4, 9,8, 3,2, 7,6, 1,0);
+    __m256i shuffle4_256 = _mm256_set_epi8(
+        15,14, 7,6, 13,12, 5,4, 11,10, 3,2, 9,8, 1,0,
+        15,14, 7,6, 13,12, 5,4, 11,10, 3,2, 9,8, 1,0);
+#endif
 
     if (table_channels < 3 || table_channels > 4) {
         PyErr_SetString(PyExc_ValueError, "table_channels could be 3 or 4");
@@ -98,86 +223,227 @@ ImagingColorLUT3D_linear(
     ImagingSectionEnter(&cookie);
     for (y = 0; y < imOut->ysize; y++) {
-        UINT8 *rowIn = (UINT8 *)imIn->image[y];
-        char *rowOut = (char *)imOut->image[y];
-        for (x = 0; x < imOut->xsize; x++) {
-            UINT32 index1D = rowIn[x * 4 + 0] * scale1D;
-            UINT32 index2D = rowIn[x * 4 + 1] * scale2D;
-            UINT32 index3D = rowIn[x * 4 + 2] * scale3D;
-            INT16 shift1D = (SCALE_MASK & index1D) >> (SCALE_BITS - SHIFT_BITS);
-            INT16 shift2D = (SCALE_MASK & index2D) >> (SCALE_BITS - SHIFT_BITS);
-            INT16 shift3D = (SCALE_MASK & index3D) >> (SCALE_BITS - SHIFT_BITS);
-            int idx = table_channels * table_index3D(
-                index1D >> SCALE_BITS,
-                index2D >> SCALE_BITS,
-                index3D >> SCALE_BITS,
-                size1D,
-                size1D_2D);
-            INT16 result[4], left[4], right[4];
-            INT16 leftleft[4], leftright[4], rightleft[4], rightright[4];
+        UINT32* rowIn = (UINT32 *)imIn->image[y];
+        UINT32* rowOut = (UINT32 *)imOut->image[y];
+        int x = 0;
+
+    #if defined(__AVX2__)
+        // This makes sense if 12 or more pixels remain (two loop iterations)
+        if (x < imOut->xsize - 11) {
+            __m128i source = _mm_loadu_si128((__m128i *) &rowIn[x]);
+            // scale fits in 16 bits; scale * 255 * 256 needs up to 31 bits
+            // bi, gi and ri - 6-bit indices
+            // bs, gs and rs - 9-bit shifts
+            // 00 bi3.bs3 gi3.gs3 ri3.rs3  00 bi2.bs2 gi2.gs2 ri2.rs2
+            // 00 bi1.bs1 gi1.gs1 ri1.rs1  00 bi0.bs0 gi0.gs0 ri0.rs0
+            __m256i index = _mm256_mulhi_epu16(scale256,
+                _mm256_slli_epi16(_mm256_cvtepu8_epi16(source), 8));
+            // 0000 0000 idx3 idx2
+            // 0000 0000 idx1 idx0
+            __m256i index_src = _mm256_hadd_epi32(
+                _mm256_madd_epi16(index_mul256, _mm256_srli_epi16(index, SHIFT_BITS)),
+                _mm256_setzero_si256());
+
+            for (; x < imOut->xsize - 7; x += 4) {
+                __m128i next_source = _mm_loadu_si128((__m128i *) &rowIn[x + 4]);
+                // scale fits in 16 bits; scale * 255 * 256 needs up to 31 bits
+                // bi, gi and ri - 6-bit indices
+                // bs, gs and rs - 9-bit shifts
+                // 00 bi3.bs3 gi3.gs3 ri3.rs3  00 bi2.bs2 gi2.gs2 ri2.rs2
+                // 00 bi1.bs1 gi1.gs1 ri1.rs1  00 bi0.bs0 gi0.gs0 ri0.rs0
+                __m256i next_index = _mm256_mulhi_epu16(scale256,
+                    _mm256_slli_epi16(_mm256_cvtepu8_epi16(next_source), 8));
+                // 0000 0000 idx3 idx2
+                // 0000 0000 idx1 idx0
+                __m256i next_index_src = _mm256_hadd_epi32(
+                    _mm256_madd_epi16(index_mul256, _mm256_srli_epi16(next_index, SHIFT_BITS)),
+                    _mm256_setzero_si256());
+
+                int idx0 = _mm_cvtsi128_si32(_mm256_castsi256_si128(index_src));
+                int idx1 = _mm256_extract_epi32(index_src, 1);
+                int idx2 = _mm256_extract_epi32(index_src, 4);
+                int idx3 = _mm256_extract_epi32(index_src, 5);
+
+                // 00 0.bs3 0.gs3 0.rs3  00 0.bs2 0.gs2 0.rs2
+                // 00 0.bs1 0.gs1 0.rs1  00 0.bs0 0.gs0 0.rs0
+                __m256i shift = _mm256_and_si256(index, scale_mask256);
+                // 11 1-bs3 1-gs3 1-rs3  11 1-bs2 1-gs2 1-rs2
+                // 11 1-bs1 1-gs1 1-rs1  11 1-bs0 1-gs0 1-rs0
+                __m256i inv_shift = _mm256_sub_epi16(_mm256_set1_epi16((1<<SHIFT_BITS)), shift);
[...]
+            }
+        }
+    #endif
+
+        if (x < imOut->xsize - 5) {
+            __m128i source = _mm_loadl_epi64((__m128i *) &rowIn[x]);
+            // scale fits in 16 bits; scale * 255 * 256 needs up to 31 bits
+            // bi, gi and ri - 6-bit indices
+            // bs, gs and rs - 9-bit shifts
+            // 00 bi1.bs1 gi1.gs1 ri1.rs1  00 bi0.bs0 gi0.gs0 ri0.rs0
+            __m128i index = _mm_mulhi_epu16(scale,
+                _mm_unpacklo_epi8(_mm_setzero_si128(), source));
+            // 0000 0000 idx1 idx0
+            __m128i index_src = _mm_hadd_epi32(
+                _mm_madd_epi16(index_mul, _mm_srli_epi16(index, SHIFT_BITS)),
+                _mm_setzero_si128());
+
+            for (; x < imOut->xsize - 3; x += 2) {
+                __m128i next_source = _mm_loadl_epi64((__m128i *) &rowIn[x + 2]);
+                __m128i next_index = _mm_mulhi_epu16(scale,
+                    _mm_unpacklo_epi8(_mm_setzero_si128(), next_source));
+                __m128i next_index_src = _mm_hadd_epi32(
+                    _mm_madd_epi16(index_mul, _mm_srli_epi16(next_index, SHIFT_BITS)),
+                    _mm_setzero_si128());
+
+                int idx0 = _mm_cvtsi128_si32(index_src);
+                int idx1 = _mm_cvtsi128_si32(_mm_srli_si128(index_src, 4));
+
+                // 00 0.bs1 0.gs1 0.rs1  00 0.bs0 0.gs0 0.rs0
+                __m128i shift = _mm_and_si128(index, scale_mask);
+                // 11 1-bs1 1-gs1 1-rs1  11 1-bs0 1-gs0 1-rs0
+                __m128i inv_shift = _mm_sub_epi16(_mm_set1_epi16((1<<SHIFT_BITS)), shift);
[...]
+            }
+        }
+
+        for (; x < imOut->xsize; x++) {
+            __m128i source = _mm_cvtsi32_si128(rowIn[x]);
+            // scale fits in 16 bits; scale * 255 * 256 needs up to 31 bits
+            // bi, gi and ri - 6-bit indices
+            // bs, gs and rs - 9-bit shifts
+            // 00 00 00 00  00 bi.bs gi.gs ri.rs
+            __m128i index = _mm_mulhi_epu16(scale,
+                _mm_unpacklo_epi8(_mm_setzero_si128(), source));
+
+            int idx0 = _mm_cvtsi128_si32(
+                _mm_hadd_epi32(
+                    _mm_madd_epi16(index_mul, _mm_srli_epi16(index, SHIFT_BITS)),
+                    _mm_setzero_si128()));
+
+            // 00 00 00 00  00 0.bs 0.gs 0.rs
+            __m128i shift = _mm_and_si128(index, scale_mask);
+            // 11 11 11 11  11 1-bs 1-gs 1-rs
+            __m128i inv_shift = _mm_sub_epi16(_mm_set1_epi16((1<<SHIFT_BITS)), shift);
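Since the pack/store halves of the inner loops are not quoted above (the [...] elisions), here is a scalar model of what the vectorized path computes for one pixel and channel, using the same SCALE_BITS / SHIFT_BITS / PRECISION_BITS scheme. The function and macro names are mine for illustration; the table layout assumes the first channel varies fastest, with INT16 values scaled by 1 << PRECISION_BITS as _prepare_lut_table() produces, and (scale * (in << 8)) >> 16 mirrors the _mm_mulhi_epu16 step:

    #include <stdint.h>
    #include <stdio.h>

    #define SCALE_BITS (16 - 6 + 7)
    #define SHIFT_BITS (16 - 7)
    #define PRECISION_BITS (16 - 8 - 2)
    #define PRECISION_ROUNDING (1 << (PRECISION_BITS - 1))

    /* Hypothetical scalar model of one channel of one pixel. */
    static uint8_t
    lut3d_sample_scalar(const int16_t *table, int ch, int channels,
                        int size1D, int size2D, int size3D,
                        uint8_t r, uint8_t g, uint8_t b)
    {
        /* same fixed-point split as the SIMD code: the truncated
           float->int scale keeps the index at size - 2 or below,
           so idx + 1 always stays inside the table */
        uint32_t s1 = (uint32_t)((size1D - 1) / 255.0 * (1 << SCALE_BITS));
        uint32_t s2 = (uint32_t)((size2D - 1) / 255.0 * (1 << SCALE_BITS));
        uint32_t s3 = (uint32_t)((size3D - 1) / 255.0 * (1 << SCALE_BITS));
        uint32_t ix = (s1 * (r << 8)) >> 16;  /* 7-bit index . 9-bit shift */
        uint32_t iy = (s2 * (g << 8)) >> 16;
        uint32_t iz = (s3 * (b << 8)) >> 16;
        int i1 = ix >> SHIFT_BITS, w1 = ix & ((1 << SHIFT_BITS) - 1);
        int i2 = iy >> SHIFT_BITS, w2 = iy & ((1 << SHIFT_BITS) - 1);
        int i3 = iz >> SHIFT_BITS, w3 = iz & ((1 << SHIFT_BITS) - 1);

    #define AT(z, y_, x_) \
        table[(((z) * size2D + (y_)) * size1D + (x_)) * channels + ch]
    #define LERP(a, b_, w) \
        (((a) * ((1 << SHIFT_BITS) - (w)) + (b_) * (w)) >> SHIFT_BITS)

        /* three lerp levels: along dim 1, then dim 2, then dim 3 */
        int c00 = LERP(AT(i3, i2, i1),         AT(i3, i2, i1 + 1),         w1);
        int c10 = LERP(AT(i3, i2 + 1, i1),     AT(i3, i2 + 1, i1 + 1),     w1);
        int c01 = LERP(AT(i3 + 1, i2, i1),     AT(i3 + 1, i2, i1 + 1),     w1);
        int c11 = LERP(AT(i3 + 1, i2 + 1, i1), AT(i3 + 1, i2 + 1, i1 + 1), w1);
        int c0 = LERP(c00, c10, w2);
        int c1 = LERP(c01, c11, w2);
        int v = LERP(c0, c1, w3);
    #undef AT
    #undef LERP

        /* round away the PRECISION_BITS fraction and clip, like clip8() did */
        v = (v + PRECISION_ROUNDING) >> PRECISION_BITS;
        return v < 0 ? 0 : (v > 255 ? 255 : (uint8_t)v);
    }

    int main(void) {
        /* 2x2x2 identity LUT, 3 channels */
        int16_t table[2 * 2 * 2 * 3];
        int r, g, b;
        for (b = 0; b < 2; b++)
            for (g = 0; g < 2; g++)
                for (r = 0; r < 2; r++) {
                    int16_t *cell = &table[((b * 2 + g) * 2 + r) * 3];
                    cell[0] = (int16_t)(r * (255 << PRECISION_BITS));
                    cell[1] = (int16_t)(g * (255 << PRECISION_BITS));
                    cell[2] = (int16_t)(b * (255 << PRECISION_BITS));
                }
        /* an identity table should return (nearly) the input channels */
        printf("%d %d %d\n",
               lut3d_sample_scalar(table, 0, 3, 2, 2, 2, 200, 100, 50),
               lut3d_sample_scalar(table, 1, 3, 2, 2, 2, 200, 100, 50),
               lut3d_sample_scalar(table, 2, 3, 2, 2, 2, 200, 100, 50));
        return 0;
    }

Because the float-to-int conversion of scale truncates, the integer index tops out at size - 2 even for input 255, which is both why idx + 1 is always a valid cell and why the header comment says index 64 is not reachable.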