SIMD: AlphaComposite SSE4 & AVX2 #8299

Open · wants to merge 12 commits into main

13 changes: 11 additions & 2 deletions .github/workflows/test.yml
@@ -52,15 +52,19 @@ jobs:
include:
- { python-version: "3.11", PYTHONOPTIMIZE: 1, REVERSE: "--reverse" }
- { python-version: "3.10", PYTHONOPTIMIZE: 2 }
# SIMD-accelerated builds for x86
- { os: "ubuntu-latest", python-version: "3.9", acceleration: "sse4"}
- { os: "ubuntu-latest", python-version: "3.12", acceleration: "avx2"}
# Free-threaded
- { os: "ubuntu-latest", python-version: "3.13-dev", disable-gil: true }
# M1 only available for 3.10+
- { os: "macos-13", python-version: "3.9" }
- { os: "macos-13", python-version: "3.9", acceleration: "avx2"}
exclude:
- { os: "macos-14", python-version: "3.9" }

runs-on: ${{ matrix.os }}
name: ${{ matrix.os }} Python ${{ matrix.python-version }} ${{ matrix.disable-gil && 'free-threaded' || '' }}
name: ${{ matrix.os }} Python ${{ matrix.python-version }} ${{ matrix.acceleration }} ${{ matrix.disable-gil && 'free-threaded' || '' }}

steps:
- uses: actions/checkout@v4
@@ -108,7 +112,7 @@ jobs:
GHA_LIBIMAGEQUANT_CACHE_HIT: ${{ steps.cache-libimagequant.outputs.cache-hit }}

- name: Install macOS dependencies
if: startsWith(matrix.os, 'macOS')
if: startsWith(matrix.os, 'macos')
run: |
.github/workflows/macos-install.sh
env:
@@ -118,6 +122,11 @@ jobs:
if: "matrix.os == 'ubuntu-latest' && matrix.python-version == '3.12'"
run: echo "::add-matcher::.github/problem-matchers/gcc.json"

- name: Set compiler options for optimization
if: ${{ matrix.acceleration }}
run: |
echo "CC=cc -m${{ matrix.acceleration }}" >> $GITHUB_ENV

- name: Build
run: |
.ci/build.sh
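
The matrix "acceleration" value only has an effect if the extra -m flag actually surfaces as a predefined macro, because the C sources below choose their code paths purely at compile time. A minimal probe such as the following (a hypothetical simd_probe.c, not part of this PR) can be built with the same CC value to confirm which branch a given compiler and flag combination would take. Macro spellings vary by compiler (gcc and clang define __AVX2__ for -mavx2, __SSE4_1__/__SSE4_2__ for -msse4, and __ARM_NEON on ARM targets), so the sketch checks those common names alongside the ones tested in src/_imaging.c below.

    /* simd_probe.c: hypothetical sanity check, not part of this PR.
       Build with the same compiler command the workflow exports, e.g.
           cc -mavx2 simd_probe.c -o simd_probe && ./simd_probe
       and it reports which acceleration branch would be compiled in. */
    #include <stdio.h>

    int main(void) {
    #if defined(__AVX2__)
        puts("acceleration: avx2");
    #elif defined(__SSE4_2__) || defined(__SSE4_1__) || defined(__SSE4__)
        puts("acceleration: sse4");
    #elif defined(__SSE2__)
        puts("acceleration: sse2");
    #elif defined(__ARM_NEON) || defined(__NEON__)
        puts("acceleration: neon");
    #else
        puts("acceleration: none");
    #endif
        return 0;
    }
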
4 changes: 3 additions & 1 deletion Tests/test_features.py
@@ -36,7 +36,9 @@ def test(name: str, function: Callable[[str], str | None]) -> None:
assert version is None
else:
assert function(name) == version
if name != "PIL":
if name == "acceleration":
assert version in ("avx2", "sse4", "sse2", "neon", None)
elif name != "PIL":
if name == "zlib" and version is not None:
version = re.sub(".zlib-ng$", "", version)
elif name == "libtiff" and version is not None:
4 changes: 3 additions & 1 deletion src/PIL/features.py
@@ -128,6 +128,7 @@ def get_supported_codecs() -> list[str]:
"libjpeg_turbo": ("PIL._imaging", "HAVE_LIBJPEGTURBO", "libjpeg_turbo_version"),
"libimagequant": ("PIL._imaging", "HAVE_LIBIMAGEQUANT", "imagequant_version"),
"xcb": ("PIL._imaging", "HAVE_XCB", None),
"acceleration": ("PIL._imaging", "acceleration", "acceleration"),
}


@@ -267,6 +268,7 @@ def pilinfo(out: IO[str] | None = None, supported_formats: bool = True) -> None:

for name, feature in [
("pil", "PIL CORE"),
("acceleration", "Acceleration"),
("tkinter", "TKINTER"),
("freetype2", "FREETYPE2"),
("littlecms2", "LITTLECMS2"),
@@ -291,7 +293,7 @@ def pilinfo(out: IO[str] | None = None, supported_formats: bool = True) -> None:
if v is None:
v = version(name)
if v is not None:
version_static = name in ("pil", "jpg")
version_static = name in ("pil", "jpg", "acceleration")
if name == "littlecms2":
# this check is also in src/_imagingcms.c:setup_module()
version_static = tuple(int(x) for x in v.split(".")) < (2, 7)
13 changes: 13 additions & 0 deletions src/_imaging.c
@@ -4407,6 +4407,19 @@ setup_module(PyObject *m) {
Py_INCREF(have_xcb);
PyModule_AddObject(m, "HAVE_XCB", have_xcb);

#ifdef __AVX2__
PyModule_AddStringConstant(m, "acceleration", "avx2");
#elif defined(__SSE4__)
PyModule_AddStringConstant(m, "acceleration", "sse4");
#elif defined(__SSE2__)
PyModule_AddStringConstant(m, "acceleration", "sse2");
#elif defined(__NEON__)
PyModule_AddStringConstant(m, "acceleration", "neon");
#else
Py_INCREF(Py_False);
PyModule_AddObject(m, "acceleration", Py_False);
#endif

PyObject *pillow_version = PyUnicode_FromString(version);
PyDict_SetItemString(
d, "PILLOW_VERSION", pillow_version ? pillow_version : Py_None
264 changes: 247 additions & 17 deletions src/libImaging/AlphaComposite.c
@@ -46,38 +46,268 @@ ImagingAlphaComposite(Imaging imDst, Imaging imSrc) {
rgba8 *src = (rgba8 *)imSrc->image[y];
rgba8 *out = (rgba8 *)imOut->image[y];

for (x = 0; x < imDst->xsize; x++) {
if (src->a == 0) {
x = 0;

#if defined(__AVX2__)
{
__m256i vmm_max_alpha = _mm256_set1_epi32(255);
__m256i vmm_max_alpha2 = _mm256_set1_epi32(255 * 255);
__m256i vmm_zero = _mm256_setzero_si256();
__m256i vmm_half = _mm256_set1_epi16(128);
__m256i vmm_get_lo = _mm256_set_epi8(
-1,
-1,
5,
4,
5,
4,
5,
4,
-1,
-1,
1,
0,
1,
0,
1,
0,
-1,
-1,
5,
4,
5,
4,
5,
4,
-1,
-1,
1,
0,
1,
0,
1,
0
);
__m256i vmm_get_hi = _mm256_set_epi8(
-1,
-1,
13,
12,
13,
12,
13,
12,
-1,
-1,
9,
8,
9,
8,
9,
8,
-1,
-1,
13,
12,
13,
12,
13,
12,
-1,
-1,
9,
8,
9,
8,
9,
8
);

#define MM_SHIFTDIV255_epi16(src) \
_mm256_srli_epi16(_mm256_add_epi16(src, _mm256_srli_epi16(src, 8)), 8)

for (; x < imDst->xsize - 7; x += 8) {
__m256i mm_dst, mm_dst_lo, mm_dst_hi;
__m256i mm_src, mm_src_lo, mm_src_hi;
__m256i mm_dst_a, mm_src_a, mm_out_a, mm_blend;
__m256i mm_coef1, mm_coef2, mm_out_lo, mm_out_hi;

mm_dst = _mm256_loadu_si256((__m256i *)&dst[x]);
mm_dst_lo = _mm256_unpacklo_epi8(mm_dst, vmm_zero);
mm_dst_hi = _mm256_unpackhi_epi8(mm_dst, vmm_zero);
mm_src = _mm256_loadu_si256((__m256i *)&src[x]);
mm_src_lo = _mm256_unpacklo_epi8(mm_src, vmm_zero);
mm_src_hi = _mm256_unpackhi_epi8(mm_src, vmm_zero);

mm_dst_a = _mm256_srli_epi32(mm_dst, 24);
mm_src_a = _mm256_srli_epi32(mm_src, 24);

// Compute coefficients
// blend = dst->a * (255 - src->a); 16 bits
mm_blend = _mm256_mullo_epi16(
mm_dst_a, _mm256_sub_epi32(vmm_max_alpha, mm_src_a)
);
// outa = src->a * 255 + dst->a * (255 - src->a); 16 bits
mm_out_a = _mm256_add_epi32(
_mm256_mullo_epi16(mm_src_a, vmm_max_alpha), mm_blend
);
mm_coef1 = _mm256_mullo_epi32(mm_src_a, vmm_max_alpha2);
// 8 bits
mm_coef1 = _mm256_cvtps_epi32(_mm256_mul_ps(
_mm256_cvtepi32_ps(mm_coef1),
_mm256_rcp_ps(_mm256_cvtepi32_ps(mm_out_a))
));
// 8 bits
mm_coef2 = _mm256_sub_epi32(vmm_max_alpha, mm_coef1);

mm_out_lo = _mm256_add_epi16(
_mm256_mullo_epi16(
mm_src_lo, _mm256_shuffle_epi8(mm_coef1, vmm_get_lo)
),
_mm256_mullo_epi16(
mm_dst_lo, _mm256_shuffle_epi8(mm_coef2, vmm_get_lo)
)
);
mm_out_lo = _mm256_or_si256(
mm_out_lo,
_mm256_slli_epi64(_mm256_unpacklo_epi32(mm_out_a, vmm_zero), 48)
);
mm_out_lo = _mm256_add_epi16(mm_out_lo, vmm_half);
mm_out_lo = MM_SHIFTDIV255_epi16(mm_out_lo);

mm_out_hi = _mm256_add_epi16(
_mm256_mullo_epi16(
mm_src_hi, _mm256_shuffle_epi8(mm_coef1, vmm_get_hi)
),
_mm256_mullo_epi16(
mm_dst_hi, _mm256_shuffle_epi8(mm_coef2, vmm_get_hi)
)
);
mm_out_hi = _mm256_or_si256(
mm_out_hi,
_mm256_slli_epi64(_mm256_unpackhi_epi32(mm_out_a, vmm_zero), 48)
);
mm_out_hi = _mm256_add_epi16(mm_out_hi, vmm_half);
mm_out_hi = MM_SHIFTDIV255_epi16(mm_out_hi);

_mm256_storeu_si256(
(__m256i *)&out[x], _mm256_packus_epi16(mm_out_lo, mm_out_hi)
);
}

#undef MM_SHIFTDIV255_epi16
}
#endif
#if defined(__SSE4__)
{
__m128i mm_max_alpha = _mm_set1_epi32(255);
__m128i mm_max_alpha2 = _mm_set1_epi32(255 * 255);
__m128i mm_zero = _mm_setzero_si128();
__m128i mm_half = _mm_set1_epi16(128);
__m128i mm_get_lo =
_mm_set_epi8(-1, -1, 5, 4, 5, 4, 5, 4, -1, -1, 1, 0, 1, 0, 1, 0);
__m128i mm_get_hi =
_mm_set_epi8(-1, -1, 13, 12, 13, 12, 13, 12, -1, -1, 9, 8, 9, 8, 9, 8);

#define MM_SHIFTDIV255_epi16(src) \
_mm_srli_epi16(_mm_add_epi16(src, _mm_srli_epi16(src, 8)), 8)

for (; x < imDst->xsize - 3; x += 4) {
__m128i mm_dst, mm_dst_lo, mm_dst_hi;
__m128i mm_src, mm_src_hi, mm_src_lo;
__m128i mm_dst_a, mm_src_a, mm_out_a, mm_blend;
__m128i mm_coef1, mm_coef2, mm_out_lo, mm_out_hi;

// [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
mm_dst = _mm_loadu_si128((__m128i *)&dst[x]);
// [16] a1 b1 g1 r1 a0 b0 g0 r0
mm_dst_lo = _mm_unpacklo_epi8(mm_dst, mm_zero);
// [16] a3 b3 g3 r3 a2 b2 g2 r2
mm_dst_hi = _mm_unpackhi_epi8(mm_dst, mm_zero);
// [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
mm_src = _mm_loadu_si128((__m128i *)&src[x]);
mm_src_lo = _mm_unpacklo_epi8(mm_src, mm_zero);
mm_src_hi = _mm_unpackhi_epi8(mm_src, mm_zero);

// [32] a3 a2 a1 a0
mm_dst_a = _mm_srli_epi32(mm_dst, 24);
mm_src_a = _mm_srli_epi32(mm_src, 24);

// Compute coefficients
// blend = dst->a * (255 - src->a)
// [16] xx b3 xx b2 xx b1 xx b0
mm_blend =
_mm_mullo_epi16(mm_dst_a, _mm_sub_epi32(mm_max_alpha, mm_src_a));
// outa = src->a * 255 + blend
// [16] xx a3 xx a2 xx a1 xx a0
mm_out_a =
_mm_add_epi32(_mm_mullo_epi16(mm_src_a, mm_max_alpha), mm_blend);
// coef1 = src->a * 255 * 255 / outa
mm_coef1 = _mm_mullo_epi32(mm_src_a, mm_max_alpha2);
// [8] xx xx xx c3 xx xx xx c2 xx xx xx c1 xx xx xx c0
mm_coef1 = _mm_cvtps_epi32(_mm_mul_ps(
_mm_cvtepi32_ps(mm_coef1), _mm_rcp_ps(_mm_cvtepi32_ps(mm_out_a))
));
// [8] xx xx xx c3 xx xx xx c2 xx xx xx c1 xx xx xx c0
mm_coef2 = _mm_sub_epi32(mm_max_alpha, mm_coef1);

mm_out_lo = _mm_add_epi16(
_mm_mullo_epi16(mm_src_lo, _mm_shuffle_epi8(mm_coef1, mm_get_lo)),
_mm_mullo_epi16(mm_dst_lo, _mm_shuffle_epi8(mm_coef2, mm_get_lo))
);
mm_out_lo = _mm_or_si128(
mm_out_lo, _mm_slli_epi64(_mm_unpacklo_epi32(mm_out_a, mm_zero), 48)
);
mm_out_lo = _mm_add_epi16(mm_out_lo, mm_half);
mm_out_lo = MM_SHIFTDIV255_epi16(mm_out_lo);

mm_out_hi = _mm_add_epi16(
_mm_mullo_epi16(mm_src_hi, _mm_shuffle_epi8(mm_coef1, mm_get_hi)),
_mm_mullo_epi16(mm_dst_hi, _mm_shuffle_epi8(mm_coef2, mm_get_hi))
);
mm_out_hi = _mm_or_si128(
mm_out_hi, _mm_slli_epi64(_mm_unpackhi_epi32(mm_out_a, mm_zero), 48)
);
mm_out_hi = _mm_add_epi16(mm_out_hi, mm_half);
mm_out_hi = MM_SHIFTDIV255_epi16(mm_out_hi);

_mm_storeu_si128(
(__m128i *)&out[x], _mm_packus_epi16(mm_out_lo, mm_out_hi)
);
}

#undef MM_SHIFTDIV255_epi16
}
#endif

for (; x < imDst->xsize; x += 1) {
if (src[x].a == 0) {
// Copy 4 bytes at once.
*out = *dst;
out[x] = dst[x];
} else {
// Integer implementation with increased precision.
// Each variable has extra meaningful bits.
// Divisions are rounded.

UINT32 tmpr, tmpg, tmpb;
UINT32 blend = dst->a * (255 - src->a);
UINT32 outa255 = src->a * 255 + blend;
UINT32 blend = dst[x].a * (255 - src[x].a);
UINT32 outa255 = src[x].a * 255 + blend;
// There we use 7 bits for precision.
// We could use more, but we go beyond 32 bits.
UINT32 coef1 = src->a * 255 * 255 * (1 << PRECISION_BITS) / outa255;
UINT32 coef1 = src[x].a * 255 * 255 * (1 << PRECISION_BITS) / outa255;
UINT32 coef2 = 255 * (1 << PRECISION_BITS) - coef1;

tmpr = src->r * coef1 + dst->r * coef2;
tmpg = src->g * coef1 + dst->g * coef2;
tmpb = src->b * coef1 + dst->b * coef2;
out->r =
tmpr = src[x].r * coef1 + dst[x].r * coef2;
tmpg = src[x].g * coef1 + dst[x].g * coef2;
tmpb = src[x].b * coef1 + dst[x].b * coef2;
out[x].r =
SHIFTFORDIV255(tmpr + (0x80 << PRECISION_BITS)) >> PRECISION_BITS;
out->g =
out[x].g =
SHIFTFORDIV255(tmpg + (0x80 << PRECISION_BITS)) >> PRECISION_BITS;
out->b =
out[x].b =
SHIFTFORDIV255(tmpb + (0x80 << PRECISION_BITS)) >> PRECISION_BITS;
out->a = SHIFTFORDIV255(outa255 + 0x80);
out[x].a = SHIFTFORDIV255(outa255 + 0x80);
}

dst++;
src++;
out++;
}
}
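
For reference, the vector blocks and the scalar tail above implement the same source-over composite, rearranged so that the only division is by the unnormalised output alpha. With alpha and colour values scaled to [0, 1]:

    \alpha_{out} = \alpha_{src} + \alpha_{dst} (1 - \alpha_{src})
    C_{out} = \frac{\alpha_{src} C_{src} + \alpha_{dst} (1 - \alpha_{src}) C_{dst}}{\alpha_{out}}

With 8-bit values (alpha scaled by 255) this is exactly what both paths compute:

    outa255 = src.a * 255 + dst.a * (255 - src.a)
    coef1   = src.a * 255 * 255 / outa255
    coef2   = 255 - coef1
    out.c   = (src.c * coef1 + dst.c * coef2) / 255   (rounded)
    out.a   = outa255 / 255                           (rounded)

The SSE4 and AVX2 blocks obtain coef1 by multiplying with _mm_rcp_ps / _mm256_rcp_ps, a reciprocal approximation, and skip the extra PRECISION_BITS used by the scalar branch, so individual output channels can plausibly differ from the scalar result by one unit in the last place.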

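The MM_SHIFTDIV255_epi16 macro (like the existing SHIFTFORDIV255 used by the scalar tail) avoids an integer division by 255: for v = x + 128 with 0 <= x <= 255 * 255, (v + (v >> 8)) >> 8 equals (v * 257) >> 16, which equals x / 255 rounded to nearest, and the biased sum stays below 2^16, so the 16-bit lanes cannot overflow. A hypothetical brute-force check of that identity (not part of this PR):

    /* div255_check.c: hypothetical brute-force check of the rounding trick
       behind MM_SHIFTDIV255_epi16 and SHIFTFORDIV255; not part of this PR. */
    #include <stdio.h>

    int main(void) {
        unsigned int x, mismatches = 0;
        for (x = 0; x <= 255u * 255u; x++) {
            unsigned int v = x + 128;                 /* the +half bias added before the shift */
            unsigned int fast = (v + (v >> 8)) >> 8;  /* what the SIMD and scalar code compute */
            unsigned int exact = (2 * x + 255) / 510; /* x / 255, rounded to nearest */
            if (fast != exact) {
                mismatches++;
            }
        }
        printf("mismatches: %u\n", mismatches); /* prints 0 over the full range */
        return 0;
    }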