Skip to content

Commit

Permalink
Merge branch 'simd/master' into simd/7.0.x
Browse files Browse the repository at this point in the history
  • Loading branch information
homm committed Dec 19, 2021
2 parents 7bde926 + 7da215e commit bf7595a
Show file tree
Hide file tree
Showing 5 changed files with 19 additions and 9 deletions.
5 changes: 5 additions & 0 deletions CHANGES.SIMD.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
Changelog (Pillow-SIMD)
=======================

7.0.0.post4
-----------

- Filter: fixed wrong offset handling for 3x3 single-band version

7.0.0.post3
-----------

Expand Down
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ The project is supported by Uploadcare, a SAAS for cloud-based image storing and

[![Uploadcare][uploadcare.logo]][uploadcare.com]

In fact, Uploadcare has been running Pillow-SIMD for about three years now.
In fact, Uploadcare has been running Pillow-SIMD since 2015.

The following image operations are currently SIMD-accelerated:

Expand Down Expand Up @@ -96,6 +96,7 @@ So there is no easy way to compile such library, especially with setuptools.

If there's a copy of the original Pillow installed, it has to be removed first
with `$ pip uninstall -y pillow`.
Please install [prerequisites](https://pillow.readthedocs.io/en/stable/installation.html#building-from-source) for your platform.
The installation itself is as simple as running `$ pip install pillow-simd`,
and if you're using an SSE4-capable CPU everything should run smoothly.
If you'd like to install the AVX2-enabled version,
Expand Down Expand Up @@ -124,4 +125,4 @@ All bugfixes to the original Pillow will then be transferred to the next Pillow-
[pillow-perf-page]: https://python-pillow.github.io/pillow-perf/
[pillow-perf-repo]: https://github.com/python-pillow/pillow-perf
[uploadcare.com]: https://uploadcare.com/?utm_source=github&utm_medium=description&utm_campaign=pillow-simd
[uploadcare.logo]: https://ucarecdn.com/74c4d283-f7cf-45d7-924c-fc77345585af/uploadcare.svg
[uploadcare.logo]: https://ucarecdn.com/8eca784b-bbe5-4f7e-8cdf-98d75aab8cec/logotransparent.svg
2 changes: 1 addition & 1 deletion src/PIL/_version.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
# Master version for Pillow
__version__ = "7.0.0.post3"
__version__ = "7.0.0.post4"
13 changes: 9 additions & 4 deletions src/libImaging/FilterSIMD_3x3f_u8.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,12 @@ ImagingFilter3x3f_u8(Imaging imOut, Imaging im, const float* kernel,
float offset)
{
#define MM_KERNEL1x3_SUM1(ss, row, kernel) \
ss = _mm_set1_ps(offset); \
ss = _mm_add_ps(ss, _mm_mul_ps(pix0##row, kernel0##kernel)); \
ss = _mm_mul_ps(pix0##row, kernel0##kernel); \
ss = _mm_add_ps(ss, _mm_mul_ps(pix1##row, kernel1##kernel)); \
ss = _mm_add_ps(ss, _mm_mul_ps(pix2##row, kernel2##kernel));

#define MM_KERNEL1x3_SUM1_2(ss, row, kernel) \
ss = _mm_set1_ps(offset); \
ss = _mm_add_ps(ss, _mm_mul_ps(pix3##row, kernel0##kernel)); \
ss = _mm_mul_ps(pix3##row, kernel0##kernel); \
ss = _mm_add_ps(ss, _mm_mul_ps(pix0##row, kernel1##kernel)); \
ss = _mm_add_ps(ss, _mm_mul_ps(pix1##row, kernel2##kernel));

Expand All @@ -26,6 +24,7 @@ ImagingFilter3x3f_u8(Imaging imOut, Imaging im, const float* kernel,
__m128 kernel01 = _mm_set_ps(kernel[2], kernel[1], kernel[0], 0);
__m128 kernel11 = _mm_set_ps(kernel[5], kernel[4], kernel[3], 0);
__m128 kernel21 = _mm_set_ps(kernel[8], kernel[7], kernel[6], 0);
__m128 mm_offset = _mm_set1_ps(offset);

memcpy(imOut->image8[0], im->image8[0], im->linesize);
y = 1;
Expand Down Expand Up @@ -64,12 +63,14 @@ ImagingFilter3x3f_u8(Imaging imOut, Imaging im, const float* kernel,
ss4 = _mm_hadd_ps(ss4, ss5);

ss0 = _mm_hadd_ps(ss0, ss1);
ss0 = _mm_add_ps(ss0, mm_offset);
ssi0 = _mm_cvtps_epi32(ss0);
ssi0 = _mm_packs_epi32(ssi0, ssi0);
ssi0 = _mm_packus_epi16(ssi0, ssi0);
*((UINT32*) &out0[x]) = _mm_cvtsi128_si32(ssi0);

ss3 = _mm_hadd_ps(ss3, ss4);
ss3 = _mm_add_ps(ss3, mm_offset);
ssi0 = _mm_cvtps_epi32(ss3);
ssi0 = _mm_packs_epi32(ssi0, ssi0);
ssi0 = _mm_packus_epi16(ssi0, ssi0);
Expand All @@ -89,13 +90,15 @@ ImagingFilter3x3f_u8(Imaging imOut, Imaging im, const float* kernel,

ss0 = _mm_hadd_ps(ss0, ss0);
ss0 = _mm_hadd_ps(ss0, ss0);
ss0 = _mm_add_ps(ss0, mm_offset);
ssi0 = _mm_cvtps_epi32(ss0);
ssi0 = _mm_packs_epi32(ssi0, ssi0);
ssi0 = _mm_packus_epi16(ssi0, ssi0);
out0[x] = _mm_cvtsi128_si32(ssi0);

ss1 = _mm_hadd_ps(ss1, ss1);
ss1 = _mm_hadd_ps(ss1, ss1);
ss1 = _mm_add_ps(ss1, mm_offset);
ssi0 = _mm_cvtps_epi32(ss1);
ssi0 = _mm_packs_epi32(ssi0, ssi0);
ssi0 = _mm_packus_epi16(ssi0, ssi0);
Expand All @@ -122,6 +125,7 @@ ImagingFilter3x3f_u8(Imaging imOut, Imaging im, const float* kernel,

ss = _mm_hadd_ps(ss, ss);
ss = _mm_hadd_ps(ss, ss);
ss = _mm_add_ps(ss, mm_offset);
ssi0 = _mm_cvtps_epi32(ss);
ssi0 = _mm_packs_epi32(ssi0, ssi0);
ssi0 = _mm_packus_epi16(ssi0, ssi0);
Expand All @@ -139,6 +143,7 @@ ImagingFilter3x3f_u8(Imaging imOut, Imaging im, const float* kernel,

ss = _mm_hadd_ps(ss, ss);
ss = _mm_hadd_ps(ss, ss);
ss = _mm_add_ps(ss, mm_offset);
ssi0 = _mm_cvtps_epi32(ss);
ssi0 = _mm_packs_epi32(ssi0, ssi0);
ssi0 = _mm_packus_epi16(ssi0, ssi0);
Expand Down
3 changes: 1 addition & 2 deletions src/libImaging/FilterSIMD_5x5f_u8.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,7 @@ ImagingFilter5x5f_u8(Imaging imOut, Imaging im, const float* kernel,
pix4##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in_2[x]));

#define MM_KERNEL1x5_SUM(ss, row, krow) \
ss = _mm_setzero_ps(); \
ss = _mm_add_ps(ss, _mm_mul_ps(pix0##row, kernel0##krow)); \
ss = _mm_mul_ps(pix0##row, kernel0##krow); \
ss = _mm_add_ps(ss, _mm_mul_ps(pix1##row, kernel1##krow)); \
ss = _mm_add_ps(ss, _mm_mul_ps(pix2##row, kernel2##krow)); \
ss = _mm_add_ps(ss, _mm_mul_ps(pix3##row, kernel3##krow)); \
Expand Down

0 comments on commit bf7595a

Please sign in to comment.