Merge branch 'simd/master' into simd/9.1.x

uploadcare · Aug 10, 2024 · 9cadadd · 9cadadd
2 parents 0f44136 + 5967d99
commit 9cadadd
Show file tree

Hide file tree

Showing 29 changed files with 3,541 additions and 1,669 deletions.
diff --git a/.github/workflows/cifuzz.yml b/.github/workflows/cifuzz.yml
diff --git a/.github/workflows/test-docker.yml b/.github/workflows/test-docker.yml
@@ -10,10 +10,6 @@ jobs:
       fail-fast: false
       matrix:
         docker: [
-          # Run slower jobs first to give them a headstart and reduce waiting time
-          ubuntu-20.04-focal-arm64v8,
-          ubuntu-20.04-focal-ppc64le,
-          ubuntu-20.04-focal-s390x,
           # Then run the remainder
           alpine,
           amazon-2-amd64,
@@ -30,13 +26,6 @@ jobs:
           ubuntu-20.04-focal-amd64,
         ]
         dockerTag: [main]
-        include:
-          - docker: "ubuntu-20.04-focal-arm64v8"
-            qemu-arch: "aarch64"
-          - docker: "ubuntu-20.04-focal-ppc64le"
-            qemu-arch: "ppc64le"
-          - docker: "ubuntu-20.04-focal-s390x"
-            qemu-arch: "s390x"
 
     name: ${{ matrix.docker }}
 

diff --git a/CHANGES.SIMD.rst b/CHANGES.SIMD.rst
@@ -0,0 +1,144 @@
+Changelog (Pillow-SIMD)
+=======================
+
+9.0.0.post1
+-----------
+
+- Fixed possible overflow in LUT processing
+- Restored compatibility with Visual C Compiler
+
+
+7.0.0.post4
+-----------
+
+- Filter: fixed wrong offset handling for 3x3 single-band version
+
+7.0.0.post3
+-----------
+
+- ColorLUT: fixed potential access violation, up to 2x faster
+
+7.0.0.post2
+-----------
+
+- ColorLUT: SSE4 & AVX2
+
+7.0.0.post1 & 6.2.2.post1 & 6.1.0.post1 & 6.0.0.post2
+-----------------------------------------------------
+
+- Bands: fixed access violation in getband in some environments
+
+7.0.0.post0
+-----------
+
+- Reduce: SSE4
+
+6.0.0.post1
+-----------
+
+- GCC 9.0+: fixed unaligned read for ``_**_cvtepu8_epi32`` functions.
+
+6.0.0.post0 and 5.3.0.post1
+---------------------------
+
+- Resampling: Correct max coefficient calculation. Some rare combinations of
+  initial and requested sizes lead to black lines.
+
+4.3.0.post0
+-----------
+
+- Float-based filters, single-band: 3x3 SSE4, 5x5 SSE4
+- Float-based filters, multi-band: 3x3 SSE4 & AVX2, 5x5 SSE4
+- Int-based filters, multi-band: 3x3 SSE4 & AVX2, 5x5 SSE4 & AVX2
+- Box blur: fast path for radius < 1
+- Alpha composite: fast div approximation
+- Color conversion: RGB to L SSE4, fast div in RGBa to RGBA
+- Resampling: optimized coefficients loading
+- Split and get_channel: SSE4
+
+3.4.1.post1
+-----------
+
+- Fixed a critical memory error for some combinations of source/destination
+  sizes.
+
+3.4.1.post0
+-----------
+
+- A lot of optimizations in resampling including 16-bit
+  intermediate color representation and heavy unrolling.
+
+3.3.2.post0
+-----------
+
+- Maintenance release
+
+3.3.0.post2
+-----------
+
+- Fixed error in RGBa -> RGBA conversion
+
+3.3.0.post1
+-----------
+
+Alpha compositing
+~~~~~~~~~~~~~~~~~
+
+- SSE4 and AVX2 fixed-point full loading implementation.
+  Up to 4.6x faster.
+
+3.3.0.post0
+-----------
+
+Resampling
+~~~~~~~~~~
+
+- SSE4 and AVX2 fixed-point full loading horizontal pass.
+- SSE4 and AVX2 fixed-point full loading vertical pass.
+
+Conversion
+~~~~~~~~~~
+
+- RGBA -> RGBa SSE4 and AVX2 fixed-point full loading implementations.
+  Up to 2.6x faster.
+- RGBa -> RGBA AVX2 implementation using gather instructions.
+  Up to 5x faster.
+
+
+3.2.0.post3
+-----------
+
+Resampling
+~~~~~~~~~~
+
+- SSE4 and AVX2 float full loading horizontal pass.
+- SSE4 float full loading vertical pass.
+
+
+3.2.0.post2
+-----------
+
+Resampling
+~~~~~~~~~~
+
+- SSE4 and AVX2 float full loading horizontal pass.
+- SSE4 float per-pixel loading vertical pass.
+
+
+2.9.0.post1
+-----------
+
+Resampling
+~~~~~~~~~~
+
+- SSE4 and AVX2 float per-pixel loading horizontal pass.
+- SSE4 float per-pixel loading vertical pass.
+- SSE4: Up to 2x faster for downscaling. Up to 3.5x faster for upscaling.
+- AVX2: Up to 2.7x faster for downscaling. Up to 3.5x faster for upscaling.
+
+
+Box blur
+~~~~~~~~
+
+- Simple SSE4 fixed-point implementations with per-pixel loading.
+- Up to 2.1x faster.
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -17,6 +17,7 @@ graft src
 graft depends
 graft winbuild
 graft docs
+prune Tests
 
 # build/src control detritus
 exclude .appveyor.yml

diff --git a/PyPI.rst b/PyPI.rst
@@ -0,0 +1,6 @@
+
+`Pillow-SIMD repo and readme <https://github.com/uploadcare/pillow-simd>`_
+
+`Pillow-SIMD changelog <https://github.com/uploadcare/pillow-simd/blob/simd/master/CHANGES.SIMD.rst>`_
+
+`Pillow documentation <https://pillow.readthedocs.io/>`_