diff --git a/src/libImaging/Filter.c b/src/libImaging/Filter.c
index df3ca36307a..6b495a7d70b 100644
--- a/src/libImaging/Filter.c
+++ b/src/libImaging/Filter.c
@@ -26,6 +26,7 @@
 
 #include "Imaging.h"
 
+#if defined(__SSE4__)
 /* 5 is number of bits enought to account all kernel coefficients (1<<5 > 25).
    8 is number of bits in result.
    Any coefficients delta smaller than this precision will have no effect. */
@@ -41,6 +42,18 @@
 #include "FilterSIMD_5x5f_u8.c"
 #include "FilterSIMD_5x5f_4u8.c"
 #include "FilterSIMD_5x5i_4u8.c"
+#endif  // defined(__SSE4__)
+
+static inline UINT8
+clip8(float in) {
+    if (in <= 0.0) {
+        return 0;
+    }
+    if (in >= 255.0) {
+        return 255;
+    }
+    return (UINT8)in;
+}
 
 static inline INT32
 clip32(float in) {
@@ -110,14 +123,283 @@ ImagingExpand(Imaging imIn, int xmargin, int ymargin) {
     return imOut;
 }
 
+void
+ImagingFilter3x3(Imaging imOut, Imaging im, const float *kernel, float offset) {
+#define KERNEL1x3(in0, x, kernel, d)                               \
+    (_i2f(in0[x - d]) * (kernel)[0] + _i2f(in0[x]) * (kernel)[1] + \
+     _i2f(in0[x + d]) * (kernel)[2])
+
+    int x = 0, y = 0;
+
+    memcpy(imOut->image[0], im->image[0], im->linesize);
+    if (im->bands == 1) {
+        // Add one time for rounding
+        offset += 0.5;
+        if (im->type == IMAGING_TYPE_INT32) {
+            for (y = 1; y < im->ysize - 1; y++) {
+                INT32 *in_1 = (INT32 *)im->image[y - 1];
+                INT32 *in0 = (INT32 *)im->image[y];
+                INT32 *in1 = (INT32 *)im->image[y + 1];
+                INT32 *out = (INT32 *)imOut->image[y];
+
+                out[0] = in0[0];
+                for (x = 1; x < im->xsize - 1; x++) {
+                    float ss = offset;
+                    ss += KERNEL1x3(in1, x, &kernel[0], 1);
+                    ss += KERNEL1x3(in0, x, &kernel[3], 1);
+                    ss += KERNEL1x3(in_1, x, &kernel[6], 1);
+                    out[x] = clip32(ss);
+                }
+                out[x] = in0[x];
+            }
+        } else {
+            for (y = 1; y < im->ysize - 1; y++) {
+                UINT8 *in_1 = (UINT8 *)im->image[y - 1];
+                UINT8 *in0 = (UINT8 *)im->image[y];
+                UINT8 *in1 = (UINT8 *)im->image[y + 1];
+                UINT8 *out = (UINT8 *)imOut->image[y];
+
+                out[0] = in0[0];
+                for (x = 1; x < im->xsize - 1; x++) {
+                    float ss = offset;
+                    ss += KERNEL1x3(in1, x, &kernel[0], 1);
+                    ss += KERNEL1x3(in0, x, &kernel[3], 1);
+                    ss += KERNEL1x3(in_1, x, &kernel[6], 1);
+                    out[x] = clip8(ss);
+                }
+                out[x] = in0[x];
+            }
+        }
+    } else {
+        // Add one time for rounding
+        offset += 0.5;
+        for (y = 1; y < im->ysize - 1; y++) {
+            UINT8 *in_1 = (UINT8 *)im->image[y - 1];
+            UINT8 *in0 = (UINT8 *)im->image[y];
+            UINT8 *in1 = (UINT8 *)im->image[y + 1];
+            UINT8 *out = (UINT8 *)imOut->image[y];
+
+            memcpy(out, in0, sizeof(UINT32));
+            if (im->bands == 2) {
+                for (x = 1; x < im->xsize - 1; x++) {
+                    float ss0 = offset;
+                    float ss3 = offset;
+                    UINT32 v;
+                    ss0 += KERNEL1x3(in1, x * 4 + 0, &kernel[0], 4);
+                    ss3 += KERNEL1x3(in1, x * 4 + 3, &kernel[0], 4);
+                    ss0 += KERNEL1x3(in0, x * 4 + 0, &kernel[3], 4);
+                    ss3 += KERNEL1x3(in0, x * 4 + 3, &kernel[3], 4);
+                    ss0 += KERNEL1x3(in_1, x * 4 + 0, &kernel[6], 4);
+                    ss3 += KERNEL1x3(in_1, x * 4 + 3, &kernel[6], 4);
+                    v = MAKE_UINT32(clip8(ss0), 0, 0, clip8(ss3));
+                    memcpy(out + x * sizeof(v), &v, sizeof(v));
+                }
+            } else if (im->bands == 3) {
+                for (x = 1; x < im->xsize - 1; x++) {
+                    float ss0 = offset;
+                    float ss1 = offset;
+                    float ss2 = offset;
+                    UINT32 v;
+                    ss0 += KERNEL1x3(in1, x * 4 + 0, &kernel[0], 4);
+                    ss1 += KERNEL1x3(in1, x * 4 + 1, &kernel[0], 4);
+                    ss2 += KERNEL1x3(in1, x * 4 + 2, &kernel[0], 4);
+                    ss0 += KERNEL1x3(in0, x * 4 + 0, &kernel[3], 4);
+                    ss1 += KERNEL1x3(in0, x * 4 + 1, &kernel[3], 4);
+                    ss2 += KERNEL1x3(in0, x * 4 + 2, &kernel[3], 4);
+                    ss0 += KERNEL1x3(in_1, x * 4 + 0, &kernel[6], 4);
+                    ss1 += KERNEL1x3(in_1, x * 4 + 1, &kernel[6], 4);
+                    ss2 += KERNEL1x3(in_1, x * 4 + 2, &kernel[6], 4);
+                    v = MAKE_UINT32(clip8(ss0), clip8(ss1), clip8(ss2), 0);
+                    memcpy(out + x * sizeof(v), &v, sizeof(v));
+                }
+            } else if (im->bands == 4) {
+                for (x = 1; x < im->xsize - 1; x++) {
+                    float ss0 = offset;
+                    float ss1 = offset;
+                    float ss2 = offset;
+                    float ss3 = offset;
+                    UINT32 v;
+                    ss0 += KERNEL1x3(in1, x * 4 + 0, &kernel[0], 4);
+                    ss1 += KERNEL1x3(in1, x * 4 + 1, &kernel[0], 4);
+                    ss2 += KERNEL1x3(in1, x * 4 + 2, &kernel[0], 4);
+                    ss3 += KERNEL1x3(in1, x * 4 + 3, &kernel[0], 4);
+                    ss0 += KERNEL1x3(in0, x * 4 + 0, &kernel[3], 4);
+                    ss1 += KERNEL1x3(in0, x * 4 + 1, &kernel[3], 4);
+                    ss2 += KERNEL1x3(in0, x * 4 + 2, &kernel[3], 4);
+                    ss3 += KERNEL1x3(in0, x * 4 + 3, &kernel[3], 4);
+                    ss0 += KERNEL1x3(in_1, x * 4 + 0, &kernel[6], 4);
+                    ss1 += KERNEL1x3(in_1, x * 4 + 1, &kernel[6], 4);
+                    ss2 += KERNEL1x3(in_1, x * 4 + 2, &kernel[6], 4);
+                    ss3 += KERNEL1x3(in_1, x * 4 + 3, &kernel[6], 4);
+                    v = MAKE_UINT32(clip8(ss0), clip8(ss1), clip8(ss2), clip8(ss3));
+                    memcpy(out + x * sizeof(v), &v, sizeof(v));
+                }
+            }
+            memcpy(out + x * sizeof(UINT32), in0 + x * sizeof(UINT32), sizeof(UINT32));
+        }
+    }
+    memcpy(imOut->image[y], im->image[y], im->linesize);
+}
+
+void
+ImagingFilter5x5(Imaging imOut, Imaging im, const float *kernel, float offset) {
+#define KERNEL1x5(in0, x, kernel, d)                                       \
+    (_i2f(in0[x - d - d]) * (kernel)[0] + _i2f(in0[x - d]) * (kernel)[1] + \
+     _i2f(in0[x]) * (kernel)[2] + _i2f(in0[x + d]) * (kernel)[3] +         \
+     _i2f(in0[x + d + d]) * (kernel)[4])
+
+    int x = 0, y = 0;
+
+    memcpy(imOut->image[0], im->image[0], im->linesize);
+    memcpy(imOut->image[1], im->image[1], im->linesize);
+    if (im->bands == 1) {
+        // Add one time for rounding
+        offset += 0.5;
+        if (im->type == IMAGING_TYPE_INT32) {
+            for (y = 2; y < im->ysize - 2; y++) {
+                INT32 *in_2 = (INT32 *)im->image[y - 2];
+                INT32 *in_1 = (INT32 *)im->image[y - 1];
+                INT32 *in0 = (INT32 *)im->image[y];
+                INT32 *in1 = (INT32 *)im->image[y + 1];
+                INT32 *in2 = (INT32 *)im->image[y + 2];
+                INT32 *out = (INT32 *)imOut->image[y];
+
+                out[0] = in0[0];
+                out[1] = in0[1];
+                for (x = 2; x < im->xsize - 2; x++) {
+                    float ss = offset;
+                    ss += KERNEL1x5(in2, x, &kernel[0], 1);
+                    ss += KERNEL1x5(in1, x, &kernel[5], 1);
+                    ss += KERNEL1x5(in0, x, &kernel[10], 1);
+                    ss += KERNEL1x5(in_1, x, &kernel[15], 1);
+                    ss += KERNEL1x5(in_2, x, &kernel[20], 1);
+                    out[x] = clip32(ss);
+                }
+                out[x + 0] = in0[x + 0];
+                out[x + 1] = in0[x + 1];
+            }
+        } else {
+            for (y = 2; y < im->ysize - 2; y++) {
+                UINT8 *in_2 = (UINT8 *)im->image[y - 2];
+                UINT8 *in_1 = (UINT8 *)im->image[y - 1];
+                UINT8 *in0 = (UINT8 *)im->image[y];
+                UINT8 *in1 = (UINT8 *)im->image[y + 1];
+                UINT8 *in2 = (UINT8 *)im->image[y + 2];
+                UINT8 *out = (UINT8 *)imOut->image[y];
+
+                out[0] = in0[0];
+                out[1] = in0[1];
+                for (x = 2; x < im->xsize - 2; x++) {
+                    float ss = offset;
+                    ss += KERNEL1x5(in2, x, &kernel[0], 1);
+                    ss += KERNEL1x5(in1, x, &kernel[5], 1);
+                    ss += KERNEL1x5(in0, x, &kernel[10], 1);
+                    ss += KERNEL1x5(in_1, x, &kernel[15], 1);
+                    ss += KERNEL1x5(in_2, x, &kernel[20], 1);
+                    out[x] = clip8(ss);
+                }
+                out[x + 0] = in0[x + 0];
+                out[x + 1] = in0[x + 1];
+            }
+        }
+    } else {
+        // Add one time for rounding
+        offset += 0.5;
+        for (y = 2; y < im->ysize - 2; y++) {
+            UINT8 *in_2 = (UINT8 *)im->image[y - 2];
+            UINT8 *in_1 = (UINT8 *)im->image[y - 1];
+            UINT8 *in0 = (UINT8 *)im->image[y];
+            UINT8 *in1 = (UINT8 *)im->image[y + 1];
+            UINT8 *in2 = (UINT8 *)im->image[y + 2];
+            UINT8 *out = (UINT8 *)imOut->image[y];
+
+            memcpy(out, in0, sizeof(UINT32) * 2);
+            if (im->bands == 2) {
+                for (x = 2; x < im->xsize - 2; x++) {
+                    float ss0 = offset;
+                    float ss3 = offset;
+                    UINT32 v;
+                    ss0 += KERNEL1x5(in2, x * 4 + 0, &kernel[0], 4);
+                    ss3 += KERNEL1x5(in2, x * 4 + 3, &kernel[0], 4);
+                    ss0 += KERNEL1x5(in1, x * 4 + 0, &kernel[5], 4);
+                    ss3 += KERNEL1x5(in1, x * 4 + 3, &kernel[5], 4);
+                    ss0 += KERNEL1x5(in0, x * 4 + 0, &kernel[10], 4);
+                    ss3 += KERNEL1x5(in0, x * 4 + 3, &kernel[10], 4);
+                    ss0 += KERNEL1x5(in_1, x * 4 + 0, &kernel[15], 4);
+                    ss3 += KERNEL1x5(in_1, x * 4 + 3, &kernel[15], 4);
+                    ss0 += KERNEL1x5(in_2, x * 4 + 0, &kernel[20], 4);
+                    ss3 += KERNEL1x5(in_2, x * 4 + 3, &kernel[20], 4);
+                    v = MAKE_UINT32(clip8(ss0), 0, 0, clip8(ss3));
+                    memcpy(out + x * sizeof(v), &v, sizeof(v));
+                }
+            } else if (im->bands == 3) {
+                for (x = 2; x < im->xsize - 2; x++) {
+                    float ss0 = offset;
+                    float ss1 = offset;
+                    float ss2 = offset;
+                    UINT32 v;
+                    ss0 += KERNEL1x5(in2, x * 4 + 0, &kernel[0], 4);
+                    ss1 += KERNEL1x5(in2, x * 4 + 1, &kernel[0], 4);
+                    ss2 += KERNEL1x5(in2, x * 4 + 2, &kernel[0], 4);
+                    ss0 += KERNEL1x5(in1, x * 4 + 0, &kernel[5], 4);
+                    ss1 += KERNEL1x5(in1, x * 4 + 1, &kernel[5], 4);
+                    ss2 += KERNEL1x5(in1, x * 4 + 2, &kernel[5], 4);
+                    ss0 += KERNEL1x5(in0, x * 4 + 0, &kernel[10], 4);
+                    ss1 += KERNEL1x5(in0, x * 4 + 1, &kernel[10], 4);
+                    ss2 += KERNEL1x5(in0, x * 4 + 2, &kernel[10], 4);
+                    ss0 += KERNEL1x5(in_1, x * 4 + 0, &kernel[15], 4);
+                    ss1 += KERNEL1x5(in_1, x * 4 + 1, &kernel[15], 4);
+                    ss2 += KERNEL1x5(in_1, x * 4 + 2, &kernel[15], 4);
+                    ss0 += KERNEL1x5(in_2, x * 4 + 0, &kernel[20], 4);
+                    ss1 += KERNEL1x5(in_2, x * 4 + 1, &kernel[20], 4);
+                    ss2 += KERNEL1x5(in_2, x * 4 + 2, &kernel[20], 4);
+                    v = MAKE_UINT32(clip8(ss0), clip8(ss1), clip8(ss2), 0);
+                    memcpy(out + x * sizeof(v), &v, sizeof(v));
+                }
+            } else if (im->bands == 4) {
+                for (x = 2; x < im->xsize - 2; x++) {
+                    float ss0 = offset;
+                    float ss1 = offset;
+                    float ss2 = offset;
+                    float ss3 = offset;
+                    UINT32 v;
+                    ss0 += KERNEL1x5(in2, x * 4 + 0, &kernel[0], 4);
+                    ss1 += KERNEL1x5(in2, x * 4 + 1, &kernel[0], 4);
+                    ss2 += KERNEL1x5(in2, x * 4 + 2, &kernel[0], 4);
+                    ss3 += KERNEL1x5(in2, x * 4 + 3, &kernel[0], 4);
+                    ss0 += KERNEL1x5(in1, x * 4 + 0, &kernel[5], 4);
+                    ss1 += KERNEL1x5(in1, x * 4 + 1, &kernel[5], 4);
+                    ss2 += KERNEL1x5(in1, x * 4 + 2, &kernel[5], 4);
+                    ss3 += KERNEL1x5(in1, x * 4 + 3, &kernel[5], 4);
+                    ss0 += KERNEL1x5(in0, x * 4 + 0, &kernel[10], 4);
+                    ss1 += KERNEL1x5(in0, x * 4 + 1, &kernel[10], 4);
+                    ss2 += KERNEL1x5(in0, x * 4 + 2, &kernel[10], 4);
+                    ss3 += KERNEL1x5(in0, x * 4 + 3, &kernel[10], 4);
+                    ss0 += KERNEL1x5(in_1, x * 4 + 0, &kernel[15], 4);
+                    ss1 += KERNEL1x5(in_1, x * 4 + 1, &kernel[15], 4);
+                    ss2 += KERNEL1x5(in_1, x * 4 + 2, &kernel[15], 4);
+                    ss3 += KERNEL1x5(in_1, x * 4 + 3, &kernel[15], 4);
+                    ss0 += KERNEL1x5(in_2, x * 4 + 0, &kernel[20], 4);
+                    ss1 += KERNEL1x5(in_2, x * 4 + 1, &kernel[20], 4);
+                    ss2 += KERNEL1x5(in_2, x * 4 + 2, &kernel[20], 4);
+                    ss3 += KERNEL1x5(in_2, x * 4 + 3, &kernel[20], 4);
+                    v = MAKE_UINT32(clip8(ss0), clip8(ss1), clip8(ss2), clip8(ss3));
+                    memcpy(out + x * sizeof(v), &v, sizeof(v));
+                }
+            }
+            memcpy(
+                out + x * sizeof(UINT32), in0 + x * sizeof(UINT32), sizeof(UINT32) * 2
+            );
+        }
+    }
+    memcpy(imOut->image[y], im->image[y], im->linesize);
+    memcpy(imOut->image[y + 1], im->image[y + 1], im->linesize);
+}
+
 Imaging
 ImagingFilter(Imaging im, int xsize, int ysize, const FLOAT32 *kernel, FLOAT32 offset) {
-    ImagingSectionCookie cookie;
     Imaging imOut;
-    int i, fast_path = 1;
-    FLOAT32 tmp;
-    INT16 norm_kernel[25];
-    INT32 norm_offset;
+    ImagingSectionCookie cookie;
 
     if (im->type != IMAGING_TYPE_UINT8 && im->type != IMAGING_TYPE_INT32) {
         return (Imaging)ImagingError_ModeError();
@@ -136,49 +418,67 @@ ImagingFilter(Imaging im, int xsize, int ysize, const FLOAT32 *kernel, FLOAT32 o
         return NULL;
     }
 
-    for (i = 0; i < xsize * ysize; i++) {
-        tmp = kernel[i] * (1 << PRECISION_BITS);
+    ImagingSectionEnter(&cookie);
+#if defined(__SSE4__)
+    {
+        int i, fast_path = 1;
+        FLOAT32 tmp;
+        INT16 norm_kernel[25];
+        INT32 norm_offset;
+
+        for (i = 0; i < xsize * ysize; i++) {
+            tmp = kernel[i] * (1 << PRECISION_BITS);
+            tmp += (tmp >= 0) ? 0.5 : -0.5;
+            if (tmp >= 32768 || tmp <= -32769) {
+                fast_path = 0;
+                break;
+            }
+            norm_kernel[i] = tmp;
+        }
+        tmp = offset * (1 << PRECISION_BITS);
         tmp += (tmp >= 0) ? 0.5 : -0.5;
-        if (tmp >= 32768 || tmp <= -32769) {
-            fast_path = 0;
-            break;
+        if (tmp >= 1 << 30) {
+            norm_offset = 1 << 30;
+        } else if (tmp <= -1 << 30) {
+            norm_offset = -1 << 30;
+        } else {
+            norm_offset = tmp;
         }
-        norm_kernel[i] = tmp;
-    }
-    tmp = offset * (1 << PRECISION_BITS);
-    tmp += (tmp >= 0) ? 0.5 : -0.5;
-    if (tmp >= 1 << 30) {
-        norm_offset = 1 << 30;
-    } else if (tmp <= -1 << 30) {
-        norm_offset = -1 << 30;
-    } else {
-        norm_offset = tmp;
-    }
 
-    ImagingSectionEnter(&cookie);
-    if (xsize == 3) {
-        /* 3x3 kernel. */
-        if (im->image8) {
-            ImagingFilter3x3f_u8(imOut, im, kernel, offset);
-        } else {
-            if (fast_path) {
-                ImagingFilter3x3i_4u8(imOut, im, norm_kernel, norm_offset);
+        if (xsize == 3) {
+            /* 3x3 kernel. */
+            if (im->image8) {
+                ImagingFilter3x3f_u8(imOut, im, kernel, offset);
             } else {
-                ImagingFilter3x3f_4u8(imOut, im, kernel, offset);
+                if (fast_path) {
+                    ImagingFilter3x3i_4u8(imOut, im, norm_kernel, norm_offset);
+                } else {
+                    ImagingFilter3x3f_4u8(imOut, im, kernel, offset);
+                }
             }
-        }
-    } else {
-        /* 5x5 kernel. */
-        if (im->image8) {
-            ImagingFilter5x5f_u8(imOut, im, kernel, offset);
         } else {
-            if (fast_path) {
-                ImagingFilter5x5i_4u8(imOut, im, norm_kernel, norm_offset);
+            /* 5x5 kernel. */
+            if (im->image8) {
+                ImagingFilter5x5f_u8(imOut, im, kernel, offset);
             } else {
-                ImagingFilter5x5f_4u8(imOut, im, kernel, offset);
+                if (fast_path) {
+                    ImagingFilter5x5i_4u8(imOut, im, norm_kernel, norm_offset);
+                } else {
+                    ImagingFilter5x5f_4u8(imOut, im, kernel, offset);
+                }
             }
         }
     }
+#else
+    if (xsize == 3) {
+        /* 3x3 kernel. */
+        ImagingFilter3x3(imOut, im, kernel, offset);
+    } else {
+        /* 5x5 kernel. */
+        ImagingFilter5x5(imOut, im, kernel, offset);
+    }
+#endif
+
     ImagingSectionLeave(&cookie);
     return imOut;
 }