From d3ee5bddde9691f8ea5c52aa363fae88e4472992 Mon Sep 17 00:00:00 2001
From: Geoffrey McRae <geoff@hostfission.com>
Date: Sun, 19 Nov 2023 15:45:15 +1100
Subject: [PATCH] [common] rect/framebuffer: improve avx implementations

---
 common/include/common/rects.h |  3 ++-
 common/src/framebuffer.c      | 44 ++++++++++++++++-------------------
 common/src/rects.c            | 37 +++++++++++++++++++++--------
 3 files changed, 49 insertions(+), 35 deletions(-)

diff --git a/common/include/common/rects.h b/common/include/common/rects.h
index df7e93fbb..7e08be133 100644
--- a/common/include/common/rects.h
+++ b/common/include/common/rects.h
@@ -27,7 +27,8 @@
 #include "common/framebuffer.h"
 #include "common/types.h"
 
-extern void (*rectCopyUnaligned)(uint8_t * dst, const uint8_t * src,
+extern void (*rectCopyUnaligned)(
+    uint8_t *restrict dst, const uint8_t *restrict src,
     int ystart, int yend, int dx, int dstPitch, int srcPitch, int width);
 
 void rectsBufferToFramebuffer(FrameDamageRect * rects, int count, int bpp,
diff --git a/common/src/framebuffer.c b/common/src/framebuffer.c
index a8233d821..24d507c8c 100644
--- a/common/src/framebuffer.c
+++ b/common/src/framebuffer.c
@@ -251,22 +251,20 @@ bool framebuffer_write_avx2(FrameBuffer * frame,
   /* copy in chunks */
   while (size > 127)
   {
-    __m256i *_d = (__m256i *)d;
-    __m256i *_s = (__m256i *)s;
-    __m256i v1 = _mm256_stream_load_si256(_s + 0);
-    __m256i v2 = _mm256_stream_load_si256(_s + 1);
-    __m256i v3 = _mm256_stream_load_si256(_s + 2);
-    __m256i v4 = _mm256_stream_load_si256(_s + 3);
-
-    _mm256_stream_si256(_d + 0, v1);
-    _mm256_stream_si256(_d + 1, v2);
-    _mm256_stream_si256(_d + 2, v3);
-    _mm256_stream_si256(_d + 3, v4);
-
-    s += 4;
-    d += 4;
+    __m256i v1 = _mm256_stream_load_si256(s + 0);
+    __m256i v2 = _mm256_stream_load_si256(s + 1);
+    __m256i v3 = _mm256_stream_load_si256(s + 2);
+    __m256i v4 = _mm256_stream_load_si256(s + 3);
+
+    _mm256_stream_si256(d + 0, v1);
+    _mm256_stream_si256(d + 1, v2);
+    _mm256_stream_si256(d + 2, v3);
+    _mm256_stream_si256(d + 3, v4);
+
+    s    += 4;
+    d    += 4;
     size -= 128;
-    wp += 128;
+    wp   += 128;
 
     if (wp % FB_CHUNK_SIZE == 0)
       atomic_store_explicit(&frame->wp, wp, memory_order_release);
@@ -274,18 +272,16 @@ bool framebuffer_write_avx2(FrameBuffer * frame,
 
   if (size > 63)
   {
-    __m256i *_d = (__m256i *)d;
-    __m256i *_s = (__m256i *)s;
-    __m256i v1 = _mm256_stream_load_si256(_s);
-    __m256i v2 = _mm256_stream_load_si256(_s + 1);
+    __m256i v1 = _mm256_stream_load_si256(s);
+    __m256i v2 = _mm256_stream_load_si256(s + 1);
 
-    _mm256_stream_si256(_d, v1);
-    _mm256_stream_si256(_d + 1, v2);
+    _mm256_stream_si256(d, v1);
+    _mm256_stream_si256(d + 1, v2);
 
-    s += 2;
-    d += 2;
+    s    += 2;
+    d    += 2;
     size -= 64;
-    wp += 64;
+    wp   += 64;
 
     if (wp % FB_CHUNK_SIZE == 0)
       atomic_store_explicit(&frame->wp, wp, memory_order_release);
diff --git a/common/src/rects.c b/common/src/rects.c
index c8a69cf05..d86ab7fd6 100644
--- a/common/src/rects.c
+++ b/common/src/rects.c
@@ -301,7 +301,8 @@ int rectsRejectContained(FrameDamageRect * rects, int count)
   return removeRects(rects, count, removed);
 }
 
-static void rectCopyUnaligned_memcpy(uint8_t * dst, const uint8_t * src,
+static void rectCopyUnaligned_memcpy(
+    uint8_t *restrict dst, const uint8_t *restrict src,
     int ystart, int yend, int dx, int dstPitch, int srcPitch, int width)
 {
   src += ystart * srcPitch + dx;
@@ -320,27 +321,42 @@ static void rectCopyUnaligned_memcpy(uint8_t * dst, const uint8_t * src,
   #pragma GCC push_options
   #pragma GCC target ("avx")
 #endif
-static void rectCopyUnaligned_avx(uint8_t * dst, const uint8_t * src,
+static void rectCopyUnaligned_avx(
+    uint8_t *restrict dst, const uint8_t *restrict src,
     int ystart, int yend, int dx, int dstPitch, int srcPitch, int width)
 {
   src += ystart * srcPitch + dx;
   dst += ystart * dstPitch + dx;
+
+  const int nvec = width / sizeof(__m256i);
+  const int rem  = width % sizeof(__m256i);
+
   for (int i = ystart; i < yend; ++i)
   {
-    int col;
-    for(col = 0; col <= width - 32; col += 32)
+    const __m256i *restrict s = (__m256i*)src;
+          __m256i *restrict d = (__m256i*)dst;
+
+    int vec;
+    for(vec = nvec; vec > 3; vec -= 4)
     {
-      _mm_prefetch(src + col + 256, _MM_HINT_T0);
-      __m256i srcData = _mm256_loadu_si256((__m256i*)(src + col));
-      _mm256_storeu_si256((__m256i*)(dst + col), srcData);
+      _mm256_stream_si256(d + 0, _mm256_load_si256(s + 0));
+      _mm256_stream_si256(d + 1, _mm256_load_si256(s + 1));
+      _mm256_stream_si256(d + 2, _mm256_load_si256(s + 2));
+      _mm256_stream_si256(d + 3, _mm256_load_si256(s + 3));
+
+      s += 4;
+      d += 4;
     }
 
-    for(; col < width; ++col)
+    for(; vec > 0; --vec, ++d, ++s)
+      _mm256_stream_si256(d, _mm256_load_si256(s));
+
+    for(int col = width - rem; col < width; ++col)
       dst[col] = src[col];
 
     src += srcPitch;
     dst += dstPitch;
- }
+  }
 }
 #ifdef __clang__
   #pragma clang attribute pop
@@ -348,7 +364,8 @@ static void rectCopyUnaligned_avx(uint8_t * dst, const uint8_t * src,
   #pragma GCC pop_options
 #endif
 
-static void _rectCopyUnaligned(uint8_t * dst, const uint8_t * src,
+static void _rectCopyUnaligned(
+  uint8_t *restrict dst, const uint8_t *restrict src,
     int ystart, int yend, int dx, int dstPitch, int srcPitch, int width)
 {
   if (cpuInfo_getFeatures()->avx)