From d3ee5bddde9691f8ea5c52aa363fae88e4472992 Mon Sep 17 00:00:00 2001 From: Geoffrey McRae Date: Sun, 19 Nov 2023 15:45:15 +1100 Subject: [PATCH] [common] rect/framebuffer: improve avx implementations --- common/include/common/rects.h | 3 ++- common/src/framebuffer.c | 44 ++++++++++++++++------------------- common/src/rects.c | 37 +++++++++++++++++++++-------- 3 files changed, 49 insertions(+), 35 deletions(-) diff --git a/common/include/common/rects.h b/common/include/common/rects.h index df7e93fbb..7e08be133 100644 --- a/common/include/common/rects.h +++ b/common/include/common/rects.h @@ -27,7 +27,8 @@ #include "common/framebuffer.h" #include "common/types.h" -extern void (*rectCopyUnaligned)(uint8_t * dst, const uint8_t * src, +extern void (*rectCopyUnaligned)( + uint8_t *restrict dst, const uint8_t *restrict src, int ystart, int yend, int dx, int dstPitch, int srcPitch, int width); void rectsBufferToFramebuffer(FrameDamageRect * rects, int count, int bpp, diff --git a/common/src/framebuffer.c b/common/src/framebuffer.c index a8233d821..24d507c8c 100644 --- a/common/src/framebuffer.c +++ b/common/src/framebuffer.c @@ -251,22 +251,20 @@ bool framebuffer_write_avx2(FrameBuffer * frame, /* copy in chunks */ while (size > 127) { - __m256i *_d = (__m256i *)d; - __m256i *_s = (__m256i *)s; - __m256i v1 = _mm256_stream_load_si256(_s + 0); - __m256i v2 = _mm256_stream_load_si256(_s + 1); - __m256i v3 = _mm256_stream_load_si256(_s + 2); - __m256i v4 = _mm256_stream_load_si256(_s + 3); - - _mm256_stream_si256(_d + 0, v1); - _mm256_stream_si256(_d + 1, v2); - _mm256_stream_si256(_d + 2, v3); - _mm256_stream_si256(_d + 3, v4); - - s += 4; - d += 4; + __m256i v1 = _mm256_stream_load_si256(s + 0); + __m256i v2 = _mm256_stream_load_si256(s + 1); + __m256i v3 = _mm256_stream_load_si256(s + 2); + __m256i v4 = _mm256_stream_load_si256(s + 3); + + _mm256_stream_si256(d + 0, v1); + _mm256_stream_si256(d + 1, v2); + _mm256_stream_si256(d + 2, v3); + _mm256_stream_si256(d + 3, v4); + + s += 4; + d += 4; size -= 128; - wp += 128; + wp += 128; if (wp % FB_CHUNK_SIZE == 0) atomic_store_explicit(&frame->wp, wp, memory_order_release); @@ -274,18 +272,16 @@ bool framebuffer_write_avx2(FrameBuffer * frame, if (size > 63) { - __m256i *_d = (__m256i *)d; - __m256i *_s = (__m256i *)s; - __m256i v1 = _mm256_stream_load_si256(_s); - __m256i v2 = _mm256_stream_load_si256(_s + 1); + __m256i v1 = _mm256_stream_load_si256(s); + __m256i v2 = _mm256_stream_load_si256(s + 1); - _mm256_stream_si256(_d, v1); - _mm256_stream_si256(_d + 1, v2); + _mm256_stream_si256(d, v1); + _mm256_stream_si256(d + 1, v2); - s += 2; - d += 2; + s += 2; + d += 2; size -= 64; - wp += 64; + wp += 64; if (wp % FB_CHUNK_SIZE == 0) atomic_store_explicit(&frame->wp, wp, memory_order_release); diff --git a/common/src/rects.c b/common/src/rects.c index c8a69cf05..d86ab7fd6 100644 --- a/common/src/rects.c +++ b/common/src/rects.c @@ -301,7 +301,8 @@ int rectsRejectContained(FrameDamageRect * rects, int count) return removeRects(rects, count, removed); } -static void rectCopyUnaligned_memcpy(uint8_t * dst, const uint8_t * src, +static void rectCopyUnaligned_memcpy( + uint8_t *restrict dst, const uint8_t *restrict src, int ystart, int yend, int dx, int dstPitch, int srcPitch, int width) { src += ystart * srcPitch + dx; @@ -320,27 +321,42 @@ static void rectCopyUnaligned_memcpy(uint8_t * dst, const uint8_t * src, #pragma GCC push_options #pragma GCC target ("avx") #endif -static void rectCopyUnaligned_avx(uint8_t * dst, const uint8_t * src, +static void rectCopyUnaligned_avx( + uint8_t *restrict dst, const uint8_t *restrict src, int ystart, int yend, int dx, int dstPitch, int srcPitch, int width) { src += ystart * srcPitch + dx; dst += ystart * dstPitch + dx; + + const int nvec = width / sizeof(__m256i); + const int rem = width % sizeof(__m256i); + for (int i = ystart; i < yend; ++i) { - int col; - for(col = 0; col <= width - 32; col += 32) + const __m256i *restrict s = (__m256i*)src; + __m256i *restrict d = (__m256i*)dst; + + int vec; + for(vec = nvec; vec > 3; vec -= 4) { - _mm_prefetch(src + col + 256, _MM_HINT_T0); - __m256i srcData = _mm256_loadu_si256((__m256i*)(src + col)); - _mm256_storeu_si256((__m256i*)(dst + col), srcData); + _mm256_stream_si256(d + 0, _mm256_load_si256(s + 0)); + _mm256_stream_si256(d + 1, _mm256_load_si256(s + 1)); + _mm256_stream_si256(d + 2, _mm256_load_si256(s + 2)); + _mm256_stream_si256(d + 3, _mm256_load_si256(s + 3)); + + s += 4; + d += 4; } - for(; col < width; ++col) + for(; vec > 0; --vec, ++d, ++s) + _mm256_stream_si256(d, _mm256_load_si256(s)); + + for(int col = width - rem; col < width; ++col) dst[col] = src[col]; src += srcPitch; dst += dstPitch; - } + } } #ifdef __clang__ #pragma clang attribute pop @@ -348,7 +364,8 @@ static void rectCopyUnaligned_avx(uint8_t * dst, const uint8_t * src, #pragma GCC pop_options #endif -static void _rectCopyUnaligned(uint8_t * dst, const uint8_t * src, +static void _rectCopyUnaligned( + uint8_t *restrict dst, const uint8_t *restrict src, int ystart, int yend, int dx, int dstPitch, int srcPitch, int width) { if (cpuInfo_getFeatures()->avx)