Skip to content

Commit

Permalink
[common] rect/framebuffer: improve avx implementations
Browse files Browse the repository at this point in the history
  • Loading branch information
gnif committed Nov 19, 2023
1 parent 0ce4c34 commit d3ee5bd
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 35 deletions.
3 changes: 2 additions & 1 deletion common/include/common/rects.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@
#include "common/framebuffer.h"
#include "common/types.h"

extern void (*rectCopyUnaligned)(uint8_t * dst, const uint8_t * src,
extern void (*rectCopyUnaligned)(
uint8_t *restrict dst, const uint8_t *restrict src,
int ystart, int yend, int dx, int dstPitch, int srcPitch, int width);

void rectsBufferToFramebuffer(FrameDamageRect * rects, int count, int bpp,
Expand Down
44 changes: 20 additions & 24 deletions common/src/framebuffer.c
Original file line number Diff line number Diff line change
Expand Up @@ -251,41 +251,37 @@ bool framebuffer_write_avx2(FrameBuffer * frame,
/* copy in chunks */
while (size > 127)
{
__m256i *_d = (__m256i *)d;
__m256i *_s = (__m256i *)s;
__m256i v1 = _mm256_stream_load_si256(_s + 0);
__m256i v2 = _mm256_stream_load_si256(_s + 1);
__m256i v3 = _mm256_stream_load_si256(_s + 2);
__m256i v4 = _mm256_stream_load_si256(_s + 3);

_mm256_stream_si256(_d + 0, v1);
_mm256_stream_si256(_d + 1, v2);
_mm256_stream_si256(_d + 2, v3);
_mm256_stream_si256(_d + 3, v4);

s += 4;
d += 4;
__m256i v1 = _mm256_stream_load_si256(s + 0);
__m256i v2 = _mm256_stream_load_si256(s + 1);
__m256i v3 = _mm256_stream_load_si256(s + 2);
__m256i v4 = _mm256_stream_load_si256(s + 3);

_mm256_stream_si256(d + 0, v1);
_mm256_stream_si256(d + 1, v2);
_mm256_stream_si256(d + 2, v3);
_mm256_stream_si256(d + 3, v4);

s += 4;
d += 4;
size -= 128;
wp += 128;
wp += 128;

if (wp % FB_CHUNK_SIZE == 0)
atomic_store_explicit(&frame->wp, wp, memory_order_release);
}

if (size > 63)
{
__m256i *_d = (__m256i *)d;
__m256i *_s = (__m256i *)s;
__m256i v1 = _mm256_stream_load_si256(_s);
__m256i v2 = _mm256_stream_load_si256(_s + 1);
__m256i v1 = _mm256_stream_load_si256(s);
__m256i v2 = _mm256_stream_load_si256(s + 1);

_mm256_stream_si256(_d, v1);
_mm256_stream_si256(_d + 1, v2);
_mm256_stream_si256(d, v1);
_mm256_stream_si256(d + 1, v2);

s += 2;
d += 2;
s += 2;
d += 2;
size -= 64;
wp += 64;
wp += 64;

if (wp % FB_CHUNK_SIZE == 0)
atomic_store_explicit(&frame->wp, wp, memory_order_release);
Expand Down
37 changes: 27 additions & 10 deletions common/src/rects.c
Original file line number Diff line number Diff line change
Expand Up @@ -301,7 +301,8 @@ int rectsRejectContained(FrameDamageRect * rects, int count)
return removeRects(rects, count, removed);
}

static void rectCopyUnaligned_memcpy(uint8_t * dst, const uint8_t * src,
static void rectCopyUnaligned_memcpy(
uint8_t *restrict dst, const uint8_t *restrict src,
int ystart, int yend, int dx, int dstPitch, int srcPitch, int width)
{
src += ystart * srcPitch + dx;
Expand All @@ -320,35 +321,51 @@ static void rectCopyUnaligned_memcpy(uint8_t * dst, const uint8_t * src,
#pragma GCC push_options
#pragma GCC target ("avx")
#endif
static void rectCopyUnaligned_avx(uint8_t * dst, const uint8_t * src,
static void rectCopyUnaligned_avx(
uint8_t *restrict dst, const uint8_t *restrict src,
int ystart, int yend, int dx, int dstPitch, int srcPitch, int width)
{
src += ystart * srcPitch + dx;
dst += ystart * dstPitch + dx;

const int nvec = width / sizeof(__m256i);
const int rem = width % sizeof(__m256i);

for (int i = ystart; i < yend; ++i)
{
int col;
for(col = 0; col <= width - 32; col += 32)
const __m256i *restrict s = (__m256i*)src;
__m256i *restrict d = (__m256i*)dst;

int vec;
for(vec = nvec; vec > 3; vec -= 4)
{
_mm_prefetch(src + col + 256, _MM_HINT_T0);
__m256i srcData = _mm256_loadu_si256((__m256i*)(src + col));
_mm256_storeu_si256((__m256i*)(dst + col), srcData);
_mm256_stream_si256(d + 0, _mm256_load_si256(s + 0));
_mm256_stream_si256(d + 1, _mm256_load_si256(s + 1));
_mm256_stream_si256(d + 2, _mm256_load_si256(s + 2));
_mm256_stream_si256(d + 3, _mm256_load_si256(s + 3));

s += 4;
d += 4;
}

for(; col < width; ++col)
for(; vec > 0; --vec, ++d, ++s)
_mm256_stream_si256(d, _mm256_load_si256(s));

for(int col = width - rem; col < width; ++col)
dst[col] = src[col];

src += srcPitch;
dst += dstPitch;
}
}
}
#ifdef __clang__
#pragma clang attribute pop
#else
#pragma GCC pop_options
#endif

static void _rectCopyUnaligned(uint8_t * dst, const uint8_t * src,
static void _rectCopyUnaligned(
uint8_t *restrict dst, const uint8_t *restrict src,
int ystart, int yend, int dx, int dstPitch, int srcPitch, int width)
{
if (cpuInfo_getFeatures()->avx)
Expand Down

0 comments on commit d3ee5bd

Please sign in to comment.