From 3a238de99814ae05094d92b7ac62be70f6aba19c Mon Sep 17 00:00:00 2001 From: Martin Pulec Date: Wed, 18 Sep 2024 12:11:42 +0200 Subject: [PATCH] vdisp/sdl: R10k conversion optimized Create an inner loop with a fixed number of iterations (16). This allows the compiler to unroll the inner loop and vectorize it (16 iterations of 4 bytes each is 512 bits, allowing up to 512-bit instructions). Any remainder (count % 16 != 0) is still computed per pixel, as it used to be. --- src/video_display/sdl2.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/video_display/sdl2.c b/src/video_display/sdl2.c index 517caebd1..4c4dd7f01 100644 --- a/src/video_display/sdl2.c +++ b/src/video_display/sdl2.c @@ -833,7 +833,17 @@ static struct video_frame *display_sdl2_getf(void *state) static void r10k_to_sdl2(size_t count, uint32_t *buf) { + enum { + LOOP_ITEMS = 16, + }; unsigned int i = 0; + for (; i < count / LOOP_ITEMS; ++i) { + for (int j = 0; j < LOOP_ITEMS; ++j) { + uint32_t val = htonl(*buf); + *buf++ = val >> 2; + } + } + i *= LOOP_ITEMS; for (; i < count; ++i) { uint32_t val = htonl(*buf); *buf++ = val >> 2;