diff --git a/.github/scripts/Linux/arm/bootstrap.sh b/.github/scripts/Linux/arm/bootstrap.sh
index 181cf3a1e3..de30166981 100755
--- a/.github/scripts/Linux/arm/bootstrap.sh
+++ b/.github/scripts/Linux/arm/bootstrap.sh
@@ -40,23 +40,7 @@ apt -y install libcaca-dev libmagickwand-dev libnatpmp-dev libopencv-core-dev li
 /.github/scripts/Linux/install_others.sh ximea
 
 # FFmpeg
-if [ "$ARCH" = armhf ]; then # Raspbian - build own FFmpeg with OMX camera patch
-        apt -y install libraspberrypi-dev libdrm-dev
-        sed -i '/^deb /p;s/deb /deb-src /' /etc/apt/sources.list
-        apt -y update && apt -y build-dep ffmpeg
-        raspbian_build_sdl2
-        apt -y remove libavcodec58 && apt -y autoremove
-        git clone --depth 1 -b n4.3.3 https://github.com/FFmpeg/FFmpeg.git && cd FFmpeg
-
-        # apply patches
-        find /.github/scripts/Linux/arm/ffmpeg-arm-patches -name '*.patch' -print0 | sort -z | xargs -0 -n 1 git apply
-
-        ./configure --enable-gpl --disable-stripping --enable-libaom --enable-libmp3lame --enable-libopenjpeg --enable-libopus --enable-libspeex --enable-libvpx --enable-libwebp --enable-libx265 --enable-omx --enable-neon --enable-libx264 --enable-mmal --enable-omx-rpi --enable-rpi --enable-vout-drm --enable-libdrm --enable-v4l2-request --enable-libudev --cpu=arm1176jzf-s --enable-shared --disable-static
-        make -j3 install
-        cd "$OLDPWD"
-else
-        apt -y install libavcodec-dev libavformat-dev libsdl2-dev libswscale-dev
-fi
+apt -y install libavcodec-dev libavformat-dev libsdl2-dev libswscale-dev
 
 # mkappimage
 mkai_arch=$(dpkg --print-architecture)
diff --git a/.github/scripts/Linux/arm/ffmpeg-arm-patches/0001-avcodec-arm-sbcenc-avoid-callee-preserved-vfp-regist.patch b/.github/scripts/Linux/arm/ffmpeg-arm-patches/0001-avcodec-arm-sbcenc-avoid-callee-preserved-vfp-regist.patch
deleted file mode 100644
index 6a91596ad1..0000000000
--- a/.github/scripts/Linux/arm/ffmpeg-arm-patches/0001-avcodec-arm-sbcenc-avoid-callee-preserved-vfp-regist.patch
+++ /dev/null
@@ -1,287 +0,0 @@
-From: James Cowgill <jcowgill@debian.org>
-Date: Sun, 11 Aug 2019 16:50:56 +0100
-Subject: avcodec/arm/sbcenc: avoid callee preserved vfp registers
-
-When compiling FFmpeg with GCC-9, some very random segfaults were
-observed in code which had previously called down into the SBC encoder
-NEON assembly routines. This was caused by these functions clobbering
-some of the vfp callee saved registers (d8 - d15 aka q4 - q7). GCC was
-using these registers to save local variables, but after these
-functions returned, they would contain garbage.
-
-Fix by reallocating the registers in the two affected functions in
-the following way:
- ff_sbc_analyze_4_neon: q2-q5 => q8-q11, then q1-q4 => q8-q11
- ff_sbc_analyze_8_neon: q2-q9 => q8-q15
-
-The reason for using these replacements is to keep closely related
-sets of registers consecutively numbered which hopefully makes the
-code more easy to follow. Since this commit only reallocates
-registers, it should have no performance impact.
-
-Signed-off-by: James Cowgill <jcowgill@debian.org>
----
- libavcodec/arm/sbcdsp_neon.S | 220 +++++++++++++++++++++----------------------
- 1 file changed, 110 insertions(+), 110 deletions(-)
-
-diff --git a/libavcodec/arm/sbcdsp_neon.S b/libavcodec/arm/sbcdsp_neon.S
-index d83d21d..914abfb 100644
---- a/libavcodec/arm/sbcdsp_neon.S
-+++ b/libavcodec/arm/sbcdsp_neon.S
-@@ -38,49 +38,49 @@ function ff_sbc_analyze_4_neon, export=1
-         /* TODO: merge even and odd cases (or even merge all four calls to this
-          * function) in order to have only aligned reads from 'in' array
-          * and reduce number of load instructions */
--        vld1.16         {d4, d5}, [r0, :64]!
--        vld1.16         {d8, d9}, [r2, :128]!
-+        vld1.16         {d16, d17}, [r0, :64]!
-+        vld1.16         {d20, d21}, [r2, :128]!
- 
--        vmull.s16       q0, d4, d8
--        vld1.16         {d6,  d7}, [r0, :64]!
--        vmull.s16       q1, d5, d9
--        vld1.16         {d10, d11}, [r2, :128]!
-+        vmull.s16       q0, d16, d20
-+        vld1.16         {d18, d19}, [r0, :64]!
-+        vmull.s16       q1, d17, d21
-+        vld1.16         {d22, d23}, [r2, :128]!
- 
--        vmlal.s16       q0, d6, d10
--        vld1.16         {d4, d5}, [r0, :64]!
--        vmlal.s16       q1, d7, d11
--        vld1.16         {d8, d9}, [r2, :128]!
-+        vmlal.s16       q0, d18, d22
-+        vld1.16         {d16, d17}, [r0, :64]!
-+        vmlal.s16       q1, d19, d23
-+        vld1.16         {d20, d21}, [r2, :128]!
- 
--        vmlal.s16       q0, d4, d8
--        vld1.16         {d6,  d7}, [r0, :64]!
--        vmlal.s16       q1, d5, d9
--        vld1.16         {d10, d11}, [r2, :128]!
-+        vmlal.s16       q0, d16, d20
-+        vld1.16         {d18, d19}, [r0, :64]!
-+        vmlal.s16       q1, d17, d21
-+        vld1.16         {d22, d23}, [r2, :128]!
- 
--        vmlal.s16       q0, d6, d10
--        vld1.16         {d4, d5}, [r0, :64]!
--        vmlal.s16       q1, d7, d11
--        vld1.16         {d8, d9}, [r2, :128]!
-+        vmlal.s16       q0, d18, d22
-+        vld1.16         {d16, d17}, [r0, :64]!
-+        vmlal.s16       q1, d19, d23
-+        vld1.16         {d20, d21}, [r2, :128]!
- 
--        vmlal.s16       q0, d4, d8
--        vmlal.s16       q1, d5, d9
-+        vmlal.s16       q0, d16, d20
-+        vmlal.s16       q1, d17, d21
- 
-         vpadd.s32       d0, d0, d1
-         vpadd.s32       d1, d2, d3
- 
-         vrshrn.s32      d0, q0, SBC_PROTO_FIXED_SCALE
- 
--        vld1.16         {d2, d3, d4, d5}, [r2, :128]!
-+        vld1.16         {d16, d17, d18, d19}, [r2, :128]!
- 
-         vdup.i32        d1, d0[1]  /* TODO: can be eliminated */
-         vdup.i32        d0, d0[0]  /* TODO: can be eliminated */
- 
--        vmull.s16       q3, d2, d0
--        vmull.s16       q4, d3, d0
--        vmlal.s16       q3, d4, d1
--        vmlal.s16       q4, d5, d1
-+        vmull.s16       q10, d16, d0
-+        vmull.s16       q11, d17, d0
-+        vmlal.s16       q10, d18, d1
-+        vmlal.s16       q11, d19, d1
- 
--        vpadd.s32       d0, d6, d7 /* TODO: can be eliminated */
--        vpadd.s32       d1, d8, d9 /* TODO: can be eliminated */
-+        vpadd.s32       d0, d20, d21 /* TODO: can be eliminated */
-+        vpadd.s32       d1, d22, d23 /* TODO: can be eliminated */
- 
-         vst1.32         {d0, d1}, [r1, :128]
- 
-@@ -91,57 +91,57 @@ function ff_sbc_analyze_8_neon, export=1
-         /* TODO: merge even and odd cases (or even merge all four calls to this
-          * function) in order to have only aligned reads from 'in' array
-          * and reduce number of load instructions */
--        vld1.16         {d4, d5}, [r0, :64]!
--        vld1.16         {d8, d9}, [r2, :128]!
--
--        vmull.s16       q6, d4, d8
--        vld1.16         {d6,  d7}, [r0, :64]!
--        vmull.s16       q7, d5, d9
--        vld1.16         {d10, d11}, [r2, :128]!
--        vmull.s16       q8, d6, d10
--        vld1.16         {d4, d5}, [r0, :64]!
--        vmull.s16       q9, d7, d11
--        vld1.16         {d8, d9}, [r2, :128]!
--
--        vmlal.s16       q6, d4, d8
--        vld1.16         {d6,  d7}, [r0, :64]!
--        vmlal.s16       q7, d5, d9
--        vld1.16         {d10, d11}, [r2, :128]!
--        vmlal.s16       q8, d6, d10
--        vld1.16         {d4, d5}, [r0, :64]!
--        vmlal.s16       q9, d7, d11
--        vld1.16         {d8, d9}, [r2, :128]!
--
--        vmlal.s16       q6, d4, d8
--        vld1.16         {d6,  d7}, [r0, :64]!
--        vmlal.s16       q7, d5, d9
--        vld1.16         {d10, d11}, [r2, :128]!
--        vmlal.s16       q8, d6, d10
--        vld1.16         {d4, d5}, [r0, :64]!
--        vmlal.s16       q9, d7, d11
--        vld1.16         {d8, d9}, [r2, :128]!
--
--        vmlal.s16       q6, d4, d8
--        vld1.16         {d6,  d7}, [r0, :64]!
--        vmlal.s16       q7, d5, d9
--        vld1.16         {d10, d11}, [r2, :128]!
--        vmlal.s16       q8, d6, d10
--        vld1.16         {d4, d5}, [r0, :64]!
--        vmlal.s16       q9, d7, d11
--        vld1.16         {d8, d9}, [r2, :128]!
--
--        vmlal.s16       q6, d4, d8
--        vld1.16         {d6,  d7}, [r0, :64]!
--        vmlal.s16       q7, d5, d9
--        vld1.16         {d10, d11}, [r2, :128]!
--
--        vmlal.s16       q8, d6, d10
--        vmlal.s16       q9, d7, d11
--
--        vpadd.s32       d0, d12, d13
--        vpadd.s32       d1, d14, d15
--        vpadd.s32       d2, d16, d17
--        vpadd.s32       d3, d18, d19
-+        vld1.16         {d16, d17}, [r0, :64]!
-+        vld1.16         {d20, d21}, [r2, :128]!
-+
-+        vmull.s16       q12, d16, d20
-+        vld1.16         {d18, d19}, [r0, :64]!
-+        vmull.s16       q13, d17, d21
-+        vld1.16         {d22, d23}, [r2, :128]!
-+        vmull.s16       q14, d18, d22
-+        vld1.16         {d16, d17}, [r0, :64]!
-+        vmull.s16       q15, d19, d23
-+        vld1.16         {d20, d21}, [r2, :128]!
-+
-+        vmlal.s16       q12, d16, d20
-+        vld1.16         {d18, d19}, [r0, :64]!
-+        vmlal.s16       q13, d17, d21
-+        vld1.16         {d22, d23}, [r2, :128]!
-+        vmlal.s16       q14, d18, d22
-+        vld1.16         {d16, d17}, [r0, :64]!
-+        vmlal.s16       q15, d19, d23
-+        vld1.16         {d20, d21}, [r2, :128]!
-+
-+        vmlal.s16       q12, d16, d20
-+        vld1.16         {d18, d19}, [r0, :64]!
-+        vmlal.s16       q13, d17, d21
-+        vld1.16         {d22, d23}, [r2, :128]!
-+        vmlal.s16       q14, d18, d22
-+        vld1.16         {d16, d17}, [r0, :64]!
-+        vmlal.s16       q15, d19, d23
-+        vld1.16         {d20, d21}, [r2, :128]!
-+
-+        vmlal.s16       q12, d16, d20
-+        vld1.16         {d18, d19}, [r0, :64]!
-+        vmlal.s16       q13, d17, d21
-+        vld1.16         {d22, d23}, [r2, :128]!
-+        vmlal.s16       q14, d18, d22
-+        vld1.16         {d16, d17}, [r0, :64]!
-+        vmlal.s16       q15, d19, d23
-+        vld1.16         {d20, d21}, [r2, :128]!
-+
-+        vmlal.s16       q12, d16, d20
-+        vld1.16         {d18, d19}, [r0, :64]!
-+        vmlal.s16       q13, d17, d21
-+        vld1.16         {d22, d23}, [r2, :128]!
-+
-+        vmlal.s16       q14, d18, d22
-+        vmlal.s16       q15, d19, d23
-+
-+        vpadd.s32       d0, d24, d25
-+        vpadd.s32       d1, d26, d27
-+        vpadd.s32       d2, d28, d29
-+        vpadd.s32       d3, d30, d31
- 
-         vrshr.s32       q0, q0, SBC_PROTO_FIXED_SCALE
-         vrshr.s32       q1, q1, SBC_PROTO_FIXED_SCALE
-@@ -153,38 +153,38 @@ function ff_sbc_analyze_8_neon, export=1
-         vdup.i32        d1, d0[1]  /* TODO: can be eliminated */
-         vdup.i32        d0, d0[0]  /* TODO: can be eliminated */
- 
--        vld1.16         {d4, d5}, [r2, :128]!
--        vmull.s16       q6, d4, d0
--        vld1.16         {d6, d7}, [r2, :128]!
--        vmull.s16       q7, d5, d0
--        vmull.s16       q8, d6, d0
--        vmull.s16       q9, d7, d0
--
--        vld1.16         {d4, d5}, [r2, :128]!
--        vmlal.s16       q6, d4, d1
--        vld1.16         {d6, d7}, [r2, :128]!
--        vmlal.s16       q7, d5, d1
--        vmlal.s16       q8, d6, d1
--        vmlal.s16       q9, d7, d1
--
--        vld1.16         {d4, d5}, [r2, :128]!
--        vmlal.s16       q6, d4, d2
--        vld1.16         {d6, d7}, [r2, :128]!
--        vmlal.s16       q7, d5, d2
--        vmlal.s16       q8, d6, d2
--        vmlal.s16       q9, d7, d2
--
--        vld1.16         {d4, d5}, [r2, :128]!
--        vmlal.s16       q6, d4, d3
--        vld1.16         {d6, d7}, [r2, :128]!
--        vmlal.s16       q7, d5, d3
--        vmlal.s16       q8, d6, d3
--        vmlal.s16       q9, d7, d3
--
--        vpadd.s32       d0, d12, d13 /* TODO: can be eliminated */
--        vpadd.s32       d1, d14, d15 /* TODO: can be eliminated */
--        vpadd.s32       d2, d16, d17 /* TODO: can be eliminated */
--        vpadd.s32       d3, d18, d19 /* TODO: can be eliminated */
-+        vld1.16         {d16, d17}, [r2, :128]!
-+        vmull.s16       q12, d16, d0
-+        vld1.16         {d18, d19}, [r2, :128]!
-+        vmull.s16       q13, d17, d0
-+        vmull.s16       q14, d18, d0
-+        vmull.s16       q15, d19, d0
-+
-+        vld1.16         {d16, d17}, [r2, :128]!
-+        vmlal.s16       q12, d16, d1
-+        vld1.16         {d18, d19}, [r2, :128]!
-+        vmlal.s16       q13, d17, d1
-+        vmlal.s16       q14, d18, d1
-+        vmlal.s16       q15, d19, d1
-+
-+        vld1.16         {d16, d17}, [r2, :128]!
-+        vmlal.s16       q12, d16, d2
-+        vld1.16         {d18, d19}, [r2, :128]!
-+        vmlal.s16       q13, d17, d2
-+        vmlal.s16       q14, d18, d2
-+        vmlal.s16       q15, d19, d2
-+
-+        vld1.16         {d16, d17}, [r2, :128]!
-+        vmlal.s16       q12, d16, d3
-+        vld1.16         {d18, d19}, [r2, :128]!
-+        vmlal.s16       q13, d17, d3
-+        vmlal.s16       q14, d18, d3
-+        vmlal.s16       q15, d19, d3
-+
-+        vpadd.s32       d0, d24, d25 /* TODO: can be eliminated */
-+        vpadd.s32       d1, d26, d27 /* TODO: can be eliminated */
-+        vpadd.s32       d2, d28, d29 /* TODO: can be eliminated */
-+        vpadd.s32       d3, d30, d31 /* TODO: can be eliminated */
- 
-         vst1.32         {d0, d1, d2, d3}, [r1, :128]
- 
diff --git a/.github/scripts/Linux/arm/ffmpeg-arm-patches/0001-avcodec-omx-Enable-inline-header.patch b/.github/scripts/Linux/arm/ffmpeg-arm-patches/0001-avcodec-omx-Enable-inline-header.patch
deleted file mode 100644
index 41f900d81e..0000000000
--- a/.github/scripts/Linux/arm/ffmpeg-arm-patches/0001-avcodec-omx-Enable-inline-header.patch
+++ /dev/null
@@ -1,39 +0,0 @@
-From 6e410823e63103342e8fc3407ff6698808d4d1ef Mon Sep 17 00:00:00 2001
-From: Pascal <pascal@serveurperso.com>
-Date: Tue, 10 Dec 2019 17:36:11 +0000
-Subject: [PATCH] avcodec/omx: Enable inline header
-
----
- libavcodec/omx.c | 9 +++++++++
- 1 file changed, 9 insertions(+)
-
-diff --git a/libavcodec/omx.c b/libavcodec/omx.c
-index 0a6a308309..06beb4dd02 100644
---- a/libavcodec/omx.c
-+++ b/libavcodec/omx.c
-@@ -28,6 +28,7 @@
- #include <dlfcn.h>
- #include <OMX_Core.h>
- #include <OMX_Component.h>
-+#include <OMX_Broadcom.h>
- #include <pthread.h>
- #include <stdio.h>
- #include <stdlib.h>
-@@ -516,6 +517,14 @@ static av_cold int omx_component_init(AVCodecContext *avctx, const char *role)
-     if (err != OMX_ErrorNone)
-         av_log(avctx, AV_LOG_WARNING, "Unable to set video bitrate parameter\n");
- 
-+    OMX_CONFIG_PORTBOOLEANTYPE vid_param_inline_header = {0, };
-+    INIT_STRUCT(vid_param_inline_header);
-+    vid_param_inline_header.nPortIndex = s->out_port;
-+    vid_param_inline_header.bEnabled = OMX_TRUE;
-+    err = OMX_SetParameter(s->handle, OMX_IndexParamBrcmVideoAVCInlineHeaderEnable, &vid_param_inline_header);
-+    if (err != OMX_ErrorNone)
-+        av_log(avctx, AV_LOG_WARNING, "Unable to set video inline header parameter\n");
-+
-     if (avctx->codec->id == AV_CODEC_ID_H264) {
-         OMX_VIDEO_PARAM_AVCTYPE avc = { 0 };
-         INIT_STRUCT(avc);
--- 
-2.25.1
-
diff --git a/.github/scripts/Linux/arm/ffmpeg-arm-patches/0002-Fix-build-on-powerpc-and-ppc64.patch b/.github/scripts/Linux/arm/ffmpeg-arm-patches/0002-Fix-build-on-powerpc-and-ppc64.patch
deleted file mode 100644
index ffafb125ec..0000000000
--- a/.github/scripts/Linux/arm/ffmpeg-arm-patches/0002-Fix-build-on-powerpc-and-ppc64.patch
+++ /dev/null
@@ -1,29 +0,0 @@
-From: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
-Date: Tue, 19 Jan 2021 20:35:29 +0100
-Subject: Fix build on powerpc and ppc64
-
----
- libswscale/ppc/yuv2rgb_altivec.c | 10 ++++++++++
- 1 file changed, 10 insertions(+)
-
-diff --git a/libswscale/ppc/yuv2rgb_altivec.c b/libswscale/ppc/yuv2rgb_altivec.c
-index 5365452..930ef6b 100644
---- a/libswscale/ppc/yuv2rgb_altivec.c
-+++ b/libswscale/ppc/yuv2rgb_altivec.c
-@@ -283,6 +283,16 @@ static inline void cvtyuvtoRGB(SwsContext *c, vector signed short Y,
-  * ------------------------------------------------------------------------------
-  */
- 
-+#if !HAVE_VSX
-+static inline vector unsigned char vec_xl(signed long long offset, const ubyte *addr)
-+{
-+    const vector unsigned char *v_addr = (const vector unsigned char *) (addr + offset);
-+    vector unsigned char align_perm = vec_lvsl(offset, addr);
-+
-+    return (vector unsigned char) vec_perm(v_addr[0], v_addr[1], align_perm);
-+}
-+#endif /* !HAVE_VSX */
-+
- #define DEFCSP420_CVT(name, out_pixels)                                       \
- static int altivec_ ## name(SwsContext *c, const unsigned char **in,          \
-                             int *instrides, int srcSliceY, int srcSliceH,     \
diff --git a/.github/scripts/Linux/arm/ffmpeg-arm-patches/0003-avcodec-pngenc-remove-monowhite-from-apng-formats.patch b/.github/scripts/Linux/arm/ffmpeg-arm-patches/0003-avcodec-pngenc-remove-monowhite-from-apng-formats.patch
deleted file mode 100644
index d30e312bb3..0000000000
--- a/.github/scripts/Linux/arm/ffmpeg-arm-patches/0003-avcodec-pngenc-remove-monowhite-from-apng-formats.patch
+++ /dev/null
@@ -1,25 +0,0 @@
-From: Paul B Mahol <onemda@gmail.com>
-Date: Sun, 14 Feb 2021 17:20:03 +0100
-Subject: avcodec/pngenc: remove monowhite from apng formats
-
-Monowhite pixel format is not supported, and it does not make sense
-to add support for it.
-
-Fixes #7989
----
- libavcodec/pngenc.c | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/libavcodec/pngenc.c b/libavcodec/pngenc.c
-index efcae8c..eebb164 100644
---- a/libavcodec/pngenc.c
-+++ b/libavcodec/pngenc.c
-@@ -1174,7 +1174,7 @@ AVCodec ff_apng_encoder = {
-         AV_PIX_FMT_PAL8,
-         AV_PIX_FMT_GRAY8, AV_PIX_FMT_GRAY8A,
-         AV_PIX_FMT_GRAY16BE, AV_PIX_FMT_YA16BE,
--        AV_PIX_FMT_MONOBLACK, AV_PIX_FMT_NONE
-+        AV_PIX_FMT_NONE
-     },
-     .priv_class     = &apngenc_class,
- };
diff --git a/.github/scripts/Linux/arm/ffmpeg-arm-patches/0004-ffmpeg-4.3.3-rpi_13.patch b/.github/scripts/Linux/arm/ffmpeg-arm-patches/0004-ffmpeg-4.3.3-rpi_13.patch
deleted file mode 100644
index 579ed3d140..0000000000
--- a/.github/scripts/Linux/arm/ffmpeg-arm-patches/0004-ffmpeg-4.3.3-rpi_13.patch
+++ /dev/null
@@ -1,67508 +0,0 @@
---- a/configure
-+++ b/configure
-@@ -207,6 +207,7 @@ External library support:
-   --disable-bzlib          disable bzlib [autodetect]
-   --disable-coreimage      disable Apple CoreImage framework [autodetect]
-   --enable-chromaprint     enable audio fingerprinting with chromaprint [no]
-+  --disable-epoxy          disable epoxy [autodetect]
-   --enable-frei0r          enable frei0r video filtering [no]
-   --enable-gcrypt          enable gcrypt, needed for rtmp(t)e support
-                            if openssl, librtmp or gmp is not used [no]
-@@ -274,6 +275,7 @@ External library support:
-   --enable-libtls          enable LibreSSL (via libtls), needed for https support
-                            if openssl, gnutls or mbedtls is not used [no]
-   --enable-libtwolame      enable MP2 encoding via libtwolame [no]
-+  --disable-libudev        disable libudev [autodetect]
-   --enable-libv4l2         enable libv4l2/v4l-utils [no]
-   --enable-libvidstab      enable video stabilization using vid.stab [no]
-   --enable-libvmaf         enable vmaf filter via libvmaf [no]
-@@ -336,12 +338,17 @@ External library support:
-   --enable-libmfx          enable Intel MediaSDK (AKA Quick Sync Video) code via libmfx [no]
-   --enable-libnpp          enable Nvidia Performance Primitives-based code [no]
-   --enable-mmal            enable Broadcom Multi-Media Abstraction Layer (Raspberry Pi) via MMAL [no]
-+  --enable-rpi             enable other rpi specific stuff [no]
-+  --enable-sand            enable sand video formats [rpi]
-+  --enable-vout-drm        enable the vout_drm module - for internal testing only [no]
-+  --enable-vout-egl        enable the vout_egl module - for internal testing only [no]
-   --disable-nvdec          disable Nvidia video decoding acceleration (via hwaccel) [autodetect]
-   --disable-nvenc          disable Nvidia video encoding code [autodetect]
-   --enable-omx             enable OpenMAX IL code [no]
-   --enable-omx-rpi         enable OpenMAX IL code for Raspberry Pi [no]
-   --enable-rkmpp           enable Rockchip Media Process Platform code [no]
-   --disable-v4l2-m2m       disable V4L2 mem2mem code [autodetect]
-+  --enable-v4l2-request    enable V4L2 request API code [no]
-   --disable-vaapi          disable Video Acceleration API (mainly Unix/Intel) code [autodetect]
-   --disable-vdpau          disable Nvidia Video Decode and Presentation API for Unix code [autodetect]
-   --disable-videotoolbox   disable VideoToolbox code [autodetect]
-@@ -1699,7 +1706,9 @@ EXTERNAL_AUTODETECT_LIBRARY_LIST="
-     avfoundation
-     bzlib
-     coreimage
-+    epoxy
-     iconv
-+    libudev
-     libxcb
-     libxcb_shm
-     libxcb_shape
-@@ -1861,7 +1870,10 @@ HWACCEL_LIBRARY_LIST="
-     mmal
-     omx
-     opencl
-+    v4l2_request
-     vulkan
-+    rpi4_8
-+    rpi4_10
- "
- 
- DOCUMENT_LIST="
-@@ -1877,12 +1889,16 @@ FEATURE_LIST="
-     gray
-     hardcoded_tables
-     omx_rpi
-+    rpi
-     runtime_cpudetect
-     safe_bitstream_reader
-+    sand
-     shared
-     small
-     static
-     swscale_alpha
-+    vout_drm
-+    vout_egl
- "
- 
- # this list should be kept in linking order
-@@ -1923,6 +1939,7 @@ SUBSYSTEM_LIST="
-     pixelutils
-     network
-     rdft
-+    rpi
- "
- 
- # COMPONENT_LIST needs to come last to ensure correct dependency checking
-@@ -2405,9 +2422,11 @@ CONFIG_EXTRA="
-     rangecoder
-     riffdec
-     riffenc
-+    rpi
-     rtpdec
-     rtpenc_chain
-     rv34dsp
-+    sand
-     scene_sad
-     sinewin
-     snappy
-@@ -2737,6 +2756,8 @@ hap_decoder_select="snappy texturedsp"
- hap_encoder_deps="libsnappy"
- hap_encoder_select="texturedspenc"
- hevc_decoder_select="bswapdsp cabac golomb hevcparse videodsp"
-+hevc_rpi_decoder_deps="rpi"
-+hevc_rpi_decoder_select="hevc_decoder sand"
- huffyuv_decoder_select="bswapdsp huffyuvdsp llviddsp"
- huffyuv_encoder_select="bswapdsp huffman huffyuvencdsp llvidencdsp"
- hymt_decoder_select="huffyuv_decoder"
-@@ -2903,6 +2924,7 @@ d3d11va_deps="dxva_h ID3D11VideoDecoder
- dxva2_deps="dxva2api_h DXVA2_ConfigPictureDecode ole32 user32"
- ffnvcodec_deps_any="libdl LoadLibrary"
- nvdec_deps="ffnvcodec"
-+v4l2_request_deps="linux_videodev2_h linux_media_h v4l2_timeval_to_ns libdrm libudev"
- vaapi_x11_deps="xlib"
- videotoolbox_hwaccel_deps="videotoolbox pthreads"
- videotoolbox_hwaccel_extralibs="-framework QuartzCore"
-@@ -2934,6 +2956,12 @@ hevc_dxva2_hwaccel_deps="dxva2 DXVA_PicP
- hevc_dxva2_hwaccel_select="hevc_decoder"
- hevc_nvdec_hwaccel_deps="nvdec"
- hevc_nvdec_hwaccel_select="hevc_decoder"
-+hevc_v4l2request_hwaccel_deps="v4l2_request"
-+hevc_v4l2request_hwaccel_select="hevc_decoder"
-+hevc_rpi4_10_hwaccel_deps="rpi"
-+hevc_rpi4_10_hwaccel_select="hevc_decoder"
-+hevc_rpi4_8_hwaccel_deps="rpi"
-+hevc_rpi4_8_hwaccel_select="hevc_decoder"
- hevc_vaapi_hwaccel_deps="vaapi VAPictureParameterBufferHEVC"
- hevc_vaapi_hwaccel_select="hevc_decoder"
- hevc_vdpau_hwaccel_deps="vdpau VdpPictureInfoHEVC"
-@@ -3401,8 +3429,13 @@ sndio_indev_deps="sndio"
- sndio_outdev_deps="sndio"
- v4l2_indev_deps_any="linux_videodev2_h sys_videoio_h"
- v4l2_indev_suggest="libv4l2"
-+v4l2_outdev_deps="libdrm"
- v4l2_outdev_deps_any="linux_videodev2_h sys_videoio_h"
- v4l2_outdev_suggest="libv4l2"
-+vout_drm_outdev_deps="libdrm"
-+vout_egl_outdev_deps="xlib epoxy"
-+vout_rpi_outdev_deps="rpi"
-+vout_rpi_outdev_select="sand"
- vfwcap_indev_deps="vfw32 vfwcap_defines"
- xcbgrab_indev_deps="libxcb"
- xcbgrab_indev_suggest="libxcb_shm libxcb_shape libxcb_xfixes"
-@@ -3618,6 +3651,7 @@ tonemap_vaapi_filter_deps="vaapi VAProcF
- tonemap_opencl_filter_deps="opencl const_nan"
- transpose_opencl_filter_deps="opencl"
- transpose_vaapi_filter_deps="vaapi VAProcPipelineCaps_rotation_flags"
-+unsand_filter_select="sand"
- unsharp_opencl_filter_deps="opencl"
- uspp_filter_deps="gpl avcodec"
- vaguedenoiser_filter_deps="gpl"
-@@ -6102,6 +6136,12 @@ check_func_headers glob.h glob
- enabled xlib &&
-     check_lib xlib "X11/Xlib.h X11/extensions/Xvlib.h" XvGetPortAttribute -lXv -lX11 -lXext
- 
-+enabled libudev &&
-+    check_pkg_config libudev libudev libudev.h udev_new
-+
-+enabled epoxy &&
-+    check_pkg_config epoxy epoxy epoxy/egl.h epoxy_egl_version
-+
- check_headers direct.h
- check_headers dirent.h
- check_headers dxgidebug.h
-@@ -6430,11 +6470,12 @@ enabled mbedtls           && { check_pkg
-                                check_lib mbedtls mbedtls/ssl.h mbedtls_ssl_init -lmbedtls -lmbedx509 -lmbedcrypto ||
-                                die "ERROR: mbedTLS not found"; }
- enabled mediacodec        && { enabled jni || die "ERROR: mediacodec requires --enable-jni"; }
--enabled mmal              && { check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host ||
-+( enabled rpi ||
-+  enabled mmal )          && { check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host ||
-                                { ! enabled cross_compile &&
-                                  add_cflags -isystem/opt/vc/include/ -isystem/opt/vc/include/interface/vmcs_host/linux -isystem/opt/vc/include/interface/vcos/pthreads -fgnu89-inline &&
-                                  add_ldflags -L/opt/vc/lib/ &&
--                                 check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host; } ||
-+                                 check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host -lvcos -lvcsm -lvchostif -lvchiq_arm; } ||
-                                die "ERROR: mmal not found" &&
-                                check_func_headers interface/mmal/mmal.h "MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS"; }
- enabled openal            && { { for al_extralibs in "${OPENAL_LIBS}" "-lopenal" "-lOpenAL32"; do
-@@ -6475,8 +6516,16 @@ enabled rkmpp             && { require_p
-                                { enabled libdrm ||
-                                  die "ERROR: rkmpp requires --enable-libdrm"; }
-                              }
-+enabled v4l2_request      && { enabled libdrm ||
-+                               die "ERROR: v4l2-request requires --enable-libdrm"; } &&
-+                             { enabled libudev ||
-+                               die "ERROR: v4l2-request requires libudev"; }
- enabled vapoursynth       && require_pkg_config vapoursynth "vapoursynth-script >= 42" VSScript.h vsscript_init
- 
-+enabled vout_drm && { enabled libdrm || die "ERROR: vout_drm requires --enable-libdrm"; }
-+
-+enabled vout_egl && { enabled epoxy || die "ERROR: vout_egl requires epoxy"; } &&
-+                    { enabled xlib  || die "ERROR: vout_egl requires xlib"; }
- 
- if enabled gcrypt; then
-     GCRYPT_CONFIG="${cross_prefix}libgcrypt-config"
-@@ -6556,6 +6605,8 @@ if enabled v4l2_m2m; then
-     check_cc vp9_v4l2_m2m linux/videodev2.h "int i = V4L2_PIX_FMT_VP9;"
- fi
- 
-+check_func_headers "linux/media.h linux/videodev2.h" v4l2_timeval_to_ns
-+check_cc hevc_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_HEVC_SLICE;"
- check_headers sys/videoio.h
- test_code cc sys/videoio.h "struct v4l2_frmsizeenum vfse; vfse.discrete.width = 0;" && enable_sanitized struct_v4l2_frmivalenum_discrete
- 
---- a/fftools/ffmpeg.c
-+++ b/fftools/ffmpeg.c
-@@ -2119,8 +2119,8 @@ static int ifilter_send_frame(InputFilte
-                        ifilter->channel_layout != frame->channel_layout;
-         break;
-     case AVMEDIA_TYPE_VIDEO:
--        need_reinit |= ifilter->width  != frame->width ||
--                       ifilter->height != frame->height;
-+        need_reinit |= ifilter->width  != av_frame_cropped_width(frame) ||
-+                       ifilter->height != av_frame_cropped_height(frame);
-         break;
-     }
- 
-@@ -2131,6 +2131,9 @@ static int ifilter_send_frame(InputFilte
-         (ifilter->hw_frames_ctx && ifilter->hw_frames_ctx->data != frame->hw_frames_ctx->data))
-         need_reinit = 1;
- 
-+    if (no_cvt_hw && fg->graph)
-+        need_reinit = 0;
-+
-     if (need_reinit) {
-         ret = ifilter_parameters_from_frame(ifilter, frame);
-         if (ret < 0)
-@@ -2401,8 +2404,7 @@ static int decode_video(InputStream *ist
-         decoded_frame->top_field_first = ist->top_field_first;
- 
-     ist->frames_decoded++;
--
--    if (ist->hwaccel_retrieve_data && decoded_frame->format == ist->hwaccel_pix_fmt) {
-+    if (!no_cvt_hw && ist->hwaccel_retrieve_data && decoded_frame->format == ist->hwaccel_pix_fmt) {
-         err = ist->hwaccel_retrieve_data(ist->dec_ctx, decoded_frame);
-         if (err < 0)
-             goto fail;
-@@ -2600,7 +2602,12 @@ static int process_input_packet(InputStr
-         case AVMEDIA_TYPE_VIDEO:
-             ret = decode_video    (ist, repeating ? NULL : &avpkt, &got_output, &duration_pts, !pkt,
-                                    &decode_failed);
--            if (!repeating || !pkt || got_output) {
-+            // Pi: Do not inc dts if no_cvt_hw set
-+            // V4L2 H264 decode has long latency and sometimes spits out a long
-+            // stream of output without input. In this case incrementing DTS is wrong.
-+            // There may be cases where the condition as written is correct so only
-+            // "fix" in the cases which cause problems
-+            if (!repeating || !pkt || (got_output && !no_cvt_hw)) {
-                 if (pkt && pkt->duration) {
-                     duration_dts = av_rescale_q(pkt->duration, ist->st->time_base, AV_TIME_BASE_Q);
-                 } else if(ist->dec_ctx->framerate.num != 0 && ist->dec_ctx->framerate.den != 0) {
-@@ -2820,6 +2827,16 @@ static enum AVPixelFormat get_format(AVC
-         } else {
-             const HWAccel *hwaccel = NULL;
-             int i;
-+
-+            if (no_cvt_hw) {
-+                config = avcodec_get_hw_config(s->codec, 0);
-+                if (config->methods == AV_CODEC_HW_CONFIG_METHOD_INTERNAL) {
-+                    av_log(s, AV_LOG_DEBUG, "no_cvt_hw so accepting pix_fmt %d with codec internal hwaccel\n", *p);
-+                    ist->hwaccel_pix_fmt = *p;
-+                    break;
-+                }
-+            }
-+
-             for (i = 0; hwaccels[i].name; i++) {
-                 if (hwaccels[i].pix_fmt == *p) {
-                     hwaccel = &hwaccels[i];
-@@ -2914,6 +2931,15 @@ static int init_input_stream(int ist_ind
-             return ret;
-         }
- 
-+#if CONFIG_HEVC_RPI_DECODER
-+        ret = -1;
-+        if (strcmp(codec->name, "hevc_rpi") == 0 &&
-+            (ret = avcodec_open2(ist->dec_ctx, codec, &ist->decoder_opts)) < 0) {
-+            ist->dec = codec = avcodec_find_decoder_by_name("hevc");
-+            av_log(NULL, AV_LOG_INFO, "Failed to open hevc_rpi - trying hevc\n");
-+        }
-+        if (ret < 0)
-+#endif
-         if ((ret = avcodec_open2(ist->dec_ctx, codec, &ist->decoder_opts)) < 0) {
-             if (ret == AVERROR_EXPERIMENTAL)
-                 abort_codec_experimental(codec, 0);
---- a/fftools/ffmpeg.h
-+++ b/fftools/ffmpeg.h
-@@ -61,6 +61,7 @@ enum HWAccelID {
-     HWACCEL_GENERIC,
-     HWACCEL_VIDEOTOOLBOX,
-     HWACCEL_QSV,
-+    HWACCEL_RPI,
- };
- 
- typedef struct HWAccel {
-@@ -590,6 +591,7 @@ extern int video_sync_method;
- extern float frame_drop_threshold;
- extern int do_benchmark;
- extern int do_benchmark_all;
-+extern int no_cvt_hw;
- extern int do_deinterlace;
- extern int do_hex_dump;
- extern int do_pkt_dump;
---- a/fftools/ffmpeg_filter.c
-+++ b/fftools/ffmpeg_filter.c
-@@ -1186,8 +1186,8 @@ int ifilter_parameters_from_frame(InputF
- 
-     ifilter->format = frame->format;
- 
--    ifilter->width               = frame->width;
--    ifilter->height              = frame->height;
-+    ifilter->width               = av_frame_cropped_width(frame);
-+    ifilter->height              = av_frame_cropped_height(frame);
-     ifilter->sample_aspect_ratio = frame->sample_aspect_ratio;
- 
-     ifilter->sample_rate         = frame->sample_rate;
---- a/fftools/ffmpeg_hw.c
-+++ b/fftools/ffmpeg_hw.c
-@@ -75,6 +75,8 @@ static char *hw_device_default_name(enum
-     char *name;
-     size_t index_pos;
-     int index, index_limit = 1000;
-+    if (!type_name)
-+        return NULL;
-     index_pos = strlen(type_name);
-     name = av_malloc(index_pos + 4);
-     if (!name)
---- a/fftools/ffmpeg_opt.c
-+++ b/fftools/ffmpeg_opt.c
-@@ -130,6 +130,12 @@ static const char *opt_name_enc_time_bas
-     }\
- }
- 
-+#if CONFIG_RPI
-+static int rpi_init(AVCodecContext *avctx) {
-+    return 0;
-+}
-+#endif
-+
- const HWAccel hwaccels[] = {
- #if CONFIG_VIDEOTOOLBOX
-     { "videotoolbox", videotoolbox_init, HWACCEL_VIDEOTOOLBOX, AV_PIX_FMT_VIDEOTOOLBOX },
-@@ -137,6 +143,10 @@ const HWAccel hwaccels[] = {
- #if CONFIG_LIBMFX
-     { "qsv",   qsv_init,   HWACCEL_QSV,   AV_PIX_FMT_QSV },
- #endif
-+#if CONFIG_RPI
-+    {  "rpi", rpi_init, HWACCEL_RPI, AV_PIX_FMT_RPI4_8 },
-+    {  "rpi", rpi_init, HWACCEL_RPI, AV_PIX_FMT_RPI4_10 },
-+#endif
-     { 0 },
- };
- HWDevice *filter_hw_device;
-@@ -155,6 +165,7 @@ float frame_drop_threshold = 0;
- int do_deinterlace    = 0;
- int do_benchmark      = 0;
- int do_benchmark_all  = 0;
-+int no_cvt_hw         = 0;
- int do_hex_dump       = 0;
- int do_pkt_dump       = 0;
- int copy_ts           = 0;
-@@ -3460,6 +3471,8 @@ const OptionDef options[] = {
-         "add timings for benchmarking" },
-     { "benchmark_all",  OPT_BOOL | OPT_EXPERT,                       { &do_benchmark_all },
-       "add timings for each task" },
-+    { "no_cvt_hw",      OPT_BOOL | OPT_EXPERT,                       { &no_cvt_hw },
-+      "do not auto-convert hw frames to sw" },
-     { "progress",       HAS_ARG | OPT_EXPERT,                        { .func_arg = opt_progress },
-       "write program-readable progress information", "url" },
-     { "stdin",          OPT_BOOL | OPT_EXPERT,                       { &stdin_interaction },
---- a/libavcodec/Makefile
-+++ b/libavcodec/Makefile
-@@ -19,6 +19,7 @@ HEADERS = ac3_parser.h
-           mediacodec.h                                                  \
-           packet.h                                                      \
-           qsv.h                                                         \
-+          rpi_zc.h                                                      \
-           vaapi.h                                                       \
-           vdpau.h                                                       \
-           version.h                                                     \
-@@ -138,6 +139,7 @@ OBJS-$(CONFIG_QSVDEC)                  +
- OBJS-$(CONFIG_QSVENC)                  += qsvenc.o
- OBJS-$(CONFIG_RANGECODER)              += rangecoder.o
- OBJS-$(CONFIG_RDFT)                    += rdft.o
-+OBJS-$(CONFIG_RPI)                     += rpi_qpu.o rpi_mailbox.o rpi_zc.o
- OBJS-$(CONFIG_RV34DSP)                 += rv34dsp.o
- OBJS-$(CONFIG_SHARED)                  += log2_tab.o reverse.o
- OBJS-$(CONFIG_SINEWIN)                 += sinewin.o sinewin_fixed.o
-@@ -152,7 +154,10 @@ OBJS-$(CONFIG_VIDEODSP)                +
- OBJS-$(CONFIG_VP3DSP)                  += vp3dsp.o
- OBJS-$(CONFIG_VP56DSP)                 += vp56dsp.o
- OBJS-$(CONFIG_VP8DSP)                  += vp8dsp.o
--OBJS-$(CONFIG_V4L2_M2M)                += v4l2_m2m.o v4l2_context.o v4l2_buffers.o v4l2_fmt.o
-+OBJS-$(CONFIG_V4L2_M2M)                += v4l2_m2m.o v4l2_context.o v4l2_buffers.o v4l2_fmt.o\
-+                                          weak_link.o
-+OBJS-$(CONFIG_V4L2_REQUEST)            += v4l2_req_media.o v4l2_req_pollqueue.o v4l2_req_dmabufs.o\
-+					  v4l2_req_devscan.o weak_link.o
- OBJS-$(CONFIG_WMA_FREQS)               += wma_freqs.o
- OBJS-$(CONFIG_WMV2DSP)                 += wmv2dsp.o
- 
-@@ -391,6 +396,14 @@ OBJS-$(CONFIG_HEVC_QSV_DECODER)        +
- OBJS-$(CONFIG_HEVC_QSV_ENCODER)        += qsvenc_hevc.o hevc_ps_enc.o       \
-                                           hevc_data.o
- OBJS-$(CONFIG_HEVC_RKMPP_DECODER)      += rkmppdec.o
-+OBJS-$(CONFIG_RPI)                     += rpi_mem.o \
-+                                          rpi_mailbox.o rpi_zc.o
-+OBJS-$(CONFIG_HEVC_RPI_DECODER)        += rpi_hevcdec.o rpi_hevc_mvs.o \
-+                                          rpi_hevc_cabac.o rpi_hevc_refs.o rpi_hevcpred.o    \
-+                                          rpi_hevcdsp.o rpi_hevc_filter.o rpi_hevc_data.o    \
-+                                          rpi_hevc_shader.o rpi_hevc_shader_template.o       \
-+                                          rpi_hevc_parse.o h2645_parse.o rpi_hevc_ps.o \
-+                                          rpi_hevc_sei.o rpi_hevc_data.o rpi_qpu.o rpi_mem.o
- OBJS-$(CONFIG_HEVC_VAAPI_ENCODER)      += vaapi_encode_h265.o h265_profile_level.o
- OBJS-$(CONFIG_HEVC_V4L2M2M_DECODER)    += v4l2_m2m_dec.o
- OBJS-$(CONFIG_HEVC_V4L2M2M_ENCODER)    += v4l2_m2m_enc.o
-@@ -909,6 +922,10 @@ OBJS-$(CONFIG_HEVC_D3D11VA_HWACCEL)
- OBJS-$(CONFIG_HEVC_DXVA2_HWACCEL)         += dxva2_hevc.o
- OBJS-$(CONFIG_HEVC_NVDEC_HWACCEL)         += nvdec_hevc.o
- OBJS-$(CONFIG_HEVC_QSV_HWACCEL)           += qsvdec_h2645.o
-+OBJS-$(CONFIG_HEVC_RPI4_8_HWACCEL)        += rpivid_hevc.o
-+OBJS-$(CONFIG_HEVC_RPI4_10_HWACCEL)       += rpivid_hevc.o
-+OBJS-$(CONFIG_HEVC_V4L2REQUEST_HWACCEL)   += v4l2_request_hevc.o v4l2_req_decode_q.o\
-+                                             v4l2_req_hevc_v1.o v4l2_req_hevc_v2.o v4l2_req_hevc_v3.o
- OBJS-$(CONFIG_HEVC_VAAPI_HWACCEL)         += vaapi_hevc.o h265_profile_level.o
- OBJS-$(CONFIG_HEVC_VDPAU_HWACCEL)         += vdpau_hevc.o
- OBJS-$(CONFIG_MJPEG_NVDEC_HWACCEL)        += nvdec_mjpeg.o
-@@ -1261,3 +1278,31 @@ $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h
- $(SUBDIR)sinewin.o: $(SUBDIR)sinewin_tables.h
- $(SUBDIR)sinewin_fixed.o: $(SUBDIR)sinewin_fixed_tables.h
- endif
-+
-+ifdef CONFIG_HEVC_RPI_DECODER
-+QASM_PY := ../local/bin/qasm.py
-+VASMVIDCORE := ../local/bin/vasmvidcore_std
-+
-+ifneq ("$(wildcard $(QASM_PY))","")
-+$(SUBDIR)rpi_hevc_shader.c: $(SUBDIR)rpi_hevc_shader.qasm
-+	$(QASM_PY) -mc_c:rpi_hevc_shader,rpi_hevc_shader,ff_hevc_rpi_shader $< > $@
-+
-+$(SUBDIR)rpi_hevc_shader.h: $(SUBDIR)rpi_hevc_shader.qasm
-+	$(QASM_PY) -mc_h:rpi_hevc_shader,rpi_hevc_shader,ff_hevc_rpi_shader $< > $@
-+endif
-+
-+ifneq ("$(wildcard $(VASMVIDCORE))","")
-+$(SUBDIR)rpi_hevc_transform8.bin: $(SUBDIR)rpi_hevc_transform.s
-+	$(VASMVIDCORE) -Fbin -DBIT_DEPTH=8 $< -o $@
-+$(SUBDIR)rpi_hevc_transform10.bin: $(SUBDIR)rpi_hevc_transform.s
-+	$(VASMVIDCORE) -Fbin -DBIT_DEPTH=10 $< -o $@
-+
-+$(SUBDIR)rpi_hevc_transform8.h: $(SUBDIR)rpi_hevc_transform8.bin
-+	python pi-util/make_array.py $<
-+$(SUBDIR)rpi_hevc_transform10.h: $(SUBDIR)rpi_hevc_transform10.bin
-+	python pi-util/make_array.py $<
-+endif
-+
-+$(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_hevc_transform8.h $(SUBDIR)rpi_hevc_transform10.h
-+$(SUBDIR)rpi_hevcdec.o $(SUBDIR)rpi_shader_template.o $(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_hevc_shader.h
-+endif
---- a/libavcodec/aarch64/Makefile
-+++ b/libavcodec/aarch64/Makefile
-@@ -44,10 +44,12 @@ NEON-OBJS-$(CONFIG_H264PRED)
- NEON-OBJS-$(CONFIG_H264QPEL)            += aarch64/h264qpel_neon.o             \
-                                            aarch64/hpeldsp_neon.o
- NEON-OBJS-$(CONFIG_HPELDSP)             += aarch64/hpeldsp_neon.o
--NEON-OBJS-$(CONFIG_IDCTDSP)             += aarch64/simple_idct_neon.o
-+NEON-OBJS-$(CONFIG_IDCTDSP)             += aarch64/idctdsp_neon.o              \
-+                                           aarch64/simple_idct_neon.o
- NEON-OBJS-$(CONFIG_MDCT)                += aarch64/mdct_neon.o
- NEON-OBJS-$(CONFIG_MPEGAUDIODSP)        += aarch64/mpegaudiodsp_neon.o
- NEON-OBJS-$(CONFIG_PIXBLOCKDSP)         += aarch64/pixblockdsp_neon.o
-+NEON-OBJS-$(CONFIG_VC1DSP)              += aarch64/vc1dsp_neon.o
- NEON-OBJS-$(CONFIG_VP8DSP)              += aarch64/vp8dsp_neon.o
- 
- # decoders/encoders
---- a/libavcodec/aarch64/idctdsp_init_aarch64.c
-+++ b/libavcodec/aarch64/idctdsp_init_aarch64.c
-@@ -27,19 +27,29 @@
- #include "libavcodec/idctdsp.h"
- #include "idct.h"
- 
-+void ff_put_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t);
-+void ff_put_signed_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t);
-+void ff_add_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t);
-+
- av_cold void ff_idctdsp_init_aarch64(IDCTDSPContext *c, AVCodecContext *avctx,
-                                      unsigned high_bit_depth)
- {
-     int cpu_flags = av_get_cpu_flags();
- 
--    if (have_neon(cpu_flags) && !avctx->lowres && !high_bit_depth) {
--        if (avctx->idct_algo == FF_IDCT_AUTO ||
--            avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
--            avctx->idct_algo == FF_IDCT_SIMPLENEON) {
--            c->idct_put  = ff_simple_idct_put_neon;
--            c->idct_add  = ff_simple_idct_add_neon;
--            c->idct      = ff_simple_idct_neon;
--            c->perm_type = FF_IDCT_PERM_PARTTRANS;
-+    if (have_neon(cpu_flags)) {
-+        if (!avctx->lowres && !high_bit_depth) {
-+            if (avctx->idct_algo == FF_IDCT_AUTO ||
-+                avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
-+                avctx->idct_algo == FF_IDCT_SIMPLENEON) {
-+                c->idct_put  = ff_simple_idct_put_neon;
-+                c->idct_add  = ff_simple_idct_add_neon;
-+                c->idct      = ff_simple_idct_neon;
-+                c->perm_type = FF_IDCT_PERM_PARTTRANS;
-+            }
-         }
-+
-+        c->add_pixels_clamped        = ff_add_pixels_clamped_neon;
-+        c->put_pixels_clamped        = ff_put_pixels_clamped_neon;
-+        c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_neon;
-     }
- }
---- /dev/null
-+++ b/libavcodec/aarch64/idctdsp_neon.S
-@@ -0,0 +1,130 @@
-+/*
-+ * IDCT AArch64 NEON optimisations
-+ *
-+ * Copyright (c) 2022 Ben Avison <bavison@riscosopen.org>
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#include "libavutil/aarch64/asm.S"
-+
-+// Clamp 16-bit signed block coefficients to unsigned 8-bit
-+// On entry:
-+//   x0 -> array of 64x 16-bit coefficients
-+//   x1 -> 8-bit results
-+//   x2 = row stride for results, bytes
-+function ff_put_pixels_clamped_neon, export=1
-+        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64
-+        ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x0]
-+        sqxtun          v0.8b, v0.8h
-+        sqxtun          v1.8b, v1.8h
-+        sqxtun          v2.8b, v2.8h
-+        sqxtun          v3.8b, v3.8h
-+        sqxtun          v4.8b, v4.8h
-+        st1             {v0.8b}, [x1], x2
-+        sqxtun          v0.8b, v5.8h
-+        st1             {v1.8b}, [x1], x2
-+        sqxtun          v1.8b, v6.8h
-+        st1             {v2.8b}, [x1], x2
-+        sqxtun          v2.8b, v7.8h
-+        st1             {v3.8b}, [x1], x2
-+        st1             {v4.8b}, [x1], x2
-+        st1             {v0.8b}, [x1], x2
-+        st1             {v1.8b}, [x1], x2
-+        st1             {v2.8b}, [x1]
-+        ret
-+endfunc
-+
-+// Clamp 16-bit signed block coefficients to signed 8-bit (biased by 128)
-+// On entry:
-+//   x0 -> array of 64x 16-bit coefficients
-+//   x1 -> 8-bit results
-+//   x2 = row stride for results, bytes
-+function ff_put_signed_pixels_clamped_neon, export=1
-+        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64
-+        movi            v4.8b, #128
-+        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0]
-+        sqxtn           v0.8b, v0.8h
-+        sqxtn           v1.8b, v1.8h
-+        sqxtn           v2.8b, v2.8h
-+        sqxtn           v3.8b, v3.8h
-+        sqxtn           v5.8b, v16.8h
-+        add             v0.8b, v0.8b, v4.8b
-+        sqxtn           v6.8b, v17.8h
-+        add             v1.8b, v1.8b, v4.8b
-+        sqxtn           v7.8b, v18.8h
-+        add             v2.8b, v2.8b, v4.8b
-+        sqxtn           v16.8b, v19.8h
-+        add             v3.8b, v3.8b, v4.8b
-+        st1             {v0.8b}, [x1], x2
-+        add             v0.8b, v5.8b, v4.8b
-+        st1             {v1.8b}, [x1], x2
-+        add             v1.8b, v6.8b, v4.8b
-+        st1             {v2.8b}, [x1], x2
-+        add             v2.8b, v7.8b, v4.8b
-+        st1             {v3.8b}, [x1], x2
-+        add             v3.8b, v16.8b, v4.8b
-+        st1             {v0.8b}, [x1], x2
-+        st1             {v1.8b}, [x1], x2
-+        st1             {v2.8b}, [x1], x2
-+        st1             {v3.8b}, [x1]
-+        ret
-+endfunc
-+
-+// Add 16-bit signed block coefficients to unsigned 8-bit
-+// On entry:
-+//   x0 -> array of 64x 16-bit coefficients
-+//   x1 -> 8-bit input and results
-+//   x2 = row stride for 8-bit input and results, bytes
-+function ff_add_pixels_clamped_neon, export=1
-+        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64
-+        mov             x3, x1
-+        ld1             {v4.8b}, [x1], x2
-+        ld1             {v5.8b}, [x1], x2
-+        ld1             {v6.8b}, [x1], x2
-+        ld1             {v7.8b}, [x1], x2
-+        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0]
-+        uaddw           v0.8h, v0.8h, v4.8b
-+        uaddw           v1.8h, v1.8h, v5.8b
-+        uaddw           v2.8h, v2.8h, v6.8b
-+        ld1             {v4.8b}, [x1], x2
-+        uaddw           v3.8h, v3.8h, v7.8b
-+        ld1             {v5.8b}, [x1], x2
-+        sqxtun          v0.8b, v0.8h
-+        ld1             {v6.8b}, [x1], x2
-+        sqxtun          v1.8b, v1.8h
-+        ld1             {v7.8b}, [x1]
-+        sqxtun          v2.8b, v2.8h
-+        sqxtun          v3.8b, v3.8h
-+        uaddw           v4.8h, v16.8h, v4.8b
-+        st1             {v0.8b}, [x3], x2
-+        uaddw           v0.8h, v17.8h, v5.8b
-+        st1             {v1.8b}, [x3], x2
-+        uaddw           v1.8h, v18.8h, v6.8b
-+        st1             {v2.8b}, [x3], x2
-+        uaddw           v2.8h, v19.8h, v7.8b
-+        sqxtun          v4.8b, v4.8h
-+        sqxtun          v0.8b, v0.8h
-+        st1             {v3.8b}, [x3], x2
-+        sqxtun          v1.8b, v1.8h
-+        sqxtun          v2.8b, v2.8h
-+        st1             {v4.8b}, [x3], x2
-+        st1             {v0.8b}, [x3], x2
-+        st1             {v1.8b}, [x3], x2
-+        st1             {v2.8b}, [x3]
-+        ret
-+endfunc
---- a/libavcodec/aarch64/vc1dsp_init_aarch64.c
-+++ b/libavcodec/aarch64/vc1dsp_init_aarch64.c
-@@ -21,10 +21,28 @@
- #include "libavutil/attributes.h"
- #include "libavutil/cpu.h"
- #include "libavutil/aarch64/cpu.h"
-+#include "libavutil/intreadwrite.h"
- #include "libavcodec/vc1dsp.h"
- 
- #include "config.h"
- 
-+void ff_vc1_inv_trans_8x8_neon(int16_t *block);
-+void ff_vc1_inv_trans_8x4_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
-+void ff_vc1_inv_trans_4x8_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
-+void ff_vc1_inv_trans_4x4_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
-+
-+void ff_vc1_inv_trans_8x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
-+void ff_vc1_inv_trans_8x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
-+void ff_vc1_inv_trans_4x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
-+void ff_vc1_inv_trans_4x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
-+
-+void ff_vc1_v_loop_filter4_neon(uint8_t *src, ptrdiff_t stride, int pq);
-+void ff_vc1_h_loop_filter4_neon(uint8_t *src, ptrdiff_t stride, int pq);
-+void ff_vc1_v_loop_filter8_neon(uint8_t *src, ptrdiff_t stride, int pq);
-+void ff_vc1_h_loop_filter8_neon(uint8_t *src, ptrdiff_t stride, int pq);
-+void ff_vc1_v_loop_filter16_neon(uint8_t *src, ptrdiff_t stride, int pq);
-+void ff_vc1_h_loop_filter16_neon(uint8_t *src, ptrdiff_t stride, int pq);
-+
- void ff_put_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
-                                 int h, int x, int y);
- void ff_avg_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
-@@ -34,14 +52,90 @@ void ff_put_vc1_chroma_mc4_neon(uint8_t
- void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
-                                 int h, int x, int y);
- 
-+int ff_vc1_unescape_buffer_helper_neon(const uint8_t *src, int size, uint8_t *dst);
-+
-+static int vc1_unescape_buffer_neon(const uint8_t *src, int size, uint8_t *dst)
-+{
-+    /* Dealing with starting and stopping, and removing escape bytes, are
-+     * comparatively less time-sensitive, so are more clearly expressed using
-+     * a C wrapper around the assembly inner loop. Note that we assume a
-+     * little-endian machine that supports unaligned loads. */
-+    int dsize = 0;
-+    while (size >= 4)
-+    {
-+        int found = 0;
-+        while (!found && (((uintptr_t) dst) & 7) && size >= 4)
-+        {
-+            found = (AV_RL32(src) &~ 0x03000000) == 0x00030000;
-+            if (!found)
-+            {
-+                *dst++ = *src++;
-+                --size;
-+                ++dsize;
-+            }
-+        }
-+        if (!found)
-+        {
-+            int skip = size - ff_vc1_unescape_buffer_helper_neon(src, size, dst);
-+            dst += skip;
-+            src += skip;
-+            size -= skip;
-+            dsize += skip;
-+            while (!found && size >= 4)
-+            {
-+                found = (AV_RL32(src) &~ 0x03000000) == 0x00030000;
-+                if (!found)
-+                {
-+                    *dst++ = *src++;
-+                    --size;
-+                    ++dsize;
-+                }
-+            }
-+        }
-+        if (found)
-+        {
-+            *dst++ = *src++;
-+            *dst++ = *src++;
-+            ++src;
-+            size -= 3;
-+            dsize += 2;
-+        }
-+    }
-+    while (size > 0)
-+    {
-+        *dst++ = *src++;
-+        --size;
-+        ++dsize;
-+    }
-+    return dsize;
-+}
-+
- av_cold void ff_vc1dsp_init_aarch64(VC1DSPContext *dsp)
- {
-     int cpu_flags = av_get_cpu_flags();
- 
-     if (have_neon(cpu_flags)) {
-+        dsp->vc1_inv_trans_8x8 = ff_vc1_inv_trans_8x8_neon;
-+        dsp->vc1_inv_trans_8x4 = ff_vc1_inv_trans_8x4_neon;
-+        dsp->vc1_inv_trans_4x8 = ff_vc1_inv_trans_4x8_neon;
-+        dsp->vc1_inv_trans_4x4 = ff_vc1_inv_trans_4x4_neon;
-+        dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_neon;
-+        dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_neon;
-+        dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_neon;
-+        dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_neon;
-+
-+        dsp->vc1_v_loop_filter4  = ff_vc1_v_loop_filter4_neon;
-+        dsp->vc1_h_loop_filter4  = ff_vc1_h_loop_filter4_neon;
-+        dsp->vc1_v_loop_filter8  = ff_vc1_v_loop_filter8_neon;
-+        dsp->vc1_h_loop_filter8  = ff_vc1_h_loop_filter8_neon;
-+        dsp->vc1_v_loop_filter16 = ff_vc1_v_loop_filter16_neon;
-+        dsp->vc1_h_loop_filter16 = ff_vc1_h_loop_filter16_neon;
-+
-         dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_neon;
-         dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon;
-         dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon;
-         dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = ff_avg_vc1_chroma_mc4_neon;
-+
-+        dsp->vc1_unescape_buffer = vc1_unescape_buffer_neon;
-     }
- }
---- /dev/null
-+++ b/libavcodec/aarch64/vc1dsp_neon.S
-@@ -0,0 +1,1546 @@
-+/*
-+ * VC1 AArch64 NEON optimisations
-+ *
-+ * Copyright (c) 2022 Ben Avison <bavison@riscosopen.org>
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#include "libavutil/aarch64/asm.S"
-+
-+// VC-1 8x8 inverse transform
-+// On entry:
-+//   x0 -> array of 16-bit inverse transform coefficients, in column-major order
-+// On exit:
-+//   array at x0 updated to hold transformed block; also now held in row-major order
-+function ff_vc1_inv_trans_8x8_neon, export=1
-+        ld1             {v1.16b, v2.16b}, [x0], #32
-+        ld1             {v3.16b, v4.16b}, [x0], #32
-+        ld1             {v5.16b, v6.16b}, [x0], #32
-+        shl             v1.8h, v1.8h, #2        //         8/2 * src[0]
-+        sub             x1, x0, #3*32
-+        ld1             {v16.16b, v17.16b}, [x0]
-+        shl             v7.8h, v2.8h, #4        //          16 * src[8]
-+        shl             v18.8h, v2.8h, #2       //           4 * src[8]
-+        shl             v19.8h, v4.8h, #4       //                        16 * src[24]
-+        ldr             d0, .Lcoeffs_it8
-+        shl             v5.8h, v5.8h, #2        //                                      8/2 * src[32]
-+        shl             v20.8h, v6.8h, #4       //                                       16 * src[40]
-+        shl             v21.8h, v6.8h, #2       //                                        4 * src[40]
-+        shl             v22.8h, v17.8h, #4      //                                                      16 * src[56]
-+        ssra            v20.8h, v19.8h, #2      //                         4 * src[24] + 16 * src[40]
-+        mul             v23.8h, v3.8h, v0.h[0]  //                       6/2 * src[16]
-+        sub             v19.8h, v19.8h, v21.8h  //                        16 * src[24] -  4 * src[40]
-+        ssra            v7.8h, v22.8h, #2       //          16 * src[8]                               +  4 * src[56]
-+        sub             v18.8h, v22.8h, v18.8h  //        -  4 * src[8]                               + 16 * src[56]
-+        shl             v3.8h, v3.8h, #3        //                      16/2 * src[16]
-+        mls             v20.8h, v2.8h, v0.h[2]  //        - 15 * src[8] +  4 * src[24] + 16 * src[40]
-+        ssra            v1.8h, v1.8h, #1        //        12/2 * src[0]
-+        ssra            v5.8h, v5.8h, #1        //                                     12/2 * src[32]
-+        mla             v7.8h, v4.8h, v0.h[2]   //          16 * src[8] + 15 * src[24]                +  4 * src[56]
-+        shl             v21.8h, v16.8h, #3      //                                                    16/2 * src[48]
-+        mls             v19.8h, v2.8h, v0.h[1]  //        -  9 * src[8] + 16 * src[24] -  4 * src[40]
-+        sub             v2.8h, v23.8h, v21.8h   // t4/2 =                6/2 * src[16]              - 16/2 * src[48]
-+        mla             v18.8h, v4.8h, v0.h[1]  //        -  4 * src[8] +  9 * src[24]                + 16 * src[56]
-+        add             v4.8h, v1.8h, v5.8h     // t1/2 = 12/2 * src[0]              + 12/2 * src[32]
-+        sub             v1.8h, v1.8h, v5.8h     // t2/2 = 12/2 * src[0]              - 12/2 * src[32]
-+        mla             v3.8h, v16.8h, v0.h[0]  // t3/2 =               16/2 * src[16]              +  6/2 * src[48]
-+        mla             v7.8h, v6.8h, v0.h[1]   //  t1  =   16 * src[8] + 15 * src[24] +  9 * src[40] +  4 * src[56]
-+        add             v5.8h, v1.8h, v2.8h     // t6/2 = t2/2 + t4/2
-+        sub             v16.8h, v1.8h, v2.8h    // t7/2 = t2/2 - t4/2
-+        mla             v20.8h, v17.8h, v0.h[1] // -t2  = - 15 * src[8] +  4 * src[24] + 16 * src[40] +  9 * src[56]
-+        add             v21.8h, v1.8h, v2.8h    // t6/2 = t2/2 + t4/2
-+        add             v22.8h, v4.8h, v3.8h    // t5/2 = t1/2 + t3/2
-+        mls             v19.8h, v17.8h, v0.h[2] // -t3  = -  9 * src[8] + 16 * src[24] -  4 * src[40] - 15 * src[56]
-+        sub             v17.8h, v4.8h, v3.8h    // t8/2 = t1/2 - t3/2
-+        add             v23.8h, v4.8h, v3.8h    // t5/2 = t1/2 + t3/2
-+        mls             v18.8h, v6.8h, v0.h[2]  // -t4  = -  4 * src[8] +  9 * src[24] - 15 * src[40] + 16 * src[56]
-+        sub             v1.8h, v1.8h, v2.8h     // t7/2 = t2/2 - t4/2
-+        sub             v2.8h, v4.8h, v3.8h     // t8/2 = t1/2 - t3/2
-+        neg             v3.8h, v7.8h            // -t1
-+        neg             v4.8h, v20.8h           // +t2
-+        neg             v6.8h, v19.8h           // +t3
-+        ssra            v22.8h, v7.8h, #1       // (t5 + t1) >> 1
-+        ssra            v1.8h, v19.8h, #1       // (t7 - t3) >> 1
-+        neg             v7.8h, v18.8h           // +t4
-+        ssra            v5.8h, v4.8h, #1        // (t6 + t2) >> 1
-+        ssra            v16.8h, v6.8h, #1       // (t7 + t3) >> 1
-+        ssra            v2.8h, v18.8h, #1       // (t8 - t4) >> 1
-+        ssra            v17.8h, v7.8h, #1       // (t8 + t4) >> 1
-+        ssra            v21.8h, v20.8h, #1      // (t6 - t2) >> 1
-+        ssra            v23.8h, v3.8h, #1       // (t5 - t1) >> 1
-+        srshr           v3.8h, v22.8h, #2       // (t5 + t1 + 4) >> 3
-+        srshr           v4.8h, v5.8h, #2        // (t6 + t2 + 4) >> 3
-+        srshr           v5.8h, v16.8h, #2       // (t7 + t3 + 4) >> 3
-+        srshr           v6.8h, v17.8h, #2       // (t8 + t4 + 4) >> 3
-+        srshr           v2.8h, v2.8h, #2        // (t8 - t4 + 4) >> 3
-+        srshr           v1.8h, v1.8h, #2        // (t7 - t3 + 4) >> 3
-+        srshr           v7.8h, v21.8h, #2       // (t6 - t2 + 4) >> 3
-+        srshr           v16.8h, v23.8h, #2      // (t5 - t1 + 4) >> 3
-+        trn2            v17.8h, v3.8h, v4.8h
-+        trn2            v18.8h, v5.8h, v6.8h
-+        trn2            v19.8h, v2.8h, v1.8h
-+        trn2            v20.8h, v7.8h, v16.8h
-+        trn1            v21.4s, v17.4s, v18.4s
-+        trn2            v17.4s, v17.4s, v18.4s
-+        trn1            v18.4s, v19.4s, v20.4s
-+        trn2            v19.4s, v19.4s, v20.4s
-+        trn1            v3.8h, v3.8h, v4.8h
-+        trn2            v4.2d, v21.2d, v18.2d
-+        trn1            v20.2d, v17.2d, v19.2d
-+        trn1            v5.8h, v5.8h, v6.8h
-+        trn1            v1.8h, v2.8h, v1.8h
-+        trn1            v2.8h, v7.8h, v16.8h
-+        trn1            v6.2d, v21.2d, v18.2d
-+        trn2            v7.2d, v17.2d, v19.2d
-+        shl             v16.8h, v20.8h, #4      //                        16 * src[24]
-+        shl             v17.8h, v4.8h, #4       //                                       16 * src[40]
-+        trn1            v18.4s, v3.4s, v5.4s
-+        trn1            v19.4s, v1.4s, v2.4s
-+        shl             v21.8h, v7.8h, #4       //                                                      16 * src[56]
-+        shl             v22.8h, v6.8h, #2       //           4 * src[8]
-+        shl             v23.8h, v4.8h, #2       //                                        4 * src[40]
-+        trn2            v3.4s, v3.4s, v5.4s
-+        trn2            v1.4s, v1.4s, v2.4s
-+        shl             v2.8h, v6.8h, #4        //          16 * src[8]
-+        sub             v5.8h, v16.8h, v23.8h   //                        16 * src[24] -  4 * src[40]
-+        ssra            v17.8h, v16.8h, #2      //                         4 * src[24] + 16 * src[40]
-+        sub             v16.8h, v21.8h, v22.8h  //        -  4 * src[8]                               + 16 * src[56]
-+        trn1            v22.2d, v18.2d, v19.2d
-+        trn2            v18.2d, v18.2d, v19.2d
-+        trn1            v19.2d, v3.2d, v1.2d
-+        ssra            v2.8h, v21.8h, #2       //          16 * src[8]                               +  4 * src[56]
-+        mls             v17.8h, v6.8h, v0.h[2]  //        - 15 * src[8] +  4 * src[24] + 16 * src[40]
-+        shl             v21.8h, v22.8h, #2      //         8/2 * src[0]
-+        shl             v18.8h, v18.8h, #2      //                                      8/2 * src[32]
-+        mls             v5.8h, v6.8h, v0.h[1]   //        -  9 * src[8] + 16 * src[24] -  4 * src[40]
-+        shl             v6.8h, v19.8h, #3       //                      16/2 * src[16]
-+        trn2            v1.2d, v3.2d, v1.2d
-+        mla             v16.8h, v20.8h, v0.h[1] //        -  4 * src[8] +  9 * src[24]                + 16 * src[56]
-+        ssra            v21.8h, v21.8h, #1      //        12/2 * src[0]
-+        ssra            v18.8h, v18.8h, #1      //                                     12/2 * src[32]
-+        mul             v3.8h, v19.8h, v0.h[0]  //                       6/2 * src[16]
-+        shl             v19.8h, v1.8h, #3       //                                                    16/2 * src[48]
-+        mla             v2.8h, v20.8h, v0.h[2]  //          16 * src[8] + 15 * src[24]                +  4 * src[56]
-+        add             v20.8h, v21.8h, v18.8h  // t1/2 = 12/2 * src[0]              + 12/2 * src[32]
-+        mla             v6.8h, v1.8h, v0.h[0]   // t3/2 =               16/2 * src[16]              +  6/2 * src[48]
-+        sub             v1.8h, v21.8h, v18.8h   // t2/2 = 12/2 * src[0]              - 12/2 * src[32]
-+        sub             v3.8h, v3.8h, v19.8h    // t4/2 =                6/2 * src[16]              - 16/2 * src[48]
-+        mla             v17.8h, v7.8h, v0.h[1]  // -t2  = - 15 * src[8] +  4 * src[24] + 16 * src[40] +  9 * src[56]
-+        mls             v5.8h, v7.8h, v0.h[2]   // -t3  = -  9 * src[8] + 16 * src[24] -  4 * src[40] - 15 * src[56]
-+        add             v7.8h, v1.8h, v3.8h     // t6/2 = t2/2 + t4/2
-+        add             v18.8h, v20.8h, v6.8h   // t5/2 = t1/2 + t3/2
-+        mls             v16.8h, v4.8h, v0.h[2]  // -t4  = -  4 * src[8] +  9 * src[24] - 15 * src[40] + 16 * src[56]
-+        sub             v19.8h, v1.8h, v3.8h    // t7/2 = t2/2 - t4/2
-+        neg             v21.8h, v17.8h          // +t2
-+        mla             v2.8h, v4.8h, v0.h[1]   //  t1  =   16 * src[8] + 15 * src[24] +  9 * src[40] +  4 * src[56]
-+        sub             v0.8h, v20.8h, v6.8h    // t8/2 = t1/2 - t3/2
-+        neg             v4.8h, v5.8h            // +t3
-+        sub             v22.8h, v1.8h, v3.8h    // t7/2 = t2/2 - t4/2
-+        sub             v23.8h, v20.8h, v6.8h   // t8/2 = t1/2 - t3/2
-+        neg             v24.8h, v16.8h          // +t4
-+        add             v6.8h, v20.8h, v6.8h    // t5/2 = t1/2 + t3/2
-+        add             v1.8h, v1.8h, v3.8h     // t6/2 = t2/2 + t4/2
-+        ssra            v7.8h, v21.8h, #1       // (t6 + t2) >> 1
-+        neg             v3.8h, v2.8h            // -t1
-+        ssra            v18.8h, v2.8h, #1       // (t5 + t1) >> 1
-+        ssra            v19.8h, v4.8h, #1       // (t7 + t3) >> 1
-+        ssra            v0.8h, v24.8h, #1       // (t8 + t4) >> 1
-+        srsra           v23.8h, v16.8h, #1      // (t8 - t4 + 1) >> 1
-+        srsra           v22.8h, v5.8h, #1       // (t7 - t3 + 1) >> 1
-+        srsra           v1.8h, v17.8h, #1       // (t6 - t2 + 1) >> 1
-+        srsra           v6.8h, v3.8h, #1        // (t5 - t1 + 1) >> 1
-+        srshr           v2.8h, v18.8h, #6       // (t5 + t1 + 64) >> 7
-+        srshr           v3.8h, v7.8h, #6        // (t6 + t2 + 64) >> 7
-+        srshr           v4.8h, v19.8h, #6       // (t7 + t3 + 64) >> 7
-+        srshr           v5.8h, v0.8h, #6        // (t8 + t4 + 64) >> 7
-+        srshr           v16.8h, v23.8h, #6      // (t8 - t4 + 65) >> 7
-+        srshr           v17.8h, v22.8h, #6      // (t7 - t3 + 65) >> 7
-+        st1             {v2.16b, v3.16b}, [x1], #32
-+        srshr           v0.8h, v1.8h, #6        // (t6 - t2 + 65) >> 7
-+        srshr           v1.8h, v6.8h, #6        // (t5 - t1 + 65) >> 7
-+        st1             {v4.16b, v5.16b}, [x1], #32
-+        st1             {v16.16b, v17.16b}, [x1], #32
-+        st1             {v0.16b, v1.16b}, [x1]
-+        ret
-+endfunc
-+
-+// VC-1 8x4 inverse transform (8-point pass along rows, then 4-point pass along columns)
-+// On entry:
-+//   x0 -> array of 8-bit samples, in row-major order
-+//   x1 = row stride for 8-bit sample array
-+//   x2 -> array of 16-bit inverse transform coefficients, in row-major order
-+// On exit:
-+//   array at x0 updated by saturated addition of (narrowed) transformed block
-+function ff_vc1_inv_trans_8x4_neon, export=1
-+        ld1             {v1.8b, v2.8b, v3.8b, v4.8b}, [x2], #32  // coefficient rows 0-1 (32 bytes)
-+        mov             x3, x0                  // x3 = store pointer; x0 advances during the sample loads
-+        ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [x2]  // coefficient rows 2-3
-+        ldr             q0, .Lcoeffs_it8        // includes 4-point coefficients in upper half of vector
-+        ld1             {v5.8b}, [x0], x1       // sample row 0
-+        trn2            v6.4h, v1.4h, v3.4h     // transpose: gather each coefficient column across the 4 rows
-+        trn2            v7.4h, v2.4h, v4.4h
-+        trn1            v1.4h, v1.4h, v3.4h
-+        trn1            v2.4h, v2.4h, v4.4h
-+        trn2            v3.4h, v16.4h, v18.4h
-+        trn2            v4.4h, v17.4h, v19.4h
-+        trn1            v16.4h, v16.4h, v18.4h
-+        trn1            v17.4h, v17.4h, v19.4h
-+        ld1             {v18.8b}, [x0], x1      // sample row 1
-+        trn1            v19.2s, v6.2s, v3.2s    // src[1] of rows 0-3
-+        trn2            v3.2s, v6.2s, v3.2s     // src[3] of rows 0-3
-+        trn1            v6.2s, v7.2s, v4.2s     // src[5] of rows 0-3
-+        trn2            v4.2s, v7.2s, v4.2s     // src[7] of rows 0-3
-+        trn1            v7.2s, v1.2s, v16.2s    // src[0] of rows 0-3
-+        trn1            v20.2s, v2.2s, v17.2s   // src[4] of rows 0-3
-+        shl             v21.4h, v19.4h, #4      //          16 * src[1]
-+        trn2            v1.2s, v1.2s, v16.2s    // src[2] of rows 0-3
-+        shl             v16.4h, v3.4h, #4       //                        16 * src[3]
-+        trn2            v2.2s, v2.2s, v17.2s    // src[6] of rows 0-3
-+        shl             v17.4h, v6.4h, #4       //                                      16 * src[5]
-+        ld1             {v22.8b}, [x0], x1      // sample row 2
-+        shl             v23.4h, v4.4h, #4       //                                                    16 * src[7]
-+        mul             v24.4h, v1.4h, v0.h[0]  //                       6/2 * src[2]
-+        ld1             {v25.8b}, [x0]          // sample row 3
-+        shl             v26.4h, v19.4h, #2      //           4 * src[1]
-+        shl             v27.4h, v6.4h, #2       //                                       4 * src[5]
-+        ssra            v21.4h, v23.4h, #2      //          16 * src[1]                             +  4 * src[7]
-+        ssra            v17.4h, v16.4h, #2      //                         4 * src[3] + 16 * src[5]
-+        sub             v23.4h, v23.4h, v26.4h  //        -  4 * src[1]                             + 16 * src[7]
-+        sub             v16.4h, v16.4h, v27.4h  //                        16 * src[3] -  4 * src[5]
-+        shl             v7.4h, v7.4h, #2        //         8/2 * src[0]
-+        shl             v20.4h, v20.4h, #2      //                                     8/2 * src[4]
-+        mla             v21.4h, v3.4h, v0.h[2]  //          16 * src[1] + 15 * src[3]               +  4 * src[7]
-+        shl             v1.4h, v1.4h, #3        //                      16/2 * src[2]
-+        mls             v17.4h, v19.4h, v0.h[2] //        - 15 * src[1] +  4 * src[3] + 16 * src[5]
-+        ssra            v7.4h, v7.4h, #1        //        12/2 * src[0]
-+        mls             v16.4h, v19.4h, v0.h[1] //        -  9 * src[1] + 16 * src[3] -  4 * src[5]
-+        ssra            v20.4h, v20.4h, #1      //                                    12/2 * src[4]
-+        mla             v23.4h, v3.4h, v0.h[1]  //        -  4 * src[1] +  9 * src[3]               + 16 * src[7]
-+        shl             v3.4h, v2.4h, #3        //                                                  16/2 * src[6]
-+        mla             v1.4h, v2.4h, v0.h[0]   // t3/2 =               16/2 * src[2]             +  6/2 * src[6]
-+        mla             v21.4h, v6.4h, v0.h[1]  //  t1  =   16 * src[1] + 15 * src[3] +  9 * src[5] +  4 * src[7]
-+        mla             v17.4h, v4.4h, v0.h[1]  // -t2  = - 15 * src[1] +  4 * src[3] + 16 * src[5] +  9 * src[7]
-+        sub             v2.4h, v24.4h, v3.4h    // t4/2 =                6/2 * src[2]             - 16/2 * src[6]
-+        mls             v16.4h, v4.4h, v0.h[2]  // -t3  = -  9 * src[1] + 16 * src[3] -  4 * src[5] - 15 * src[7]
-+        add             v3.4h, v7.4h, v20.4h    // t1/2 = 12/2 * src[0]             + 12/2 * src[4]
-+        mls             v23.4h, v6.4h, v0.h[2]  // -t4  = -  4 * src[1] +  9 * src[3] - 15 * src[5] + 16 * src[7]
-+        sub             v4.4h, v7.4h, v20.4h    // t2/2 = 12/2 * src[0]             - 12/2 * src[4]
-+        neg             v6.4h, v21.4h           // -t1
-+        add             v7.4h, v3.4h, v1.4h     // t5/2 = t1/2 + t3/2
-+        sub             v19.4h, v3.4h, v1.4h    // t8/2 = t1/2 - t3/2
-+        add             v20.4h, v4.4h, v2.4h    // t6/2 = t2/2 + t4/2
-+        sub             v24.4h, v4.4h, v2.4h    // t7/2 = t2/2 - t4/2
-+        add             v26.4h, v3.4h, v1.4h    // t5/2 = t1/2 + t3/2
-+        add             v27.4h, v4.4h, v2.4h    // t6/2 = t2/2 + t4/2
-+        sub             v2.4h, v4.4h, v2.4h     // t7/2 = t2/2 - t4/2
-+        sub             v1.4h, v3.4h, v1.4h     // t8/2 = t1/2 - t3/2
-+        neg             v3.4h, v17.4h           // +t2
-+        neg             v4.4h, v16.4h           // +t3
-+        neg             v28.4h, v23.4h          // +t4
-+        ssra            v7.4h, v21.4h, #1       // (t5 + t1) >> 1
-+        ssra            v1.4h, v23.4h, #1       // (t8 - t4) >> 1
-+        ssra            v20.4h, v3.4h, #1       // (t6 + t2) >> 1
-+        ssra            v24.4h, v4.4h, #1       // (t7 + t3) >> 1
-+        ssra            v19.4h, v28.4h, #1      // (t8 + t4) >> 1
-+        ssra            v2.4h, v16.4h, #1       // (t7 - t3) >> 1
-+        ssra            v27.4h, v17.4h, #1      // (t6 - t2) >> 1
-+        ssra            v26.4h, v6.4h, #1       // (t5 - t1) >> 1
-+        trn1            v1.2d, v7.2d, v1.2d     // pack two 4-lane results per 128-bit vector
-+        trn1            v2.2d, v20.2d, v2.2d
-+        trn1            v3.2d, v24.2d, v27.2d
-+        trn1            v4.2d, v19.2d, v26.2d
-+        srshr           v1.8h, v1.8h, #2        // (t5 + t1 + 4) >> 3, (t8 - t4 + 4) >> 3
-+        srshr           v2.8h, v2.8h, #2        // (t6 + t2 + 4) >> 3, (t7 - t3 + 4) >> 3
-+        srshr           v3.8h, v3.8h, #2        // (t7 + t3 + 4) >> 3, (t6 - t2 + 4) >> 3
-+        srshr           v4.8h, v4.8h, #2        // (t8 + t4 + 4) >> 3, (t5 - t1 + 4) >> 3
-+        trn2            v6.8h, v1.8h, v2.8h     // transpose back: one 8-wide row per vector
-+        trn1            v1.8h, v1.8h, v2.8h
-+        trn2            v2.8h, v3.8h, v4.8h
-+        trn1            v3.8h, v3.8h, v4.8h
-+        trn2            v4.4s, v6.4s, v2.4s     // row 3 (src[24..31])
-+        trn1            v7.4s, v1.4s, v3.4s     // row 0 (src[0..7])
-+        trn2            v1.4s, v1.4s, v3.4s     // row 2 (src[16..23])
-+        mul             v3.8h, v4.8h, v0.h[5]   //                                                           22/2 * src[24]
-+        trn1            v2.4s, v6.4s, v2.4s     // row 1 (src[8..15])
-+        mul             v4.8h, v4.8h, v0.h[4]   //                                                           10/2 * src[24]
-+        mul             v6.8h, v7.8h, v0.h[6]   //            17 * src[0]
-+        mul             v1.8h, v1.8h, v0.h[6]   //                                            17 * src[16]
-+        mls             v3.8h, v2.8h, v0.h[4]   //  t4/2 =                - 10/2 * src[8]                  + 22/2 * src[24]
-+        mla             v4.8h, v2.8h, v0.h[5]   //  t3/2 =                  22/2 * src[8]                  + 10/2 * src[24]
-+        add             v0.8h, v6.8h, v1.8h     //   t1  =    17 * src[0]                 +   17 * src[16]
-+        sub             v1.8h, v6.8h, v1.8h     //   t2  =    17 * src[0]                 -   17 * src[16]
-+        neg             v2.8h, v3.8h            // -t4/2
-+        neg             v6.8h, v4.8h            // -t3/2
-+        ssra            v4.8h, v0.8h, #1        // (t1 + t3) >> 1
-+        ssra            v2.8h, v1.8h, #1        // (t2 - t4) >> 1
-+        ssra            v3.8h, v1.8h, #1        // (t2 + t4) >> 1
-+        ssra            v6.8h, v0.8h, #1        // (t1 - t3) >> 1
-+        srshr           v0.8h, v4.8h, #6        // (t1 + t3 + 64) >> 7
-+        srshr           v1.8h, v2.8h, #6        // (t2 - t4 + 64) >> 7
-+        srshr           v2.8h, v3.8h, #6        // (t2 + t4 + 64) >> 7
-+        srshr           v3.8h, v6.8h, #6        // (t1 - t3 + 64) >> 7
-+        uaddw           v0.8h, v0.8h, v5.8b     // add zero-extended sample row 0
-+        uaddw           v1.8h, v1.8h, v18.8b    // add zero-extended sample row 1
-+        uaddw           v2.8h, v2.8h, v22.8b    // add zero-extended sample row 2
-+        uaddw           v3.8h, v3.8h, v25.8b    // add zero-extended sample row 3
-+        sqxtun          v0.8b, v0.8h            // saturate each sum to unsigned 8 bits
-+        sqxtun          v1.8b, v1.8h
-+        sqxtun          v2.8b, v2.8h
-+        sqxtun          v3.8b, v3.8h
-+        st1             {v0.8b}, [x3], x1       // write rows 0-3 back through the saved pointer
-+        st1             {v1.8b}, [x3], x1
-+        st1             {v2.8b}, [x3], x1
-+        st1             {v3.8b}, [x3]
-+        ret
-+endfunc
-+
-+// VC-1 4x8 inverse transform (4-point pass along rows, then 8-point pass along columns)
-+// On entry:
-+//   x0 -> array of 8-bit samples, in row-major order
-+//   x1 = row stride for 8-bit sample array
-+//   x2 -> array of 16-bit inverse transform coefficients, in row-major order (row stride is 8 coefficients)
-+// On exit:
-+//   array at x0 updated by saturated addition of (narrowed) transformed block
-+function ff_vc1_inv_trans_4x8_neon, export=1
-+        mov             x3, #16                 // coefficient row stride in bytes (8 x 16-bit)
-+        ldr             q0, .Lcoeffs_it8        // includes 4-point coefficients in upper half of vector
-+        mov             x4, x0                  // x4 = store pointer; x0 advances during the sample loads
-+        ld1             {v1.d}[0], [x2], x3     // 00 01 02 03
-+        ld1             {v2.d}[0], [x2], x3     // 10 11 12 13
-+        ld1             {v3.d}[0], [x2], x3     // 20 21 22 23
-+        ld1             {v4.d}[0], [x2], x3     // 30 31 32 33
-+        ld1             {v1.d}[1], [x2], x3     // 40 41 42 43
-+        ld1             {v2.d}[1], [x2], x3     // 50 51 52 53
-+        ld1             {v3.d}[1], [x2], x3     // 60 61 62 63
-+        ld1             {v4.d}[1], [x2]         // 70 71 72 73
-+        ld1             {v5.s}[0], [x0], x1     // sample row 0 (4 bytes)
-+        ld1             {v6.s}[0], [x0], x1     // sample row 1
-+        ld1             {v7.s}[0], [x0], x1     // sample row 2
-+        trn2            v16.8h, v1.8h, v2.8h    // 01 11 03 13 41 51 43 53
-+        trn1            v1.8h, v1.8h, v2.8h     // 00 10 02 12 40 50 42 52
-+        trn2            v2.8h, v3.8h, v4.8h     // 21 31 23 33 61 71 63 73
-+        trn1            v3.8h, v3.8h, v4.8h     // 20 30 22 32 60 70 62 72
-+        ld1             {v4.s}[0], [x0], x1     // sample row 3
-+        trn2            v17.4s, v16.4s, v2.4s   // 03 13 23 33 43 53 63 73
-+        trn1            v18.4s, v1.4s, v3.4s    // 00 10 20 30 40 50 60 70
-+        trn1            v2.4s, v16.4s, v2.4s    // 01 11 21 31 41 51 61 71
-+        mul             v16.8h, v17.8h, v0.h[4] //                                                          10/2 * src[3]
-+        ld1             {v5.s}[1], [x0], x1     // sample row 4
-+        mul             v17.8h, v17.8h, v0.h[5] //                                                          22/2 * src[3]
-+        ld1             {v6.s}[1], [x0], x1     // sample row 5
-+        trn2            v1.4s, v1.4s, v3.4s     // 02 12 22 32 42 52 62 72
-+        mul             v3.8h, v18.8h, v0.h[6]  //            17 * src[0]
-+        ld1             {v7.s}[1], [x0], x1     // sample row 6
-+        mul             v1.8h, v1.8h, v0.h[6]   //                                            17 * src[2]
-+        ld1             {v4.s}[1], [x0]         // sample row 7
-+        mla             v16.8h, v2.8h, v0.h[5]  //  t3/2 =                  22/2 * src[1]                 + 10/2 * src[3]
-+        mls             v17.8h, v2.8h, v0.h[4]  //  t4/2 =                - 10/2 * src[1]                 + 22/2 * src[3]
-+        add             v2.8h, v3.8h, v1.8h     //   t1  =    17 * src[0]                 +   17 * src[2]
-+        sub             v1.8h, v3.8h, v1.8h     //   t2  =    17 * src[0]                 -   17 * src[2]
-+        neg             v3.8h, v16.8h           // -t3/2
-+        ssra            v16.8h, v2.8h, #1       // (t1 + t3) >> 1
-+        neg             v18.8h, v17.8h          // -t4/2
-+        ssra            v17.8h, v1.8h, #1       // (t2 + t4) >> 1
-+        ssra            v3.8h, v2.8h, #1        // (t1 - t3) >> 1
-+        ssra            v18.8h, v1.8h, #1       // (t2 - t4) >> 1
-+        srshr           v1.8h, v16.8h, #2       // (t1 + t3 + 4) >> 3
-+        srshr           v2.8h, v17.8h, #2       // (t2 + t4 + 4) >> 3
-+        srshr           v3.8h, v3.8h, #2        // (t1 - t3 + 4) >> 3
-+        srshr           v16.8h, v18.8h, #2      // (t2 - t4 + 4) >> 3
-+        trn2            v17.8h, v2.8h, v3.8h    // 12 13 32 33 52 53 72 73
-+        trn2            v18.8h, v1.8h, v16.8h   // 10 11 30 31 50 51 70 71
-+        trn1            v1.8h, v1.8h, v16.8h    // 00 01 20 21 40 41 60 61
-+        trn1            v2.8h, v2.8h, v3.8h     // 02 03 22 23 42 43 62 63
-+        trn1            v3.4s, v18.4s, v17.4s   // 10 11 12 13 50 51 52 53
-+        trn2            v16.4s, v18.4s, v17.4s  // 30 31 32 33 70 71 72 73
-+        trn1            v17.4s, v1.4s, v2.4s    // 00 01 02 03 40 41 42 43
-+        mov             d18, v3.d[1]            // 50 51 52 53
-+        shl             v19.4h, v3.4h, #4       //          16 * src[8]
-+        mov             d20, v16.d[1]           // 70 71 72 73
-+        shl             v21.4h, v16.4h, #4      //                        16 * src[24]
-+        mov             d22, v17.d[1]           // 40 41 42 43
-+        shl             v23.4h, v3.4h, #2       //           4 * src[8]
-+        shl             v24.4h, v18.4h, #4      //                                       16 * src[40]
-+        shl             v25.4h, v20.4h, #4      //                                                      16 * src[56]
-+        shl             v26.4h, v18.4h, #2      //                                        4 * src[40]
-+        trn2            v1.4s, v1.4s, v2.4s     // 20 21 22 23 60 61 62 63
-+        ssra            v24.4h, v21.4h, #2      //                         4 * src[24] + 16 * src[40]
-+        sub             v2.4h, v25.4h, v23.4h   //        -  4 * src[8]                               + 16 * src[56]
-+        shl             v17.4h, v17.4h, #2      //         8/2 * src[0]
-+        sub             v21.4h, v21.4h, v26.4h  //                        16 * src[24] -  4 * src[40]
-+        shl             v22.4h, v22.4h, #2      //                                      8/2 * src[32]
-+        mov             d23, v1.d[1]            // 60 61 62 63
-+        ssra            v19.4h, v25.4h, #2      //          16 * src[8]                               +  4 * src[56]
-+        mul             v25.4h, v1.4h, v0.h[0]  //                       6/2 * src[16]
-+        shl             v1.4h, v1.4h, #3        //                      16/2 * src[16]
-+        mls             v24.4h, v3.4h, v0.h[2]  //        - 15 * src[8] +  4 * src[24] + 16 * src[40]
-+        ssra            v17.4h, v17.4h, #1      //        12/2 * src[0]
-+        mls             v21.4h, v3.4h, v0.h[1]  //        -  9 * src[8] + 16 * src[24] -  4 * src[40]
-+        ssra            v22.4h, v22.4h, #1      //                                     12/2 * src[32]
-+        mla             v2.4h, v16.4h, v0.h[1]  //        -  4 * src[8] +  9 * src[24]                + 16 * src[56]
-+        shl             v3.4h, v23.4h, #3       //                                                    16/2 * src[48]
-+        mla             v19.4h, v16.4h, v0.h[2] //          16 * src[8] + 15 * src[24]                +  4 * src[56]
-+        mla             v1.4h, v23.4h, v0.h[0]  // t3/2 =               16/2 * src[16]              +  6/2 * src[48]
-+        mla             v24.4h, v20.4h, v0.h[1] // -t2  = - 15 * src[8] +  4 * src[24] + 16 * src[40] +  9 * src[56]
-+        add             v16.4h, v17.4h, v22.4h  // t1/2 = 12/2 * src[0]              + 12/2 * src[32]
-+        sub             v3.4h, v25.4h, v3.4h    // t4/2 =                6/2 * src[16]              - 16/2 * src[48]
-+        sub             v17.4h, v17.4h, v22.4h  // t2/2 = 12/2 * src[0]              - 12/2 * src[32]
-+        mls             v21.4h, v20.4h, v0.h[2] // -t3  = -  9 * src[8] + 16 * src[24] -  4 * src[40] - 15 * src[56]
-+        mla             v19.4h, v18.4h, v0.h[1] //  t1  =   16 * src[8] + 15 * src[24] +  9 * src[40] +  4 * src[56]
-+        add             v20.4h, v16.4h, v1.4h   // t5/2 = t1/2 + t3/2
-+        mls             v2.4h, v18.4h, v0.h[2]  // -t4  = -  4 * src[8] +  9 * src[24] - 15 * src[40] + 16 * src[56]
-+        sub             v0.4h, v16.4h, v1.4h    // t8/2 = t1/2 - t3/2
-+        add             v18.4h, v17.4h, v3.4h   // t6/2 = t2/2 + t4/2
-+        sub             v22.4h, v17.4h, v3.4h   // t7/2 = t2/2 - t4/2
-+        neg             v23.4h, v24.4h          // +t2
-+        sub             v25.4h, v17.4h, v3.4h   // t7/2 = t2/2 - t4/2
-+        add             v3.4h, v17.4h, v3.4h    // t6/2 = t2/2 + t4/2
-+        neg             v17.4h, v21.4h          // +t3
-+        sub             v26.4h, v16.4h, v1.4h   // t8/2 = t1/2 - t3/2
-+        add             v1.4h, v16.4h, v1.4h    // t5/2 = t1/2 + t3/2
-+        neg             v16.4h, v19.4h          // -t1
-+        neg             v27.4h, v2.4h           // +t4
-+        ssra            v20.4h, v19.4h, #1      // (t5 + t1) >> 1
-+        srsra           v0.4h, v2.4h, #1        // (t8 - t4 + 1) >> 1
-+        ssra            v18.4h, v23.4h, #1      // (t6 + t2) >> 1
-+        srsra           v22.4h, v21.4h, #1      // (t7 - t3 + 1) >> 1
-+        ssra            v25.4h, v17.4h, #1      // (t7 + t3) >> 1
-+        srsra           v3.4h, v24.4h, #1       // (t6 - t2 + 1) >> 1
-+        ssra            v26.4h, v27.4h, #1      // (t8 + t4) >> 1
-+        srsra           v1.4h, v16.4h, #1       // (t5 - t1 + 1) >> 1
-+        trn1            v0.2d, v20.2d, v0.2d    // pack output rows 0 and 4
-+        trn1            v2.2d, v18.2d, v22.2d   // pack output rows 1 and 5
-+        trn1            v3.2d, v25.2d, v3.2d    // pack output rows 2 and 6
-+        trn1            v1.2d, v26.2d, v1.2d    // pack output rows 3 and 7
-+        srshr           v0.8h, v0.8h, #6        // (t5 + t1 + 64) >> 7, (t8 - t4 + 65) >> 7
-+        srshr           v2.8h, v2.8h, #6        // (t6 + t2 + 64) >> 7, (t7 - t3 + 65) >> 7
-+        srshr           v3.8h, v3.8h, #6        // (t7 + t3 + 64) >> 7, (t6 - t2 + 65) >> 7
-+        srshr           v1.8h, v1.8h, #6        // (t8 + t4 + 64) >> 7, (t5 - t1 + 65) >> 7
-+        uaddw           v0.8h, v0.8h, v5.8b     // add zero-extended sample rows 0 and 4
-+        uaddw           v2.8h, v2.8h, v6.8b     // add zero-extended sample rows 1 and 5
-+        uaddw           v3.8h, v3.8h, v7.8b     // add zero-extended sample rows 2 and 6
-+        uaddw           v1.8h, v1.8h, v4.8b     // add zero-extended sample rows 3 and 7
-+        sqxtun          v0.8b, v0.8h            // saturate each sum to unsigned 8 bits
-+        sqxtun          v2.8b, v2.8h
-+        sqxtun          v3.8b, v3.8h
-+        sqxtun          v1.8b, v1.8h
-+        st1             {v0.s}[0], [x4], x1     // row 0
-+        st1             {v2.s}[0], [x4], x1     // row 1
-+        st1             {v3.s}[0], [x4], x1     // row 2
-+        st1             {v1.s}[0], [x4], x1     // row 3
-+        st1             {v0.s}[1], [x4], x1     // row 4
-+        st1             {v2.s}[1], [x4], x1     // row 5
-+        st1             {v3.s}[1], [x4], x1     // row 6
-+        st1             {v1.s}[1], [x4]         // row 7
-+        ret
-+endfunc
-+
-+// VC-1 4x4 inverse transform (4-point pass along rows, then 4-point pass along columns)
-+// On entry:
-+//   x0 -> array of 8-bit samples, in row-major order
-+//   x1 = row stride for 8-bit sample array
-+//   x2 -> array of 16-bit inverse transform coefficients, in row-major order (row stride is 8 coefficients)
-+// On exit:
-+//   array at x0 updated by saturated addition of (narrowed) transformed block
-+function ff_vc1_inv_trans_4x4_neon, export=1
-+        mov             x3, #16                 // coefficient row stride in bytes (8 x 16-bit)
-+        ldr             d0, .Lcoeffs_it4        // 4-point multipliers land in v0.h[0..2]
-+        mov             x4, x0                  // x4 = store pointer; x0 advances during the sample loads
-+        ld1             {v1.d}[0], [x2], x3     // 00 01 02 03
-+        ld1             {v2.d}[0], [x2], x3     // 10 11 12 13
-+        ld1             {v3.d}[0], [x2], x3     // 20 21 22 23
-+        ld1             {v4.d}[0], [x2]         // 30 31 32 33
-+        ld1             {v5.s}[0], [x0], x1     // sample row 0 (4 bytes)
-+        ld1             {v5.s}[1], [x0], x1     // sample row 1
-+        ld1             {v6.s}[0], [x0], x1     // sample row 2
-+        trn2            v7.4h, v1.4h, v2.4h     // 01 11 03 13
-+        trn1            v1.4h, v1.4h, v2.4h     // 00 10 02 12
-+        ld1             {v6.s}[1], [x0]         // sample row 3
-+        trn2            v2.4h, v3.4h, v4.4h     // 21 31 23 33
-+        trn1            v3.4h, v3.4h, v4.4h     // 20 30 22 32
-+        trn2            v4.2s, v7.2s, v2.2s     // 03 13 23 33
-+        trn1            v16.2s, v1.2s, v3.2s    // 00 10 20 30
-+        trn1            v2.2s, v7.2s, v2.2s     // 01 11 21 31
-+        trn2            v1.2s, v1.2s, v3.2s     // 02 12 22 32
-+        mul             v3.4h, v4.4h, v0.h[0]   //                                                          10/2 * src[3]
-+        mul             v4.4h, v4.4h, v0.h[1]   //                                                          22/2 * src[3]
-+        mul             v7.4h, v16.4h, v0.h[2]  //            17 * src[0]
-+        mul             v1.4h, v1.4h, v0.h[2]   //                                            17 * src[2]
-+        mla             v3.4h, v2.4h, v0.h[1]   //  t3/2 =                  22/2 * src[1]                 + 10/2 * src[3]
-+        mls             v4.4h, v2.4h, v0.h[0]   //  t4/2 =                - 10/2 * src[1]                 + 22/2 * src[3]
-+        add             v2.4h, v7.4h, v1.4h     //   t1  =    17 * src[0]                 +   17 * src[2]
-+        sub             v1.4h, v7.4h, v1.4h     //   t2  =    17 * src[0]                 -   17 * src[2]
-+        neg             v7.4h, v3.4h            // -t3/2
-+        neg             v16.4h, v4.4h           // -t4/2
-+        ssra            v3.4h, v2.4h, #1        // (t1 + t3) >> 1
-+        ssra            v4.4h, v1.4h, #1        // (t2 + t4) >> 1
-+        ssra            v16.4h, v1.4h, #1       // (t2 - t4) >> 1
-+        ssra            v7.4h, v2.4h, #1        // (t1 - t3) >> 1
-+        srshr           v1.4h, v3.4h, #2        // (t1 + t3 + 4) >> 3
-+        srshr           v2.4h, v4.4h, #2        // (t2 + t4 + 4) >> 3
-+        srshr           v3.4h, v16.4h, #2       // (t2 - t4 + 4) >> 3
-+        srshr           v4.4h, v7.4h, #2        // (t1 - t3 + 4) >> 3
-+        trn2            v7.4h, v1.4h, v3.4h     // 10 11 30 31
-+        trn1            v1.4h, v1.4h, v3.4h     // 00 01 20 21
-+        trn2            v3.4h, v2.4h, v4.4h     // 12 13 32 33
-+        trn1            v2.4h, v2.4h, v4.4h     // 02 03 22 23
-+        trn2            v4.2s, v7.2s, v3.2s     // 30 31 32 33
-+        trn1            v16.2s, v1.2s, v2.2s    // 00 01 02 03
-+        trn1            v3.2s, v7.2s, v3.2s     // 10 11 12 13
-+        trn2            v1.2s, v1.2s, v2.2s     // 20 21 22 23
-+        mul             v2.4h, v4.4h, v0.h[1]   //                                                           22/2 * src[24]
-+        mul             v4.4h, v4.4h, v0.h[0]   //                                                           10/2 * src[24]
-+        mul             v7.4h, v16.4h, v0.h[2]  //            17 * src[0]
-+        mul             v1.4h, v1.4h, v0.h[2]   //                                            17 * src[16]
-+        mls             v2.4h, v3.4h, v0.h[0]   //  t4/2 =                - 10/2 * src[8]                  + 22/2 * src[24]
-+        mla             v4.4h, v3.4h, v0.h[1]   //  t3/2 =                  22/2 * src[8]                  + 10/2 * src[24]
-+        add             v0.4h, v7.4h, v1.4h     //   t1  =    17 * src[0]                 +   17 * src[16]
-+        sub             v1.4h, v7.4h, v1.4h     //   t2  =    17 * src[0]                 -   17 * src[16]
-+        neg             v3.4h, v2.4h            // -t4/2
-+        neg             v7.4h, v4.4h            // -t3/2
-+        ssra            v4.4h, v0.4h, #1        // (t1 + t3) >> 1
-+        ssra            v3.4h, v1.4h, #1        // (t2 - t4) >> 1
-+        ssra            v2.4h, v1.4h, #1        // (t2 + t4) >> 1
-+        ssra            v7.4h, v0.4h, #1        // (t1 - t3) >> 1
-+        trn1            v0.2d, v4.2d, v3.2d     // pack output rows 0 and 1
-+        trn1            v1.2d, v2.2d, v7.2d     // pack output rows 2 and 3
-+        srshr           v0.8h, v0.8h, #6        // (t1 + t3 + 64) >> 7, (t2 - t4 + 64) >> 7
-+        srshr           v1.8h, v1.8h, #6        // (t2 + t4 + 64) >> 7, (t1 - t3 + 64) >> 7
-+        uaddw           v0.8h, v0.8h, v5.8b     // add zero-extended sample rows 0 and 1
-+        uaddw           v1.8h, v1.8h, v6.8b     // add zero-extended sample rows 2 and 3
-+        sqxtun          v0.8b, v0.8h            // saturate each sum to unsigned 8 bits
-+        sqxtun          v1.8b, v1.8h
-+        st1             {v0.s}[0], [x4], x1     // row 0
-+        st1             {v0.s}[1], [x4], x1     // row 1
-+        st1             {v1.s}[0], [x4], x1     // row 2
-+        st1             {v1.s}[1], [x4]         // row 3
-+        ret
-+endfunc
-+
-+// VC-1 8x8 inverse transform, DC case: adds ((3 * ((3 * dc + 1) >> 1)) + 16) >> 5 to every sample
-+// On entry:
-+//   x0 -> array of 8-bit samples, in row-major order
-+//   x1 = row stride for 8-bit sample array
-+//   x2 -> 16-bit inverse transform DC coefficient
-+// On exit:
-+//   array at x0 updated by saturated addition of (narrowed) transformed block
-+function ff_vc1_inv_trans_8x8_dc_neon, export=1
-+        ldrsh           w2, [x2]                // dc, sign-extended
-+        mov             x3, x0                  // x3 = store pointer; x0 advances during the loads
-+        ld1             {v0.8b}, [x0], x1       // sample rows 0-7 go to v0-v7, interleaved with the scalar math
-+        ld1             {v1.8b}, [x0], x1
-+        ld1             {v2.8b}, [x0], x1
-+        add             w2, w2, w2, lsl #1      // 3 * dc
-+        ld1             {v3.8b}, [x0], x1
-+        ld1             {v4.8b}, [x0], x1
-+        add             w2, w2, #1              // 3 * dc + 1
-+        ld1             {v5.8b}, [x0], x1
-+        asr             w2, w2, #1              // dc' = (3 * dc + 1) >> 1
-+        ld1             {v6.8b}, [x0], x1
-+        add             w2, w2, w2, lsl #1      // 3 * dc'
-+        ld1             {v7.8b}, [x0]
-+        add             w0, w2, #16             // 3 * dc' + 16 (x0 is free after the last load)
-+        asr             w0, w0, #5              // (3 * dc' + 16) >> 5
-+        dup             v16.8h, w0              // broadcast the scaled DC to 8 halfword lanes
-+        uaddw           v0.8h, v16.8h, v0.8b    // scaled DC + zero-extended samples, per row
-+        uaddw           v1.8h, v16.8h, v1.8b
-+        uaddw           v2.8h, v16.8h, v2.8b
-+        uaddw           v3.8h, v16.8h, v3.8b
-+        uaddw           v4.8h, v16.8h, v4.8b
-+        uaddw           v5.8h, v16.8h, v5.8b
-+        sqxtun          v0.8b, v0.8h            // saturate each sum to unsigned 8 bits
-+        uaddw           v6.8h, v16.8h, v6.8b
-+        sqxtun          v1.8b, v1.8h
-+        uaddw           v7.8h, v16.8h, v7.8b
-+        sqxtun          v2.8b, v2.8h
-+        sqxtun          v3.8b, v3.8h
-+        sqxtun          v4.8b, v4.8h
-+        st1             {v0.8b}, [x3], x1       // row 0
-+        sqxtun          v0.8b, v5.8h            // v0 reused for row 5 once row 0 is stored
-+        st1             {v1.8b}, [x3], x1       // row 1
-+        sqxtun          v1.8b, v6.8h            // v1 reused for row 6
-+        st1             {v2.8b}, [x3], x1       // row 2
-+        sqxtun          v2.8b, v7.8h            // v2 reused for row 7
-+        st1             {v3.8b}, [x3], x1       // row 3
-+        st1             {v4.8b}, [x3], x1       // row 4
-+        st1             {v0.8b}, [x3], x1       // row 5
-+        st1             {v1.8b}, [x3], x1       // row 6
-+        st1             {v2.8b}, [x3]           // row 7
-+        ret
-+endfunc
-+
-+// VC-1 8x4 inverse transform, DC case: adds ((17 * ((3 * dc + 1) >> 1)) + 64) >> 7 to every sample
-+// On entry:
-+//   x0 -> array of 8-bit samples, in row-major order
-+//   x1 = row stride for 8-bit sample array
-+//   x2 -> 16-bit inverse transform DC coefficient
-+// On exit:
-+//   array at x0 updated by saturated addition of (narrowed) transformed block
-+function ff_vc1_inv_trans_8x4_dc_neon, export=1
-+        ldrsh           w2, [x2]                // dc, sign-extended
-+        mov             x3, x0                  // x3 = store pointer; x0 advances during the loads
-+        ld1             {v0.8b}, [x0], x1       // sample row 0
-+        ld1             {v1.8b}, [x0], x1       // sample row 1
-+        ld1             {v2.8b}, [x0], x1       // sample row 2
-+        add             w2, w2, w2, lsl #1      // 3 * dc
-+        ld1             {v3.8b}, [x0]           // sample row 3
-+        add             w0, w2, #1              // 3 * dc + 1 (x0 is free after the last load)
-+        asr             w0, w0, #1              // dc' = (3 * dc + 1) >> 1
-+        add             w0, w0, w0, lsl #4      // 17 * dc'
-+        add             w0, w0, #64             // 17 * dc' + 64
-+        asr             w0, w0, #7              // (17 * dc' + 64) >> 7
-+        dup             v4.8h, w0               // broadcast the scaled DC to 8 halfword lanes
-+        uaddw           v0.8h, v4.8h, v0.8b     // scaled DC + zero-extended samples, per row
-+        uaddw           v1.8h, v4.8h, v1.8b
-+        uaddw           v2.8h, v4.8h, v2.8b
-+        uaddw           v3.8h, v4.8h, v3.8b
-+        sqxtun          v0.8b, v0.8h            // saturate each sum to unsigned 8 bits
-+        sqxtun          v1.8b, v1.8h
-+        sqxtun          v2.8b, v2.8h
-+        sqxtun          v3.8b, v3.8h
-+        st1             {v0.8b}, [x3], x1       // row 0
-+        st1             {v1.8b}, [x3], x1       // row 1
-+        st1             {v2.8b}, [x3], x1       // row 2
-+        st1             {v3.8b}, [x3]           // row 3
-+        ret
-+endfunc
-+
-+// VC-1 4x8 inverse transform, DC case: adds ((3 * ((17 * dc + 4) >> 3)) + 16) >> 5 to every sample
-+// On entry:
-+//   x0 -> array of 8-bit samples, in row-major order
-+//   x1 = row stride for 8-bit sample array
-+//   x2 -> 16-bit inverse transform DC coefficient
-+// On exit:
-+//   array at x0 updated by saturated addition of (narrowed) transformed block
-+function ff_vc1_inv_trans_4x8_dc_neon, export=1
-+        ldrsh           w2, [x2]                // dc, sign-extended
-+        mov             x3, x0                  // x3 = store pointer; x0 advances during the loads
-+        ld1             {v0.s}[0], [x0], x1     // sample row 0 (4 bytes)
-+        ld1             {v1.s}[0], [x0], x1     // sample row 1
-+        ld1             {v2.s}[0], [x0], x1     // sample row 2
-+        add             w2, w2, w2, lsl #4      // 17 * dc
-+        ld1             {v3.s}[0], [x0], x1     // sample row 3
-+        add             w2, w2, #4              // 17 * dc + 4
-+        asr             w2, w2, #3              // dc' = (17 * dc + 4) >> 3
-+        add             w2, w2, w2, lsl #1      // 3 * dc'
-+        ld1             {v0.s}[1], [x0], x1     // sample row 4
-+        add             w2, w2, #16             // 3 * dc' + 16
-+        asr             w2, w2, #5              // (3 * dc' + 16) >> 5
-+        dup             v4.8h, w2               // broadcast the scaled DC to 8 halfword lanes
-+        ld1             {v1.s}[1], [x0], x1     // sample row 5
-+        ld1             {v2.s}[1], [x0], x1     // sample row 6
-+        ld1             {v3.s}[1], [x0]         // sample row 7
-+        uaddw           v0.8h, v4.8h, v0.8b     // scaled DC + zero-extended rows 0 and 4
-+        uaddw           v1.8h, v4.8h, v1.8b     // rows 1 and 5
-+        uaddw           v2.8h, v4.8h, v2.8b     // rows 2 and 6
-+        uaddw           v3.8h, v4.8h, v3.8b     // rows 3 and 7
-+        sqxtun          v0.8b, v0.8h            // saturate each sum to unsigned 8 bits
-+        sqxtun          v1.8b, v1.8h
-+        sqxtun          v2.8b, v2.8h
-+        sqxtun          v3.8b, v3.8h
-+        st1             {v0.s}[0], [x3], x1     // row 0
-+        st1             {v1.s}[0], [x3], x1     // row 1
-+        st1             {v2.s}[0], [x3], x1     // row 2
-+        st1             {v3.s}[0], [x3], x1     // row 3
-+        st1             {v0.s}[1], [x3], x1     // row 4
-+        st1             {v1.s}[1], [x3], x1     // row 5
-+        st1             {v2.s}[1], [x3], x1     // row 6
-+        st1             {v3.s}[1], [x3]         // row 7
-+        ret
-+endfunc
-+
-+// VC-1 4x4 inverse transform, DC case: adds ((17 * ((17 * dc + 4) >> 3)) + 64) >> 7 to every sample
-+// On entry:
-+//   x0 -> array of 8-bit samples, in row-major order
-+//   x1 = row stride for 8-bit sample array
-+//   x2 -> 16-bit inverse transform DC coefficient
-+// On exit:
-+//   array at x0 updated by saturated addition of (narrowed) transformed block
-+function ff_vc1_inv_trans_4x4_dc_neon, export=1
-+        ldrsh           w2, [x2]                // dc, sign-extended
-+        mov             x3, x0                  // x3 = store pointer; x0 advances during the loads
-+        ld1             {v0.s}[0], [x0], x1     // sample row 0 (4 bytes)
-+        ld1             {v1.s}[0], [x0], x1     // sample row 1
-+        ld1             {v0.s}[1], [x0], x1     // sample row 2
-+        add             w2, w2, w2, lsl #4      // 17 * dc
-+        ld1             {v1.s}[1], [x0]         // sample row 3
-+        add             w0, w2, #4              // 17 * dc + 4 (x0 is free after the last load)
-+        asr             w0, w0, #3              // dc' = (17 * dc + 4) >> 3
-+        add             w0, w0, w0, lsl #4      // 17 * dc'
-+        add             w0, w0, #64             // 17 * dc' + 64
-+        asr             w0, w0, #7              // (17 * dc' + 64) >> 7
-+        dup             v2.8h, w0               // broadcast the scaled DC to 8 halfword lanes
-+        uaddw           v0.8h, v2.8h, v0.8b     // scaled DC + zero-extended rows 0 and 2
-+        uaddw           v1.8h, v2.8h, v1.8b     // rows 1 and 3
-+        sqxtun          v0.8b, v0.8h            // saturate each sum to unsigned 8 bits
-+        sqxtun          v1.8b, v1.8h
-+        st1             {v0.s}[0], [x3], x1     // row 0
-+        st1             {v1.s}[0], [x3], x1     // row 1
-+        st1             {v0.s}[1], [x3], x1     // row 2
-+        st1             {v1.s}[1], [x3]         // row 3
-+        ret
-+endfunc
-+
-+.align  5                               // 2^5 = 32-byte alignment for the constant pool below
-+.Lcoeffs_it8:
-+.quad   0x000F00090003                  // 16-bit fields, least significant first: 3, 9, 15, 0
-+.Lcoeffs_it4:
-+.quad   0x0011000B0005                  // 16-bit fields, least significant first: 5, 11, 17, 0
-+.Lcoeffs:
-+.quad   0x00050002                      // loop filter multipliers: h[0] = 2, h[1] = 5 once loaded with ldr d0
-+
-+// VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of vertically-neighbouring blocks
-+// On entry:
-+//   x0 -> top-left pel of lower block
-+//   x1 = row stride, bytes
-+//   w2 = PQUANT bitstream parameter
-+function ff_vc1_v_loop_filter4_neon, export=1
-+        sub             x3, x0, w1, sxtw #2     // x3 -> P1 row: x0 - 4*stride (stride taken from w1, sign-extended)
-+        ldr             d0, .Lcoeffs            // v0.h[0] = 2, v0.h[1] = 5
-+        ld1             {v1.s}[0], [x0], x1     // P5
-+        ld1             {v2.s}[0], [x3], x1     // P1
-+        ld1             {v3.s}[0], [x3], x1     // P2
-+        ld1             {v4.s}[0], [x0], x1     // P6
-+        ld1             {v5.s}[0], [x3], x1     // P3
-+        ld1             {v6.s}[0], [x0], x1     // P7
-+        ld1             {v7.s}[0], [x3]         // P4
-+        ld1             {v16.s}[0], [x0]        // P8
-+        ushll           v17.8h, v1.8b, #1       // 2*P5
-+        dup             v18.8h, w2              // pq
-+        ushll           v2.8h, v2.8b, #1        // 2*P1
-+        uxtl            v3.8h, v3.8b            // P2
-+        uxtl            v4.8h, v4.8b            // P6
-+        uxtl            v19.8h, v5.8b           // P3
-+        mls             v2.4h, v3.4h, v0.h[1]   // 2*P1-5*P2
-+        uxtl            v3.8h, v6.8b            // P7
-+        mls             v17.4h, v4.4h, v0.h[1]  // 2*P5-5*P6
-+        ushll           v5.8h, v5.8b, #1        // 2*P3
-+        uxtl            v6.8h, v7.8b            // P4
-+        mla             v17.4h, v3.4h, v0.h[1]  // 2*P5-5*P6+5*P7
-+        uxtl            v3.8h, v16.8b           // P8
-+        mla             v2.4h, v19.4h, v0.h[1]  // 2*P1-5*P2+5*P3
-+        uxtl            v1.8h, v1.8b            // P5
-+        mls             v5.4h, v6.4h, v0.h[1]   // 2*P3-5*P4
-+        mls             v17.4h, v3.4h, v0.h[0]  // 2*P5-5*P6+5*P7-2*P8
-+        sub             v3.4h, v6.4h, v1.4h     // P4-P5
-+        mls             v2.4h, v6.4h, v0.h[0]   // 2*P1-5*P2+5*P3-2*P4
-+        mla             v5.4h, v1.4h, v0.h[1]   // 2*P3-5*P4+5*P5
-+        mls             v5.4h, v4.4h, v0.h[0]   // 2*P3-5*P4+5*P5-2*P6
-+        abs             v4.4h, v3.4h            // |P4-P5|
-+        srshr           v7.4h, v17.4h, #3       // (2*P5-5*P6+5*P7-2*P8+4)>>3
-+        srshr           v2.4h, v2.4h, #3        // (2*P1-5*P2+5*P3-2*P4+4)>>3
-+        sshr            v4.4h, v4.4h, #1        // clip
-+        srshr           v5.4h, v5.4h, #3        // (2*P3-5*P4+5*P5-2*P6+4)>>3
-+        abs             v7.4h, v7.4h            // a2
-+        sshr            v3.4h, v3.4h, #8        // clip_sign
-+        abs             v2.4h, v2.4h            // a1
-+        cmeq            v16.4h, v4.4h, #0       // test clip == 0
-+        abs             v17.4h, v5.4h           // a0
-+        sshr            v5.4h, v5.4h, #8        // a0_sign
-+        cmhs            v19.4h, v2.4h, v7.4h    // test a1 >= a2
-+        cmhs            v18.4h, v17.4h, v18.4h  // test a0 >= pq
-+        sub             v3.4h, v3.4h, v5.4h     // clip_sign - a0_sign
-+        bsl             v19.8b, v7.8b, v2.8b    // a3
-+        orr             v2.8b, v16.8b, v18.8b   // test clip == 0 || a0 >= pq
-+        uqsub           v5.4h, v17.4h, v19.4h   // a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
-+        cmhs            v7.4h, v19.4h, v17.4h   // test a3 >= a0
-+        mul             v0.4h, v5.4h, v0.h[1]   // a0 >= a3 ? 5*(a0-a3) : 0
-+        orr             v5.8b, v2.8b, v7.8b     // test clip == 0 || a0 >= pq || a3 >= a0
-+        mov             w0, v5.s[1]             // move to gp reg (bit 0 is the flag of halfword lane 2, i.e. pixel pair 2)
-+        ushr            v0.4h, v0.4h, #3        // a0 >= a3 ? (5*(a0-a3))>>3 : 0
-+        cmhs            v5.4h, v0.4h, v4.4h     // test d >= clip
-+        tbnz            w0, #0, 1f              // none of the 4 pixel pairs should be updated if this one is not filtered
-+        bsl             v5.8b, v4.8b, v0.8b     // FFMIN(d, clip)
-+        bic             v0.8b, v5.8b, v2.8b     // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
-+        mls             v6.4h, v0.4h, v3.4h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
-+        mla             v1.4h, v0.4h, v3.4h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
-+        sqxtun          v0.8b, v6.8h            // narrow updated P4 to unsigned 8 bits with saturation
-+        sqxtun          v1.8b, v1.8h            // narrow updated P5 to unsigned 8 bits with saturation
-+        st1             {v0.s}[0], [x3], x1     // write P4 row (x3 was left on the P4 row by the loads)
-+        st1             {v1.s}[0], [x3]         // write P5 row
-+1:      ret                                     // early-out path arrives here without storing anything
-+endfunc
-+
-+// VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of horizontally-neighbouring blocks
-+// On entry:
-+//   x0 -> top-left pel of right block
-+//   x1 = row stride, bytes
-+//   w2 = PQUANT bitstream parameter
-+function ff_vc1_h_loop_filter4_neon, export=1
-+        sub             x3, x0, #4              // where to start reading
-+        ldr             d0, .Lcoeffs            // v0.h[0] = 2, v0.h[1] = 5
-+        ld1             {v1.8b}, [x3], x1       // row 0: P1[0], P2[0]... P8[0]
-+        sub             x0, x0, #1              // where to start writing
-+        ld1             {v2.8b}, [x3], x1       // row 1
-+        ld1             {v3.8b}, [x3], x1       // row 2
-+        ld1             {v4.8b}, [x3]           // row 3
-+        dup             v5.8h, w2               // pq
-+        trn1            v6.8b, v1.8b, v2.8b     // P1[0], P1[1], P3[0]...
-+        trn2            v1.8b, v1.8b, v2.8b     // P2[0], P2[1], P4[0]...
-+        trn1            v2.8b, v3.8b, v4.8b     // P1[2], P1[3], P3[2]...
-+        trn2            v3.8b, v3.8b, v4.8b     // P2[2], P2[3], P4[2]...
-+        trn1            v4.4h, v6.4h, v2.4h     // P1, P5
-+        trn1            v7.4h, v1.4h, v3.4h     // P2, P6
-+        trn2            v2.4h, v6.4h, v2.4h     // P3, P7
-+        trn2            v1.4h, v1.4h, v3.4h     // P4, P8
-+        ushll           v3.8h, v4.8b, #1        // 2*P1, 2*P5
-+        uxtl            v6.8h, v7.8b            // P2, P6
-+        uxtl            v7.8h, v2.8b            // P3, P7
-+        uxtl            v1.8h, v1.8b            // P4, P8
-+        mls             v3.8h, v6.8h, v0.h[1]   // 2*P1-5*P2, 2*P5-5*P6
-+        ushll           v2.8h, v2.8b, #1        // 2*P3, 2*P7
-+        uxtl            v4.8h, v4.8b            // P1, P5
-+        mla             v3.8h, v7.8h, v0.h[1]   // 2*P1-5*P2+5*P3, 2*P5-5*P6+5*P7
-+        mov             d6, v6.d[1]             // P6
-+        mls             v3.8h, v1.8h, v0.h[0]   // 2*P1-5*P2+5*P3-2*P4, 2*P5-5*P6+5*P7-2*P8
-+        mov             d4, v4.d[1]             // P5
-+        mls             v2.4h, v1.4h, v0.h[1]   // 2*P3-5*P4
-+        mla             v2.4h, v4.4h, v0.h[1]   // 2*P3-5*P4+5*P5
-+        sub             v7.4h, v1.4h, v4.4h     // P4-P5
-+        mls             v2.4h, v6.4h, v0.h[0]   // 2*P3-5*P4+5*P5-2*P6
-+        srshr           v3.8h, v3.8h, #3        // rounding shift of both sums: (x+4)>>3
-+        abs             v6.4h, v7.4h            // |P4-P5|
-+        sshr            v7.4h, v7.4h, #8        // clip_sign
-+        srshr           v2.4h, v2.4h, #3        // (2*P3-5*P4+5*P5-2*P6+4)>>3
-+        abs             v3.8h, v3.8h            // a1, a2
-+        sshr            v6.4h, v6.4h, #1        // clip
-+        mov             d16, v3.d[1]            // a2
-+        abs             v17.4h, v2.4h           // a0
-+        cmeq            v18.4h, v6.4h, #0       // test clip == 0
-+        sshr            v2.4h, v2.4h, #8        // a0_sign
-+        cmhs            v19.4h, v3.4h, v16.4h   // test a1 >= a2
-+        cmhs            v5.4h, v17.4h, v5.4h    // test a0 >= pq
-+        sub             v2.4h, v7.4h, v2.4h     // clip_sign - a0_sign
-+        bsl             v19.8b, v16.8b, v3.8b   // a3
-+        orr             v3.8b, v18.8b, v5.8b    // test clip == 0 || a0 >= pq
-+        uqsub           v5.4h, v17.4h, v19.4h   // a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
-+        cmhs            v7.4h, v19.4h, v17.4h   // test a3 >= a0
-+        mul             v0.4h, v5.4h, v0.h[1]   // a0 >= a3 ? 5*(a0-a3) : 0
-+        orr             v5.8b, v3.8b, v7.8b     // test clip == 0 || a0 >= pq || a3 >= a0
-+        mov             w2, v5.s[1]             // move to gp reg (bit 0 is the flag of halfword lane 2, i.e. pixel pair 2)
-+        ushr            v0.4h, v0.4h, #3        // a0 >= a3 ? (5*(a0-a3))>>3 : 0
-+        cmhs            v5.4h, v0.4h, v6.4h     // test d >= clip
-+        tbnz            w2, #0, 1f              // none of the 4 pixel pairs should be updated if this one is not filtered
-+        bsl             v5.8b, v6.8b, v0.8b     // FFMIN(d, clip)
-+        bic             v0.8b, v5.8b, v3.8b     // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
-+        mla             v4.4h, v0.4h, v2.4h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
-+        mls             v1.4h, v0.4h, v2.4h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
-+        sqxtun          v3.8b, v4.8h            // narrow updated P5 to unsigned 8 bits with saturation
-+        sqxtun          v2.8b, v1.8h            // narrow updated P4 to unsigned 8 bits with saturation
-+        st2             {v2.b, v3.b}[0], [x0], x1 // row 0: write P4, P5 as adjacent bytes
-+        st2             {v2.b, v3.b}[1], [x0], x1 // row 1
-+        st2             {v2.b, v3.b}[2], [x0], x1 // row 2
-+        st2             {v2.b, v3.b}[3], [x0]   // row 3
-+1:      ret                                     // early-out path arrives here without storing anything
-+endfunc
-+
-+// VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of vertically-neighbouring blocks
-+// On entry:
-+//   x0 -> top-left pel of lower block
-+//   x1 = row stride, bytes
-+//   w2 = PQUANT bitstream parameter
-+function ff_vc1_v_loop_filter8_neon, export=1
-+        sub             x3, x0, w1, sxtw #2     // x3 -> P1 row: x0 - 4*stride (stride taken from w1, sign-extended)
-+        ldr             d0, .Lcoeffs            // v0.h[0] = 2, v0.h[1] = 5
-+        ld1             {v1.8b}, [x0], x1       // P5
-+        movi            v2.2d, #0x0000ffff00000000 // mask selecting halfword lane 2 of each group of 4 lanes
-+        ld1             {v3.8b}, [x3], x1       // P1
-+        ld1             {v4.8b}, [x3], x1       // P2
-+        ld1             {v5.8b}, [x0], x1       // P6
-+        ld1             {v6.8b}, [x3], x1       // P3
-+        ld1             {v7.8b}, [x0], x1       // P7
-+        ushll           v16.8h, v1.8b, #1       // 2*P5
-+        ushll           v3.8h, v3.8b, #1        // 2*P1
-+        ld1             {v17.8b}, [x3]          // P4
-+        uxtl            v4.8h, v4.8b            // P2
-+        ld1             {v18.8b}, [x0]          // P8
-+        uxtl            v5.8h, v5.8b            // P6
-+        dup             v19.8h, w2              // pq
-+        uxtl            v20.8h, v6.8b           // P3
-+        mls             v3.8h, v4.8h, v0.h[1]   // 2*P1-5*P2
-+        uxtl            v4.8h, v7.8b            // P7
-+        ushll           v6.8h, v6.8b, #1        // 2*P3
-+        mls             v16.8h, v5.8h, v0.h[1]  // 2*P5-5*P6
-+        uxtl            v7.8h, v17.8b           // P4
-+        uxtl            v17.8h, v18.8b          // P8
-+        mla             v16.8h, v4.8h, v0.h[1]  // 2*P5-5*P6+5*P7
-+        uxtl            v1.8h, v1.8b            // P5
-+        mla             v3.8h, v20.8h, v0.h[1]  // 2*P1-5*P2+5*P3
-+        sub             v4.8h, v7.8h, v1.8h     // P4-P5
-+        mls             v6.8h, v7.8h, v0.h[1]   // 2*P3-5*P4
-+        mls             v16.8h, v17.8h, v0.h[0] // 2*P5-5*P6+5*P7-2*P8
-+        abs             v17.8h, v4.8h           // |P4-P5|
-+        sshr            v4.8h, v4.8h, #8        // clip_sign
-+        mls             v3.8h, v7.8h, v0.h[0]   // 2*P1-5*P2+5*P3-2*P4
-+        sshr            v17.8h, v17.8h, #1      // clip
-+        mla             v6.8h, v1.8h, v0.h[1]   // 2*P3-5*P4+5*P5
-+        srshr           v16.8h, v16.8h, #3      // (2*P5-5*P6+5*P7-2*P8+4)>>3
-+        mls             v6.8h, v5.8h, v0.h[0]   // 2*P3-5*P4+5*P5-2*P6
-+        cmeq            v5.8h, v17.8h, #0       // test clip == 0
-+        srshr           v3.8h, v3.8h, #3        // (2*P1-5*P2+5*P3-2*P4+4)>>3
-+        abs             v16.8h, v16.8h          // a2
-+        abs             v3.8h, v3.8h            // a1
-+        srshr           v6.8h, v6.8h, #3        // (2*P3-5*P4+5*P5-2*P6+4)>>3
-+        cmhs            v18.8h, v3.8h, v16.8h   // test a1 >= a2
-+        abs             v20.8h, v6.8h           // a0
-+        sshr            v6.8h, v6.8h, #8        // a0_sign
-+        bsl             v18.16b, v16.16b, v3.16b // a3
-+        cmhs            v3.8h, v20.8h, v19.8h   // test a0 >= pq
-+        sub             v4.8h, v4.8h, v6.8h     // clip_sign - a0_sign
-+        uqsub           v6.8h, v20.8h, v18.8h   // a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
-+        cmhs            v16.8h, v18.8h, v20.8h  // test a3 >= a0
-+        orr             v3.16b, v5.16b, v3.16b  // test clip == 0 || a0 >= pq
-+        mul             v0.8h, v6.8h, v0.h[1]   // a0 >= a3 ? 5*(a0-a3) : 0
-+        orr             v5.16b, v3.16b, v16.16b // test clip == 0 || a0 >= pq || a3 >= a0
-+        cmtst           v2.2d, v5.2d, v2.2d     // if pair 2 (0-based) of each group of 4 is not filtered, then none of the others in the group should be either
-+        mov             w0, v5.s[1]             // move to gp reg (bit 0 is the flag of pixel pair 2)
-+        ushr            v0.8h, v0.8h, #3        // a0 >= a3 ? (5*(a0-a3))>>3 : 0
-+        mov             w2, v5.s[3]             // likewise for the second group (bit 0 is the flag of pixel pair 6)
-+        orr             v2.16b, v3.16b, v2.16b  // per-lane "do not filter" mask: clip == 0 || a0 >= pq || whole group disabled
-+        cmhs            v3.8h, v0.8h, v17.8h    // test d >= clip
-+        and             w0, w0, w2              // bit 0 set only if both groups are unfiltered
-+        bsl             v3.16b, v17.16b, v0.16b // FFMIN(d, clip)
-+        tbnz            w0, #0, 1f              // none of the 8 pixel pairs should be updated in this case
-+        bic             v0.16b, v3.16b, v2.16b  // set each d to zero if it should not be filtered
-+        mls             v7.8h, v0.8h, v4.8h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
-+        mla             v1.8h, v0.8h, v4.8h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
-+        sqxtun          v0.8b, v7.8h            // narrow updated P4 to unsigned 8 bits with saturation
-+        sqxtun          v1.8b, v1.8h            // narrow updated P5 to unsigned 8 bits with saturation
-+        st1             {v0.8b}, [x3], x1       // write P4 row (x3 was left on the P4 row by the loads)
-+        st1             {v1.8b}, [x3]           // write P5 row
-+1:      ret                                     // early-out path arrives here without storing anything
-+endfunc
-+
-+// VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of horizontally-neighbouring blocks
-+// On entry:
-+//   x0 -> top-left pel of right block
-+//   x1 = row stride, bytes
-+//   w2 = PQUANT bitstream parameter
-+function ff_vc1_h_loop_filter8_neon, export=1
-+        sub             x3, x0, #4              // where to start reading
-+        ldr             d0, .Lcoeffs            // v0.h[0] = 2, v0.h[1] = 5
-+        ld1             {v1.8b}, [x3], x1       // P1[0], P2[0]...
-+        sub             x0, x0, #1              // where to start writing
-+        ld1             {v2.8b}, [x3], x1       // row 1
-+        add             x4, x0, x1, lsl #2      // where to start writing rows 4..7
-+        ld1             {v3.8b}, [x3], x1       // row 2
-+        ld1             {v4.8b}, [x3], x1       // row 3
-+        ld1             {v5.8b}, [x3], x1       // row 4
-+        ld1             {v6.8b}, [x3], x1       // row 5
-+        ld1             {v7.8b}, [x3], x1       // row 6
-+        trn1            v16.8b, v1.8b, v2.8b    // P1[0], P1[1], P3[0]...
-+        ld1             {v17.8b}, [x3]          // row 7
-+        trn2            v1.8b, v1.8b, v2.8b     // P2[0], P2[1], P4[0]...
-+        trn1            v2.8b, v3.8b, v4.8b     // P1[2], P1[3], P3[2]...
-+        trn2            v3.8b, v3.8b, v4.8b     // P2[2], P2[3], P4[2]...
-+        dup             v4.8h, w2               // pq
-+        trn1            v18.8b, v5.8b, v6.8b    // P1[4], P1[5], P3[4]...
-+        trn2            v5.8b, v5.8b, v6.8b     // P2[4], P2[5], P4[4]...
-+        trn1            v6.4h, v16.4h, v2.4h    // P1[0], P1[1], P1[2], P1[3], P5[0]...
-+        trn1            v19.4h, v1.4h, v3.4h    // P2[0], P2[1], P2[2], P2[3], P6[0]...
-+        trn1            v20.8b, v7.8b, v17.8b   // P1[6], P1[7], P3[6]...
-+        trn2            v7.8b, v7.8b, v17.8b    // P2[6], P2[7], P4[6]...
-+        trn2            v2.4h, v16.4h, v2.4h    // P3[0], P3[1], P3[2], P3[3], P7[0]...
-+        trn2            v1.4h, v1.4h, v3.4h     // P4[0], P4[1], P4[2], P4[3], P8[0]...
-+        trn1            v3.4h, v18.4h, v20.4h   // P1[4], P1[5], P1[6], P1[7], P5[4]...
-+        trn1            v16.4h, v5.4h, v7.4h    // P2[4], P2[5], P2[6], P2[7], P6[4]...
-+        trn2            v17.4h, v18.4h, v20.4h  // P3[4], P3[5], P3[6], P3[7], P7[4]...
-+        trn2            v5.4h, v5.4h, v7.4h     // P4[4], P4[5], P4[6], P4[7], P8[4]...
-+        trn1            v7.2s, v6.2s, v3.2s     // P1
-+        trn1            v18.2s, v19.2s, v16.2s  // P2
-+        trn2            v3.2s, v6.2s, v3.2s     // P5
-+        trn2            v6.2s, v19.2s, v16.2s   // P6
-+        trn1            v16.2s, v2.2s, v17.2s   // P3
-+        trn2            v2.2s, v2.2s, v17.2s    // P7
-+        ushll           v7.8h, v7.8b, #1        // 2*P1
-+        trn1            v17.2s, v1.2s, v5.2s    // P4
-+        ushll           v19.8h, v3.8b, #1       // 2*P5
-+        trn2            v1.2s, v1.2s, v5.2s     // P8
-+        uxtl            v5.8h, v18.8b           // P2
-+        uxtl            v6.8h, v6.8b            // P6
-+        uxtl            v18.8h, v16.8b          // P3
-+        mls             v7.8h, v5.8h, v0.h[1]   // 2*P1-5*P2
-+        uxtl            v2.8h, v2.8b            // P7
-+        ushll           v5.8h, v16.8b, #1       // 2*P3
-+        mls             v19.8h, v6.8h, v0.h[1]  // 2*P5-5*P6
-+        uxtl            v16.8h, v17.8b          // P4
-+        uxtl            v1.8h, v1.8b            // P8
-+        mla             v19.8h, v2.8h, v0.h[1]  // 2*P5-5*P6+5*P7
-+        uxtl            v2.8h, v3.8b            // P5
-+        mla             v7.8h, v18.8h, v0.h[1]  // 2*P1-5*P2+5*P3
-+        sub             v3.8h, v16.8h, v2.8h    // P4-P5
-+        mls             v5.8h, v16.8h, v0.h[1]  // 2*P3-5*P4
-+        mls             v19.8h, v1.8h, v0.h[0]  // 2*P5-5*P6+5*P7-2*P8
-+        abs             v1.8h, v3.8h            // |P4-P5|
-+        sshr            v3.8h, v3.8h, #8        // clip_sign
-+        mls             v7.8h, v16.8h, v0.h[0]  // 2*P1-5*P2+5*P3-2*P4
-+        sshr            v1.8h, v1.8h, #1        // clip
-+        mla             v5.8h, v2.8h, v0.h[1]   // 2*P3-5*P4+5*P5
-+        srshr           v17.8h, v19.8h, #3      // (2*P5-5*P6+5*P7-2*P8+4)>>3
-+        mls             v5.8h, v6.8h, v0.h[0]   // 2*P3-5*P4+5*P5-2*P6
-+        cmeq            v6.8h, v1.8h, #0        // test clip == 0
-+        srshr           v7.8h, v7.8h, #3        // (2*P1-5*P2+5*P3-2*P4+4)>>3
-+        abs             v17.8h, v17.8h          // a2
-+        abs             v7.8h, v7.8h            // a1
-+        srshr           v5.8h, v5.8h, #3        // (2*P3-5*P4+5*P5-2*P6+4)>>3
-+        cmhs            v18.8h, v7.8h, v17.8h   // test a1 >= a2
-+        abs             v19.8h, v5.8h           // a0
-+        sshr            v5.8h, v5.8h, #8        // a0_sign
-+        bsl             v18.16b, v17.16b, v7.16b // a3
-+        cmhs            v4.8h, v19.8h, v4.8h    // test a0 >= pq
-+        sub             v3.8h, v3.8h, v5.8h     // clip_sign - a0_sign
-+        uqsub           v5.8h, v19.8h, v18.8h   // a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
-+        cmhs            v7.8h, v18.8h, v19.8h   // test a3 >= a0
-+        orr             v4.16b, v6.16b, v4.16b  // test clip == 0 || a0 >= pq
-+        mul             v0.8h, v5.8h, v0.h[1]   // a0 >= a3 ? 5*(a0-a3) : 0
-+        orr             v5.16b, v4.16b, v7.16b  // test clip == 0 || a0 >= pq || a3 >= a0
-+        mov             w2, v5.s[1]             // move to gp reg (bit 0 is the flag of pixel pair 2)
-+        ushr            v0.8h, v0.8h, #3        // a0 >= a3 ? (5*(a0-a3))>>3 : 0
-+        mov             w3, v5.s[3]             // likewise for the second group (bit 0 is the flag of pixel pair 6)
-+        cmhs            v5.8h, v0.8h, v1.8h     // test d >= clip
-+        and             w5, w2, w3              // bit 0 set only if both groups are unfiltered
-+        bsl             v5.16b, v1.16b, v0.16b  // FFMIN(d, clip)
-+        tbnz            w5, #0, 2f              // none of the 8 pixel pairs should be updated in this case
-+        bic             v0.16b, v5.16b, v4.16b  // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
-+        mla             v2.8h, v0.8h, v3.8h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
-+        mls             v16.8h, v0.8h, v3.8h    // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
-+        sqxtun          v1.8b, v2.8h            // narrow updated P5 to unsigned 8 bits with saturation
-+        sqxtun          v0.8b, v16.8h           // narrow updated P4 to unsigned 8 bits with saturation
-+        tbnz            w2, #0, 1f              // none of the first 4 pixel pairs should be updated if so
-+        st2             {v0.b, v1.b}[0], [x0], x1 // row 0: write P4, P5 as adjacent bytes
-+        st2             {v0.b, v1.b}[1], [x0], x1 // row 1
-+        st2             {v0.b, v1.b}[2], [x0], x1 // row 2
-+        st2             {v0.b, v1.b}[3], [x0]   // row 3
-+1:      tbnz            w3, #0, 2f              // none of the second 4 pixel pairs should be updated if so
-+        st2             {v0.b, v1.b}[4], [x4], x1 // row 4
-+        st2             {v0.b, v1.b}[5], [x4], x1 // row 5
-+        st2             {v0.b, v1.b}[6], [x4], x1 // row 6
-+        st2             {v0.b, v1.b}[7], [x4]   // row 7
-+2:      ret
-+endfunc
-+
-+// VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of vertically-neighbouring blocks
-+// On entry:
-+//   x0 -> top-left pel of lower block
-+//   x1 = row stride, bytes
-+//   w2 = PQUANT bitstream parameter
-+function ff_vc1_v_loop_filter16_neon, export=1
-+        sub             x3, x0, w1, sxtw #2     // x3 -> P1 row: x0 - 4*stride (stride taken from w1, sign-extended)
-+        ldr             d0, .Lcoeffs            // v0.h[0] = 2, v0.h[1] = 5
-+        ld1             {v1.16b}, [x0], x1      // P5
-+        movi            v2.2d, #0x0000ffff00000000 // mask selecting halfword lane 2 of each group of 4 lanes
-+        ld1             {v3.16b}, [x3], x1      // P1
-+        ld1             {v4.16b}, [x3], x1      // P2
-+        ld1             {v5.16b}, [x0], x1      // P6
-+        ld1             {v6.16b}, [x3], x1      // P3
-+        ld1             {v7.16b}, [x0], x1      // P7
-+        ushll           v16.8h, v1.8b, #1       // 2*P5[0..7]
-+        ushll           v17.8h, v3.8b, #1       // 2*P1[0..7]
-+        ld1             {v18.16b}, [x3]         // P4
-+        uxtl            v19.8h, v4.8b           // P2[0..7]
-+        ld1             {v20.16b}, [x0]         // P8
-+        uxtl            v21.8h, v5.8b           // P6[0..7]
-+        dup             v22.8h, w2              // pq
-+        ushll2          v3.8h, v3.16b, #1       // 2*P1[8..15]
-+        mls             v17.8h, v19.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7]
-+        ushll2          v19.8h, v1.16b, #1      // 2*P5[8..15]
-+        uxtl2           v4.8h, v4.16b           // P2[8..15]
-+        mls             v16.8h, v21.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7]
-+        uxtl2           v5.8h, v5.16b           // P6[8..15]
-+        uxtl            v23.8h, v6.8b           // P3[0..7]
-+        uxtl            v24.8h, v7.8b           // P7[0..7]
-+        mls             v3.8h, v4.8h, v0.h[1]   // 2*P1[8..15]-5*P2[8..15]
-+        ushll           v4.8h, v6.8b, #1        // 2*P3[0..7]
-+        uxtl            v25.8h, v18.8b          // P4[0..7]
-+        mls             v19.8h, v5.8h, v0.h[1]  // 2*P5[8..15]-5*P6[8..15]
-+        uxtl2           v26.8h, v6.16b          // P3[8..15]
-+        mla             v17.8h, v23.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]
-+        uxtl2           v7.8h, v7.16b           // P7[8..15]
-+        ushll2          v6.8h, v6.16b, #1       // 2*P3[8..15]
-+        mla             v16.8h, v24.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]
-+        uxtl2           v18.8h, v18.16b         // P4[8..15]
-+        uxtl            v23.8h, v20.8b          // P8[0..7]
-+        mls             v4.8h, v25.8h, v0.h[1]  // 2*P3[0..7]-5*P4[0..7]
-+        uxtl            v24.8h, v1.8b           // P5[0..7]
-+        uxtl2           v20.8h, v20.16b         // P8[8..15]
-+        mla             v3.8h, v26.8h, v0.h[1]  // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]
-+        uxtl2           v1.8h, v1.16b           // P5[8..15]
-+        sub             v26.8h, v25.8h, v24.8h  // P4[0..7]-P5[0..7]
-+        mla             v19.8h, v7.8h, v0.h[1]  // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]
-+        sub             v7.8h, v18.8h, v1.8h    // P4[8..15]-P5[8..15]
-+        mls             v6.8h, v18.8h, v0.h[1]  // 2*P3[8..15]-5*P4[8..15]
-+        abs             v27.8h, v26.8h          // |P4[0..7]-P5[0..7]|
-+        sshr            v26.8h, v26.8h, #8      // clip_sign[0..7]
-+        mls             v17.8h, v25.8h, v0.h[0] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7]
-+        abs             v28.8h, v7.8h           // |P4[8..15]-P5[8..15]|
-+        sshr            v27.8h, v27.8h, #1      // clip[0..7]
-+        mls             v16.8h, v23.8h, v0.h[0] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7]
-+        sshr            v7.8h, v7.8h, #8        // clip_sign[8..15]
-+        sshr            v23.8h, v28.8h, #1      // clip[8..15]
-+        mla             v4.8h, v24.8h, v0.h[1]  // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]
-+        cmeq            v28.8h, v27.8h, #0      // test clip[0..7] == 0
-+        srshr           v17.8h, v17.8h, #3      // rounding shift of the P1..P4 sum [0..7]: (x+4)>>3
-+        mls             v3.8h, v18.8h, v0.h[0]  // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15]
-+        cmeq            v29.8h, v23.8h, #0      // test clip[8..15] == 0
-+        srshr           v16.8h, v16.8h, #3      // rounding shift of the P5..P8 sum [0..7]: (x+4)>>3
-+        mls             v19.8h, v20.8h, v0.h[0] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15]
-+        abs             v17.8h, v17.8h          // a1[0..7]
-+        mla             v6.8h, v1.8h, v0.h[1]   // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]
-+        srshr           v3.8h, v3.8h, #3        // rounding shift of the P1..P4 sum [8..15]: (x+4)>>3
-+        mls             v4.8h, v21.8h, v0.h[0]  // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7]
-+        abs             v16.8h, v16.8h          // a2[0..7]
-+        srshr           v19.8h, v19.8h, #3      // rounding shift of the P5..P8 sum [8..15]: (x+4)>>3
-+        mls             v6.8h, v5.8h, v0.h[0]   // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15]
-+        cmhs            v5.8h, v17.8h, v16.8h   // test a1[0..7] >= a2[0..7]
-+        abs             v3.8h, v3.8h            // a1[8..15]
-+        srshr           v4.8h, v4.8h, #3        // rounding shift of the P3..P6 sum [0..7]: (x+4)>>3
-+        abs             v19.8h, v19.8h          // a2[8..15]
-+        bsl             v5.16b, v16.16b, v17.16b // a3[0..7]
-+        srshr           v6.8h, v6.8h, #3        // rounding shift of the P3..P6 sum [8..15]: (x+4)>>3
-+        cmhs            v16.8h, v3.8h, v19.8h   // test a1[8..15] >= a2[8..15]
-+        abs             v17.8h, v4.8h           // a0[0..7]
-+        sshr            v4.8h, v4.8h, #8        // a0_sign[0..7]
-+        bsl             v16.16b, v19.16b, v3.16b // a3[8..15]
-+        uqsub           v3.8h, v17.8h, v5.8h    // a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
-+        abs             v19.8h, v6.8h           // a0[8..15]
-+        cmhs            v20.8h, v17.8h, v22.8h  // test a0[0..7] >= pq
-+        cmhs            v5.8h, v5.8h, v17.8h    // test a3[0..7] >= a0[0..7]
-+        sub             v4.8h, v26.8h, v4.8h    // clip_sign[0..7] - a0_sign[0..7]
-+        sshr            v6.8h, v6.8h, #8        // a0_sign[8..15]
-+        mul             v3.8h, v3.8h, v0.h[1]   // a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0
-+        uqsub           v17.8h, v19.8h, v16.8h  // a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
-+        orr             v20.16b, v28.16b, v20.16b // test clip[0..7] == 0 || a0[0..7] >= pq
-+        cmhs            v21.8h, v19.8h, v22.8h  // test a0[8..15] >= pq
-+        cmhs            v16.8h, v16.8h, v19.8h  // test a3[8..15] >= a0[8..15]
-+        mul             v0.8h, v17.8h, v0.h[1]  // a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0
-+        sub             v6.8h, v7.8h, v6.8h     // clip_sign[8..15] - a0_sign[8..15]
-+        orr             v5.16b, v20.16b, v5.16b // test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7]
-+        ushr            v3.8h, v3.8h, #3        // a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0
-+        orr             v7.16b, v29.16b, v21.16b // test clip[8..15] == 0 || a0[8..15] >= pq
-+        cmtst           v17.2d, v5.2d, v2.2d    // if pair 2 (0-based) of each group of 4 is not filtered, then none of the others in the group should be either
-+        mov             w0, v5.s[1]             // move to gp reg (bit 0 is the flag of pixel pair 2)
-+        cmhs            v19.8h, v3.8h, v27.8h   // test d[0..7] >= clip[0..7]
-+        ushr            v0.8h, v0.8h, #3        // a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0
-+        mov             w2, v5.s[3]             // bit 0 is the flag of pixel pair 6
-+        orr             v5.16b, v7.16b, v16.16b // test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15]
-+        orr             v16.16b, v20.16b, v17.16b // per-lane "do not filter" mask [0..7]: clip == 0 || a0 >= pq || whole group disabled
-+        bsl             v19.16b, v27.16b, v3.16b // FFMIN(d[0..7], clip[0..7])
-+        cmtst           v2.2d, v5.2d, v2.2d     // same group-disable test for pairs [8..15] (keyed on pairs 10 and 14)
-+        cmhs            v3.8h, v0.8h, v23.8h    // test d[8..15] >= clip[8..15]
-+        mov             w4, v5.s[1]             // bit 0 is the flag of pixel pair 10
-+        mov             w5, v5.s[3]             // bit 0 is the flag of pixel pair 14
-+        and             w0, w0, w2              // bit 0 set only if both groups in [0..7] are unfiltered
-+        bic             v5.16b, v19.16b, v16.16b // set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub)
-+        orr             v2.16b, v7.16b, v2.16b  // per-lane "do not filter" mask [8..15]: clip == 0 || a0 >= pq || whole group disabled
-+        bsl             v3.16b, v23.16b, v0.16b // FFMIN(d[8..15], clip[8..15])
-+        mls             v25.8h, v5.8h, v4.8h    // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4[0..7]
-+        and             w2, w4, w5              // bit 0 set only if both groups in [8..15] are unfiltered
-+        bic             v0.16b, v3.16b, v2.16b  // set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub)
-+        mla             v24.8h, v5.8h, v4.8h    // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5[0..7]
-+        and             w0, w0, w2              // bit 0 set only if all four groups are unfiltered
-+        mls             v18.8h, v0.8h, v6.8h    // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4[8..15]
-+        sqxtun          v2.8b, v25.8h           // narrow updated P4[0..7] to unsigned 8 bits with saturation
-+        tbnz            w0, #0, 1f              // none of the 16 pixel pairs should be updated in this case
-+        mla             v1.8h, v0.8h, v6.8h     // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5[8..15]
-+        sqxtun          v0.8b, v24.8h           // narrow updated P5[0..7] to unsigned 8 bits with saturation
-+        sqxtun2         v2.16b, v18.8h          // narrow updated P4[8..15] into the upper half
-+        sqxtun2         v0.16b, v1.8h           // narrow updated P5[8..15] into the upper half
-+        st1             {v2.16b}, [x3], x1      // write P4 row (x3 was left on the P4 row by the loads)
-+        st1             {v0.16b}, [x3]          // write P5 row
-+1:      ret                                     // early-out path arrives here without storing anything
-+endfunc
-+
-+// VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of horizontally-neighbouring blocks
-+// On entry:
-+//   x0 -> top-left pel of right block
-+//   x1 = row stride, bytes
-+//   w2 = PQUANT bitstream parameter
-+function ff_vc1_h_loop_filter16_neon, export=1
-+        sub             x3, x0, #4              // where to start reading
-+        ldr             d0, .Lcoeffs
-+        ld1             {v1.8b}, [x3], x1       // P1[0], P2[0]...
-+        sub             x0, x0, #1              // where to start writing
-+        ld1             {v2.8b}, [x3], x1
-+        add             x4, x0, x1, lsl #3
-+        ld1             {v3.8b}, [x3], x1
-+        add             x5, x0, x1, lsl #2
-+        ld1             {v4.8b}, [x3], x1
-+        add             x6, x4, x1, lsl #2
-+        ld1             {v5.8b}, [x3], x1
-+        ld1             {v6.8b}, [x3], x1
-+        ld1             {v7.8b}, [x3], x1
-+        trn1            v16.8b, v1.8b, v2.8b    // P1[0], P1[1], P3[0]...
-+        ld1             {v17.8b}, [x3], x1
-+        trn2            v1.8b, v1.8b, v2.8b     // P2[0], P2[1], P4[0]...
-+        ld1             {v2.8b}, [x3], x1
-+        trn1            v18.8b, v3.8b, v4.8b    // P1[2], P1[3], P3[2]...
-+        ld1             {v19.8b}, [x3], x1
-+        trn2            v3.8b, v3.8b, v4.8b     // P2[2], P2[3], P4[2]...
-+        ld1             {v4.8b}, [x3], x1
-+        trn1            v20.8b, v5.8b, v6.8b    // P1[4], P1[5], P3[4]...
-+        ld1             {v21.8b}, [x3], x1
-+        trn2            v5.8b, v5.8b, v6.8b     // P2[4], P2[5], P4[4]...
-+        ld1             {v6.8b}, [x3], x1
-+        trn1            v22.8b, v7.8b, v17.8b   // P1[6], P1[7], P3[6]...
-+        ld1             {v23.8b}, [x3], x1
-+        trn2            v7.8b, v7.8b, v17.8b    // P2[6], P2[7], P4[6]...
-+        ld1             {v17.8b}, [x3], x1
-+        trn1            v24.8b, v2.8b, v19.8b   // P1[8], P1[9], P3[8]...
-+        ld1             {v25.8b}, [x3]
-+        trn2            v2.8b, v2.8b, v19.8b    // P2[8], P2[9], P4[8]...
-+        trn1            v19.4h, v16.4h, v18.4h  // P1[0], P1[1], P1[2], P1[3], P5[0]...
-+        trn1            v26.8b, v4.8b, v21.8b   // P1[10], P1[11], P3[10]...
-+        trn2            v4.8b, v4.8b, v21.8b    // P2[10], P2[11], P4[10]...
-+        trn1            v21.4h, v1.4h, v3.4h    // P2[0], P2[1], P2[2], P2[3], P6[0]...
-+        trn1            v27.4h, v20.4h, v22.4h  // P1[4], P1[5], P1[6], P1[7], P5[4]...
-+        trn1            v28.8b, v6.8b, v23.8b   // P1[12], P1[13], P3[12]...
-+        trn2            v6.8b, v6.8b, v23.8b    // P2[12], P2[13], P4[12]...
-+        trn1            v23.4h, v5.4h, v7.4h    // P2[4], P2[5], P2[6], P2[7], P6[4]...
-+        trn1            v29.4h, v24.4h, v26.4h  // P1[8], P1[9], P1[10], P1[11], P5[8]...
-+        trn1            v30.8b, v17.8b, v25.8b  // P1[14], P1[15], P3[14]...
-+        trn2            v17.8b, v17.8b, v25.8b  // P2[14], P2[15], P4[14]...
-+        trn1            v25.4h, v2.4h, v4.4h    // P2[8], P2[9], P2[10], P2[11], P6[8]...
-+        trn1            v31.2s, v19.2s, v27.2s  // P1[0..7]
-+        trn2            v19.2s, v19.2s, v27.2s  // P5[0..7]
-+        trn1            v27.2s, v21.2s, v23.2s  // P2[0..7]
-+        trn2            v21.2s, v21.2s, v23.2s  // P6[0..7]
-+        trn1            v23.4h, v28.4h, v30.4h  // P1[12], P1[13], P1[14], P1[15], P5[12]...
-+        trn2            v16.4h, v16.4h, v18.4h  // P3[0], P3[1], P3[2], P3[3], P7[0]...
-+        trn1            v18.4h, v6.4h, v17.4h   // P2[12], P2[13], P2[14], P2[15], P6[12]...
-+        trn2            v20.4h, v20.4h, v22.4h  // P3[4], P3[5], P3[6], P3[7], P7[4]...
-+        trn2            v22.4h, v24.4h, v26.4h  // P3[8], P3[9], P3[10], P3[11], P7[8]...
-+        trn1            v24.2s, v29.2s, v23.2s  // P1[8..15]
-+        trn2            v23.2s, v29.2s, v23.2s  // P5[8..15]
-+        trn1            v26.2s, v25.2s, v18.2s  // P2[8..15]
-+        trn2            v18.2s, v25.2s, v18.2s  // P6[8..15]
-+        trn2            v25.4h, v28.4h, v30.4h  // P3[12], P3[13], P3[14], P3[15], P7[12]...
-+        trn2            v1.4h, v1.4h, v3.4h     // P4[0], P4[1], P4[2], P4[3], P8[0]...
-+        trn2            v3.4h, v5.4h, v7.4h     // P4[4], P4[5], P4[6], P4[7], P8[4]...
-+        trn2            v2.4h, v2.4h, v4.4h     // P4[8], P4[9], P4[10], P4[11], P8[8]...
-+        trn2            v4.4h, v6.4h, v17.4h    // P4[12], P4[13], P4[14], P4[15], P8[12]...
-+        ushll           v5.8h, v31.8b, #1       // 2*P1[0..7]
-+        ushll           v6.8h, v19.8b, #1       // 2*P5[0..7]
-+        trn1            v7.2s, v16.2s, v20.2s   // P3[0..7]
-+        uxtl            v17.8h, v27.8b          // P2[0..7]
-+        trn2            v16.2s, v16.2s, v20.2s  // P7[0..7]
-+        uxtl            v20.8h, v21.8b          // P6[0..7]
-+        trn1            v21.2s, v22.2s, v25.2s  // P3[8..15]
-+        ushll           v24.8h, v24.8b, #1      // 2*P1[8..15]
-+        trn2            v22.2s, v22.2s, v25.2s  // P7[8..15]
-+        ushll           v25.8h, v23.8b, #1      // 2*P5[8..15]
-+        trn1            v27.2s, v1.2s, v3.2s    // P4[0..7]
-+        uxtl            v26.8h, v26.8b          // P2[8..15]
-+        mls             v5.8h, v17.8h, v0.h[1]  // 2*P1[0..7]-5*P2[0..7]
-+        uxtl            v17.8h, v18.8b          // P6[8..15]
-+        mls             v6.8h, v20.8h, v0.h[1]  // 2*P5[0..7]-5*P6[0..7]
-+        trn1            v18.2s, v2.2s, v4.2s    // P4[8..15]
-+        uxtl            v28.8h, v7.8b           // P3[0..7]
-+        mls             v24.8h, v26.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15]
-+        uxtl            v16.8h, v16.8b          // P7[0..7]
-+        uxtl            v26.8h, v21.8b          // P3[8..15]
-+        mls             v25.8h, v17.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15]
-+        uxtl            v22.8h, v22.8b          // P7[8..15]
-+        ushll           v7.8h, v7.8b, #1        // 2*P3[0..7]
-+        uxtl            v27.8h, v27.8b          // P4[0..7]
-+        trn2            v1.2s, v1.2s, v3.2s     // P8[0..7]
-+        ushll           v3.8h, v21.8b, #1       // 2*P3[8..15]
-+        trn2            v2.2s, v2.2s, v4.2s     // P8[8..15]
-+        uxtl            v4.8h, v18.8b           // P4[8..15]
-+        mla             v5.8h, v28.8h, v0.h[1]  // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]
-+        uxtl            v1.8h, v1.8b            // P8[0..7]
-+        mla             v6.8h, v16.8h, v0.h[1]  // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]
-+        uxtl            v2.8h, v2.8b            // P8[8..15]
-+        uxtl            v16.8h, v19.8b          // P5[0..7]
-+        mla             v24.8h, v26.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]
-+        uxtl            v18.8h, v23.8b          // P5[8..15]
-+        dup             v19.8h, w2              // pq
-+        mla             v25.8h, v22.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]
-+        sub             v21.8h, v27.8h, v16.8h  // P4[0..7]-P5[0..7]
-+        sub             v22.8h, v4.8h, v18.8h   // P4[8..15]-P5[8..15]
-+        mls             v7.8h, v27.8h, v0.h[1]  // 2*P3[0..7]-5*P4[0..7]
-+        abs             v23.8h, v21.8h
-+        mls             v3.8h, v4.8h, v0.h[1]   // 2*P3[8..15]-5*P4[8..15]
-+        abs             v26.8h, v22.8h
-+        sshr            v21.8h, v21.8h, #8      // clip_sign[0..7]
-+        mls             v5.8h, v27.8h, v0.h[0]  // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7]
-+        sshr            v23.8h, v23.8h, #1      // clip[0..7]
-+        sshr            v26.8h, v26.8h, #1      // clip[8..15]
-+        mls             v6.8h, v1.8h, v0.h[0]   // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7]
-+        sshr            v1.8h, v22.8h, #8       // clip_sign[8..15]
-+        cmeq            v22.8h, v23.8h, #0      // test clip[0..7] == 0
-+        mls             v24.8h, v4.8h, v0.h[0]  // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15]
-+        cmeq            v28.8h, v26.8h, #0      // test clip[8..15] == 0
-+        srshr           v5.8h, v5.8h, #3
-+        mls             v25.8h, v2.8h, v0.h[0]  // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15]
-+        srshr           v2.8h, v6.8h, #3
-+        mla             v7.8h, v16.8h, v0.h[1]  // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]
-+        srshr           v6.8h, v24.8h, #3
-+        mla             v3.8h, v18.8h, v0.h[1]  // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]
-+        abs             v5.8h, v5.8h            // a1[0..7]
-+        srshr           v24.8h, v25.8h, #3
-+        mls             v3.8h, v17.8h, v0.h[0]  // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15]
-+        abs             v2.8h, v2.8h            // a2[0..7]
-+        abs             v6.8h, v6.8h            // a1[8..15]
-+        mls             v7.8h, v20.8h, v0.h[0]  // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7]
-+        abs             v17.8h, v24.8h          // a2[8..15]
-+        cmhs            v20.8h, v5.8h, v2.8h    // test a1[0..7] >= a2[0..7]
-+        srshr           v3.8h, v3.8h, #3
-+        cmhs            v24.8h, v6.8h, v17.8h   // test a1[8..15] >= a2[8..15]
-+        srshr           v7.8h, v7.8h, #3
-+        bsl             v20.16b, v2.16b, v5.16b // a3[0..7]
-+        abs             v2.8h, v3.8h            // a0[8..15]
-+        sshr            v3.8h, v3.8h, #8        // a0_sign[8..15]
-+        bsl             v24.16b, v17.16b, v6.16b // a3[8..15]
-+        abs             v5.8h, v7.8h            // a0[0..7]
-+        sshr            v6.8h, v7.8h, #8        // a0_sign[0..7]
-+        cmhs            v7.8h, v2.8h, v19.8h    // test a0[8..15] >= pq
-+        sub             v1.8h, v1.8h, v3.8h     // clip_sign[8..15] - a0_sign[8..15]
-+        uqsub           v3.8h, v2.8h, v24.8h    // a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0  (a0 > a3 in all cases where filtering is enabled, so it makes more sense to subtract this way round than to subtract the other way round and then take the abs)
-+        cmhs            v2.8h, v24.8h, v2.8h    // test a3[8..15] >= a0[8..15]
-+        uqsub           v17.8h, v5.8h, v20.8h   // a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0  (a0 > a3 in all cases where filtering is enabled, so it makes more sense to subtract this way round than to subtract the other way round and then take the abs)
-+        cmhs            v19.8h, v5.8h, v19.8h   // test a0[0..7] >= pq
-+        orr             v7.16b, v28.16b, v7.16b // test clip[8..15] == 0 || a0[8..15] >= pq
-+        sub             v6.8h, v21.8h, v6.8h    // clip_sign[0..7] - a0_sign[0..7]
-+        mul             v3.8h, v3.8h, v0.h[1]   // a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0
-+        cmhs            v5.8h, v20.8h, v5.8h    // test a3[0..7] >= a0[0..7]
-+        orr             v19.16b, v22.16b, v19.16b // test clip[0..7] == 0 || a0[0..7] >= pq
-+        mul             v0.8h, v17.8h, v0.h[1]  // a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0
-+        orr             v2.16b, v7.16b, v2.16b  // test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15]
-+        orr             v5.16b, v19.16b, v5.16b // test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7]
-+        ushr            v3.8h, v3.8h, #3        // a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0
-+        mov             w7, v2.s[1]
-+        mov             w8, v2.s[3]
-+        ushr            v0.8h, v0.8h, #3        // a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0
-+        mov             w2, v5.s[1]             // move to gp reg
-+        cmhs            v2.8h, v3.8h, v26.8h
-+        mov             w3, v5.s[3]
-+        cmhs            v5.8h, v0.8h, v23.8h
-+        bsl             v2.16b, v26.16b, v3.16b // FFMIN(d[8..15], clip[8..15])
-+        and             w9, w7, w8
-+        bsl             v5.16b, v23.16b, v0.16b // FFMIN(d[0..7], clip[0..7])
-+        and             w10, w2, w3
-+        bic             v0.16b, v2.16b, v7.16b  // set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub)
-+        and             w9, w10, w9
-+        bic             v2.16b, v5.16b, v19.16b // set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub)
-+        mls             v4.8h, v0.8h, v1.8h     // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4[8..15]
-+        tbnz            w9, #0, 4f              // none of the 16 pixel pairs should be updated in this case
-+        mls             v27.8h, v2.8h, v6.8h    // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4[0..7]
-+        mla             v16.8h, v2.8h, v6.8h    // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5[0..7]
-+        sqxtun          v2.8b, v4.8h
-+        mla             v18.8h, v0.8h, v1.8h    // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5[8..15]
-+        sqxtun          v0.8b, v27.8h
-+        sqxtun          v1.8b, v16.8h
-+        sqxtun          v3.8b, v18.8h
-+        tbnz            w2, #0, 1f
-+        st2             {v0.b, v1.b}[0], [x0], x1
-+        st2             {v0.b, v1.b}[1], [x0], x1
-+        st2             {v0.b, v1.b}[2], [x0], x1
-+        st2             {v0.b, v1.b}[3], [x0]
-+1:      tbnz            w3, #0, 2f
-+        st2             {v0.b, v1.b}[4], [x5], x1
-+        st2             {v0.b, v1.b}[5], [x5], x1
-+        st2             {v0.b, v1.b}[6], [x5], x1
-+        st2             {v0.b, v1.b}[7], [x5]
-+2:      tbnz            w7, #0, 3f
-+        st2             {v2.b, v3.b}[0], [x4], x1
-+        st2             {v2.b, v3.b}[1], [x4], x1
-+        st2             {v2.b, v3.b}[2], [x4], x1
-+        st2             {v2.b, v3.b}[3], [x4]
-+3:      tbnz            w8, #0, 4f
-+        st2             {v2.b, v3.b}[4], [x6], x1
-+        st2             {v2.b, v3.b}[5], [x6], x1
-+        st2             {v2.b, v3.b}[6], [x6], x1
-+        st2             {v2.b, v3.b}[7], [x6]
-+4:      ret
-+endfunc
-+
-+// Copy at most the specified number of bytes from source to destination buffer,
-+// stopping at a multiple of 32 bytes, none of which are the start of an escape sequence
-+// On entry:
-+//   x0 -> source buffer
-+//   w1 = max number of bytes to copy
-+//   x2 -> destination buffer, optimally 8-byte aligned
-+// On exit:
-+//   w0 = number of bytes not copied
-+function ff_vc1_unescape_buffer_helper_neon, export=1
-+        // Offset by 80 to screen out cases that are too short for us to handle,
-+        // and also make it easy to test for loop termination, or to determine
-+        // whether we need an odd number of half-iterations of the loop.
-+        subs            w1, w1, #80
-+        b.mi            90f
-+
-+        // Set up useful constants
-+        movi            v20.4s, #3, lsl #24
-+        movi            v21.4s, #3, lsl #16
-+
-+        tst             w1, #32
-+        b.ne            1f
-+
-+          ld1             {v0.16b, v1.16b, v2.16b}, [x0], #48
-+          ext             v25.16b, v0.16b, v1.16b, #1
-+          ext             v26.16b, v0.16b, v1.16b, #2
-+          ext             v27.16b, v0.16b, v1.16b, #3
-+          ext             v29.16b, v1.16b, v2.16b, #1
-+          ext             v30.16b, v1.16b, v2.16b, #2
-+          ext             v31.16b, v1.16b, v2.16b, #3
-+          bic             v24.16b, v0.16b, v20.16b
-+          bic             v25.16b, v25.16b, v20.16b
-+          bic             v26.16b, v26.16b, v20.16b
-+          bic             v27.16b, v27.16b, v20.16b
-+          bic             v28.16b, v1.16b, v20.16b
-+          bic             v29.16b, v29.16b, v20.16b
-+          bic             v30.16b, v30.16b, v20.16b
-+          bic             v31.16b, v31.16b, v20.16b
-+          eor             v24.16b, v24.16b, v21.16b
-+          eor             v25.16b, v25.16b, v21.16b
-+          eor             v26.16b, v26.16b, v21.16b
-+          eor             v27.16b, v27.16b, v21.16b
-+          eor             v28.16b, v28.16b, v21.16b
-+          eor             v29.16b, v29.16b, v21.16b
-+          eor             v30.16b, v30.16b, v21.16b
-+          eor             v31.16b, v31.16b, v21.16b
-+          cmeq            v24.4s, v24.4s, #0
-+          cmeq            v25.4s, v25.4s, #0
-+          cmeq            v26.4s, v26.4s, #0
-+          cmeq            v27.4s, v27.4s, #0
-+          add             w1, w1, #32
-+          b               3f
-+
-+1:      ld1             {v3.16b, v4.16b, v5.16b}, [x0], #48
-+        ext             v25.16b, v3.16b, v4.16b, #1
-+        ext             v26.16b, v3.16b, v4.16b, #2
-+        ext             v27.16b, v3.16b, v4.16b, #3
-+        ext             v29.16b, v4.16b, v5.16b, #1
-+        ext             v30.16b, v4.16b, v5.16b, #2
-+        ext             v31.16b, v4.16b, v5.16b, #3
-+        bic             v24.16b, v3.16b, v20.16b
-+        bic             v25.16b, v25.16b, v20.16b
-+        bic             v26.16b, v26.16b, v20.16b
-+        bic             v27.16b, v27.16b, v20.16b
-+        bic             v28.16b, v4.16b, v20.16b
-+        bic             v29.16b, v29.16b, v20.16b
-+        bic             v30.16b, v30.16b, v20.16b
-+        bic             v31.16b, v31.16b, v20.16b
-+        eor             v24.16b, v24.16b, v21.16b
-+        eor             v25.16b, v25.16b, v21.16b
-+        eor             v26.16b, v26.16b, v21.16b
-+        eor             v27.16b, v27.16b, v21.16b
-+        eor             v28.16b, v28.16b, v21.16b
-+        eor             v29.16b, v29.16b, v21.16b
-+        eor             v30.16b, v30.16b, v21.16b
-+        eor             v31.16b, v31.16b, v21.16b
-+        cmeq            v24.4s, v24.4s, #0
-+        cmeq            v25.4s, v25.4s, #0
-+        cmeq            v26.4s, v26.4s, #0
-+        cmeq            v27.4s, v27.4s, #0
-+        // Drop through...
-+2:        mov             v0.16b, v5.16b
-+          ld1             {v1.16b, v2.16b}, [x0], #32
-+        cmeq            v28.4s, v28.4s, #0
-+        cmeq            v29.4s, v29.4s, #0
-+        cmeq            v30.4s, v30.4s, #0
-+        cmeq            v31.4s, v31.4s, #0
-+        orr             v24.16b, v24.16b, v25.16b
-+        orr             v26.16b, v26.16b, v27.16b
-+        orr             v28.16b, v28.16b, v29.16b
-+        orr             v30.16b, v30.16b, v31.16b
-+          ext             v25.16b, v0.16b, v1.16b, #1
-+        orr             v22.16b, v24.16b, v26.16b
-+          ext             v26.16b, v0.16b, v1.16b, #2
-+          ext             v27.16b, v0.16b, v1.16b, #3
-+          ext             v29.16b, v1.16b, v2.16b, #1
-+        orr             v23.16b, v28.16b, v30.16b
-+          ext             v30.16b, v1.16b, v2.16b, #2
-+          ext             v31.16b, v1.16b, v2.16b, #3
-+          bic             v24.16b, v0.16b, v20.16b
-+          bic             v25.16b, v25.16b, v20.16b
-+          bic             v26.16b, v26.16b, v20.16b
-+        orr             v22.16b, v22.16b, v23.16b
-+          bic             v27.16b, v27.16b, v20.16b
-+          bic             v28.16b, v1.16b, v20.16b
-+          bic             v29.16b, v29.16b, v20.16b
-+          bic             v30.16b, v30.16b, v20.16b
-+          bic             v31.16b, v31.16b, v20.16b
-+        addv            s22, v22.4s
-+          eor             v24.16b, v24.16b, v21.16b
-+          eor             v25.16b, v25.16b, v21.16b
-+          eor             v26.16b, v26.16b, v21.16b
-+          eor             v27.16b, v27.16b, v21.16b
-+          eor             v28.16b, v28.16b, v21.16b
-+        mov             w3, v22.s[0]
-+          eor             v29.16b, v29.16b, v21.16b
-+          eor             v30.16b, v30.16b, v21.16b
-+          eor             v31.16b, v31.16b, v21.16b
-+          cmeq            v24.4s, v24.4s, #0
-+          cmeq            v25.4s, v25.4s, #0
-+          cmeq            v26.4s, v26.4s, #0
-+          cmeq            v27.4s, v27.4s, #0
-+        cbnz            w3, 90f
-+        st1             {v3.16b, v4.16b}, [x2], #32
-+3:          mov             v3.16b, v2.16b
-+            ld1             {v4.16b, v5.16b}, [x0], #32
-+          cmeq            v28.4s, v28.4s, #0
-+          cmeq            v29.4s, v29.4s, #0
-+          cmeq            v30.4s, v30.4s, #0
-+          cmeq            v31.4s, v31.4s, #0
-+          orr             v24.16b, v24.16b, v25.16b
-+          orr             v26.16b, v26.16b, v27.16b
-+          orr             v28.16b, v28.16b, v29.16b
-+          orr             v30.16b, v30.16b, v31.16b
-+            ext             v25.16b, v3.16b, v4.16b, #1
-+          orr             v22.16b, v24.16b, v26.16b
-+            ext             v26.16b, v3.16b, v4.16b, #2
-+            ext             v27.16b, v3.16b, v4.16b, #3
-+            ext             v29.16b, v4.16b, v5.16b, #1
-+          orr             v23.16b, v28.16b, v30.16b
-+            ext             v30.16b, v4.16b, v5.16b, #2
-+            ext             v31.16b, v4.16b, v5.16b, #3
-+            bic             v24.16b, v3.16b, v20.16b
-+            bic             v25.16b, v25.16b, v20.16b
-+            bic             v26.16b, v26.16b, v20.16b
-+          orr             v22.16b, v22.16b, v23.16b
-+            bic             v27.16b, v27.16b, v20.16b
-+            bic             v28.16b, v4.16b, v20.16b
-+            bic             v29.16b, v29.16b, v20.16b
-+            bic             v30.16b, v30.16b, v20.16b
-+            bic             v31.16b, v31.16b, v20.16b
-+          addv            s22, v22.4s
-+            eor             v24.16b, v24.16b, v21.16b
-+            eor             v25.16b, v25.16b, v21.16b
-+            eor             v26.16b, v26.16b, v21.16b
-+            eor             v27.16b, v27.16b, v21.16b
-+            eor             v28.16b, v28.16b, v21.16b
-+          mov             w3, v22.s[0]
-+            eor             v29.16b, v29.16b, v21.16b
-+            eor             v30.16b, v30.16b, v21.16b
-+            eor             v31.16b, v31.16b, v21.16b
-+            cmeq            v24.4s, v24.4s, #0
-+            cmeq            v25.4s, v25.4s, #0
-+            cmeq            v26.4s, v26.4s, #0
-+            cmeq            v27.4s, v27.4s, #0
-+          cbnz            w3, 91f
-+          st1             {v0.16b, v1.16b}, [x2], #32
-+        subs            w1, w1, #64
-+        b.pl            2b
-+
-+90:     add             w0, w1, #80
-+        ret
-+
-+91:     sub             w1, w1, #32
-+        b               90b
-+endfunc
---- a/libavcodec/allcodecs.c
-+++ b/libavcodec/allcodecs.c
-@@ -149,6 +149,7 @@ extern AVCodec ff_hap_decoder;
- extern AVCodec ff_hevc_decoder;
- extern AVCodec ff_hevc_qsv_decoder;
- extern AVCodec ff_hevc_rkmpp_decoder;
-+extern AVCodec ff_hevc_rpi_decoder;
- extern AVCodec ff_hevc_v4l2m2m_decoder;
- extern AVCodec ff_hnm4_video_decoder;
- extern AVCodec ff_hq_hqa_decoder;
-@@ -890,6 +891,41 @@ static enum AVCodecID remap_deprecated_c
-     }
- }
- 
-+static int codec_supports_format(const AVCodec * const p, const enum AVPixelFormat fmt)
-+{
-+    const enum AVPixelFormat *pf = p->pix_fmts;
-+
-+    // Assume good if we lack info
-+    if (pf == NULL)
-+        return 1;
-+    if (fmt == AV_PIX_FMT_NONE)
-+        return 0;
-+
-+    for (; *pf != AV_PIX_FMT_NONE; ++pf) {
-+        if (*pf == fmt)
-+            return 1;
-+    }
-+    return 0;
-+}
-+
-+AVCodec *avcodec_find_decoder_by_id_and_fmt(enum AVCodecID id, enum AVPixelFormat fmt)
-+{
-+    const AVCodec *p, *experimental = NULL;
-+    void *i = 0;
-+
-+    id= remap_deprecated_codec_id(id);
-+    while ((p = av_codec_iterate(&i))) {
-+        if (av_codec_is_decoder(p) && p->id == id && codec_supports_format(p, fmt)) {
-+            if (p->capabilities & AV_CODEC_CAP_EXPERIMENTAL && !experimental) {
-+                experimental = p;
-+            } else
-+                return (AVCodec *)p;
-+        }
-+        p = p->next;
-+    }
-+    return (AVCodec *)experimental;
-+}
-+
- static AVCodec *find_codec(enum AVCodecID id, int (*x)(const AVCodec *))
- {
-     const AVCodec *p, *experimental = NULL;
---- a/libavcodec/arm/Makefile
-+++ b/libavcodec/arm/Makefile
-@@ -40,6 +40,8 @@ OBJS-$(CONFIG_AAC_DECODER)             +
-                                           arm/sbrdsp_init_arm.o
- OBJS-$(CONFIG_DCA_DECODER)             += arm/synth_filter_init_arm.o
- OBJS-$(CONFIG_HEVC_DECODER)            += arm/hevcdsp_init_arm.o
-+OBJS-$(CONFIG_HEVC_RPI_DECODER)        += arm/rpi_hevcdsp_init_arm.o    \
-+                                          arm/rpi_hevcpred_init_arm.o
- OBJS-$(CONFIG_MLP_DECODER)             += arm/mlpdsp_init_arm.o
- OBJS-$(CONFIG_RV40_DECODER)            += arm/rv40dsp_init_arm.o
- OBJS-$(CONFIG_SBC_ENCODER)             += arm/sbcdsp_init_arm.o
-@@ -140,10 +142,24 @@ NEON-OBJS-$(CONFIG_AAC_DECODER)        +
- NEON-OBJS-$(CONFIG_LLAUDDSP)           += arm/lossless_audiodsp_neon.o
- NEON-OBJS-$(CONFIG_DCA_DECODER)        += arm/synth_filter_neon.o
- NEON-OBJS-$(CONFIG_HEVC_DECODER)       += arm/hevcdsp_init_neon.o       \
-+                                          arm/hevcdsp_idct_neon.o    \
-                                           arm/hevcdsp_deblock_neon.o    \
-                                           arm/hevcdsp_idct_neon.o       \
-                                           arm/hevcdsp_qpel_neon.o       \
-                                           arm/hevcdsp_sao_neon.o
-+NEON-OBJS-$(CONFIG_HEVC_RPI_DECODER)   += arm/rpi_hevcdsp_init_neon.o    \
-+                                          arm/rpi_hevc_misc_neon.o       \
-+                                          arm/rpi_hevcdsp_deblock_neon.o \
-+                                          arm/rpi_hevcdsp_idct_neon.o    \
-+                                          arm/rpi_hevcdsp_res8_neon.o    \
-+                                          arm/rpi_hevcdsp_res16_neon.o   \
-+                                          arm/rpi_hevcdsp_sao_neon.o     \
-+                                          arm/rpi_hevcpred_init_neon.o   \
-+                                          arm/rpi_hevcpred_intra_angular_neon.o \
-+                                          arm/rpi_hevcpred_intra_dc_neon.o \
-+                                          arm/rpi_hevcpred_intra_filter_neon.o \
-+                                          arm/rpi_hevcpred_intra_hv_neon.o \
-+                                          arm/rpi_hevcpred_intra_planar_neon.o
- NEON-OBJS-$(CONFIG_RV30_DECODER)       += arm/rv34dsp_neon.o
- NEON-OBJS-$(CONFIG_RV40_DECODER)       += arm/rv34dsp_neon.o            \
-                                           arm/rv40dsp_neon.o
---- a/libavcodec/arm/cabac.h
-+++ b/libavcodec/arm/cabac.h
-@@ -26,83 +26,209 @@
- #include "libavutil/internal.h"
- #include "libavcodec/cabac.h"
- 
-+
- #define get_cabac_inline get_cabac_inline_arm
- static av_always_inline int get_cabac_inline_arm(CABACContext *c,
--                                                 uint8_t *const state)
-+                                                 uint8_t *state)
- {
--    int bit;
--    void *reg_b, *reg_c, *tmp;
-+    const uint8_t *mlps_tables = ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128;
-+    int bit, ptr, low, tmp1, tmp2;
-+    __asm__ volatile (
-+        "ldr     %[bit], [%[c], %[range_off]]             \n\t"
-+        "ldrb    %[ptr], [%[state]]                       \n\t"
-+        "sub     %[tmp1], %[mlps_tables], %[lps_off]      \n\t"
-+        "and     %[tmp2], %[bit], #0xc0                   \n\t"
-+        "add     %[tmp1], %[tmp1], %[ptr]                 \n\t"
-+        "ldr     %[low], [%[c], %[low_off]]               \n\t"
-+        "ldrb    %[tmp2], [%[tmp1], %[tmp2], lsl #1]      \n\t"
-+        "sub     %[bit], %[bit], %[tmp2]                  \n\t"
-+        "mov     %[tmp1], %[bit]                          \n\t"
-+        "cmp     %[low], %[bit], lsl #17                  \n\t"
-+        "itt     ge                                       \n\t"
-+        "movge   %[tmp1], %[tmp2]                         \n\t"
-+        "mvnge   %[ptr], %[ptr]                           \n\t"
-+        "clz     %[tmp2], %[tmp1]                         \n\t"
-+        "it      ge                                       \n\t"
-+        "subge   %[low], %[low], %[bit], lsl #17          \n\t"
-+        "sub     %[tmp2], %[tmp2], #23                    \n\t"
-+        "and     %[bit], %[ptr], #1                       \n\t"
-+        "ldrb    %[mlps_tables], [%[mlps_tables], %[ptr]] \n\t"
-+        "lsl     %[low], %[low], %[tmp2]                  \n\t"
-+        "lsls    %[ptr], %[low], #16                      \n\t"
-+        "bne     1f                                       \n\t"
-+        "ldr     %[ptr], [%[c], %[ptr_off]]               \n\t"
-+        "lsl     %[tmp2], %[tmp1], %[tmp2]                \n\t"
-+#if UNCHECKED_BITSTREAM_READER
-+        "strb    %[mlps_tables], [%[state]]               \n\t"
-+        "rbit    %[state], %[low]                         \n\t"
-+        "ldrh    %[tmp1], [%[ptr]], #2                    \n\t"
-+#else
-+        "ldr     %[tmp1], [%[c], %[end_off]]              \n\t"
-+        "strb    %[mlps_tables], [%[state]]               \n\t"
-+        "rbit    %[state], %[low]                         \n\t"
-+        "cmp     %[tmp1], %[ptr]                          \n\t"
-+#if CONFIG_THUMB
-+        "it      cs                                       \n\t"
-+        "ldrhcs  %[tmp1], [%[ptr]], #2                    \n\t"
-+#else
-+        "ldrcsh  %[tmp1], [%[ptr]], #2                    \n\t"
-+#endif
-+#endif
-+        "clz     %[state], %[state]                       \n\t"
-+        "movw    %[mlps_tables], #0xffff                  \n\t"
-+        "sub     %[state], %[state], #16                  \n\t"
-+        "str     %[tmp2], [%[c], %[range_off]]            \n\t"
-+        "rev     %[tmp1], %[tmp1]                         \n\t"
-+        "str     %[ptr], [%[c], %[ptr_off]]               \n\t"
-+        "lsr     %[tmp1], %[tmp1], #15                    \n\t"
-+        "sub     %[tmp1], %[tmp1], %[mlps_tables]         \n\t"
-+#if CONFIG_THUMB
-+        "lsl     %[tmp1], %[tmp1], %[state]               \n\t"
-+        "add     %[low], %[low], %[tmp1]                  \n\t"
-+#else
-+        "add     %[low], %[low], %[tmp1], lsl %[state]    \n\t"
-+#endif
-+        "str     %[low], [%[c], %[low_off]]               \n\t"
-+        "b       2f                                       \n\t"
-+        "1:                                               \n\t"
-+        "strb    %[mlps_tables], [%[state]]               \n\t"
-+        "lsl     %[tmp1], %[tmp1], %[tmp2]                \n\t"
-+        "str     %[low], [%[c], %[low_off]]               \n\t"
-+        "str     %[tmp1], [%[c], %[range_off]]            \n\t"
-+        "2:                                               \n\t"
-+    :  // Outputs
-+             [state]"+r"(state),
-+       [mlps_tables]"+r"(mlps_tables),
-+               [bit]"=&r"(bit),
-+               [ptr]"=&r"(ptr),
-+               [low]"=&r"(low),
-+              [tmp1]"=&r"(tmp1),
-+              [tmp2]"=&r"(tmp2)
-+    :  // Inputs
-+               [c]"r"(c),
-+         [low_off]"J"(offsetof(CABACContext, low)),
-+       [range_off]"J"(offsetof(CABACContext, range)),
-+         [ptr_off]"J"(offsetof(CABACContext, bytestream)),
-+         [end_off]"J"(offsetof(CABACContext, bytestream_end)),
-+         [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
-+    :  // Clobbers
-+       "cc", "memory"
-+    );
-+    return bit;
-+}
- 
--    __asm__ volatile(
--        "ldrb       %[bit]        , [%[state]]                  \n\t"
--        "add        %[r_b]        , %[tables]   , %[lps_off]    \n\t"
--        "mov        %[tmp]        , %[range]                    \n\t"
--        "and        %[range]      , %[range]    , #0xC0         \n\t"
--        "add        %[r_b]        , %[r_b]      , %[bit]        \n\t"
--        "ldrb       %[range]      , [%[r_b], %[range], lsl #1]  \n\t"
--        "add        %[r_b]        , %[tables]   , %[norm_off]   \n\t"
--        "sub        %[r_c]        , %[tmp]      , %[range]      \n\t"
--        "lsl        %[tmp]        , %[r_c]      , #17           \n\t"
--        "cmp        %[tmp]        , %[low]                      \n\t"
--        "it         gt                                          \n\t"
--        "movgt      %[range]      , %[r_c]                      \n\t"
--        "itt        cc                                          \n\t"
--        "mvncc      %[bit]        , %[bit]                      \n\t"
--        "subcc      %[low]        , %[low]      , %[tmp]        \n\t"
--        "add        %[r_c]        , %[tables]   , %[mlps_off]   \n\t"
--        "ldrb       %[tmp]        , [%[r_b], %[range]]          \n\t"
--        "ldrb       %[r_b]        , [%[r_c], %[bit]]            \n\t"
--        "lsl        %[low]        , %[low]      , %[tmp]        \n\t"
--        "lsl        %[range]      , %[range]    , %[tmp]        \n\t"
--        "uxth       %[r_c]        , %[low]                      \n\t"
--        "strb       %[r_b]        , [%[state]]                  \n\t"
--        "tst        %[r_c]        , %[r_c]                      \n\t"
--        "bne        2f                                          \n\t"
--        "ldr        %[r_c]        , [%[c], %[byte]]             \n\t"
-+#define get_cabac_bypass get_cabac_bypass_arm /* map the generic name onto the ARM inline-asm version below */
-+static inline int get_cabac_bypass_arm(CABACContext * const c) // returns 0 or 1; updates c->low and, on refill, c->bytestream
-+{
-+    uint32_t low = c->low, range, ptr, tmp; // low is preloaded in C; range, ptr and tmp are filled by the asm
-+    int rv; // result, set to 0 or 1 inside the asm
-+    __asm volatile (
-+        "ldr        %[range] , [%[c], %[range_off]] \n\t" // range = c->range
-+        "mov        %[rv]    , #0                   \n\t" // default result is 0
-+        "ldr        %[ptr]   , [%[c], %[ptr_off]]   \n\t" // ptr = c->bytestream
-+        "lsl        %[low]   , #1                   \n\t" // low <<= 1
-+#if !UNCHECKED_BITSTREAM_READER
-+        "ldr        %[tmp]   , [%[c], %[end_off]]   \n\t" // tmp = c->bytestream_end, used by the bounds check below
-+#endif
-+        "cmp        %[low]   , %[range], lsl #17    \n\t" // unsigned compare of low against range << 17
-+        "itt         cs                              \n\t"
-+        "subcs      %[low]   , %[low], %[range], lsl #17 \n\t" // if low >= range << 17: low -= range << 17 ...
-+        "movcs      %[rv]    , #1                   \n\t" // ... and the result becomes 1
- #if UNCHECKED_BITSTREAM_READER
--        "ldrh       %[tmp]        , [%[r_c]]                    \n\t"
--        "add        %[r_c]        , %[r_c]      , #2            \n\t"
--        "str        %[r_c]        , [%[c], %[byte]]             \n\t"
--#else
--        "ldr        %[r_b]        , [%[c], %[end]]              \n\t"
--        "ldrh       %[tmp]        , [%[r_c]]                    \n\t"
--        "cmp        %[r_c]        , %[r_b]                      \n\t"
--        "itt        lt                                          \n\t"
--        "addlt      %[r_c]        , %[r_c]      , #2            \n\t"
--        "strlt      %[r_c]        , [%[c], %[byte]]             \n\t"
--#endif
--        "sub        %[r_c]        , %[low]      , #1            \n\t"
--        "add        %[r_b]        , %[tables]   , %[norm_off]   \n\t"
--        "eor        %[r_c]        , %[low]      , %[r_c]        \n\t"
--        "rev        %[tmp]        , %[tmp]                      \n\t"
--        "lsr        %[r_c]        , %[r_c]      , #15           \n\t"
--        "lsr        %[tmp]        , %[tmp]      , #15           \n\t"
--        "ldrb       %[r_c]        , [%[r_b], %[r_c]]            \n\t"
--        "movw       %[r_b]        , #0xFFFF                     \n\t"
--        "sub        %[tmp]        , %[tmp]      , %[r_b]        \n\t"
--        "rsb        %[r_c]        , %[r_c]      , #7            \n\t"
--        "lsl        %[tmp]        , %[tmp]      , %[r_c]        \n\t"
--        "add        %[low]        , %[low]      , %[tmp]        \n\t"
--        "2:                                                     \n\t"
--        :    [bit]"=&r"(bit),
--             [low]"+&r"(c->low),
--           [range]"+&r"(c->range),
--             [r_b]"=&r"(reg_b),
--             [r_c]"=&r"(reg_c),
--             [tmp]"=&r"(tmp)
--        :        [c]"r"(c),
--             [state]"r"(state),
--            [tables]"r"(ff_h264_cabac_tables),
--              [byte]"M"(offsetof(CABACContext, bytestream)),
--               [end]"M"(offsetof(CABACContext, bytestream_end)),
--          [norm_off]"I"(H264_NORM_SHIFT_OFFSET),
--           [lps_off]"I"(H264_LPS_RANGE_OFFSET),
--          [mlps_off]"I"(H264_MLPS_STATE_OFFSET + 128)
--        : "memory", "cc"
--        );
-+        "ldrh       %[tmp]   , [%[ptr]], #2         \n\t" // unchecked build: tmp = 16 bits at ptr, then ptr += 2
-+#else
-+        "cmp        %[tmp]   , %[ptr]               \n\t" // checked build: compare bytestream_end with ptr
-+#if CONFIG_THUMB
-+        "it         cs                              \n\t"
-+        "ldrhcs     %[tmp]   , [%[ptr]], #2         \n\t" // load and advance only if bytestream_end >= ptr
-+#else
-+        "ldrcsh     %[tmp]   , [%[ptr]], #2         \n\t" // same conditional load, spelled for non-Thumb builds
-+#endif
-+#endif
-+        "lsls       %[range] , %[low], #16          \n\t" // test the low 16 bits of low (range is only scratch from here on)
-+        "bne        1f                              \n\t" // nonzero: skip the refill
- 
--    return bit & 1;
-+        "str        %[ptr]   , [%[c], %[ptr_off]]   \n\t" // refill path: write ptr back to c->bytestream
-+        "rev        %[tmp]   , %[tmp]               \n\t" // byte-reverse the value in tmp
-+        "add        %[low]   , %[low], %[tmp], lsr #15 \n\t" // low += tmp >> 15
-+        "movw       %[tmp]   , 0xFFFF               \n\t"
-+        "sub        %[low]   , %[tmp]               \n\t" // low -= 0xFFFF
-+        "1:                                         \n\t"
-+        "str        %[low]   , [%[c], %[low_off]]   \n\t" // both paths store the updated low into c->low
-+        : // Outputs
-+               [rv]"=&r"(rv),
-+              [low]"+r"(low),
-+            [range]"=&r"(range),
-+              [ptr]"=&r"(ptr),
-+              [tmp]"=&r"(tmp)
-+        : // Inputs
-+                    [c]"r"(c),
-+              [low_off]"J"(offsetof(CABACContext, low)),
-+            [range_off]"J"(offsetof(CABACContext, range)),
-+              [ptr_off]"J"(offsetof(CABACContext, bytestream)),
-+              [end_off]"J"(offsetof(CABACContext, bytestream_end))
-+        : // Clobbers
-+            "memory", "cc"
-+    );
-+    return rv;
- }
-+
-+
-+#define get_cabac_bypass_sign get_cabac_bypass_sign_arm /* map the generic name onto the ARM inline-asm version below */
-+static inline int get_cabac_bypass_sign_arm(CABACContext * const c, int rv) // returns rv or -rv; updates c->low and, on refill, c->bytestream
-+{
-+    uint32_t low = c->low, range, ptr, tmp; // low is preloaded in C; range, ptr and tmp are filled by the asm
-+    __asm volatile (
-+        "ldr        %[range] , [%[c], %[range_off]] \n\t" // range = c->range
-+        "ldr        %[ptr]   , [%[c], %[ptr_off]]   \n\t" // ptr = c->bytestream
-+        "lsl        %[low]   , #1                   \n\t" // low <<= 1
-+#if !UNCHECKED_BITSTREAM_READER
-+        "ldr        %[tmp]   , [%[c], %[end_off]]   \n\t" // tmp = c->bytestream_end, used by the bounds check below
-+#endif
-+        "cmp        %[low]   , %[range], lsl #17    \n\t" // unsigned compare of low against range << 17
-+        "it         cs                              \n\t"
-+        "subcs      %[low]   , %[low], %[range], lsl #17 \n\t" // if low >= range << 17: low -= range << 17
-+        "it         cc                              \n\t"
-+        "rsbcc      %[rv]    , %[rv], #0            \n\t" // otherwise (low was smaller): rv = 0 - rv
-+#if UNCHECKED_BITSTREAM_READER
-+        "ldrh       %[tmp]   , [%[ptr]], #2         \n\t" // unchecked build: tmp = 16 bits at ptr, then ptr += 2
-+#else
-+        "cmp        %[tmp]   , %[ptr]               \n\t" // checked build: compare bytestream_end with ptr
-+#if CONFIG_THUMB
-+        "it         cs                              \n\t"
-+        "ldrhcs     %[tmp]   , [%[ptr]], #2         \n\t" // load and advance only if bytestream_end >= ptr
-+#else
-+        "ldrcsh     %[tmp]   , [%[ptr]], #2         \n\t" // same conditional load, spelled for non-Thumb builds
-+#endif
-+#endif
-+        "lsls       %[range] , %[low], #16          \n\t" // test the low 16 bits of low (range is only scratch from here on)
-+        "bne        1f                              \n\t" // nonzero: skip the refill
-+
-+        "str        %[ptr]   , [%[c], %[ptr_off]]   \n\t" // refill path: write ptr back to c->bytestream
-+        "rev        %[tmp]   , %[tmp]               \n\t" // byte-reverse the value in tmp
-+        "add        %[low]   , %[low], %[tmp], lsr #15 \n\t" // low += tmp >> 15
-+        "movw       %[tmp]   , 0xFFFF               \n\t"
-+        "sub        %[low]   , %[tmp]               \n\t" // low -= 0xFFFF
-+        "1:                                         \n\t"
-+        "str        %[low]   , [%[c], %[low_off]]   \n\t" // both paths store the updated low into c->low
-+        : // Outputs
-+               [rv]"+r"(rv),
-+              [low]"+r"(low),
-+            [range]"=&r"(range),
-+              [ptr]"=&r"(ptr),
-+              [tmp]"=&r"(tmp)
-+        : // Inputs
-+                    [c]"r"(c),
-+              [low_off]"J"(offsetof(CABACContext, low)),
-+            [range_off]"J"(offsetof(CABACContext, range)),
-+              [ptr_off]"J"(offsetof(CABACContext, bytestream)),
-+              [end_off]"J"(offsetof(CABACContext, bytestream_end))
-+        : // Clobbers
-+            "memory", "cc"
-+    );
-+    return rv;
-+}
-+
- #endif /* HAVE_ARMV6T2_INLINE */
- 
- #endif /* AVCODEC_ARM_CABAC_H */
---- /dev/null
-+++ b/libavcodec/arm/rpi_hevc_cabac.h
-@@ -0,0 +1,607 @@
-+/*
-+ * This file is part of FFmpeg.
-+ *
-+ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading)
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#ifndef AVCODEC_ARM_HEVC_CABAC_H
-+#define AVCODEC_ARM_HEVC_CABAC_H
-+
-+#include "config.h"
-+#if HAVE_ARMV6T2_INLINE
-+
-+#define hevc_mem_bits32 hevc_mem_bits32_arm /* map the generic name onto the ARM version below */
-+static inline uint32_t hevc_mem_bits32_arm(const void * p, const unsigned int bits) // bits is a bit offset counted from p
-+{
-+    unsigned int n; // receives the byte-reversed word
-+    __asm__ (
-+        "rev        %[n], %[x]                     \n\t" // n = x with its four bytes in reverse order
-+        : [n]"=r"(n)
-+        : [x]"r"(*(const uint32_t *)((const uint8_t *)p + (bits >> 3))) // 32-bit load at byte offset bits / 8; the address is only byte-granular
-+        :
-+        );
-+    return n << (bits & 7); // shift out the remaining 0..7 bits of the offset
-+}
-+
-+
-+// ---------------------------------------------------------------------------
-+//
-+// Helper fns - little bits of code where ARM has an instruction that the
-+// compiler doesn't know about or doesn't use
-+
-+#define trans_scale_sat trans_scale_sat_arm /* map the generic name onto the ARM version below */
-+static inline int trans_scale_sat_arm(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift) // returns the scaled level clamped to signed 16 bits
-+{
-+    int rv;
-+    int t = ((level * (int)(scale * scale_m)) >> shift) + 1; // the +1 here and the ASR #1 below together halve with rounding
-+
-+    __asm__ (
-+    "ssat %[rv], #16, %[t], ASR #1 \n\t" // rv = (t >> 1) saturated to the signed 16-bit range
-+    : [rv]"=r"(rv)
-+    : [t]"r"(t)
-+    :
-+    );
-+    return rv;
-+}
-+
-+#define update_rice update_rice_arm /* map the generic name onto the ARM version below */
-+static inline void update_rice_arm(uint8_t * const stat_coeff, // counter byte, read and rewritten in place
-+    const unsigned int last_coeff_abs_level_remaining,
-+    const unsigned int c_rice_param)
-+{
-+    int t = last_coeff_abs_level_remaining << 1; // doubled value that is shifted and tested below
-+    __asm__ (
-+    "lsrs  %[t], %[t], %[shift]             \n\t" // t >>= c_rice_param, setting the flags
-+
-+    "it    eq                               \n\t"
-+    "subeq %[stat], %[stat], #1             \n\t" // shifted value is zero: decrement the counter
-+    "cmp   %[t], #6                         \n\t" // carry becomes set when t >= 6
-+    "adc   %[stat], %[stat], #0             \n\t" // add that carry, i.e. increment the counter when t >= 6
-+    "usat  %[stat], #8, %[stat]             \n\t" // clamp the counter to 0..255 before it is written back
-+    : [stat]"+r"(*stat_coeff),
-+         [t]"+r"(t)
-+    :  [shift]"r"(c_rice_param)
-+    : "cc"
-+    );
-+}
-+
-+// ---------------------------------------------------------------------------
-+//
-+// CABAC get loops
-+//
-+// Where the loop is simple enough we can normally do 10-30% better than the
-+// compiler
-+
-+// Get the residual greater than 1 bits
-+
-+#define get_cabac_greater1_bits get_cabac_greater1_bits_arm /* map the generic name onto the ARM inline-asm version below */
-+static inline unsigned int get_cabac_greater1_bits_arm(CABACContext * const c, const unsigned int n, // loops until the iteration counter equals n
-+    uint8_t * const state0) // base of the state bytes, indexed by st below
-+{
-+    unsigned int i, reg_b, st, tmp, bit, rv; // rv collects one decoded bit per iteration, newest in bit 0
-+     __asm__ (
-+         "mov        %[i]          , #0                          \n\t" // i = iteration counter
-+         "mov        %[rv]         , #0                          \n\t" // rv = packed result
-+         "1:                                                     \n\t"
-+         "add        %[i]          , %[i]        , #1            \n\t" // i is incremented before the first decode
-+         "cmp        %[rv]         , #0                          \n\t"
-+         "ite        eq                                          \n\t"
-+         "usateq     %[st]         , #2          , %[i]          \n\t" // no 1 bit decoded so far: st = i saturated to 0..3
-+         "movne      %[st]         , #0                          \n\t" // otherwise st = 0
-+         "sub        %[r_b]        , %[mlps_tables], %[lps_off]  \n\t" // r_b = mlps_tables - lps_off
-+         "and        %[tmp]        , %[range]    , #0xC0         \n\t" // tmp = range & 0xC0
-+
-+         "ldrb       %[bit]        , [%[state0], %[st]]          \n\t" // bit = state0[st]
-+         "add        %[r_b]        , %[r_b]      , %[bit]        \n\t"
-+         "ldrb       %[tmp]        , [%[r_b], %[tmp], lsl #1]    \n\t" // tmp = table byte at r_b + 2 * (range & 0xC0)
-+         "sub        %[range]      , %[range]    , %[tmp]        \n\t" // range -= tmp
-+
-+         "cmp        %[low]        , %[range], lsl #17           \n\t"
-+         "ittt       ge                                          \n\t"
-+         "subge      %[low]        , %[low]      , %[range], lsl #17 \n\t" // if low >= range << 17: low -= range << 17 ...
-+         "movge      %[range]      , %[tmp]                      \n\t" // ... range = tmp ...
-+         "mvnge      %[bit]        , %[bit]                      \n\t" // ... and bit = ~bit
-+
-+         "clz        %[tmp]        , %[range]                    \n\t"
-+         "sub        %[tmp]        , #23                         \n\t" // tmp = clz(range) - 23, the shift applied to low and range below
-+         "ldrb       %[r_b]        , [%[mlps_tables], %[bit]]    \n\t" // next state byte; bit may be complemented here and mlps_tables is passed in pre-offset by +128
-+         "and        %[bit]        , %[bit]      , #1            \n\t" // keep only the decoded bit
-+         "strb       %[r_b]        , [%[state0], %[st]]          \n\t" // state0[st] = next state byte
-+         "lsl        %[low]        , %[low]      , %[tmp]        \n\t" // low <<= tmp
-+         "orr        %[rv]         , %[bit]      , %[rv], lsl #1 \n\t" // rv = (rv << 1) | bit
-+         "lsl        %[range]      , %[range]    , %[tmp]        \n\t" // range <<= tmp
-+
-+// There is a small speed gain from combining both conditions, using a single
-+// branch and then working out what that meant later
-+         "lsls       %[tmp]        , %[low]      , #16           \n\t" // tmp = low << 16; zero means a reload is needed
-+         "it         ne                                          \n\t"
-+         "cmpne      %[n]          , %[i]                        \n\t" // only when no reload is needed: compare n with i
-+         "bne        1b                                          \n\t" // loop while no reload is needed and i != n
-+
-+// If reload is not required then we must have run out of flags to decode
-+         "tst        %[tmp]        , %[tmp]                      \n\t"
-+         "bne        2f                                          \n\t"
-+
-+// Do reload
-+         "ldrh       %[tmp]        , [%[bptr]]   , #2            \n\t" // tmp = 16 bits at bptr, then bptr += 2
-+         "rbit       %[bit]        , %[low]                      \n\t"
-+         "movw       %[r_b]        , #0xFFFF                     \n\t"
-+         "clz        %[bit]        , %[bit]                      \n\t" // rbit + clz: bit = number of trailing zero bits in low
-+         "rev        %[tmp]        , %[tmp]                      \n\t" // byte-reverse the loaded value
-+         "sub        %[bit]        , %[bit]      , #16           \n\t" // bit -= 16
-+         "cmp        %[n]          , %[i]                        \n\t" // set flags for the final bne; the instructions in between do not change them
-+         "rsb        %[tmp]        , %[r_b]      , %[tmp], lsr #15 \n\t" // tmp = (tmp >> 15) - 0xFFFF
-+
-+#if CONFIG_THUMB
-+         "lsl        %[tmp]        , %[tmp]      , %[bit]        \n\t"
-+         "add        %[low]        , %[low]      , %[tmp]        \n\t" // low += tmp << bit
-+#else
-+         "add        %[low]        , %[low]      , %[tmp], lsl %[bit] \n\t" // low += tmp << bit
-+#endif
-+
-+         "bne        1b                                          \n\t" // continue decoding unless i == n
-+         "2:                                                     \n\t"
-+         :    [bit]"=&r"(bit),
-+              [low]"+r"(c->low),
-+            [range]"+r"(c->range),
-+              [r_b]"=&r"(reg_b),
-+             [bptr]"+r"(c->bytestream),
-+                [i]"=&r"(i),
-+              [tmp]"=&r"(tmp),
-+               [st]"=&r"(st),
-+               [rv]"=&r"(rv)
-+          :  [state0]"r"(state0),
-+                  [n]"r"(n),
-+        [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
-+            [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
-+         : "memory", "cc"
-+    );
-+    return rv;
-+}
-+
-+
-+// n must be > 0 on entry; returns p advanced past the index bytes stored by the loop
-+#define get_cabac_sig_coeff_flag_idxs get_cabac_sig_coeff_flag_idxs_arm
-+static inline uint8_t * get_cabac_sig_coeff_flag_idxs_arm(CABACContext * const c, uint8_t * const state0,
-+    unsigned int n, // counts down to 0; its current value is what gets stored as an index
-+    const uint8_t * ctx_map, // read backwards starting from ctx_map[n]
-+    uint8_t * p) // output cursor, one byte written per decoded 1 bit
-+{
-+    unsigned int reg_b, tmp, st, bit;
-+     __asm__ (
-+// Get bin from map
-+#if CONFIG_THUMB
-+         "add        %[ctx_map]    , %[n]                        \n\t" // ctx_map += n
-+         "ldrb       %[st]         , [%[ctx_map]]                \n\t" // st = *ctx_map
-+#else
-+         "ldrb       %[st]         , [%[ctx_map], %[n]]!         \n\t" // ctx_map += n, then st = *ctx_map (pre-index with writeback)
-+#endif
-+         "1:                                                     \n\t"
-+
-+// Load state & ranges
-+         "ldrb       %[bit]        , [%[state0], %[st]]          \n\t" // bit = state0[st]
-+         "and        %[tmp]        , %[range]    , #0xC0         \n\t" // tmp = range & 0xC0
-+         "sub        %[r_b]        , %[mlps_tables], %[lps_off]  \n\t" // r_b = mlps_tables - lps_off
-+         "add        %[r_b]        , %[r_b]      , %[tmp], lsl #1 \n\t" // r_b += 2 * (range & 0xC0)
-+         "ldrb       %[tmp]        , [%[r_b], %[bit]]            \n\t" // tmp = table byte at r_b + bit
-+         "sub        %[range]      , %[range]    , %[tmp]        \n\t" // range -= tmp
-+
-+         "cmp        %[low]        , %[range], lsl #17           \n\t"
-+         "ittt       ge                                          \n\t"
-+         "mvnge      %[bit]        , %[bit]                      \n\t" // if low >= range << 17: bit = ~bit ...
-+         "subge      %[low]        , %[low]      , %[range], lsl #17 \n\t" // ... low -= range << 17 ...
-+         "movge      %[range]      , %[tmp]                      \n\t" // ... and range = tmp
-+
-+// Renorm
-+         "clz        %[tmp]        , %[range]                    \n\t"
-+         "ldrb       %[r_b]        , [%[mlps_tables], %[bit]]    \n\t" // next state byte; bit may be complemented here and mlps_tables is passed in pre-offset by +128
-+         "sub        %[tmp]        , #23                         \n\t" // tmp = clz(range) - 23, the shift applied to low and range below
-+         "strb       %[r_b]        , [%[state0], %[st]]          \n\t" // state0[st] = next state byte
-+         "tst        %[bit]        , #1                          \n\t" // flags for the conditional index store below
-+         "ldrb       %[st]         , [%[ctx_map], #-1]!          \n\t" // preload st for the next iteration: st = *--ctx_map
-+         "lsl        %[low]        , %[low]      , %[tmp]        \n\t" // low <<= tmp
-+// GCC asm seems to need strbne written differently for thumb and arm
-+#if CONFIG_THUMB
-+         "it         ne                                          \n\t"
-+         "strbne     %[n]          , [%[idx]]    , #1            \n\t" // decoded bit is 1: *idx++ = low byte of n
-+#else
-+         "strneb     %[n]          , [%[idx]]    , #1            \n\t" // decoded bit is 1: *idx++ = low byte of n
-+#endif
-+
-+// There is a small speed gain from combining both conditions, using a single
-+// branch and then working out what that meant later
-+         "subs       %[n]          , %[n]        , #1            \n\t" // n--, setting the flags
-+         "lsl        %[range]      , %[range]    , %[tmp]        \n\t" // range <<= tmp
-+#if CONFIG_THUMB
-+         "itt        ne                                          \n\t" // this IT block also covers the bne after the #endif
-+         "lslsne     %[tmp]        , %[low]      , #16           \n\t" // only when n != 0: test the low 16 bits of low
-+#else
-+         "lslnes     %[tmp]        , %[low]      , #16           \n\t" // only when n != 0: test the low 16 bits of low
-+#endif
-+         "bne        1b                                          \n\t" // loop while n != 0 and no reload is needed
-+
-+// If we have bits left then n must be 0 so give up now
-+         "lsls       %[tmp]        , %[low]      , #16           \n\t"
-+         "bne        2f                                          \n\t"
-+
-+// Do reload
-+         "ldrh       %[tmp]        , [%[bptr]]   , #2            \n\t" // tmp = 16 bits at bptr, then bptr += 2
-+         "rbit       %[bit]        , %[low]                      \n\t"
-+         "movw       %[r_b]        , #0xFFFF                     \n\t"
-+         "clz        %[bit]        , %[bit]                      \n\t" // rbit + clz: bit = number of trailing zero bits in low
-+         "cmp        %[n]          , #0                          \n\t" // set flags for the final bne; the instructions in between do not change them
-+         "rev        %[tmp]        , %[tmp]                      \n\t" // byte-reverse the loaded value
-+         "sub        %[bit]        , %[bit]      , #16           \n\t" // bit -= 16
-+         "rsb        %[tmp]        , %[r_b]      , %[tmp], lsr #15 \n\t" // tmp = (tmp >> 15) - 0xFFFF
-+
-+#if CONFIG_THUMB
-+         "lsl        %[tmp]        , %[tmp]      , %[bit]        \n\t"
-+         "add        %[low]        , %[low]      , %[tmp]        \n\t" // low += tmp << bit
-+#else
-+         "add        %[low]        , %[low]      , %[tmp], lsl %[bit] \n\t" // low += tmp << bit
-+#endif
-+
-+// Check to see if we still have more to do
-+         "bne        1b                                          \n\t"
-+         "2:                                                     \n\t"
-+         :    [bit]"=&r"(bit),
-+              [low]"+r"(c->low),
-+            [range]"+r"(c->range),
-+              [r_b]"=&r"(reg_b),
-+             [bptr]"+r"(c->bytestream),
-+              [idx]"+r"(p),
-+                [n]"+r"(n),
-+              [tmp]"=&r"(tmp),
-+               [st]"=&r"(st),
-+          [ctx_map]"+r"(ctx_map)
-+          :  [state0]"r"(state0),
-+        [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
-+            [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
-+         : "memory", "cc"
-+    );
-+
-+    return p;
-+}
-+
-+// ---------------------------------------------------------------------------
-+//
-+// CABAC_BY22 functions
-+
-+
-+#define get_cabac_by22_start get_cabac_by22_start_arm /* map the generic name onto the ARM inline-asm version below */
-+static inline void get_cabac_by22_start_arm(CABACContext * const c) // rewrites c->low, c->bytestream, by22.bits and (without USE_BY22_DIV) by22.range and c->range
-+{
-+    const uint8_t *ptr = c->bytestream;
-+    register uint32_t low __asm__("r1"), range __asm__("r2"); // pinned to r1/r2 so the ldmia below can fill them as an ascending register pair
-+    uint32_t m, range8, bits;
-+#if !USE_BY22_DIV
-+    uintptr_t inv; // table base on entry to the asm, looked-up entry on exit
-+#endif
-+
-+    av_assert2(offsetof (CABACContext, low) == 0); // the ldmia below relies on low being the first word of the context
-+    av_assert2(offsetof (CABACContext, range) == 4); // ... and on range being the second
-+    av_assert2(offsetof (CABACContext, by22.range) == offsetof (CABACContext, by22.bits) + 2); // lets one 32-bit store write both halfwords
-+    __asm__ volatile (
-+        "ldmia   %[c], {%[low], %[range]}                         \n\t" // low = word at c, range = word at c + 4
-+        : // Outputs
-+               [low]"=r"(low),
-+             [range]"=r"(range)
-+        : // Inputs
-+                 [c]"r"(c)
-+        : // Clobbers
-+    );
-+#if !USE_BY22_DIV
-+    inv = (uintptr_t)cabac_by22_inv_range;
-+#endif
-+    __asm__ volatile (
-+        "ldr     %[m], [%[ptr]], #-("AV_STRINGIFY(CABAC_BITS)"/8) \n\t" // m = 32-bit word at ptr, then ptr -= CABAC_BITS / 8
-+#if !USE_BY22_DIV
-+        "uxtb    %[range8], %[range]                              \n\t" // range8 = range & 0xFF
-+#endif
-+        "rbit    %[bits], %[low]                                  \n\t"
-+        "lsl     %[low], %[low], #22 - "AV_STRINGIFY(CABAC_BITS)" \n\t" // low <<= 22 - CABAC_BITS
-+        "clz     %[bits], %[bits]                                 \n\t" // rbit + clz: bits = number of trailing zero bits in the original low
-+        "str     %[ptr], [%[c], %[ptr_off]]                       \n\t" // c->bytestream = stepped-back ptr
-+        "rev     %[m], %[m]                                       \n\t" // byte-reverse the loaded word
-+        "rsb     %[ptr], %[bits], #9 + "AV_STRINGIFY(CABAC_BITS)" \n\t" // ptr is reused as scratch: 9 + CABAC_BITS - bits
-+        "eor     %[m], %[m], #0x80000000                          \n\t" // flip the top bit of m
-+#if !USE_BY22_DIV
-+        "ldr     %[inv], [%[inv], %[range8], lsl #2]              \n\t" // inv = 32-bit table entry at index range8
-+        "pkhbt   %[range], %[bits], %[range], lsl #16             \n\t" // range = (bits & 0xFFFF) | (range << 16)
-+        "str     %[range], [%[c], %[bits_off]]                    \n\t" // one store writes by22.bits and by22.range together
-+#else
-+        "strh    %[bits], [%[c], %[bits_off]]                     \n\t" // store by22.bits only
-+#endif
-+#if CONFIG_THUMB
-+        "lsr     %[m], %[ptr]                                     \n\t"
-+        "eor     %[range], %[low], %[m]                           \n\t" // range = low ^ (m >> ptr)
-+#else
-+        "eor     %[range], %[low], %[m], lsr %[ptr]               \n\t" // range = low ^ (m >> ptr)
-+#endif
-+        : // Outputs
-+               [ptr]"+&r"(ptr),
-+               [low]"+&r"(low),
-+             [range]"+&r"(range),
-+#if !USE_BY22_DIV
-+               [inv]"+&r"(inv),
-+#endif
-+                 [m]"=&r"(m),
-+            [range8]"=&r"(range8),
-+              [bits]"=&r"(bits)
-+        : // Inputs
-+                   [c]"r"(c),
-+            [bits_off]"J"(offsetof (CABACContext, by22.bits)),
-+             [ptr_off]"J"(offsetof (CABACContext, bytestream))
-+        : // Clobbers
-+            "memory"
-+    );
-+    c->low = range; // the range register holds the new low value computed by the final eor
-+#if !USE_BY22_DIV
-+    c->range = inv; // c->range now carries the looked-up table entry
-+#endif
-+}
-+
-+#define get_cabac_by22_peek get_cabac_by22_peek_arm /* map the generic name onto the ARM inline-asm version below */
-+static inline uint32_t get_cabac_by22_peek_arm(const CABACContext *const c) // reads c->low and c->range; writes nothing back to *c
-+{
-+    uint32_t rv = c->low &~ 1, tmp; // rv starts as c->low with bit 0 cleared
-+    __asm__ (
-+        "cmp      %[inv] , #0                    \n\t"
-+        "it       ne                             \n\t"
-+        "umullne  %[tmp] , %[rv] , %[inv], %[rv] \n\t" // if inv != 0: rv = high 32 bits of inv * rv (the low half lands in tmp and is unused)
-+        :  // Outputs
-+             [rv]"+r"(rv),
-+             [tmp]"=r"(tmp)
-+        :  // Inputs
-+             [inv]"r"(c->range) // c->range is used as the multiplier here
-+        :  // Clobbers
-+                "cc"
-+    );
-+    return rv << 1;
-+}
-+
-+#define get_cabac_by22_flush get_cabac_by22_flush_arm /* map the generic name onto the ARM inline-asm version below */
-+static inline void get_cabac_by22_flush_arm(CABACContext *const c, const unsigned int n, uint32_t val) // advances by22.bits by n and recomputes c->low
-+{
-+    uint32_t bits, ptr, tmp1, tmp2;
-+    __asm__ volatile (
-+        "ldrh    %[bits], [%[cc], %[bits_off]]     \n\t" // bits = by22.bits
-+        "ldr     %[ptr], [%[cc], %[ptr_off]]       \n\t" // ptr = c->bytestream
-+        "rsb     %[tmp1], %[n], #32                \n\t" // tmp1 = 32 - n
-+        "add     %[bits], %[bits], %[n]            \n\t" // bits += n
-+        "ldrh    %[tmp2], [%[cc], %[range_off]]    \n\t" // tmp2 = by22.range
-+        "lsr     %[tmp1], %[val], %[tmp1]          \n\t" // tmp1 = val >> (32 - n)
-+        "ldr     %[val], [%[cc], %[low_off]]       \n\t" // val is reused from here on: val = c->low
-+#if CONFIG_THUMB
-+        "add     %[ptr], %[ptr], %[bits], lsr #3   \n\t"
-+        "ldr     %[ptr], [%[ptr]]                  \n\t" // ptr = 32-bit word at bytestream + (bits >> 3)
-+#else
-+        "ldr     %[ptr], [%[ptr], %[bits], lsr #3] \n\t" // ptr = 32-bit word at bytestream + (bits >> 3)
-+#endif
-+        "mul     %[tmp1], %[tmp2], %[tmp1]         \n\t" // tmp1 = by22.range * (val >> (32 - n))
-+        "and     %[tmp2], %[bits], #7              \n\t" // tmp2 = bits & 7
-+        "strh    %[bits], [%[cc], %[bits_off]]     \n\t" // by22.bits = bits
-+        "rev     %[ptr], %[ptr]                    \n\t" // byte-reverse the loaded word
-+        "lsl     %[tmp1], %[tmp1], #23             \n\t" // tmp1 <<= 23
-+#if CONFIG_THUMB
-+        "lsl     %[val], %[n]                      \n\t"
-+        "sub     %[val], %[tmp1]                   \n\t" // val = (low << n) - tmp1
-+#else
-+        "rsb     %[val], %[tmp1], %[val], lsl %[n] \n\t" // val = (low << n) - tmp1
-+#endif
-+        "lsl     %[ptr], %[ptr], %[tmp2]           \n\t" // shift the word left by bits & 7
-+        "orr     %[val], %[val], %[ptr], lsr #9    \n\t" // val |= that word >> 9
-+        "str     %[val], [%[cc], %[low_off]]       \n\t" // c->low = val
-+        :  // Outputs
-+            [val]"+r"(val),
-+           [bits]"=&r"(bits),
-+            [ptr]"=&r"(ptr),
-+           [tmp1]"=&r"(tmp1),
-+           [tmp2]"=&r"(tmp2)
-+        :  // Inputs
-+                  [cc]"r"(c),
-+                   [n]"r"(n),
-+            [bits_off]"J"(offsetof(CABACContext, by22.bits)),
-+             [ptr_off]"J"(offsetof(CABACContext, bytestream)),
-+           [range_off]"J"(offsetof(CABACContext, by22.range)),
-+             [low_off]"J"(offsetof(CABACContext, low))
-+        :  // Clobbers
-+           "memory"
-+    );
-+}
-+
-+#define coeff_abs_level_remaining_decode_bypass coeff_abs_level_remaining_decode_bypass_arm
-+static inline int coeff_abs_level_remaining_decode_bypass_arm(CABACContext *const c, unsigned int rice_param)
-+{
-+    uint32_t last_coeff_abs_level_remaining;
-+    uint32_t prefix, n1, range, n2, ptr, tmp1, tmp2;
-+    __asm__ volatile (
-+        "ldr     %[remain], [%[cc], %[low_off]]               \n\t"
-+        "ldr     %[prefix], [%[cc], %[range_off]]             \n\t"
-+        "bic     %[remain], %[remain], #1                     \n\t"
-+        "ldrh    %[tmp2], [%[cc], %[by22_bits_off]]           \n\t"
-+        "ldr     %[ptr], [%[cc], %[ptr_off]]                  \n\t"
-+        "cmp     %[prefix], #0                                \n\t"
-+        "it      ne                                           \n\t"
-+        "umullne %[prefix], %[remain], %[prefix], %[remain]   \n\t"
-+        "ldrh    %[range], [%[cc], %[by22_range_off]]         \n\t"
-+        "lsl     %[remain], %[remain], #1                     \n\t"
-+        "mvn     %[prefix], %[remain]                         \n\t"
-+        "clz     %[prefix], %[prefix]                         \n\t"
-+        "rsbs    %[n1], %[prefix], #2                         \n\t"
-+        "bcc     1f                                           \n\t"
-+        "adc     %[n1], %[rice], %[prefix]                    \n\t"
-+        "add     %[tmp2], %[tmp2], %[n1]                      \n\t"
-+        "rsb     %[n2], %[n1], #32                            \n\t"
-+        "and     %[tmp1], %[tmp2], #7                         \n\t"
-+        "strh    %[tmp2], [%[cc], %[by22_bits_off]]           \n\t"
-+        "lsr     %[tmp2], %[tmp2], #3                         \n\t"
-+        "lsr     %[n2], %[remain], %[n2]                      \n\t"
-+        "mul     %[n2], %[range], %[n2]                       \n\t"
-+        "ldr     %[range], [%[cc], %[low_off]]                \n\t"
-+        "ldr     %[ptr], [%[ptr], %[tmp2]]                    \n\t"
-+        "rsb     %[tmp2], %[rice], #31                        \n\t"
-+        "lsl     %[remain], %[remain], %[prefix]              \n\t"
-+        "lsl     %[n2], %[n2], #23                            \n\t"
-+#if CONFIG_THUMB
-+        "lsl     %[range], %[n1]                              \n\t"
-+        "sub     %[range], %[n2]                              \n\t"
-+#else
-+        "rsb     %[range], %[n2], %[range], lsl %[n1]         \n\t"
-+#endif
-+        "rev     %[ptr], %[ptr]                               \n\t"
-+        "lsl     %[n2], %[prefix], %[rice]                    \n\t"
-+#if CONFIG_THUMB
-+        "lsr     %[remain], %[tmp2]                           \n\t"
-+        "add     %[remain], %[n2]                             \n\t"
-+#else
-+        "add     %[remain], %[n2], %[remain], lsr %[tmp2]     \n\t"
-+#endif
-+        "b       3f                                           \n\t"
-+        "1:                                                   \n\t"
-+        "add     %[n2], %[rice], %[prefix], lsl #1            \n\t"
-+        "cmp     %[n2], %[peek_bits_plus_2]                   \n\t"
-+        "bhi     2f                                           \n\t"
-+        "sub     %[n1], %[n2], #2                             \n\t"
-+        "add     %[tmp2], %[tmp2], %[n1]                      \n\t"
-+        "rsb     %[n2], %[n1], #32                            \n\t"
-+        "strh    %[tmp2], [%[cc], %[by22_bits_off]]           \n\t"
-+        "lsr     %[tmp1], %[tmp2], #3                         \n\t"
-+        "lsr     %[n2], %[remain], %[n2]                      \n\t"
-+        "mul     %[n2], %[range], %[n2]                       \n\t"
-+        "rsb     %[range], %[rice], #34                       \n\t"
-+        "ldr     %[ptr], [%[ptr], %[tmp1]]                    \n\t"
-+        "and     %[tmp1], %[tmp2], #7                         \n\t"
-+        "lsl     %[remain], %[remain], %[prefix]              \n\t"
-+        "ldr     %[tmp2], [%[cc], %[low_off]]                 \n\t"
-+        "rsb     %[prefix], %[prefix], %[range]               \n\t"
-+        "orr     %[remain], %[remain], #0x80000000            \n\t"
-+        "rev     %[ptr], %[ptr]                               \n\t"
-+        "lsl     %[n2], %[n2], #23                            \n\t"
-+        "mov     %[range], #2                                 \n\t"
-+#if CONFIG_THUMB
-+        "lsl     %[tmp2], %[n1]                               \n\t"
-+        "sub     %[tmp2], %[n2]                               \n\t"
-+#else
-+        "rsb     %[tmp2], %[n2], %[tmp2], lsl %[n1]           \n\t"
-+#endif
-+        "lsl     %[ptr], %[ptr], %[tmp1]                      \n\t"
-+        "lsl     %[rice], %[range], %[rice]                   \n\t"
-+        "orr     %[range], %[tmp2], %[ptr], lsr #9            \n\t"
-+#if CONFIG_THUMB
-+        "lsr     %[remain], %[prefix]                         \n\t"
-+        "add     %[remain], %[rice]                           \n\t"
-+#else
-+        "add     %[remain], %[rice], %[remain], lsr %[prefix] \n\t"
-+#endif
-+        "b       4f                                           \n\t"
-+        "2:                                                   \n\t"
-+        "add     %[n1], %[tmp2], %[prefix]                    \n\t"
-+#if CONFIG_THUMB
-+        "add     %[tmp2], %[ptr], %[n1], lsr #3               \n\t"
-+        "ldr     %[tmp2], [%[tmp2]]                           \n\t"
-+#else
-+        "ldr     %[tmp2], [%[ptr], %[n1], lsr #3]             \n\t"
-+#endif
-+        "rsb     %[tmp1], %[prefix], #32                      \n\t"
-+        "push    {%[rice]}                                    \n\t"
-+        "and     %[rice], %[n1], #7                           \n\t"
-+        "lsr     %[tmp1], %[remain], %[tmp1]                  \n\t"
-+        "ldr     %[ptr], [%[cc], %[low_off]]                  \n\t"
-+        "mul     %[remain], %[range], %[tmp1]                 \n\t"
-+        "rev     %[tmp2], %[tmp2]                             \n\t"
-+        "rsb     %[n2], %[prefix], %[n2]                      \n\t"
-+        "ldr     %[tmp1], [%[cc], %[range_off]]               \n\t"
-+        "lsl     %[rice], %[tmp2], %[rice]                    \n\t"
-+        "sub     %[tmp2], %[n2], #2                           \n\t"
-+        "lsl     %[remain], %[remain], #23                    \n\t"
-+#if CONFIG_THUMB
-+        "lsl     %[ptr], %[prefix]                            \n\t"
-+        "rsb     %[remain], %[ptr]                            \n\t"
-+#else
-+        "rsb     %[remain], %[remain], %[ptr], lsl %[prefix]  \n\t"
-+#endif
-+        "orr     %[remain], %[remain], %[rice], lsr #9        \n\t"
-+        "add     %[prefix], %[n1], %[tmp2]                    \n\t"
-+        "bic     %[n1], %[remain], #1                         \n\t"
-+        "ldr     %[ptr], [%[cc], %[ptr_off]]                  \n\t"
-+        "cmp     %[tmp1], #0                                  \n\t"
-+        "rsb     %[rice], %[tmp2], #32                        \n\t"
-+        "it      ne                                           \n\t"
-+        "umullne %[tmp1], %[n1], %[tmp1], %[n1]               \n\t"
-+        "and     %[tmp1], %[prefix], #7                       \n\t"
-+#if CONFIG_THUMB
-+        "add     %[ptr], %[ptr], %[prefix], lsr #3            \n\t"
-+        "ldr     %[ptr], [%[ptr]]                             \n\t"
-+#else
-+        "ldr     %[ptr], [%[ptr], %[prefix], lsr #3]          \n\t"
-+#endif
-+        "lsl     %[n1], %[n1], #1                             \n\t"
-+        "lsr     %[rice], %[n1], %[rice]                      \n\t"
-+        "rsb     %[n2], %[n2], #34                            \n\t"
-+        "mul     %[range], %[range], %[rice]                  \n\t"
-+        "pop     {%[rice]}                                    \n\t"
-+        "rev     %[ptr], %[ptr]                               \n\t"
-+        "orr     %[n1], %[n1], #0x80000000                    \n\t"
-+        "strh    %[prefix], [%[cc], %[by22_bits_off]]         \n\t"
-+        "mov     %[prefix], #2                                \n\t"
-+        "lsl     %[range], %[range], #23                      \n\t"
-+#if CONFIG_THUMB
-+        "lsl     %[remain], %[tmp2]                           \n\t"
-+        "rsb     %[range], %[remain]                          \n\t"
-+#else
-+        "rsb     %[range], %[range], %[remain], lsl %[tmp2]   \n\t"
-+#endif
-+        "lsl     %[remain], %[prefix], %[rice]                \n\t"
-+#if CONFIG_THUMB
-+        "lsr     %[n1], %[n2]                                 \n\t"
-+        "add     %[remain], %[n1]                             \n\t"
-+#else
-+        "add     %[remain], %[remain], %[n1], lsr %[n2]       \n\t"
-+#endif
-+        "3:                                                   \n\t"
-+        "lsl     %[ptr], %[ptr], %[tmp1]                      \n\t"
-+        "orr     %[range], %[range], %[ptr], lsr #9           \n\t"
-+        "4:                                                   \n\t"
-+        "str     %[range], [%[cc], %[low_off]]                \n\t"
-+        :  // Outputs
-+            [remain]"=&r"(last_coeff_abs_level_remaining),
-+              [rice]"+r"(rice_param),
-+            [prefix]"=&r"(prefix),
-+                [n1]"=&r"(n1),
-+             [range]"=&r"(range),
-+                [n2]"=&r"(n2),
-+               [ptr]"=&r"(ptr),
-+              [tmp1]"=&r"(tmp1),
-+              [tmp2]"=&r"(tmp2)
-+        :  // Inputs
-+                          [cc]"r"(c),
-+            [peek_bits_plus_2]"I"(CABAC_BY22_PEEK_BITS + 2),
-+                     [low_off]"J"(offsetof(CABACContext, low)),
-+                   [range_off]"J"(offsetof(CABACContext, range)),
-+               [by22_bits_off]"J"(offsetof(CABACContext, by22.bits)),
-+              [by22_range_off]"J"(offsetof(CABACContext, by22.range)),
-+                     [ptr_off]"J"(offsetof(CABACContext, bytestream))
-+        :  // Clobbers
-+           "cc", "memory"
-+    );
-+    return last_coeff_abs_level_remaining;
-+}
-+
-+#endif /* HAVE_ARMV6T2_INLINE */
-+
-+#endif /* AVCODEC_ARM_HEVC_CABAC_H */
---- /dev/null
-+++ b/libavcodec/arm/rpi_hevc_idct_fn_neon.S
-@@ -0,0 +1,183 @@
-+/*
-+ * ARM NEON optimised IDCT functions for HEVC decoding
-+ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
-+ * Copyright (C) 2018 John Cox, ben Avison for Raspberry Pi (Trading)
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+@ Included multiple times from hevc_idct_neon.S
-+@ Macros defined there
-+
-+#define DC_SHIFT  (15 - BIT_DEPTH)
-+#define DC_ADD    (1 | (1 << (14 - BIT_DEPTH)))
-+#define TRN_SHIFT (20 - BIT_DEPTH)
-+
-+function JOIN(ff_hevc_rpi_idct_4x4_dc_neon_, BIT_DEPTH), export=1
-+        ldrsh       r1, [r0]
-+        add         r1, #DC_ADD
-+        asr         r1, #DC_SHIFT
-+        vdup.16     q0, r1
-+        vdup.16     q1, r1
-+        vst1.16     {q0, q1}, [r0]
-+        bx lr
-+endfunc
-+
-+function JOIN(ff_hevc_rpi_idct_8x8_dc_neon_, BIT_DEPTH), export=1
-+        ldrsh       r1, [r0]
-+        add         r2, r0, #32
-+        mov         r3, #64
-+        add         r1, #DC_ADD
-+        asr         r1, #DC_SHIFT
-+        vdup.16     q8, r1
-+        vdup.16     q9, r1
-+        vst1.16     {q8, q9}, [r0], r3
-+        vst1.16     {q8, q9}, [r2], r3
-+        vst1.16     {q8, q9}, [r0]
-+        vst1.16     {q8, q9}, [r2]
-+        bx lr
-+endfunc
-+
-+function JOIN(ff_hevc_rpi_idct_16x16_dc_neon_, BIT_DEPTH), export=1
-+        ldrsh       r1, [r0]
-+        add         r2, r0, #32
-+        mov         r3, #64
-+        add         r1, #DC_ADD
-+        mov         ip, #16*16
-+        asr         r1, #DC_SHIFT
-+        vdup.16     q8, r1
-+        vdup.16     q9, r1
-+1:      vst1.16     {q8, q9}, [r0], r3
-+        subs        ip, ip, #32
-+        vst1.16     {q8, q9}, [r2], r3
-+        bhi         1b
-+        bx lr
-+endfunc
-+
-+function JOIN(ff_hevc_rpi_idct_32x32_dc_neon_, BIT_DEPTH), export=1
-+        ldrsh       r1, [r0]
-+        add         r2, r0, #32
-+        mov         r3, #64
-+        add         r1, #DC_ADD
-+        mov         ip, #32*32
-+        asr         r1, #DC_SHIFT
-+        vdup.16     q8, r1
-+        vdup.16     q9, r1
-+1:      vst1.16     {q8, q9}, [r0], r3
-+        subs        ip, ip, #32
-+        vst1.16     {q8, q9}, [r2], r3
-+        bhi         1b
-+        bx lr
-+endfunc
-+
-+
-+function JOIN(ff_hevc_rpi_transform_4x4_neon_, BIT_DEPTH), export=1
-+        vldr.i32    s0, =0x00240053 // 36 and 83
-+        vld1.16     {q14, q15}, [r0 :256]  // coeffs
-+
-+        tr4_shift   #7
-+
-+        vzip.16     d28, d29
-+        vzip.16     d30, d31
-+        vzip.32     q14, q15
-+
-+        tr4_shift   #TRN_SHIFT
-+
-+        vst4.16     {q14, q15}, [r0 :256]
-+        bx lr
-+
-+        .ltorg
-+endfunc
-+
-+
-+
-+function JOIN(ff_hevc_rpi_transform_luma_4x4_neon_, BIT_DEPTH), export=1
-+        vmov.i32    d0, #0x4a  // 74
-+        vld1.16     {q14, q15}, [r0 :256]  // coeffs
-+        vmov.i32    d1, #0x1d  // 29
-+        vmov.i32    d2, #0x37  // 55
-+
-+        tr4_luma_shift #7
-+
-+        vzip.16     d28, d29
-+        vzip.16     d30, d31
-+        vzip.32     q14, q15
-+
-+        tr4_luma_shift #TRN_SHIFT
-+
-+        vst4.16     {q14, q15}, [r0 :256]
-+        bx lr
-+endfunc
-+
-+function JOIN(ff_hevc_rpi_transform_8x8_neon_, BIT_DEPTH), export=1
-+        add      r2, r0, #16
-+        adr      r3, tr4f
-+        vpush    {d8-d15}
-+        vld1.16  {d0, d1}, [r3]
-+        mov      r3, #32
-+
-+        tr8_vert  d16, d17, d18, d19, d24, d25, d26, d27, q8,  q9,  \
-+            "sub      r0, r0, #128-8",                              \
-+            "sub      r2, r2, #128-8",                              \
-+            "cmp      r1, #4"
-+        ble      2f
-+
-+        tr8_vert  d20, d21, d22, d23, d28, d29, d30, d31, q10, q11, \
-+            "sub      r0, r0, #128+8",                              \
-+            "sub      r2, r2, #128+8+16-32",                        \
-+            "mov      r3, #64"
-+
-+        vzip.16  d16, d17
-+        vzip.16  d18, d19
-+
-+        vzip.16  d20, d21
-+        vzip.16  d22, d23
-+        vzip.16  d28, d29
-+        vzip.16  d30, d31
-+        vzip.32  q10, q11
-+        vzip.32  q14, q15
-+1:
-+        vzip.16  d24, d25
-+        vzip.16  d26, d27
-+        vzip.32  q8, q9
-+        vzip.32  q12, q13
-+
-+        tr8_horiz d16, d17, d18, d19, d20, d21, d22, d23, q8,  q9,  TRN_SHIFT
-+        tr8_horiz d24, d25, d26, d27, d28, d29, d30, d31, q12, q13, TRN_SHIFT
-+
-+        vpop     {d8-d15}
-+        bx       lr
-+
-+2:      vmov.i64 q10, #0
-+        sub      r0, r0, #8
-+        vmov.i64 q11, #0
-+        sub      r2, r2, #8+16-32
-+        vmov.i64 q14, #0
-+        mov      r3, #64
-+        vmov.i64 q15, #0
-+
-+        vzip.16  d16, d17
-+        vzip.16  d18, d19
-+
-+        b        1b
-+
-+endfunc
-+
-+#undef DC_SHIFT
-+#undef DC_ADD
-+#undef TRN_SHIFT
-+
---- /dev/null
-+++ b/libavcodec/arm/rpi_hevc_misc_neon.S
-@@ -0,0 +1,267 @@
-+/*
-+Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
-+All rights reserved.
-+
-+Redistribution and use in source and binary forms, with or without
-+modification, are permitted provided that the following conditions are met:
-+    * Redistributions of source code must retain the above copyright
-+      notice, this list of conditions and the following disclaimer.
-+    * Redistributions in binary form must reproduce the above copyright
-+      notice, this list of conditions and the following disclaimer in the
-+      documentation and/or other materials provided with the distribution.
-+    * Neither the name of the copyright holder nor the
-+      names of its contributors may be used to endorse or promote products
-+      derived from this software without specific prior written permission.
-+
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
-+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+
-+Written by John Cox, Ben Avison
-+*/
-+
-+#include "libavutil/arm/asm.S"
-+#include "neon.S"
-+
-+@ rpi_zap_coeff_vals_neon(
-+@   uint16_t * buf,          [r0]
-+@   unsigned int log_n_m2)   [r1]
-+
-+function rpi_zap_coeff_vals_neon, export=1
-+        mov      ip, #1
-+        vmov.i64 q0, #0
-+        teq      r1, #0
-+        vmov.i64 q1, #0
-+        beq      2f
-+
-+        lsl      ip, r1    @ 2, 4 or 8
-+        add      r2, r0, #32
-+        lsl      ip, r1    @ 4, 16 or 64 = number of 32-byte blocks to zero
-+        mov      r3, #64
-+1:      vst1.8   {q0,q1}, [r0:256], r3
-+        subs     ip, #2
-+        vst1.8   {q0,q1}, [r2:256], r3
-+        bne      1b
-+        bx       lr
-+
-+2:      vst1.8   {q0,q1}, [r0:256]
-+        bx       lr
-+endfunc
-+
-+@ PIC jump tables are more expensive than absolute for A32 code
-+.set jent_pic, CONFIG_PIC || CONFIG_THUMB
-+
-+@ Jump table entry - if in neon mode the bottom bit must be set
-+@ ? There is probably a real asm instruction to do this but I haven't found it
-+.macro jent lab
-+.if jent_pic
-+T       .short ((0 + \lab) - (0 + 98b)) / 2
-+A       .short (0 + \lab) - (4 + 98b)
-+.else
-+T       .word   1 + \lab
-+A       .word   \lab
-+.endif
-+.endm
-+
-+.set expected_next, 0
-+
-+.macro cpy_compound val, p1, p2, drop_thru=0
-+.if \p1 + \p2 != \val
-+.error "Bad addition!  \p1 + \p2 != \val"
-+.endif
-+.if expected_next != 0 && expected_next != \val
-+.error "Drop thru failure"
-+.endif
-+\val\():
-+        push       {r0-r3}
-+        bl          100\p1\()b
-+        pop        {r0-r3}
-+        add         r0, #\p1
-+        add         r2, #\p1
-+.if \drop_thru == 0
-+        b           \p2\()b
-+.set expected_next, 0
-+.else
-+.set expected_next, \p2
-+.endif
-+.endm
-+
-+@ ff_hevc_cpy_blks8x4_neon(
-+@   dst         [r0]
-+@   dst_stride  [r1]
-+@   src         [r2]
-+@   src_stride  [r3]
-+@   width       [sp, #0] (bytes)
-+@   height)     [sp, #4]
-+@
-+@ Power of 2 widths are directly coded, all others are done in stripes
-+@ We expect the vast majority of calls to be power of 2
-+@
-+@ Currently has min width of 8, but we could make that 4 without issue
-+@ Min height is 4
-+
-+function ff_hevc_rpi_cpy_blks8x4_neon, export=1
-+        ldr         r12, [sp, #0]
-+        push       {r11, lr}
-+.if jent_pic
-+A       adr         lr,  98f - 2
-+.else
-+A       adr         lr,  98f - 4
-+.endif
-+        lsr         r12, #3
-+        ldr         r11, [sp, #(8 + 4)]
-+.if jent_pic
-+A       lsl         r12, #1
-+A       ldrsh       lr,  [lr,  r12]
-+A       add         pc,  lr
-+T       tbh         [pc, r12, lsl #1]
-+.else
-+        @ A32 only, Thumb is always PIC
-+        ldr         pc,  [lr,  r12, lsl #2]
-+.endif
-+
-+98:
-+T       .short      0 @ unused
-+        jent        8f
-+        jent        16f
-+        jent        24f
-+        jent        32f
-+        jent        40f
-+        jent        48f
-+        jent        56f
-+        jent        64f
-+        jent        72f
-+        jent        80f
-+        jent        88f
-+        jent        96f
-+        jent        104f
-+        jent        112f
-+        jent        120f
-+        jent        128f
-+
-+1008:
-+        push       {r11, lr}
-+8:
-+        add         lr,  r2,  r3
-+        lsl         r3,  #1
-+        add         r12, r0,  r1
-+        lsl         r1,  #1
-+1:
-+        vld1.32    {d0 }, [r2],  r3
-+        vld1.32    {d1 }, [lr],  r3
-+        vld1.32    {d2 }, [r2],  r3
-+        vld1.32    {d3 }, [lr],  r3
-+        subs        r11,  #4
-+        vst1.32    {d0 }, [r0],  r1
-+        vst1.32    {d1 }, [r12], r1
-+        vst1.32    {d2 }, [r0],  r1
-+        vst1.32    {d3 }, [r12], r1
-+        bgt         1b
-+        pop        {r11, pc}
-+
-+10016:
-+        push       {r11, lr}
-+16:
-+        add         lr,  r2,  r3
-+        lsl         r3,  #1
-+        add         r12, r0,  r1
-+        lsl         r1,  #1
-+1:
-+        vld1.32    {q0 }, [r2],  r3
-+        vld1.32    {q1 }, [lr],  r3
-+        vld1.32    {q2 }, [r2],  r3
-+        vld1.32    {q3 }, [lr],  r3
-+        subs        r11, #4
-+        vst1.32    {q0 }, [r0],  r1
-+        vst1.32    {q1 }, [r12], r1
-+        vst1.32    {q2 }, [r0],  r1
-+        vst1.32    {q3 }, [r12], r1
-+        bgt         1b
-+        pop        {r11, pc}
-+
-+10032:
-+        push       {r11, lr}
-+32:
-+        add         lr,  r2,  r3
-+        lsl         r3,  #1
-+        add         r12, r0,  r1
-+        lsl         r1,  #1
-+1:
-+        vld1.32    {q8,  q9 }, [r2],  r3
-+        vld1.32    {q10, q11}, [lr],  r3
-+        vld1.32    {q12, q13}, [r2],  r3
-+        vld1.32    {q14, q15}, [lr],  r3
-+        subs        r11, #4
-+        vst1.32    {q8,  q9 }, [r0],  r1
-+        vst1.32    {q10, q11}, [r12], r1
-+        vst1.32    {q12, q13}, [r0],  r1
-+        vst1.32    {q14, q15}, [r12], r1
-+        bgt         1b
-+        pop        {r11, pc}
-+
-+10064:
-+        push       {r11, lr}
-+64:
-+        add         lr,  r2,  #32
-+        add         r12, r0,  #32
-+1:
-+        vld1.32    {q8,  q9 }, [r2],  r3
-+        vld1.32    {q10, q11}, [lr],  r3
-+        vld1.32    {q12, q13}, [r2],  r3
-+        vld1.32    {q14, q15}, [lr],  r3
-+        subs        r11, #2
-+        vst1.32    {q8,  q9 }, [r0],  r1
-+        vst1.32    {q10, q11}, [r12], r1
-+        vst1.32    {q12, q13}, [r0],  r1
-+        vst1.32    {q14, q15}, [r12], r1
-+        bgt         1b
-+        pop        {r11, pc}
-+
-+128:
-+        push       {r4, r5}
-+        @ We could do this with fewer registers if we jump around but I
-+        @ have a primative urge to load sequentially
-+        mov         r4,  #64
-+        add         lr,  r2,  #32
-+        add         r12, r0,  #32
-+        sub         r3,  r4
-+        sub         r1,  r4
-+1:
-+        vld1.32    {q8,  q9 }, [r2],  r4
-+        vld1.32    {q10, q11}, [lr],  r4
-+        vld1.32    {q12, q13}, [r2],  r3
-+        vld1.32    {q14, q15}, [lr],  r3
-+        subs        r11, #1
-+        vst1.32    {q8,  q9 }, [r0],  r4
-+        vst1.32    {q10, q11}, [r12], r4
-+        vst1.32    {q12, q13}, [r0],  r1
-+        vst1.32    {q14, q15}, [r12], r1
-+        bgt         1b
-+        pop        {r4, r5, r11, pc}
-+
-+@ Use drop_thru where we can
-+cpy_compound 104, 64, 40, 1
-+cpy_compound 40, 32, 8
-+
-+cpy_compound 112, 64, 48, 1
-+cpy_compound 48, 32, 16
-+
-+cpy_compound 120, 64, 56, 1
-+cpy_compound 56, 32, 24, 1
-+cpy_compound 24, 16, 8
-+
-+cpy_compound 72, 64, 8
-+cpy_compound 80, 64, 16
-+cpy_compound 88, 64, 24
-+cpy_compound 96, 64, 32
-+
-+
-+endfunc
-+
---- /dev/null
-+++ b/libavcodec/arm/rpi_hevc_misc_neon.h
-@@ -0,0 +1,438 @@
-+/*
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#ifndef AVCODEC_ARM_RPI_HEVC_MISC_H
-+#define AVCODEC_ARM_RPI_HEVC_MISC_H
-+
-+#include "config.h"
-+#if HAVE_NEON_INLINE && !CONFIG_THUMB
-+
-+static av_noinline void ff_hevc_rpi_copy_vert_v2h_neon(uint8_t *dst, const uint8_t *src,
-+                                                       int pixel_shift, int height,
-+                                                       ptrdiff_t stride_src)
-+{
-+    const uint8_t *src2 = src + stride_src;
-+    stride_src <<= 1;
-+    switch (pixel_shift)
-+    {
-+        case 2:
-+            __asm__ volatile (
-+                "vld1.32     {d0[0]}, [%[src]], %[stride_src]  \n\t"
-+                "vld1.32     {d0[1]}, [%[src2]], %[stride_src] \n\t"
-+                "vld1.32     {d1[0]}, [%[src]], %[stride_src]  \n\t"
-+                "subs        %[height], #4                     \n\t"
-+                "vld1.32     {d1[1]}, [%[src2]], %[stride_src] \n\t"
-+                "beq         2f                                \n\t"
-+                "1:                                            \n\t"
-+                "vld1.32     {d2[0]}, [%[src]], %[stride_src]  \n\t"
-+                "vld1.32     {d2[1]}, [%[src2]], %[stride_src] \n\t"
-+                "vld1.32     {d3[0]}, [%[src]], %[stride_src]  \n\t"
-+                "vld1.32     {d3[1]}, [%[src2]], %[stride_src] \n\t"
-+                "subs        %[height], #4                     \n\t"
-+                "vst1.32     {q0}, [%[dst]]!                   \n\t"
-+                "beq         3f                                \n\t"
-+                "vld1.32     {d0[0]}, [%[src]], %[stride_src]  \n\t"
-+                "vld1.32     {d0[1]}, [%[src2]], %[stride_src] \n\t"
-+                "vld1.32     {d1[0]}, [%[src]], %[stride_src]  \n\t"
-+                "vld1.32     {d1[1]}, [%[src2]], %[stride_src] \n\t"
-+                "subs        %[height], #4                     \n\t"
-+                "vst1.32     {q1}, [%[dst]]!                   \n\t"
-+                "bne         1b                                \n\t"
-+                "2:                                            \n\t"
-+                "vst1.32     {q0}, [%[dst]]                    \n\t"
-+                "b           4f                                \n\t"
-+                "3:                                            \n\t"
-+                "vst1.32     {q1}, [%[dst]]                    \n\t"
-+                "4:                                            \n\t"
-+                :  // Outputs
-+                           [src]"+r"(src),
-+                          [src2]"+r"(src2),
-+                           [dst]"+r"(dst),
-+                        [height]"+r"(height)
-+                :  // Inputs
-+                    [stride_src]"r"(stride_src)
-+                :  // Clobbers
-+                    "cc", "memory"
-+            );
-+            break;
-+        case 1:
-+            __asm__ volatile (
-+                "vld1.16     {d0[0]}, [%[src]], %[stride_src]  \n\t"
-+                "vld1.16     {d1[0]}, [%[src2]], %[stride_src] \n\t"
-+                "vld1.16     {d0[1]}, [%[src]], %[stride_src]  \n\t"
-+                "subs        %[height], #4                     \n\t"
-+                "vld1.16     {d1[1]}, [%[src2]], %[stride_src] \n\t"
-+                "beq         2f                                \n\t"
-+                "1:                                            \n\t"
-+                "vld1.16     {d2[0]}, [%[src]], %[stride_src]  \n\t"
-+                "vld1.16     {d3[0]}, [%[src2]], %[stride_src] \n\t"
-+                "vld1.16     {d2[1]}, [%[src]], %[stride_src]  \n\t"
-+                "vld1.16     {d3[1]}, [%[src2]], %[stride_src] \n\t"
-+                "vzip.16     d0, d1                            \n\t"
-+                "subs        %[height], #4                     \n\t"
-+                "vst1.16     {d0}, [%[dst]]!                   \n\t"
-+                "beq         3f                                \n\t"
-+                "vld1.16     {d0[0]}, [%[src]], %[stride_src]  \n\t"
-+                "vld1.16     {d1[0]}, [%[src2]], %[stride_src] \n\t"
-+                "vld1.16     {d0[1]}, [%[src]], %[stride_src]  \n\t"
-+                "vld1.16     {d1[1]}, [%[src2]], %[stride_src] \n\t"
-+                "vzip.16     d2, d3                            \n\t"
-+                "subs        %[height], #4                     \n\t"
-+                "vst1.16     {d2}, [%[dst]]!                   \n\t"
-+                "bne         1b                                \n\t"
-+                "2:                                            \n\t"
-+                "vzip.16     d0, d1                            \n\t"
-+                "vst1.16     {d0}, [%[dst]]                    \n\t"
-+                "b           4f                                \n\t"
-+                "3:                                            \n\t"
-+                "vzip.16     d2, d3                            \n\t"
-+                "vst1.16     {d2}, [%[dst]]                    \n\t"
-+                "4:                                            \n\t"
-+                :  // Outputs
-+                           [src]"+r"(src),
-+                          [src2]"+r"(src2),
-+                           [dst]"+r"(dst),
-+                        [height]"+r"(height)
-+                :  // Inputs
-+                    [stride_src]"r"(stride_src)
-+                :  // Clobbers
-+                    "cc", "memory"
-+            );
-+            break;
-+        default:
-+            __asm__ volatile (
-+                "vld1.8      {d0[0]}, [%[src]], %[stride_src]  \n\t"
-+                "vld1.8      {d1[0]}, [%[src2]], %[stride_src] \n\t"
-+                "vld1.8      {d0[1]}, [%[src]], %[stride_src]  \n\t"
-+                "vld1.8      {d1[1]}, [%[src2]], %[stride_src] \n\t"
-+                "vld1.8      {d0[2]}, [%[src]], %[stride_src]  \n\t"
-+                "vld1.8      {d1[2]}, [%[src2]], %[stride_src] \n\t"
-+                "vld1.8      {d0[3]}, [%[src]], %[stride_src]  \n\t"
-+                "subs        %[height], #8                     \n\t"
-+                "vld1.8      {d1[3]}, [%[src2]], %[stride_src] \n\t"
-+                "beq         2f                                \n\t"
-+                "1:                                            \n\t"
-+                "vld1.8      {d2[0]}, [%[src]], %[stride_src]  \n\t"
-+                "vld1.8      {d3[0]}, [%[src2]], %[stride_src] \n\t"
-+                "vld1.8      {d2[1]}, [%[src]], %[stride_src]  \n\t"
-+                "vld1.8      {d3[1]}, [%[src2]], %[stride_src] \n\t"
-+                "vld1.8      {d2[2]}, [%[src]], %[stride_src]  \n\t"
-+                "vld1.8      {d3[2]}, [%[src2]], %[stride_src] \n\t"
-+                "vld1.8      {d2[3]}, [%[src]], %[stride_src]  \n\t"
-+                "vld1.8      {d3[3]}, [%[src2]], %[stride_src] \n\t"
-+                "vzip.8      d0, d1                            \n\t"
-+                "subs        %[height], #8                     \n\t"
-+                "vst1.8      {d0}, [%[dst]]!                   \n\t"
-+                "beq         3f                                \n\t"
-+                "vld1.8      {d0[0]}, [%[src]], %[stride_src]  \n\t"
-+                "vld1.8      {d1[0]}, [%[src2]], %[stride_src] \n\t"
-+                "vld1.8      {d0[1]}, [%[src]], %[stride_src]  \n\t"
-+                "vld1.8      {d1[1]}, [%[src2]], %[stride_src] \n\t"
-+                "vld1.8      {d0[2]}, [%[src]], %[stride_src]  \n\t"
-+                "vld1.8      {d1[2]}, [%[src2]], %[stride_src] \n\t"
-+                "vld1.8      {d0[3]}, [%[src]], %[stride_src]  \n\t"
-+                "vld1.8      {d1[3]}, [%[src2]], %[stride_src] \n\t"
-+                "vzip.8      d2, d3                            \n\t"
-+                "subs        %[height], #8                     \n\t"
-+                "vst1.8      {d2}, [%[dst]]!                   \n\t"
-+                "bne         1b                                \n\t"
-+                "2:                                            \n\t"
-+                "vzip.8      d0, d1                            \n\t"
-+                "vst1.8      {d0}, [%[dst]]                    \n\t"
-+                "b           4f                                \n\t"
-+                "3:                                            \n\t"
-+                "vzip.8      d2, d3                            \n\t"
-+                "vst1.8      {d2}, [%[dst]]                    \n\t"
-+                "4:                                            \n\t"
-+                :  // Outputs
-+                           [src]"+r"(src),
-+                          [src2]"+r"(src2),
-+                           [dst]"+r"(dst),
-+                        [height]"+r"(height)
-+                :  // Inputs
-+                    [stride_src]"r"(stride_src)
-+                :  // Clobbers
-+                    "cc", "memory"
-+            );
-+            break;
-+    }
-+}
-+
-+static av_noinline void ff_hevc_rpi_copy_vert_h2v_neon(uint8_t *dst, const uint8_t *src,
-+                                                       int pixel_shift, int height,
-+                                                      ptrdiff_t stride_dst)
-+{
-+    uint8_t *dst2 = dst + stride_dst;
-+    stride_dst <<= 1;
-+    switch (pixel_shift)
-+    {
-+        case 2:
-+            __asm__ volatile (
-+                "subs        %[height], #4                     \n\t"
-+                "vld1.32     {q0}, [%[src]]!                   \n\t"
-+                "beq         2f                                \n\t"
-+                "1:                                            \n\t"
-+                "vld1.32     {q1}, [%[src]]!                   \n\t"
-+                "vst1.32     {d0[0]}, [%[dst]], %[stride_dst]  \n\t"
-+                "vst1.32     {d0[1]}, [%[dst2]], %[stride_dst] \n\t"
-+                "vst1.32     {d1[0]}, [%[dst]], %[stride_dst]  \n\t"
-+                "subs        %[height], #4                     \n\t"
-+                "vst1.32     {d1[1]}, [%[dst2]], %[stride_dst] \n\t"
-+                "beq         3f                                \n\t"
-+                "vld1.32     {q0}, [%[src]]!                   \n\t"
-+                "vst1.32     {d2[0]}, [%[dst]], %[stride_dst]  \n\t"
-+                "vst1.32     {d2[1]}, [%[dst2]], %[stride_dst] \n\t"
-+                "vst1.32     {d3[0]}, [%[dst]], %[stride_dst]  \n\t"
-+                "subs        %[height], #4                     \n\t"
-+                "vst1.32     {d3[1]}, [%[dst2]], %[stride_dst] \n\t"
-+                "bne         1b                                \n\t"
-+                "2:                                            \n\t"
-+                "vst1.32     {d0[0]}, [%[dst]], %[stride_dst]  \n\t"
-+                "vst1.32     {d0[1]}, [%[dst2]], %[stride_dst] \n\t"
-+                "vst1.32     {d1[0]}, [%[dst]]                 \n\t"
-+                "vst1.32     {d1[1]}, [%[dst2]]                \n\t"
-+                "b           4f                                \n\t"
-+                "3:                                            \n\t"
-+                "vst1.32     {d2[0]}, [%[dst]], %[stride_dst]  \n\t"
-+                "vst1.32     {d2[1]}, [%[dst2]], %[stride_dst] \n\t"
-+                "vst1.32     {d3[0]}, [%[dst]]                 \n\t"
-+                "vst1.32     {d3[1]}, [%[dst2]]                \n\t"
-+                "4:                                            \n\t"
-+                :  // Outputs
-+                           [dst]"+r"(dst),
-+                          [dst2]"+r"(dst2),
-+                           [src]"+r"(src),
-+                        [height]"+r"(height)
-+                :  // Inputs
-+                    [stride_dst]"r"(stride_dst)
-+                :  // Clobbers
-+                    "cc", "memory"
-+            );
-+            break;
-+        case 1:
-+            __asm__ volatile (
-+                "subs        %[height], #4                     \n\t"
-+                "vld1.16     {d0}, [%[src]]!                   \n\t"
-+                "beq         2f                                \n\t"
-+                "1:                                            \n\t"
-+                "vld1.16     {d2}, [%[src]]!                   \n\t"
-+                "vst1.16     {d0[0]}, [%[dst]], %[stride_dst]  \n\t"
-+                "vst1.16     {d0[1]}, [%[dst2]], %[stride_dst] \n\t"
-+                "vst1.16     {d0[2]}, [%[dst]], %[stride_dst]  \n\t"
-+                "subs        %[height], #4                     \n\t"
-+                "vst1.16     {d0[3]}, [%[dst2]], %[stride_dst] \n\t"
-+                "beq         3f                                \n\t"
-+                "vld1.16     {d0}, [%[src]]!                   \n\t"
-+                "vst1.16     {d2[0]}, [%[dst]], %[stride_dst]  \n\t"
-+                "vst1.16     {d2[1]}, [%[dst2]], %[stride_dst] \n\t"
-+                "vst1.16     {d2[2]}, [%[dst]], %[stride_dst]  \n\t"
-+                "subs        %[height], #4                     \n\t"
-+                "vst1.16     {d2[3]}, [%[dst2]], %[stride_dst] \n\t"
-+                "bne         1b                                \n\t"
-+                "2:                                            \n\t"
-+                "vst1.16     {d0[0]}, [%[dst]], %[stride_dst]  \n\t"
-+                "vst1.16     {d0[1]}, [%[dst2]], %[stride_dst] \n\t"
-+                "vst1.16     {d0[2]}, [%[dst]]                 \n\t"
-+                "vst1.16     {d0[3]}, [%[dst2]]                \n\t"
-+                "b           4f                                \n\t"
-+                "3:                                            \n\t"
-+                "vst1.16     {d2[0]}, [%[dst]], %[stride_dst]  \n\t"
-+                "vst1.16     {d2[1]}, [%[dst2]], %[stride_dst] \n\t"
-+                "vst1.16     {d2[2]}, [%[dst]]                 \n\t"
-+                "vst1.16     {d2[3]}, [%[dst2]]                \n\t"
-+                "4:                                            \n\t"
-+                :  // Outputs
-+                           [dst]"+r"(dst),
-+                          [dst2]"+r"(dst2),
-+                           [src]"+r"(src),
-+                        [height]"+r"(height)
-+                :  // Inputs
-+                    [stride_dst]"r"(stride_dst)
-+                :  // Clobbers
-+                    "cc", "memory"
-+            );
-+            break;
-+        default:
-+            __asm__ volatile (
-+                "subs        %[height], #8                     \n\t"
-+                "vld1.8      {d0}, [%[src]]!                   \n\t"
-+                "beq         2f                                \n\t"
-+                "1:                                            \n\t"
-+                "vld1.8      {d2}, [%[src]]!                   \n\t"
-+                "vst1.8      {d0[0]}, [%[dst]], %[stride_dst]  \n\t"
-+                "vst1.8      {d0[1]}, [%[dst2]], %[stride_dst] \n\t"
-+                "vst1.8      {d0[2]}, [%[dst]], %[stride_dst]  \n\t"
-+                "vst1.8      {d0[3]}, [%[dst2]], %[stride_dst] \n\t"
-+                "vst1.8      {d0[4]}, [%[dst]], %[stride_dst]  \n\t"
-+                "vst1.8      {d0[5]}, [%[dst2]], %[stride_dst] \n\t"
-+                "vst1.8      {d0[6]}, [%[dst]], %[stride_dst]  \n\t"
-+                "subs        %[height], #8                     \n\t"
-+                "vst1.8      {d0[7]}, [%[dst2]], %[stride_dst] \n\t"
-+                "beq         3f                                \n\t"
-+                "vld1.8      {d0}, [%[src]]!                   \n\t"
-+                "vst1.8      {d2[0]}, [%[dst]], %[stride_dst]  \n\t"
-+                "vst1.8      {d2[1]}, [%[dst2]], %[stride_dst] \n\t"
-+                "vst1.8      {d2[2]}, [%[dst]], %[stride_dst]  \n\t"
-+                "vst1.8      {d2[3]}, [%[dst2]], %[stride_dst] \n\t"
-+                "vst1.8      {d2[4]}, [%[dst]], %[stride_dst]  \n\t"
-+                "vst1.8      {d2[5]}, [%[dst2]], %[stride_dst] \n\t"
-+                "vst1.8      {d2[6]}, [%[dst]], %[stride_dst]  \n\t"
-+                "subs        %[height], #8                     \n\t"
-+                "vst1.8      {d2[7]}, [%[dst2]], %[stride_dst] \n\t"
-+                "bne         1b                                \n\t"
-+                "2:                                            \n\t"
-+                "vst1.8      {d0[0]}, [%[dst]], %[stride_dst]  \n\t"
-+                "vst1.8      {d0[1]}, [%[dst2]], %[stride_dst] \n\t"
-+                "vst1.8      {d0[2]}, [%[dst]], %[stride_dst]  \n\t"
-+                "vst1.8      {d0[3]}, [%[dst2]], %[stride_dst] \n\t"
-+                "vst1.8      {d0[4]}, [%[dst]], %[stride_dst]  \n\t"
-+                "vst1.8      {d0[5]}, [%[dst2]], %[stride_dst] \n\t"
-+                "vst1.8      {d0[6]}, [%[dst]]                 \n\t"
-+                "vst1.8      {d0[7]}, [%[dst2]]                \n\t"
-+                "b           4f                                \n\t"
-+                "3:                                            \n\t"
-+                "vst1.8      {d2[0]}, [%[dst]], %[stride_dst]  \n\t"
-+                "vst1.8      {d2[1]}, [%[dst2]], %[stride_dst] \n\t"
-+                "vst1.8      {d2[2]}, [%[dst]], %[stride_dst]  \n\t"
-+                "vst1.8      {d2[3]}, [%[dst2]], %[stride_dst] \n\t"
-+                "vst1.8      {d2[4]}, [%[dst]], %[stride_dst]  \n\t"
-+                "vst1.8      {d2[5]}, [%[dst2]], %[stride_dst] \n\t"
-+                "vst1.8      {d2[6]}, [%[dst]]                 \n\t"
-+                "vst1.8      {d2[7]}, [%[dst2]]                \n\t"
-+                "4:                                            \n\t"
-+                :  // Outputs
-+                           [dst]"+r"(dst),
-+                          [dst2]"+r"(dst2),
-+                           [src]"+r"(src),
-+                        [height]"+r"(height)
-+                :  // Inputs
-+                    [stride_dst]"r"(stride_dst)
-+                :  // Clobbers
-+                    "cc", "memory"
-+            );
-+            break;
-+    }
-+}
-+
-+static av_noinline void ff_hevc_rpi_copy_vert_v2v_neon(uint8_t *dst, const uint8_t *src,
-+                                                       int pixel_shift, int height,
-+                                                       ptrdiff_t stride_dst, ptrdiff_t stride_src)
-+{
-+    int x, y;
-+    switch (pixel_shift)
-+    {
-+        case 2:
-+            __asm__ volatile (
-+                "ldr         %[x], [%[src]], %[stride_src] \n\t"
-+                "ldr         %[y], [%[src]], %[stride_src] \n\t"
-+                "str         %[x], [%[dst]], %[stride_dst] \n\t"
-+                "sub         %[height], #2                 \n\t"
-+                "1:                                        \n\t"
-+                "ldr         %[x], [%[src]], %[stride_src] \n\t"
-+                "str         %[y], [%[dst]], %[stride_dst] \n\t"
-+                "ldr         %[y], [%[src]], %[stride_src] \n\t"
-+                "subs        %[height], #2                 \n\t"
-+                "str         %[x], [%[dst]], %[stride_dst] \n\t"
-+                "bne         1b                            \n\t"
-+                "str         %[y], [%[dst]]                \n\t"
-+                :  // Outputs
-+                             [x]"=&r"(x),
-+                             [y]"=&r"(y),
-+                           [src]"+r"(src),
-+                           [dst]"+r"(dst),
-+                        [height]"+r"(height)
-+                :  // Inputs
-+                    [stride_src]"r"(stride_src),
-+                    [stride_dst]"r"(stride_dst)
-+                :  // Clobbers
-+                    "cc", "memory"
-+            );
-+            break;
-+        case 1:
-+            __asm__ volatile (
-+                "ldrh        %[x], [%[src]], %[stride_src] \n\t"
-+                "ldrh        %[y], [%[src]], %[stride_src] \n\t"
-+                "strh        %[x], [%[dst]], %[stride_dst] \n\t"
-+                "sub         %[height], #2                 \n\t"
-+                "1:                                        \n\t"
-+                "ldrh        %[x], [%[src]], %[stride_src] \n\t"
-+                "strh        %[y], [%[dst]], %[stride_dst] \n\t"
-+                "ldrh        %[y], [%[src]], %[stride_src] \n\t"
-+                "subs        %[height], #2                 \n\t"
-+                "strh        %[x], [%[dst]], %[stride_dst] \n\t"
-+                "bne         1b                            \n\t"
-+                "strh        %[y], [%[dst]]                \n\t"
-+                :  // Outputs
-+                             [x]"=&r"(x),
-+                             [y]"=&r"(y),
-+                           [src]"+r"(src),
-+                           [dst]"+r"(dst),
-+                        [height]"+r"(height)
-+                :  // Inputs
-+                    [stride_src]"r"(stride_src),
-+                    [stride_dst]"r"(stride_dst)
-+                :  // Clobbers
-+                    "cc", "memory"
-+            );
-+            break;
-+        default:
-+            __asm__ volatile (
-+                "ldrb        %[x], [%[src]], %[stride_src] \n\t"
-+                "ldrb        %[y], [%[src]], %[stride_src] \n\t"
-+                "strb        %[x], [%[dst]], %[stride_dst] \n\t"
-+                "sub         %[height], #2                 \n\t"
-+                "1:                                        \n\t"
-+                "ldrb        %[x], [%[src]], %[stride_src] \n\t"
-+                "strb        %[y], [%[dst]], %[stride_dst] \n\t"
-+                "ldrb        %[y], [%[src]], %[stride_src] \n\t"
-+                "subs        %[height], #2                 \n\t"
-+                "strb        %[x], [%[dst]], %[stride_dst] \n\t"
-+                "bne         1b                            \n\t"
-+                "strb        %[y], [%[dst]]                \n\t"
-+                :  // Outputs
-+                             [x]"=&r"(x),
-+                             [y]"=&r"(y),
-+                           [src]"+r"(src),
-+                           [dst]"+r"(dst),
-+                        [height]"+r"(height)
-+                :  // Inputs
-+                    [stride_src]"r"(stride_src),
-+                    [stride_dst]"r"(stride_dst)
-+                :  // Clobbers
-+                    "cc", "memory"
-+            );
-+            break;
-+    }
-+}
-+
-+#define ff_hevc_rpi_copy_vert ff_hevc_rpi_copy_vert_neon
-+static inline void ff_hevc_rpi_copy_vert_neon(uint8_t *dst, const uint8_t *src,
-+                                              int pixel_shift, int height,
-+                                              ptrdiff_t stride_dst, ptrdiff_t stride_src)
-+{
-+    if (stride_dst == 1 << pixel_shift)
-+        ff_hevc_rpi_copy_vert_v2h_neon(dst, src, pixel_shift, height, stride_src);
-+    else if (stride_src == 1 << pixel_shift)
-+        ff_hevc_rpi_copy_vert_h2v_neon(dst, src, pixel_shift, height, stride_dst);
-+    else
-+        ff_hevc_rpi_copy_vert_v2v_neon(dst, src, pixel_shift, height, stride_dst, stride_src);
-+}
-+
-+#endif /* HAVE_NEON_INLINE */
-+
-+#endif /* AVCODEC_ARM_RPI_HEVC_MISC_H */
---- /dev/null
-+++ b/libavcodec/arm/rpi_hevc_mv_arm.h
-@@ -0,0 +1,93 @@
-+/*
-+Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
-+All rights reserved.
-+
-+Redistribution and use in source and binary forms, with or without
-+modification, are permitted provided that the following conditions are met:
-+    * Redistributions of source code must retain the above copyright
-+      notice, this list of conditions and the following disclaimer.
-+    * Redistributions in binary form must reproduce the above copyright
-+      notice, this list of conditions and the following disclaimer in the
-+      documentation and/or other materials provided with the distribution.
-+    * Neither the name of the copyright holder nor the
-+      names of its contributors may be used to endorse or promote products
-+      derived from this software without specific prior written permission.
-+
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
-+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+
-+Written by John Cox, Ben Avison
-+*/
-+
-+#ifndef AVCODEC_ARM_RPI_HEVC_MV_H
-+#define AVCODEC_ARM_RPI_HEVC_MV_H
-+
-+#if HAVE_ARMV6T2_INLINE
-+static inline MvXY mvxy_add_arm(const MvXY a, const MvXY b)
-+{
-+    MvXY r;
-+    __asm__ (
-+        "sadd16    %[r], %[a], %[b]        \n\t"
-+        : [r]"=r"(r)
-+        : [a]"r"(a),
-+          [b]"r"(b)
-+        :
-+        );
-+    return r;
-+}
-+#define mvxy_add mvxy_add_arm
-+#endif
-+
-+#if HAVE_ARMV6T2_INLINE
-+#if (defined(__ARM_ARCH_EXT_IDIV__) || defined (__ARM_FEATURE_IDIV))
-+static inline int32_t mv_scale_xy_arm(int32_t xy, int td, int tb)
-+{
-+    int t;
-+    __asm__ (
-+    "ssat   %[td], #8,    %[td]          \n\t"
-+    "ssat   %[tb], #8,    %[tb]          \n\t"
-+    "eor    %[t],  %[td], %[td], asr #31 \n\t"
-+    "adds   %[t],  %[t],  %[td], lsr #31 \n\t"
-+    "asr    %[t],  #1                    \n\t"
-+    "add    %[t],  #0x4000               \n\t"
-+    "it ne                               \n\t"
-+    "sdivne %[t],  %[t],  %[td]          \n\t"
-+    "mov    %[td], #32                   \n\t"
-+    "smlabb %[td], %[t],  %[tb], %[td]   \n\t"
-+    "ssat   %[td], #13,   %[td], asr #6  \n\t"
-+    "mov    %[tb], #127                  \n\t"
-+    "smlatb %[t],  %[xy], %[td], %[tb]   \n\t"
-+    "smlabb %[tb], %[xy], %[td], %[tb]   \n\t"
-+// This takes the sign of x & y for rounding at the "wrong" point
-+// (i.e. after adding 127) but for the range of values (-1,-127)
-+// where it does the wrong thing you get the right answer (0) anyway
-+    "add    %[t],  %[t],  %[t],  lsr #31 \n\t"
-+    "add    %[xy], %[tb], %[tb], lsr #31 \n\t"
-+    "ssat   %[t],  #16,   %[t],  asr #8  \n\t"
-+    "ssat   %[xy], #16,   %[xy], asr #8  \n\t"
-+    "pkhbt  %[xy], %[xy], %[t],  lsl #16 \n\t"
-+    :
-+         [t]"=&r"(t),
-+        [xy]"+r"(xy),
-+        [td]"+r"(td),
-+        [tb]"+r"(tb)
-+    :
-+    :
-+        "cc"
-+    );
-+    return xy;
-+}
-+#define mv_scale_xy mv_scale_xy_arm
-+#endif
-+#endif
-+
-+#endif // AVCODEC_ARM_RPI_HEVC_MV_H
-+
---- /dev/null
-+++ b/libavcodec/arm/rpi_hevcdsp_arm.h
-@@ -0,0 +1,26 @@
-+/*
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#ifndef AVCODEC_ARM_HEVCDSP_ARM_H
-+#define AVCODEC_ARM_HEVCDSP_ARM_H
-+
-+#include "libavcodec/rpi_hevcdsp.h"
-+
-+void ff_hevcdsp_rpi_init_neon(HEVCDSPContext *c, const int bit_depth);
-+
-+#endif /* AVCODEC_ARM_HEVCDSP_ARM_H */
---- /dev/null
-+++ b/libavcodec/arm/rpi_hevcdsp_deblock_neon.S
-@@ -0,0 +1,1634 @@
-+/*
-+ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
-+ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading)
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+
-+#include "libavutil/arm/asm.S"
-+#include "neon.S"
-+
-+.macro hevc_loop_filter_uv_body1 P1a, P0a, Q0a, Q1a, I1, I2, I3, I4, I5, I6, I7, I8
-+        vsubl.u8  q0, \Q0a, \P0a
-+        vsubl.u8  q1, \P1a, \Q1a
-+        vdup.16   d4, r2
-+        \I1
-+        vshl.i16  q0, #2
-+        \I2
-+        vadd.i16  q0, q1
-+        \I3
-+        vmovl.u8  q2, d4
-+        \I4
-+        vneg.s16  q1, q2
-+        \I5
-+        vrshr.s16 q0, #3
-+        \I6
-+        \I7
-+        \I8
-+        vmin.s16  q0, q2
-+        vmovl.u8  q2, \Q0a
-+        vmax.s16  q0, q1
-+        vaddw.u8  q1, q0, \P0a
-+        vsub.i16  q0, q2, q0
-+        vqmovun.s16 \P0a, q1
-+        vqmovun.s16 \Q0a, q0
-+.endm
-+
-+
-+.macro hevc_loop_filter_uv_body2 P1a, P1b, P0a, P0b, Q0a, Q0b, Q1a, Q1b, I1, I2, I3, I4, I5, I6, I7
-+        vsubl.u8  q0, \Q0a, \P0a  @ q0a - p0a
-+        lsr       r12, r2, #16
-+        vsubl.u8  q1, \Q0b, \P0b  @ q0b - p0b
-+        vsubl.u8  q2, \P1a, \Q1a  @ p1a - q1a
-+        vsubl.u8  q3, \P1b, \Q1b  @ p1b - q1b
-+        vshl.i16  q0, #2          @ (q0a - p0a) * 4
-+        vshl.i16  q1, #2          @ (q0b - p0b) * 4
-+        vadd.i16  q0, q2          @ ((q0a - p0a) * 4) + p1a - q1a
-+        vadd.i16  q1, q3          @ ((q0b - p0b) * 4) + p1b - q1b
-+        vdup.16   d4, r2          @ tc0a, tc0b
-+        vdup.16   d6, r12         @ tc1a, tc1b
-+        vrshr.s16 q0, #3          @ (((q0a - p0a) * 4) + p1a - q1a + 4) >> 3
-+        \I1
-+        vrshr.s16 q1, #3          @ (((q0b - p0b) * 4) + p1b - q1b + 4) >> 3
-+        \I2
-+        vmovl.u8  q2, d4          @ tc0a, tc0b
-+        \I3
-+        vmovl.u8  q3, d6          @ tc1a, tc1b
-+        \I4
-+        vmin.s16  q0, q2
-+        \I5
-+        vneg.s16  q2, q2          @ -tc0a, -tc0b
-+        \I6
-+        vmin.s16  q1, q3
-+        \I7
-+        vneg.s16  q3, q3          @ -tc1a, -tc1b
-+        vmax.s16  q0, q2          @ delta0a
-+        vmovl.u8  q2, \Q0a
-+        vmax.s16  q1, q3          @ delta0b
-+        vaddw.u8  q3, q0, \P0a    @ p0a + delta0a
-+        vsub.i16  q0, q2, q0      @ q0a - delta0a
-+        vmovl.u8  q2, \Q0b
-+        vsub.i16  q2, q1          @ q0b - delta0b
-+        vaddw.u8  q1, \P0b        @ p0b + delta0b
-+        vqmovun.s16 \Q0a, q0
-+        vqmovun.s16 \P0a, q3
-+        vqmovun.s16 \Q0b, q2
-+        vqmovun.s16 \P0b, q1
-+.endm
-+
-+
-+@ Preserves r12
-+@ Clobbers r2
-+@ P0a et al all contain UVUVUVUV
-+@ r2 (tc4) contains
-+@   [0..7]   tc U a
-+@   [8..15]  tc V a
-+
-+.macro hevc_loop_filter_uv_body1_16 P1a, P0a, Q0a, Q1a, bit_depth, I1, I2, I3, I4, I5, I6, I7, I8
-+        vsub.i16  q0, \Q0a, \P0a
-+        vsub.i16  q1, \P1a, \Q1a
-+        vdup.16   d4, r2
-+        \I1
-+        vshl.i16  q0, #2
-+        \I2
-+        vadd.i16  q0, q1
-+        \I3
-+        vshll.u8  q2, d4, #\bit_depth - 8
-+        \I4
-+        vneg.s16  q1, q2
-+        \I5
-+        vrshr.s16 q0, #3
-+        \I6
-+        \I7
-+        \I8
-+        vmin.s16  q0, q2
-+        vmov.i16  q2, #0
-+        vmax.s16  q0, q1
-+        vadd.i16  \P0a, q0
-+        vsub.i16  \Q0a, q0
-+        vmov.i16  q1, #(1 << \bit_depth) - 1
-+        vmax.s16  \P0a, q2
-+        vmax.s16  \Q0a, q2
-+        vmin.s16  \P0a, q1
-+        vmin.s16  \Q0a, q1
-+.endm
-+
-+@ Clobbers r2, r12
-+@ P0a et al all contain UVUVUVUV
-+@ r2 (tc4) contains
-+@   [0..7]   tc U a
-+@   [8..15]  tc V a
-+@  [16..23]  tc U b
-+@  [24..31]  tc V b
-+
-+.macro hevc_loop_filter_uv_body2_16 P1a, P1b, P0a, P0b, Q0a, Q0b, Q1a, Q1b, bit_depth, I1, I2, I3, I4, I5, I6, I7
-+        vsub.i16  q0, \Q0a, \P0a  @ q0a - p0a
-+        lsr       r12, r2, #16
-+        vsub.i16  q1, \Q0b, \P0b  @ q0b - p0b
-+        vsub.i16  q2, \P1a, \Q1a  @ p1a - q1a
-+        vsub.i16  q3, \P1b, \Q1b  @ p1b - q1b
-+        vshl.i16  q0, #2          @ (q0a - p0a) * 4
-+        vshl.i16  q1, #2          @ (q0b - p0b) * 4
-+        vadd.i16  q0, q2          @ ((q0a - p0a) * 4) + p1a - q1a
-+        vadd.i16  q1, q3          @ ((q0b - p0b) * 4) + p1b - q1b
-+        vdup.16   d4, r2          @ tc0a, tc0b
-+        vdup.16   d6, r12         @ tc1a, tc1b
-+        vrshr.s16 q0, #3          @ (((q0a - p0a) * 4) + p1a - q1a + 4) >> 3
-+        \I1
-+        vrshr.s16 q1, #3          @ (((q0b - p0b) * 4) + p1b - q1b + 4) >> 3
-+        \I2
-+        vshll.u8  q2, d4, #\bit_depth - 8 @ tc0a, tc0b
-+        \I3
-+        vshll.u8  q3, d6, #\bit_depth - 8 @ tc1a, tc1b
-+        \I4
-+        vmin.s16  q0, q2
-+        \I5
-+        vneg.s16  q2, q2          @ -tc0a, -tc0b
-+        \I6
-+        vmin.s16  q1, q3
-+        \I7
-+        vneg.s16  q3, q3          @ -tc1a, -tc1b
-+        vmax.s16  q0, q2          @ delta0a
-+        vadd.i16  \P0a, q0        @ p0a + delta0a
-+        vsub.i16  \Q0a, q0        @ q0a - delta0a
-+        vmax.s16  q1, q3          @ delta0b
-+        vadd.i16  \P0b, q1        @ p0b + delta0b
-+        vsub.i16  \Q0b, q1        @ q0b - delta0b
-+        vmov.i16  q2, #0
-+        vmov.i16  q3, #(1 << \bit_depth) - 1
-+        vmax.s16  \P0a, q2
-+        vmax.s16  \Q0a, q2
-+        vmax.s16  \P0b, q2
-+        vmax.s16  \Q0b, q2
-+        vmin.s16  \P0a, q3
-+        vmin.s16  \Q0a, q3
-+        vmin.s16  \P0b, q3
-+        vmin.s16  \Q0b, q3
-+.endm
-+
-+
-+
-+@   uint8_t *_no_p,     [sp+0]
-+@   uint8_t *_no_q)     [sp+4]
-+
-+.macro hevc_loop_filter_luma_start
-+        ldr     r12, [r3]
-+        ldr      r3, [r3, #4]
-+        orrs     r3, r12, r3, lsl #16
-+        it       eq
-+        bxeq     lr
-+        push     {r4-r10,lr}            @ 32 bytes
-+        ldrd     r4, r5, [sp, #32]      @ &_no_p
-+        ldrb     r4, [r4]
-+        ldrb     r5, [r5]
-+        movs     r10, r4
-+        it ne
-+        movne    r10, #1
-+        cmp      r5, #0
-+        it ne
-+        orrne    r10, #2
-+.endm
-+
-+@ Input:
-+@  r2          beta    (raw: needs shift for bitdepth > 8)
-+@  r3[ 0:15]   tc[0]   (raw: needs shift for bitdepth > 8)
-+@  r3[16:31]   tc[1]   (raw: needs shift for bitdepth > 8)
-+@
-+@ Input & output
-+@  8-bit: d16-d23      (Q3,Q2,Q1,Q0,P0,P1,P2,P3)
-+@ 16-bit:  q8-q15
-+@
-+@  r1         -r1
-+@  r10        b1->C, b0->N  (r10 junk)
-+@
-+@ Junks:
-+@  r5, r6, r7, r8, r9
-+
-+.macro m_filter_luma bit_depth, Q11, Q15
-+.if \bit_depth == 8
-+        vmovl.u8    q14, d22      @ q2,7 q2,6 ... q2,0 = TQ2' ... Q2' TQ2 ... Q2
-+        vmovl.u8    q13, d21      @ q1,7 q1,6 ... q1,0 = TQ1' ... Q1' TQ1 ... Q1
-+        vmovl.u8    q12, d20      @ q0,7 q0,6 ... q0,0 = TQ0' ... Q0' TQ0 ... Q0
-+        vmovl.u8    \Q11, d19     @ p0,7 p0,6 ... p0,0 = TP0' ... P0' TP0 ... P0
-+        vmovl.u8    q10, d18      @ p1,7 p1,6 ... p1,0 = TP1' ... P1' TP1 ... P1
-+        vmovl.u8    q9, d17       @ p2,7 p2,6 ... p2,0 = TP2' ... P2' TP2 ... P2
-+.endif
-+        vadd.i16    q0, q9, \Q11  @ P2 + P0
-+.if \bit_depth > 8
-+        lsl         r3, r3, #(\bit_depth - 8)
-+.endif
-+        vadd.i16    q1, q14, q12  @ Q2 + Q0
-+.if \bit_depth > 8
-+        lsl         r2, r2, #(\bit_depth - 8)
-+.endif
-+        vsub.i16    q0, q10       @ P2 - P1 + P0
-+        lsr         r5, r3, #16
-+        vsub.i16    q1, q13       @ Q2 - Q1 + Q0
-+.if \bit_depth == 8
-+        vmovl.u8    q8, d16       @ p3,7 p3,6 ... p3,0 = TP3' ... P3' TP3 ... P3
-+        vmovl.u8    \Q15, d23     @ q3,7 q3,6 ... q3,0 = TQ3' ... Q3' TQ3 ... Q3
-+.endif
-+        vabd.s16    q0, q10       @ dp0 = abs(P2 - 2 * P1 + P0)
-+        vabd.s16    q1, q13       @ dq0 = abs(Q2 - 2 * Q1 + Q0)
-+        vmov.i64    q2, #0xffffffff0000
-+        vbic        q0, q2        @ only dp0(') and dp3(')
-+        vbic        q1, q2        @ only dq0(') and dq3(')
-+        vsra.u64    q0, #16
-+        vsra.u64    q1, #16
-+        vdup.16     q3, r2        @ beta
-+        vdup.16     d14, r3       @ tC[0]
-+        vdup.16     d15, r5       @ tC[1]
-+        vabd.s16    q4, q8, \Q11  @ abs(TP3'-TP0' ... P3'-P0' TP3-TP0 ... P3-P0)
-+        vmovn.i32   d0, q0        @ dp3' dp0' dp3 dp0
-+        vmovn.i32   d1, q1        @ dq3' dq0' dq3 dq0
-+        vadd.i16    d5, d0, d1    @ d3'=dp3'+dq3' d0'=dp0'+dq0' d3=dp3+dq3 d0=dp0+dq0
-+        vabd.s16    q5, \Q11, q12 @ abs(TP0'-TQ0' ... P0'-Q0' TP0-TQ0 ... P0-Q0)
-+        vaba.s16    q4, \Q15, q12 @ +abs(TQ3'-TQ0' ... Q3'-Q0' TQ3-TQ0 ... Q3-Q0)
-+        vpadd.i16   d2, d5, d5    @ dontcare dontcare d0'+d3' d0+d3
-+        vshl.s16    q6, q7, #2    @ tC[] * 4
-+        vrhadd.s16  q6, q7        @ tc25 = (tc[] * 5 + 1) >> 1
-+        vcgt.s16    d2, d6, d2    @ if (d0 + d3 < beta)
-+        vmov        r7, s4        @ (d2) r7 = mask of blocks to apply filtering (16b/block)
-+        vshr.s16    q1, q3, #3    @ beta_3 = beta >> 3
-+        cmp         r7, #0
-+        beq         .Lbypasswrite
-+
-+        vcgt.s16    q5, q6, q5    @ if < tc25
-+        vcgt.s16    q4, q1, q4    @ if (abs({T}P3{'}-{T}P0{'})+abs({T}Q3{'}-{T}Q0{'}) < beta_3)
-+        vand        q4, q5
-+        vbic        d8, d4
-+        vbic        d9, d4
-+        vshr.s16    q3, #2        @ beta_2 = beta >> 2
-+        vsra.u64    q4, #16
-+        vshl.s16    d5, #1        @ d3'<<1 d0'<<1 d3<<1 d0<<1
-+        vshl.i16    q7, #1        @ tc2 = tC[] << 1
-+        vcgt.s16    d6, d5        @ if (d3'<<1 < beta_2) etc
-+        vmovn.i32   d8, q4        @ beta_3 && tc25 tests, prime block in ms half
-+        vand        d6, d8        @ && beta_2 tests, prime in ms half
-+        vpadd.i16   d0, d1        @ dq0'+dq3' dq0+dq3 dp0'+dp3' dp0+dp3
-+        vneg.s16    q6, q7        @ -tc2
-+        vmovn.i32   d8, q3
-+        vshrn.i32   d6, q3, #16
-+        vand        d6, d8
-+        vmov        r5, r6, d0    @ r5 = dp0'+dp3' dp0+dp3  r6 = dq0'+dq3' dq0+dq3
-+        vmov        r8, s12       @ (d6) r8 = mask of strong filtering blocks (16b/block)
-+        vadd.i16    q0, \Q11, q12 @ p0 + q0
-+        ands        r9, r7, r8
-+        beq         1f
-+
-+        vadd.i16    q2, q0, q10   @ p1 + p0 + q0
-+        vadd.i16    q3, q0, q13   @ p0 + q0 + q1
-+        lsr         r3, r9, #16
-+        vadd.i16    q1, q2, q9    @ p2 + p1 + p0 + q0 (new P1 before clipping)
-+        vadd.i16    q4, q3, q14   @ p0 + q0 + q1 + q2 (new Q1 before clipping)
-+        vadd.i16    q0, q8, q9    @ p3 + p2
-+        vadd.i16    q5, \Q15, q14 @ q2 + q3
-+        vadd.i16    q2, q1        @ p2 + 2 * p1 + 2 * p0 + 2 * q0
-+        vadd.i16    q3, q4        @ 2 * p0 + 2 * q0 + 2 * q1 + q2
-+        vshl.i16    q0, #1        @ 2 * p3 + 2 * p2
-+        vshl.i16    q5, #1        @ 2 * q2 + 2 * q3
-+        vadd.i16    q0, q1        @ 2 * p3 + 3 * p2 + p1 + p0 + q0 (new P2 before clipping)
-+        vadd.i16    q5, q4        @ p0 + q0 + q1 + 3 * q2 + 2 * q3 (new Q2 before clipping)
-+        vadd.i16    q2, q13       @ p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 (new P0 before clipping)
-+        vadd.i16    q3, q10       @ p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 (new Q0 before clipping)
-+        vrshr.s16   q0, #3        @ scale, with rounding
-+        vrshr.s16   q5, #3
-+        vrshr.s16   q1, #2
-+        vrshr.s16   q4, #2
-+        vrshr.s16   q2, #3
-+        vrshr.s16   q3, #3
-+        vsub.i16    q0, q9        @ find difference
-+        vsub.i16    q5, q14
-+        vsub.i16    q1, q10
-+        vsub.i16    q4, q13
-+        vsub.i16    q2, \Q11
-+        vsub.i16    q3, q12
-+        vmax.s16    q0, q6        @ clip difference to -tc2 .. tc2
-+        vmax.s16    q5, q6
-+        vmax.s16    q1, q6
-+        vmax.s16    q4, q6
-+        vmax.s16    q2, q6
-+        vmax.s16    q3, q6
-+        vdup.16     d12, r9       @ expand mask, reuse q6 due to register pressure
-+        vdup.16     d13, r3
-+        vmin.s16    q0, q7
-+        vmin.s16    q5, q7
-+        vmin.s16    q1, q7
-+        vmin.s16    q4, q7
-+        vmin.s16    q2, q7
-+        vmin.s16    q3, q7
-+        vadd.i16    q0, q9        @ apply difference
-+        vadd.i16    q5, q14
-+        vadd.i16    q1, q10
-+        vadd.i16    q4, q13
-+        vadd.i16    q2, \Q11
-+        vadd.i16    q3, q12
-+        vbit        q9, q0, q6    @ apply filtered values according to mask
-+        vbit        q14, q5, q6
-+        vbit        q10, q1, q6
-+        vbit        q13, q4, q6
-+        vbit        \Q11, q2, q6
-+        vbit        q12, q3, q6
-+        vneg.s16    q6, q7        @ restore -tc2
-+
-+1:
-+        bics        r9, r7, r8
-+        beq         2f
-+
-+        vsub.i16    q0, q12, \Q11 @ q0 - p0
-+        vsub.i16    q1, q13, q10  @ q1 - p1
-+        lsr         r3, r9, #16
-+        vshl.i16    q2, q0, #3
-+        lsr         r7, r5, #16
-+        vadd.i16    q3, q0, q2    @ 9 * (q0 - p0)
-+        lsr         r8, r6, #16
-+        vshl.i16    q2, q1, #1
-+        vadd.i16    q4, q1, q2    @ 3 * (q1 - p1)
-+        vshr.s16    q6, #1        @ -tc = -tc2 >> 1
-+        vsub.i16    q5, q3, q4
-+        vrhadd.s16  q1, q9, \Q11  @ (p2 + p0 + 1) >> 1
-+        vrhadd.s16  q3, q14, q12  @ (q2 + q0 + 1) >> 1
-+        vrshr.s16   q5, #4        @ delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4
-+        vsub.i16    q1, q10       @ ((p2 + p0 + 1) >> 1) - p1
-+        vsub.i16    q3, q13       @ ((q2 + q0 + 1) >> 1) - q1
-+        vmax.s16    q6, q5        @
-+        vshr.s16    q4, q7, #1    @ tc = tc2 >> 1
-+        vdup.16     q0, r2        @ beta
-+        vmin.s16    q6, q4        @ delta0 clamped to [-tc, tc]
-+        vshr.s16    q4, #1        @ tc_2 = tc >> 1
-+        vhadd.s16   q1, q6        @ (((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1
-+        vhsub.s16   q3, q6        @ (((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1
-+        vshr.s16    q2, q0, #1    @ beta >> 1
-+        vadd.i16    q2, q0        @ beta + (beta >> 1)
-+        vneg.s16    q0, q4        @ -tc_2
-+        vabs.s16    q5, q5        @ abs(original delta0)
-+        vshr.s16    q2, #3        @ (beta + (beta >> 1)) >> 3
-+        vmax.s16    q1, q0
-+        vmax.s16    q3, q0
-+        vshl.s16    q0, q7, #2    @ 8 * tc
-+        vadd.i16    q7, q0        @ 10 * tc
-+        vdup.16     d0, r9
-+        vdup.16     d1, r3        @ q0 = mask of blocks to apply filtering
-+        vmin.s16    q1, q4        @ deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2)
-+        vmin.s16    q3, q4        @ deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2)
-+        vdup.16     d8, r5        @ dp0 + dp3
-+        vdup.16     d9, r7        @ dp0' + dp3'
-+        vcgt.s16    q7, q5        @ if ((10 * tc) > abs(delta0))
-+        vdup.16     d10, r6       @ dq0 + dq3
-+        vdup.16     d11, r8       @ dq0' + dq3'
-+        vand        q7, q0        @ AND block and line masks
-+        vcgt.s16    q4, q2, q4    @ if (((beta + (beta >> 1)) >> 3) > dp0 + dp3), i.e. if (nd_p > 1)
-+        vadd.i16    q0, q1, q10   @ p1 + deltap1
-+        vcgt.s16    q5, q2, q5    @ if (((beta + (beta >> 1)) >> 3) > dq0 + dq3), i.e. if (nd_q > 1)
-+        vadd.i16    q3, q3, q13   @ q1 + deltaq1
-+        vadd.i16    q1, \Q11, q6  @ p0 + delta0
-+        vsub.i16    q2, q12, q6   @ q0 - delta0
-+        vand        q4, q7        @ AND nd_p test with block/line masks
-+        vand        q5, q7        @ AND nd_q test with block/line masks
-+        vbit        q10, q0, q4
-+        vbit        \Q11, q1, q7
-+        vbit        q12, q2, q7
-+        vbit        q13, q3, q5
-+
-+2:
-+.if \bit_depth == 8
-+        vmovn.i16 d16, q8
-+        vmovn.i16 d23, \Q15
-+        neg       r1, r1
-+        vqmovun.s16 d17, q9
-+        vqmovun.s16 d18, q10
-+        vqmovun.s16 d19, \Q11
-+        lsls      r10, #31
-+        vqmovun.s16 d20, q12
-+        vqmovun.s16 d21, q13
-+        vqmovun.s16 d22, q14
-+.else
-+        vmov.i16  q0, #0
-+        vmov.i16  q1, #(1 << \bit_depth - 1)
-+        @ q8 & q15 should be unaltered and so don't require clipping
-+        neg       r1, r1
-+        vmax.s16  q9,  q0
-+        vmax.s16  q10, q0
-+        vmax.s16  q11, q0
-+        vmax.s16  q12, q0
-+        vmax.s16  q13, q0
-+        vmax.s16  q14, q0
-+        lsls      r10, #31
-+        vmin.s16  q9,  q1
-+        vmin.s16  q10, q1
-+        vmin.s16  q11, q1
-+        vmin.s16  q12, q1
-+        vmin.s16  q13, q1
-+        vmin.s16  q14, q1
-+.endif
-+        bx        lr
-+.endm
-+
-+function hevc_loop_filter_luma_body
-+        m_filter_luma 8, q15, q11
-+endfunc
-+
-+@ void ff_hevc_rpi_v_loop_filter_luma_neon_8(
-+@   uint8_t *_pix,      [r0]
-+@   ptrdiff_t _stride,  [r1]
-+@   int _beta,          [r2]
-+@   int *_tc,           [r3]
-+@   uint8_t *_no_p,     [sp+0]
-+@   uint8_t *_no_q)     [sp+4]
-+
-+function ff_hevc_rpi_v_loop_filter_luma_neon_8, export=1
-+        hevc_loop_filter_luma_start
-+
-+        sub      r4, r0, #4
-+        b        .Lv_loop_luma_common
-+endfunc
-+
-+@ void ff_hevc_rpi_v_loop_filter2_luma_neon(
-+@   uint8_t * pix_r,    [r0]
-+@   ptrdiff_t _stride,  [r1]
-+@   int _beta,          [r2]
-+@   int tc2,            [r3]
-+@   int no_f,           [sp+0]
-+@   uint8_t * pix_l)    [sp+4]
-+
-+function ff_hevc_rpi_v_loop_filter_luma2_neon_8, export=1
-+        cmp      r3, #0
-+        it       eq
-+        bxeq     lr
-+        push     {r4-r10,lr}            @ 32 bytes
-+        ldr      r4, [sp, #36]
-+        ldr      r10, [sp, #32]
-+
-+.Lv_loop_luma_common:
-+        vpush    {d8-d15}
-+
-+        @ It's slightly faster to do unlaned loads and transpose in the
-+        @ 8-bit case, even though it needs more instructions, because
-+        @ VLD4.8 is a really slow way to read from memory.
-+        vld1.32 {d16[0]}, [r4:32], r1
-+        vld1.32 {d20[0]}, [r0:32], r1
-+        vld1.32 {d16[1]}, [r4:32], r1
-+        vld1.32 {d20[1]}, [r0:32], r1
-+        vld1.32 {d17[0]}, [r4:32], r1
-+        vld1.32 {d21[0]}, [r0:32], r1
-+        vld1.32 {d17[1]}, [r4:32], r1
-+        vld1.32 {d21[1]}, [r0:32], r1
-+        vld1.32 {d18[0]}, [r4:32], r1
-+        vld1.32 {d22[0]}, [r0:32], r1
-+        vld1.32 {d18[1]}, [r4:32], r1
-+        vld1.32 {d22[1]}, [r0:32], r1
-+        vld1.32 {d19[0]}, [r4:32], r1
-+        vld1.32 {d23[0]}, [r0:32], r1
-+        vld1.32 {d19[1]}, [r4:32]
-+        vld1.32 {d23[1]}, [r0:32]
-+        vuzp.16 q8, q9
-+        vuzp.16 q10, q11
-+        vuzp.8  q8, q9
-+        vuzp.8  q10, q11
-+        vswp    d17, d18
-+        vswp    d21, d22
-+
-+        bl hevc_loop_filter_luma_body
-+
-+        add     r6, r4, r1
-+        add     r2, r0, r1
-+        lsl     r1, #1
-+
-+        vpop     {d8-d15}
-+
-+        @ no_p[1]
-+        bmi     1f
-+        vst4.8  {d16[7],d17[7],d18[7],d19[7]}, [r4:32], r1
-+        vst4.8  {d16[6],d17[6],d18[6],d19[6]}, [r6:32], r1
-+        vst4.8  {d16[5],d17[5],d18[5],d19[5]}, [r4:32], r1
-+        vst4.8  {d16[4],d17[4],d18[4],d19[4]}, [r6:32], r1
-+
-+        vst4.8  {d16[3],d17[3],d18[3],d19[3]}, [r4:32], r1
-+        vst4.8  {d16[2],d17[2],d18[2],d19[2]}, [r6:32], r1
-+        vst4.8  {d16[1],d17[1],d18[1],d19[1]}, [r4:32], r1
-+        vst4.8  {d16[0],d17[0],d18[0],d19[0]}, [r6:32]
-+1:
-+        @ no_q[1]
-+        bcs     1f
-+        vst4.8  {d20[7],d21[7],d22[7],d23[7]}, [r0:32], r1
-+        vst4.8  {d20[6],d21[6],d22[6],d23[6]}, [r2:32], r1
-+        vst4.8  {d20[5],d21[5],d22[5],d23[5]}, [r0:32], r1
-+        vst4.8  {d20[4],d21[4],d22[4],d23[4]}, [r2:32], r1
-+
-+        vst4.8  {d20[3],d21[3],d22[3],d23[3]}, [r0:32], r1
-+        vst4.8  {d20[2],d21[2],d22[2],d23[2]}, [r2:32], r1
-+        vst4.8  {d20[1],d21[1],d22[1],d23[1]}, [r0:32], r1
-+        vst4.8  {d20[0],d21[0],d22[0],d23[0]}, [r2:32]
-+1:
-+        pop      {r4-r10,pc}
-+
-+.Lbypasswrite:
-+        vpop     {d8-d15}
-+        pop      {r4-r10,pc}
-+endfunc
-+
-+.macro m_filter_v_luma_16 bit_depth
-+        vpush    {d8-d15}
-+
-+        @ Uses slightly fewer instructions to do laned loads than unlaned
-+        @ and transpose.  This also means that we can use the same code for
-+        @ both split & unsplit deblock
-+        vld4.16  {d16[0], d18[0], d20[0], d22[0]}, [r4], r1
-+        vld4.16  {d24[0], d26[0], d28[0], d30[0]}, [r0], r1
-+
-+        vld4.16  {d16[1], d18[1], d20[1], d22[1]}, [r4], r1
-+        vld4.16  {d24[1], d26[1], d28[1], d30[1]}, [r0], r1
-+
-+        vld4.16  {d16[2], d18[2], d20[2], d22[2]}, [r4], r1
-+        vld4.16  {d24[2], d26[2], d28[2], d30[2]}, [r0], r1
-+
-+        vld4.16  {d16[3], d18[3], d20[3], d22[3]}, [r4], r1
-+        vld4.16  {d24[3], d26[3], d28[3], d30[3]}, [r0], r1
-+
-+        vld4.16  {d17[0], d19[0], d21[0], d23[0]}, [r4], r1
-+        vld4.16  {d25[0], d27[0], d29[0], d31[0]}, [r0], r1
-+
-+        vld4.16  {d17[1], d19[1], d21[1], d23[1]}, [r4], r1
-+        vld4.16  {d25[1], d27[1], d29[1], d31[1]}, [r0], r1
-+
-+        vld4.16  {d17[2], d19[2], d21[2], d23[2]}, [r4], r1
-+        vld4.16  {d25[2], d27[2], d29[2], d31[2]}, [r0], r1
-+
-+        vld4.16  {d17[3], d19[3], d21[3], d23[3]}, [r4]
-+        vld4.16  {d25[3], d27[3], d29[3], d31[3]}, [r0]
-+
-+        bl hevc_loop_filter_luma_body_\bit_depth
-+
-+        add      r6, r4, r1
-+        add      r2, r0, r1
-+        lsl      r1, #1
-+
-+        vpop     {d8-d15}
-+
-+        @ p[1]
-+        bmi      1f
-+        vst4.16  {d17[3], d19[3], d21[3], d23[3]}, [r4], r1
-+        vst4.16  {d17[2], d19[2], d21[2], d23[2]}, [r6], r1
-+        vst4.16  {d17[1], d19[1], d21[1], d23[1]}, [r4], r1
-+        vst4.16  {d17[0], d19[0], d21[0], d23[0]}, [r6], r1
-+        vst4.16  {d16[3], d18[3], d20[3], d22[3]}, [r4], r1
-+        vst4.16  {d16[2], d18[2], d20[2], d22[2]}, [r6], r1
-+        vst4.16  {d16[1], d18[1], d20[1], d22[1]}, [r4], r1
-+        vst4.16  {d16[0], d18[0], d20[0], d22[0]}, [r6]
-+1:
-+        @ q[1]
-+        bcs      1f
-+        vst4.16  {d25[3], d27[3], d29[3], d31[3]}, [r0], r1
-+        vst4.16  {d25[2], d27[2], d29[2], d31[2]}, [r2], r1
-+        vst4.16  {d25[1], d27[1], d29[1], d31[1]}, [r0], r1
-+        vst4.16  {d25[0], d27[0], d29[0], d31[0]}, [r2], r1
-+        vst4.16  {d24[3], d26[3], d28[3], d30[3]}, [r0], r1
-+        vst4.16  {d24[2], d26[2], d28[2], d30[2]}, [r2], r1
-+        vst4.16  {d24[1], d26[1], d28[1], d30[1]}, [r0], r1
-+        vst4.16  {d24[0], d26[0], d28[0], d30[0]}, [r2]
-+1:
-+        pop      {r4-r10,pc}
-+.endm
-+
-+
-+
-+
-+@ void (*hevc_h_loop_filter_luma)(uint8_t *pix,     [r0]
-+@                                 ptrdiff_t stride, [r1]
-+@                                 int beta,         [r2]
-+@                                 int32_t *tc,      [r3]
-+@                                 uint8_t *no_p,    sp[0]
-+@                                 uint8_t *no_q);   sp[4]
-+@
-+@ Src should always be on 8 byte boundry & all in the same slice
-+
-+function ff_hevc_rpi_h_loop_filter_luma_neon_8, export=1
-+        hevc_loop_filter_luma_start
-+        b        .Lh_loop_filter_luma_common_8
-+endfunc
-+
-+function ff_hevc_rpi_h_loop_filter_luma2_neon_8, export=1
-+        cmp      r3, #0
-+        it       eq
-+        bxeq     lr
-+        push     {r4-r10,lr}            @ 32 bytes
-+        ldr      r10, [sp, #32]
-+
-+.Lh_loop_filter_luma_common_8:
-+        sub      r4, r0, r1, lsl #2
-+        add      r0, r4, r1
-+        lsl      r1, #1
-+        vpush    {d8-d15}
-+
-+        vld1.8  {d16}, [r4], r1
-+        vld1.8  {d17}, [r0], r1
-+        vld1.8  {d18}, [r4], r1
-+        vld1.8  {d19}, [r0], r1
-+        vld1.8  {d20}, [r4], r1
-+        vld1.8  {d21}, [r0], r1
-+        vld1.8  {d22}, [r4]
-+        vld1.8  {d23}, [r0]
-+
-+        bl hevc_loop_filter_luma_body
-+
-+        add      r0, r0, r1, lsl #1
-+        add      r2, r4, r1, lsl #1
-+        add      r6, r4, r1, asr #1
-+        vpop     {d8-d15}
-+
-+        @ P2-P0
-+        bcs      1f
-+        vst1.8   {d22}, [r4], r1
-+        vst1.8   {d21}, [r6]
-+        vst1.8   {d20}, [r4]
-+1:
-+        @ Q0-Q2
-+        bmi      1f
-+        vst1.8   {d19}, [r0], r1
-+        vst1.8   {d18}, [r2]
-+        vst1.8   {d17}, [r0]
-+1:
-+        pop      {r4-r10,pc}
-+endfunc
-+
-+
-+.macro m_filter_h_luma_16 bit_depth
-+        sub      r4, r0, r1, lsl #2
-+        add      r0, r4, r1
-+        lsl      r1, #1
-+        vpush    {d8-d15}
-+
-+        vld1.16 { q8}, [r4], r1
-+        vld1.16 { q9}, [r0], r1
-+        vld1.16 {q10}, [r4], r1
-+        vld1.16 {q11}, [r0], r1
-+        vld1.16 {q12}, [r4], r1
-+        vld1.16 {q13}, [r0], r1
-+        vld1.16 {q14}, [r4]
-+        vld1.16 {q15}, [r0]
-+
-+        bl hevc_loop_filter_luma_body_\bit_depth
-+
-+        add      r0, r0, r1, lsl #1
-+        add      r2, r4, r1, lsl #1
-+        add      r6, r4, r1, asr #1
-+        vpop     {d8-d15}
-+
-+        @ P2-P0
-+        bcs      1f
-+        vst1.16  {q14}, [r4], r1
-+        vst1.16  {q13}, [r6]
-+        vst1.16  {q12}, [r4]
-+1:
-+        bmi      1f
-+        vst1.16  {q11}, [r0], r1
-+        vst1.16  {q10}, [r2]
-+        vst1.16  { q9}, [r0]
-+1:
-+        pop      {r4-r10,pc}
-+.endm
-+
-+
-+@ void ff_hevc_rpi_h_loop_filter_uv_neon(uint8_t * src_r,        // r0
-+@                                     unsigned int stride,   // r1
-+@                                     uint32_t tc4,          // r2
-+@                                     unsigned int no_f);    // r3
-+@
-+@ no_f
-+@ 0  tl P0
-+@ 1  tr P1
-+@ 2  bl Q0
-+@ 3  br Q1
-+@
-+@ Probably not worth having the P/Qa only special case in this direction
-+@ Given layout we won't save any memory reads or avoid any cache dirtying
-+@ We would save a bit of computation but I expect the partials to be less
-+@ common in the H direction than V due to how we arrange deblock.
-+
-+function ff_hevc_rpi_h_loop_filter_uv_neon_8, export=1
-+        sub      r12, r0, r1
-+        cmp      r2, #0
-+        it eq
-+        bxeq     lr
-+        vld1.8   {d26,d27}, [r0]
-+        lsl      r1, #1
-+        sub      r0, r1
-+        vld1.8   {d18,d19}, [r12], r1
-+        vld1.8   {d16,d17}, [r0], r1
-+        vld1.8   {d28,d29}, [r12]
-+
-+        hevc_loop_filter_uv_body2 d16, d17, d18, d19, d26, d27, d28, d29, \
-+        "sub      r12, r0, r1, asr #1"
-+
-+        lsls     r3, #29                @ b2 -> N, b3 -> C
-+        it pl
-+        vstrpl   d26, [r0, #0]
-+        it cc
-+        vstrcc   d27, [r0, #8]
-+        lsls     r3, #2                 @ b0 -> N, b1 -> C
-+        it pl
-+        vstrpl   d18, [r12, #0]
-+        it cc
-+        vstrcc   d19, [r12, #8]
-+        bx       lr
-+
-+endfunc
-+
-+
-+@ void ff_hevc_rpi_h_loop_filter_uv_neon_10(uint8_t * src_r,     // r0
-+@                                     unsigned int stride,   // r1
-+@                                     uint32_t tc4,          // r2
-+@                                     unsigned int no_f);    // r3
-+@
-+@ no-F = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1]
-+@
-+@ Macro here actual function near bottom
-+
-+.macro m_filter_h_uv_16 bit_depth
-+        sub      r12, r0, r1
-+        cmp      r2, #0
-+        it eq
-+        bxeq     lr
-+        vld1.16  {q12, q13}, [r0]
-+        lsl      r1, #1
-+        sub      r0, r1
-+        vld1.16  {q10, q11}, [r12], r1
-+        vld1.16  {q8,  q9 }, [r0], r1
-+        vld1.16  {q14, q15}, [r12]
-+
-+        hevc_loop_filter_uv_body2_16  q8, q9, q10, q11, q12, q13, q14, q15, \bit_depth, \
-+        "sub      r12, r0, r1, asr #1", \
-+        "cmp      r3, #0"
-+
-+        bne      1f
-+        vst1.16  {q10, q11}, [r12]
-+        vst1.16  {q12, q13}, [r0]
-+        bx       lr
-+
-+        @ At least one no_f bit is set
-+        @ Which means we need to break this apart in an ugly fashion
-+1:
-+        lsls     r3, #29                @ b2 -> N, b3 -> C
-+        itt pl
-+        vstrpl   d24, [r0, #0]
-+        vstrpl   d25, [r0, #8]
-+        itt cc
-+        vstrcc   d26, [r0, #16]
-+        vstrcc   d27, [r0, #24]
-+        lsls     r3, #2                 @ b0 -> N, b1 -> C
-+        itt pl
-+        vstrpl   d20, [r12, #0]
-+        vstrpl   d21, [r12, #8]
-+        itt cc
-+        vstrcc   d22, [r12, #16]
-+        vstrcc   d23, [r12, #24]
-+        bx       lr
-+.endm
-+
-+
-+@ void ff_hevc_rpi_v_loop_filter_uv2_neon(uint8_t * src_r,       // r0
-+@                                     unsigned int stride,   // r1
-+@                                     uint32_t tc4,          // r2
-+@                                     uint8_t * src_l,       // r3
-+@                                     unsigned int no_f);   // sp[0]
-+@
-+@ no_f:
-+@ 0  tl P0
-+@ 1  tr Q0
-+@ 2  bl P1
-+@ 3  br Q1
-+
-+function ff_hevc_rpi_v_loop_filter_uv2_neon_8, export=1
-+        cmp      r2, #0
-+        it eq
-+        bxeq     lr
-+        push     {lr}
-+        vld2.16  {d16[0], d18[0]}, [r3], r1
-+        vld2.16  {d20[0], d22[0]}, [r0], r1
-+
-+        cmp      r2, #0x10000
-+        vld2.16  {d16[1], d18[1]}, [r3], r1
-+        vld2.16  {d20[1], d22[1]}, [r0], r1
-+
-+        vld2.16  {d16[2], d18[2]}, [r3], r1
-+        vld2.16  {d20[2], d22[2]}, [r0], r1
-+
-+        vld2.16  {d16[3], d18[3]}, [r3], r1
-+        vld2.16  {d20[3], d22[3]}, [r0], r1
-+        blo      10f
-+
-+        vld2.16  {d17[0], d19[0]}, [r3], r1
-+        vld2.16  {d21[0], d23[0]}, [r0], r1
-+
-+        sub      ip, r0, r3
-+        vld2.16  {d17[1], d19[1]}, [r3], r1
-+        vld2.16  {d21[1], d23[1]}, [r0], r1
-+
-+        cmp      ip, #4
-+        vld2.16  {d17[2], d19[2]}, [r3], r1
-+        vld2.16  {d21[2], d23[2]}, [r0], r1
-+
-+        vld2.16  {d17[3], d19[3]}, [r3]
-+        vld2.16  {d21[3], d23[3]}, [r0]
-+
-+        hevc_loop_filter_uv_body2 d16, d17, d18, d19, d20, d21, d22, d23 \
-+        "ldr      lr, [sp, #4]", \
-+        "neg      r1, r1",       \
-+        "it eq; cmpeq lr, #0",   \
-+        "add      r3, #2",       \
-+        "add      ip, r3, r1",   \
-+        "add      r2, r0, r1",   \
-+        "lsl      r1, #1"
-+
-+        bne      1f
-+
-+@ Much/most of the time r0 == r3 + 4 and no_f == 0
-+@ so it is worth having this special case
-+        vst2.16   {d19[3], d21[3]}, [r3], r1    @ P0b, Q0b
-+        vst2.16   {d19[2], d21[2]}, [ip], r1
-+        vst2.16   {d19[1], d21[1]}, [r3], r1
-+        vst2.16   {d19[0], d21[0]}, [ip], r1
-+        vst2.16   {d18[3], d20[3]}, [r3], r1    @ P0a, Q0a
-+        vst2.16   {d18[2], d20[2]}, [ip], r1
-+        vst2.16   {d18[1], d20[1]}, [r3]
-+        vst2.16   {d18[0], d20[0]}, [ip]
-+        pop       {pc}
-+
-+@ Either split or partial
-+1:
-+        lsls     lr, #29               @ b3 (Q0b) -> C, b2 (P0b) -> N & b31, b1 (Q0a) -> b30, b0 (P0a) -> b29
-+        ittt cs
-+        addcs    r0, r0, r1, lsl #1
-+        addcs    r2, r2, r1, lsl #1
-+        bcs      1f
-+        @ Q0b
-+        vst1.16  {d21[3]}, [r0], r1
-+        vst1.16  {d21[2]}, [r2], r1
-+        vst1.16  {d21[1]}, [r0], r1
-+        vst1.16  {d21[0]}, [r2], r1
-+1:
-+        ittt mi
-+        addmi    r3, r3, r1, lsl #1
-+        addmi    ip, ip, r1, lsl #1
-+        bmi      1f
-+        @ P0b
-+        vst1.16  {d19[3]}, [r3], r1
-+        vst1.16  {d19[2]}, [ip], r1
-+        vst1.16  {d19[1]}, [r3], r1
-+        vst1.16  {d19[0]}, [ip], r1
-+1:
-+        lsls     lr, #2                @ b30 (Q0a) -> C, b29 (P0a) -> N & b31
-+        bcs      1f
-+        @ Q0a
-+        vst1.16  {d20[3]}, [r0], r1
-+        vst1.16  {d20[2]}, [r2], r1
-+        vst1.16  {d20[1]}, [r0]
-+        vst1.16  {d20[0]}, [r2]
-+1:
-+        it       mi
-+        popmi    {pc}
-+        @ P0a
-+        vst1.16  {d18[3]}, [r3], r1
-+        vst1.16  {d18[2]}, [ip], r1
-+        vst1.16  {d18[1]}, [r3]
-+        vst1.16  {d18[0]}, [ip]
-+        pop      {pc}
-+
-+@ Single lump (rather than double)
-+10:
-+        @ As we have post inced r0/r3 in the load the easiest thing to do is
-+        @ to subtract and write forwards, rather than backwards (as above)
-+        @ b0 (P0a) -> N, b1 (Q0a) -> C
-+
-+        hevc_loop_filter_uv_body1 d16, d18, d20, d22 \
-+        "ldr      lr, [sp, #4]",       \
-+        "add      r3, #2",             \
-+        "sub      r0, r0, r1, lsl #2", \
-+        "sub      r3, r3, r1, lsl #2", \
-+        "lsls     lr, #31",            \
-+        "add      r2, r0, r1",         \
-+        "add      ip, r3, r1",         \
-+        "lsl      r1, #1"
-+
-+        bcs      3f
-+        @ Q0a
-+        vst1.16  {d20[0]}, [r0], r1
-+        vst1.16  {d20[1]}, [r2], r1
-+        vst1.16  {d20[2]}, [r0]
-+        vst1.16  {d20[3]}, [r2]
-+3:
-+        it       mi
-+        popmi    {pc}
-+        @ P0a
-+        vst1.16  {d18[0]}, [r3], r1
-+        vst1.16  {d18[1]}, [ip], r1
-+        vst1.16  {d18[2]}, [r3]
-+        vst1.16  {d18[3]}, [ip]
-+        pop      {pc}
-+
-+endfunc
-+
-+
-+@ void ff_hevc_rpi_v_loop_filter_uv2_neon(uint8_t * src_r,       // r0
-+@                                     unsigned int stride,   // r1
-+@                                     uint32_t tc4,          // r2
-+@                                     uint8_t * src_l,       // r3
-+@                                     unsigned int no_f);   // sp[0]
-+@
-+
-+@ no_f
-+@ 0  tl P0a
-+@ 1  tr Q0a
-+@ 2  bl P0b
-+@ 3  br Q0b
-+
-+@ P1: q8,  q12
-+@ P0: q9,  q13
-+@ Q0: q10, q14
-+@ Q1: q11, q15
-+
-+.macro m_filter_v_uv2_16 bit_depth
-+        cmp      r2, #0
-+        it eq
-+        bxeq     lr
-+        push     {lr}
-+        vld2.32  {d16[0], d18[0]}, [r3], r1
-+        vld2.32  {d20[0], d22[0]}, [r0], r1
-+
-+        cmp      r2, #0x10000
-+        vld2.32  {d16[1], d18[1]}, [r3], r1
-+        vld2.32  {d20[1], d22[1]}, [r0], r1
-+
-+        vld2.32  {d17[0], d19[0]}, [r3], r1
-+        vld2.32  {d21[0], d23[0]}, [r0], r1
-+
-+        vld2.32  {d17[1], d19[1]}, [r3], r1
-+        vld2.32  {d21[1], d23[1]}, [r0], r1
-+        blo      10f
-+
-+        vld2.32  {d24[0], d26[0]}, [r3], r1
-+        vld2.32  {d28[0], d30[0]}, [r0], r1
-+
-+        sub      ip, r0, r3
-+        vld2.32  {d24[1], d26[1]}, [r3], r1
-+        vld2.32  {d28[1], d30[1]}, [r0], r1
-+
-+        cmp      ip, #8
-+        vld2.32  {d25[0], d27[0]}, [r3], r1
-+        vld2.32  {d29[0], d31[0]}, [r0], r1
-+
-+        vld2.32  {d25[1], d27[1]}, [r3]
-+        vld2.32  {d29[1], d31[1]}, [r0]
-+
-+        hevc_loop_filter_uv_body2_16  q8, q12, q9, q13, q10, q14, q11, q15, \bit_depth, \
-+        "ldr      lr, [sp, #4]", \
-+        "neg      r1, r1",       \
-+        "it eq; cmpeq lr, #0",   \
-+        "add      r3, #4",       \
-+        "add      ip, r3, r1",   \
-+        "add      r2, r0, r1",   \
-+        "lsl      r1, #1"
-+
-+        bne      1f
-+
-+@ Much/most of the time r0 == r3 + 8 and no_f == 0
-+@ so it is worth having this special case
-+        vst2.32   {d27[1], d29[1]}, [r3], r1    @ P0b, Q0b
-+        vst2.32   {d27[0], d29[0]}, [ip], r1
-+        vst2.32   {d26[1], d28[1]}, [r3], r1
-+        vst2.32   {d26[0], d28[0]}, [ip], r1
-+        vst2.32   {d19[1], d21[1]}, [r3], r1    @ P0a, Q0a
-+        vst2.32   {d19[0], d21[0]}, [ip], r1
-+        vst2.32   {d18[1], d20[1]}, [r3]
-+        vst2.32   {d18[0], d20[0]}, [ip]
-+        pop       {pc}
-+
-+@ Either split or partial
-+1:
-+        lsls     lr, #29               @ b3 (Q0b) -> C, b2 (P0b) -> N & b31, b1 (Q0a) -> b30, b0 (P0a) -> b29
-+        ittt cs
-+        addcs    r0, r0, r1, lsl #1
-+        addcs    r2, r2, r1, lsl #1
-+        bcs      1f
-+        @ Q0b
-+        vst1.32  {d29[1]}, [r0], r1
-+        vst1.32  {d29[0]}, [r2], r1
-+        vst1.32  {d28[1]}, [r0], r1
-+        vst1.32  {d28[0]}, [r2], r1
-+1:
-+        ittt mi
-+        addmi    r3, r3, r1, lsl #1
-+        addmi    ip, ip, r1, lsl #1
-+        bmi      1f
-+        @ P0b
-+        vst1.32  {d27[1]}, [r3], r1
-+        vst1.32  {d27[0]}, [ip], r1
-+        vst1.32  {d26[1]}, [r3], r1
-+        vst1.32  {d26[0]}, [ip], r1
-+1:
-+        lsls     lr, #2                @ b30 (Q0a) -> C, b29 (P0a) -> N & b31
-+        bcs      1f
-+        @ Q0a
-+        vst1.32  {d21[1]}, [r0], r1
-+        vst1.32  {d21[0]}, [r2], r1
-+        vst1.32  {d20[1]}, [r0]
-+        vst1.32  {d20[0]}, [r2]
-+1:
-+        it       mi
-+        popmi    {pc}
-+        @ P0a
-+        vst1.32  {d19[1]}, [r3], r1
-+        vst1.32  {d19[0]}, [ip], r1
-+        vst1.32  {d18[1]}, [r3]
-+        vst1.32  {d18[0]}, [ip]
-+        pop      {pc}
-+
-+@ Single lump (rather than double)
-+10:
-+        @ As we have post inced r0/r3 in the load the easiest thing to do is
-+        @ to subtract and write forwards, rather than backwards (as above)
-+        @ b0 (P0a) -> N, b1 (Q0a) -> C
-+
-+        hevc_loop_filter_uv_body1_16  q8, q9, q10, q11, \bit_depth, \
-+        "ldr      lr, [sp, #4]",       \
-+        "add      r3, #4",             \
-+        "sub      r0, r0, r1, lsl #2", \
-+        "sub      r3, r3, r1, lsl #2", \
-+        "lsls     lr, #31",            \
-+        "add      r2, r0, r1",         \
-+        "add      ip, r3, r1",         \
-+        "lsl      r1, #1"
-+
-+        bcs      3f
-+        @ Q0a
-+        vst1.32  {d20[0]}, [r0], r1
-+        vst1.32  {d20[1]}, [r2], r1
-+        vst1.32  {d21[0]}, [r0]
-+        vst1.32  {d21[1]}, [r2]
-+3:
-+        it       mi
-+        popmi    {pc}
-+        @ P0a
-+        vst1.32  {d18[0]}, [r3], r1
-+        vst1.32  {d18[1]}, [ip], r1
-+        vst1.32  {d19[0]}, [r3]
-+        vst1.32  {d19[1]}, [ip]
-+        pop      {pc}
-+.endm
-+
-+
-+@ The NEON version is faster under ideal circumstances (i.e. everything in L1)
-+@ But in real world testing it is ~20% slower, presumably due to code size
-+
-+#if 0 // NEON version
-+
-+/* uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const HEVCRpiMvField *curr, const HEVCRpiMvField *neigh,
-+ *                                            const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
-+ *                                            int in_inc0, int in_inc1)
-+ */
-+function ff_hevc_rpi_deblocking_boundary_strengths_neon, export=1
-+        mov         ip, sp
-+        push        {a1-a3,v1-v8,lr}
-+        ldm         ip, {v1-v6}
-+        cmp         a1, #2
-+        bls         2f
-+        vpush       {d8-d13}
-+        sub         v5, v5, #10
-+        sub         v6, v6, #10
-+1:
-+        vld2.32     {d0[0], d2[0]}, [a3]!
-+        vld2.32     {d4[0], d6[0]}, [a4]!
-+          vmov.u8     q12, #0
-+        ldrb        a2, [a3], #1
-+        ldrb        ip, [a4], #1
-+        ldrb        v8, [a3], #1
-+        ldrb        lr, [a4], #1
-+        add         a2, v1, a2, lsl #2
-+        vld1.8      {d24[0]}, [a3], v5
-+        add         ip, v3, ip, lsl #2
-+        vld1.8      {d25[0]}, [a4], v6
-+        add         v8, v2, v8, lsl #2
-+        vld1.32     {d16[0]}, [a2]
-+        add         lr, v4, lr, lsl #2
-+        vld1.32     {d20[0]}, [ip]
-+        vld1.32     {d18[0]}, [v8]
-+        vld1.32     {d22[0]}, [lr]
-+
-+        vld2.32     {d0[1], d2[1]}, [a3]!
-+        vld2.32     {d4[1], d6[1]}, [a4]!
-+        ldrb        a2, [a3], #1
-+          vmov.u16    d12, #1
-+        ldrb        ip, [a4], #1
-+          vmov.u16    d13, #2
-+        ldrb        v8, [a3], #1
-+          vmov.u16    d27, #4
-+        ldrb        lr, [a4], #1
-+        add         a2, v1, a2, lsl #2
-+        vld1.8      {d24[2]}, [a3], v5
-+        add         ip, v3, ip, lsl #2
-+        vld1.8      {d25[2]}, [a4], v6
-+        add         v8, v2, v8, lsl #2
-+        vld1.32     {d16[1]}, [a2]
-+        add         lr, v4, lr, lsl #2
-+        vld1.32     {d20[1]}, [ip]
-+        vld1.32     {d18[1]}, [v8]
-+        vld1.32     {d22[1]}, [lr]
-+
-+        vld2.32     {d1[0], d3[0]}, [a3]!
-+        vld2.32     {d5[0], d7[0]}, [a4]!
-+        ldrb        a2, [a3], #1
-+        ldrb        ip, [a4], #1
-+        ldrb        lr, [a4], #1
-+        ldrb        v8, [a3], #1
-+        add         a2, v1, a2, lsl #2
-+        vld1.8      {d24[4]}, [a3], v5
-+        add         ip, v3, ip, lsl #2
-+        vld1.8      {d25[4]}, [a4], v6
-+        add         v8, v2, v8, lsl #2
-+        vld1.32     {d17[0]}, [a2]
-+        add         lr, v4, lr, lsl #2
-+        vld1.32     {d21[0]}, [ip]
-+        vld1.32     {d19[0]}, [v8]
-+        vld1.32     {d23[0]}, [lr]
-+
-+        vld2.32     {d1[1], d3[1]}, [a3]!
-+        vld2.32     {d5[1], d7[1]}, [a4]!
-+        ldrb        a2, [a3], #1
-+        ldrb        ip, [a4], #1
-+        ldrb        v8, [a3], #1
-+        ldrb        lr, [a4], #1
-+        add         a2, v1, a2, lsl #2
-+        vld1.8      {d24[6]}, [a3], v5
-+        add         ip, v3, ip, lsl #2
-+        vld1.8      {d25[6]}, [a4], v6
-+        add         v8, v2, v8, lsl #2
-+        vld1.32     {d17[1]}, [a2]
-+        add         lr, v4, lr, lsl #2
-+        vld1.32     {d21[1]}, [ip]
-+        vld1.32     {d19[1]}, [v8]
-+        vld1.32     {d23[1]}, [lr]
-+
-+        @ So now we have:
-+        @ q0.32[i]  = curr[i].mv[0]
-+        @ q1.32[i]  = curr[i].mv[1]
-+        @ q2.32[i]  = neigh[i].mv[0]
-+        @ q3.32[i]  = neigh[i].mv[1]
-+        @ q8.32[i]  = curr_rpl0[curr[i].ref_idx[0]]
-+        @ q9.32[i]  = curr_rpl1[curr[i].ref_idx[1]]
-+        @ q10.32[i] = neigh_rpl0[neigh[i].ref_idx[0]]
-+        @ q11.32[i] = neigh_rpl1[neigh[i].ref_idx[1]]
-+        @ d24.16[i] = curr[i].pred_flag
-+        @ d25.16[i] = neigh[i].pred_flag
-+
-+        vtst.16     d28, d24, d12
-+        vtst.16     d29, d24, d13
-+        vadd.i16    d8, d24, d12
-+        vadd.i16    d9, d25, d12
-+        vtst.16     d30, d25, d12
-+        vtst.16     d31, d25, d13
-+        veor        d26, d8, d9
-+          ldr         lr, [sp, 6*8 + 1*4]
-+        vmovl.s16   q4, d28
-+        vmovl.s16   q5, d29
-+          teq         lr, #1
-+        vmovl.s16   q14, d30
-+          it ne
-+          lslne       v1, lr, #1
-+        vmovl.s16   q15, d31
-+          it ne
-+          rsbne       v2, v1, #32
-+        vbif        q0, q1, q4
-+        vbif        q2, q3, q14
-+        vbif        q1, q0, q5
-+        vbif        q3, q2, q15
-+        vabd.s16    q12, q0, q2
-+        vabd.s16    q2, q1
-+        vabd.s16    q0, q3
-+        vabd.s16    q1, q3
-+        vbif        q8, q9, q4
-+        vbif        q10, q11, q14
-+        vbif        q9, q8, q5
-+        vbif        q11, q10, q15
-+        vclt.u16    d6, d24, d27
-+        vclt.u16    d8, d2, d27
-+        vclt.u16    d7, d25, d27
-+        vclt.u16    d9, d3, d27
-+        vclt.u16    d2, d0, d27
-+        vclt.u16    d0, d4, d27
-+        vclt.u16    d3, d1, d27
-+        vclt.u16    d1, d5, d27
-+        vceq.i32    q12, q10, q8
-+        vceq.i32    q10, q9
-+        vceq.i32    q8, q11
-+        vceq.i32    q9, q11
-+        vshrn.i32   d6, q3, #8
-+        vshrn.i32   d7, q4, #8
-+        vshrn.i32   d8, q1, #8
-+        vshrn.i32   d9, q0, #8
-+        vmovn.i32   d4, q12
-+        vmovn.i32   d2, q10
-+        vmovn.i32   d3, q8
-+        vmovn.i32   d5, q9
-+        vand        q2, q3
-+        vrev16.8    q3, q3
-+        vand        q2, q3
-+        vand        q1, q4
-+        vrev16.8    q4, q4
-+        vand        q1, q4
-+        vand        d4, d5
-+        vand        d2, d3
-+        vbic        d0, d12, d4
-+        vshr.u16    d26, #2
-+        vbic        d0, d2
-+        vmov.i16    d1, #0x5555
-+        vorr        d0, d26
-+          bne         10f
-+
-+        @ Merge results into result word, no duplicates
-+        vmov        a2, s0
-+        vmov        v8, s1
-+        vmov.u16    ip, d0[1]
-+        vmov.u16    lr, d0[3]
-+        lsl         a2, #30
-+        lsl         v8, #30
-+        lsl         ip, #30
-+        lsl         lr, #30
-+        orr         a2, ip, a2, lsr #2
-+        orr         v8, lr, v8, lsr #2
-+        orr         a2, v8, a2, lsr #4
-+        subs        a1, #4
-+        orr         v7, a2, v7, lsr #8
-+        bhi         1b
-+
-+        mov         a1, #32
-+        ldr         a3, [sp, #6*8]
-+        vpop        {d8-d13}
-+        sub         a1, a1, a3, lsl #1
-+        mov         a1, v7, lsr a1
-+        pop         {a2-a4,v1-v8,pc}
-+10:
-+        @ Merge results into result word, with duplicates
-+        vmul.i16    d0, d1
-+        vmov        a2, s0
-+        vmov        v8, s1
-+        vmov.u16    ip, d0[1]
-+        vmov.u16    lr, d0[3]
-+        lsl         a2, v2
-+        subs        a1, #4
-+        lsl         v8, v2
-+        lsl         ip, v2
-+        lsl         lr, v2
-+        ldr         v2, [sp, #6*8 + 12*4 + 1*4]
-+T       lsr         a2, v1
-+T       orr         a2, ip, a2
-+A       orr         a2, ip, a2, lsr v1
-+        lsl         ip, v1, #1
-+T       lsr         v8, v1
-+T       orr         v8, lr, v8
-+A       orr         v8, lr, v8, lsr v1
-+        lsl         lr, v1, #2
-+T       lsr         a2, ip
-+T       orr         a2, v8, a2
-+A       orr         a2, v8, a2, lsr ip
-+        ldr         v1, [sp, #6*8 + 12*4]
-+T       lsr         v7, lr
-+T       orr         v7, a2, v7
-+A       orr         v7, a2, v7, lsr lr
-+        bhi         1b
-+
-+        mov         a1, #32
-+        ldrd        a3, a4, [sp, #6*8]
-+        vpop        {d8-d13}
-+        mls         a1, a3, a4, a1
-+        mls         a1, a3, a4, a1
-+        mov         a1, v7, lsr a1
-+        pop         {a2-a4,v1-v8,pc}
-+
-+
-+2:
-+        sub         v5, v5, #10
-+        sub         v6, v6, #10
-+        vmov.u8     d16, #0
-+        blo         3f
-+        vld2.32     {d0[0], d1[0]}, [a3]!
-+        vld2.32     {d2[0], d3[0]}, [a4]!
-+        ldrb        a2, [a3], #1
-+        ldrb        ip, [a4], #1
-+        ldrb        lr, [a4], #1
-+        ldrb        v8, [a3], #1
-+        add         a2, v1, a2, lsl #2
-+        vld1.8      {d16[0]}, [a3], v5
-+        add         ip, v3, ip, lsl #2
-+        vld1.8      {d16[4]}, [a4], v6
-+        add         v8, v2, v8, lsl #2
-+        vld1.32     {d4[0]}, [a2]
-+        add         lr, v4, lr, lsl #2
-+        vld1.32     {d5[0]}, [ip]
-+        vld1.32     {d6[0]}, [v8]
-+        vld1.32     {d7[0]}, [lr]
-+
-+3:
-+        vld2.32     {d0[1], d1[1]}, [a3]!
-+        vld2.32     {d2[1], d3[1]}, [a4]!
-+        ldrb        a2, [a3], #1
-+          vmov.u16    d17, #1
-+        ldrb        ip, [a4], #1
-+          vmov.u16    d18, #2
-+        ldrb        v8, [a3], #1
-+          vmov.u16    d19, #4
-+        ldrb        lr, [a4], #1
-+        add         a2, v1, a2, lsl #2
-+        vld1.8      {d16[2]}, [a3], v5
-+        add         ip, v3, ip, lsl #2
-+        vld1.8      {d16[6]}, [a4], v6
-+        add         v8, v2, v8, lsl #2
-+        vld1.32     {d4[1]}, [a2]
-+        add         lr, v4, lr, lsl #2
-+        vld1.32     {d5[1]}, [ip]
-+        vld1.32     {d6[1]}, [v8]
-+        vld1.32     {d7[1]}, [lr]
-+
-+        @ So now we have:
-+        @ d0.32[i]  = curr[i].mv[0]
-+        @ d1.32[i]  = curr[i].mv[1]
-+        @ d2.32[i]  = neigh[i].mv[0]
-+        @ d3.32[i]  = neigh[i].mv[1]
-+        @ d4.32[i] = curr_rpl0[curr[i].ref_idx[0]]
-+        @ d5.32[i] = neigh_rpl0[neigh[i].ref_idx[0]]
-+        @ d6.32[i] = curr_rpl1[curr[i].ref_idx[1]]
-+        @ d7.32[i] = neigh_rpl1[neigh[i].ref_idx[1]]
-+        @ d16.16[i] = curr[i].pred_flag
-+        @ d16.16[2+i] = neigh[i].pred_flag
-+
-+        vtst.16     d20, d16, d17
-+        vtst.16     d22, d16, d18
-+        vadd.i16    d30, d16, d17
-+        vswp        d2, d3
-+        ldr         lr, [sp, #1*4]
-+        vmovl.s16   q10, d20
-+          teq         lr, #1
-+        vmovl.s16   q11, d22
-+          it ne
-+          lslne       v1, lr, #1
-+        vbif        d0, d1, d20
-+        vbif        d4, d6, d20
-+        vbif        d3, d2, d21
-+        vbif        d5, d7, d21
-+        vbif        d1, d0, d22
-+        vbif        d6, d4, d22
-+        vbif        d2, d3, d23
-+        vbif        d7, d5, d23
-+        vshr.u16    d30, #2
-+        vabd.s16    d24, d0, d3
-+        vabd.s16    d25, d1, d2
-+        vabd.s16    q0, q0, q1
-+        vceq.i32    d2, d4, d5
-+        vceq.i32    d20, d5, d6
-+        vceq.i32    d21, d4, d7
-+        vceq.i32    d3, d6, d7
-+        vclt.u16    d6, d24, d19
-+        vclt.u16    d7, d25, d19
-+        vclt.u16    d22, d1, d19
-+        vclt.u16    d23, d0, d19
-+        vshrn.i32   d6, q3, #8
-+        vmovn.i32   d2, q1
-+        vshrn.i32   d7, q11, #8
-+        vmovn.i32   d3, q10
-+        vand        q0, q3, q1
-+          it ne
-+          rsbne       v2, v1, #32
-+        vrev16.8    q3, q3
-+        vand        q0, q3
-+        vsra.u64    d30, #32
-+        vshr.u64    q1, q0, #32
-+        vand        q0, q1
-+        vbic        d0, d17, d0
-+        vand        d30, d30, d17
-+        vbic        d0, d1
-+        vmov.i16    d1, #0x5555
-+        vorr        d0, d30
-+          bne         10f
-+
-+        @ Construct result word, no duplicates
-+        cmp         a1, #2
-+        vmov.u16    a1, d0[1]
-+        vmov.u16    a2, d0[0]
-+        it eq
-+        orreq       a1, a2, a1, lsl #2
-+        pop         {a2-a4,v1-v8,pc}
-+10:
-+        @ Construct result word, with duplicates
-+        cmp         a1, #2
-+        vmul.i16    d0, d1
-+        vmov        a2, s0
-+        vmov.u16    a1, d0[1]
-+        lsl         a2, #16
-+        pkhbt       a1, a1, a1, lsl #16
-+        lsr         a2, v2
-+        lsr         a1, v2
-+T       itt eq
-+T       lsleq       a1, v1
-+T       orreq       a1, a2, a1
-+A       orreq       a1, a2, a1, lsl v1
-+        pop         {a2-a4,v1-v8,pc}
-+endfunc
-+
-+
-+
-+#else // non-NEON version
-+
-+
-+/* uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const HEVCRpiMvField *curr, const HEVCRpiMvField *neigh,
-+ *                                            const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
-+ *                                            int in_inc0, in_inc1)
-+ */
-+function ff_hevc_rpi_deblocking_boundary_strengths_neon, export=1
-+        add         ip, sp, #4*4
-+        push        {a2-a4,v1-v8,lr}
-+        mov         v6, #32
-+1:      ldmdb       ip, {v1-v4}
-+        ldrsb       v5, [a3, #8]    @ curr->ref_idx
-+        ldrsb       v8, [a3, #9]
-+        ldrsb       ip, [a4, #8]    @ neigh->ref_idx
-+        ldrsb       lr, [a4, #9]
-+        ldr         v1, [v1, v5, lsl #2]
-+        ldrb        v5, [a3, #10]   @ curr->pred_flag
-+        ldr         v2, [v2, v8, lsl #2]
-+        ldrb        v8, [a4, #10]   @ neigh->pred_flag
-+        ldr         v3, [v3, ip, lsl #2]
-+        ldr         v4, [v4, lr, lsl #2]
-+        teq         v5, #3
-+        beq         20f
-+        teq         v8, #3
-+        beq         90f
-+
-+        tst         v5, #1
-+        itee        ne
-+        ldrne       v5, [a3, #0]    @ curr->mv[0]
-+        moveq       v1, v2
-+        ldreq       v5, [a3, #4]    @ curr->mv[1]
-+        tst         v8, #1
-+        itee        ne
-+        ldrne       v8, [a4, #0]    @ neigh->mv[0]
-+        moveq       v3, v4
-+        ldreq       v8, [a4, #4]    @ neigh->mv[1]
-+        teq         v1, v3
-+        bne         10f
-+        ldr         lr, =0xFFFCFFFC
-+        ssub16      ip, v8, v5
-+        ssub16      v5, v5, v8
-+        sel         v5, v5, ip
-+        ands        v5, v5, lr
-+        @ drop through
-+10:     it          ne
-+        movne       v5, #1<<30
-+11:
-+        sub         v6, v6, #2
-+T       mov         v7, v7, lsr #2
-+        subs        a2, a2, #1
-+A       orr         v7, v5, v7, lsr #2
-+T       orr         v7, v5, v7
-+        bhi         11b
-+
-+        ldrd        v3, v4, [sp, #16*4]
-+        ldr         a2, [sp]
-+        add         ip, sp, #16*4
-+        subs        a1, a1, #1
-+        add         a3, a3, v3
-+        add         a4, a4, v4
-+        bhi         1b
-+        mov         a1, v7, lsr v6
-+        pop         {a2-a4,v1-v8,pc}
-+
-+20:     teq         v8, #3
-+        bne         10b
-+
-+        teq         v1, v3
-+        it          eq
-+        teqeq       v2, v4
-+        bne         40f
-+        teq         v1, v2
-+        bne         30f
-+
-+        ldrd        v1, v2, [a3]    @ curr->mv
-+        ldrd        v3, v4, [a4]    @ neigh->mv
-+        ldr         lr, =0xFFFCFFFC
-+        ssub16      ip, v3, v1
-+        ssub16      v5, v1, v3
-+        sel         v5, v5, ip
-+        ands        v5, v5, lr
-+        bne         25f
-+        ssub16      ip, v4, v2
-+        ssub16      v5, v2, v4
-+        sel         v5, v5, ip
-+        ands        v5, v5, lr
-+        beq         11b
-+        @ drop through
-+25:     ssub16      ip, v4, v1
-+        ssub16      v5, v1, v4
-+        sel         v5, v5, ip
-+        ands        v5, v5, lr
-+        bne         10b
-+        ssub16      ip, v3, v2
-+        ssub16      v5, v2, v3
-+        sel         v5, v5, ip
-+        ands        v5, v5, lr
-+        b           10b
-+
-+30:     ldrd        v1, v2, [a3]    @ curr->mv
-+        ldrd        v3, v4, [a4]    @ neigh->mv
-+        ldr         lr, =0xFFFCFFFC
-+        ssub16      ip, v3, v1
-+        ssub16      v5, v1, v3
-+        sel         v5, v5, ip
-+        ands        v5, v5, lr
-+        bne         10b
-+        ssub16      ip, v4, v2
-+        ssub16      v5, v2, v4
-+        sel         v5, v5, ip
-+        ands        v5, v5, lr
-+        b           10b
-+
-+40:     teq         v1, v4
-+        ite         eq
-+        teqeq       v2, v3
-+        bne         10b
-+
-+        ldrd        v1, v2, [a3]    @ curr->mv
-+        ldrd        v3, v4, [a4]    @ neigh->mv
-+        ldr         lr, =0xFFFCFFFC
-+        b           25b
-+
-+90:
-+        mov         v5, #1<<30
-+        b           11b
-+endfunc
-+
-+
-+#endif
-+
-+
-+@ =============================================================================
-+@
-+@ 10 bit
-+
-+function hevc_loop_filter_luma_body_10
-+        m_filter_luma 10, q11, q15
-+endfunc
-+
-+function ff_hevc_rpi_h_loop_filter_luma_neon_10, export=1
-+        hevc_loop_filter_luma_start
-+        b        .Lh_loop_luma_common_10
-+endfunc
-+
-+function ff_hevc_rpi_h_loop_filter_luma2_neon_10, export=1
-+        cmp      r3, #0
-+        it       eq
-+        bxeq     lr
-+        push     {r4-r10,lr}            @ 32 bytes
-+        ldr      r10, [sp, #32]
-+.Lh_loop_luma_common_10:
-+        m_filter_h_luma_16 10
-+endfunc
-+
-+function ff_hevc_rpi_v_loop_filter_luma_neon_10, export=1
-+        hevc_loop_filter_luma_start
-+        sub      r4, r0, #8
-+        b        .Lv_loop_luma_common_10
-+endfunc
-+
-+function ff_hevc_rpi_v_loop_filter_luma2_neon_10, export=1
-+        cmp      r3, #0
-+        it       eq
-+        bxeq     lr
-+        push     {r4-r10,lr}            @ 32 bytes
-+        ldr      r4, [sp, #36]
-+        ldr      r10, [sp, #32]
-+
-+.Lv_loop_luma_common_10:
-+        m_filter_v_luma_16 10
-+endfunc
-+
-+function ff_hevc_rpi_h_loop_filter_uv_neon_10, export=1
-+        m_filter_h_uv_16 10
-+endfunc
-+
-+function ff_hevc_rpi_v_loop_filter_uv2_neon_10, export=1
-+        m_filter_v_uv2_16 10
-+endfunc
-+
---- /dev/null
-+++ b/libavcodec/arm/rpi_hevcdsp_idct_neon.S
-@@ -0,0 +1,184 @@
-+/*
-+ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
-+ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading)
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#include "libavutil/arm/asm.S"
-+#include "neon.S"
-+
-+/* uses registers q8 - q13 for temp values */
-+.macro tr4_luma_shift shift
-+        vaddl.s16   q8, d28, d30    // c0 = src0 + src2
-+        vaddl.s16   q9, d30, d31    // c1 = src2 + src3
-+        vsubl.s16   q10, d28, d31   // c2 = src0 - src3
-+        vaddl.s16   q11, d28, d31   // src0 + src3
-+
-+        vmul.i32    q12, q8, d1[0]  // 29 * c0
-+        vmul.i32    q13, q10, d2[0] // 55 * c2
-+        vmul.i32    q8, q8, d2[0]   // 55 * c0
-+        vmull.s16   q14, d29, d0[0] // c3 = 74 * src1
-+
-+        vsubw.s16   q11, q11, d30   // src0 - src2 + src3
-+        vmla.i32    q12, q9, d2[0]  // 29 * c0 + 55 * c1
-+        vmls.i32    q13, q9, d1[0]  // 55 * c2 - 29 * c1
-+        vmla.i32    q8, q10, d1[0]  // 55 * c0 + 29 * c2
-+
-+        vmul.i32    q11, q11, d0[0] // dst2 = 74 * (src0 - src2 + src3)
-+        vadd.i32    q12, q12, q14   // dst0 = 29 * c0 + 55 * c1 + c3
-+        vadd.i32    q13, q13, q14   // dst1 = 55 * c2 - 29 * c1 + c3
-+        vsub.i32    q8, q8, q14     // dst3 = 55 * c0 + 29 * c2 - c3
-+
-+        vqrshrn.s32 d28, q12, \shift
-+        vqrshrn.s32 d29, q13, \shift
-+        vqrshrn.s32 d30, q11, \shift
-+        vqrshrn.s32 d31, q8, \shift
-+.endm
-+
-+/* uses registers q8 - q11 for temp values */
-+.macro tr4_shift shift
-+        vmull.s16   q9, d29, d0[0]   // 83 * src1
-+        vmull.s16   q8, d29, d0[1]   // 36 * src1
-+        vshll.s16   q14, d28, #6     // 64 * src0
-+        vshll.s16   q10, d30, #6     // 64 * src2
-+        vmlal.s16   q9, d31, d0[1]   // 83 * src1 + 36 * src3  o0
-+        vmlsl.s16   q8, d31, d0[0]   // 36 * src1 - 83 * src3  o1
-+        vadd.s32    q11, q14, q10    // 64 * (src0 + src2)     e0
-+        vsub.s32    q10, q14, q10    // 64 * (src0 - src2)     e1
-+        vadd.s32    q14, q11, q9     // e0 + o0
-+        vadd.s32    q15, q10, q8     // e1 + o1
-+        vsub.s32    q8, q10, q8      // e1 - o1
-+        vsub.s32    q9, q11, q9      // e0 - o0
-+
-+        vqrshrn.s32 d28, q14, \shift
-+        vqrshrn.s32 d29, q15, \shift
-+        vqrshrn.s32 d30, q8, \shift
-+        vqrshrn.s32 d31, q9, \shift
-+.endm
-+
-+.macro tr8_process d0, d1, d2, d3, d4, d5, d6, d7,                         \
-+                   tmp0, /* Q reg which doesn't alias with d4, d6 or d7 */ \
-+                   tmp1, /* Q reg which doesn't alias with d7 or d0     */ \
-+                   shift, I1, I2, I3
-+
-+        vmull.s16  q4, \d1, d1[1]        // 89 * src1
-+        \I1
-+        vmull.s16  q5, \d1, d1[0]        // 75 * src1
-+        \I2
-+        vmull.s16  q6, \d1, d1[3]        // 50 * src1
-+        \I3
-+        vmull.s16  q7, \d1, d1[2]        // 18 * src1
-+        vmlal.s16  q4, \d3, d1[0]        // 75 * src3
-+        vmlsl.s16  q5, \d3, d1[2]        //-18 * src3
-+        vmlsl.s16  q6, \d3, d1[1]        //-89 * src3
-+        vmlsl.s16  q7, \d3, d1[3]        //-50 * src3
-+
-+          // tr4
-+          vmull.s16  q1, \d2, d0[0]      // 83 * src(1*2)
-+          vmull.s16  q2, \d2, d0[1]      // 36 * src(1*2)
-+
-+        vmlal.s16  q4, \d5, d1[3]        // 50 * src5
-+        vmlsl.s16  q5, \d5, d1[1]        //-89 * src5
-+        vmlal.s16  q6, \d5, d1[2]        // 18 * src5
-+        vmlal.s16  q7, \d5, d1[0]        // 75 * src5
-+
-+          vshll.s16  q3, \d0, #6         // 64 * src(0*2)
-+          vshll.s16  \tmp0, \d4, #6      // 64 * src(2*2)
-+          vmlal.s16  q1, \d6, d0[1]      // 83 * src(1*2) + 36 * src(3*2)  o0
-+          vmlsl.s16  q2, \d6, d0[0]      // 36 * src(1*2) - 83 * src(3*2)  o1
-+          vadd.i32   \tmp1, q3, \tmp0    // 64 * (src(0*2) + src(2*2))     e0
-+          vsub.i32   \tmp0, q3, \tmp0    // 64 * (src(0*2) - src(2*2))     e1
-+
-+        vmlal.s16  q4, \d7, d1[2]        // 18 * src7
-+        vmlsl.s16  q5, \d7, d1[3]        //-50 * src7
-+        vmlal.s16  q6, \d7, d1[0]        // 75 * src7
-+        vmlsl.s16  q7, \d7, d1[1]        //-89 * src7
-+
-+          vsub.i32   q3, \tmp1, q1       // e0 - o0
-+          vadd.i32   \tmp1, \tmp1, q1    // e0 + o0
-+          vadd.i32   q1, \tmp0, q2       // e1 + o1
-+          vsub.i32   q2, \tmp0, q2       // e1 - o1
-+
-+        vadd.i32   \tmp0, \tmp1, q4      // e_8[0] + o_8[0], dst[0]
-+        vsub.i32   q4, \tmp1, q4         // e_8[0] - o_8[0], dst[7]
-+        vsub.i32   \tmp1, q3, q7         // e_8[3] - o_8[3], dst[4]
-+        vadd.i32   q7, q3, q7            // e_8[3] + o_8[3], dst[3]
-+        vadd.i32   q3, q1, q5            // e_8[1] + o_8[1], dst[1]
-+        vsub.i32   q5, q1, q5            // e_8[1] - o_8[1], dst[6]
-+        vsub.i32   q1, q2, q6            // e_8[2] - o_8[2], dst[5]
-+        vadd.i32   q6, q2, q6            // e_8[2] + o_8[2], dst[2]
-+        vqrshrn.s32   \d0, \tmp0, #\shift
-+        vqrshrn.s32   \d4, \tmp1, #\shift
-+        vqrshrn.s32   \d1, q3, #\shift
-+        vqrshrn.s32   \d5, q1, #\shift
-+        vqrshrn.s32   \d2, q6, #\shift
-+        vqrshrn.s32   \d6, q5, #\shift
-+        vqrshrn.s32   \d3, q7, #\shift
-+        vqrshrn.s32   \d7, q4, #\shift
-+.endm
-+
-+.macro tr8_vert d0, d1, d2, d3, d4, d5, d6, d7, q01, q23, I1, I2, I3
-+        vld1.16     {\d0}, [r0 :64], r3
-+        vld1.16     {\d1}, [r2 :64], r3
-+        vld1.16     {\d2}, [r0 :64], r3
-+        vld1.16     {\d3}, [r2 :64], r3
-+        vld1.16     {\d4}, [r0 :64], r3
-+        vld1.16     {\d5}, [r2 :64], r3
-+        vld1.16     {\d6}, [r0 :64], r3
-+        vld1.16     {\d7}, [r2 :64], r3
-+
-+        tr8_process \
-+            \d0, \d1, \d2, \d3, \d4, \d5, \d6, \d7, \
-+            \q01, \q23, 7, "\I1", "\I2", "\I3"
-+.endm
-+
-+.macro tr8_horiz d0, d1, d2, d3, d4, d5, d6, d7, q01, q23, shift
-+        tr8_process \
-+            \d0, \d1, \d2, \d3, \d4, \d5, \d6, \d7, \
-+            \q01, \q23, \shift
-+
-+        vzip.16    \d0, \d4
-+        vzip.16    \d1, \d5
-+        vzip.16    \d2, \d6
-+        vzip.16    \d3, \d7
-+        vst4.16    {\d0-\d3}, [r0 :128], r3
-+        vst4.16    {\d4-\d7}, [r2 :128], r3
-+.endm
-+
-+#define BIT_DEPTH 8
-+#include "rpi_hevc_idct_fn_neon.S"
-+
-+.text
-+
-+.align 4
-+tr4f:
-+.word 0x00240053  // 36 and d1[0] = 83
-+.word 0x00000000
-+tr8f:
-+.word 0x0059004b  // 89, d0[0] = 75
-+.word 0x00320012  // 50, d0[2] = 18
-+tr16:
-+.word 0x005a0057  // 90, d2[0] = 87
-+.word 0x00500046  // 80, d2[2] = 70
-+.word 0x0039002b  // 57, d2[0] = 43
-+.word 0x00190009  // 25, d2[2] = 9
-+
-+#undef BIT_DEPTH
-+#define BIT_DEPTH 10
-+#include "rpi_hevc_idct_fn_neon.S"
-+
---- /dev/null
-+++ b/libavcodec/arm/rpi_hevcdsp_init_arm.c
-@@ -0,0 +1,32 @@
-+/*
-+ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#include "libavutil/attributes.h"
-+#include "libavutil/arm/cpu.h"
-+#include "libavcodec/rpi_hevcdsp.h"
-+#include "rpi_hevcdsp_arm.h"
-+
-+av_cold void ff_hevcdsp_rpi_init_arm(HEVCDSPContext *c, const int bit_depth)
-+{
-+    int cpu_flags = av_get_cpu_flags();
-+
-+    if (have_neon(cpu_flags))
-+        ff_hevcdsp_rpi_init_neon(c, bit_depth);
-+}
---- /dev/null
-+++ b/libavcodec/arm/rpi_hevcdsp_init_neon.c
-@@ -0,0 +1,467 @@
-+/*
-+ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#include "config.h"
-+#include "libavutil/attributes.h"
-+#include "libavutil/arm/cpu.h"
-+#include "libavcodec/rpi_hevcdsp.h"
-+#include "rpi_hevcdsp_arm.h"
-+#include "libavcodec/avcodec.h"
-+#include "libavcodec/bit_depth_template.c"
-+
-+// NEON inter pred fns for qpel & epel (non-sand) exist in the git repo but
-+// have been removed from head as we never use them.
-+
-+void ff_hevc_rpi_v_loop_filter_luma_neon_8(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
-+void ff_hevc_rpi_h_loop_filter_luma_neon_8(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
-+
-+void ff_hevc_rpi_v_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
-+void ff_hevc_rpi_h_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
-+
-+void ff_hevc_rpi_h_loop_filter_luma2_neon_8(uint8_t * _pix_r,
-+                             unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f);
-+void ff_hevc_rpi_v_loop_filter_luma2_neon_8(uint8_t * _pix_r,
-+                             unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f,
-+                             uint8_t * _pix_l);
-+void ff_hevc_rpi_h_loop_filter_uv_neon_8(uint8_t * src, unsigned int stride, uint32_t tc4,
-+                             unsigned int no_f);
-+void ff_hevc_rpi_v_loop_filter_uv2_neon_8(uint8_t * src_r, unsigned int stride, uint32_t tc4,
-+                             uint8_t * src_l,
-+                             unsigned int no_f);
-+
-+void ff_hevc_rpi_h_loop_filter_luma2_neon_10(uint8_t * _pix_r,
-+                             unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f);
-+void ff_hevc_rpi_v_loop_filter_luma2_neon_10(uint8_t * _pix_r,
-+                             unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f,
-+                             uint8_t * _pix_l);
-+void ff_hevc_rpi_h_loop_filter_uv_neon_10(uint8_t * src, unsigned int stride, uint32_t tc4,
-+                             unsigned int no_f);
-+void ff_hevc_rpi_v_loop_filter_uv2_neon_10(uint8_t * src_r, unsigned int stride, uint32_t tc4,
-+                             uint8_t * src_l,
-+                             unsigned int no_f);
-+
-+void ff_hevc_rpi_transform_4x4_neon_8(int16_t *coeffs, int col_limit);
-+void ff_hevc_rpi_transform_8x8_neon_8(int16_t *coeffs, int col_limit);
-+void ff_hevc_rpi_idct_4x4_dc_neon_8(int16_t *coeffs);
-+void ff_hevc_rpi_idct_8x8_dc_neon_8(int16_t *coeffs);
-+void ff_hevc_rpi_idct_16x16_dc_neon_8(int16_t *coeffs);
-+void ff_hevc_rpi_idct_32x32_dc_neon_8(int16_t *coeffs);
-+void ff_hevc_rpi_transform_luma_4x4_neon_8(int16_t *coeffs);
-+
-+void ff_hevc_rpi_transform_4x4_neon_10(int16_t *coeffs, int col_limit);
-+void ff_hevc_rpi_transform_8x8_neon_10(int16_t *coeffs, int col_limit);
-+void ff_hevc_rpi_idct_4x4_dc_neon_10(int16_t *coeffs);
-+void ff_hevc_rpi_idct_8x8_dc_neon_10(int16_t *coeffs);
-+void ff_hevc_rpi_idct_16x16_dc_neon_10(int16_t *coeffs);
-+void ff_hevc_rpi_idct_32x32_dc_neon_10(int16_t *coeffs);
-+void ff_hevc_rpi_transform_luma_4x4_neon_10(int16_t *coeffs);
-+
-+void ff_hevc_rpi_add_residual_4x4_neon_8(uint8_t *_dst, int16_t *coeffs,
-+                                     ptrdiff_t stride);
-+void ff_hevc_rpi_add_residual_8x8_neon_8(uint8_t *_dst, int16_t *coeffs,
-+                                     ptrdiff_t stride);
-+void ff_hevc_rpi_add_residual_16x16_neon_8(uint8_t *_dst, int16_t *coeffs,
-+                                       ptrdiff_t stride);
-+void ff_hevc_rpi_add_residual_32x32_neon_8(uint8_t *_dst, int16_t *coeffs,
-+                                       ptrdiff_t stride);
-+
-+void ff_hevc_rpi_add_residual_4x4_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc);
-+void ff_hevc_rpi_add_residual_8x8_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc);
-+void ff_hevc_rpi_add_residual_16x16_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc);
-+void ff_hevc_rpi_add_residual_32x32_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc);
-+
-+
-+void ff_hevc_rpi_add_residual_4x4_neon_10(uint8_t *_dst, int16_t *coeffs,
-+                                     ptrdiff_t stride);
-+void ff_hevc_rpi_add_residual_8x8_neon_10(uint8_t *_dst, int16_t *coeffs,
-+                                     ptrdiff_t stride);
-+void ff_hevc_rpi_add_residual_16x16_neon_10(uint8_t *_dst, int16_t *coeffs,
-+                                       ptrdiff_t stride);
-+void ff_hevc_rpi_add_residual_32x32_neon_10(uint8_t *_dst, int16_t *coeffs,
-+                                       ptrdiff_t stride);
-+
-+void ff_hevc_rpi_add_residual_4x4_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc);
-+void ff_hevc_rpi_add_residual_8x8_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc);
-+void ff_hevc_rpi_add_residual_16x16_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc);
-+void ff_hevc_rpi_add_residual_32x32_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc);
-+
-+
-+void ff_hevc_rpi_add_residual_4x4_u_neon_8(uint8_t *_dst, const int16_t * residual,
-+                                       ptrdiff_t stride, int dc_v);
-+void ff_hevc_rpi_add_residual_8x8_u_neon_8(uint8_t *_dst, const int16_t * residual,
-+                                       ptrdiff_t stride, int dc_v);
-+void ff_hevc_rpi_add_residual_16x16_u_neon_8(uint8_t *_dst, const int16_t * residual,
-+                                       ptrdiff_t stride, int dc_v);
-+void ff_hevc_rpi_add_residual_4x4_v_neon_8(uint8_t *_dst, const int16_t * residual,
-+                                       ptrdiff_t stride, int dc_u);
-+void ff_hevc_rpi_add_residual_8x8_v_neon_8(uint8_t *_dst, const int16_t * residual,
-+                                       ptrdiff_t stride, int dc_u);
-+void ff_hevc_rpi_add_residual_16x16_v_neon_8(uint8_t *_dst, const int16_t * residual,
-+                                       ptrdiff_t stride, int dc_u);
-+void ff_hevc_rpi_add_residual_4x4_c_neon_8(uint8_t *_dst, const int16_t * residual,
-+                                       ptrdiff_t stride);
-+void ff_hevc_rpi_add_residual_8x8_c_neon_8(uint8_t *_dst, const int16_t * residual,
-+                                       ptrdiff_t stride);
-+void ff_hevc_rpi_add_residual_16x16_c_neon_8(uint8_t *_dst, const int16_t * residual,
-+                                       ptrdiff_t stride);
-+void ff_hevc_rpi_add_residual_4x4_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
-+void ff_hevc_rpi_add_residual_8x8_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
-+void ff_hevc_rpi_add_residual_16x16_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
-+
-+
-+void ff_hevc_rpi_add_residual_4x4_u_neon_10(uint8_t *_dst, const int16_t * residual,
-+                                       ptrdiff_t stride, int dc_v);
-+void ff_hevc_rpi_add_residual_8x8_u_neon_10(uint8_t *_dst, const int16_t * residual,
-+                                       ptrdiff_t stride, int dc_v);
-+void ff_hevc_rpi_add_residual_16x16_u_neon_10(uint8_t *_dst, const int16_t * residual,
-+                                       ptrdiff_t stride, int dc_v);
-+void ff_hevc_rpi_add_residual_4x4_v_neon_10(uint8_t *_dst, const int16_t * residual,
-+                                       ptrdiff_t stride, int dc_u);
-+void ff_hevc_rpi_add_residual_8x8_v_neon_10(uint8_t *_dst, const int16_t * residual,
-+                                       ptrdiff_t stride, int dc_u);
-+void ff_hevc_rpi_add_residual_16x16_v_neon_10(uint8_t *_dst, const int16_t * residual,
-+                                       ptrdiff_t stride, int dc_u);
-+void ff_hevc_rpi_add_residual_4x4_c_neon_10(uint8_t *_dst, const int16_t * residual,
-+                                       ptrdiff_t stride);
-+void ff_hevc_rpi_add_residual_8x8_c_neon_10(uint8_t *_dst, const int16_t * residual,
-+                                       ptrdiff_t stride);
-+void ff_hevc_rpi_add_residual_16x16_c_neon_10(uint8_t *_dst, const int16_t * residual,
-+                                       ptrdiff_t stride);
-+void ff_hevc_rpi_add_residual_4x4_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
-+void ff_hevc_rpi_add_residual_8x8_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
-+void ff_hevc_rpi_add_residual_16x16_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
-+
-+void ff_hevc_rpi_sao_edge_8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
-+void ff_hevc_rpi_sao_edge_16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
-+void ff_hevc_rpi_sao_edge_32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
-+void ff_hevc_rpi_sao_edge_64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
-+
-+void ff_hevc_rpi_sao_edge_8_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
-+void ff_hevc_rpi_sao_edge_16_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
-+void ff_hevc_rpi_sao_edge_32_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
-+void ff_hevc_rpi_sao_edge_64_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
-+
-+void ff_hevc_rpi_sao_edge_c_8_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
-+                                  const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
-+                                  int eo, int width, int height);
-+void ff_hevc_rpi_sao_edge_c_16_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
-+                                  const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
-+                                  int eo, int width, int height);
-+void ff_hevc_rpi_sao_edge_c_32_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
-+                                  const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
-+                                  int eo, int width, int height);
-+
-+void ff_hevc_rpi_sao_edge_c_8_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
-+                                  const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
-+                                  int eo, int width, int height);
-+void ff_hevc_rpi_sao_edge_c_16_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
-+                                  const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
-+                                  int eo, int width, int height);
-+void ff_hevc_rpi_sao_edge_c_32_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
-+                                  const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
-+                                  int eo, int width, int height);
-+
-+void ff_hevc_rpi_sao_band_c_8_neon_8(uint8_t *_dst, const uint8_t *_src,
-+                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
-+                                  const int16_t *sao_offset_val_u, int sao_left_class_u,
-+                                  const int16_t *sao_offset_val_v, int sao_left_class_v,
-+                                  int width, int height);
-+void ff_hevc_rpi_sao_band_c_16_neon_8(uint8_t *_dst, const uint8_t *_src,
-+                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
-+                                  const int16_t *sao_offset_val_u, int sao_left_class_u,
-+                                  const int16_t *sao_offset_val_v, int sao_left_class_v,
-+                                  int width, int height);
-+void ff_hevc_rpi_sao_band_c_32_neon_8(uint8_t *_dst, const uint8_t *_src,
-+                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
-+                                  const int16_t *sao_offset_val_u, int sao_left_class_u,
-+                                  const int16_t *sao_offset_val_v, int sao_left_class_v,
-+                                  int width, int height);
-+
-+void ff_hevc_rpi_sao_band_c_8_neon_10(uint8_t *_dst, const uint8_t *_src,
-+                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
-+                                  const int16_t *sao_offset_val_u, int sao_left_class_u,
-+                                  const int16_t *sao_offset_val_v, int sao_left_class_v,
-+                                  int width, int height);
-+void ff_hevc_rpi_sao_band_c_16_neon_10(uint8_t *_dst, const uint8_t *_src,
-+                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
-+                                  const int16_t *sao_offset_val_u, int sao_left_class_u,
-+                                  const int16_t *sao_offset_val_v, int sao_left_class_v,
-+                                  int width, int height);
-+void ff_hevc_rpi_sao_band_c_32_neon_10(uint8_t *_dst, const uint8_t *_src,
-+                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
-+                                  const int16_t *sao_offset_val_u, int sao_left_class_u,
-+                                  const int16_t *sao_offset_val_v, int sao_left_class_v,
-+                                  int width, int height);
-+
-+void ff_hevc_rpi_sao_band_8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
-+                                int16_t *sao_offset_val, int sao_left_class, int width, int height);
-+void ff_hevc_rpi_sao_band_16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
-+                                int16_t *sao_offset_val, int sao_left_class, int width, int height);
-+void ff_hevc_rpi_sao_band_32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
-+                                int16_t *sao_offset_val, int sao_left_class, int width, int height);
-+void ff_hevc_rpi_sao_band_64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
-+                                int16_t *sao_offset_val, int sao_left_class, int width, int height);
-+
-+void ff_hevc_rpi_sao_band_8_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
-+                                int16_t *sao_offset_val, int sao_left_class, int width, int height);
-+void ff_hevc_rpi_sao_band_16_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
-+                                int16_t *sao_offset_val, int sao_left_class, int width, int height);
-+void ff_hevc_rpi_sao_band_32_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
-+                                int16_t *sao_offset_val, int sao_left_class, int width, int height);
-+void ff_hevc_rpi_sao_band_64_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
-+                                int16_t *sao_offset_val, int sao_left_class, int width, int height);
-+
-+
-+uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const struct HEVCRpiMvField *curr, const struct HEVCRpiMvField *neigh,
-+                                                const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
-+                                                int in_inc0, int in_inc1);
-+void ff_hevc_rpi_cpy_blks8x4_neon(uint8_t *dst, unsigned int stride_dst, const uint8_t *src, unsigned stride_src, unsigned int width, unsigned int height);
-+
-+
-+static void ff_hevc_rpi_sao_edge_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height)
-+{
-+    ff_hevc_rpi_sao_edge_32_neon_8(_dst, _src, stride_dst, _sao_offset_val, eo, 32, height);
-+    ff_hevc_rpi_sao_edge_16_neon_8(_dst + 32, _src + 32, stride_dst, _sao_offset_val, eo, 16, height);
-+}
-+static void ff_hevc_rpi_sao_edge_48_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height)
-+{
-+    ff_hevc_rpi_sao_edge_32_neon_10(_dst, _src, stride_dst, _sao_offset_val, eo, 32, height);
-+    ff_hevc_rpi_sao_edge_16_neon_10(_dst + 64, _src + 64, stride_dst, _sao_offset_val, eo, 16, height);
-+}
-+
-+static void ff_hevc_rpi_sao_band_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
-+                                int16_t *sao_offset_val, int sao_left_class, int width, int height)
-+{
-+    ff_hevc_rpi_sao_band_32_neon_8(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 32, height);
-+    ff_hevc_rpi_sao_band_16_neon_8(_dst + 32, _src + 32, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height);
-+}
-+static void ff_hevc_rpi_sao_band_48_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
-+                                int16_t *sao_offset_val, int sao_left_class, int width, int height)
-+{
-+    ff_hevc_rpi_sao_band_32_neon_10(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 32, height);
-+    ff_hevc_rpi_sao_band_16_neon_10(_dst + 64, _src + 64, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height);
-+}
-+
-+#if SAO_FILTER_N == 6
-+static void ff_hevc_rpi_sao_edge_24_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height)
-+{
-+    ff_hevc_rpi_sao_edge_16_neon_8(_dst, _src, stride_dst, _sao_offset_val, eo, 16, height);
-+    ff_hevc_rpi_sao_edge_8_neon_8(_dst + 16, _src + 16, stride_dst, _sao_offset_val, eo, 8, height);
-+}
-+static void ff_hevc_rpi_sao_edge_24_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height)
-+{
-+    ff_hevc_rpi_sao_edge_16_neon_10(_dst, _src, stride_dst, _sao_offset_val, eo, 16, height);
-+    ff_hevc_rpi_sao_edge_8_neon_10(_dst + 32, _src + 32, stride_dst, _sao_offset_val, eo, 8, height);
-+}
-+
-+static void ff_hevc_rpi_sao_band_24_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
-+                                int16_t *sao_offset_val, int sao_left_class, int width, int height)
-+{
-+    ff_hevc_rpi_sao_band_16_neon_8(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height);
-+    ff_hevc_rpi_sao_band_8_neon_8(_dst + 16, _src + 16, stride_dst, stride_src, sao_offset_val, sao_left_class, 8, height);
-+}
-+static void ff_hevc_rpi_sao_band_24_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
-+                                int16_t *sao_offset_val, int sao_left_class, int width, int height)
-+{
-+    ff_hevc_rpi_sao_band_16_neon_10(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height);
-+    ff_hevc_rpi_sao_band_8_neon_10(_dst + 32, _src + 32, stride_dst, stride_src, sao_offset_val, sao_left_class, 8, height);
-+}
-+
-+static void ff_hevc_rpi_sao_edge_c_24_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
-+                                  const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
-+                                  int eo, int width, int height)
-+{
-+    ff_hevc_rpi_sao_edge_c_16_neon_8(_dst, _src, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 16, height);
-+    ff_hevc_rpi_sao_edge_c_8_neon_8(_dst + 32, _src + 32, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 8, height);
-+}
-+static void ff_hevc_rpi_sao_edge_c_24_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
-+                                  const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
-+                                  int eo, int width, int height)
-+{
-+    ff_hevc_rpi_sao_edge_c_16_neon_10(_dst, _src, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 16, height);
-+    ff_hevc_rpi_sao_edge_c_8_neon_10(_dst + 64, _src + 64, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 8, height);
-+}
-+
-+static void ff_hevc_rpi_sao_band_c_24_neon_8(uint8_t *_dst, const uint8_t *_src,
-+                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
-+                                  const int16_t *sao_offset_val_u, int sao_left_class_u,
-+                                  const int16_t *sao_offset_val_v, int sao_left_class_v,
-+                                  int width, int height)
-+{
-+    ff_hevc_rpi_sao_band_c_16_neon_8(_dst, _src, stride_dst, stride_src,
-+                                sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 16, height);
-+    ff_hevc_rpi_sao_band_c_8_neon_8(_dst + 32, _src + 32, stride_dst, stride_src,
-+                                sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 8, height);
-+}
-+static void ff_hevc_rpi_sao_band_c_24_neon_10(uint8_t *_dst, const uint8_t *_src,
-+                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
-+                                  const int16_t *sao_offset_val_u, int sao_left_class_u,
-+                                  const int16_t *sao_offset_val_v, int sao_left_class_v,
-+                                  int width, int height)
-+{
-+    ff_hevc_rpi_sao_band_c_16_neon_10(_dst, _src, stride_dst, stride_src,
-+                                sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 16, height);
-+    ff_hevc_rpi_sao_band_c_8_neon_10(_dst + 64, _src + 64, stride_dst, stride_src,
-+                                sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 8, height);
-+}
-+#endif
-+
-+
-+
-+#if RPI_HEVC_SAO_BUF_STRIDE != 160
-+#error SAO edge src stride not 160 - value used in .S
-+#endif
-+
-+av_cold void ff_hevcdsp_rpi_init_neon(HEVCDSPContext *c, const int bit_depth)
-+{
-+    if (bit_depth == 8) {
-+        c->hevc_v_loop_filter_luma     = ff_hevc_rpi_v_loop_filter_luma_neon_8;
-+        c->hevc_v_loop_filter_luma_c   = ff_hevc_rpi_v_loop_filter_luma_neon_8;
-+        c->hevc_h_loop_filter_luma     = ff_hevc_rpi_h_loop_filter_luma_neon_8;
-+        c->hevc_h_loop_filter_luma_c   = ff_hevc_rpi_h_loop_filter_luma_neon_8;
-+        c->hevc_h_loop_filter_luma2    = ff_hevc_rpi_h_loop_filter_luma2_neon_8;
-+        c->hevc_v_loop_filter_luma2    = ff_hevc_rpi_v_loop_filter_luma2_neon_8;
-+        c->hevc_h_loop_filter_uv       = ff_hevc_rpi_h_loop_filter_uv_neon_8;
-+        c->hevc_v_loop_filter_uv2      = ff_hevc_rpi_v_loop_filter_uv2_neon_8;
-+        c->idct[0]                     = ff_hevc_rpi_transform_4x4_neon_8;
-+        c->idct[1]                     = ff_hevc_rpi_transform_8x8_neon_8;
-+        c->idct_dc[0]                  = ff_hevc_rpi_idct_4x4_dc_neon_8;
-+        c->idct_dc[1]                  = ff_hevc_rpi_idct_8x8_dc_neon_8;
-+        c->idct_dc[2]                  = ff_hevc_rpi_idct_16x16_dc_neon_8;
-+        c->idct_dc[3]                  = ff_hevc_rpi_idct_32x32_dc_neon_8;
-+        c->add_residual[0]             = ff_hevc_rpi_add_residual_4x4_neon_8;
-+        c->add_residual[1]             = ff_hevc_rpi_add_residual_8x8_neon_8;
-+        c->add_residual[2]             = ff_hevc_rpi_add_residual_16x16_neon_8;
-+        c->add_residual[3]             = ff_hevc_rpi_add_residual_32x32_neon_8;
-+        c->add_residual_dc[0]          = ff_hevc_rpi_add_residual_4x4_dc_neon_8;
-+        c->add_residual_dc[1]          = ff_hevc_rpi_add_residual_8x8_dc_neon_8;
-+        c->add_residual_dc[2]          = ff_hevc_rpi_add_residual_16x16_dc_neon_8;
-+        c->add_residual_dc[3]          = ff_hevc_rpi_add_residual_32x32_dc_neon_8;
-+        c->add_residual_u[0]           = ff_hevc_rpi_add_residual_4x4_u_neon_8;
-+        c->add_residual_u[1]           = ff_hevc_rpi_add_residual_8x8_u_neon_8;
-+        c->add_residual_u[2]           = ff_hevc_rpi_add_residual_16x16_u_neon_8;
-+        c->add_residual_v[0]           = ff_hevc_rpi_add_residual_4x4_v_neon_8;
-+        c->add_residual_v[1]           = ff_hevc_rpi_add_residual_8x8_v_neon_8;
-+        c->add_residual_v[2]           = ff_hevc_rpi_add_residual_16x16_v_neon_8;
-+        c->add_residual_c[0]           = ff_hevc_rpi_add_residual_4x4_c_neon_8;
-+        c->add_residual_c[1]           = ff_hevc_rpi_add_residual_8x8_c_neon_8;
-+        c->add_residual_c[2]           = ff_hevc_rpi_add_residual_16x16_c_neon_8;
-+        c->add_residual_dc_c[0]        = ff_hevc_rpi_add_residual_4x4_dc_c_neon_8;
-+        c->add_residual_dc_c[1]        = ff_hevc_rpi_add_residual_8x8_dc_c_neon_8;
-+        c->add_residual_dc_c[2]        = ff_hevc_rpi_add_residual_16x16_dc_c_neon_8;
-+        c->transform_4x4_luma          = ff_hevc_rpi_transform_luma_4x4_neon_8;
-+        c->sao_band_filter[0]          = ff_hevc_rpi_sao_band_8_neon_8;
-+        c->sao_band_filter[1]          = ff_hevc_rpi_sao_band_16_neon_8;
-+        c->sao_band_filter[2]          = ff_hevc_rpi_sao_band_32_neon_8;
-+        c->sao_band_filter[3]          = ff_hevc_rpi_sao_band_48_neon_8;
-+        c->sao_band_filter[4]          = ff_hevc_rpi_sao_band_64_neon_8;
-+        c->sao_edge_filter[0]          = ff_hevc_rpi_sao_edge_8_neon_8;
-+        c->sao_edge_filter[1]          = ff_hevc_rpi_sao_edge_16_neon_8;
-+        c->sao_edge_filter[2]          = ff_hevc_rpi_sao_edge_32_neon_8;
-+        c->sao_edge_filter[3]          = ff_hevc_rpi_sao_edge_48_neon_8;
-+        c->sao_edge_filter[4]          = ff_hevc_rpi_sao_edge_64_neon_8;
-+#if SAO_FILTER_N == 6
-+        c->sao_band_filter[5]          = ff_hevc_rpi_sao_band_24_neon_8;
-+        c->sao_edge_filter[5]          = ff_hevc_rpi_sao_edge_24_neon_8;
-+#endif
-+        c->sao_band_filter_c[0]        = ff_hevc_rpi_sao_band_c_8_neon_8;
-+        c->sao_band_filter_c[1]        = ff_hevc_rpi_sao_band_c_16_neon_8;
-+        c->sao_band_filter_c[2]        = ff_hevc_rpi_sao_band_c_32_neon_8;
-+
-+        c->sao_edge_filter_c[0]        = ff_hevc_rpi_sao_edge_c_8_neon_8;
-+        c->sao_edge_filter_c[1]        = ff_hevc_rpi_sao_edge_c_16_neon_8;
-+        c->sao_edge_filter_c[2]        = ff_hevc_rpi_sao_edge_c_32_neon_8;
-+
-+#if SAO_FILTER_N == 6
-+        c->sao_band_filter_c[5]        = ff_hevc_rpi_sao_band_c_24_neon_8;
-+        c->sao_edge_filter_c[5]        = ff_hevc_rpi_sao_edge_c_24_neon_8;
-+#endif
-+    }
-+    else if (bit_depth == 10) {
-+        c->hevc_v_loop_filter_luma     = ff_hevc_rpi_v_loop_filter_luma_neon_10;
-+        c->hevc_v_loop_filter_luma_c   = ff_hevc_rpi_v_loop_filter_luma_neon_10;
-+        c->hevc_h_loop_filter_luma     = ff_hevc_rpi_h_loop_filter_luma_neon_10;
-+        c->hevc_h_loop_filter_luma_c   = ff_hevc_rpi_h_loop_filter_luma_neon_10;
-+        c->hevc_h_loop_filter_luma2    = ff_hevc_rpi_h_loop_filter_luma2_neon_10;
-+        c->hevc_v_loop_filter_luma2    = ff_hevc_rpi_v_loop_filter_luma2_neon_10;
-+        c->hevc_h_loop_filter_uv       = ff_hevc_rpi_h_loop_filter_uv_neon_10;
-+        c->hevc_v_loop_filter_uv2      = ff_hevc_rpi_v_loop_filter_uv2_neon_10;
-+        c->idct[0]                     = ff_hevc_rpi_transform_4x4_neon_10;
-+        c->idct[1]                     = ff_hevc_rpi_transform_8x8_neon_10;
-+        c->idct_dc[0]                  = ff_hevc_rpi_idct_4x4_dc_neon_10;
-+        c->idct_dc[1]                  = ff_hevc_rpi_idct_8x8_dc_neon_10;
-+        c->idct_dc[2]                  = ff_hevc_rpi_idct_16x16_dc_neon_10;
-+        c->idct_dc[3]                  = ff_hevc_rpi_idct_32x32_dc_neon_10;
-+        c->add_residual[0]             = ff_hevc_rpi_add_residual_4x4_neon_10;
-+        c->add_residual[1]             = ff_hevc_rpi_add_residual_8x8_neon_10;
-+        c->add_residual[2]             = ff_hevc_rpi_add_residual_16x16_neon_10;
-+        c->add_residual[3]             = ff_hevc_rpi_add_residual_32x32_neon_10;
-+        c->add_residual_dc[0]          = ff_hevc_rpi_add_residual_4x4_dc_neon_10;
-+        c->add_residual_dc[1]          = ff_hevc_rpi_add_residual_8x8_dc_neon_10;
-+        c->add_residual_dc[2]          = ff_hevc_rpi_add_residual_16x16_dc_neon_10;
-+        c->add_residual_dc[3]          = ff_hevc_rpi_add_residual_32x32_dc_neon_10;
-+        c->add_residual_u[0]           = ff_hevc_rpi_add_residual_4x4_u_neon_10;
-+        c->add_residual_u[1]           = ff_hevc_rpi_add_residual_8x8_u_neon_10;
-+        c->add_residual_u[2]           = ff_hevc_rpi_add_residual_16x16_u_neon_10;
-+        c->add_residual_v[0]           = ff_hevc_rpi_add_residual_4x4_v_neon_10;
-+        c->add_residual_v[1]           = ff_hevc_rpi_add_residual_8x8_v_neon_10;
-+        c->add_residual_v[2]           = ff_hevc_rpi_add_residual_16x16_v_neon_10;
-+        c->add_residual_c[0]           = ff_hevc_rpi_add_residual_4x4_c_neon_10;
-+        c->add_residual_c[1]           = ff_hevc_rpi_add_residual_8x8_c_neon_10;
-+        c->add_residual_c[2]           = ff_hevc_rpi_add_residual_16x16_c_neon_10;
-+        c->add_residual_dc_c[0]        = ff_hevc_rpi_add_residual_4x4_dc_c_neon_10;
-+        c->add_residual_dc_c[1]        = ff_hevc_rpi_add_residual_8x8_dc_c_neon_10;
-+        c->add_residual_dc_c[2]        = ff_hevc_rpi_add_residual_16x16_dc_c_neon_10;
-+        c->transform_4x4_luma          = ff_hevc_rpi_transform_luma_4x4_neon_10;
-+        c->sao_band_filter[0]          = ff_hevc_rpi_sao_band_8_neon_10;
-+        c->sao_band_filter[1]          = ff_hevc_rpi_sao_band_16_neon_10;
-+        c->sao_band_filter[2]          = ff_hevc_rpi_sao_band_32_neon_10;
-+        c->sao_band_filter[3]          = ff_hevc_rpi_sao_band_48_neon_10;
-+        c->sao_band_filter[4]          = ff_hevc_rpi_sao_band_64_neon_10;
-+
-+        c->sao_edge_filter[0]          = ff_hevc_rpi_sao_edge_8_neon_10;
-+        c->sao_edge_filter[1]          = ff_hevc_rpi_sao_edge_16_neon_10;
-+        c->sao_edge_filter[2]          = ff_hevc_rpi_sao_edge_32_neon_10;
-+        c->sao_edge_filter[3]          = ff_hevc_rpi_sao_edge_48_neon_10;
-+        c->sao_edge_filter[4]          = ff_hevc_rpi_sao_edge_64_neon_10;
-+#if SAO_FILTER_N == 6
-+        c->sao_band_filter[5]          = ff_hevc_rpi_sao_band_24_neon_10;
-+        c->sao_edge_filter[5]          = ff_hevc_rpi_sao_edge_24_neon_10;
-+#endif
-+        c->sao_band_filter_c[0]        = ff_hevc_rpi_sao_band_c_8_neon_10;
-+        c->sao_band_filter_c[1]        = ff_hevc_rpi_sao_band_c_16_neon_10;
-+        c->sao_band_filter_c[2]        = ff_hevc_rpi_sao_band_c_32_neon_10;
-+
-+        c->sao_edge_filter_c[0]        = ff_hevc_rpi_sao_edge_c_8_neon_10;
-+        c->sao_edge_filter_c[1]        = ff_hevc_rpi_sao_edge_c_16_neon_10;
-+        c->sao_edge_filter_c[2]        = ff_hevc_rpi_sao_edge_c_32_neon_10;
-+
-+#if SAO_FILTER_N == 6
-+        c->sao_band_filter_c[5]        = ff_hevc_rpi_sao_band_c_24_neon_10;
-+        c->sao_edge_filter_c[5]        = ff_hevc_rpi_sao_edge_c_24_neon_10;
-+#endif
-+    }
-+
-+    assert(offsetof(HEVCRpiMvField, mv) == 0);
-+    assert(offsetof(HEVCRpiMvField, ref_idx) == 8);
-+    assert(offsetof(HEVCRpiMvField, pred_flag) == 10);
-+    c->hevc_deblocking_boundary_strengths = ff_hevc_rpi_deblocking_boundary_strengths_neon;
-+    c->cpy_blk = ff_hevc_rpi_cpy_blks8x4_neon;
-+}
---- /dev/null
-+++ b/libavcodec/arm/rpi_hevcdsp_res16_neon.S
-@@ -0,0 +1,620 @@
-+/*
-+Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
-+All rights reserved.
-+
-+Redistribution and use in source and binary forms, with or without
-+modification, are permitted provided that the following conditions are met:
-+    * Redistributions of source code must retain the above copyright
-+      notice, this list of conditions and the following disclaimer.
-+    * Redistributions in binary form must reproduce the above copyright
-+      notice, this list of conditions and the following disclaimer in the
-+      documentation and/or other materials provided with the distribution.
-+    * Neither the name of the copyright holder nor the
-+      names of its contributors may be used to endorse or promote products
-+      derived from this software without specific prior written permission.
-+
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
-+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+
-+Authors: John Cox, Ben Avison
-+*/
-+
-+#include "libavutil/arm/asm.S"
-+#include "neon.S"
-+
-+ .arch_extension mp @ enable PLDW
-+
-+#define BIT_DEPTH 10
-+
-+.macro clip16_4 Q0, Q1, Q2, Q3, Q_MIN, Q_MAX
-+        vmax.s16  \Q0, \Q_MIN
-+        vmax.s16  \Q1, \Q_MIN
-+        vmax.s16  \Q2, \Q_MIN
-+        vmax.s16  \Q3, \Q_MIN
-+        vmin.s16  \Q0, \Q_MAX
-+        vmin.s16  \Q1, \Q_MAX
-+        vmin.s16  \Q2, \Q_MAX
-+        vmin.s16  \Q3, \Q_MAX
-+.endm
-+
-+@ add_residual4x4(
-+@  uint16_t *_dst,    [r0]
-+@  int16_t *res,      [r1]
-+@  ptrdiff_t stride)  [r2]
-+
-+function JOIN(ff_hevc_rpi_add_residual_4x4_neon_, BIT_DEPTH), export=1
-+        add         ip, r0, r2
-+        vld1.16     {q10, q11}, [r1]
-+        lsl         r2, #1
-+        vld1.16     {d0}, [r0 :64], r2
-+        vld1.16     {d1}, [ip :64], r2
-+        vld1.16     {d2}, [r0 :64]
-+        vld1.16     {d3}, [ip :64]
-+        sub         r0, r2
-+        vqadd.s16   q0,  q10
-+        sub         ip, r2
-+        vqadd.s16   q1,  q11
-+        vmov.i16    q8,  #0
-+        vmov.i16    q9,  #(1 << BIT_DEPTH) - 1
-+        vmax.s16    q0,  q0,  q8
-+        vmax.s16    q1,  q1,  q8
-+        vmin.s16    q0,  q0,  q9
-+        vmin.s16    q1,  q1,  q9
-+        vst1.16     {d0}, [r0 :64], r2
-+        vst1.16     {d1}, [ip :64], r2
-+        vst1.16     {d2}, [r0 :64]
-+        vst1.16     {d3}, [ip :64]
-+        bx          lr
-+
-+endfunc
-+
-+@ add_residual4x4_dc(
-+@  uint16_t *_dst,    [r0]
-+@  ptrdiff_t stride,  [r1]
-+@  int dc)            [r2]
-+
-+function JOIN(ff_hevc_rpi_add_residual_4x4_dc_neon_, BIT_DEPTH), export=1
-+        add         ip, r0, r1
-+        vdup.16     q15, r2
-+        lsl         r1, #1
-+        vld1.16     {d0}, [r0 :64], r1
-+        vld1.16     {d1}, [ip :64], r1
-+        vld1.16     {d2}, [r0 :64]
-+        vld1.16     {d3}, [ip :64]
-+        sub         r0, r1
-+        vqadd.s16   q0,  q15
-+        sub         ip, r1
-+        vqadd.s16   q1,  q15
-+        vmov.i16    q8,  #0
-+        vmov.i16    q9,  #(1 << BIT_DEPTH) - 1
-+        vmax.s16    q0,  q0,  q8
-+        vmax.s16    q1,  q1,  q8
-+        vmin.s16    q0,  q0,  q9
-+        vmin.s16    q1,  q1,  q9
-+        vst1.16     {d0}, [r0 :64], r1
-+        vst1.16     {d1}, [ip :64], r1
-+        vst1.16     {d2}, [r0 :64]
-+        vst1.16     {d3}, [ip :64]
-+        bx          lr
-+
-+endfunc
-+
-+
-+@ add_residual8x8(
-+@  uint16_t *_dst,    [r0]
-+@  int16_t *res,      [r1]
-+@  ptrdiff_t stride)  [r2]
-+
-+function JOIN(ff_hevc_rpi_add_residual_8x8_neon_, BIT_DEPTH), export=1
-+        mov         r3, #8
-+        vmov.i64    q8,  #0
-+        add         ip, r0, r2
-+        vmov.i16    q9,  #(1 << BIT_DEPTH) - 1
-+        lsl         r2, #1
-+1:
-+        vldm        r1!, {q10-q13}
-+        vld1.16     {q0}, [r0 :128], r2
-+        vld1.16     {q1}, [ip :128], r2
-+        vld1.16     {q2}, [r0 :128]
-+        vld1.16     {q3}, [ip :128]
-+        sub         r0, r2
-+        vqadd.s16   q0,  q10
-+        sub         ip, r2
-+        vqadd.s16   q1,  q11
-+        subs        r3, #4
-+        vqadd.s16   q2,  q12
-+        vqadd.s16   q3,  q13
-+        clip16_4    q0, q1, q2, q3, q8, q9
-+        vst1.16     {q0}, [r0 :128], r2
-+        vst1.16     {q1}, [ip :128], r2
-+        vst1.16     {q2}, [r0 :128], r2
-+        vst1.16     {q3}, [ip :128], r2
-+        bne         1b
-+        bx          lr
-+
-+endfunc
-+
-+@ add_residual4x4_dc_c(
-+@  uint16_t *_dst,    [r0]
-+@  ptrdiff_t stride,  [r1]
-+@  int dc_uv)         [r2]
-+
-+function JOIN(ff_hevc_rpi_add_residual_4x4_dc_c_neon_, BIT_DEPTH), export=1
-+        mov         r3, #4
-+        vdup.32     q15, r2
-+        b           9f
-+endfunc
-+
-+@ add_residual8x8_dc(
-+@  uint16_t *_dst,    [r0]
-+@  ptrdiff_t stride,  [r1]
-+@  int dc)            [r2]
-+
-+function JOIN(ff_hevc_rpi_add_residual_8x8_dc_neon_, BIT_DEPTH), export=1
-+        vdup.16     q15, r2
-+        mov         r3, #8
-+9:
-+        vmov.i16    q8,  #0
-+        add         ip, r0, r1
-+        vmov.i16    q9,  #(1 << BIT_DEPTH) - 1
-+        lsl         r1, #1
-+1:
-+        vld1.16     {q0}, [r0 :128], r1
-+        vld1.16     {q1}, [ip :128], r1
-+        vld1.16     {q2}, [r0 :128]
-+        vld1.16     {q3}, [ip :128]
-+        sub         r0, r1
-+        vqadd.s16   q0,  q15
-+        sub         ip, r1
-+        vqadd.s16   q1,  q15
-+        subs        r3, #4
-+        vqadd.s16   q2,  q15
-+        vqadd.s16   q3,  q15
-+        clip16_4    q0, q1, q2, q3, q8, q9
-+        vst1.16     {q0}, [r0 :128], r1
-+        vst1.16     {q1}, [ip :128], r1
-+        vst1.16     {q2}, [r0 :128], r1
-+        vst1.16     {q3}, [ip :128], r1
-+        bne         1b
-+        bx          lr
-+
-+endfunc
-+
-+@ add_residual16x16(
-+@  uint16_t *_dst,    [r0]
-+@  int16_t *res,      [r1]
-+@  ptrdiff_t stride)  [r2]
-+
-+function JOIN(ff_hevc_rpi_add_residual_16x16_neon_, BIT_DEPTH), export=1
-+        add         ip, r0, r2
-+        vmov.i16    q8,  #0
-+        lsl         r2, #1
-+        vmov.i16    q9,  #(1 << BIT_DEPTH) - 1
-+        mov         r3, #16
-+1:
-+        vldm        r1!, {q10-q13}
-+        @ For RPI Sand we could guarantee :256 but not for general
-+        @ non-RPI allocation. :128 is as good as we can claim
-+        vld1.16     {q0, q1}, [r0 :128]
-+        subs        r3, #2
-+        vld1.16     {q2, q3}, [ip :128]
-+        vqadd.s16   q0,  q10
-+        vqadd.s16   q1,  q11
-+        vqadd.s16   q2,  q12
-+        vqadd.s16   q3,  q13
-+        clip16_4    q0, q1, q2, q3, q8, q9
-+        vst1.16     {q0, q1}, [r0 :128], r2
-+        vst1.16     {q2, q3}, [ip :128], r2
-+        bne         1b
-+        bx          lr
-+endfunc
-+
-+@ add_residual8x8_dc_c(
-+@  uint16_t *_dst,    [r0]
-+@  ptrdiff_t stride,  [r1]
-+@  int dc_uv)         [r2]
-+
-+function JOIN(ff_hevc_rpi_add_residual_8x8_dc_c_neon_, BIT_DEPTH), export=1
-+        mov         r3, #8
-+        vdup.32     q15, r2
-+        b           9f
-+endfunc
-+
-+@ add_residual16x16_dc(
-+@  uint16_t *_dst,    [r0]
-+@  ptrdiff_t stride,  [r1]
-+@  int dc)            [r2]
-+
-+function JOIN(ff_hevc_rpi_add_residual_16x16_dc_neon_, BIT_DEPTH), export=1
-+        vdup.i16    q15, r2
-+        mov         r3, #16
-+9:
-+        vmov.i16    q8,  #0
-+        add         ip, r0, r1
-+        vmov.i16    q9,  #(1 << BIT_DEPTH) - 1
-+        lsl         r1, #1
-+1:
-+        @ For RPI Sand we could guarantee :256 but not for general
-+        @ non-RPI allocation. :128 is as good as we can claim
-+        vld1.16     {q0, q1}, [r0 :128]
-+        subs        r3, #2
-+        vqadd.s16   q0,  q15
-+        vqadd.s16   q1,  q15
-+        vld1.16     {q2, q3}, [ip :128]
-+        vqadd.s16   q2,  q15
-+        vqadd.s16   q3,  q15
-+        clip16_4    q0, q1, q2, q3, q8, q9
-+        vst1.16     {q0, q1}, [r0 :128], r1
-+        vst1.16     {q2, q3}, [ip :128], r1
-+        bne         1b
-+        bx          lr
-+
-+endfunc
-+
-+
-+@ add_residual32x32(
-+@  uint16_t *_dst,    [r0]
-+@  int16_t *res,      [r1]
-+@  ptrdiff_t stride)  [r2]
-+
-+function JOIN(ff_hevc_rpi_add_residual_32x32_neon_, BIT_DEPTH), export=1
-+        push        {lr}
-+        mov         r3, #32
-+        vmov.i16    q8,  #0
-+        add         lr, r0, r2
-+        vmov.i16    q9,  #(1 << BIT_DEPTH) - 1
-+        add         ip, r0, #32
-+1:
-+        vldm        r1!, {q10-q13}
-+        vldm        r0,  {q0-q3}
-+        vqadd.s16   q0,  q10
-+          pldw        [lr]
-+        vqadd.s16   q1,  q11
-+          add         lr, r2
-+        vqadd.s16   q2,  q12
-+        subs        r3, #1
-+        vqadd.s16   q3,  q13
-+        clip16_4    q0, q1, q2, q3, q8, q9
-+        vst1.16     {q0-q1}, [r0], r2
-+        vst1.16     {q2-q3}, [ip], r2
-+        bne         1b
-+        pop         {pc}
-+
-+endfunc
-+
-+@ add_residual16x16_dc_c(
-+@  uint16_t *_dst,    [r0]
-+@  ptrdiff_t stride,  [r1]
-+@  int dc_uv)         [r2]
-+
-+function JOIN(ff_hevc_rpi_add_residual_16x16_dc_c_neon_, BIT_DEPTH), export=1
-+        mov         r3, #16
-+        vdup.32     q15, r2
-+        b           9f
-+endfunc
-+
-+@ add_residual32x32_dc(
-+@  uint16_t *_dst,    [r0]
-+@  ptrdiff_t stride,  [r1]
-+@  int dc)            [r2]
-+
-+function JOIN(ff_hevc_rpi_add_residual_32x32_dc_neon_, BIT_DEPTH), export=1
-+        vdup.16     q15, r2
-+        mov         r3, #32
-+9:
-+        vmov.i16    q8,  #0
-+        vmov.i16    q9,  #(1 << BIT_DEPTH) - 1
-+        add         ip, r0, #32
-+1:
-+        vldm        r0,  {q0-q3}
-+        vqadd.s16   q0,  q15
-+        subs        r3, #1
-+        vqadd.s16   q1,  q15
-+        vqadd.s16   q2,  q15
-+        vqadd.s16   q3,  q15
-+        clip16_4    q0, q1, q2, q3, q8, q9
-+        vst1.16     {q0-q1}, [r0], r1
-+        vst1.16     {q2-q3}, [ip], r1
-+        bne         1b
-+        bx          lr
-+
-+endfunc
-+
-+@ ============================================================================
-+@ U add
-+
-+@ add_residual4x4_u(
-+@   uint16_t *_dst,       [r0]
-+@   const int16_t *res,   [r1]
-+@   ptrdiff_t stride,     [r2]
-+@   int dc)               [r3]
-+
-+function JOIN(ff_hevc_rpi_add_residual_4x4_u_neon_, BIT_DEPTH), export=1
-+        vdup.16     q15, r3
-+        add         ip, r0, r2
-+        vld1.16     {q10, q11}, [r1 :256]
-+        lsl         r2, #1
-+        vld2.16     {d0, d2}, [r0 :128], r2
-+        vld2.16     {d1, d3}, [ip :128], r2
-+        vld2.16     {d4, d6}, [r0 :128]
-+        vld2.16     {d5, d7}, [ip :128]
-+        sub         r0, r2
-+        vmov.i16    q8,  #0
-+        sub         ip, r2
-+        vmov.i16    q9,  #(1 << BIT_DEPTH) - 1
-+
-+        vqadd.s16   q0,  q10
-+        vqadd.s16   q1,  q15
-+        vqadd.s16   q2,  q11
-+        vqadd.s16   q3,  q15
-+        clip16_4    q0, q1, q2, q3, q8, q9
-+
-+        vst2.16     {d0, d2}, [r0 :128], r2
-+        vst2.16     {d1, d3}, [ip :128], r2
-+        vst2.16     {d4, d6}, [r0 :128]
-+        vst2.16     {d5, d7}, [ip :128]
-+        bx          lr
-+endfunc
-+
-+@ add_residual8x8_u(
-+@   uint16_t *_dst,       [r0]
-+@   const int16_t *res,   [r1]
-+@   ptrdiff_t stride,     [r2]
-+@   int dc)               [r3]
-+
-+function JOIN(ff_hevc_rpi_add_residual_8x8_u_neon_, BIT_DEPTH), export=1
-+        vdup.16     q15, r3
-+        mov         r3, #8
-+        vmov.i16    q8,  #0
-+        add         ip, r0, r2
-+        vmov.i16    q9,  #(1 << BIT_DEPTH) - 1
-+        lsl         r2, #1
-+1:
-+        vld2.16     {q0, q1}, [r0 :256]
-+        subs        r3, #2
-+        vld2.16     {q2, q3}, [ip :256]
-+        vld1.16     {q10, q11}, [r1 :256]!
-+        vqadd.s16   q0,  q10
-+        vqadd.s16   q1,  q15
-+        vqadd.s16   q2,  q11
-+        vqadd.s16   q3,  q15
-+        clip16_4    q0, q1, q2, q3, q8, q9
-+        vst2.16     {q0, q1}, [r0 :256], r2
-+        vst2.16     {q2, q3}, [ip :256], r2
-+        bne         1b
-+        bx          lr
-+endfunc
-+
-+@ add_residual16x16_u(
-+@   uint16_t *_dst,       [r0]
-+@   const int16_t *res,   [r1]
-+@   ptrdiff_t stride,     [r2]
-+@   int dc)               [r3]
-+
-+function JOIN(ff_hevc_rpi_add_residual_16x16_u_neon_, BIT_DEPTH), export=1
-+        push        {lr}
-+        vdup.16     q15, r3
-+        mov         r3, #16
-+        vmov.i16    q8,  #0
-+        add         lr, r0, r2
-+        vmov.i16    q9,  #(1 << BIT_DEPTH) - 1
-+        add         ip, r0, #32
-+1:
-+        vld2.16     {q0, q1}, [r0 :256]
-+        vld2.16     {q2, q3}, [ip :256]
-+        vld1.16     {q10, q11}, [r1 :256]!
-+        vqadd.s16   q0,  q10
-+          pldw        [lr]
-+        vqadd.s16   q1,  q15
-+          add         lr, r2
-+        vqadd.s16   q2,  q11
-+        subs        r3, #1
-+        vqadd.s16   q3,  q15
-+        clip16_4    q0, q1, q2, q3, q8, q9
-+        vst2.16     {q0, q1}, [r0 :256], r2
-+        vst2.16     {q2, q3}, [ip :256], r2
-+        bne         1b
-+        pop         {pc}
-+endfunc
-+
-+@ ============================================================================
-+@ V add
-+
-+@ add_residual4x4_v(
-+@   uint16_t *_dst,       [r0]
-+@   const int16_t *res,   [r1]
-+@   ptrdiff_t stride,     [r2]
-+@   int dc)               [r3]
-+
-+function JOIN(ff_hevc_rpi_add_residual_4x4_v_neon_, BIT_DEPTH), export=1
-+        vdup.16     q15, r3
-+        add         ip, r0, r2
-+        vld1.16     {q10, q11}, [r1 :256]
-+        lsl         r2, #1
-+        vld2.16     {d0, d2}, [r0 :128], r2
-+        vld2.16     {d1, d3}, [ip :128], r2
-+        vld2.16     {d4, d6}, [r0 :128]
-+        vld2.16     {d5, d7}, [ip :128]
-+        sub         r0, r2
-+        vmov.i16    q8,  #0
-+        sub         ip, r2
-+        vmov.i16    q9,  #(1 << BIT_DEPTH) - 1
-+
-+        vqadd.s16   q0,  q15
-+        vqadd.s16   q1,  q10
-+        vqadd.s16   q2,  q15
-+        vqadd.s16   q3,  q11
-+        clip16_4    q0, q1, q2, q3, q8, q9
-+
-+        vst2.16     {d0, d2}, [r0 :128], r2
-+        vst2.16     {d1, d3}, [ip :128], r2
-+        vst2.16     {d4, d6}, [r0 :128]
-+        vst2.16     {d5, d7}, [ip :128]
-+        bx          lr
-+endfunc
-+
-+@ add_residual8x8_v(
-+@   uint16_t *_dst,       [r0]
-+@   const int16_t *res,   [r1]
-+@   ptrdiff_t stride,     [r2]
-+@   int dc)               [r3]
-+
-+function JOIN(ff_hevc_rpi_add_residual_8x8_v_neon_, BIT_DEPTH), export=1
-+        vdup.16     q15, r3
-+        mov         r3, #8
-+        vmov.i16    q8,  #0
-+        add         ip, r0, r2
-+        vmov.i16    q9,  #(1 << BIT_DEPTH) - 1
-+        lsl         r2, #1
-+1:
-+        vld2.16     {q0, q1}, [r0 :256]
-+        subs        r3, #2
-+        vld2.16     {q2, q3}, [ip :256]
-+        vld1.16     {q10, q11}, [r1 :256]!
-+        vqadd.s16   q0,  q15
-+        vqadd.s16   q1,  q10
-+        vqadd.s16   q2,  q15
-+        vqadd.s16   q3,  q11
-+        clip16_4    q0, q1, q2, q3, q8, q9
-+        vst2.16     {q0, q1}, [r0 :256], r2
-+        vst2.16     {q2, q3}, [ip :256], r2
-+        bne         1b
-+        bx          lr
-+endfunc
-+
-+@ add_residual16x16_v(
-+@   uint16_t *_dst,       [r0]
-+@   const int16_t *res,   [r1]
-+@   ptrdiff_t stride,     [r2]
-+@   int dc)               [r3]
-+
-+function JOIN(ff_hevc_rpi_add_residual_16x16_v_neon_, BIT_DEPTH), export=1
-+        push        {lr}
-+        vdup.16     q15, r3
-+        mov         r3, #16
-+        vmov.i16    q8,  #0
-+        add         lr, r0, r2
-+        vmov.i16    q9,  #(1 << BIT_DEPTH) - 1
-+        add         ip, r0, #32
-+1:
-+        vld2.16     {q0, q1}, [r0 :256]
-+        vld2.16     {q2, q3}, [ip :256]
-+        vld1.16     {q10, q11}, [r1 :256]!
-+        vqadd.s16   q0,  q15
-+          pldw        [lr]
-+        vqadd.s16   q1,  q10
-+          add         lr, r2
-+        vqadd.s16   q2,  q15
-+        subs        r3, #1
-+        vqadd.s16   q3,  q11
-+        clip16_4    q0, q1, q2, q3, q8, q9
-+        vst2.16     {q0, q1}, [r0 :256], r2
-+        vst2.16     {q2, q3}, [ip :256], r2
-+        bne         1b
-+        pop         {pc}
-+endfunc
-+
-+@ ============================================================================
-+@ U & V add
-+
-+@ add_residual4x4_c(
-+@   uint16_t *_dst,       [r0]
-+@   const int16_t *res,   [r1]
-+@   ptrdiff_t stride)     [r2]
-+
-+function JOIN(ff_hevc_rpi_add_residual_4x4_c_neon_, BIT_DEPTH), export=1
-+        vmov.i16    q8,  #0
-+        add         ip, r0, r2
-+        vmov.i16    q9,  #(1 << BIT_DEPTH) - 1
-+        lsl         r2, #1
-+        vldm        r1, {q10-q13}
-+        vld2.16     {d0, d2}, [r0 :128], r2
-+        vld2.16     {d1, d3}, [ip :128], r2
-+        vld2.16     {d4, d6}, [r0 :128]
-+        vld2.16     {d5, d7}, [ip :128]
-+
-+        sub         r0, r2
-+        vqadd.s16   q0,  q10
-+        sub         ip, r2
-+        vqadd.s16   q1,  q12
-+        vqadd.s16   q2,  q11
-+        vqadd.s16   q3,  q13
-+        clip16_4    q0, q1, q2, q3, q8, q9
-+
-+        vst2.16     {d0, d2}, [r0 :128], r2
-+        vst2.16     {d1, d3}, [ip :128], r2
-+        vst2.16     {d4, d6}, [r0 :128]
-+        vst2.16     {d5, d7}, [ip :128]
-+        bx          lr
-+endfunc
-+
-+@ add_residual8x8_c(
-+@   uint16_t *_dst,       [r0]
-+@   const int16_t *res,   [r1]
-+@   ptrdiff_t stride)     [r2]
-+
-+function JOIN(ff_hevc_rpi_add_residual_8x8_c_neon_, BIT_DEPTH), export=1
-+        push        {lr}
-+        add         ip, r0, r2
-+        lsl         r2, #1
-+        vmov.i16    q8,  #0
-+        add         r3, r1, #(8*8*2)  @ Offset to V
-+        vmov.i16    q9,  #(1 << BIT_DEPTH) - 1
-+        mov         lr, #8
-+1:
-+        vld1.16     {q10, q11}, [r1 :256]!
-+        subs        lr, #2
-+        vld2.16     {q0, q1}, [r0 :256]
-+        vld2.16     {q2, q3}, [ip :256]
-+        vld1.16     {q12, q13}, [r3 :256]!
-+        vqadd.s16   q0,  q10
-+        vqadd.s16   q1,  q12
-+        vqadd.s16   q2,  q11
-+        vqadd.s16   q3,  q13
-+        clip16_4    q0, q1, q2, q3, q8, q9
-+        vst2.16     {q0, q1}, [r0 :256], r2
-+        vst2.16     {q2, q3}, [ip :256], r2
-+        bne         1b
-+        pop         {pc}
-+endfunc
-+
-+@ add_residual16x16_c(
-+@   uint16_t *_dst,       [r0]
-+@   const int16_t *res,   [r1]
-+@   ptrdiff_t stride)     [r2]
-+
-+function JOIN(ff_hevc_rpi_add_residual_16x16_c_neon_, BIT_DEPTH), export=1
-+        push        {r4, lr}
-+        vmov.i16    q8,  #0
-+        add         r3,  r1, #(16*16*2)  @ Offset to V
-+        vmov.i16    q9,  #(1 << BIT_DEPTH) - 1
-+        add         ip, r0, #32
-+        add         r4, r0, r2
-+        mov         lr, #16
-+1:
-+        vld2.16     {q0, q1}, [r0 :256]
-+        vld2.16     {q2, q3}, [ip :256]
-+        vld1.16     {q10, q11}, [r1 :256]!
-+        vld1.16     {q12, q13}, [r3 :256]!
-+        vqadd.s16   q0,  q10
-+          pldw        [r4]
-+        vqadd.s16   q1,  q12
-+          add         r4, r2
-+        vqadd.s16   q2,  q11
-+        subs        lr, #1
-+        vqadd.s16   q3,  q13
-+        clip16_4    q0, q1, q2, q3, q8, q9
-+        vst2.16     {q0, q1}, [r0 :256], r2
-+        vst2.16     {q2, q3}, [ip :256], r2
-+        bne         1b
-+        pop         {r4,pc}
-+endfunc
-+
---- /dev/null
-+++ b/libavcodec/arm/rpi_hevcdsp_res8_neon.S
-@@ -0,0 +1,741 @@
-+/*
-+Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
-+All rights reserved.
-+
-+Redistribution and use in source and binary forms, with or without
-+modification, are permitted provided that the following conditions are met:
-+    * Redistributions of source code must retain the above copyright
-+      notice, this list of conditions and the following disclaimer.
-+    * Redistributions in binary form must reproduce the above copyright
-+      notice, this list of conditions and the following disclaimer in the
-+      documentation and/or other materials provided with the distribution.
-+    * Neither the name of the copyright holder nor the
-+      names of its contributors may be used to endorse or promote products
-+      derived from this software without specific prior written permission.
-+
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
-+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+
-+Authors: John Cox, Ben Avison
-+*/
-+
-+#include "libavutil/arm/asm.S"
-+#include "neon.S"
-+
-+ .arch_extension mp @ enable PLDW
-+
-+@ General notes:
-+@
-+@ Residual is generally only guaranteed to be clipped to 16 bits.
-+@ This means that we do need to do vmovl, vqadd, vqmovun
-+@ rather than vaddw, vqmovun (if we were clipped to 15 then we could get away
-+@ with this).
-+@
-+@ There is an exception for the DC case because its transform is guaranteed
-+@ to be small enough that overflow cannot occur during the first add.
-+
-+@ ============================================================================
-+@ Y add
-+
-+function ff_hevc_rpi_add_residual_4x4_neon_8, export=1
-+        add         ip, r0, r2
-+        vld1.16     {q0, q1}, [r1]
-+        lsl         r2, #1
-+        vld1.32     d4[0], [r0], r2
-+        rsb         r3, r2, #0
-+        vld1.32     d4[1], [ip], r2
-+        vld1.32     d5[0], [r0], r3
-+        vld1.32     d5[1], [ip], r3
-+        vmovl.u8    q8, d4
-+        vmovl.u8    q9, d5
-+        vqadd.s16   q0, q8
-+        vqadd.s16   q1, q9
-+        vqmovun.s16 d0, q0
-+        vqmovun.s16 d1, q1
-+        vst1.32     d0[0], [r0], r2
-+        vst1.32     d0[1], [ip], r2
-+        vst1.32     d1[0], [r0]
-+        vst1.32     d1[1], [ip]
-+        bx          lr
-+endfunc
-+
-+function ff_hevc_rpi_add_residual_8x8_neon_8, export=1
-+        push        {r4, lr}
-+        vld1.16     {q0, q1}, [r1]!
-+        add         ip, r0, r2
-+        vld1.8      {d6}, [r0]
-+        add         r4, r0, r2, lsl #1
-+        vld1.8      {d7}, [ip]
-+        add         lr, ip, r2, lsl #1
-+        lsl         r2, #1
-+        mov         r3, #8-2
-+        vmovl.u8    q2, d6
-+        vmovl.u8    q3, d7
-+        vqadd.s16   q2, q0
-+        vqadd.s16   q3, q1
-+1:
-+          vld1.16     {q0, q1}, [r1]!
-+        subs        r3, #2
-+        vqmovun.s16 d4, q2
-+        vqmovun.s16 d5, q3
-+          vld1.8      {d6}, [r4], r2
-+          vld1.8      {d7}, [lr], r2
-+        vst1.8      {d4}, [r0], r2
-+        vst1.8      {d5}, [ip], r2
-+          vmovl.u8    q2, d6
-+            pldw        [r4]
-+          vmovl.u8    q3, d7
-+          vqadd.s16   q2, q0
-+          vqadd.s16   q3, q1
-+        bne         1b
-+
-+          vqmovun.s16 d4, q2
-+          vqmovun.s16 d5, q3
-+          vst1.8      {d4}, [r0]
-+          vst1.8      {d5}, [ip]
-+          pop         {r4, pc}
-+endfunc
-+
-+function ff_hevc_rpi_add_residual_16x16_neon_8, export=1
-+        vld1.16     {q0, q1}, [r1]!
-+        add         ip, r0, r2
-+        vld1.8      {q3}, [r0]
-+        mov         r3, #16-1
-+        vmovl.u8    q2, d6
-+        vmovl.u8    q3, d7
-+        vqadd.s16   q2, q0
-+        vqadd.s16   q3, q1
-+1:
-+          vld1.16     {q0, q1}, [r1]!
-+        subs        r3, #1
-+        vqmovun.s16 d4, q2
-+        vqmovun.s16 d5, q3
-+          vld1.8      {q3}, [ip], r2
-+        vst1.8      {q2}, [r0], r2
-+          vmovl.u8    q2, d6
-+            pldw        [ip]
-+          vmovl.u8    q3, d7
-+          vqadd.s16   q2, q0
-+          vqadd.s16   q3, q1
-+        bne         1b
-+
-+          vqmovun.s16 d4, q2
-+          vqmovun.s16 d5, q3
-+          vst1.8      {q2}, [r0]
-+          bx          lr
-+endfunc
-+
-+function ff_hevc_rpi_add_residual_32x32_neon_8, export=1
-+        vldm        r1!, {q0-q3}
-+        vld1.8      {q8, q9}, [r0]
-+        add         ip, r0, r2
-+        vmovl.u8    q10, d16
-+        mov         r3, #32-1
-+        vmovl.u8    q11, d17
-+        vmovl.u8    q12, d18
-+        vmovl.u8    q13, d19
-+        vqadd.s16   q10, q0
-+        vqadd.s16   q11, q1
-+        vqadd.s16   q12, q2
-+        vqadd.s16   q13, q3
-+1:
-+          vldm        r1!, {q0-q3}
-+        vqmovun.s16 d20, q10
-+        vqmovun.s16 d21, q11
-+        vqmovun.s16 d22, q12
-+        vqmovun.s16 d23, q13
-+          vld1.8      {q8, q9}, [ip], r2
-+        subs        r3, #1
-+        vst1.8      {q10, q11}, [r0], r2
-+          vmovl.u8    q10, d16
-+            pldw        [ip]
-+          vmovl.u8    q11, d17
-+          vmovl.u8    q12, d18
-+          vmovl.u8    q13, d19
-+          vqadd.s16   q10, q0
-+          vqadd.s16   q11, q1
-+          vqadd.s16   q12, q2
-+          vqadd.s16   q13, q3
-+        bne     1b
-+
-+          vqmovun.s16 d20, q10
-+          vqmovun.s16 d21, q11
-+          vqmovun.s16 d22, q12
-+          vqmovun.s16 d23, q13
-+          vst1.8      {q10, q11}, [r0]
-+          bx          lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_add_residual_4x4_dc_neon_8(
-+@   uint8_t * dst,              // [r0]
-+@   unsigned int stride,        // [r1]
-+@   int dc)                     // [r2]
-+
-+function ff_hevc_rpi_add_residual_4x4_dc_neon_8, export=1
-+        add         ip, r0, r1
-+        vdup.16     q15, r2
-+        lsl         r1, #1
-+        vld1.32     d4[0], [r0], r1
-+        rsb         r3, r1, #0
-+        vld1.32     d4[1], [ip], r1
-+        vld1.32     d5[0], [r0], r3
-+        vld1.32     d5[1], [ip], r3
-+        vaddw.u8    q0, q15, d4
-+        vaddw.u8    q1, q15, d5
-+        vqmovun.s16 d0, q0
-+        vqmovun.s16 d1, q1
-+        vst1.32     d0[0], [r0], r1
-+        vst1.32     d0[1], [ip], r1
-+        vst1.32     d1[0], [r0]
-+        vst1.32     d1[1], [ip]
-+        bx          lr
-+endfunc
-+
-+@ ============================================================================
-+@ DC Y or C add
-+
-+@ ff_hevc_rpi_add_residual_4x4_dc_c_neon_8(
-+@   uint8_t * dst,              // [r0]
-+@   unsigned int stride,        // [r1]
-+@   int dc)                     // [r2]
-+
-+function ff_hevc_rpi_add_residual_4x4_dc_c_neon_8, export=1
-+        mov         r3,  #4-2
-+        vdup.32     q15, r2
-+        b           1f
-+endfunc
-+
-+@ ff_hevc_rpi_add_residual_8x8_dc_neon_8(
-+@   uint8_t * dst,              // [r0]
-+@   unsigned int stride,        // [r1]
-+@   int dc)                     // [r2]
-+
-+function ff_hevc_rpi_add_residual_8x8_dc_neon_8, export=1
-+        vdup.16     q15, r2
-+        mov         r3, #8-2
-+1:      vld1.8      d16, [r0]
-+        add         ip, r0, r1
-+        push        {r4, lr}
-+        vld1.8      d17, [ip]
-+        add         r4, r0, r1, lsl #1
-+        vaddw.u8    q0, q15, d16
-+        lsl         r1, #1
-+        vaddw.u8    q1, q15, d17
-+        add         lr, ip, r1
-+1:
-+          vld1.8      {d16}, [r4], r1
-+          vld1.8      {d17}, [lr], r1
-+        subs        r3, #2
-+        vqmovun.s16 d4, q0
-+        vqmovun.s16 d5, q1
-+          vaddw.u8    q0, q15, d16
-+          vaddw.u8    q1, q15, d17
-+        vst1.8      {d4}, [r0], r1
-+        vst1.8      {d5}, [ip], r1
-+        bne         1b
-+
-+          vqmovun.s16 d4, q0
-+          vqmovun.s16 d5, q1
-+          vst1.8      {d4}, [r0]
-+          vst1.8      {d5}, [ip]
-+          pop         {r4, pc}
-+endfunc
-+
-+
-+@ ff_hevc_rpi_add_residual_8x8_dc_c_neon_8(
-+@   uint8_t * dst,              // [r0]
-+@   unsigned int stride,        // [r1]
-+@   int dc)                     // [r2]
-+
-+function ff_hevc_rpi_add_residual_8x8_dc_c_neon_8, export=1
-+        mov         r3,  #8-1
-+        vdup.32     q15, r2
-+        b           1f
-+endfunc
-+
-+@ ff_hevc_rpi_add_residual_16x16_dc_neon_8(
-+@   uint8_t * dst,              // [r0]
-+@   unsigned int stride,        // [r1]
-+@   int dc)                     // [r2]
-+
-+function ff_hevc_rpi_add_residual_16x16_dc_neon_8, export=1
-+        vdup.16     q15, r2
-+        mov         r3,  #16-1
-+1:      vld1.8      {q8}, [r0]
-+        add         ip, r0, r1
-+        vaddw.u8    q0, q15, d16
-+        vaddw.u8    q1, q15, d17
-+1:
-+          vld1.8      {q8}, [ip], r1
-+        subs        r3, #1
-+        vqmovun.s16 d4, q0
-+        vqmovun.s16 d5, q1
-+          vaddw.u8    q0, q15, d16
-+          vaddw.u8    q1, q15, d17
-+        vst1.8      {q2}, [r0], r1
-+        bne         1b
-+
-+          vqmovun.s16 d4, q0
-+          vqmovun.s16 d5, q1
-+          vst1.8      {q2}, [r0]
-+          bx          lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_add_residual_16x16_dc_c_neon_8(
-+@   uint8_t * dst,              // [r0]
-+@   unsigned int stride,        // [r1]
-+@   int dc)                     // [r2]
-+
-+function ff_hevc_rpi_add_residual_16x16_dc_c_neon_8, export=1
-+        mov         r3,  #16-1
-+        vdup.32     q15, r2
-+        b           1f
-+endfunc
-+
-+@ ff_hevc_rpi_add_residual_32x32_dc_neon_8(
-+@   uint8_t * dst,              // [r0]
-+@   unsigned int stride,        // [r1]
-+@   int dc)                     // [r2]
-+
-+function ff_hevc_rpi_add_residual_32x32_dc_neon_8, export=1
-+        vdup.16     q15, r2
-+        mov         r3, #32-1
-+1:      vld1.8      {q8, q9}, [r0]
-+        add         ip, r0, r1
-+        vaddw.u8    q0, q15, d16
-+        vaddw.u8    q1, q15, d17
-+        vaddw.u8    q2, q15, d18
-+        vaddw.u8    q3, q15, d19
-+1:
-+        vqmovun.s16 d20, q0
-+        vqmovun.s16 d21, q1
-+        vqmovun.s16 d22, q2
-+        vqmovun.s16 d23, q3
-+          vld1.8      {q8, q9}, [ip], r1
-+        subs        r3, #1
-+          vaddw.u8    q0, q15, d16
-+          vaddw.u8    q1, q15, d17
-+          vaddw.u8    q2, q15, d18
-+          vaddw.u8    q3, q15, d19
-+        vst1.8      {q10, q11}, [r0], r1
-+        bne     1b
-+
-+          vqmovun.s16 d20, q0
-+          vqmovun.s16 d21, q1
-+          vqmovun.s16 d22, q2
-+          vqmovun.s16 d23, q3
-+          vst1.8      {q10, q11}, [r0]
-+          bx          lr
-+endfunc
-+
-+@ ============================================================================
-+@ U add
-+
-+@ add_residual4x4_c(
-+@   uint8_t *_dst,        [r0]
-+@   const int16_t *res,   [r1]
-+@   ptrdiff_t stride,     [r2]
-+@   int dc_v)             [r3]
-+
-+function ff_hevc_rpi_add_residual_4x4_u_neon_8, export=1
-+        add         ip, r0, r2
-+        vld1.16     {q0, q1}, [r1]
-+        lsl         r2, #1
-+        vld1.8      {d16}, [r0 :64], r2
-+        vld1.8      {d17}, [ip :64], r2
-+        vld1.8      {d18}, [r0 :64]
-+        sub         r0, r2
-+        vld1.8      {d19}, [ip :64]
-+        sub         ip, r2
-+        vdup.16     q2, r3
-+        vdup.16     q3, r3
-+        vmovl.u8    q10, d16
-+        vmovl.u8    q11, d17
-+        vmovl.u8    q12, d18
-+        vmovl.u8    q13, d19
-+        vzip.16     q0, q2
-+        vzip.16     q1, q3
-+        vqadd.s16   q0,  q10
-+        vqadd.s16   q2,  q11
-+        vqadd.s16   q1,  q12
-+        vqadd.s16   q3,  q13
-+        vqmovun.s16 d0,  q0
-+        vqmovun.s16 d1,  q2
-+        vqmovun.s16 d2,  q1
-+        vqmovun.s16 d3,  q3
-+        vst1.8      {d0}, [r0 :64], r2
-+        vst1.8      {d1}, [ip :64], r2
-+        vst1.8      {d2}, [r0 :64]
-+        vst1.8      {d3}, [ip :64]
-+        bx          lr
-+endfunc
-+
-+@ add_residual8x8_c(
-+@   uint8_t *_dst,        [r0]
-+@   const int16_t *res,   [r1]
-+@   ptrdiff_t stride)     [r2]
-+@   int dc_v)             [r3]
-+
-+function ff_hevc_rpi_add_residual_8x8_u_neon_8, export=1
-+        vdup.16     q15, r3
-+        add         ip, r0, r2
-+        push        {r4, lr}
-+        vld2.8      {d16, d17}, [r0 :128]
-+        lsl         r2, #1
-+        vld2.8      {d18, d19}, [ip :128]
-+        mov         r3, #8-2
-+        vld1.16     {q0, q1}, [r1 :256]!
-+        add         r4, r0, r2
-+        vmovl.u8    q10, d16
-+        add         lr, ip, r2
-+        vmovl.u8    q11, d18
-+        vqadd.s16   q0,  q10
-+        vaddw.u8    q2,  q15, d17
-+        vqadd.s16   q1,  q11
-+        vaddw.u8    q3,  q15, d19
-+1:
-+        vqmovun.s16 d20,  q0
-+        vqmovun.s16 d21,  q2
-+          vld2.8      {d16, d17}, [r4 :128], r2
-+        subs        r3, #2
-+        vqmovun.s16 d22,  q1
-+        vqmovun.s16 d23,  q3
-+        vst2.8      {d20, d21}, [r0 :128], r2
-+          vld2.8      {d18, d19}, [lr :128], r2
-+        vst2.8      {d22, d23}, [ip :128], r2
-+          vld1.16     {q0, q1}, [r1 :256]!
-+          vmovl.u8    q10, d16
-+          vmovl.u8    q11, d18
-+          vqadd.s16   q0,  q10
-+          vaddw.u8    q2,  q15, d17
-+          vqadd.s16   q1,  q11
-+          vaddw.u8    q3,  q15, d19
-+        bne         1b
-+
-+          vqmovun.s16 d20,  q0
-+          vqmovun.s16 d21,  q2
-+          vqmovun.s16 d22,  q1
-+          vqmovun.s16 d23,  q3
-+          vst2.8      {d20, d21}, [r0 :128]
-+          vst2.8      {d22, d23}, [ip :128]
-+          pop         {r4, pc}
-+endfunc
-+
-+@ add_residual16x16_u(
-+@   uint8_t *_dst,        [r0]
-+@   const int16_t *res,   [r1]
-+@   ptrdiff_t stride)     [r2]
-+@   int dc_v)             [r3]
-+
-+function ff_hevc_rpi_add_residual_16x16_u_neon_8, export=1
-+        vdup.16     q15, r3
-+        add         ip, r0, r2
-+        vld2.8      {q8, q9}, [r0 :256]
-+        mov         r3, #16-1
-+        vld1.16     {q0, q1}, [r1 :256]!
-+        vmovl.u8    q11, d16
-+        vmovl.u8    q12, d17
-+        vqadd.s16   q0,  q11
-+        vaddw.u8    q11, q15, d18
-+        vqadd.s16   q1,  q12
-+        vaddw.u8    q12, q15, d19
-+1:
-+          vld2.8      {q8, q9}, [ip :256], r2
-+        subs        r3, #1
-+        vqmovun.s16 d20, q0
-+        vqmovun.s16 d22, q11
-+        vqmovun.s16 d21, q1
-+        vqmovun.s16 d23, q12
-+          vld1.16     {q0, q1}, [r1 :256]!
-+        vst2.8      {q10, q11}, [r0 :256], r2
-+          vmovl.u8    q11, d16
-+            pldw        [ip]
-+          vmovl.u8    q12, d17
-+          vqadd.s16   q0,  q11
-+          vaddw.u8    q11, q15, d18
-+          vqadd.s16   q1,  q12
-+          vaddw.u8    q12, q15, d19
-+        bne         1b
-+
-+          vqmovun.s16 d20, q0
-+          vqmovun.s16 d22, q11
-+          vqmovun.s16 d21, q1
-+          vqmovun.s16 d23, q12
-+          vst2.8      {q10, q11}, [r0 :256]
-+          bx          lr
-+endfunc
-+
-+@ ============================================================================
-+@ V add
-+
-+@ add_residual4x4_v(
-+@   uint8_t *_dst,        [r0]
-+@   const int16_t *res,   [r1]
-+@   ptrdiff_t stride)     [r2]
-+
-+function ff_hevc_rpi_add_residual_4x4_v_neon_8, export=1
-+        add         ip, r0, r2
-+        vld1.16     {q2, q3}, [r1]
-+        lsl         r2, #1
-+        vld1.8      {d16}, [r0 :64], r2
-+        vld1.8      {d17}, [ip :64], r2
-+        vld1.8      {d18}, [r0 :64]
-+        sub         r0, r2
-+        vld1.8      {d19}, [ip :64]
-+        sub         ip, r2
-+        vdup.16     q0, r3
-+        vdup.16     q1, r3
-+        vmovl.u8    q10, d16
-+        vmovl.u8    q11, d17
-+        vmovl.u8    q12, d18
-+        vmovl.u8    q13, d19
-+        vzip.16     q0, q2
-+        vzip.16     q1, q3
-+        vqadd.s16   q0,  q10
-+        vqadd.s16   q2,  q11
-+        vqadd.s16   q1,  q12
-+        vqadd.s16   q3,  q13
-+        vqmovun.s16 d0,  q0
-+        vqmovun.s16 d1,  q2
-+        vqmovun.s16 d2,  q1
-+        vqmovun.s16 d3,  q3
-+        vst1.8      {d0}, [r0 :64], r2
-+        vst1.8      {d1}, [ip :64], r2
-+        vst1.8      {d2}, [r0 :64]
-+        vst1.8      {d3}, [ip :64]
-+        bx          lr
-+endfunc
-+
-+@ add_residual8x8_v(
-+@   uint8_t *_dst,        [r0]
-+@   const int16_t *res,   [r1]
-+@   ptrdiff_t stride)     [r2]
-+
-+function ff_hevc_rpi_add_residual_8x8_v_neon_8, export=1
-+        vdup.16     q15, r3
-+        add         ip, r0, r2
-+        push        {r4, lr}
-+        vld2.8      {d16, d17}, [r0 :128]
-+        lsl         r2, #1
-+        vld2.8      {d18, d19}, [ip :128]
-+        mov         r3, #8-2
-+        vld1.16     {q0, q1}, [r1 :256]!
-+        add         r4, r0, r2
-+        vmovl.u8    q10, d17
-+        add         lr, ip, r2
-+        vmovl.u8    q11, d19
-+        vqadd.s16   q0,  q10
-+        vaddw.u8    q2,  q15, d16
-+        vqadd.s16   q1,  q11
-+        vaddw.u8    q3,  q15, d18
-+1:
-+        vqmovun.s16 d20,  q2
-+        vqmovun.s16 d21,  q0
-+          vld2.8      {d16, d17}, [r4 :128], r2
-+        subs        r3, #2
-+        vqmovun.s16 d22,  q3
-+        vqmovun.s16 d23,  q1
-+        vst2.8      {d20, d21}, [r0 :128], r2
-+          vld2.8      {d18, d19}, [lr :128], r2
-+        vst2.8      {d22, d23}, [ip :128], r2
-+          vld1.16     {q0, q1}, [r1 :256]!
-+          vmovl.u8    q10, d17
-+          vmovl.u8    q11, d19
-+          vqadd.s16   q0,  q10
-+          vaddw.u8    q2,  q15, d16
-+          vqadd.s16   q1,  q11
-+          vaddw.u8    q3,  q15, d18
-+        bne         1b
-+
-+          vqmovun.s16 d20,  q2
-+          vqmovun.s16 d21,  q0
-+          vqmovun.s16 d22,  q3
-+          vqmovun.s16 d23,  q1
-+          vst2.8      {d20, d21}, [r0 :128]
-+          vst2.8      {d22, d23}, [ip :128]
-+          pop         {r4, pc}
-+endfunc
-+
-+@ add_residual16x16_v(
-+@   uint8_t *_dst,        [r0]
-+@   const int16_t *res,   [r1]
-+@   ptrdiff_t stride)     [r2]
-+
-+function ff_hevc_rpi_add_residual_16x16_v_neon_8, export=1
-+        vdup.16     q15, r3
-+        add         ip, r0, r2
-+        vld2.8      {q8, q9}, [r0 :256]
-+        mov         r3, #16-1
-+        vld1.16     {q0, q1}, [r1 :256]!
-+        vmovl.u8    q11, d18
-+        vmovl.u8    q12, d19
-+        vqadd.s16   q0,  q11
-+        vaddw.u8    q11, q15, d16
-+        vqadd.s16   q1,  q12
-+        vaddw.u8    q12, q15, d17
-+1:
-+          vld2.8      {q8, q9}, [ip :256], r2
-+        subs        r3, #1
-+        vqmovun.s16 d20, q11
-+        vqmovun.s16 d22, q0
-+        vqmovun.s16 d21, q12
-+        vqmovun.s16 d23, q1
-+          vld1.16     {q0, q1}, [r1 :256]!
-+        vst2.8      {q10, q11}, [r0 :256], r2
-+          vmovl.u8    q11, d18
-+            pldw        [ip]
-+          vmovl.u8    q12, d19
-+          vqadd.s16   q0,  q11
-+          vaddw.u8    q11, q15, d16
-+          vqadd.s16   q1,  q12
-+          vaddw.u8    q12, q15, d17
-+        bne         1b
-+
-+          vqmovun.s16 d20, q11
-+          vqmovun.s16 d22, q0
-+          vqmovun.s16 d21, q12
-+          vqmovun.s16 d23, q1
-+          vst2.8      {q10, q11}, [r0 :256]
-+          bx          lr
-+endfunc
-+
-+@ ============================================================================
-+@ U & V add
-+
-+@ add_residual4x4_c(
-+@   uint8_t *_dst,        [r0]
-+@   const int16_t *res,   [r1]
-+@   ptrdiff_t stride)     [r2]
-+
-+function ff_hevc_rpi_add_residual_4x4_c_neon_8, export=1
-+        add         ip, r0, r2
-+        vld1.16     {q0, q1}, [r1]!       @ all of U
-+        lsl         r2, #1
-+        vld1.8      {d16}, [r0 :64], r2
-+        rsb         r3, r2, #0
-+        vld1.8      {d17}, [ip :64], r2
-+        vld1.16     {q2, q3}, [r1]        @ all of V
-+        vld1.8      {d18}, [r0 :64], r3
-+        vld1.8      {d19}, [ip :64], r3
-+        vmovl.u8    q10, d16
-+        vmovl.u8    q11, d17
-+        vmovl.u8    q12, d18
-+        vmovl.u8    q13, d19
-+        vzip.16     q0, q2
-+        vzip.16     q1, q3
-+        vqadd.s16   q0,  q10
-+        vqadd.s16   q2,  q11
-+        vqadd.s16   q1,  q12
-+        vqadd.s16   q3,  q13
-+        vqmovun.s16 d0,  q0
-+        vqmovun.s16 d1,  q2
-+        vqmovun.s16 d2,  q1
-+        vqmovun.s16 d3,  q3
-+        vst1.8      {d0}, [r0 :64], r2
-+        vst1.8      {d1}, [ip :64], r2
-+        vst1.8      {d2}, [r0 :64]
-+        vst1.8      {d3}, [ip :64]
-+        bx          lr
-+endfunc
-+
-+@ add_residual8x8_c(
-+@   uint8_t *_dst,        [r0]
-+@   const int16_t *res,   [r1]
-+@   ptrdiff_t stride)     [r2]
-+
-+function ff_hevc_rpi_add_residual_8x8_c_neon_8, export=1
-+        vld2.8      {d16, d17}, [r0 :128]
-+        add         r3, r1, #(8*8*2)  @ Offset to V
-+        vld1.16     {q0}, [r1 :128]!
-+        add         ip, r0, r2
-+        vld1.16     {q1}, [r3 :128]!
-+        vmovl.u8    q10, d16
-+        push        {lr}
-+        vmovl.u8    q8,  d17
-+        mov         lr, #8-1
-+        vqadd.s16   q10, q0
-+        vqadd.s16   q1,  q8
-+1:
-+          vld2.8      {d16, d17}, [ip :128], r2
-+        subs        lr, #1
-+          vld1.16     {q0}, [r1 :128]!
-+        vqmovun.s16 d20, q10
-+        vqmovun.s16 d21, q1
-+          vld1.16     {q1}, [r3 :128]!
-+        vst2.8      {d20, d21}, [r0 :128], r2
-+          vmovl.u8    q10, d16
-+            pldw        [ip]
-+          vmovl.u8    q8,  d17
-+          vqadd.s16   q10, q0
-+          vqadd.s16   q1,  q8
-+        bne         1b
-+
-+          vqmovun.s16 d20, q10
-+          vqmovun.s16 d21, q1
-+          vst2.8      {d20, d21}, [r0 :128]
-+          pop         {pc}
-+endfunc
-+
-+@ add_residual16x16_c(
-+@   uint8_t *_dst,        [r0]
-+@   const int16_t *res,   [r1]
-+@   ptrdiff_t stride)     [r2]
-+
-+function ff_hevc_rpi_add_residual_16x16_c_neon_8, export=1
-+        vld2.8      {q8, q9}, [r0 :256]
-+        add         r3, r1, #(16*16*2)  @ Offset to V
-+        vld1.16     {q0, q1}, [r1 :256]!
-+        add         ip, r0, r2
-+        vld1.16     {q2, q3}, [r3 :256]!
-+        vmovl.u8    q10, d16
-+        push        {lr}
-+        vmovl.u8    q8,  d17
-+        mov         lr, #16-1
-+        vmovl.u8    q11, d18
-+        vmovl.u8    q9,  d19
-+        vqadd.s16   q0,  q10
-+        vqadd.s16   q1,  q8
-+        vqadd.s16   q2,  q11
-+        vqadd.s16   q3,  q9
-+1:
-+          vld2.8      {q8, q9}, [ip :256], r2
-+        subs        lr, #1
-+        vqmovun.s16 d20, q0
-+        vqmovun.s16 d22, q2
-+        vqmovun.s16 d21, q1
-+        vqmovun.s16 d23, q3
-+          vld1.16     {q0, q1}, [r1 :256]!
-+        vst2.8      {d20-d23}, [r0 :256], r2
-+          vld1.16     {q2, q3}, [r3 :256]!
-+          vmovl.u8    q10, d16
-+            pldw        [ip]
-+          vmovl.u8    q8,  d17
-+          vmovl.u8    q11, d18
-+          vmovl.u8    q9,  d19
-+          vqadd.s16   q0,  q10
-+          vqadd.s16   q1,  q8
-+          vqadd.s16   q2,  q11
-+          vqadd.s16   q3,  q9
-+        bne         1b
-+
-+          vqmovun.s16 d20, q0
-+          vqmovun.s16 d22, q2
-+          vqmovun.s16 d21, q1
-+          vqmovun.s16 d23, q3
-+          vst2.8      {d20-d23}, [r0 :256]
-+          pop         {pc}
-+endfunc
-+
-+@ 32x32 chroma never occurs so NIF
-+
-+@ ============================================================================
---- /dev/null
-+++ b/libavcodec/arm/rpi_hevcdsp_sao_neon.S
-@@ -0,0 +1,2245 @@
-+/*
-+ * Copyright (c) 2014 - 2015 Seppo Tomperi <seppo.tomperi@vtt.fi>
-+ *               2017 John Cox <jc@kynesim.co.uk> (for Raspberry Pi)
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#include "libavutil/arm/asm.S"
-+#include "neon.S"
-+
-+.set EDGE_SRC_STRIDE, 160
-+
-+@ PIC jump tables are fractionally more expensive than absolute in our code
-+.set jent_pic, CONFIG_PIC
-+
-+
-+.macro sao_band_64b_8 XLAT0, XLAT1, Q_K128, I1, I2, I3, I4
-+        vshr.u8   q12, q8, #3
-+        \I1
-+        vadd.i8   q8, \Q_K128
-+        \I2
-+        vshr.u8   q13, q9, #3
-+        \I3
-+        vadd.i8   q9, \Q_K128
-+        \I4
-+        vtbl.8    d24, \XLAT0, d24
-+        vtbl.8    d25, \XLAT0, d25
-+        vtbl.8    d26, \XLAT1, d26
-+        vtbl.8    d27, \XLAT1, d27
-+
-+        vqadd.s8  q8, q12
-+        vshr.u8   q12, q10, #3
-+        vadd.i8   q10, \Q_K128
-+        vqadd.s8  q9, q13
-+        vshr.u8   q13, q11, #3
-+        vadd.i8   q11, \Q_K128
-+
-+        vtbl.8    d24, \XLAT0, d24
-+        vtbl.8    d25, \XLAT0, d25
-+        vtbl.8    d26, \XLAT1, d26
-+        vtbl.8    d27, \XLAT1, d27
-+        vqadd.s8  q10, q12
-+        vsub.i8   q8, \Q_K128
-+        vqadd.s8  q11, q13
-+        vsub.i8   q9, \Q_K128
-+        vsub.i8   q10, \Q_K128
-+        vsub.i8   q11, \Q_K128
-+.endm
-+
-+.macro sao_band_16b_8 XLAT0, XLAT1, Q_K128, L1, L2, L3, L4, L5, S1, S2, S3, S4
-+        \L1
-+        \L2
-+        \L3
-+        \L4
-+        \L5
-+        vadd.i8   q12, q8, \Q_K128
-+        vshr.u8   q8, #3
-+        vtbl.8    d16, \XLAT0, d16
-+        vtbl.8    d17, \XLAT1, d17
-+        vqadd.s8  q12, q8
-+        bmi       2f
-+1:        \L1
-+          \L2
-+          \L3
-+          \L4
-+          \L5
-+        vsub.i8   q13, q12, \Q_K128
-+          vadd.i8   q12, q8, \Q_K128
-+          vshr.u8   q8, #3
-+        \S1
-+        \S2
-+        \S3
-+        \S4
-+          vtbl.8    d16, \XLAT0, d16
-+          vtbl.8    d17, \XLAT1, d17
-+          vqadd.s8  q12, q8
-+          bpl       1b
-+2:        vsub.i8   q13, q12, \Q_K128
-+          \S1
-+          \S2
-+          \S3
-+          \S4
-+.endm
-+
-+
-+.macro clip16_4 Q0, Q1, Q2, Q3, Q_MIN, Q_MAX
-+        @ Clamp the signed 16-bit lanes of the four registers Q0-Q3 in place
-+        @ to the range given by Q_MIN (lower bound) and Q_MAX (upper bound).
-+        vmax.s16  \Q0, \Q_MIN
-+        vmax.s16  \Q1, \Q_MIN
-+        vmax.s16  \Q2, \Q_MIN
-+        vmax.s16  \Q3, \Q_MIN
-+        vmin.s16  \Q0, \Q_MAX
-+        vmin.s16  \Q1, \Q_MAX
-+        vmin.s16  \Q2, \Q_MAX
-+        vmin.s16  \Q3, \Q_MAX
-+.endm
-+
-+@ Clobbers q12, q13
-+.macro sao_band_64b_16  Q0, Q1, Q2, Q3, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth, I1, I2
-+        @ Apply SAO band offsets in place to the 16-bit samples in Q0-Q3.
-+        @   XLAT0          - vtbl table register list used for Q0 and Q2
-+        @   XLAT1          - vtbl table register list used for Q1 and Q3
-+        @   Q_MIN / Q_MAX  - clip bounds applied to the results
-+        @   bit_depth      - sample depth; table index = sample >> (bit_depth - 5)
-+        @   I1, I2         - optional caller-supplied instructions interleaved
-+        @                    with the NEON ops (may be empty)
-+        vshrn.i16 d24, \Q0, #(\bit_depth - 5)
-+        vshrn.i16 d25, \Q1, #(\bit_depth - 5)
-+        vshrn.i16 d26, \Q2, #(\bit_depth - 5)
-+        \I1
-+        vtbl.8    d24, \XLAT0, d24
-+        vshrn.i16 d27, \Q3, #(\bit_depth - 5)
-+        vtbl.8    d25, \XLAT1, d25
-+        \I2
-+        vtbl.8    d26, \XLAT0, d26
-+        vtbl.8    d27, \XLAT1, d27
-+        vaddw.s8  \Q0, d24              @ widen the signed byte offset and add
-+        vaddw.s8  \Q1, d25
-+        vaddw.s8  \Q2, d26
-+        vaddw.s8  \Q3, d27
-+        clip16_4   \Q0, \Q1, \Q2, \Q3, \Q_MIN, \Q_MAX
-+.endm
-+
-+@ Clobbers q10, q11, q12
-+.macro sao_band_32b_16 Q0, Q1, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth, L1, L2, L3, L4, L5, S1, S2, S3, S4
-+        @ Complete loop applying SAO band offsets to the 16-bit samples in
-+        @ Q0 and Q1 each iteration; results are produced in q10 and q11.
-+        @   XLAT0 / XLAT1  - vtbl table register lists for Q0 / Q1
-+        @   Q_MIN / Q_MAX  - clip bounds applied to the results
-+        @   bit_depth      - sample depth; table index = sample >> (bit_depth - 5)
-+        @   L1-L5          - caller-supplied instructions run before Q0 / Q1
-+        @                    are read (the callers in this file pass loads
-+        @                    plus a subs on ip; the flags from that subs are
-+        @                    what bmi / bpl test)
-+        @   S1-S4          - caller-supplied instructions run once q10 / q11
-+        @                    hold clipped results
-+        @ The loop is pipelined: clipping and storing of one iteration is
-+        @ interleaved with the index calculation of the next.
-+        \L1
-+        \L2
-+        \L3
-+        \L4
-+        \L5
-+        vshrn.i16 d24, \Q0, #\bit_depth - 5
-+        vshrn.i16 d25, \Q1, #\bit_depth - 5
-+        vtbl.8    d24, \XLAT0, d24
-+        vtbl.8    d25, \XLAT1, d25
-+        vaddw.s8  q10, \Q0, d24
-+        vaddw.s8  q11, \Q1, d25
-+        bmi       2f                    @ counter already negative: only the tail remains
-+1:        \L1
-+          \L2
-+          \L3
-+          \L4
-+          \L5
-+        vmax.s16  q10, \Q_MIN           @ clip previous result
-+        vmax.s16  q11, \Q_MIN
-+          vshrn.i16 d24, \Q0, #\bit_depth - 5
-+          vshrn.i16 d25, \Q1, #\bit_depth - 5
-+        vmin.s16  q10, \Q_MAX
-+        vmin.s16  q11, \Q_MAX
-+        \S1
-+        \S2
-+        \S3
-+        \S4
-+          vtbl.8    d24, \XLAT0, d24
-+          vtbl.8    d25, \XLAT1, d25
-+          vaddw.s8  q10, \Q0, d24
-+          vaddw.s8  q11, \Q1, d25
-+          bpl       1b
-+        @ Tail: clip and store the last result
-+2:        vmax.s16  q10, \Q_MIN
-+          vmax.s16  q11, \Q_MIN
-+          vmin.s16  q10, \Q_MAX
-+          vmin.s16  q11, \Q_MAX
-+          \S1
-+          \S2
-+          \S3
-+          \S4
-+.endm
-+
-+
-+@ Standard coding rules for sao_offset_abs limit it to 0-31 (Table 9-38)
-+@ so we are quite safe stuffing it into a byte array
-+@ There may be a subsequent shl by log2_sao_offset_scale_luma/chroma
-+@ (7.4.3.3.2 && 7-70) but we should still be safe to at least 12 bits of
-+@ precision
-+
-+@ This, somewhat nasty, bit of code builds the {d0-d3} translation
-+@ array via the stack
-+@ Given that sao_left_class > 28 can cause wrap we can't just poke
-+@ all 4 bytes in at once
-+@
-+@ It also loads other common regs
-+
-+@ Beware that the offset read here overreads by 6 bytes so source must be sized appropriately
-+function band_load_y
-+        @ Shared set-up for the luma band functions.  Called with bl after the
-+        @ caller has pushed 16 bytes, hence the +16 on the stack arg offsets.
-+        @ On return:
-+        @   {d0-d3} 32-byte offset translation table
-+        @   r4      r1 + r3
-+        @   ip      height - 1
-+        @ Clobbers d4 and q8.  No flag-setting instruction is used, so
-+        @ condition flags set by the caller before the bl are still valid
-+        @ afterwards.
-+        ldr       ip, [sp, #16]         @ &sao_offset_val[0]
-+        ldr       r4, [sp, #20]         @ sao_left_class
-+        vmov.i64  d4, #0
-+        vmov.i64  q0, #0
-+        pld       [r1]
-+        vld2.8    {q8}, [ip]            @ d16 = even (low) bytes of the 16-bit values
-+        sub       ip, sp, #8*5          @ where the zero array will live after vpush
-+        vmov.i64  q1, #0
-+        add       r4, ip, r4            @ &array[sao_left_class]
-+        vpush     {d0-d4}               @ Put zero array on stack
-+        vshr.u64  d16, d16, #8          @ 1st interesting val is [1]
-+        ldr       ip, [ip, #8*5 + 28]   @ height
-+        vst1.32   {d16[0]}, [r4]        @ poke the 4 offsets into the array
-+        add       r4, r1, r3
-+        vpop      {d0-d4}               @ Pop modified array
-+        sub       ip, ip, #1
-+        vorr      d0, d0, d4            @ fold bytes written past entry 31 back to the start
-+        bx        lr
-+endfunc
-+
-+@ Beware that offset reads here overread by 6 bytes so source must be sized appropriately
-+function band_load_c
-+        @ Shared set-up for the chroma band functions.  Called with bl after
-+        @ the caller has pushed 16 bytes, hence the +16 on the stack arg offsets.
-+        @ On return:
-+        @   {d0-d3} 32-byte translation table built from table1 / offset1
-+        @   {d4-d7} 32-byte translation table built from table2 / offset2
-+        @   r4      r1 + r3
-+        @   ip      height - 1
-+        @ Clobbers q8, q9, q10, q11 and d24.  No flag-setting instruction is
-+        @ used, so condition flags set by the caller before the bl are still
-+        @ valid afterwards.
-+        ldr       ip, [sp, #16]         @ &sao_offset_val1[0]
-+        ldr       r4, [sp, #20]         @ sao_left_class1
-+        vmov.i64  d24, #0
-+        vmov.i64  q10, #0
-+        pld       [r1]
-+        vld2.8    {q8}, [ip]            @ d16 = even (low) bytes of the 16-bit values
-+        sub       ip, sp, #8*5          @ where the zero array will live after vpush
-+        vmov.i64  q11, #0
-+        add       r4, ip, r4            @ &array[sao_left_class1]
-+        ldr       ip, [sp, #24]         @ &sao_offset_val2[0]
-+        vpush     {d20-d24}             @ Put zero array on stack
-+        vld2.8    {q9}, [ip]            @ d18 = even (low) bytes of the 16-bit values
-+        vshr.u64  d16, d16, #8          @ 1st interesting val is [1]
-+        ldr       ip, [sp, #8*5 + 28]   @ sao_left_class2
-+        vst1.32   {d16[0]}, [r4]        @ poke the 4 offsets for table 1
-+        add       ip, sp, ip            @ &array[sao_left_class2]
-+        vshr.u64  d18, d18, #8          @ 1st interesting val is [1]
-+        vldmia    sp, {d0-d3}           @ Load modified array
-+        vldr      d16, [sp, #8*4]       @ 5th dword: bytes written past entry 31
-+        add       r4, r1, r3
-+        vstmia    sp, {d20-d24}         @ Put zero array on stack (again)
-+        vst1.32   {d18[0]}, [ip]        @ poke the 4 offsets for table 2
-+        vorr      d0, d0, d16           @ fold table 1 overflow back to the start
-+        vldmia    sp, {d4-d7}           @ Load modified array
-+        vldr      d18, [sp, #8*4]       @ 5th dword: bytes written past entry 31
-+        ldr       ip, [sp, #8*5 + 36]   @ height
-+        add       sp, sp, #8*5          @ release the temporary array
-+        vorr      d4, d4, d18           @ fold table 2 overflow back to the start
-+        sub       ip, ip, #1
-+        bx        lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_sao_band_64_neon_8 (
-+@   uint8_t *_dst,              [r0]
-+@   uint8_t *_src,              [r1]
-+@   ptrdiff_t stride_dst,       [r2]
-+@   ptrdiff_t stride_src,       [r3]
-+@   int16_t *sao_offset_val,    [sp, #0]
-+@   int sao_left_class,         [sp, #4]
-+@   int width,                  [sp, #8]
-+@   int height)                 [sp, #12]
-+
-+function ff_hevc_rpi_sao_band_64_neon_8, export=1
-+        @ 8-bit SAO band filter processing one 64-byte row per loop iteration.
-+        push      {r4-r6, lr}
-+        vmov.u8   q15, #128             @ bias constant passed to the macro as Q_K128
-+        bl        band_load_y           @ {d0-d3} = table, r4 = r1 + r3, ip = height - 1
-+
-+        @ The instructions passed to the macro prefetch via r4, count the row
-+        @ down (the subs sets the flags tested by bpl) and advance r4 and r1
-+1:      vldmia    r1, {q8-q11}
-+        sao_band_64b_8 {d0-d3}, {d0-d3}, q15, \
-+            "pld       [r4]",                 \
-+            "subs      ip, #1",               \
-+            "it ne; addne r4, r3",            \
-+            "add       r1, r3"
-+        vstmia    r0, {q8-q11}
-+        add       r0, r2
-+        bpl       1b
-+
-+        pop       {r4-r6, pc}
-+endfunc
-+
-+@ ff_hevc_rpi_sao_band_32_neon_8 (
-+@   uint8_t *_dst,              [r0]
-+@   uint8_t *_src,              [r1]
-+@   ptrdiff_t stride_dst,       [r2]
-+@   ptrdiff_t stride_src,       [r3]
-+@   int16_t *sao_offset_val,    [sp, #0]
-+@   int sao_left_class,         [sp, #4]
-+@   int width,                  [sp, #8]
-+@   int height)                 [sp, #12]
-+
-+function ff_hevc_rpi_sao_band_32_neon_8, export=1
-+        @ 8-bit SAO band filter processing two 32-byte rows per loop iteration.
-+        push      {r4-r6, lr}
-+        add       r5, r0, r2            @ r5 = second dst row
-+        add       r6, r1, r3            @ r6 = second src row
-+        lsl       r2, #1                @ both strides doubled: each pointer
-+        lsl       r3, #1                @ steps over two rows at a time
-+        vmov.u8   q15, #128             @ bias constant passed to the macro as Q_K128
-+        bl        band_load_y           @ {d0-d3} = table, ip = height - 1
-+
-+1:      vld1.8    { q8, q9 }, [r1, :128], r3
-+        subs      ip, #2                @ flags are tested by the bpl at the loop end
-+        vld1.8    {q10, q11}, [r6, :128], r3
-+
-+        sao_band_64b_8 {d0-d3}, {d0-d3}, q15
-+
-+        vst1.8    { q8, q9 }, [r0, :128], r2
-+        vst1.8    {q10, q11}, [r5, :128], r2
-+        bpl       1b
-+
-+        pop       {r4-r6, pc}
-+endfunc
-+
-+@ ff_hevc_rpi_sao_band_16_neon_8 (
-+@   uint8_t *_dst,              [r0]
-+@   uint8_t *_src,              [r1]
-+@   ptrdiff_t stride_dst,       [r2]
-+@   ptrdiff_t stride_src,       [r3]
-+@   int16_t *sao_offset_val,    [sp, #0]
-+@   int sao_left_class,         [sp, #4]
-+@   int width,                  [sp, #8]
-+@   int height)                 [sp, #12]
-+
-+function ff_hevc_rpi_sao_band_16_neon_8, export=1
-+        @ 8-bit SAO band filter processing four 16-byte rows per loop iteration.
-+        @ r1 / r0 walk rows 0 and 2, r6 / r5 walk rows 1 and 3.
-+        push      {r4-r6, lr}
-+        add       r5, r0, r2            @ r5 = second dst row
-+        add       r6, r1, r3            @ r6 = second src row
-+        lsl       r2, #1                @ both strides doubled: each pointer
-+        lsl       r3, #1                @ steps over two rows at a time
-+        vmov.u8   q15, #128             @ bias constant passed to the macro as Q_K128
-+        bl        band_load_y           @ {d0-d3} = table, ip = height - 1
-+
-+1:      vld1.8    { q8}, [r1, :128], r3
-+        subs      ip, #4                @ flags are tested by the bpl at the loop end
-+        vld1.8    { q9}, [r6, :128], r3
-+        vld1.8    {q10}, [r1, :128], r3
-+        vld1.8    {q11}, [r6, :128], r3
-+
-+        sao_band_64b_8 {d0-d3}, {d0-d3}, q15
-+
-+        vst1.8    { q8}, [r0, :128], r2
-+        vst1.8    { q9}, [r5, :128], r2
-+        vst1.8    {q10}, [r0, :128], r2
-+        vst1.8    {q11}, [r5, :128], r2
-+        bpl       1b
-+
-+        pop       {r4-r6, pc}
-+endfunc
-+
-+@ ff_hevc_rpi_sao_band_8_neon_8 (
-+@   uint8_t *_dst,              [r0]
-+@   uint8_t *_src,              [r1]
-+@   ptrdiff_t stride_dst,       [r2]
-+@   ptrdiff_t stride_src,       [r3]
-+@   int16_t *sao_offset_val,    [sp, #0]
-+@   int sao_left_class,         [sp, #4]
-+@   int width,                  [sp, #8]
-+@   int height)                 [sp, #12]
-+
-+function ff_hevc_rpi_sao_band_8_neon_8, export=1
-+        @ 8-bit SAO band filter for narrow blocks.  Width >= 8 handles two
-+        @ 8-byte rows per iteration; width < 8 handles four 4-byte rows.
-+        ldr       ip, [sp, #8]          @ width
-+        push      {r4-r6, lr}
-+        vmov.u8   q15, #128             @ bias constant passed to the macro as Q_K128
-+        cmp       ip, #8                @ flags survive until the blt below:
-+        bl        band_load_y           @ band_load_y and the adds / shifts set no flags
-+        add       r5, r0, r2            @ r5 = second dst row
-+        add       r6, r1, r3            @ r6 = second src row
-+        lsl       r2, #1                @ both strides doubled: each pointer
-+        lsl       r3, #1                @ steps over two rows at a time
-+        blt       4f
-+
-+        @ width >= 8
-+        sao_band_16b_8 {d0-d3}, {d0-d3}, q15, \
-+            "vld1.8    {d16}, [r1, :64], r3", \
-+            "subs      ip, #2",               \
-+            "vld1.8    {d17}, [r6, :64], r3", \
-+            "",                               \
-+            "",                               \
-+            "vst1.8 {d26}, [r0, :64], r2",    \
-+            "vst1.8 {d27}, [r5, :64], r2"
-+        pop       {r4-r6, pc}
-+4:
-+        @ width < 8
-+        sao_band_16b_8 {d0-d3}, {d0-d3}, q15,    \
-+            "vld1.32   {d16[0]}, [r1, :32], r3", \
-+            "subs      ip, #4",                  \
-+            "vld1.32   {d16[1]}, [r6, :32], r3", \
-+            "vld1.32   {d17[0]}, [r1, :32], r3", \
-+            "vld1.32   {d17[1]}, [r6, :32], r3", \
-+            "vst1.32   {d26[0]}, [r0, :32], r2", \
-+            "vst1.32   {d26[1]}, [r5, :32], r2", \
-+            "vst1.32   {d27[0]}, [r0, :32], r2", \
-+            "vst1.32   {d27[1]}, [r5, :32], r2"
-+        pop       {r4-r6, pc}
-+endfunc
-+
-+@ ff_hevc_rpi_sao_band_c_32_neon_8(
-+@   uint8_t * dst          [r0]
-+@   uint8_t * src          [r1]
-+@   uint32_t dst_stride    [r2]
-+@   uint32_t src_stride    [r3]
-+@   const int16_t * table1 sp[0]
-+@   uint32_t offset1       sp[4]
-+@   const int16_t * table2 sp[8]
-+@   uint32_t offset2       sp[12]
-+@   int width              sp[16]
-+@   int height             sp[20]
-+
-+function ff_hevc_rpi_sao_band_c_32_neon_8, export=1
-+        @ 8-bit chroma SAO band filter processing one 64-byte row of
-+        @ interleaved samples per iteration.  vld2 splits even and odd bytes
-+        @ so that each component uses its own table; vst2 re-interleaves.
-+        push      {r4-r6, lr}
-+        add       r5, r0, #32           @ r5 / r6 address the second 32 bytes
-+        add       r6, r1, #32           @ of the same row
-+        vmov.u8   q15, #128             @ bias constant passed to the macro as Q_K128
-+        bl        band_load_c           @ {d0-d3}, {d4-d7} = tables, r4 = r1 + r3, ip = height - 1
-+
-+1:      vld2.8    { q8, q9 }, [r1, :128], r3
-+        subs      ip, #1                @ flags are tested by addne and the final bpl
-+        vld2.8    {q10, q11}, [r6, :128], r3
-+
-+        sao_band_64b_8 {d0-d3}, {d4-d7}, q15, \
-+            "pld       [r4]",                 \
-+            "it ne; addne r4, r3"
-+
-+        vst2.8    { q8, q9 }, [r0, :128], r2
-+        vst2.8    {q10, q11}, [r5, :128], r2
-+        bpl       1b
-+
-+        pop     {r4-r6, pc}
-+endfunc
-+
-+@ ff_hevc_rpi_sao_band_c_16_neon_8(
-+@   uint8_t * dst          [r0]
-+@   uint8_t * src          [r1]
-+@   uint32_t dst_stride    [r2]
-+@   uint32_t src_stride    [r3]
-+@   const int16_t * table1 sp[0]
-+@   uint32_t offset1       sp[4]
-+@   const int16_t * table2 sp[8]
-+@   uint32_t offset2       sp[12]
-+@   int width              sp[16]
-+@   int height             sp[20]
-+
-+function ff_hevc_rpi_sao_band_c_16_neon_8, export=1
-+        @ 8-bit chroma SAO band filter processing two 32-byte rows of
-+        @ interleaved samples per iteration.  vld2 splits even and odd bytes
-+        @ so that each component uses its own table; vst2 re-interleaves.
-+        push      {r4-r6, lr}
-+        add       r5, r0, r2            @ r5 = second dst row
-+        add       r6, r1, r3            @ r6 = second src row
-+        lsl       r2, #1                @ both strides doubled: each pointer
-+        lsl       r3, #1                @ steps over two rows at a time
-+        vmov.u8   q15, #128             @ bias constant passed to the macro as Q_K128
-+        bl        band_load_c           @ {d0-d3}, {d4-d7} = tables, ip = height - 1
-+
-+1:      vld2.8    { q8, q9 }, [r1, :128], r3
-+        subs      ip, #2                @ flags are tested by the bpl at the loop end
-+        vld2.8    {q10, q11}, [r6, :128], r3
-+
-+        sao_band_64b_8 {d0-d3}, {d4-d7}, q15
-+
-+        vst2.8    { q8, q9 }, [r0, :128], r2
-+        vst2.8    {q10, q11}, [r5, :128], r2
-+        bpl       1b
-+
-+        pop     {r4-r6, pc}
-+endfunc
-+
-+@ ff_hevc_rpi_sao_band_c_8_neon_8(
-+@   uint8_t * dst          [r0]
-+@   uint8_t * src          [r1]
-+@   uint32_t dst_stride    [r2]
-+@   uint32_t src_stride    [r3]
-+@   const int16_t * table1 sp[0]
-+@   uint32_t offset1       sp[4]
-+@   const int16_t * table2 sp[8]
-+@   uint32_t offset2       sp[12]
-+@   int width              sp[16]
-+@   int height             sp[20]
-+
-+function ff_hevc_rpi_sao_band_c_8_neon_8, export=1
-+        @ 8-bit chroma SAO band filter for narrow blocks.  Width >= 8 handles
-+        @ one 16-byte row per iteration; width < 8 handles two 8-byte rows.
-+        ldr       ip, [sp, #16]         @ width
-+        push      {r4-r6, lr}
-+        vmov.u8   q15, #128             @ bias constant passed to the macro as Q_K128
-+        cmp       ip, #8                @ flags survive until the blt below:
-+        bl        band_load_c           @ band_load_c sets no flags
-+        blt       4f
-+
-+        @ width >= 8: vld2 / vst2 split and rejoin the even and odd bytes
-+        sao_band_16b_8 {d0-d3}, {d4-d7}, q15,      \
-+            "vld2.8    {d16-d17}, [r1, :128], r3", \
-+            "subs      ip, #1",                    \
-+            "",                                    \
-+            "",                                    \
-+            "",                                    \
-+            "vst2.8    {d26-d27}, [r0, :128], r2"
-+        pop       {r4-r6, pc}
-+4:
-+        @ width < 8: two rows are loaded into d16 / d17, then vuzp / vzip
-+        @ split and rejoin the even and odd bytes
-+        add       r5, r0, r2            @ r5 = second dst row
-+        add       r6, r1, r3            @ r6 = second src row
-+        lsl       r2, #1
-+        lsl       r3, #1
-+        sao_band_16b_8 {d0-d3}, {d4-d7}, q15, \
-+            "vld1.8    {d16}, [r1, :64], r3", \
-+            "subs      ip, #2",               \
-+            "vld1.8    {d17}, [r6, :64], r3", \
-+            "vuzp.8    d16, d17",             \
-+            "",                               \
-+            "vzip.8    d26, d27",             \
-+            "vst1.8    {d26}, [r0, :64], r2", \
-+            "vst1.8    {d27}, [r5, :64], r2"
-+        pop       {r4-r6, pc}
-+endfunc
-+
-+
-+@ ff_hevc_rpi_sao_band_64_neon_10 (
-+@   uint8_t *_dst,              [r0]
-+@   uint8_t *_src,              [r1]
-+@   ptrdiff_t stride_dst,       [r2]
-+@   ptrdiff_t stride_src,       [r3]
-+@   int16_t *sao_offset_val,    [sp, #0]
-+@   int sao_left_class,         [sp, #4]
-+@   int width,                  [sp, #8]
-+@   int height)                 [sp, #12]
-+
-+.macro band_64_16 bit_depth
-+        @ Body of the luma SAO band filter for 16-bit sample storage,
-+        @ processing one 128-byte row (q4-q11) per loop iteration.
-+        @   bit_depth - sample depth; sets the upper clip value and the
-+        @               shift used to derive the table index
-+        @ q2 / q3 hold the lower / upper clip bounds.
-+        push      {r4-r6, lr}
-+        vmov.i16  q3, #(1 << \bit_depth) - 1
-+        bl        band_load_y           @ {d0-d3} = table, ip = height - 1
-+        @ q2 must be zeroed only after band_load_y: that helper pops its
-+        @ temporary array into d0-d4 and d4 is the low half of q2, so when
-+        @ sao_left_class > 28 the wrapped table bytes would otherwise end up
-+        @ in the lower clip bound
-+        vmov.i64  q2, #0
-+        vpush     {q4-q7}               @ q4-q7 are used as data registers below
-+
-+1:      vldm      r1, {q4-q11}
-+        sao_band_64b_16 q4,  q5,  q6,  q7, {d0-d3}, {d0-d3}, q2, q3, \bit_depth, \
-+            "subs      ip, #1",                                                  \
-+            "add       r1, r3"
-+        sao_band_64b_16 q8,  q9, q10, q11, {d0-d3}, {d0-d3}, q2, q3, \bit_depth
-+        vstm      r0, {q4-q11}
-+        add       r0, r2
-+        bpl       1b                    @ tests the flags from the subs passed above
-+
-+        vpop      {q4-q7}
-+        pop       {r4-r6, pc}
-+.endm
-+
-+@ 10-bit instantiation of band_64_16
-+function ff_hevc_rpi_sao_band_64_neon_10, export=1
-+        band_64_16 10
-+endfunc
-+
-+@ ff_hevc_rpi_sao_band_32_neon_10 (
-+@   uint8_t *_dst,              [r0]
-+@   uint8_t *_src,              [r1]
-+@   ptrdiff_t stride_dst,       [r2]
-+@   ptrdiff_t stride_src,       [r3]
-+@   int16_t *sao_offset_val,    [sp, #0]
-+@   int sao_left_class,         [sp, #4]
-+@   int width,                  [sp, #8]
-+@   int height)                 [sp, #12]
-+
-+.macro band_32_16 bit_depth
-+        @ Body of the luma SAO band filter for 16-bit sample storage,
-+        @ processing one 64-byte row (q8-q11) per loop iteration.
-+        @   bit_depth - sample depth; sets the upper clip value and the
-+        @               shift used to derive the table index
-+        @ q2 / q3 hold the lower / upper clip bounds.
-+        push      {r4-r6, lr}
-+        vmov.i16  q3, #(1 << \bit_depth) - 1
-+        bl        band_load_y           @ {d0-d3} = table, ip = height - 1
-+        @ q2 must be zeroed only after band_load_y: that helper pops its
-+        @ temporary array into d0-d4 and d4 is the low half of q2, so when
-+        @ sao_left_class > 28 the wrapped table bytes would otherwise end up
-+        @ in the lower clip bound
-+        vmov.i64  q2, #0
-+
-+1:      vldm      r1, {q8-q11}
-+        sao_band_64b_16 q8,  q9,  q10, q11, {d0-d3}, {d0-d3}, q2, q3, \bit_depth, \
-+            "subs      ip, #1",                                                   \
-+            "add       r1, r3"
-+        vstm      r0, {q8-q11}
-+        add       r0, r2
-+        bpl       1b                    @ tests the flags from the subs passed above
-+
-+        pop       {r4-r6, pc}
-+.endm
-+
-+@ 10-bit instantiation of band_32_16
-+function ff_hevc_rpi_sao_band_32_neon_10, export=1
-+        band_32_16 10
-+endfunc
-+
-+@ ff_hevc_rpi_sao_band_16_neon_10 (
-+@   uint8_t *_dst,              [r0]
-+@   uint8_t *_src,              [r1]
-+@   ptrdiff_t stride_dst,       [r2]
-+@   ptrdiff_t stride_src,       [r3]
-+@   int16_t *sao_offset_val,    [sp, #0]
-+@   int sao_left_class,         [sp, #4]
-+@   int width,                  [sp, #8]
-+@   int height)                 [sp, #12]
-+
-+.macro band_16_16 bit_depth
-+        @ Body of the luma SAO band filter for 16-bit sample storage,
-+        @ processing two 32-byte rows per loop iteration.
-+        @   bit_depth - sample depth; sets the upper clip value and the
-+        @               shift used to derive the table index
-+        @ q14 / q15 hold the lower / upper clip bounds.
-+        push      {r4-r6, lr}
-+        add       r5, r0, r2            @ r5 = second dst row
-+        add       r6, r1, r3            @ r6 = second src row
-+        lsl       r2, #1                @ both strides doubled: each pointer
-+        lsl       r3, #1                @ steps over two rows at a time
-+        vmov.i64  q14, #0
-+        vmov.i16  q15, #(1 << \bit_depth) - 1
-+        bl        band_load_y           @ {d0-d3} = table, ip (r12) = height - 1
-+
-+1:      vld1.16   { q8, q9 }, [r1, :128], r3
-+        subs      r12, #2               @ flags are tested by the bpl at the loop end
-+        vld1.16   {q10, q11}, [r6, :128], r3
-+        sao_band_64b_16 q8,  q9,  q10, q11, {d0-d3}, {d0-d3}, q14, q15, \bit_depth
-+        vst1.16   { q8, q9 }, [r0, :128], r2
-+        vst1.16   {q10, q11}, [r5, :128], r2
-+        bpl       1b
-+
-+        pop       {r4-r6, pc}
-+.endm
-+
-+@ 10-bit instantiation of band_16_16
-+function ff_hevc_rpi_sao_band_16_neon_10, export=1
-+        band_16_16 10
-+endfunc
-+
-+@ ff_hevc_rpi_sao_band_8_neon_10 (
-+@   uint8_t *_dst,              [r0]
-+@   uint8_t *_src,              [r1]
-+@   ptrdiff_t stride_dst,       [r2]
-+@   ptrdiff_t stride_src,       [r3]
-+@   int16_t *sao_offset_val,    [sp, #0]
-+@   int sao_left_class,         [sp, #4]
-+@   int width,                  [sp, #8]
-+@   int height)                 [sp, #12]
-+
-+.macro band_8_16 bit_depth
-+        @ Body of the luma SAO band filter for 16-bit sample storage and
-+        @ narrow blocks.  Width >= 8 handles two 16-byte rows per iteration;
-+        @ width < 8 handles four 8-byte rows.
-+        @   bit_depth - sample depth; sets the upper clip value and the
-+        @               shift used to derive the table index
-+        @ q14 / q15 hold the lower / upper clip bounds.
-+        ldr       ip, [sp, #8]          @ width
-+        push      {r4-r6, lr}
-+        vmov.i64  q14, #0
-+        cmp       ip, #8                @ flags survive until the blt below:
-+        vmov.i16  q15, #(1 << \bit_depth) - 1
-+        bl        band_load_y           @ band_load_y and the adds / shifts set no flags
-+        add       r5, r0, r2            @ r5 = second dst row
-+        add       r6, r1, r3            @ r6 = second src row
-+        lsl       r2, #1                @ both strides doubled: each pointer
-+        lsl       r3, #1                @ steps over two rows at a time
-+        blt       4f
-+
-+        @ width >= 8
-+        sao_band_32b_16 q8, q9, {d0-d3}, {d0-d3}, q14, q15, \bit_depth, \
-+            "vld1.16   {q8}, [r1, :128], r3",                           \
-+            "subs      ip, #2",                                         \
-+            "vld1.16   {q9}, [r6, :128], r3",                           \
-+            "",                                                         \
-+            "",                                                         \
-+            "vst1.16   {q10}, [r0, :128], r2",                          \
-+            "vst1.16   {q11}, [r5, :128], r2"
-+        pop       {r4-r6, pc}
-+4:
-+        @ width < 8
-+        sao_band_32b_16 q8, q9, {d0-d3}, {d0-d3}, q14, q15, \bit_depth, \
-+            "vld1.16   {d16}, [r1, :64], r3",                           \
-+            "subs      ip, #4",                                         \
-+            "vld1.16   {d17}, [r6, :64], r3",                           \
-+            "vld1.16   {d18}, [r1, :64], r3",                           \
-+            "vld1.16   {d19}, [r6, :64], r3",                           \
-+            "vst1.16   {d20}, [r0, :64], r2",                           \
-+            "vst1.16   {d21}, [r5, :64], r2",                           \
-+            "vst1.16   {d22}, [r0, :64], r2",                           \
-+            "vst1.16   {d23}, [r5, :64], r2"
-+        pop       {r4-r6, pc}
-+.endm
-+
-+@ 10-bit instantiation of band_8_16
-+function ff_hevc_rpi_sao_band_8_neon_10, export=1
-+        band_8_16 10
-+endfunc
-+
-+
-+@ ff_hevc_rpi_sao_band_c_32_neon_10(
-+@   uint8_t * dst          [r0]
-+@   uint8_t * src          [r1]
-+@   uint32_t dst_stride    [r2]
-+@   uint32_t src_stride    [r3]
-+@   const int16_t * table1 sp[0]
-+@   uint32_t offset1       sp[4]
-+@   const int16_t * table2 sp[8]
-+@   uint32_t offset2       sp[12]
-+@   int width              sp[16]
-+@   int height             sp[20]
-+
-+.macro band_c_32_16 bit_depth
-+        @ Body of the chroma SAO band filter for 16-bit sample storage,
-+        @ processing one 128-byte row of interleaved samples per iteration.
-+        @ Each row is handled as two 64-byte halves; within a half r1 / r0
-+        @ cover the first 32 bytes and r6 / r5 the next 32.
-+        @   bit_depth - sample depth; sets the upper clip value and the
-+        @               shift used to derive the table index
-+        @ q14 / q15 hold the lower / upper clip bounds.
-+        push      {r4-r6, lr}
-+        add       r5, r0, #32
-+        add       r6, r1, #32
-+        sub       r2, #64               @ strides reduced by the 64 bytes that the
-+        sub       r3, #64               @ first post-increment (lr) already covers
-+        vmov.i64  q14, #0
-+        vmov.i16  q15, #(1 << \bit_depth) - 1
-+        @ NOTE(review): band_load_c computes r4 = r1 + r3 from the already
-+        @ reduced r3 and the loop advances r4 by that same r3, so the pld
-+        @ address moves by stride - 64 per row - confirm this is intended
-+        bl        band_load_c           @ {d0-d3}, {d4-d7} = tables, ip = height - 1
-+        mov       lr, #64               @ lr is free: the return address is on the stack
-+        vpush     {q4-q7}               @ q4-q7 are used as data registers below
-+
-+1:      vld2.16   { q4, q5 }, [r1, :128], lr
-+        subs      ip, #1                @ flags are tested by addne and the final bpl
-+        vld2.16   { q6, q7 }, [r6, :128], lr
-+        vld2.16   { q8, q9 }, [r1, :128], r3
-+        vld2.16   {q10, q11}, [r6, :128], r3
-+
-+        sao_band_64b_16 q4,  q5,  q6,  q7, {d0-d3}, {d4-d7}, q14, q15, \bit_depth, \
-+            "pld       [r4]",                                                      \
-+            "it ne; addne r4, r3"
-+        sao_band_64b_16 q8,  q9, q10, q11, {d0-d3}, {d4-d7}, q14, q15, \bit_depth
-+
-+        vst2.16   { q4, q5 }, [r0, :128], lr
-+        vst2.16   { q6, q7 }, [r5, :128], lr
-+        vst2.16   { q8, q9 }, [r0, :128], r2
-+        vst2.16   {q10, q11}, [r5, :128], r2
-+
-+        bpl       1b
-+
-+        vpop      {q4-q7}
-+        pop       {r4-r6, pc}
-+.endm
-+
-+@ 10-bit instantiation of band_c_32_16
-+function ff_hevc_rpi_sao_band_c_32_neon_10, export=1
-+        band_c_32_16 10
-+endfunc
-+
-+
-+@ ff_hevc_rpi_sao_band_c_16_neon_10(
-+@   uint8_t * dst          [r0]
-+@   uint8_t * src          [r1]
-+@   uint32_t dst_stride    [r2]
-+@   uint32_t src_stride    [r3]
-+@   const int16_t * table1 sp[0]
-+@   uint32_t offset1       sp[4]
-+@   const int16_t * table2 sp[8]
-+@   uint32_t offset2       sp[12]
-+@   int width              sp[16]
-+@   int height             sp[20]
-+
-+.macro band_c_16_16 bit_depth
-+        @ Body of the chroma SAO band filter for 16-bit sample storage,
-+        @ processing one 64-byte row of interleaved samples per iteration.
-+        @ r1 / r0 cover the first 32 bytes of the row, r6 / r5 the next 32.
-+        @   bit_depth - sample depth; sets the upper clip value and the
-+        @               shift used to derive the table index
-+        @ q14 / q15 hold the lower / upper clip bounds.
-+        push      {r4-r6, lr}
-+        add       r5, r0, #32
-+        add       r6, r1, #32
-+        vmov.i64  q14, #0
-+        vmov.i16  q15, #(1 << \bit_depth) - 1
-+        bl        band_load_c           @ {d0-d3}, {d4-d7} = tables, ip = height - 1
-+
-+1:      vld2.16   { q8, q9 }, [r1, :128], r3
-+        subs      ip, #1                @ flags are tested by the bpl at the loop end
-+        vld2.16   {q10, q11}, [r6, :128], r3
-+
-+        @ Only q8-q11 carry data here.  q4-q7 are neither loaded nor stored
-+        @ and are callee-saved (d8-d15), so they must not be processed
-+        sao_band_64b_16 q8,  q9, q10, q11, {d0-d3}, {d4-d7}, q14, q15, \bit_depth
-+
-+        vst2.16   { q8, q9 }, [r0, :128], r2
-+        vst2.16   {q10, q11}, [r5, :128], r2
-+
-+        bpl       1b
-+        pop       {r4-r6, pc}
-+.endm
-+
-+@ 10-bit instantiation of band_c_16_16
-+function ff_hevc_rpi_sao_band_c_16_neon_10, export=1
-+        band_c_16_16 10
-+endfunc
-+
-+
-+@ ff_hevc_rpi_sao_band_c_8_neon_10(
-+@   uint8_t * dst          [r0]
-+@   uint8_t * src          [r1]
-+@   uint32_t dst_stride    [r2]
-+@   uint32_t src_stride    [r3]
-+@   const int16_t * table1 sp[0]
-+@   uint32_t offset1       sp[4]
-+@   const int16_t * table2 sp[8]
-+@   uint32_t offset2       sp[12]
-+@   int width              sp[16]
-+@   int height             sp[20]
-+
-+.macro band_c_8_16 bit_depth
-+        @ Body of the chroma SAO band filter for 16-bit sample storage and
-+        @ narrow blocks.  Width >= 8 handles one 32-byte row per iteration;
-+        @ width < 8 handles two 16-byte rows.
-+        @   bit_depth - sample depth; sets the upper clip value and the
-+        @               shift used to derive the table index
-+        @ q14 / q15 hold the lower / upper clip bounds.
-+        ldr       ip, [sp, #16]         @ width
-+        push      {r4-r6, lr}
-+        vmov.i64  q14, #0
-+        cmp       ip, #8                @ flags survive until the blt below:
-+        vmov.i16  q15, #(1 << \bit_depth) - 1
-+        bl        band_load_c           @ band_load_c sets no flags
-+        blt       4f
-+
-+        @ width >= 8
-+        sao_band_32b_16 q8, q9, {d0-d3}, {d4-d7}, q14, q15, \bit_depth, \
-+            "vld2.16   {q8,q9}, [r1, :128], r3",                        \
-+            "subs      ip, #1",                                         \
-+            "",                                                         \
-+            "",                                                         \
-+            "",                                                         \
-+            "vst2.16   {q10,q11}, [r0, :128], r2"
-+        pop       {r4-r6, pc}
-+4:
-+        @ width < 8: two rows share q8 / q9 (low halves row 0, high halves row 1)
-+        add       r5, r0, r2            @ r5 = second dst row
-+        add       r6, r1, r3            @ r6 = second src row
-+        lsl       r2, #1
-+        lsl       r3, #1
-+        sao_band_32b_16 q8, q9, {d0-d3}, {d4-d7}, q14, q15, \bit_depth, \
-+            "vld2.16   {d16,d18}, [r1, :128], r3",                      \
-+            "subs      ip, #2",                                         \
-+            "vld2.16   {d17,d19}, [r6, :128], r3",                      \
-+            "",                                                         \
-+            "",                                                         \
-+            "vst2.16   {d20,d22}, [r0, :128], r2",                      \
-+            "vst2.16   {d21,d23}, [r5, :128], r2"
-+        pop       {r4-r6, pc}
-+.endm
-+
-+@ 10-bit instantiation of band_c_8_16
-+function ff_hevc_rpi_sao_band_c_8_neon_10, export=1
-+        band_c_8_16 10
-+endfunc
-+
-+
-+@ =============================================================================
-+@ SAO EDGE
-+
-+@ r0    destination address
-+@ r2    stride to post-increment r0 with
-+@ [r5]  translate values
-+@
-+@ a <- c <- b
-+@ a in q0 - q3
-+@ c in q4 - q7
-+@ b in q8 - q11
-+@
-+@ q12-15 used as temp
-+@
-+@ Can be used for both Y & C as we unzip/zip the deltas and
-+@ transform "u/v" separately via d26/d27.  For Y d26=d27
-+
-+function edge_64b_body_8
-+        @ Edge-offset core for 64 bytes of 8-bit samples.
-+        @ In:  q0-q3 = a, q4-q7 = c, q8-q11 = b, [r5] = 16 bytes of translate values
-+        @ Out: q0-q3 = c plus the translated value of 2 + sign(c-a) + sign(c-b),
-+        @      added with saturation at the limits of the 8-bit sample range
-+        @ Clobbers q12-q15; q4-q11 are only read.
-+
-+        vcgt.u8 q12,  q4,  q0   @ c > a -> -1 , otherwise 0
-+        vcgt.u8 q13,  q5,  q1
-+        vcgt.u8 q14,  q6,  q2
-+        vcgt.u8 q15,  q7,  q3
-+
-+        vcgt.u8  q0,  q4        @ a > c -> -1 , otherwise 0
-+        vcgt.u8  q1,  q5
-+        vcgt.u8  q2,  q6
-+        vcgt.u8  q3,  q7
-+
-+        vsub.s8  q0,  q12       @ a = sign(c-a)
-+        vsub.s8  q1,  q13
-+        vsub.s8  q2,  q14
-+        vsub.s8  q3,  q15
-+
-+        vcgt.u8  q12, q4,  q8   @ c > b -> -1 , otherwise 0
-+        vcgt.u8  q13, q5,  q9
-+        vcgt.u8  q14, q6,  q10
-+        vcgt.u8  q15, q7,  q11
-+
-+        vsub.s8  q0,  q12
-+        vsub.s8  q1,  q13
-+        vsub.s8  q2,  q14
-+        vsub.s8  q3,  q15
-+
-+        vcgt.u8  q12, q8,  q4   @ c < b -> -1 , otherwise 0
-+        vcgt.u8  q13, q9,  q5
-+        vcgt.u8  q14, q10, q6
-+        vcgt.u8  q15, q11, q7
-+
-+        vadd.s8  q0,  q12       @ a = sign(c-a) + sign(c-b)
-+        vadd.s8  q1,  q13
-+        vmov.u8  q12, #2
-+        vadd.s8  q2,  q14
-+        vadd.s8  q3,  q15
-+
-+        vadd.s8  q0,  q12
-+        vadd.s8  q1,  q12
-+
-+        vld1.8   {d26, d27}, [r5]       @ d26 / d27 = translate tables for even / odd bytes
-+
-+        vadd.s8  q2,  q12
-+        vuzp.8   q0,  q1                @ split even and odd bytes so each uses its own table
-+        vmov.u8  q15, #128
-+        vadd.s8  q3,  q12       @ a = 2 + sign(c-a) + sign(c-b)
-+
-+        vtbl.8   d0,  {d26}, d0
-+        vadd.s8  q12, q4, q15   @ Add -128 so we can use saturating signed add
-+
-+        vtbl.8   d1,  {d26}, d1
-+        vadd.s8  q14, q5, q15
-+
-+        vtbl.8   d2,  {d27}, d2
-+        vuzp.8   q2,  q3
-+
-+        vtbl.8   d3,  {d27}, d3
-+
-+        vtbl.8   d4,  {d26}, d4
-+        vzip.8   q0,  q1                @ back to the original byte order
-+
-+        vtbl.8   d5,  {d26}, d5
-+        vqadd.s8 q0,  q12
-+        vqadd.s8 q1,  q14
-+        vadd.s8  q12, q6, q15   @ Add -128 so we can use saturating signed add
-+
-+        vtbl.8   d6,  {d27}, d6
-+        vtbl.8   d7,  {d27}, d7
-+        vadd.s8  q14, q7, q15   @ Add -128 so we can use saturating signed add
-+        vzip.8   q2,  q3
-+
-+        vsub.s8  q0,  q15               @ remove the bias again
-+        vqadd.s8 q2,  q12
-+        vqadd.s8 q3,  q14
-+        vsub.s8  q1,  q15
-+        vsub.s8  q2,  q15
-+        vsub.s8  q3,  q15
-+
-+        bx      lr
-+endfunc
-+
-+@ r0    destination address
-+@ r2    stride to post-increment r0 with
-+@ r4    upper clip value
-+@ [r5]  translate values
-+@
-+@ a <- c <- b
-+@ a in q0 - q3
-+@ c in q4 - q7
-+@ b in q8 - q11
-+@
-+@ q12-15 used as temp
-+@
-+@ Can be used for both Y & C as we unzip/zip the deltas and
-+@ transform "u/v" separately via d26/d27.  For Y d26=d27
-+
-+function edge_64b_body_16
-+        @ Edge-offset core for 64 bytes of 16-bit samples.
-+        @ In:  q0-q3 = a, q4-q7 = c, q8-q11 = b, r4 = upper clip value,
-+        @      [r5] = 16 bytes of translate values
-+        @ Out: q0-q3 = c plus the translated value of 2 + sign(c-a) + sign(c-b),
-+        @      clipped to the range 0 to r4
-+        @ Clobbers q12-q15; q4-q11 are only read.
-+
-+        vcgt.u16 q12, q4, q0  // c > a -> -1 , otherwise 0
-+        vcgt.u16 q13, q5, q1
-+        vcgt.u16 q14, q6, q2
-+        vcgt.u16 q15, q7, q3
-+
-+        vcgt.u16 q0, q0, q4  // a > c -> -1 , otherwise 0
-+        vcgt.u16 q1, q1, q5
-+        vcgt.u16 q2, q2, q6
-+        vcgt.u16 q3, q3, q7
-+
-+        vsub.s16 q0, q0, q12 // a = sign(c-a)
-+        vsub.s16 q1, q1, q13
-+        vsub.s16 q2, q2, q14
-+        vsub.s16 q3, q3, q15
-+
-+        vcgt.u16 q12, q4, q8  // c > b -> -1 , otherwise 0
-+        vcgt.u16 q13, q5, q9
-+        vcgt.u16 q14, q6, q10
-+        vcgt.u16 q15, q7, q11
-+
-+        vsub.s16 q0, q0, q12
-+        vsub.s16 q1, q1, q13
-+        vsub.s16 q2, q2, q14
-+        vsub.s16 q3, q3, q15
-+
-+        vcgt.u16 q12, q8, q4  // c < b -> -1 , otherwise 0
-+        vcgt.u16 q13, q9, q5
-+        vcgt.u16 q14, q10, q6
-+        vcgt.u16 q15, q11, q7
-+
-+        vadd.s16 q0, q0, q12  // a = sign(c-a) + sign(c-b)
-+        vadd.s16 q1, q1, q13
-+        vadd.s16 q2, q2, q14
-+        vadd.s16 q3, q3, q15
-+
-+        vmov.u8  q12, #2
-+
-+        @ Narrow the four registers of 16-bit sums into bytes in q0, q1
-+        vmovn.s16 d0, q0
-+        vmovn.s16 d1, q1
-+        vmovn.s16 d2, q2
-+        vmovn.s16 d3, q3
-+
-+        vldr     d26, [r5]              @ translate table for even bytes
-+
-+        vuzp.8   q0, q1                 @ split even and odd bytes so each uses its own table
-+
-+        vldr     d27, [r5, #8]          @ translate table for odd bytes
-+
-+        vadd.s8  q0, q0, q12            @ 2 + sign(c-a) + sign(c-b)
-+        vadd.s8  q1, q1, q12
-+
-+        vmov.i64 q12, #0                @ lower clip bound
-+
-+        vtbl.8   d0, {d26}, d0
-+        vtbl.8   d1, {d26}, d1
-+        vtbl.8   d2, {d27}, d2
-+        vtbl.8   d3, {d27}, d3
-+
-+        vdup.i16 q13, r4                @ upper clip bound
-+
-+        vzip.8   q0, q1                 @ back to the original order
-+
-+        @ Avoid overwrite whilst widening
-+        vaddw.s8 q2, q6, d2
-+        vaddw.s8 q3, q7, d3
-+        vaddw.s8 q1, q5, d1
-+        vaddw.s8 q0, q4, d0
-+
-+        @ now clip
-+        clip16_4 q2, q3, q1, q0, q12, q13
-+
-+        bx       lr
-+endfunc
-+
-+
-+@ a <- c <- b
-+@ a in q0
-+@ c in q1
-+@ b in q2
-+@ Temp q3, q9, q10
-+@
-+@ d16, d17 (q8) xlat U, V
-+@ q14.u8 #2
-+@ q15.u8 #128
-+
-+function edge_16b_body_8
-+        @ Edge-offset core for 16 bytes of 8-bit samples.
-+        @ Out: q0 = c plus the translated value of 2 + sign(c-a) + sign(c-b),
-+        @      added with saturation at the limits of the 8-bit sample range
-+        @ q1 and q2 are only read.
-+        vcgt.u8  q9,  q0,  q1   @ a > c -> -1 , otherwise 0
-+        vadd.u8  q9,  q14, q9           @ running sum starts from the constant 2
-+        vcgt.u8  q0,  q1,  q0   @ c > a -> -1 , otherwise 0
-+        vsub.u8  q9,  q9,  q0
-+        vcgt.u8  q0,  q2,  q1   @ c < b -> -1 , otherwise 0
-+        vadd.u8  q9,  q9,  q0
-+        vcgt.u8  q0,  q1,  q2   @ c > b -> -1 , otherwise 0
-+        vsub.u8  q0,  q9,  q0           @ q0 = 2 + sign(c-a) + sign(c-b)
-+
-+        vadd.s8  q3,  q1, q15   @ Add -128 so we can use saturating signed add
-+
-+        vuzp.8   d0,  d1                @ split even and odd bytes so each uses its own table
-+
-+        vtbl.8   d0,  {d16}, d0
-+        vtbl.8   d1,  {d17}, d1
-+
-+        vzip.8   d0,  d1                @ back to the original byte order
-+        vqadd.s8 q0,  q3
-+        vsub.s8  q0,  q15               @ remove the bias again
-+
-+        bx      lr
-+endfunc
-+
-+@ a <- c <- b
-+@ a in q0
-+@ c in q1
-+@ b in q2
-+@ Temp q9
-+@
-+@ q12, #0
-+@ d16, d17 xlat U, V
-+@ q14.u16 #2
-+@ q15.u16 max
-+function edge_16b_body_16
-+        @ Edge-offset core for 16 bytes of 16-bit samples.
-+        @ Out: q0 = c plus the translated value of 2 + sign(c-a) + sign(c-b),
-+        @      clipped to the range q12 to q15
-+        @ q9 is used as a temporary; q1 and q2 are only read.
-+        vcgt.u16 q9, q0, q1     @ a > c -> -1 , otherwise 0
-+        vadd.u16 q9, q14, q9            @ running sum starts from the constant in q14
-+        vcgt.u16 q0, q1, q0     @ c > a -> -1 , otherwise 0
-+        vsub.u16 q9, q9, q0
-+        vcgt.u16 q0, q2, q1     @ c < b -> -1 , otherwise 0
-+        vadd.u16 q9, q9, q0
-+        vcgt.u16 q0, q1, q2     @ c > b -> -1 , otherwise 0
-+        vsub.u16 q0, q9, q0             @ q0 = 2 + sign(c-a) + sign(c-b)
-+
-+        vmovn.s16 d0, q0                @ narrow the 8 sums to bytes in d0
-+        @ d1 will have random contents that we transform but
-+        @ that doesn't matter as we then discard them
-+        vuzp.8   d0, d1
-+
-+        vtbl.8   d0, {d16}, d0
-+        vtbl.8   d1, {d17}, d1
-+
-+        vzip.8   d0, d1
-+
-+        vaddw.s8 q0, q1, d0             @ widen the signed byte offset and add to c
-+
-+        @ now clip
-+        vmax.s16 q0, q12
-+        vmin.s16 q0, q15
-+        bx       lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_sao_edge_[c_]xx_neon(
-+@   uint8_t *_dst,                    [r0]
-+@   const uint8_t *_src,              [r1]
-+@   ptrdiff_t stride_dst,             [r2]
-+@   const int16_t *_sao_offset_val_u, [r3]
-+@   const int16_t *_sao_offset_val_v, [sp, #0]   // Chroma only
-+@   int eo,                           [sp, #sp_base + 0]
-+@   int width,                        [sp, #sp_base + 4]
-+@   int height)                       [sp, #sp_base + 8]
-+
-+@ Jumps via jump_tab with
-+@   uint8_t *_dst,                    [r0]
-+@   const uint8_t *_src,              [r1]
-+@   ptrdiff_t stride_dst,             [r2]
-+@   EDGE_SRC_STRIDE                   [r3]
-+@   (1 << \bit_depth) - 1             [r4]
-+@   * xlat_table                      [r5]  // setup_64b only
-+@   int height                        [r12]
-+@
-+@   0                                 [q12] // > 8 bit
-+@   2                                 [q14]
-+@   128                               [q15] // = 8 bit
-+@   r4                                [q15] // > 8 bit
-+
-+@ Shared entry/exit code for every SAO edge function in this file.
-+@ It builds the offset translate table in d16/d17, loads eo/width/height
-+@ from the stack, picks the handler from jump_tab[eo], calls it (twice if
-+@ do2) and then restores registers and returns to the caller.
-+@
-+@ Macro arguments:
-+@   bit_depth  sample bit depth; values > 8 load (1 << bit_depth) - 1 to r4
-+@   is_chroma  non-zero: a 2nd offset table pointer is read from [sp, #0]
-+@   jump_tab   label of the jent table that is indexed by eo
-+@   setup_64b  non-zero: save q4-q8 and leave r5 pointing at the saved q8
-+@   setup_16b  non-zero: preload the q12/q14/q15 constants listed above
-+@   check_w4   non-zero: width < 8 selects the 2nd group of 4 table entries
-+@   do2        non-zero: call the handler twice, 64 bytes apart in src & dst
-+@   xjump      non-zero: do not emit the local 98: label (jent_pic only)
-+.macro  edge_xxb_init, bit_depth, is_chroma, jump_tab, setup_64b = 0, setup_16b = 0, check_w4 = 0, do2 = 0, xjump = 0
-+
-+@ Build translate registers
-+@ As translate values can only be 0-4 we don't care about junk in the rest
-+@ of the register
-+.if \is_chroma
-+        @ ip = 2nd offset table pointer, read before the push moves sp
-+        ldr      ip, [sp, #0]
-+        push     {r4-r6, lr}    @ 16 bytes
-+        @ One byte is taken from each 16-bit table entry; entries 0,1,2,3,4
-+        @ go to lanes 2,0,1,3,4 of d16 (table at r3) and d17 (table at ip)
-+        vld1.8   {d16[2]}, [r3]
-+        add      r3, r3, #2
-+        vld1.8   {d17[2]}, [ip]
-+        add      ip, ip, #2
-+        vld1.8   {d16[0]}, [r3]
-+        add      r3, r3, #2
-+        vld1.8   {d17[0]}, [ip]
-+        add      ip, ip, #2
-+        vld1.8   {d16[1]}, [r3]
-+        add      r3, r3, #2
-+        vld1.8   {d17[1]}, [ip]
-+        add      ip, ip, #2
-+        vld1.8   {d16[3]}, [r3]
-+        add      r3, r3, #2
-+        vld1.8   {d17[3]}, [ip]
-+        add      ip, ip, #2
-+        vld1.8   {d16[4]}, [r3]
-+        vld1.8   {d17[4]}, [ip]
-+        @ r3 is free now the table has been read; reuse it for the src stride
-+        movw     r3, EDGE_SRC_STRIDE
-+@ 16 bytes pushed above + 4 bytes for the stacked 2nd table pointer
-+.set sp_base, 20
-+.else
-+        @ Single table: bytes are gathered so that after the vzip below d16
-+        @ holds entries 1,2,0,3,4 in lanes 0-4, the same order as for chroma
-+        add      ip, r3, #4
-+        vld1.8   {d16[1]}, [r3]
-+        add      r3, r3, #2
-+        vld1.8   {d17[0]}, [ip]
-+        add      ip, ip, #2
-+        vld1.8   {d16[0]}, [r3]
-+        add      r3, r3, #6
-+        vld1.8   {d17[1]}, [ip]
-+        vld1.8   {d16[2]}, [r3]
-+        movw     r3, EDGE_SRC_STRIDE
-+        push     {r4-r6, lr}    @ 16 bytes
-+        vzip.8   d16, d17
-+        @ Same table in both halves of q8
-+        vmov     d17, d16
-+@ Only the 16 bytes pushed above sit between sp and the stacked arguments
-+.set sp_base, 16
-+.endif
-+
-+@ If setup_64b we need the xlat table on the stack
-+.if \setup_64b
-+        @ sp - 16 is where the vpush {q4-q8} below places q8 (= d16/d17)
-+        sub      r5, sp, #16
-+.endif
-+
-+@ Get jump address
-+@ We have a special case for width 4 as the calling code doesn't detect it
-+@ If we may have w4 then we add a 2nd jump table after the 1st
-+.if \check_w4
-+        ldr      r12, [sp, #sp_base + 4]        @ width
-+        adr      r6, \jump_tab
-+        ldr      lr, [sp, #sp_base + 0]        @ eo
-+        cmp      r12, #8
-+        it lt
-+        @ Skip the first 4 word-sized entries when width < 8
-+        addlt    r6, #16
-+.else
-+        ldr      lr, [sp, #sp_base + 0]        @ eo
-+        adr      r6, \jump_tab
-+.endif
-+
-+        ldr      r12, [sp, #sp_base + 8]        @ height
-+
-+.if \bit_depth > 8
-+        movw     r4, (1 << \bit_depth) - 1
-+.endif
-+.if \setup_16b
-+.if \bit_depth > 8
-+        vmov.i64 q12, #0
-+        vdup.16  q15, r4
-+        vmov.u16 q14, #2
-+.else
-+        vmov.u8  q15, #128
-+        vmov.u8  q14, #2
-+.endif
-+.endif
-+
-+@ If setup_64b we need q4-q7 saved.
-+.if \setup_64b
-+        vpush    {q4-q8}        @ 80 bytes, q8 pushed first
-+.set sp_base, sp_base + 80
-+.endif
-+
-+        @ r6 = table entry number eo (lr holds eo, entries are 4 bytes each)
-+        ldr      r6, [r6, lr, lsl #2]
-+
-+@ For 16 bit width 64 (or chroma 32) we need to do this in 2 passes
-+.if \do2
-+        @ Keep dst, src, table entry and height for the 2nd pass
-+        push     {r0, r1, r6, r12}
-+.if jent_pic
-+        @ PIC table: r6 is an offset that the code at 98: adds to pc
-+        bl       98f
-+.else
-+        blx      r6
-+.endif
-+        pop      {r0, r1, r6, r12}
-+
-+        @ 2nd pass starts 64 bytes further along in both dst and src
-+        add      r0, #64
-+        add      r1, #64
-+.endif
-+
-+.if jent_pic
-+        bl       98f
-+.else
-+        blx      r6
-+.endif
-+
-+@ Tidy up & return
-+.if \setup_64b
-+        vpop     {q4-q8}        @ spurious but harmless load of q8
-+.endif
-+        pop      {r4-r6, pc}
-+
-+.if jent_pic && !\xjump
-+@ Magic label - used as 98b in jent macro
-+98:
-+        add      pc, r6
-+.endif
-+.endm
-+
-+
-+@ edge_xxb_init with setup_16b=1: the q12/q14/q15 constants are preloaded,
-+@ q4-q8 are not saved and check_w4 is passed straight through.
-+.macro  edge_16b_init, bit_depth, is_chroma, check_w4, jump_tab
-+        edge_xxb_init \bit_depth, \is_chroma, \jump_tab, check_w4=\check_w4, setup_16b=1
-+.endm
-+
-+@ edge_xxb_init with setup_64b=1: q4-q8 are saved on the stack and r5 is
-+@ set to the address of the saved q8; do2 and xjump are passed through.
-+.macro  edge_64b_init, bit_depth, is_chroma, do2, jump_tab, xjump=0
-+        edge_xxb_init \bit_depth, \is_chroma, \jump_tab, do2=\do2, setup_64b=1, xjump=\xjump
-+.endm
-+
-+
-+@ eo 0 handler (both neighbours are on the same row): 64 bytes per row,
-+@ one row per loop.  Registers on entry are those listed in the
-+@ "Jumps via jump_tab with" comment before edge_xxb_init.
-+@   pb      = byte distance to the neighbouring sample
-+@   q0-q3   = a: the row shifted by -pb bytes
-+@   q4-q7   = the unshifted row
-+@   q8-q11  = b: the row shifted by +pb bytes
-+@ body_fn is defined outside this macro; the code here relies on it leaving
-+@ its result in q0-q3 and keeping the flags set by subs.
-+.macro  edge_64b_e0, body_fn, pb
-+        @ Start 8 bytes early so one vldm also fetches the bytes left of the row
-+        sub      r1, #8
-+        @ bl below overwrites lr, so keep the return address in r6
-+        mov      r6, lr
-+1:      vldm     r1, {d7-d16}
-+        // load a
-+        vext.8   q0,  q3,  q4, #(16 - \pb)
-+        add      r1, r3
-+        vext.8   q1,  q4,  q5, #(16 - \pb)
-+        subs     r12, #1
-+        vext.8   q2,  q5,  q6, #(16 - \pb)
-+        vext.8   q3,  q6,  q7, #(16 - \pb)
-+        pld      [r1]
-+        // load b
-+        vext.8   q11, q7,  q8, #\pb     @ Avoid overwrite
-+        pld      [r1, #64]
-+        vext.8   q8,  q4,  q5, #\pb
-+        vext.8   q9,  q5,  q6, #\pb
-+        vext.8   q10, q6,  q7, #\pb
-+        bl       \body_fn
-+        vstm     r0, {q0-q3}
-+        add      r0, r0, r2
-+        @ Loop while the height count from subs above is still positive
-+        bgt      1b
-+        bx       r6
-+.endm
-+
-+@ eo 0 handler: 32 bytes per row, two rows per loop.
-+@ r1/r0 address the first row of each pair, r6/r7 the second; all four
-+@ advance by two strides per iteration.
-+@   q0-q1 / q2-q3   = a for 1st / 2nd row (row shifted by -pb bytes)
-+@   q4-q5 / q6-q7   = unshifted 1st / 2nd row
-+@   q8-q9 / q10-q11 = b for 1st / 2nd row (row shifted by +pb bytes)
-+@ body_fn (defined elsewhere) is relied on to return its result in q0-q3
-+@ and to keep the flags set by subs.
-+.macro  edge_32bx2_e0, body_fn, pb
-+        @ r6 = src of the 2nd row of the pair
-+        add      r6, r1, r3
-+        push     {r7,lr}
-+        sub      r1, #8
-+        @ r7 = dst of the 2nd row; r2 becomes twice the dst stride
-+        add      r7, r0, r2
-+        lsl      r2, #1
-+1:      vldmia   r1, {d7-d12}
-+        // load a
-+        vext.8   q0, q3, q4, #16 - \pb
-+        add      r1, r1, r3, lsl #1
-+        vext.8   q1, q4, q5, #16 - \pb
-+        subs     r12, #2
-+        // load b
-+        vext.8   q8, q4, q5, #\pb
-+        vext.8   q9, q5, q6, #\pb
-+        @ 2nd row: 8 bytes before it, the row itself, 8 bytes after it
-+        vldr     d25, [r6, #-8]
-+        vldmia   r6, {d12-d15}
-+        vldr     d26, [r6, #32]
-+        // load a
-+        vext.8   q2, q12, q6, #16 - \pb
-+        add      r6, r6, r3, lsl #1
-+        vext.8   q3, q6, q7, #16 - \pb
-+        // load b
-+        vext.8   q10, q6, q7, #\pb
-+        vext.8   q11, q7, q13, #\pb
-+        bl       \body_fn
-+        vst1.8   {q0-q1}, [r0, :256], r2
-+        vst1.8   {q2-q3}, [r7, :256], r2
-+        bgt      1b
-+        pop      {r7,pc}
-+.endm
-+
-+@ eo 0 handler: 16 bytes per row, one row per loop.
-+@   q0 = a (row shifted by -pb bytes), q1 = unshifted row,
-+@   q2 = b (row shifted by +pb bytes)
-+@ body_fn (defined elsewhere) is relied on to return its result in q0 and
-+@ to keep the flags set by subs.
-+.macro  edge_16b_e0, body_fn, pb
-+        @ Start 8 bytes early so d1 receives the bytes left of the row
-+        sub      r1, #8
-+        @ bl below overwrites lr, so keep the return address in r6
-+        mov      r6, lr
-+1:      vldmia   r1, {d1-d4}
-+        add      r1, r3
-+        subs     r12, #1
-+        vext.8   q0, q0, q1, #16 - \pb
-+        vext.8   q2, q1, q2, #\pb
-+
-+        bl       \body_fn
-+        vst1.8   {q0}, [r0, :128], r2
-+        bgt      1b
-+        bx       r6
-+.endm
-+
-+@ eo 0 handler: 8 bytes per row, two rows per loop.
-+@ r1/r0 address the first row of each pair, r6/r7 the second; all four
-+@ advance by two strides per iteration.
-+@   d0/d1 = a for 1st/2nd row (row shifted by -pb bytes)
-+@   d2/d3 = unshifted 1st/2nd row
-+@   d4/d5 = b for 1st/2nd row (row shifted by +pb bytes)
-+@ body_fn (defined elsewhere) is relied on to return its result in d0/d1
-+@ and to keep the flags set by subs.
-+.macro  edge_8bx2_e0, body_fn, pb
-+        add      r6, r1, r3
-+        push     {r7,lr}
-+        @ r1 is offset by -8 from here on; r6 is not
-+        sub      r1, #8
-+        add      r7, r0, r2
-+        lsl      r2, #1
-+1:      vldmia   r1, {d1-d2}
-+        vldmia   r6, {d3-d4}
-+        @ d6 = 8 bytes right of the 1st row, d7 = 8 bytes left of the 2nd row
-+        vldr     d6, [r1, #16]
-+        subs     r12, #2
-+        vldr     d7, [r6, #-8]
-+        add      r1, r1, r3, lsl #1
-+        vext.8   d0, d1, d2, #8 - \pb
-+        add      r6, r6, r3, lsl #1
-+        vext.8   d5, d3, d4, #\pb
-+        vext.8   d4, d2, d6, #\pb
-+        vext.8   d1, d7, d3, #8 - \pb
-+
-+        bl       \body_fn
-+        vst1.8   {d0}, [r0, :64], r2
-+        vst1.8   {d1}, [r7, :64], r2
-+        bgt      1b
-+        pop      {r7,pc}
-+.endm
-+
-+@ eo 0 handler: 4 bytes per row, four rows per loop.
-+@ r1/r6 walk alternate source rows and r0/r7 alternate destination rows,
-+@ each stepping by two strides.  Rows are fetched with 64-bit vldr, so
-+@ there are two copies of the loop: 1: when src is 8-byte aligned and 2:
-+@ when bit 2 of the src address is set.  In both, the vsli/vsri #32 at the
-+@ end pack two rows into each d register (one row per 32-bit half) before
-+@ body_fn is called.
-+@ body_fn (defined elsewhere) is relied on to return its result in d0/d1
-+@ and to keep the flags set by subs.
-+.macro  edge_4bx4_e0, body_fn, pb
-+        add      r6, r1, r3
-+        push     {r7,lr}
-+        add      r7, r0, r2
-+        lsl      r2, #1
-+
-+        @ Choose the loop variant from bit 2 of the src address
-+        tst      r1, #4
-+        bne      2f
-+1:      // r1 (and assumed r6) are 64-bit aligned
-+        vldr     d2, [r1]
-+        vldr     d0, [r1, #-8]
-+        add      r1, r1, r3, lsl #1
-+        vldr     d20, [r6]
-+        subs     r12, #4
-+        vldr     d18, [r6, #-8]
-+        add      r6, r6, r3, lsl #1
-+        vldr     d3, [r1]
-+        vshr.u64 d4, d2, #\pb * 8
-+        vldr     d1, [r1, #-8]
-+        add      r1, r1, r3, lsl #1
-+        vldr     d21, [r6]
-+        vext.8   d0, d0, d2, #8 - \pb
-+        vldr     d19, [r6,#-8]
-+        add      r6, r6, r3, lsl #1
-+        vshr.u64 d22, d20, #\pb * 8
-+        vext.8   d18, d18, d20, #8 - \pb
-+        vshr.u64 d5, d3, #\pb * 8
-+        vext.8   d1, d1, d3, #8 - \pb
-+        vshr.u64 d23, d21, #\pb * 8
-+        vext.8   d19, d19, d21, #8 - \pb
-+        @ Low 32 bits keep the r1 row, high 32 bits take the r6 row
-+        vsli.64  q1, q10, #32
-+        vsli.64  q2, q11, #32
-+        vsli.64  q0, q9, #32
-+
-+        bl       \body_fn
-+        vst1.32  {d0[0]}, [r0, :32], r2
-+        vst1.32  {d0[1]}, [r7, :32], r2
-+        vst1.32  {d1[0]}, [r0, :32], r2
-+        vst1.32  {d1[1]}, [r7, :32], r2
-+        bgt      1b
-+        pop      {r7,pc}
-+
-+2:      // r1 (and assumed r6) are 32-bit but not 64-bit aligned
-+        vldr     d20, [r1, #-4]
-+        vldr     d22, [r1, #4]
-+        add      r1, r1, r3, lsl #1
-+        vldr     d2, [r6, #-4]
-+        subs     r12, #4
-+        vldr     d4, [r6, #4]
-+        add      r6, r6, r3, lsl #1
-+        vldr     d21, [r1, #-4]
-+        vshl.i64 d18, d20, #\pb * 8
-+        vldr     d23, [r1, #4]
-+        add      r1, r1, r3, lsl #1
-+        vldr     d3, [r6, #-4]
-+        vext.8   d22, d20, d22, #\pb
-+        vldr     d5, [r6, #4]
-+        add      r6, r6, r3, lsl #1
-+        vshl.i64 d0, d2, #\pb * 8
-+        vext.8   d4, d2, d4, #\pb
-+        vshl.i64 d19, d21, #\pb * 8
-+        vext.8   d23, d21, d23, #\pb
-+        vshl.i64 d1, d3, #\pb * 8
-+        vext.8   d5, d3, d5, #\pb
-+        @ High 32 bits keep the r6 row, low 32 bits take the r1 row
-+        vsri.64  q1, q10, #32
-+        vsri.64  q0, q9, #32
-+        vsri.64  q2, q11, #32
-+
-+        bl       \body_fn
-+        vst1.32  {d0[0]}, [r0, :32], r2
-+        vst1.32  {d0[1]}, [r7, :32], r2
-+        vst1.32  {d1[0]}, [r0, :32], r2
-+        vst1.32  {d1[1]}, [r7, :32], r2
-+        bgt      2b
-+        pop      {r7,pc}
-+.endm
-+
-+
-+@ eo 1 handler (neighbours are the rows above and below): 64 bytes per
-+@ row, one row per loop.
-+@   q0-q3  = a: row above, q4-q7 = c: current row, q8-q11 = b: row below
-+@ Only b is loaded inside the loop; afterwards c becomes the next a and b
-+@ the next c.  r1 and r6 read the first and second 32 bytes of each row.
-+@ body_fn (defined elsewhere) is relied on to return its result in q0-q3
-+@ and to keep the flags set by subs.
-+.macro  edge_64b_e1, body_fn
-+        @ Step back one row so the first load is the row above
-+        sub      r1, r3
-+        push     {lr}
-+        add      r6, r1, #32
-+        // load a
-+        vld1.8   {q0-q1}, [r1, :256], r3
-+        vld1.8   {q2-q3}, [r6, :256], r3
-+        // load c
-+        vld1.8   {q4-q5}, [r1, :256], r3
-+        vld1.8   {q6-q7}, [r6, :256], r3
-+1:      // load b
-+        vld1.8   {q8-q9}, [r1, :256], r3
-+        subs     r12, #1
-+        vld1.8   {q10-q11}, [r6, :256], r3
-+        bl       \body_fn
-+        vstm     r0, {q0-q3}
-+        // copy c to a
-+        vmov.64  q0, q4
-+        pld      [r1, r3]
-+        vmov.64  q1, q5
-+        @ Height exhausted: restore lr and return (interleaved with the moves)
-+        it       le
-+        pople    {lr}
-+        vmov.64  q2, q6
-+        it       le
-+        bxle     lr
-+        vmov.64  q3, q7
-+        add      r0, r0, r2
-+        // copy b to c
-+        vmov.64  q4, q8
-+        vmov.64  q5, q9
-+        vmov.64  q6, q10
-+        vmov.64  q7, q11
-+        b        1b
-+.endm
-+
-+@ eo 1 handler: 32 bytes per row, two rows per loop.
-+@ With rows numbered from the first row being filtered in this iteration:
-+@   q0-q1 = row -1, q2-q3 = row 0             (a for the two output rows)
-+@   q4-q5 = row 0,  q6-q7 = row 1             (c, copied from a/b below)
-+@   q8-q9 = row 1,  q10-q11 = row 2           (b)
-+@ Both output rows are written through r0, which advances by r2 each time.
-+@ body_fn (defined elsewhere) is relied on to return its result in q0-q3
-+@ and to keep the flags set by subs.
-+.macro  edge_32bx2_e1, body_fn
-+        @ r6 is first the address of the row above, then the saved lr
-+        sub      r6, r1, r3
-+        vld1.8   {q2-q3}, [r1, :256], r3
-+        vld1.8   {q0-q1}, [r6, :256]
-+        mov      r6, lr
-+
-+1:      @ Given the data duplication here we could obviously do better than
-+        @ using the generic body_fn but it almost certainly isn't worth it
-+        vld1.8   {q8-q9}, [r1, :256], r3
-+        subs     r12, #2
-+        vmov     q4, q2
-+        vmov     q5, q3
-+        vld1.8   {q10-q11}, [r1, :256], r3
-+        vmov     q6, q8
-+        vmov     q7, q9
-+
-+        bl       \body_fn
-+
-+        vst1.8   {q0-q1}, [r0, :256], r2
-+        // copy b to a
-+        vmov     q0, q8
-+        vmov     q1, q9
-+        vst1.8   {q2-q3}, [r0, :256], r2
-+        vmov     q2, q10
-+        @ Return via the saved lr once the height count is used up
-+        it       le
-+        bxle     r6
-+        vmov     q3, q11
-+        b        1b
-+.endm
-+
-+@ eo 1 handler: 16 bytes per row, one row per loop.
-+@   q0 = a: row above, q1 = c: current row, q2 = b: row below
-+@ After each row c becomes the next a and b the next c.
-+@ body_fn (defined elsewhere) is relied on to return its result in q0.
-+.macro  edge_16b_e1, body_fn
-+        @ r6 is first the address of the row above, then the saved lr
-+        sub      r6, r1, r3
-+        // load c
-+        vld1.8   {q1}, [r1, :128], r3
-+        // load a
-+        vld1.8   {q0}, [r6, :128]
-+        mov      r6, lr
-+1:      // load b
-+        vld1.8   {q2}, [r1, :128], r3
-+        bl       \body_fn
-+        vst1.8   {q0}, [r0, :128], r2
-+        subs     r12, #1
-+        // copy c to a
-+        vmov.64  q0, q1
-+        it       le
-+        bxle     r6
-+        // copy b to c
-+        vmov.64  q1, q2
-+        b        1b
-+.endm
-+
-+@ eo 1 handler: 8 bytes per row, two rows per loop.
-+@ r1 reads even rows and r6 odd rows (r6 starts on the row above); both
-+@ step by two strides.  r0/r7 are the dst of the 1st/2nd row of each pair.
-+@ With rows numbered from the first row filtered in this iteration:
-+@   d0 = row -1, d1 = row 0   (a)
-+@   d2 = row 0,  d3 = row 1   (c)
-+@   d4 = row 1,  d5 = row 2   (b)
-+@ body_fn (defined elsewhere) is relied on to return its result in d0/d1
-+@ and to keep the flags set by subs.
-+.macro  edge_8bx2_e1, body_fn
-+        sub      r6, r1, r3
-+        @ From here r3 is twice the src stride
-+        lsl      r3, #1
-+        push     {r7, lr}
-+        vld1.8   {d1}, [r1, :64], r3
-+        vld1.8   {d0}, [r6, :64], r3
-+        add      r7, r0, r2
-+        lsl      r2, #1
-+1:      @ Given the data duplication here we could obviously do better than
-+        @ using the generic body_fn but it almost certainly isn't worth it
-+        vld1.8   {d4}, [r6, :64], r3
-+        vmov     d2, d1
-+        vld1.8   {d5}, [r1, :64], r3
-+        subs     r12, #2
-+        vmov     d3, d4
-+
-+        bl       \body_fn
-+
-+        vst1.8   {d0}, [r0, :64], r2
-+        vst1.8   {d1}, [r7, :64], r2
-+
-+        // copy b to a
-+        vmov     q0, q2
-+        bgt      1b
-+        pop      {r7, pc}
-+.endm
-+
-+@ eo 1 handler: 4 bytes per row, four rows per loop.
-+@ Each d register holds two rows, one per 32-bit lane.  r6 (starting on
-+@ the row above) and r1 read alternate rows, two strides apart; r0/r7
-+@ write alternate dst rows, also two strides apart.
-+@ With rows numbered from the first row filtered in this iteration:
-+@   d0 = rows -1,0   d1 = rows 1,2   (a)
-+@   d2 = rows  0,1   d3 = rows 2,3   (c, built with vext.32)
-+@   d4 = rows  1,2   d5 = rows 3,4   (b)
-+@ The loop is arranged so the stores for one group of rows (label 1) are
-+@ mixed with the register setup for the next; the last group is stored
-+@ at label 3.
-+@ body_fn (defined elsewhere) is relied on to return its result in d0/d1
-+@ and to keep the flags set by subs.
-+.macro  edge_4bx4_e1, body_fn
-+        sub      r6, r1, r3
-+        @ From here r3 is twice the src stride
-+        lsl      r3, #1
-+        push     {r7, lr}
-+        vld1.32  {d0[1]}, [r1, :32], r3
-+        add      r7, r0, r2
-+        vld1.32  {d0[0]}, [r6, :32], r3
-+        lsl      r2, #1
-+        vld1.32  {d4[1]}, [r1, :32], r3
-+        vld1.32  {d4[0]}, [r6, :32], r3
-+        vld1.32  {d5[1]}, [r1, :32], r3
-+        vld1.32  {d5[0]}, [r6, :32], r3
-+        vmov     d1, d4
-+        vext.32  d2, d0, d4, #1
-+        subs     r12, #4
-+        @ d22 keeps the last two loaded rows for the start of the next group
-+        vmov     d22, d5
-+        vext.32  d3, d4, d5, #1
-+        b        2f
-+
-+1:      vst1.32  {d0[0]}, [r0, :32], r2
-+        vext.32  d2, d22, d4, #1
-+        vst1.32  {d0[1]}, [r7, :32], r2
-+        vmov     d0, d22
-+        vst1.32  {d1[0]}, [r0, :32], r2
-+        vext.32  d3, d4, d5, #1
-+        vst1.32  {d1[1]}, [r7, :32], r2
-+        vmov     d1, d4
-+        vmov     d22, d5
-+2:      @ Given the data duplication here we could probably do better than
-+        @ using the generic body_fn but it almost certainly isn't worth it
-+        bl       \body_fn
-+        @ Flags here are those of the most recent subs r12, #4
-+        ble      3f
-+        vld1.32  {d4[0]}, [r6, :32], r3
-+        subs     r12, #4
-+        vld1.32  {d4[1]}, [r1, :32], r3
-+        vld1.32  {d5[0]}, [r6, :32], r3
-+        vld1.32  {d5[1]}, [r1, :32], r3
-+        b        1b
-+
-+3:      vst1.32  {d0[0]}, [r0, :32], r2
-+        vst1.32  {d0[1]}, [r7, :32], r2
-+        vst1.32  {d1[0]}, [r0, :32]
-+        vst1.32  {d1[1]}, [r7, :32]
-+        pop      {r7, pc}
-+.endm
-+
-+@ eo 2 handler: 64 bytes per row, one row per loop.
-+@   q0-q3  = a: row above shifted by -pb bytes
-+@   q4-q7  = c: current row
-+@   q8-q11 = b: row below shifted by +pb bytes
-+@ Only the row below is loaded in full each iteration: the next a is
-+@ rebuilt from the old c and the next c from the old b, with the missing
-+@ edge bytes fetched by the single vldr instructions.
-+@ body_fn (defined elsewhere) is relied on to return its result in q0-q3
-+@ and to keep the flags set by subs.
-+.macro  edge_64b_e2, body_fn, pb
-+        push     {lr}
-+        @ r6 = row above
-+        sub      r6, r1, r3
-+        // load c and a
-+        vld1.8   {q4-q5}, [r1, :128]
-+        vldr     d25, [r6, #-8]
-+        vldmia   r6, {d16-d23}
-+        vext.8   q0, q12, q8, #16 - \pb
-+        add      r6, r1, #32
-+        vext.8   q1, q8, q9, #16 - \pb
-+        add      r1, r1, r3
-+        vext.8   q2, q9, q10, #16 - \pb
-+        vld1.8   {q6-q7}, [r6, :128]
-+        @ r1 is now the row below; r6 goes back to the current row
-+        sub      r6, r1, r3
-+        vext.8   q3, q10, q11, #16 - \pb
-+
-+1:      // load b
-+        vldmia   r1, {d16-d24}
-+        vext.8   q8, q8, q9, #\pb
-+        pld      [r1, r3]
-+        vext.8   q9, q9, q10, #\pb
-+        subs     r12, #1
-+        vext.8   q10, q10, q11, #\pb
-+        vext.8   q11, q11, q12, #\pb
-+        bl       \body_fn
-+        // next a is mostly available in c
-+        vldr     d25, [r6, #-8]
-+        vstmia   r0, {q0-q3}
-+        vext.8   q3, q6, q7, #16 - \pb
-+        @ Height exhausted: restore lr and return (interleaved with the vexts)
-+        it       le
-+        pople    {lr}
-+        vext.8   q2, q5, q6, #16 - \pb
-+        it       le
-+        bxle     lr
-+        vext.8   q1, q4, q5, #16 - \pb
-+        add      r6, r6, r3
-+        vext.8   q0, q12, q4, #16 - \pb
-+        add      r0, r0, r2
-+        // next c is mostly available in b
-+        vldr     d8, [r1]
-+        vext.8   d9, d16, d17, #8 - \pb
-+        vext.8   q5, q8, q9, #16 - \pb
-+        add      r1, r1, r3
-+        vext.8   q6, q9, q10, #16 - \pb
-+        pld      [r6, #-8]
-+        vext.8   q7, q10, q11, #16 - \pb
-+        b        1b
-+.endm
-+
-+@ eo 2 handler: 32 bytes per row, two rows per loop.
-+@ r6 starts on the row above and r1 on the first row; both step by two
-+@ strides.  r0/r7 are the dst of the 1st/2nd row of each pair.
-+@   q0-q1 / q2-q3   = a for 1st / 2nd row (row above it shifted by -pb)
-+@   q4-q5 / q6-q7   = c for 1st / 2nd row
-+@   q8-q9 / q10-q11 = b for 1st / 2nd row (row below it shifted by +pb)
-+@ Data already in registers is reused between the two rows and between
-+@ iterations, as the inline comments describe.
-+@ body_fn (defined elsewhere) is relied on to return its result in q0-q3
-+@ and to keep the flags set by subs.
-+.macro  edge_32bx2_e2, body_fn, pb
-+        sub      r6, r1, r3
-+        push     {r7, lr}
-+        add      r7, r0, r2
-+        lsl      r2, #1
-+        // load a and first 32b of c
-+        vld1.8   {q4-q5}, [r1, :256]
-+        vldr     d25, [r6, #-8]
-+        vld1.8   {q13-q14}, [r6, :256]
-+        vldr     d31, [r1, #-8]
-+        add      r6, r6, r3, lsl #1
-+        vext.8   q0, q12, q13, #16 - \pb
-+        add      r1, r1, r3, lsl #1
-+        vext.8   q1, q13, q14, #16 - \pb
-+        vext.8   q2, q15, q4, #16 - \pb
-+        vext.8   q3, q4, q5, #16 - \pb
-+1:
-+        // load second 32b of c and second 32b of b
-+        vldmia   r6, {d12-d16}
-+        vldmia   r1, {d20-d24}
-+        // first 32b of b is mostly available in second 32b of c
-+        vext.8   q9, q7, q8, #\pb
-+        subs     r12, #2
-+        vext.8   q8, q6, q7, #\pb
-+        vext.8   q10, q10, q11, #\pb
-+        vext.8   q11, q11, q12, #\pb
-+
-+        bl       \body_fn
-+
-+        vst1.8   {q0-q1}, [r0, :256], r2
-+        vst1.8   {q2-q3}, [r7, :256], r2
-+        ble      2f
-+
-+        @ Fetch the edge bytes that cannot be recovered from registers
-+        vldr     d25, [r6, #-8]
-+        add      r6, r6, r3, lsl #1
-+        vldr     d8, [r1]
-+        vext.8   d9, d20, d21, #8 - \pb
-+        vldr     d31, [r1, #-8]
-+        add      r1, r1, r3, lsl #1
-+        // first 32b of a is mostly available in second 32b of c
-+        vext.8   q1, q6, q7, #16 - \pb
-+        vext.8   q0, q12, q6, #16 - \pb
-+        // first 32b of c is mostly available in second 32b of b
-+        vext.8   q5, q10, q11, #16 - \pb
-+        // second 32b of a is mostly available in first 32b of c
-+        vext.8   q2, q15, q4, #16 - \pb
-+        vext.8   q3, q4, q5, #16 - \pb
-+        b        1b
-+
-+2:      pop      {r7, pc}
-+.endm
-+
-+@ eo 2 handler: 16 bytes per row, one row per loop.
-+@   q0 = a: row above shifted by -pb bytes
-+@   q1 = c: current row
-+@   q2 = b: row below shifted by +pb bytes
-+@ After each row the old c supplies most of the next a (via q10) and the
-+@ old b most of the next c.
-+@ body_fn (defined elsewhere) is relied on to return its result in q0 and
-+@ to keep the flags set by subs.
-+.macro  edge_16b_e2, body_fn, pb
-+        push     {lr}
-+        @ r6 = row above
-+        sub      r6, r1, r3
-+        vld1.8   {q1}, [r1, :128], r3
-+        vldr     d19, [r6, #-8]
-+        vld1.8   {q10}, [r6, :128], r3
-+
-+1:      vldmia   r1, {d4-d6}
-+        vext.8   q0, q9, q10, #16 - \pb
-+        subs     r12, #1
-+        vext.8   q2, q2, q3, #\pb
-+        bl       \body_fn
-+        vst1.8   {q0}, [r0, :128], r2
-+        ble      2f
-+        @ Old c becomes the row above; rebuild c from the first 8 bytes at
-+        @ r1 plus the shifted b
-+        vmov     q10, q1
-+        vldr     d2, [r1]
-+        add      r1, r1, r3
-+        vldr     d19, [r6, #-8]
-+        add      r6, r6, r3
-+        vext.8   d3, d4, d5, #8 - \pb
-+        b        1b
-+
-+2:      pop      {pc}
-+.endm
-+
-+@ eo 2 handler: 8 bytes per row, two rows per loop.
-+@ r6 starts on the row above and r1 on the first row; both step by two
-+@ strides.  r0/r7 are the dst of the 1st/2nd row of each pair.
-+@   d0/d1 = a for 1st/2nd row (row above it shifted by -pb bytes)
-+@   d2/d3 = c for 1st/2nd row
-+@   d4/d5 = b for 1st/2nd row (row below it shifted by +pb bytes)
-+@ body_fn (defined elsewhere) is relied on to return its result in d0/d1
-+@ and to keep the flags set by subs.
-+.macro  edge_8bx2_e2, body_fn, pb
-+        sub      r6, r1, r3
-+        push     {r7, lr}
-+        add      r7, r0, r2
-+        lsl      r2, #1
-+        vldr     d18, [r6, #-8]
-+        vldr     d19, [r6]
-+        add      r6, r6, r3, lsl #1
-+        vldr     d20, [r1, #-8]
-+        vldr     d2, [r1]
-+        add      r1, r1, r3, lsl #1
-+        vldmia   r6, {d3-d4}
-+        vld1.8   {d21-d22}, [r1, :128]
-+
-+1:      vext.8   d0, d18, d19, #8 - \pb
-+        vext.8   d4, d3, d4, #\pb
-+        vext.8   d1, d20, d2, #8 - \pb
-+        subs     r12, #2
-+        vext.8   d5, d21, d22, #\pb
-+
-+        bl       \body_fn
-+
-+        vst1.8   {d0}, [r0, :64], r2
-+        vst1.8   {d1}, [r7, :64], r2
-+        ble      2f
-+
-+        @ Reload for the next pair; d3 (old 2nd row) is reused as the new
-+        @ row above
-+        vldr     d18, [r6, #-8]
-+        add      r6, r6, r3, lsl #1
-+        vldr     d20, [r1, #-8]
-+        vmov     d19, d3
-+        vldr     d2, [r1]
-+        add      r1, r1, r3, lsl #1
-+        vldmia   r6, {d3-d4}
-+        vld1.8   {d21-d22}, [r1, :128]
-+        b        1b
-+
-+2:      pop      {r7, pc}
-+.endm
-+
-+@ eo 2 handler: 4 bytes per row, four rows per loop.
-+@ Three pointers walk the source one stride at a time:
-+@   r6 = row above, offset by -pb bytes  -> a (d0, d1)
-+@   r1 = current row                     -> c (d2, d3)
-+@   r8 = row below, offset by +pb bytes  -> b (d4, d5)
-+@ Each row is loaded as one 32-bit lane with no alignment hint.
-+@ r0/r7 write alternate dst rows, two strides apart.
-+@ edge_4bx4_e3 reuses this macro with pb negated.
-+@ body_fn (defined elsewhere) is relied on to return its result in d0/d1
-+@ and to keep the flags set by subs.
-+.macro  edge_4bx4_e2, body_fn, pb
-+        sub      r6, r1, r3
-+        @ r9 is saved but not used below - presumably to keep the push an
-+        @ even number of registers (TODO confirm)
-+        push     {r7-r9, lr}
-+        add      r8, r1, r3
-+        sub      r6, r6, #\pb
-+        add      r8, r8, #\pb
-+        add      r7, r0, r2
-+        lsl      r2, #1
-+
-+1:      vld1.32  {d0[0]}, [r6], r3
-+        subs     r12, #4
-+        vld1.32  {d2[0]}, [r1], r3
-+        vld1.32  {d4[0]}, [r8], r3
-+        vld1.32  {d0[1]}, [r6], r3
-+        vld1.32  {d2[1]}, [r1], r3
-+        vld1.32  {d4[1]}, [r8], r3
-+        vld1.32  {d1[0]}, [r6], r3
-+        vld1.32  {d3[0]}, [r1], r3
-+        vld1.32  {d5[0]}, [r8], r3
-+        vld1.32  {d1[1]}, [r6], r3
-+        vld1.32  {d3[1]}, [r1], r3
-+        vld1.32  {d5[1]}, [r8], r3
-+
-+        bl       \body_fn
-+
-+        vst1.32  {d0[0]}, [r0, :32], r2
-+        vst1.32  {d0[1]}, [r7, :32], r2
-+        vst1.32  {d1[0]}, [r0, :32], r2
-+        vst1.32  {d1[1]}, [r7, :32], r2
-+        bgt      1b
-+
-+        pop      {r7-r9,pc}
-+.endm
-+
-+@ eo 3 handler: 64 bytes per row, one row per loop.
-+@   q0-q3  = a: row above shifted by +pb bytes
-+@   q4-q7  = c: current row
-+@   q8-q11 = b: row below shifted by -pb bytes
-+@ Same scheme as edge_64b_e2 with the horizontal shifts swapped: the next
-+@ a is rebuilt from the old c and the next c from the old b.
-+@ body_fn (defined elsewhere) is relied on to return its result in q0-q3
-+@ and to keep the flags set by subs.
-+.macro  edge_64b_e3, body_fn, pb
-+        push     {lr}
-+        @ r6 = row above
-+        sub      r6, r1, r3
-+        // load c and a
-+        vld1.8   {q4-q5}, [r1, :128]
-+        vldmia   r6, {d16-d24}
-+        vext.8   q0, q8, q9, #\pb
-+        add      r6, r1, #32
-+        vext.8   q1, q9, q10, #\pb
-+        add      r1, r1, r3
-+        vext.8   q2, q10, q11, #\pb
-+        vld1.8   {q6-q7}, [r6, :128]
-+        @ r1 is now the row below; r6 goes back to the current row
-+        sub      r6, r1, r3
-+        vext.8   q3, q11, q12, #\pb
-+
-+1:      // load b
-+        vldr     d17, [r1, #-8]
-+        vldmia   r1, {d18-d25}
-+        vext.8   q8, q8, q9, #16 - \pb
-+        pld      [r1, r3]
-+        vext.8   q9, q9, q10, #16 - \pb
-+        subs     r12, #1
-+        vext.8   q10, q10, q11, #16 - \pb
-+        vext.8   q11, q11, q12, #16 - \pb
-+        bl       \body_fn
-+        // next a is mostly available in c
-+        vldr     d24, [r6, #64]
-+        vstmia   r0, {q0-q3}
-+        vext.8   q0, q4, q5, #\pb
-+        @ Height exhausted: restore lr and return (interleaved with the vexts)
-+        it       le
-+        pople    {lr}
-+        vext.8   q1, q5, q6, #\pb
-+        it       le
-+        bxle     lr
-+        vext.8   q2, q6, q7, #\pb
-+        add      r6, r6, r3
-+        vext.8   q3, q7, q12, #\pb
-+        add      r0, r0, r2
-+        // next c is mostly available in b
-+        vext.8   d14, d22, d23, #\pb
-+        vldr     d15, [r1, #56]
-+        vext.8   q4, q8, q9, #\pb
-+        add      r1, r1, r3
-+        vext.8   q5, q9, q10, #\pb
-+        vext.8   q6, q10, q11, #\pb
-+        b        1b
-+.endm
-+
-+@ eo 3 handler: 32 bytes per row, two rows per loop.
-+@ r6 starts on the row above and r1 on the first row; both step by two
-+@ strides.  r0/r7 are the dst of the 1st/2nd row of each pair.
-+@   q0-q1 / q2-q3   = a for 1st / 2nd row (row above it shifted by +pb)
-+@   q4-q5 / q6-q7   = c for 1st / 2nd row
-+@   q8-q9 / q10-q11 = b for 1st / 2nd row (row below it shifted by -pb)
-+@ Same scheme as edge_32bx2_e2 with the horizontal shifts swapped.
-+@ body_fn (defined elsewhere) is relied on to return its result in q0-q3
-+@ and to keep the flags set by subs.
-+.macro  edge_32bx2_e3, body_fn, pb
-+        sub      r6, r1, r3
-+        push     {r7, lr}
-+        add      r7, r0, r2
-+        lsl      r2, #1
-+        // load a and first 32b of c
-+        vldmia   r1, {d8-d12}
-+        vldmia   r6, {d24-d28}
-+        vext.8   q2, q4, q5, #\pb
-+        add      r6, r6, r3, lsl #1
-+        vext.8   q3, q5, q6, #\pb
-+        add      r1, r1, r3, lsl #1
-+        vext.8   q0, q12, q13, #\pb
-+        vext.8   q1, q13, q14, #\pb
-+1:
-+        // load second 32b of c and second 32b of b
-+        vldr     d25, [r6, #-8]
-+        subs     r12, #2
-+        vldmia   r6, {d12-d15}
-+        vldr     d27, [r1, #-8]
-+        vldmia   r1, {d20-d23}
-+        // first 32b of b is mostly available in second 32b of c
-+        vext.8   q8, q12, q6, #16 - \pb
-+        vext.8   q9, q6, q7, #16 - \pb
-+        vext.8   q11, q10, q11, #16 - \pb
-+        vext.8   q10, q13, q10, #16 - \pb
-+
-+        bl       \body_fn
-+
-+        vst1.8   {q0-q1}, [r0, :256], r2
-+        vst1.8   {q2-q3}, [r7, :256], r2
-+        ble      2f
-+
-+        @ Fetch the edge bytes that cannot be recovered from registers
-+        vldr     d24, [r6, #32]
-+        add      r6, r6, r3, lsl #1
-+        vldr     d11, [r1, #24]
-+        vext.8   d10, d22, d23, #\pb
-+        vldr     d30, [r1, #32]
-+        add      r1, r1, r3, lsl #1
-+        // first 32b of a is mostly available in second 32b of c
-+        vext.8   q0, q6, q7, #\pb
-+        vext.8   q1, q7, q12, #\pb
-+        // first 32b of c is mostly available in second 32b of b
-+        vext.8   q4, q10, q11, #\pb
-+        // second 32b of a is mostly available in first 32b of c
-+        vext.8   q3, q5, q15, #\pb
-+        vext.8   q2, q4, q5, #\pb
-+        b        1b
-+
-+2:      pop      {r7, pc}
-+.endm
-+
-+@ eo 3 handler: 16 bytes per row, one row per loop.
-+@   q0 = a: row above shifted by +pb bytes
-+@   q1 = c: current row
-+@   q2 = b: row below shifted by -pb bytes
-+@ After each row the old c supplies most of the next a (via q9) and the
-+@ old b most of the next c.
-+@ body_fn (defined elsewhere) is relied on to return its result in q0 and
-+@ to keep the flags set by subs.
-+.macro  edge_16b_e3, body_fn, pb
-+        push     {lr}
-+        @ r6 = row above
-+        sub      r6, r1, r3
-+        vld1.8   {q1}, [r1, :128], r3
-+        vldmia   r6, {d18-d20}
-+        add      r6, r6, r3
-+
-+1:      vldr     d5, [r1, #-8]
-+        vld1.8   {q3}, [r1, :128]
-+        subs     r12, #1
-+        vext.8   q0, q9, q10, #\pb
-+        vext.8   q2, q2, q3, #16 - \pb
-+        bl       \body_fn
-+        vst1.8   {q0}, [r0, :128], r2
-+        ble      2f
-+        @ Old c becomes the row above; rebuild c from the shifted b plus
-+        @ the last 8 bytes of the row at r1
-+        vmov     q9, q1
-+        vldr     d3, [r1, #8]
-+        add      r1, r1, r3
-+        vldr     d20, [r6, #16]
-+        add      r6, r6, r3
-+        vext.8   d2, d4, d5, #\pb
-+        b        1b
-+
-+2:      pop      {pc}
-+.endm
-+
-+@ eo 3 handler: 8 bytes per row, two rows per loop.
-+@ r6 starts on the row above and r1 on the first row; both step by two
-+@ strides.  r0/r7 are the dst of the 1st/2nd row of each pair.
-+@   d0/d1 = a for 1st/2nd row (row above it shifted by +pb bytes)
-+@   d2/d3 = c for 1st/2nd row
-+@   d4/d5 = b for 1st/2nd row (row below it shifted by -pb bytes)
-+@ body_fn (defined elsewhere) is relied on to return its result in d0/d1
-+@ and to keep the flags set by subs.
-+.macro  edge_8bx2_e3, body_fn, pb
-+        sub      r6, r1, r3
-+        push     {r7, lr}
-+        add      r7, r0, r2
-+        lsl      r2, #1
-+        vld1.8   {d18-d19}, [r6]
-+        add      r6, r6, r3, lsl #1
-+        vldr     d20, [r1, #8]
-+        vldr     d2, [r1]
-+        add      r1, r1, r3, lsl #1
-+        vldr     d4, [r6, #-8]
-+        vldr     d3, [r6]
-+        vldr     d21, [r1, #-8]
-+        vldr     d22, [r1]
-+
-+1:      vext.8   d0, d18, d19, #\pb
-+        vext.8   d4, d4, d3, #8 - \pb
-+        vext.8   d1, d2, d20, #\pb
-+        subs     r12, #2
-+        vext.8   d5, d21, d22, #8 - \pb
-+
-+        bl       \body_fn
-+
-+        vst1.8   {d0}, [r0, :64], r2
-+        vst1.8   {d1}, [r7, :64], r2
-+        ble      2f
-+
-+        @ Reload for the next pair; d3 (old 2nd row) is reused as the new
-+        @ row above
-+        vldr     d19, [r6, #8]
-+        add      r6, r6, r3, lsl #1
-+        vldr     d20, [r1, #8]
-+        vmov     d18, d3
-+        vldr     d2, [r1]
-+        add      r1, r1, r3, lsl #1
-+        vldr     d4, [r6, #-8]
-+        vldr     d3, [r6]
-+        vldr     d21, [r1, #-8]
-+        vldr     d22, [r1]
-+        b        1b
-+
-+2:      pop      {r7, pc}
-+.endm
-+
-+@ eo 3 handler: 4 bytes per row, four rows per loop.
-+@ Negating pb makes edge_4bx4_e2 read a from the row above at +pb bytes
-+@ and b from the row below at -pb bytes.
-+.macro  edge_4bx4_e3, body_fn, pb
-+        @ e3 is the same as e2 but with the X offset reversed
-+        edge_4bx4_e2 \body_fn, (-\pb)
-+.endm
-+
-+@ Jump table entry - if in neon mode the bottom bit must be set
-+@ ? There is probably a real asm instruction to do this but I haven't found it
-+@
-+@ Emits one .word per call.  With jent_pic set the word is the distance
-+@ from the preceding 98: label (plus 4 on the T line, 8 on the A line) to
-+@ lab, which is the value that the add pc, r6 at 98: needs.  Otherwise it
-+@ is the address of lab, plus 1 on the T line.
-+@ NOTE(review): T and A look like the Thumb-only / ARM-only line prefixes
-+@ that are defined outside this chunk; if so, the wording neon mode above
-+@ was probably meant to be thumb mode - confirm.
-+.macro jent lab
-+.if jent_pic
-+@ Could use .short here but due to A32 not supporting ldrh [lsl#1] it is
-+@ simpler and clearer in the code to stick with .word
-+T       .word  (0 + \lab) - (4 + 98b)
-+A       .word  (0 + \lab) - (8 + 98b)
-+.else
-+T       .word   1 + \lab
-+A       .word   \lab
-+.endif
-+.endm
-+
-+@ Emit a 4-entry jump table (indexed by eo 0-3) followed by the four
-+@ handlers that process one 64-byte row per loop.  The functions below
-+@ expand this directly after their 99: table label.
-+.macro edge_64b_bodies, body_fn, pb
-+        jent    0f
-+        jent    10f
-+        jent    20f
-+        jent    30f
-+
-+0:      edge_64b_e0     \body_fn, \pb
-+10:     edge_64b_e1     \body_fn
-+20:     edge_64b_e2     \body_fn, \pb
-+30:     edge_64b_e3     \body_fn, \pb
-+.endm
-+
-+@ Emit a 4-entry jump table (indexed by eo 0-3) followed by the four
-+@ handlers that process two 32-byte rows per loop.  The functions below
-+@ expand this directly after their 99: table label.
-+.macro edge_32bx2_bodies, body_fn, pb
-+        jent    0f
-+        jent    10f
-+        jent    20f
-+        jent    30f
-+
-+0:      edge_32bx2_e0   \body_fn, \pb
-+10:     edge_32bx2_e1   \body_fn
-+20:     edge_32bx2_e2   \body_fn, \pb
-+30:     edge_32bx2_e3   \body_fn, \pb
-+.endm
-+
-+@ Emit a 4-entry jump table (indexed by eo 0-3) followed by the four
-+@ handlers that process one 16-byte row per loop.  The functions below
-+@ expand this directly after their 99: table label.
-+.macro edge_16b_bodies, body_fn, pb
-+        jent    0f
-+        jent    10f
-+        jent    20f
-+        jent    30f
-+
-+0:      edge_16b_e0     \body_fn, \pb
-+10:     edge_16b_e1     \body_fn
-+20:     edge_16b_e2     \body_fn, \pb
-+30:     edge_16b_e3     \body_fn, \pb
-+.endm
-+
-+@ Emit an 8-entry jump table followed by eight handlers.
-+@ Entries 0-3 (eo 0-3) process two 32-byte rows per loop using body_fn_64b.
-+@ Entries 4-7 (eo 0-3 again) process one 16-byte row per loop using
-+@ body_fn_16b; edge_xxb_init selects them when check_w4 is set and
-+@ width < 8.
-+.macro edge_32bx2_16b_bodies, body_fn_64b, body_fn_16b, pb
-+        jent    0f
-+        jent    10f
-+        jent    20f
-+        jent    30f
-+        jent    5f
-+        jent    15f
-+        jent    25f
-+        jent    35f
-+
-+0:      edge_32bx2_e0   \body_fn_64b, \pb
-+10:     edge_32bx2_e1   \body_fn_64b
-+20:     edge_32bx2_e2   \body_fn_64b, \pb
-+30:     edge_32bx2_e3   \body_fn_64b, \pb
-+5:      edge_16b_e0     \body_fn_16b, \pb
-+15:     edge_16b_e1     \body_fn_16b
-+25:     edge_16b_e2     \body_fn_16b, \pb
-+35:     edge_16b_e3     \body_fn_16b, \pb
-+.endm
-+
-+@ Emit an 8-entry jump table followed by eight handlers.
-+@ Entries 0-3 (eo 0-3) process one 16-byte row per loop.
-+@ Entries 4-7 (eo 0-3 again) process two 8-byte rows per loop;
-+@ edge_xxb_init selects them when check_w4 is set and width < 8.
-+.macro edge_16b_8bx2_bodies, body_fn, pb
-+        jent    0f
-+        jent    10f
-+        jent    20f
-+        jent    30f
-+        jent    5f
-+        jent    15f
-+        jent    25f
-+        jent    35f
-+
-+0:      edge_16b_e0     \body_fn, \pb
-+10:     edge_16b_e1     \body_fn
-+20:     edge_16b_e2     \body_fn, \pb
-+30:     edge_16b_e3     \body_fn, \pb
-+5:      edge_8bx2_e0    \body_fn, \pb
-+15:     edge_8bx2_e1    \body_fn
-+25:     edge_8bx2_e2    \body_fn, \pb
-+35:     edge_8bx2_e3    \body_fn, \pb
-+.endm
-+
-+@ Emit an 8-entry jump table followed by eight handlers.
-+@ Entries 0-3 (eo 0-3) process two 8-byte rows per loop.
-+@ Entries 4-7 (eo 0-3 again) process four 4-byte rows per loop;
-+@ edge_xxb_init selects them when check_w4 is set and width < 8.
-+.macro edge_8bx2_4bx4_bodies, body_fn, pb
-+        jent    0f
-+        jent    10f
-+        jent    20f
-+        jent    30f
-+        jent    5f
-+        jent    15f
-+        jent    25f
-+        jent    35f
-+
-+0:      edge_8bx2_e0    \body_fn, \pb
-+10:     edge_8bx2_e1    \body_fn
-+20:     edge_8bx2_e2    \body_fn, \pb
-+30:     edge_8bx2_e3    \body_fn, \pb
-+5:      edge_4bx4_e0    \body_fn, \pb
-+15:     edge_4bx4_e1    \body_fn
-+25:     edge_4bx4_e2    \body_fn, \pb
-+35:     edge_4bx4_e3    \body_fn, \pb
-+.endm
-+
-+@ void ff_hevc_rpi_sao_edge_8_neon_8(
-+@   uint8_t *_dst,            [r0]
-+@   uint8_t *_src,            [r1]
-+@   int  stride_dst,          [r2]
-+@   int16_t *_sao_offset_val, [r3]
-+@   int eo,                   [sp, #0]
-+@   int width,                [sp, #4]
-+@   int height)               [sp, #8]
-+
-+@ bit_depth 8, is_chroma 0, check_w4 1, pb 1.
-+@ The table at 99: has two groups: entries 0-3 handle two 8-byte rows per
-+@ loop, entries 4-7 (taken when width < 8) four 4-byte rows per loop.
-+function ff_hevc_rpi_sao_edge_8_neon_8, export=1
-+        edge_16b_init   8, 0, 1, 99f
-+99:
-+        edge_8bx2_4bx4_bodies edge_16b_body_8, 1
-+endfunc
-+
-+@ void ff_hevc_rpi_sao_edge_16_neon_8(
-+@   uint8_t *_dst,            [r0]
-+@   uint8_t *_src,            [r1]
-+@   int  stride_dst,          [r2]
-+@   int16_t *_sao_offset_val, [r3]
-+@   int eo,                   [sp, #0]
-+@   int width,                [sp, #4]
-+@   int height)               [sp, #8]
-+
-+@ bit_depth 8, is_chroma 0, check_w4 0, pb 1.
-+@ Handlers process one 16-byte row per loop.
-+function ff_hevc_rpi_sao_edge_16_neon_8, export=1
-+        edge_16b_init   8, 0, 0, 99f
-+99:
-+        edge_16b_bodies edge_16b_body_8, 1
-+endfunc
-+
-+@ void ff_hevc_rpi_sao_edge_32_neon_8(
-+@   uint8_t *_dst,            [r0]
-+@   uint8_t *_src,            [r1]
-+@   int  stride_dst,          [r2]
-+@   int16_t *_sao_offset_val, [r3]
-+@   int eo,                   [sp, #0]
-+@   int width,                [sp, #4]
-+@   int height)               [sp, #8]
-+
-+@ bit_depth 8, is_chroma 0, do2 0, pb 1; q4-q8 are saved by the init code.
-+@ Handlers process two 32-byte rows per loop.
-+function ff_hevc_rpi_sao_edge_32_neon_8, export=1
-+        edge_64b_init   8, 0, 0, 99f
-+99:
-+        edge_32bx2_bodies edge_64b_body_8, 1
-+endfunc
-+
-+@ void ff_hevc_rpi_sao_edge_64_neon_8(
-+@   uint8_t *_dst,            [r0]
-+@   uint8_t *_src,            [r1]
-+@   int  stride_dst,          [r2]
-+@   int16_t *_sao_offset_val, [r3]
-+@   int eo,                   [sp, #0]
-+@   int width,                [sp, #4]
-+@   int height)               [sp, #8]
-+
-+@ bit_depth 8, is_chroma 0, do2 0, pb 1; q4-q8 are saved by the init code.
-+@ Handlers process one 64-byte row per loop.
-+function ff_hevc_rpi_sao_edge_64_neon_8, export=1
-+        edge_64b_init   8, 0, 0, 99f
-+99:
-+        edge_64b_bodies edge_64b_body_8, 1
-+endfunc
-+
-+@ ff_hevc_rpi_sao_edge_c_8_neon_8(
-+@   uint8_t *_dst,                    [r0]
-+@   const uint8_t *_src,              [r1]
-+@   ptrdiff_t stride_dst,             [r2]
-+@   const int16_t *_sao_offset_val_u, [r3]
-+@   const int16_t *_sao_offset_val_v, [sp, #0]
-+@   int eo,                           [sp, #4]
-+@   int width,                        [sp, #8]
-+@   int height)                       [sp, #12]
-+
-+@ bit_depth 8, is_chroma 1, check_w4 1, pb 2.
-+@ The table at 99: has two groups: entries 0-3 handle one 16-byte row per
-+@ loop, entries 4-7 (taken when width < 8) two 8-byte rows per loop.
-+function ff_hevc_rpi_sao_edge_c_8_neon_8, export=1
-+        edge_16b_init   8, 1, 1, 99f
-+99:
-+        edge_16b_8bx2_bodies edge_16b_body_8, 2
-+endfunc
-+
-+@ ff_hevc_rpi_sao_edge_c_16_neon_8(
-+@   uint8_t *_dst,                    [r0]
-+@   const uint8_t *_src,              [r1]
-+@   ptrdiff_t stride_dst,             [r2]
-+@   const int16_t *_sao_offset_val_u, [r3]
-+@   const int16_t *_sao_offset_val_v, [sp, #0]
-+@   int eo,                           [sp, #4]
-+@   int width,                        [sp, #8]
-+@   int height)                       [sp, #12]
-+
-+@ bit_depth 8, is_chroma 1, do2 0, pb 2; q4-q8 are saved by the init code.
-+@ Handlers process two 32-byte rows per loop.
-+function ff_hevc_rpi_sao_edge_c_16_neon_8, export=1
-+        edge_64b_init   8, 1, 0, 99f
-+99:
-+        edge_32bx2_bodies edge_64b_body_8, 2
-+endfunc
-+
-+@ ff_hevc_rpi_sao_edge_c_32_neon_8(
-+@   uint8_t *_dst,                    [r0]
-+@   const uint8_t *_src,              [r1]
-+@   ptrdiff_t stride_dst,             [r2]
-+@   const int16_t *_sao_offset_val_u, [r3]
-+@   const int16_t *_sao_offset_val_v, [sp, #0]
-+@   int eo,                           [sp, #4]
-+@   int width,                        [sp, #8]
-+@   int height)                       [sp, #12]
-+
-+@ bit_depth 8, is_chroma 1, do2 0, pb 2; q4-q8 are saved by the init code.
-+@ Handlers process one 64-byte row per loop.
-+function ff_hevc_rpi_sao_edge_c_32_neon_8, export=1
-+        edge_64b_init   8, 1, 0, 99f
-+99:
-+        edge_64b_bodies edge_64b_body_8, 2
-+endfunc
-+
-+@ void ff_hevc_rpi_sao_edge_8_neon_10(
-+@   uint8_t *_dst,            [r0]
-+@   uint8_t *_src,            [r1]
-+@   int  stride_dst,          [r2]
-+@   int16_t *_sao_offset_val, [r3]
-+@   int eo,                   [sp, #0]
-+@   int width,                [sp, #4]
-+@   int height)               [sp, #8]
-+
-+@ bit_depth 10, is_chroma 0, check_w4 1, pb 2.
-+@ The table at 99: has two groups: entries 0-3 handle one 16-byte row per
-+@ loop, entries 4-7 (taken when width < 8) two 8-byte rows per loop.
-+function ff_hevc_rpi_sao_edge_8_neon_10, export=1
-+        edge_16b_init   10, 0, 1, 99f
-+99:
-+        edge_16b_8bx2_bodies edge_16b_body_16, 2
-+endfunc
-+
-+@ void ff_hevc_rpi_sao_edge_16_neon_10(
-+@   uint8_t *_dst,            [r0]
-+@   uint8_t *_src,            [r1]
-+@   int  stride_dst,          [r2]
-+@   int16_t *_sao_offset_val, [r3]
-+@   int eo,                   [sp, #0]
-+@   int width,                [sp, #4]
-+@   int height)               [sp, #8]
-+
-+@ bit_depth 10, is_chroma 0, do2 0, pb 2; q4-q8 are saved by the init code.
-+@ Handlers process two 32-byte rows per loop.
-+function ff_hevc_rpi_sao_edge_16_neon_10, export=1
-+        edge_64b_init   10, 0, 0, 99f
-+99:
-+        edge_32bx2_bodies edge_64b_body_16, 2
-+endfunc
-+
-+@ void ff_hevc_rpi_sao_edge_64_neon_10(
-+@   uint8_t *_dst,            [r0]
-+@   uint8_t *_src,            [r1]
-+@   int  stride_dst,          [r2]
-+@   int16_t *_sao_offset_val, [r3]
-+@   int eo,                   [sp, #0]
-+@   int width,                [sp, #4]
-+@   int height)               [sp, #8]
-+
-+@ We simply split the 32 case into 2 vertical stripes
-+@ and call the fns for w32
-+@
-+@ Calling code will always have src != dst so we don't have to worry
-+@ about edge effects
-+
-+function ff_hevc_rpi_sao_edge_64_neon_10, export=1
-+        edge_64b_init   10, 0, 1, 99f, xjump=1
-+endfunc
-+
-+@ void ff_hevc_rpi_sao_edge_32_neon_10(
-+@   uint8_t *_dst,            [r0]
-+@   uint8_t *_src,            [r1]
-+@   int  stride_dst,          [r2]
-+@   int16_t *_sao_offset_val, [r3]
-+@   int eo,                   [sp, #0]
-+@   int width,                [sp, #4]
-+@   int height)               [sp, #8]
-+
-+function ff_hevc_rpi_sao_edge_32_neon_10, export=1
-+        edge_64b_init   10, 0, 0, 99f
-+99:
-+        edge_64b_bodies edge_64b_body_16, 2
-+endfunc
-+
-+@ ff_hevc_rpi_sao_edge_c_8_neon_10(
-+@   uint8_t *_dst,                    [r0]
-+@   const uint8_t *_src,              [r1]
-+@   ptrdiff_t stride_dst,             [r2]
-+@   const int16_t *_sao_offset_val_u, [r3]
-+@   const int16_t *_sao_offset_val_v, [sp, #0]
-+@   int eo,                           [sp, #4]
-+@   int width,                        [sp, #8]
-+@   int height)                       [sp, #12]
-+
-+function ff_hevc_rpi_sao_edge_c_8_neon_10, export=1
-+        edge_xxb_init   10, 1, 99f, check_w4=1, setup_16b=1, setup_64b=1
-+99:
-+        edge_32bx2_16b_bodies edge_64b_body_16, edge_16b_body_16, 4
-+endfunc
-+
-+@ ff_hevc_rpi_sao_edge_c_32_neon_10(
-+@   uint8_t *_dst,                    [r0]
-+@   const uint8_t *_src,              [r1]
-+@   ptrdiff_t stride_dst,             [r2]
-+@   const int16_t *_sao_offset_val_u, [r3]
-+@   const int16_t *_sao_offset_val_v, [sp, #0]
-+@   int eo,                           [sp, #4]
-+@   int width,                        [sp, #8]
-+@   int height)                       [sp, #12]
-+
-+function ff_hevc_rpi_sao_edge_c_32_neon_10, export=1
-+        edge_64b_init   10, 1, 1, 99f, xjump=1
-+endfunc
-+
-+
-+@ ff_hevc_rpi_sao_edge_c_16_neon_10(
-+@   uint8_t *_dst,                    [r0]
-+@   const uint8_t *_src,              [r1]
-+@   ptrdiff_t stride_dst,             [r2]
-+@   const int16_t *_sao_offset_val_u, [r3]
-+@   const int16_t *_sao_offset_val_v, [sp, #0]
-+@   int eo,                           [sp, #4]
-+@   int width,                        [sp, #8]
-+@   int height)                       [sp, #12]
-+
-+function ff_hevc_rpi_sao_edge_c_16_neon_10, export=1
-+        edge_64b_init   10, 1, 0, 99f
-+99:
-+        edge_64b_bodies edge_64b_body_16, 4
-+endfunc
-+
---- /dev/null
-+++ b/libavcodec/arm/rpi_hevcpred_arm.h
-@@ -0,0 +1,28 @@
-+/*
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#ifndef AVCODEC_ARM_HEVCPRED_ARM_H
-+#define AVCODEC_ARM_HEVCPRED_ARM_H
-+
-+#include "libavcodec/rpi_hevcpred.h"
-+
-+void ff_hevc_rpi_pred_init_arm(HEVCRpiPredContext * const c, const int bit_depth);
-+void ff_hevc_rpi_pred_init_neon(HEVCRpiPredContext * const c, const int bit_depth);
-+
-+#endif /* AVCODEC_ARM_HEVCPRED_ARM_H */
-+
---- /dev/null
-+++ b/libavcodec/arm/rpi_hevcpred_init_arm.c
-@@ -0,0 +1,35 @@
-+/*
-+ * Copyright (c) 2018 John Cox (for Raspberry Pi)
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#include "libavutil/attributes.h"
-+#include "libavutil/cpu.h"
-+#include "libavutil/arm/cpu.h"
-+
-+#include "libavcodec/rpi_hevcpred.h"
-+#include "rpi_hevcpred_arm.h"
-+
-+av_cold void ff_hevc_rpi_pred_init_arm(HEVCRpiPredContext * const c, const int bit_depth)
-+{
-+    int cpu_flags = av_get_cpu_flags();
-+
-+    if (have_neon(cpu_flags))
-+        ff_hevc_rpi_pred_init_neon(c, bit_depth);
-+}
-+
---- /dev/null
-+++ b/libavcodec/arm/rpi_hevcpred_init_neon.c
-@@ -0,0 +1,210 @@
-+/*
-+ * Copyright (c) 2018 John Cox (for Raspberry Pi)
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#include "rpi_hevcpred_arm.h"
-+
-+intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_8;
-+intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_8;
-+intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_16;
-+intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_16;
-+intra_filter_fn_t ff_hevc_rpi_intra_filter_16_neon_16;
-+intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_32;
-+intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_32;
-+intra_filter_fn_t ff_hevc_rpi_intra_filter_16_neon_32;
-+
-+void ff_hevc_rpi_pred_angular_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_angular_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_angular_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_angular_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_angular_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_angular_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_angular_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_angular_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_angular_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_angular_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_angular_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_angular_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_angular_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_angular_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+
-+void ff_hevc_rpi_pred_vertical_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_vertical_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_vertical_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_vertical_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_vertical_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_vertical_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_vertical_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_vertical_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_vertical_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_vertical_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_vertical_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_vertical_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_vertical_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_vertical_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+
-+void ff_hevc_rpi_pred_horizontal_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_horizontal_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_horizontal_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_horizontal_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_horizontal_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_horizontal_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_horizontal_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_horizontal_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_horizontal_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_horizontal_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_horizontal_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_horizontal_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_horizontal_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_horizontal_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+
-+void ff_hevc_rpi_pred_planar_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+void ff_hevc_rpi_pred_planar_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+void ff_hevc_rpi_pred_planar_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+void ff_hevc_rpi_pred_planar_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+void ff_hevc_rpi_pred_planar_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+void ff_hevc_rpi_pred_planar_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+void ff_hevc_rpi_pred_planar_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+void ff_hevc_rpi_pred_planar_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+void ff_hevc_rpi_pred_planar_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+void ff_hevc_rpi_pred_planar_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+void ff_hevc_rpi_pred_planar_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+void ff_hevc_rpi_pred_planar_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+void ff_hevc_rpi_pred_planar_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+void ff_hevc_rpi_pred_planar_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+
-+void ff_hevc_rpi_pred_dc_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+void ff_hevc_rpi_pred_dc_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+void ff_hevc_rpi_pred_dc_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+void ff_hevc_rpi_pred_dc_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+void ff_hevc_rpi_pred_dc_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+void ff_hevc_rpi_pred_dc_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+void ff_hevc_rpi_pred_dc_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+void ff_hevc_rpi_pred_dc_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+void ff_hevc_rpi_pred_dc_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+void ff_hevc_rpi_pred_dc_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+void ff_hevc_rpi_pred_dc_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+void ff_hevc_rpi_pred_dc_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+void ff_hevc_rpi_pred_dc_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+void ff_hevc_rpi_pred_dc_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+
-+void ff_hevc_rpi_pred_init_neon(HEVCRpiPredContext * const c, const int bit_depth)
-+{
-+    switch (bit_depth)
-+    {
-+    case 8:
-+        c->intra_filter[0] = ff_hevc_rpi_intra_filter_4_neon_8;
-+        c->intra_filter[1] = ff_hevc_rpi_intra_filter_8_neon_8;
-+        c->intra_filter_c[0] = ff_hevc_rpi_intra_filter_4_neon_16;  // Equivalent to c_4_neon_8
-+        c->intra_filter_c[1] = ff_hevc_rpi_intra_filter_8_neon_16;
-+        c->intra_filter_c[2] = ff_hevc_rpi_intra_filter_16_neon_16;
-+
-+        c->pred_angular[0] = ff_hevc_rpi_pred_angular_4_neon_8;
-+        c->pred_angular[1] = ff_hevc_rpi_pred_angular_8_neon_8;
-+        c->pred_angular[2] = ff_hevc_rpi_pred_angular_16_neon_8;
-+        c->pred_angular[3] = ff_hevc_rpi_pred_angular_32_neon_8;
-+        c->pred_angular_c[0] = ff_hevc_rpi_pred_angular_c_4_neon_8;
-+        c->pred_angular_c[1] = ff_hevc_rpi_pred_angular_c_8_neon_8;
-+        c->pred_angular_c[2] = ff_hevc_rpi_pred_angular_c_16_neon_8;
-+
-+        c->pred_horizontal[0] = ff_hevc_rpi_pred_horizontal_4_neon_8;
-+        c->pred_horizontal[1] = ff_hevc_rpi_pred_horizontal_8_neon_8;
-+        c->pred_horizontal[2] = ff_hevc_rpi_pred_horizontal_16_neon_8;
-+        c->pred_horizontal[3] = ff_hevc_rpi_pred_horizontal_32_neon_8;
-+        c->pred_horizontal_c[0] = ff_hevc_rpi_pred_horizontal_c_4_neon_8;
-+        c->pred_horizontal_c[1] = ff_hevc_rpi_pred_horizontal_c_8_neon_8;
-+        c->pred_horizontal_c[2] = ff_hevc_rpi_pred_horizontal_c_16_neon_8;
-+
-+        c->pred_vertical[0] = ff_hevc_rpi_pred_vertical_4_neon_8;
-+        c->pred_vertical[1] = ff_hevc_rpi_pred_vertical_8_neon_8;
-+        c->pred_vertical[2] = ff_hevc_rpi_pred_vertical_16_neon_8;
-+        c->pred_vertical[3] = ff_hevc_rpi_pred_vertical_32_neon_8;
-+        c->pred_vertical_c[0] = ff_hevc_rpi_pred_vertical_c_4_neon_8;
-+        c->pred_vertical_c[1] = ff_hevc_rpi_pred_vertical_c_8_neon_8;
-+        c->pred_vertical_c[2] = ff_hevc_rpi_pred_vertical_c_16_neon_8;
-+
-+        c->pred_planar[0] = ff_hevc_rpi_pred_planar_4_neon_8;
-+        c->pred_planar[1] = ff_hevc_rpi_pred_planar_8_neon_8;
-+        c->pred_planar[2] = ff_hevc_rpi_pred_planar_16_neon_8;
-+        c->pred_planar[3] = ff_hevc_rpi_pred_planar_32_neon_8;
-+        c->pred_planar_c[0] = ff_hevc_rpi_pred_planar_c_4_neon_8;
-+        c->pred_planar_c[1] = ff_hevc_rpi_pred_planar_c_8_neon_8;
-+        c->pred_planar_c[2] = ff_hevc_rpi_pred_planar_c_16_neon_8;
-+
-+        c->pred_dc[0]   = ff_hevc_rpi_pred_dc_4_neon_8;
-+        c->pred_dc[1]   = ff_hevc_rpi_pred_dc_8_neon_8;
-+        c->pred_dc[2]   = ff_hevc_rpi_pred_dc_16_neon_8;
-+        c->pred_dc[3]   = ff_hevc_rpi_pred_dc_32_neon_8;
-+        c->pred_dc_c[0] = ff_hevc_rpi_pred_dc_c_4_neon_8;
-+        c->pred_dc_c[1] = ff_hevc_rpi_pred_dc_c_8_neon_8;
-+        c->pred_dc_c[2] = ff_hevc_rpi_pred_dc_c_16_neon_8;
-+        break;
-+    case 10:
-+        c->intra_filter[0] = ff_hevc_rpi_intra_filter_4_neon_16;
-+        c->intra_filter[1] = ff_hevc_rpi_intra_filter_8_neon_16;
-+        c->intra_filter[2] = ff_hevc_rpi_intra_filter_16_neon_16;
-+        c->intra_filter_c[0] = ff_hevc_rpi_intra_filter_4_neon_32;
-+        c->intra_filter_c[1] = ff_hevc_rpi_intra_filter_8_neon_32;
-+        c->intra_filter_c[2] = ff_hevc_rpi_intra_filter_16_neon_32;
-+
-+        c->pred_angular[0] = ff_hevc_rpi_pred_angular_4_neon_10;
-+        c->pred_angular[1] = ff_hevc_rpi_pred_angular_8_neon_10;
-+        c->pred_angular[2] = ff_hevc_rpi_pred_angular_16_neon_10;
-+        c->pred_angular[3] = ff_hevc_rpi_pred_angular_32_neon_10;
-+        c->pred_angular_c[0] = ff_hevc_rpi_pred_angular_c_4_neon_10;
-+        c->pred_angular_c[1] = ff_hevc_rpi_pred_angular_c_8_neon_10;
-+        c->pred_angular_c[2] = ff_hevc_rpi_pred_angular_c_16_neon_10;
-+
-+        c->pred_horizontal[0] = ff_hevc_rpi_pred_horizontal_4_neon_10;
-+        c->pred_horizontal[1] = ff_hevc_rpi_pred_horizontal_8_neon_10;
-+        c->pred_horizontal[2] = ff_hevc_rpi_pred_horizontal_16_neon_10;
-+        c->pred_horizontal[3] = ff_hevc_rpi_pred_horizontal_32_neon_10;
-+        c->pred_horizontal_c[0] = ff_hevc_rpi_pred_horizontal_c_4_neon_10;
-+        c->pred_horizontal_c[1] = ff_hevc_rpi_pred_horizontal_c_8_neon_10;
-+        c->pred_horizontal_c[2] = ff_hevc_rpi_pred_horizontal_c_16_neon_10;
-+
-+        c->pred_vertical[0] = ff_hevc_rpi_pred_vertical_4_neon_10;
-+        c->pred_vertical[1] = ff_hevc_rpi_pred_vertical_8_neon_10;
-+        c->pred_vertical[2] = ff_hevc_rpi_pred_vertical_16_neon_10;
-+        c->pred_vertical[3] = ff_hevc_rpi_pred_vertical_32_neon_10;
-+        c->pred_vertical_c[0] = ff_hevc_rpi_pred_vertical_c_4_neon_10;
-+        c->pred_vertical_c[1] = ff_hevc_rpi_pred_vertical_c_8_neon_10;
-+        c->pred_vertical_c[2] = ff_hevc_rpi_pred_vertical_c_16_neon_10;
-+
-+        c->pred_planar[0] = ff_hevc_rpi_pred_planar_4_neon_10;
-+        c->pred_planar[1] = ff_hevc_rpi_pred_planar_8_neon_10;
-+        c->pred_planar[2] = ff_hevc_rpi_pred_planar_16_neon_10;
-+        c->pred_planar[3] = ff_hevc_rpi_pred_planar_32_neon_10;
-+        c->pred_planar_c[0] = ff_hevc_rpi_pred_planar_c_4_neon_10;
-+        c->pred_planar_c[1] = ff_hevc_rpi_pred_planar_c_8_neon_10;
-+        c->pred_planar_c[2] = ff_hevc_rpi_pred_planar_c_16_neon_10;
-+
-+        c->pred_dc[0]   = ff_hevc_rpi_pred_dc_4_neon_10;
-+        c->pred_dc[1]   = ff_hevc_rpi_pred_dc_8_neon_10;
-+        c->pred_dc[2]   = ff_hevc_rpi_pred_dc_16_neon_10;
-+        c->pred_dc[3]   = ff_hevc_rpi_pred_dc_32_neon_10;
-+        c->pred_dc_c[0] = ff_hevc_rpi_pred_dc_c_4_neon_10;
-+        c->pred_dc_c[1] = ff_hevc_rpi_pred_dc_c_8_neon_10;
-+        c->pred_dc_c[2] = ff_hevc_rpi_pred_dc_c_16_neon_10;
-+        break;
-+    default:
-+        break;
-+    }
-+}
-+
---- /dev/null
-+++ b/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S
-@@ -0,0 +1,2984 @@
-+/*
-+Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
-+All rights reserved.
-+
-+Redistribution and use in source and binary forms, with or without
-+modification, are permitted provided that the following conditions are met:
-+    * Redistributions of source code must retain the above copyright
-+      notice, this list of conditions and the following disclaimer.
-+    * Redistributions in binary form must reproduce the above copyright
-+      notice, this list of conditions and the following disclaimer in the
-+      documentation and/or other materials provided with the distribution.
-+    * Neither the name of the copyright holder nor the
-+      names of its contributors may be used to endorse or promote products
-+      derived from this software without specific prior written permission.
-+
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
-+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+
-+Authors: John Cox, Ben Avison
-+*/
-+
-+/*
-+ * General angular pred
-+ *
-+ * Horizontal (10) & Vertical (26) cases have their own file
-+ * and are not dealt with properly here (luma filtering is missing)
-+ *
-+ * The inv_angle calculations are annoying - if it wasn't for the +128
-+ * rounding step then the result would simply be the loop counter :-(
-+ */
-+
-+
-+#include "libavutil/arm/asm.S"
-+#include "neon.S"
-+
-+.text
-+
-+@ Horizontal Patch functions
-+@ These need a transpose before store so exist as smaller patches
-+@ Patches can be called repeatedly without any intermediate setup
-+@ to generate a horizontal block
-+@
-+@ It is almost certainly the case that larger patch fns can be built
-+@ and they would be a little faster, but we would still need the small
-+@ fns and code size (or at least instruction cache size) is an issue
-+@ given how much code we already have here
-+
-+@ Generate 8x8 luma 8 patch
-+@
-+@ r3   Out stride
-+@ r4   Angle add
-+@ r7   Inv angle (_up only)
-+@
-+@ In/Out (updated)
-+@ r0   Out pointer - on exit point to start of next patch horizontally (i.e. r0 + patch width)
-+@ r2   Left ptr - updated
-+@ r10  Inv angle accumulator (_up only)
-+@ r12  32 - angle frac (_down) or angle frac (_up)
-+@ d0   Older reference samples
-+@ d1=r8+r9  Newer reference samples
-+@ d2   32 - angle frac
-+@ d3   Angle frac
-+@ q2   Partially computed next result (_up only)
-+@
-+@ Temps
-+@ r5   Loop counter
-+@ r6
-+@ r7   (_down only)
-+@ r11  (_up only)
-+@ q2, q8-q11
-+
-+patch_h_down_8x8_8:
-+        ldrd        r8, r9, [r2]        @ Left
-+        rsb         r12, r6, #32
-+        vmov        d0, r8, r9
-+        vdup.8      d3, r6
-+        lsr         r8, #8
-+        vdup.8      d2, r12
-+        orr         r8, r8, r9, lsl #24
-+        ldr         r9, [r2, #5]!
-+        vmov        d1, r8, r9
-+        // drop through...
-+patch_h_down_8x8_8_continue:
-+        mov         r5, #8
-+1:
-+          subs        r12, r4
-+        vmull.u8    q2, d0, d2
-+          it          mi
-+          addmi       r12, #32
-+        vmlal.u8    q2, d1, d3
-+          rsb         r6, r12, #32
-+        vext.8      q8, q8, q9, #8
-+          itt         mi
-+          lsrmi       r7, r8, #8
-+          vmovmi      d0, r8, r9
-+          vdup.8      d2, r12
-+        vext.8      q9, q9, q10, #8
-+          it          mi
-+          orrmi       r8, r7, r9, lsl #24
-+        vext.8      q10, q10, q11, #8
-+          it          mi
-+          ldrmi       r9, [r2, #1]!
-+        vmov        d22, d23
-+        vrshrn.u16  d23, q2, #5
-+          it          mi
-+          vmovmi      d1, r8, r9
-+        subs        r5, #1
-+          vdup.8      d3, r6
-+        bne         1b
-+        // drop through...
-+store_tran_8x8_8:
-+        vzip.8      d16, d17
-+        add         r6, r0, r3
-+        vzip.8      d18, d19
-+        lsl         r3, #1
-+        vzip.8      d20, d21
-+        add         r5, r0, r3
-+        vzip.8      d22, d23
-+        vzip.16     q8, q9
-+        vzip.16     q10, q11
-+        vzip.32     q8, q10
-+        vzip.32     q9, q11
-+        vst1.8      {d16}, [r0]!
-+        vst1.8      {d17}, [r6], r3
-+        vst1.8      {d20}, [r5], r3
-+        vst1.8      {d21}, [r6], r3
-+        vst1.8      {d18}, [r5], r3
-+        vst1.8      {d19}, [r6], r3
-+        vst1.8      {d22}, [r5]
-+        asr         r3, #1
-+        vst1.8      {d23}, [r6]
-+
-+        bx          lr
-+
-+patch_h_up_8x8_8:
-+        ldrd        r8, r9, [r2]
-+        rsb         r6, r4, #32
-+        vmov        d0, r8, r9
-+        vdup.8      d3, r4
-+        lsr         r11, r8, #24
-+        vdup.8      d2, r6
-+        ldr         r8, [r2, #-1]!
-+        orr         r9, r11, r9, lsl #8
-+        vmov        d1, r8, r9
-+        mov         r12, r4
-+        vmull.u8    q2, d0, d2
-+        vmlal.u8    q2, d1, d3
-+patch_h_up_8x8_8_continue:
-+        mov         r5, #8
-+1:
-+          add         r12, r4
-+          mov         r11, #0
-+          cmp         r12, #33
-+          it          cs
-+          addcs       r10, r7
-+        vext.8      q8, q8, q9, #8
-+          itt         cs
-+          subcs       r12, #32
-+          tstcs       r10, #1<<31
-+          rsb         r6, r12, #32
-+          it          eq
-+          asreq       r11, r10, #8
-+          it          cs
-+          vmovcs      d0, r8, r9
-+          vdup.8      d2, r6
-+          it          cs
-+          lsrcs       r6, r8, #24
-+        vext.8      q9, q9, q10, #8
-+          itt         cs
-+          orrcs       r9, r6, r9, lsl #8
-+          ldrbcs      r11, [r1, r11]
-+          vdup.8      d3, r12
-+        vext.8      q10, q10, q11, #8
-+          it          hi
-+          ldrbhi      r11, [r2, #-1]!
-+        vmov        d22, d23
-+        vrshrn.u16  d23, q2, #5
-+          itt         cs
-+          orrcs       r8, r11, r8, lsl #8
-+          vmovcs      d1, r8, r9
-+          vmull.u8    q2, d0, d2
-+        subs        r5, #1
-+          vmlal.u8    q2, d1, d3
-+        bne         1b
-+
-+        b           store_tran_8x8_8
-+
-+
-+.macro ADRT reg, val
-+@ adr in T32 has enough range but not in A32
-+A       adrl        \reg, \val
-+T       adr         \reg, \val
-+.endm
-+
-+@ ff_hevc_rpi_pred_angular_4_neon_8
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride        [r3]
-+@       unsigned int mode       [sp, #0]  2..34
-+
-+function ff_hevc_rpi_pred_angular_4_neon_8, export=1
-+        ldr         r12, [sp]
-+        push        {r4-r8, lr}
-+        ADRT        r4, angle_2 - 2
-+        ADRT        r7, inv_angle - 11*2
-+        add         r7, r7, r12, lsl #1
-+        ldrsb       r6, [r4, r12]
-+        cmp         r12, #26
-+        ldrsb       r4, [r4, r12]
-+        bge         26f
-+        cmp         r12, #18
-+        bge         18f
-+        cmp         r12, #10
-+        bge         10f
-+
-+@ Down of Horizontal - works down left
-+        ldr         lr, [r2], #1        @ Top
-+        rsb         r12, r6, #32
-+        vmov        s0, lr
-+        vdup.8      d3, r6
-+        ldr         lr, [r2], #1
-+        vdup.8      d2, r12
-+        vmov        s2, lr
-+          subs        r12, r4
-+        vmull.u8    q2, d0, d2
-+          it          mi
-+          addmi       r12, #32
-+        vmlal.u8    q2, d1, d3
-+          rsb         r6, r12, #32
-+          itt         mi
-+          vmovmi      s0, lr
-+          ldrmi       lr, [r2], #1
-+          vdup.8      d2, r12
-+          it          mi
-+          vmovmi      s2, lr
-+          vdup.8      d3, r6
-+        mov         r5, #2
-+1:
-+        vrshrn.u16  d20, q2, #5
-+            subs        r12, r4
-+          vmull.u8    q2, d0, d2
-+            it          mi
-+            addmi       r12, #32
-+          vmlal.u8    q2, d1, d3
-+            rsb         r6, r12, #32
-+        vext.64     q8, q8, q9, #1
-+            it          mi
-+            vmovmi      s0, lr
-+        vext.64     q9, q9, q10, #1
-+            it          mi
-+            ldrmi       lr, [r2], #1
-+            vdup.8      d2, r12
-+            it          mi
-+            vmovmi      s2, lr
-+        subs        r5, #1
-+            vdup.8      d3, r6
-+        bne         1b
-+
-+          vrshrn.u16  d20, q2, #5
-+            vmull.u8    q2, d0, d2
-+        add         r12, r0,  r3
-+            vmlal.u8    q2, d1, d3
-+        lsl         r3,  #1
-+          vext.64     q8, q8, q9, #1
-+          vext.64     q9, q9, q10, #1
-+            vrshrn.u16  d20, q2, #5
-+
-+98:
-+        vst4.8      {d17[0], d18[0], d19[0], d20[0]}, [r0], r3
-+        vst4.8      {d17[1], d18[1], d19[1], d20[1]}, [r12], r3
-+        vst4.8      {d17[2], d18[2], d19[2], d20[2]}, [r0]
-+        vst4.8      {d17[3], d18[3], d19[3], d20[3]}, [r12]
-+        pop        {r4-r8, pc}
-+
-+@ Up of Horizontal - works down up
-+10:
-+        ldrh        r7, [r7]
-+        rsb         r12, r6, #32
-+        ldr         lr, [r2]            @ Left
-+        ldrb        r2, [r2, #-1]       @ Top-left
-+        vmov        s0, lr
-+        vdup.8      d2, r12
-+        vdup.8      d3, r6
-+        orr         lr, r2, lr, lsl #8
-+        vmov        s2, lr
-+        sub         r8, r7, #128
-+        mov         r5, #3
-+2:
-+        vmull.u8    q2, d0, d2
-+          subs        r12, r4
-+        vmlal.u8    q2, d1, d3
-+T         it          mi
-+          addmi       r12, #32
-+T         asr         r6, r8, #8
-+T         it          mi
-+T         ldrbmi      r2, [r1, r6]
-+A         ldrbmi      r2, [r1, r8, asr #8]
-+          rsb         r6, r12, #32
-+          vdup.8      d2, r12
-+          ittt        mi
-+          vmovmi      s0, lr
-+          orrmi       lr, r2, lr, lsl #8
-+          vmovmi      s2, lr
-+        vrshrn.u16  d20, q2, #5
-+          vdup.8      d3, r6
-+          it          mi
-+          addmi       r8, r7
-+        subs        r5, #1
-+        vext.64     q8, q8, q9, #1
-+        vext.64     q9, q9, q10, #1
-+        bne         2b
-+
-+          vmull.u8    q2, d0, d2
-+        add         r12, r0,  r3
-+          vmlal.u8    q2, d1, d3
-+        lsl         r3,  #1
-+          vrshrn.u16  d20, q2, #5
-+        b           98b
-+
-+@ Left of vertical - works down left
-+18:
-+        ldrh        r7, [r7]
-+        rsb         r12, r6, #32
-+        ldr         lr, [r1]            @ Top
-+        ldrb        r1, [r2, #-1]       @ Top-left
-+        vmov        s0, lr
-+        vdup.8      d2, r12
-+        vdup.8      d3, r6
-+        orr         lr, r1, lr, lsl #8
-+        vmov        s2, lr
-+        sub         r8, r7, #128
-+        mov         r5, #3
-+2:
-+        vmull.u8    q2, d0, d2
-+          subs        r12, r4
-+        vmlal.u8    q2, d1, d3
-+T         it          mi
-+          addmi       r12, #32
-+T         asr         r6, r8, #8
-+T         it          mi
-+T         ldrbmi      r1, [r2, r6]
-+A         ldrbmi      r1, [r2, r8, asr #8]
-+          rsb         r6, r12, #32
-+          vdup.8      d2, r12
-+          ittt        mi
-+          vmovmi      s0, lr
-+          orrmi       lr, r1, lr, lsl #8
-+          vmovmi      s2, lr
-+        vrshrn.u16  d4, q2, #5
-+          vdup.8      d3, r6
-+          it          mi
-+          addmi       r8, r7
-+        subs        r5, #1
-+        vst1.32     {d4[0]}, [r0], r3
-+        bne         2b
-+
-+          vmull.u8    q2, d0, d2
-+          vmlal.u8    q2, d1, d3
-+          vrshrn.u16  d4, q2, #5
-+          vst1.32     {d4[0]}, [r0]
-+
-+        pop         {r4-r8, pc}
-+
-+@ Right of vertical - works along top - left unused
-+26:
-+        ldr         lr, [r1], #1        @ Top
-+        rsb         r12, r6, #32
-+        vmov        s0, lr
-+        vdup.8      d3, r6
-+        ldr         lr, [r1], #1
-+        vdup.8      d2, r12
-+        vmov        s2, lr
-+          subs        r12, r4
-+        vmull.u8    q2, d0, d2
-+          it          mi
-+          addmi       r12, #32
-+        vmlal.u8    q2, d1, d3
-+          rsb         r6, r12, #32
-+          itt         mi
-+          vmovmi      s0, lr
-+          ldrmi       lr, [r1], #1
-+          vdup.8      d2, r12
-+          it          mi
-+          vmovmi      s2, lr
-+          vdup.8      d3, r6
-+        mov         r5, #2
-+1:
-+        vrshrn.u16  d6, q2, #5
-+            subs        r12, r4
-+          vmull.u8    q2, d0, d2
-+            it          mi
-+            addmi       r12, #32
-+          vmlal.u8    q2, d1, d3
-+            rsb         r6, r12, #32
-+        vst1.32     {d6[0]}, [r0], r3
-+            itt         mi
-+            vmovmi      s0, lr
-+            ldrmi       lr, [r1], #1
-+            vdup.8      d2, r12
-+            it          mi
-+            vmovmi      s2, lr
-+        subs        r5, #1
-+            vdup.8      d3, r6
-+        bne         1b
-+
-+          vrshrn.u16  d6, q2, #5
-+            vmull.u8    q2, d0, d2
-+            vmlal.u8    q2, d1, d3
-+          vst1.32     {d6[0]}, [r0], r3
-+            vrshrn.u16  d6, q2, #5
-+            vst1.32     {d6[0]}, [r0]
-+
-+        pop         {r4-r8, pc}
-+
-+endfunc
-+
-+
-+
-+@ ff_hevc_rpi_pred_angular_8_neon_8
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride        [r3]
-+@       unsigned int mode       [sp, #0]  2..34
-+@
-+@ Angular intra prediction that writes 8 rows of 8 bytes to _src.
-+@ The mode picks one of four paths: below 10, 10..17, 18..25, 26 and up.
-+@ The first two paths are delegated to patch_h_down_8x8_8 and
-+@ patch_h_up_8x8_8; the last two are implemented inline.  In the inline
-+@ paths each output byte is (a * (32 - f) + b * f + 16) >> 5 for two
-+@ reference bytes a, b and a weight f that is updated every row.
-+@ angle_2, inv_angle, ADRT and the patch_h_* helpers are defined outside
-+@ this section.
-+
-+function ff_hevc_rpi_pred_angular_8_neon_8, export=1
-+        @ r12 = mode, read from the caller's stack before the push moves sp
-+        ldr         r12, [sp]
-+        push        {r4-r11, lr}
-+        @ The table bases are biased (angle_2 - 2, inv_angle - 11*2) so that
-+        @ they can be indexed with the mode value directly
-+        ADRT        r4, angle_2 - 2
-+        ADRT        r7, inv_angle - 11*2
-+        @ r7 -> 16-bit inv_angle entry for this mode (only dereferenced on
-+        @ the 10 and 18 paths)
-+        add         r7, r7, r12, lsl #1
-+        @ r6 and r4 receive the same signed byte from the angle table:
-+        @ r6 seeds the blend weight, r4 is the per-row step
-+        ldrsb       r6, [r4, r12]
-+        cmp         r12, #26
-+        ldrsb       r4, [r4, r12]
-+        bge         26f
-+        cmp         r12, #18
-+        bge         18f
-+        cmp         r12, #10
-+        bge         10f
-+
-+@ Down of Horizontal - works down left
-+        @ mode < 10: the whole block is produced by the helper
-+        bl          patch_h_down_8x8_8
-+        pop         {r4-r11, pc}
-+
-+@ Up of Horizontal - works down up
-+10:
-+        @ r7 = inv_angle value, r10 = -128 are handed to the helper
-+        @ NOTE(review): r10 looks like the helper's initial position
-+        @ accumulator - confirm against patch_h_up_8x8_8
-+        ldrh        r7, [r7]
-+        mov         r10, #-128
-+        bl          patch_h_up_8x8_8
-+        pop         {r4-r11, pc}
-+
-+@ Left of vertical - works down left
-+18:
-+        @ Set-up: d0 = 8 top bytes; d1 = the same bytes moved up one byte
-+        @ with the byte at _left[-1] inserted at the bottom (register pair
-+        @ r8:r9 keeps a scalar copy of d1 for later shifting).
-+        @ d2 = 32 - r6 (weight for d0); d3 = r6 is set inside the loop.
-+        ldrd        r8, r9, [r1]        @ Top
-+        rsb         r12, r6, #32
-+        ldrb        lr, [r2, #-1]       @ Top-left
-+        ldrh        r7, [r7]
-+        vmov        d0, r8, r9
-+        lsl         r9, r9, #8
-+        vdup.8      d2, r12
-+        orr         r9, r9, r8, lsr #24
-+        orr         r8, lr, r8, lsl #8
-+        vmov        d1, r8, r9
-+        @ r1 is reused as a fixed-point position: starts at inv_angle - 128,
-+        @ grows by r7 on each reference shift, and (r1 asr 8) is the offset
-+        @ from _left of the next byte to pull in
-+        sub         r1, r7, #128
-+        @ 7 rows in the loop, the 8th row is emitted after it
-+        mov         r5, #7
-+1:
-+        vdup.8      d3, r6
-+        @ q2 = d0 * d2 + d1 * d3 (16-bit sums) for the current row
-+        vmull.u8    q2, d0, d2
-+          subs        r12, r12, r4
-+        vmlal.u8    q2, d1, d3
-+          @ When r12 goes negative the reference window moves by one byte:
-+          @ old d1 becomes d0, r12 is wrapped by +32, and a new byte is
-+          @ fetched from _left + (r1 asr 8) and shifted into r8:r9 / d1
-+          ittt        mi
-+          addmi       lr, r2, r1, asr #8
-+          addmi       r12, r12, #32
-+          vmovmi      d0, r8, r9
-+          rsb         r6, r12, #32
-+          itt         mi
-+          lslmi       r9, r9, #8
-+          ldrbmi      lr, [lr]
-+          vdup.8      d2, r12
-+        @ Round and narrow: (q2 + 16) >> 5 -> 8 output bytes
-+        vrshrn.u16  d4, q2, #5
-+          itttt       mi
-+          orrmi       r9, r9, r8, lsr #24
-+          orrmi       r8, lr, r8, lsl #8
-+          vmovmi      d1, r8, r9
-+          addmi       r1, r1, r7
-+        subs        r5, r5, #1
-+        vst1.8      {d4}, [r0], r3
-+        bne         1b
-+
-+          @ Last row: same blend, no further reference update needed
-+          vdup.8      d3, r6
-+          vmull.u8    q2, d0, d2
-+          vmlal.u8    q2, d1, d3
-+          vrshrn.u16  d4, q2, #5
-+          vst1.8      {d4}, [r0]
-+
-+        pop         {r4-r11, pc}
-+
-+@ Right of vertical - works along top - left unused
-+26:
-+        @ Set-up: d0 = top bytes 0..7, d1 = top bytes 1..8 (r8:r9 shifted
-+        @ down one byte, upper word reloaded from _top + 5 with writeback).
-+        @ d2 = 32 - r6 weights d0, d3 = r6 weights d1.
-+        ldrd        r8, r9, [r1]        @ Top
-+        rsb         r12, r6, #32
-+        vmov        d0, r8, r9
-+        vdup.8      d3, r6
-+        @ 7 rows in the loop, the 8th row is emitted after it
-+        mov         r5, #7
-+        lsr         r8, #8
-+        vdup.8      d2, r12
-+        orr         r8, r8, r9, lsl #24
-+        ldr         r9, [r1, #5]!
-+        vmov        d1, r8, r9
-+1:
-+        vmull.u8    q2, d0, d2
-+          subs        r12, r4
-+        vmlal.u8    q2, d1, d3
-+          @ When r12 goes negative: wrap it by +32, promote d1 to d0 and
-+          @ build the next d1 by shifting r8:r9 down one byte and reloading
-+          @ the upper word from the next _top address (r1 pre-incremented)
-+          it          mi
-+          addmi       r12, #32
-+          rsb         r6, r12, #32
-+          itt         mi
-+          vmovmi      d0, r8, r9
-+          lsrmi       r8, #8
-+          vdup.8      d2, r12
-+          itt         mi
-+          orrmi       r8, r8, r9, lsl #24
-+          ldrmi       r9, [r1, #1]!
-+        @ Round and narrow: (q2 + 16) >> 5 -> 8 output bytes
-+        vrshrn.u16  d6, q2, #5
-+          it          mi
-+          vmovmi      d1, r8, r9
-+          vdup.8      d3, r6
-+        subs        r5, #1
-+        vst1.8      {d6}, [r0], r3
-+        bne         1b
-+
-+          @ Last row
-+          vmull.u8    q2, d0, d2
-+          vmlal.u8    q2, d1, d3
-+          vrshrn.u16  d6, q2, #5
-+          vst1.8      {d6}, [r0]
-+
-+        pop         {r4-r11, pc}
-+
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_angular_16_neon_8
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride        [r3]
-+@       unsigned int mode       [sp, #0]  2..34
-+@
-+@ Angular intra prediction that writes 16 rows of 16 bytes to _src.
-+@ Modes below 18 are assembled from calls to the 8x8 patch helpers
-+@ (defined outside this section); modes 18 and up are computed inline,
-+@ one 16-byte row per iteration, as (a * (32 - f) + b * f + 16) >> 5.
-+@ The inline paths keep two copies of the reference registers
-+@ (q8/q9 and q10/q11) and ping-pong between loop 1 and loop 2 every
-+@ time the reference window shifts, so the shifted copy is always
-+@ prepared one row ahead.
-+
-+function ff_hevc_rpi_pred_angular_16_neon_8, export=1
-+        @ r12 = mode, read from the caller's stack before the push moves sp
-+        ldr         r12, [sp]
-+        push        {r4-r11, lr}
-+        @ Biased table bases so that the mode indexes them directly
-+        ADRT        r4, angle_2 - 2
-+        ADRT        r7, inv_angle - 11*2
-+        add         r7, r7, r12, lsl #1
-+        @ r6 (initial weight) and r4 (per-row step) get the same table byte
-+        ldrsb       r6, [r4, r12]
-+        cmp         r12, #26
-+        ldrsb       r4, [r4, r12]
-+        bge         26f
-+        cmp         r12, #18
-+        bge         18f
-+        cmp         r12, #10
-+        bge         10f
-+
-+@ Down of Horizontal - works down left
-+        mov         r1,  r2             @ save r2 - r1 unused by patch_down
-+
-+        @ Upper 8 rows: first 8x8 patch, then the patch to its right
-+        bl          patch_h_down_8x8_8
-+        bl          patch_h_down_8x8_8_continue
-+
-+        @ Re-arm for the lower 8 rows: r6 is reset to the step value, r0 is
-+        @ moved back 16 bytes and down 8 rows
-+        @ NOTE(review): assumes each helper call advances r0 by 8 - confirm
-+        add         r2, r1, #8          @ restore r2, but 8 rows further down left
-+        sub         r0, #16
-+        mov         r6, r4
-+        add         r0, r0, r3, lsl #3
-+
-+        bl          patch_h_down_8x8_8
-+        bl          patch_h_down_8x8_8_continue
-+
-+        pop         {r4-r11, pc}
-+
-+@ Up of Horizontal - works down up
-+10:
-+        ldrh        r7, [r7]
-+        mov         r10, #-128
-+
-+        @ Upper 8 rows; r2 is preserved across the helper calls
-+        push        {r2}
-+        bl          patch_h_up_8x8_8
-+        bl          patch_h_up_8x8_8_continue
-+        pop         {r2}
-+
-+        @ Lower 8 rows: _left advanced by 8, r0 back 16 bytes and down 8
-+        @ rows, r10 restarted at -128 - 8 * r7
-+        sub         r0, #16
-+        mov         r10, #-128
-+        add         r2, #8
-+        add         r0, r0, r3, lsl #3
-+        sub         r10, r10, r7, lsl #3
-+
-+        bl          patch_h_up_8x8_8
-+        bl          patch_h_up_8x8_8_continue
-+
-+        pop         {r4-r11, pc}
-+
-+@ Left of vertical - works down left
-+18:
-+        @ q9 = 16 top bytes; q8 = q9 moved up one lane with the byte at
-+        @ _left[-1] placed in lane 0.  d6 = r6 weights q8, d7 = 32 - r6
-+        @ weights q9.  r8 = inv_angle - 128 is a fixed-point position;
-+        @ (r8 asr 8) is the offset from _left of the next byte to pull in.
-+        vld1.8      {q9}, [r1]
-+        sub         r1, r2, #1
-+        rsb         r12, r6, #32
-+        ldrh        r7, [r7]
-+        vdup.8      d6, r6
-+        vext.8      q8, q9, q9, #15
-+        sub         r8, r7, #128
-+        vld1.8      {d16[0]}, [r1]
-+        vdup.8      d7, r12
-+        @ 15 rows are produced in the loops, the 16th in the tail code
-+        mov         r5, #15
-+1:
-+        @ Loop 1: current references are q9 / q8.  While the row is being
-+        @ blended, the shifted pair q11 (= q8) / q10 (= q8 moved up one lane
-+        @ plus the byte at [r1]) is prepared for loop 2.
-+        vmull.u8    q0, d18, d7
-+        subs        r12, r4
-+        vmlal.u8    q0, d16, d6
-+        @ cc = the subtraction borrowed: wrap r12 by +32 and advance the
-+        @ pointer / position used for the next new byte
-+        it          cc
-+        addcc       r12, #32
-+        vmull.u8    q1, d19, d7
-+        it          cc
-+        addcc       r1, r2, r8, asr #8
-+        vmlal.u8    q1, d17, d6
-+        rsb         r6, r12, #32
-+        vext.8      q10, q8, q8, #15
-+        sub         r5, #1
-+        vld1.8      {d20[0]}, [r1]
-+        it          cc
-+        addcc       r8, r7
-+        vmov        q11, q8
-+        @ teq supplies Z (rows left?); the branches below combine it with
-+        @ the carry still held from the subs above
-+        teq         r5, #0
-+        vrshrn.u16  d0, q0, #5
-+        vrshrn.u16  d1, q1, #5
-+        vdup.8      d6, r6
-+        vdup.8      d7, r12
-+        vst1.8      {q0}, [r0], r3
-+        @ no borrow and rows left: stay in loop 1
-+        bhi         1b
-+        @ no rows left: pick the tail that matches the borrow state
-+        beq         4f
-+        @ borrow and rows left: fall into loop 2 (shifted references)
-+2:
-+        @ Loop 2: mirror of loop 1 with the register roles swapped;
-+        @ current references are q11 / q10, next pair is built in q9 / q8
-+        vmull.u8    q0, d22, d7
-+        subs        r12, r4
-+        vmlal.u8    q0, d20, d6
-+        it          cc
-+        addcc       r12, #32
-+        vmull.u8    q1, d23, d7
-+        it          cc
-+        addcc       r1, r2, r8, asr #8
-+        vmlal.u8    q1, d21, d6
-+        rsb         r6, r12, #32
-+        vext.8      q8, q10, q10, #15
-+        sub         r5, #1
-+        vld1.8      {d16[0]}, [r1]
-+        it          cc
-+        addcc       r8, r7
-+        vmov        q9, q10
-+        teq         r5, #0
-+        vrshrn.u16  d0, q0, #5
-+        vrshrn.u16  d1, q1, #5
-+        vdup.8      d6, r6
-+        vdup.8      d7, r12
-+        vst1.8      {q0}, [r0], r3
-+        @ no borrow, rows left: stay in loop 2
-+        bhi         2b
-+        @ borrow, rows left: switch back to loop 1
-+        bne         1b
-+        @ no rows left: borrow -> tail 5 (q9 / q8), else tail 3 (q11 / q10)
-+        bcc         5f
-+3:
-+        @ Final row using the q11 / q10 reference pair
-+        vmull.u8    q0, d22, d7
-+        vmlal.u8    q0, d20, d6
-+        vmull.u8    q1, d23, d7
-+        vmlal.u8    q1, d21, d6
-+        vrshrn.u16  d0, q0, #5
-+        vrshrn.u16  d1, q1, #5
-+        vst1.8      {q0}, [r0]
-+
-+        pop         {r4-r11, pc}
-+4:
-+        @ Loop 1 ran out of rows: a borrow on its last row selects tail 3
-+        bcc         3b
-+5:
-+        @ Final row using the q9 / q8 reference pair
-+        vmull.u8    q0, d18, d7
-+        vmlal.u8    q0, d16, d6
-+        vmull.u8    q1, d19, d7
-+        vmlal.u8    q1, d17, d6
-+        vrshrn.u16  d0, q0, #5
-+        vrshrn.u16  d1, q1, #5
-+        vst1.8      {q0}, [r0]
-+
-+        pop         {r4-r11, pc}
-+
-+@ Right of vertical - works along top - left unused
-+26:
-+        @ q9 = top bytes 0..15 (r1 advanced past them); q8 = q9 moved down
-+        @ one lane with the next top byte loaded into lane 15.
-+        @ d6 = r6 weights q8, d7 = 32 - r6 weights q9.
-+        vld1.8      {q9}, [r1]!
-+        rsb         r12, r6, #32
-+        vdup.8      d6, r6
-+        vdup.8      d7, r12
-+        vext.8      q8, q9, q9, #1
-+        vld1.8      {d17[7]}, [r1]!
-+        @ 15 rows are produced in the loops, the 16th in the tail code
-+        mov         r5, #15
-+1:
-+        @ Loop 1: current references are q8 / q9; the shifted pair
-+        @ q10 (q8 moved down one lane + byte at [r1]) / q11 (= q8) is
-+        @ prepared for loop 2
-+        vmull.u8    q0, d16, d6
-+        subs        r12, r4
-+        vmlal.u8    q0, d18, d7
-+        @ cc = the subtraction borrowed: wrap r12 by +32
-+        it          cc
-+        addcc       r12, #32
-+        vmull.u8    q1, d17, d6
-+        rsb         r6, r12, #32
-+        vmlal.u8    q1, d19, d7
-+        sub         r5, #1
-+        vext.8      q10, q8, q8, #1
-+        @ teq supplies Z; carry from the subs above is tested alongside it
-+        teq         r5, #0
-+        vld1.8      {d21[7]}, [r1]
-+        @ only consume the top byte when the window really shifted
-+        it          cc
-+        addcc       r1, #1
-+        vmov        q11, q8
-+        vrshrn.u16  d0, q0, #5
-+        vrshrn.u16  d1, q1, #5
-+        vdup.8      d6, r6
-+        vdup.8      d7, r12
-+        vst1.8      {q0}, [r0], r3
-+        @ no borrow and rows left: stay in loop 1
-+        bhi         1b
-+        @ no rows left: pick the tail that matches the borrow state
-+        beq         4f
-+        @ borrow and rows left: fall into loop 2
-+2:
-+        @ Loop 2: mirror of loop 1; current references are q10 / q11,
-+        @ next pair is built in q8 / q9
-+        vmull.u8    q0, d20, d6
-+        subs        r12, r4
-+        vmlal.u8    q0, d22, d7
-+        it          cc
-+        addcc       r12, #32
-+        vmull.u8    q1, d21, d6
-+        rsb         r6, r12, #32
-+        vmlal.u8    q1, d23, d7
-+        sub         r5, #1
-+        vext.8      q8, q10, q10, #1
-+        teq         r5, #0
-+        vld1.8      {d17[7]}, [r1]
-+        it          cc
-+        addcc       r1, #1
-+        vmov        q9, q10
-+        vrshrn.u16  d0, q0, #5
-+        vrshrn.u16  d1, q1, #5
-+        vdup.8      d6, r6
-+        vdup.8      d7, r12
-+        vst1.8      {q0}, [r0], r3
-+        @ no borrow, rows left: stay in loop 2
-+        bhi         2b
-+        @ borrow, rows left: switch back to loop 1
-+        bne         1b
-+        @ no rows left: borrow -> tail 5 (q8 / q9), else tail 3 (q10 / q11)
-+        bcc         5f
-+3:
-+        @ Final row using the q10 / q11 reference pair
-+        vmull.u8    q0, d20, d6
-+        vmlal.u8    q0, d22, d7
-+        vmull.u8    q1, d21, d6
-+        vmlal.u8    q1, d23, d7
-+        vrshrn.u16  d0, q0, #5
-+        vrshrn.u16  d1, q1, #5
-+        vst1.8      {q0}, [r0]
-+
-+        pop         {r4-r11, pc}
-+4:
-+        @ Loop 1 ran out of rows: a borrow on its last row selects tail 3
-+        bcc         3b
-+5:
-+        @ Final row using the q8 / q9 reference pair
-+        vmull.u8    q0, d16, d6
-+        vmlal.u8    q0, d18, d7
-+        vmull.u8    q1, d17, d6
-+        vmlal.u8    q1, d19, d7
-+        vrshrn.u16  d0, q0, #5
-+        vrshrn.u16  d1, q1, #5
-+        vst1.8      {q0}, [r0]
-+
-+        pop         {r4-r11, pc}
-+
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_angular_32_neon_8
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride        [r3]
-+@       unsigned int mode       [sp, #0]  2..34
-+@
-+@ Angular intra prediction that writes 32 rows of 32 bytes to _src.
-+@ Modes below 18 are assembled from four bands of four 8x8 patch helper
-+@ calls (helpers defined outside this section); modes 18 and up are
-+@ computed inline, one 32-byte row per iteration, as
-+@ (a * (32 - f) + b * f + 16) >> 5.
-+
-+function ff_hevc_rpi_pred_angular_32_neon_8, export=1
-+        @ r12 = mode, read from the caller's stack before the push moves sp
-+        ldr         r12, [sp]
-+        push        {r4-r11, lr}
-+        @ Biased table bases so that the mode indexes them directly
-+        ADRT        r4, angle_2 - 2
-+        ADRT        r7, inv_angle - 11*2
-+        add         r7, r7, r12, lsl #1
-+        @ r6 (initial weight) and r4 (per-row step) get the same table byte
-+        ldrsb       r6, [r4, r12]
-+        cmp         r12, #26
-+        ldrsb       r4, [r4, r12]
-+        bge         26f
-+        cmp         r12, #18
-+        bge         18f
-+        cmp         r12, #10
-+        bge         10f
-+
-+@ Down of Horizontal - works down left
-+        @ r10 counts the four 8-row bands; r1 keeps the _left pointer for
-+        @ the current band because the helpers modify r2
-+        mov         r10, #4
-+        mov         r1, r2
-+1:
-+        @ One band: four 8x8 patches side by side
-+        bl          patch_h_down_8x8_8
-+        bl          patch_h_down_8x8_8_continue
-+        bl          patch_h_down_8x8_8_continue
-+        bl          patch_h_down_8x8_8_continue
-+
-+        @ Next band: _left advanced by 8, weight reset to the step value,
-+        @ r0 moved back 32 bytes and down 8 rows
-+        add         r2, r1, #8          @ restore r2, but 8 rows further down left
-+        add         r1, r1, #8
-+        mov         r6, r4
-+        sub         r0, #32
-+        subs        r10, #1
-+        add         r0, r0, r3, lsl #3
-+        bne         1b
-+
-+        pop        {r4-r11, pc}
-+
-+@ Up of Horizontal - works down up
-+10:
-+        ldrh        r7, [r7]
-+        mov         r10, #-128
-+        @ d6 is the band counter: every byte starts at 4 and is shifted
-+        @ right once per band; its low word is sampled into r8 before the
-+        @ shift, so the loop body runs four times
-+        vmov.i8     d6, #1<<2
-+1:
-+        @ r2 and r10 are preserved across the helper calls
-+        push        {r2,r10}
-+        bl          patch_h_up_8x8_8
-+        bl          patch_h_up_8x8_8_continue
-+        bl          patch_h_up_8x8_8_continue
-+        bl          patch_h_up_8x8_8_continue
-+        pop         {r2,r10}
-+
-+        @ Next band: _left advanced by 8, r0 back 32 bytes and down 8 rows,
-+        @ r10 reduced by 8 * r7
-+        vmov        r8, s12
-+        sub         r0, #32
-+        add         r2, #8
-+        add         r0, r0, r3, lsl #3
-+        sub         r10, r10, r7, lsl #3
-+        vshr.u8     d6, #1
-+        teq         r8, #0
-+        bne         1b
-+
-+        pop        {r4-r11, pc}
-+
-+@ Left of vertical - works down left
-+18:
-+        @ q0-q1 = 32 top bytes.  r9 -> next byte to shift in (starts at
-+        @ _left[-1]); r8 is a fixed-point position stepped by the
-+        @ inv_angle value r7, with (r8 asr 8) the offset from _left.
-+        @ d18 = r6 and d19 = 32 - r6 are the two blend weights.
-+        vld1.8      {q0-q1}, [r1]
-+        sub         r9, r2, #1
-+        rsb         r12, r6, #32
-+        ldrh        r7, [r7]
-+        mov         r8, #-128
-+        vdup.8      d18, r6
-+        vdup.8      d19, r12
-+        @ All 32 rows are stored from inside the loop
-+        mov         r5, #32
-+1:
-+        @ Reference shift: q2-q3 keep the previous window, q0-q1 become the
-+        @ window moved up one lane with the byte at [r9] entering lane 0;
-+        @ r9 is then pointed at the following byte
-+        vld1.8      {d17[7]}, [r9]
-+        add         r8, r7
-+        vmov        q2, q0
-+        vmov        q3, q1
-+        add         r9, r2, r8, asr #8
-+        vext.8      q1, q0, q1, #15
-+        vext.8      q0, q8, q0, #15
-+2:
-+        @ One row: q10..q13 = previous window * d19 + shifted window * d18
-+        vmull.u8    q10, d4, d19
-+        subs        r12, r4
-+        vmlal.u8    q10, d0, d18
-+        @ cc = the subtraction borrowed: wrap r12 by +32
-+        it          cc
-+        addcc       r12, #32
-+        vmull.u8    q11, d5, d19
-+        rsb         r6, r12, #32
-+        vmlal.u8    q11, d1, d18
-+        sub         r5, #1
-+        vmull.u8    q12, d6, d19
-+        @ teq supplies Z; carry from the subs above is tested alongside it
-+        teq         r5, #0
-+        vmlal.u8    q12, d2, d18
-+        vmull.u8    q13, d7, d19
-+        vmlal.u8    q13, d3, d18
-+        vdup.8      d18, r6
-+        vdup.8      d19, r12
-+        vrshrn.u16  d20, q10, #5
-+        vrshrn.u16  d21, q11, #5
-+        vrshrn.u16  d22, q12, #5
-+        vrshrn.u16  d23, q13, #5
-+        vst1.8      {q10-q11}, [r0], r3
-+        @ no borrow and rows left: next row with the same references
-+        bhi         2b
-+        @ borrow and rows left: shift the references first
-+        bne         1b
-+
-+        pop         {r4-r11, pc}
-+
-+@ Right of vertical - works along top - left unused
-+26:
-+        @ q0-q1 = top bytes 0..31 (r1 advanced past them), d16[0] = the
-+        @ byte at _top + 32, i.e. the next byte to shift in.
-+        @ d18 = r6 and d19 = 32 - r6 are the two blend weights.
-+        add         r5, r1, #32
-+        vld1.8      {q0-q1}, [r1]!
-+        rsb         r12, r6, #32
-+        vld1.8      {d16[0]}, [r5]
-+        @ All 32 rows are stored from inside the loop
-+        mov         r5, #32
-+        vdup.8      d18, r6
-+        vdup.8      d19, r12
-+1:
-+        @ Reference shift: q2-q3 keep the previous window, q0-q1 become the
-+        @ window moved down one lane with d16[0] entering the top lane;
-+        @ r1 steps to the following top byte
-+        vmov        q2, q0
-+        add         r1, #1
-+        vmov        q3, q1
-+        vext.8      q0, q0, q1, #1
-+        vext.8      q1, q1, q8, #1
-+2:
-+        @ One row: q10..q13 = shifted window * d18 + previous window * d19
-+        vmull.u8    q10, d0, d18
-+        subs        r12, r4
-+        vmlal.u8    q10, d4, d19
-+        @ cc = the subtraction borrowed: wrap r12 by +32
-+        it          cc
-+        addcc       r12, #32
-+        vmull.u8    q11, d1, d18
-+        rsb         r6, r12, #32
-+        vmlal.u8    q11, d5, d19
-+        sub         r5, #1
-+        vmull.u8    q12, d2, d18
-+        @ teq supplies Z; carry from the subs above is tested alongside it
-+        teq         r5, #0
-+        vmlal.u8    q12, d6, d19
-+        vmull.u8    q13, d3, d18
-+        vmlal.u8    q13, d7, d19
-+        @ Reload the candidate byte for the next shift on every row
-+        vld1.8      {d16[0]}, [r1]
-+        vdup.8      d18, r6
-+        vdup.8      d19, r12
-+        vrshrn.u16  d20, q10, #5
-+        vrshrn.u16  d21, q11, #5
-+        vrshrn.u16  d22, q12, #5
-+        vrshrn.u16  d23, q13, #5
-+        vst1.8      {q10-q11}, [r0], r3
-+        @ no borrow and rows left: next row with the same references
-+        bhi         2b
-+        @ borrow and rows left: shift the references first
-+        bne         1b
-+
-+        pop         {r4-r11, pc}
-+
-+endfunc
-+
-+
-+@ Chroma 8 bit 4x4 patch fns
-+        .text
-+
-+@ patch_h_down_c_4x4_8 / patch_h_down_c_4x4_8_continue / store_tran_c_4x4_8
-+@
-+@ Local helper (called with bl, returns with bx lr) that produces one 4x4
-+@ patch of 16-bit elements.  Four result vectors are computed one after
-+@ another, queued in d16..d19, then transposed and stored as four rows.
-+@ Registers as used below:
-+@   r0  destination; advanced by 8 bytes on return
-+@   r2  reference pointer (advanced as the window moves)
-+@   r3  row pitch in bytes (temporarily doubled, restored before return)
-+@   r4  per-vector step, r6 / r12 current weight and 32 - weight
-+@   r5, r7, r8, r9 scratch; d0-d3, q2, q8, q9 scratch
-+@ The _continue entry reuses the reference / weight state left by the
-+@ previous call to produce the next patch.
-+patch_h_down_c_4x4_8:
-+        @ d0 = four 16-bit reference elements, d1 = the window moved on by
-+        @ one 16-bit element (r8:r9 holds a scalar copy of d1)
-+        ldrd        r8, r9, [r2]        @ Left
-+        rsb         r12, r6, #32
-+        vmov        d0, r8, r9
-+        vdup.8      d3, r6
-+        lsr         r8, #16
-+        vdup.8      d2, r12
-+        orr         r8, r8, r9, lsl #16
-+        ldr         r9, [r2, #6]!
-+        vmov        d1, r8, r9
-+        // drop through...
-+patch_h_down_c_4x4_8_continue:
-+        @ Four vectors per patch
-+        mov         r5, #4
-+1:
-+          subs        r12, r4
-+        @ q2 = d0 * d2 + d1 * d3
-+        vmull.u8    q2, d0, d2
-+          @ mi: weight wrapped, so add 32 back and move the window on by
-+          @ one 16-bit element (d1 -> d0, new element loaded via r2)
-+          it          mi
-+          addmi       r12, #32
-+        vmlal.u8    q2, d1, d3
-+          rsb         r6, r12, #32
-+        @ Queue shift: d16 <- d17 <- d18 <- d19 <- new result
-+        vext.8      q8, q8, q9, #8
-+          it          mi
-+          lsrmi       r7, r8, #16
-+        vmov        d18, d19
-+          it          mi
-+          vmovmi      d0, r8, r9
-+          vdup.8      d2, r12
-+          it          mi
-+          orrmi       r8, r7, r9, lsl #16
-+        @ Round and narrow: (q2 + 16) >> 5
-+        vrshrn.u16  d19, q2, #5
-+          itt         mi
-+          ldrmi       r9, [r2, #2]!
-+          vmovmi      d1, r8, r9
-+        subs        r5, #1
-+          vdup.8      d3, r6
-+        bne         1b
-+        // drop through...
-+store_tran_c_4x4_8:
-+        @ Transpose the 4x4 block of 16-bit elements held in d16..d19 and
-+        @ store it as rows at r0, r0 + r3, r0 + 2*r3 and r0 + 3*r3
-+        vzip.16     d16, d17
-+        add         r6, r0, r3
-+        vzip.16     d18, d19
-+        lsl         r3, #1
-+        vzip.32     q8, q9
-+        add         r5, r0, r3
-+        @ r0 is post-incremented by 8 so a following _continue call writes
-+        @ the next patch to the right
-+        vst1.16     {d16}, [r0]!
-+        vst1.16     {d17}, [r6], r3
-+        vst1.16     {d18}, [r5]
-+        @ Undo the doubling of r3 before returning
-+        asr         r3, #1
-+        vst1.16     {d19}, [r6]
-+
-+        bx          lr
-+
-+@ patch_h_up_c_4x4_8 / patch_h_up_c_4x4_8_continue
-+@
-+@ Local helper (called with bl) that produces one 4x4 patch of 16-bit
-+@ elements and finishes by branching to store_tran_c_4x4_8, which
-+@ transposes, stores and returns to the caller.
-+@ Registers as used below:
-+@   r1  base for halfword loads indexed by r11
-+@   r2  reference pointer, walked backwards 2 bytes at a time
-+@   r4  per-vector step, r12 running weight, r6 = 32 - weight
-+@   r7  added to the position accumulator r10 on each window move
-+@   r5, r8, r9, r11 scratch; d0-d3, q2, q8, q9 scratch
-+@ The blend for a vector is started one iteration ahead of the point
-+@ where its result is narrowed, so q2 is live across iterations and
-+@ across the _continue entry.
-+patch_h_up_c_4x4_8:
-+        @ d0 = four 16-bit reference elements at r2; d1 = the window moved
-+        @ back by one element (new low element loaded from r2 - 2, with
-+        @ writeback).  d2 = 32 - r4 weights d0, d3 = r4 weights d1.
-+        ldrd        r8, r9, [r2]
-+        rsb         r6, r4, #32
-+        vmov        d0, r8, r9
-+        vdup.8      d3, r4
-+        lsr         r11, r8, #16
-+        vdup.8      d2, r6
-+        ldr         r8, [r2, #-2]!
-+        orr         r9, r11, r9, lsl #16
-+        vmov        d1, r8, r9
-+        mov         r12, r4
-+        @ Start the blend for the first vector
-+        vmull.u8    q2, d0, d2
-+        vmlal.u8    q2, d1, d3
-+patch_h_up_c_4x4_8_continue:
-+        @ Four vectors per patch
-+        mov         r5, #4
-+1:
-+          @ Advance the weight; cs (r12 >= 33) means the window must move
-+          add         r12, r4
-+          cmp         r12, #33
-+          it          cs
-+          addcs       r10, r7
-+          mov         r11, #0
-+          @ On a window move: wrap the weight and test the sign bit of r10.
-+          @ The later eq / hi conditions are only true on this path:
-+          @   eq - bit 31 of r10 clear, hi - bit 31 of r10 set
-+          @ (relies on C still being set after the conditional tst)
-+          itt         cs
-+          subcs       r12, #32
-+          tstcs       r10, #1<<31
-+          rsb         r6, r12, #32
-+          @ eq: r11 = (r10 asr 7) with bit 0 cleared, a byte offset from r1
-+          it          eq
-+          asreq       r11, r10, #7
-+          it          cs
-+          vmovcs      d0, r8, r9
-+          it          eq
-+          biceq       r11, #1
-+          vdup.8      d2, r6
-+          it          cs
-+          lsrcs       r6, r8, #16
-+          vdup.8      d3, r12
-+        @ Queue shift: d16 <- d17 <- d18 <- d19 <- new result
-+        vext.8      q8, q8, q9, #8
-+          @ New element: halfword at r1 + r11 ...
-+          itt         cs
-+          orrcs       r9, r6, r9, lsl #16
-+          ldrhcs      r11, [r1, r11]
-+        vmov        d18, d19
-+          @ ... replaced, on the hi path, by the halfword below r2
-+          it          hi
-+          ldrhhi      r11, [r2, #-2]!
-+        @ Round and narrow the blend started in the previous iteration
-+        vrshrn.u16  d19, q2, #5
-+          itt         cs
-+          orrcs       r8, r11, r8, lsl #16
-+          vmovcs      d1, r8, r9
-+          @ Start the blend for the next vector
-+          vmull.u8    q2, d0, d2
-+        subs        r5, #1
-+          vmlal.u8    q2, d1, d3
-+        bne         1b
-+
-+        @ Tail call: transpose, store and return through lr
-+        b           store_tran_c_4x4_8
-+
-+
-+@ ff_hevc_rpi_pred_angular_c_4_neon_8
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride        [r3]
-+@       unsigned int mode       [sp, #0]  2..34
-+@
-+@ Angular intra prediction that writes 4 rows of four 16-bit elements
-+@ (8 bytes per row) to _src; blending is done per byte.
-+@ The mode picks one of four paths: below 10, 10..17, 18..25, 26 and up.
-+@ The first two are delegated to patch_h_down_c_4x4_8 / patch_h_up_c_4x4_8;
-+@ the last two are implemented inline as
-+@ (a * (32 - f) + b * f + 16) >> 5.
-+@ angle_2, inv_angle and ADRT are defined outside this section.
-+
-+function ff_hevc_rpi_pred_angular_c_4_neon_8, export=1
-+        @ r12 = mode, read from the caller's stack before the push moves sp
-+        ldr         r12, [sp]
-+        push        {r4-r11, lr}
-+        @ Biased table bases so that the mode indexes them directly
-+        ADRT        r4, angle_2 - 2
-+        ADRT        r7, inv_angle - 11*2
-+        add         r7, r7, r12, lsl #1
-+        @ stride is doubled here
-+        @ NOTE(review): presumably it arrives in 2-byte units - confirm
-+        lsl         r3, #1
-+        @ r6 (initial weight) and r4 (per-row step) get the same table byte
-+        ldrsb       r6, [r4, r12]
-+        cmp         r12, #26
-+        ldrsb       r4, [r4, r12]
-+        bge         26f
-+        cmp         r12, #18
-+        bge         18f
-+        cmp         r12, #10
-+        bge         10f
-+
-+@ Down of Horizontal - works down left
-+        @ mode < 10: the whole block is produced by the helper
-+        bl          patch_h_down_c_4x4_8
-+        pop         {r4-r11, pc}
-+
-+@ Up of Horizontal - works down up
-+10:
-+        @ r7 = inv_angle value, r10 = -128 initial position for the helper
-+        ldrh        r7, [r7]
-+        mov         r10, #-128
-+        bl          patch_h_up_c_4x4_8
-+        pop         {r4-r11, pc}
-+
-+@ Left of vertical - works down left
-+18:
-+        @ Set-up: d0 = four 16-bit top elements; d1 = the same elements
-+        @ moved up by one element with the halfword at _left - 2 inserted
-+        @ at the bottom (r8:r9 keeps a scalar copy of d1).
-+        @ d2 = 32 - r6 weights d0; d3 = r6 is set inside the loop.
-+        ldrd        r8, r9, [r1]        @ Top
-+        rsb         r12, r6, #32
-+        ldrh        lr, [r2, #-2]       @ Top-left
-+        ldrh        r7, [r7]
-+        vmov        d0, r8, r9
-+        lsl         r9, r9, #16
-+        vdup.8      d2, r12
-+        orr         r9, r9, r8, lsr #16
-+        orr         r8, lr, r8, lsl #16
-+        vmov        d1, r8, r9
-+        @ r1 is reused as a fixed-point position: starts at inv_angle - 128
-+        @ and grows by r7 on each reference shift
-+        sub         r1, r7, #128
-+        @ 3 rows in the loop, the 4th row is emitted after it
-+        mov         r5, #3
-+1:
-+        vdup.8      d3, r6
-+        @ q2 = d0 * d2 + d1 * d3 for the current row
-+        vmull.u8    q2, d0, d2
-+          subs        r12, r12, r4
-+        vmlal.u8    q2, d1, d3
-+          @ When r12 goes negative the window moves by one 16-bit element:
-+          @ the new element is the halfword at (_left + (r1 asr 7)) with
-+          @ bit 0 of the address cleared
-+          itttt       mi
-+          addmi       lr, r2, r1, asr #7
-+          bicmi       lr, #1
-+          addmi       r12, r12, #32
-+          vmovmi      d0, r8, r9
-+          rsb         r6, r12, #32
-+          itt         mi
-+          lslmi       r9, r9, #16
-+          ldrhmi      lr, [lr]
-+          vdup.8      d2, r12
-+        @ Round and narrow: (q2 + 16) >> 5
-+        vrshrn.u16  d4, q2, #5
-+          itttt       mi
-+          orrmi       r9, r9, r8, lsr #16
-+          orrmi       r8, lr, r8, lsl #16
-+          vmovmi      d1, r8, r9
-+          addmi       r1, r1, r7
-+        subs        r5, r5, #1
-+        vst1.16     {d4}, [r0], r3
-+        bne         1b
-+
-+          @ Last row
-+          vdup.8      d3, r6
-+          vmull.u8    q2, d0, d2
-+          vmlal.u8    q2, d1, d3
-+          vrshrn.u16  d4, q2, #5
-+          vst1.16     {d4}, [r0]
-+
-+        pop         {r4-r11, pc}
-+
-+@ Right of vertical - works along top - left unused
-+26:
-+        @ Set-up: d0 = top elements 0..3, d1 = top elements 1..4 (r8:r9
-+        @ shifted down 16 bits, upper word reloaded from _top + 6 with
-+        @ writeback).  d2 = 32 - r6 weights d0, d3 = r6 weights d1.
-+        ldrd        r8, r9, [r1]        @ Top
-+        rsb         r12, r6, #32
-+        vmov        d0, r8, r9
-+        vdup.8      d3, r6
-+        @ 3 rows in the loop, the 4th row is emitted after it
-+        mov         r5, #3
-+        lsr         r8, #16
-+        vdup.8      d2, r12
-+        orr         r8, r8, r9, lsl #16
-+        ldr         r9, [r1, #6]!
-+        vmov        d1, r8, r9
-+1:
-+        vmull.u8    q2, d0, d2
-+          subs        r12, r4
-+        vmlal.u8    q2, d1, d3
-+          @ When r12 goes negative: wrap it by +32, promote d1 to d0 and
-+          @ build the next d1 from r8:r9 shifted down 16 bits plus a word
-+          @ loaded 2 bytes further along _top
-+          it          mi
-+          addmi       r12, #32
-+          rsb         r6, r12, #32
-+          itt         mi
-+          vmovmi      d0, r8, r9
-+          lsrmi       r8, #16
-+          vdup.8      d2, r12
-+          itt         mi
-+          orrmi       r8, r8, r9, lsl #16
-+          ldrmi       r9, [r1, #2]!
-+        @ Round and narrow: (q2 + 16) >> 5
-+        vrshrn.u16  d6, q2, #5
-+          it          mi
-+          vmovmi      d1, r8, r9
-+          vdup.8      d3, r6
-+        subs        r5, #1
-+        vst1.16     {d6}, [r0], r3
-+        bne         1b
-+
-+          @ Last row
-+          vmull.u8    q2, d0, d2
-+          vmlal.u8    q2, d1, d3
-+          vrshrn.u16  d6, q2, #5
-+          vst1.16     {d6}, [r0]
-+
-+        pop         {r4-r11, pc}
-+
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_angular_c_8_neon_8
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride        [r3]
-+@       unsigned int mode       [sp, #0]  2..34
-+@
-+@ Angular intra prediction that writes 8 rows of eight 16-bit elements
-+@ (16 bytes per row) to _src; blending is done per byte.
-+@ Modes below 18 are assembled from 4x4 patch helper calls
-+@ (patch_h_down_c_4x4_8 / patch_h_up_c_4x4_8 and their _continue
-+@ entries); modes 18 and up are computed inline, one row per iteration,
-+@ as (a * (32 - f) + b * f + 16) >> 5.  The inline paths ping-pong
-+@ between loop 1 (references in q8/q9) and loop 2 (references in
-+@ q10/q11) every time the reference window shifts.
-+
-+function ff_hevc_rpi_pred_angular_c_8_neon_8, export=1
-+        @ r12 = mode, read from the caller's stack before the push moves sp
-+        ldr         r12, [sp]
-+        push        {r4-r11, lr}
-+        @ Biased table bases so that the mode indexes them directly
-+        ADRT        r4, angle_2 - 2
-+        ADRT        r7, inv_angle - 11*2
-+        add         r7, r7, r12, lsl #1
-+        @ stride is doubled here
-+        @ NOTE(review): presumably it arrives in 2-byte units - confirm
-+        lsl         r3, #1
-+        @ r6 (initial weight) and r4 (per-row step) get the same table byte
-+        ldrsb       r6, [r4, r12]
-+        cmp         r12, #26
-+        ldrsb       r4, [r4, r12]
-+        bge         26f
-+        cmp         r12, #18
-+        bge         18f
-+        cmp         r12, #10
-+        bge         10f
-+
-+@ Down of Horizontal - works down left
-+        mov         r1,  r2             @ save r2 - r1 unused by patch_down
-+
-+        @ Upper 4 rows: first 4x4 patch, then the patch to its right
-+        bl          patch_h_down_c_4x4_8
-+        bl          patch_h_down_c_4x4_8_continue
-+
-+        @ Lower 4 rows: weight reset to the step value, r0 moved back over
-+        @ the 16 bytes the two patches advanced it and down 4 rows
-+        add         r2, r1, #4*2        @ restore r2, but 4 rows further down left
-+        sub         r0, #16
-+        mov         r6, r4
-+        add         r0, r0, r3, lsl #2
-+
-+        bl          patch_h_down_c_4x4_8
-+        bl          patch_h_down_c_4x4_8_continue
-+
-+        pop         {r4-r11, pc}
-+
-+@ Up of Horizontal - works down up
-+10:
-+        ldrh        r7, [r7]
-+        mov         r10, #-128
-+
-+        @ Upper 4 rows; r2 is preserved across the helper calls
-+        push        {r2}
-+        bl          patch_h_up_c_4x4_8
-+        bl          patch_h_up_c_4x4_8_continue
-+        pop         {r2}
-+
-+        @ Lower 4 rows: _left advanced by 8 bytes, r0 back 16 bytes and
-+        @ down 4 rows, r10 restarted at -128 - 4 * r7
-+        sub         r0, #16
-+        mov         r10, #-128
-+        add         r2, #8
-+        add         r0, r0, r3, lsl #2
-+        sub         r10, r10, r7, lsl #2
-+
-+        bl          patch_h_up_c_4x4_8
-+        bl          patch_h_up_c_4x4_8_continue
-+
-+        pop         {r4-r11, pc}
-+
-+@ Left of vertical - works down left
-+18:
-+        @ q9 = 16 top bytes (eight 16-bit elements); q8 = q9 moved up one
-+        @ element with the halfword at _left - 2 placed in element 0.
-+        @ d6 = r6 weights q8, d7 = 32 - r6 weights q9.  r8 = inv_angle - 128
-+        @ is a fixed-point position; (r8 asr 8) * 2 is the byte offset from
-+        @ _left of the next element to pull in.
-+        vld1.8      {q9}, [r1]
-+        sub         r1, r2, #2
-+        rsb         r12, r6, #32
-+        ldrh        r7, [r7]
-+        vdup.8      d6, r6
-+        vext.8      q8, q9, q9, #14
-+        sub         r8, r7, #128
-+        vld1.16     {d16[0]}, [r1]
-+        vdup.8      d7, r12
-+        @ 7 rows are produced in the loops, the 8th in the tail code
-+        mov         r5, #7
-+1:
-+        @ Loop 1: current references are q9 / q8; the shifted pair
-+        @ q11 (= q8) / q10 (q8 moved up one element + halfword at [r1])
-+        @ is prepared for loop 2
-+        subs        r12, r4
-+        vmull.u8    q0, d18, d7
-+        @ cc = the subtraction borrowed: wrap r12 by +32 and recompute the
-+        @ address of the next new element
-+        it          cc
-+        asrcc       r1, r8, #8
-+        vmlal.u8    q0, d16, d6
-+        it          cc
-+        addcc       r12, #32
-+        vmull.u8    q1, d19, d7
-+        it          cc
-+        addcc       r1, r2, r1, lsl #1
-+        vmlal.u8    q1, d17, d6
-+        rsb         r6, r12, #32
-+        vext.8      q10, q8, q8, #14
-+        sub         r5, #1
-+        vld1.16     {d20[0]}, [r1]
-+        it          cc
-+        addcc       r8, r7
-+        vmov        q11, q8
-+        @ teq supplies Z; carry from the subs above is tested alongside it
-+        teq         r5, #0
-+        vrshrn.u16  d0, q0, #5
-+        vrshrn.u16  d1, q1, #5
-+        vdup.8      d6, r6
-+        vdup.8      d7, r12
-+        vst1.8      {q0}, [r0], r3
-+        @ no borrow and rows left: stay in loop 1
-+        bhi         1b
-+        @ no rows left: pick the tail that matches the borrow state
-+        beq         4f
-+        @ borrow and rows left: fall into loop 2
-+2:
-+        @ Loop 2: mirror of loop 1; current references are q11 / q10,
-+        @ next pair is built in q9 / q8
-+        subs        r12, r4
-+        vmull.u8    q0, d22, d7
-+        it          cc
-+        asrcc       r1, r8, #8
-+        vmlal.u8    q0, d20, d6
-+        it          cc
-+        addcc       r12, #32
-+        vmull.u8    q1, d23, d7
-+        it          cc
-+        addcc       r1, r2, r1, lsl #1
-+        vmlal.u8    q1, d21, d6
-+        rsb         r6, r12, #32
-+        vext.8      q8, q10, q10, #14
-+        sub         r5, #1
-+        vld1.16     {d16[0]}, [r1]
-+        it          cc
-+        addcc       r8, r7
-+        vmov        q9, q10
-+        teq         r5, #0
-+        vrshrn.u16  d0, q0, #5
-+        vrshrn.u16  d1, q1, #5
-+        vdup.8      d6, r6
-+        vdup.8      d7, r12
-+        vst1.8      {q0}, [r0], r3
-+        @ no borrow, rows left: stay in loop 2
-+        bhi         2b
-+        @ borrow, rows left: switch back to loop 1
-+        bne         1b
-+        @ no rows left: borrow -> tail 5 (q9 / q8), else tail 3 (q11 / q10)
-+        bcc         5f
-+3:
-+        @ Final row using the q11 / q10 reference pair
-+        vmull.u8    q0, d22, d7
-+        vmlal.u8    q0, d20, d6
-+        vmull.u8    q1, d23, d7
-+        vmlal.u8    q1, d21, d6
-+        vrshrn.u16  d0, q0, #5
-+        vrshrn.u16  d1, q1, #5
-+        vst1.8      {q0}, [r0]
-+
-+        pop         {r4-r11, pc}
-+4:
-+        @ Loop 1 ran out of rows: a borrow on its last row selects tail 3
-+        bcc         3b
-+5:
-+        @ Final row using the q9 / q8 reference pair
-+        vmull.u8    q0, d18, d7
-+        vmlal.u8    q0, d16, d6
-+        vmull.u8    q1, d19, d7
-+        vmlal.u8    q1, d17, d6
-+        vrshrn.u16  d0, q0, #5
-+        vrshrn.u16  d1, q1, #5
-+        vst1.8      {q0}, [r0]
-+
-+        pop         {r4-r11, pc}
-+
-+@ Right of vertical - works along top - left unused
-+26:
-+        @ q9 = top elements 0..7 (r1 advanced past them); q8 = q9 moved down
-+        @ one element with the next top halfword loaded into element 7.
-+        @ d6 = r6 weights q8, d7 = 32 - r6 weights q9.
-+        vld1.8      {q9}, [r1]!
-+        rsb         r12, r6, #32
-+        vdup.8      d6, r6
-+        vdup.8      d7, r12
-+        vext.8      q8, q9, q9, #2
-+        vld1.16     {d17[3]}, [r1]!
-+        @ 7 rows are produced in the loops, the 8th in the tail code
-+        mov         r5, #7
-+1:
-+        @ Loop 1: current references are q8 / q9; the shifted pair
-+        @ q10 (q8 moved down one element + halfword at [r1]) / q11 (= q8)
-+        @ is prepared for loop 2
-+        vmull.u8    q0, d16, d6
-+        subs        r12, r4
-+        vmlal.u8    q0, d18, d7
-+        @ cc = the subtraction borrowed: wrap r12 by +32
-+        it          cc
-+        addcc       r12, #32
-+        vmull.u8    q1, d17, d6
-+        rsb         r6, r12, #32
-+        vmlal.u8    q1, d19, d7
-+        sub         r5, #1
-+        vext.8      q10, q8, q8, #2
-+        @ teq supplies Z; carry from the subs above is tested alongside it
-+        teq         r5, #0
-+        vld1.16     {d21[3]}, [r1]
-+        @ only consume the top element when the window really shifted
-+        it          cc
-+        addcc       r1, #2
-+        vmov        q11, q8
-+        vrshrn.u16  d0, q0, #5
-+        vrshrn.u16  d1, q1, #5
-+        vdup.8      d6, r6
-+        vdup.8      d7, r12
-+        vst1.8      {q0}, [r0], r3
-+        @ no borrow and rows left: stay in loop 1
-+        bhi         1b
-+        @ no rows left: pick the tail that matches the borrow state
-+        beq         4f
-+        @ borrow and rows left: fall into loop 2
-+2:
-+        @ Loop 2: mirror of loop 1; current references are q10 / q11,
-+        @ next pair is built in q8 / q9
-+        vmull.u8    q0, d20, d6
-+        subs        r12, r4
-+        vmlal.u8    q0, d22, d7
-+        it          cc
-+        addcc       r12, #32
-+        vmull.u8    q1, d21, d6
-+        rsb         r6, r12, #32
-+        vmlal.u8    q1, d23, d7
-+        sub         r5, #1
-+        vext.8      q8, q10, q10, #2
-+        teq         r5, #0
-+        vld1.16     {d17[3]}, [r1]
-+        it          cc
-+        addcc       r1, #2
-+        vmov        q9, q10
-+        vrshrn.u16  d0, q0, #5
-+        vrshrn.u16  d1, q1, #5
-+        vdup.8      d6, r6
-+        vdup.8      d7, r12
-+        vst1.8      {q0}, [r0], r3
-+        @ no borrow, rows left: stay in loop 2
-+        bhi         2b
-+        @ borrow, rows left: switch back to loop 1
-+        bne         1b
-+        @ no rows left: borrow -> tail 5 (q8 / q9), else tail 3 (q10 / q11)
-+        bcc         5f
-+3:
-+        @ Final row using the q10 / q11 reference pair
-+        vmull.u8    q0, d20, d6
-+        vmlal.u8    q0, d22, d7
-+        vmull.u8    q1, d21, d6
-+        vmlal.u8    q1, d23, d7
-+        vrshrn.u16  d0, q0, #5
-+        vrshrn.u16  d1, q1, #5
-+        vst1.8      {q0}, [r0]
-+
-+        pop         {r4-r11, pc}
-+4:
-+        @ Loop 1 ran out of rows: a borrow on its last row selects tail 3
-+        bcc         3b
-+5:
-+        @ Final row using the q8 / q9 reference pair
-+        vmull.u8    q0, d16, d6
-+        vmlal.u8    q0, d18, d7
-+        vmull.u8    q1, d17, d6
-+        vmlal.u8    q1, d19, d7
-+        vrshrn.u16  d0, q0, #5
-+        vrshrn.u16  d1, q1, #5
-+        vst1.8      {q0}, [r0]
-+
-+        pop         {r4-r11, pc}
-+
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_angular_c_16_neon_8
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride        [r3]
-+@       unsigned int mode       [sp, #0]  2..34
-+@
-+@ Dispatch on mode: 2..9 fall through, 10..17 -> 10f, 18..25 -> 18f,
-+@ 26..34 -> 26f.  Every path writes 16 rows of 32 bytes.
-+@ NOTE(review): 32 bytes per row presumably means 16 interleaved U/V byte
-+@ pairs - confirm against the C caller.
-+
-+function ff_hevc_rpi_pred_angular_c_16_neon_8, export=1
-+        @ mode is read from the stack before the push moves sp
-+        ldr         r12, [sp]
-+        push        {r4-r11, lr}
-+        @ Table bases are biased so that mode indexes them directly
-+        ADRT        r4, angle_2 - 2
-+        ADRT        r7, inv_angle - 11*2
-+        add         r7, r7, r12, lsl #1
-+        @ stride is doubled here (presumably pel units -> bytes; confirm)
-+        lsl         r3, #1
-+        @ r6 = initial fraction, r4 = per-row increment; both start as the
-+        @ same table byte
-+        ldrsb       r6, [r4, r12]
-+        cmp         r12, #26
-+        ldrsb       r4, [r4, r12]
-+        bge         26f
-+        cmp         r12, #18
-+        bge         18f
-+        cmp         r12, #10
-+        bge         10f
-+
-+@ Down of Horizontal - works down left
-+        @ r10 counts 4 bands of 4 rows; r1 keeps a copy of the left pointer
-+        @ because the patch helpers advance r2
-+        mov         r10, #4
-+        mov         r1, r2
-+1:
-+        @ Four 4x4 patches side by side make one 4-row band
-+        bl          patch_h_down_c_4x4_8
-+        bl          patch_h_down_c_4x4_8_continue
-+        bl          patch_h_down_c_4x4_8_continue
-+        bl          patch_h_down_c_4x4_8_continue
-+
-+        add         r2, r1, #4*2         @ restore r2, but 4 rows further down left
-+        add         r1, r1, #4*2
-+        @ Restart the fraction for the next band
-+        mov         r6, r4
-+        @ Undo the 32 bytes of horizontal advance, then step down 4 rows
-+        sub         r0, #32
-+        subs        r10, #1
-+        add         r0, r0, r3, lsl #2
-+        bne         1b
-+
-+        pop         {r4-r11, pc}
-+
-+@ Up of Horizontal - works down up
-+10:
-+        @ r7 = inverse-angle table entry for this mode
-+        ldrh        r7, [r7]
-+        mov         r10, #-128
-+        @ d6 doubles as the band counter: each byte is shifted right once per
-+        @ band and the loop ends when the value read back (before the shift)
-+        @ is zero, i.e. after 4 bands
-+        vmov.i8     d6, #1<<2
-+1:
-+        @ r2 and r10 are preserved across the helper calls
-+        push        {r2, r10}
-+        bl          patch_h_up_c_4x4_8
-+        bl          patch_h_up_c_4x4_8_continue
-+        bl          patch_h_up_c_4x4_8_continue
-+        bl          patch_h_up_c_4x4_8_continue
-+        pop         {r2, r10}
-+
-+        @ s12 is the low word of d6 (the band counter)
-+        vmov        r8, s12
-+        sub         r0, #32
-+        add         r2, #8
-+        add         r0, r0, r3, lsl #2
-+        sub         r10, r10, r7, lsl #2
-+        vshr.u8     d6, #1
-+        teq         r8, #0
-+        bne         1b
-+
-+        pop         {r4-r11, pc}
-+
-+@ Left of vertical - works down left
-+18:
-+        @ q0-q1 = 32 bytes from top; r9 = address 2 bytes before left
-+        vld1.8      {q0-q1}, [r1]
-+        sub         r9, r2, #2
-+        rsb         r12, r6, #32
-+        ldrh        r7, [r7]
-+        mov         r8, #-128
-+        @ d18 = fraction, d19 = 32 - fraction (per-byte weights)
-+        vdup.8      d18, r6
-+        vdup.8      d19, r12
-+        mov         r5, #16
-+1:
-+        @ Shift step: keep the old row in q2/q3 and slide q0/q1 along by two
-+        @ bytes, feeding in the halfword loaded from [r9]; then compute the
-+        @ next fetch address r9 = r2 + (r8 >> 8) * 2
-+        vld1.16     {d17[3]}, [r9]
-+        add         r8, r7
-+        vmov        q2, q0
-+        vmov        q3, q1
-+        asr         r9, r8, #8
-+        vext.8      q1, q0, q1, #14
-+        add         r9, r2, r9, lsl #1
-+        vext.8      q0, q8, q0, #14
-+2:
-+        @ Row step: out = (old * (32 - frac) + shifted * frac + 16) >> 5.
-+        @ subs sets C (no borrow) / clears C (borrow); teq #0 then sets Z
-+        @ for the row counter, and bhi/bne below read both flags.
-+        vmull.u8    q10, d4, d19
-+        subs        r12, r4
-+        vmlal.u8    q10, d0, d18
-+        it          cc
-+        addcc       r12, #32
-+        vmull.u8    q11, d5, d19
-+        rsb         r6, r12, #32
-+        vmlal.u8    q11, d1, d18
-+        sub         r5, #1
-+        vmull.u8    q12, d6, d19
-+        teq         r5, #0
-+        vmlal.u8    q12, d2, d18
-+        vmull.u8    q13, d7, d19
-+        vmlal.u8    q13, d3, d18
-+        vdup.8      d18, r6
-+        vdup.8      d19, r12
-+        vrshrn.u16  d20, q10, #5
-+        vrshrn.u16  d21, q11, #5
-+        vrshrn.u16  d22, q12, #5
-+        vrshrn.u16  d23, q13, #5
-+        vst1.8      {q10-q11}, [r0], r3
-+        @ bhi: rows remain and no borrow -> same source rows again
-+        @ bne: rows remain and borrow    -> do a shift step first
-+        bhi         2b
-+        bne         1b
-+
-+        pop         {r4-r11, pc}
-+
-+@ Right of vertical - works along top - left unused
-+26:
-+        @ q0-q1 = first 32 bytes of top (r1 post-incremented);
-+        @ d16[0] = the halfword at top + 32
-+        add         r5, r1, #32
-+        vld1.8      {q0-q1}, [r1]!
-+        rsb         r12, r6, #32
-+        vld1.16     {d16[0]}, [r5]
-+        mov         r5, #16
-+        vdup.8      d18, r6
-+        vdup.8      d19, r12
-+1:
-+        @ Shift step: keep the old row in q2/q3 and slide q0/q1 along by two
-+        @ bytes, pulling the next halfword in from q8
-+        vmov        q2, q0
-+        add         r1, #2
-+        vmov        q3, q1
-+        vext.8      q0, q0, q1, #2
-+        vext.8      q1, q1, q8, #2
-+2:
-+        @ Row step: same weighting and flag protocol as the 18: path above
-+        vmull.u8    q10, d0, d18
-+        subs        r12, r4
-+        vmlal.u8    q10, d4, d19
-+        it          cc
-+        addcc       r12, #32
-+        vmull.u8    q11, d1, d18
-+        rsb         r6, r12, #32
-+        vmlal.u8    q11, d5, d19
-+        sub         r5, #1
-+        vmull.u8    q12, d2, d18
-+        teq         r5, #0
-+        vmlal.u8    q12, d6, d19
-+        vmull.u8    q13, d3, d18
-+        vmlal.u8    q13, d7, d19
-+        vld1.16     {d16[0]}, [r1]
-+        vdup.8      d18, r6
-+        vdup.8      d19, r12
-+        vrshrn.u16  d20, q10, #5
-+        vrshrn.u16  d21, q11, #5
-+        vrshrn.u16  d22, q12, #5
-+        vrshrn.u16  d23, q13, #5
-+        vst1.8      {q10-q11}, [r0], r3
-+        bhi         2b
-+        bne         1b
-+
-+        pop         {r4-r11, pc}
-+
-+endfunc
-+
-+@------------------------------------------------------------------------------
-+@ Data
-+
-+        .text
-+        .balign  64
-+@ Per-mode angle bytes: 33 entries.  The prediction functions address this
-+@ table as (angle_2 - 2) + mode, so entry 0 belongs to mode 2 and the last
-+@ entry to mode 34.
-+angle_2:
-+        .byte    32
-+        .byte    26,  21,  17,  13,   9,   5,   2,   0
-+        @ Sign inverted from standards table
-+        .byte     2,   5,   9,  13,  17,  21,  26,  32
-+        .byte    26,  21,  17,  13,   9,   5,   2,   0
-+        @ Standard sign
-+        .byte     2,   5,   9,  13,  17,  21,  26,  32
-+
-+        .balign   2
-+
-+        @ Sign inverted from standards table
-+        @ 15 halfword entries, addressed as (inv_angle - 11*2) + mode*2, so
-+        @ entry 0 belongs to mode 11 and the last entry to mode 25.
-+        @ NOTE(review): the "up of horizontal" paths also load through this
-+        @ pointer for mode 10, which reads the halfword just before the table;
-+        @ mode 10's angle byte is 0 - confirm that loaded value is never used.
-+inv_angle:
-+        .short   4096, 1638,  910,  630,  482,  390,  315
-+        .short    256
-+        .short    315,  390,  482,  630,  910, 1638, 4096
-+
-+@------------------------------------------------------------------------------
-+@
-+@ 10 bit fns
-+@ Should work for 9 & 11 bit as there is no actual bit-depth specific code
-+@ but runs out of register width for 12+ bit
-+
-+        .text
-+        .balign 64
-+
-+@ Builds one 4x4 patch of 16-bit pels from the left column and stores it
-+@ transposed.  Three entry points share this code by falling through:
-+@   patch_h_down_4x4_10           loads the first source vectors, then...
-+@   patch_h_down_4x4_10_continue  produces 4 column vectors, then...
-+@   store_tran_4x4_10             transposes d16-d19 and writes 4 rows.
-+@ Reads:  r0 out ptr, r2 left ptr, r3 stride (bytes), r4 angle step,
-+@         r6 fraction (first entry only)
-+@ Writes: r0 (+8), r2 (advanced), r5-r9, r12, d0-d4, q8-q9
-+patch_h_down_4x4_10:
-+        ldrd        r8, r9, [r2]        @ Left
-+        rsb         r12, r6, #32
-+        @ d0 = left[0..3]; d3 = fraction, d2 = 32 - fraction
-+        vmov        d0, r8, r9
-+        vdup.16     d3, r6
-+        lsr         r8, #16
-+        vdup.16     d2, r12
-+        orr         r8, r8, r9, lsl #16
-+        @ Pre-indexed with writeback: r2 += 6, then load left[3..4]
-+        ldr         r9, [r2, #6]!
-+        @ d1 = left[1..4]
-+        vmov        d1, r8, r9
-+        // drop through...
-+patch_h_down_4x4_10_continue:
-+        mov         r5, #4
-+1:
-+        @ Each pass computes one vector d4 = d0 * d2 + d1 * d3, rounds it by
-+        @ >> 5 into d19 and shifts the previous results d17->d16, d18->d17,
-+        @ d19->d18.  When subs goes negative (mi) the fraction wraps by +32
-+        @ and the source window advances by one pel; the flags from subs stay
-+        @ live across all the mi-conditional instructions below.
-+          subs        r12, r4
-+        vmul.u16    d4, d0, d2
-+          it          mi
-+          addmi       r12, #32
-+        vmla.u16    d4, d1, d3
-+          rsb         r6, r12, #32
-+        vext.16     q8, q8, q9, #4
-+          it          mi
-+          lsrmi       r7, r8, #16
-+        vmov        d18, d19
-+          it          mi
-+          vmovmi      d0, r8, r9
-+          vdup.16     d2, r12
-+          it          mi
-+          orrmi       r8, r7, r9, lsl #16
-+        vrshr.u16   d19, d4, #5
-+          itt         mi
-+          ldrmi       r9, [r2, #2]!
-+          vmovmi      d1, r8, r9
-+        subs        r5, #1
-+          vdup.16     d3, r6
-+        bne         1b
-+        // drop through...
-+store_tran_4x4_10:
-+        @ The three zips transpose the 4x4 block held in d16-d19, which is
-+        @ then stored as rows 0..3 at r0, r0+r3, r0+2*r3 and r0+3*r3.
-+        @ r3 is doubled temporarily and restored before returning; r0 is
-+        @ advanced by 8 bytes by the post-incrementing store.
-+        vzip.16     d16, d17
-+        add         r6, r0, r3
-+        vzip.16     d18, d19
-+        lsl         r3, #1
-+        vzip.32     q8, q9
-+        add         r5, r0, r3
-+        vst1.16     {d16}, [r0]!
-+        vst1.16     {d17}, [r6], r3
-+        vst1.16     {d18}, [r5]
-+        asr         r3, #1
-+        vst1.16     {d19}, [r6]
-+
-+        bx          lr
-+
-+@ Builds one 4x4 patch of 16-bit pels for the "up of horizontal" modes and
-+@ finishes by branching to store_tran_4x4_10.
-+@ Reads:  r0 out ptr, r1 top ptr, r2 left ptr, r3 stride, r4 angle step,
-+@         r7 inverse angle, r10 inverse-angle accumulator
-+@ Writes: r2, r5, r6, r8-r12, d0-d4, q8-q9 (plus whatever the store writes)
-+@ patch_h_up_4x4_10_continue re-enters the loop with the state left by the
-+@ previous patch.
-+patch_h_up_4x4_10:
-+        ldrd        r8, r9, [r2]
-+        rsb         r6, r4, #32
-+        @ d0 = left[0..3]; d3 = angle step, d2 = 32 - angle step
-+        vmov        d0, r8, r9
-+        vdup.16     d3, r4
-+        lsr         r11, r8, #16
-+        vdup.16     d2, r6
-+        @ Pre-indexed with writeback: r2 -= 2, then load left[-1..0]
-+        ldr         r8, [r2, #-2]!
-+        orr         r9, r11, r9, lsl #16
-+        @ d1 = left[-1..2]
-+        vmov        d1, r8, r9
-+        mov         r12, r4
-+        @ First weighted sum is computed ahead of the loop
-+        vmul.u16    d4, d0, d2
-+        vmla.u16    d4, d1, d3
-+patch_h_up_4x4_10_continue:
-+        mov         r5, #4
-+1:
-+        @ r12 is the running fraction.  When it reaches 33 or more (cs) it is
-+        @ reduced by 32, r10 advances by r7 and the source window moves on by
-+        @ one pel.  The new pel comes from the top row at byte offset
-+        @ (r10 >> 7) & ~1 when r10 is non-negative (eq after the tst),
-+        @ otherwise from the left column with r2 pre-decremented by 2 (hi).
-+        @ NOTE(review): the cs tests after tstcs rely on tst with #1<<31
-+        @ leaving C set - confirm against the architecture manual.
-+          add         r12, r4
-+          cmp         r12, #33
-+          it          cs
-+          addcs       r10, r7
-+          mov         r11, #0
-+          itt         cs
-+          subcs       r12, #32
-+          tstcs       r10, #1<<31
-+          rsb         r6, r12, #32
-+          it          eq
-+          asreq       r11, r10, #7
-+          it          cs
-+          vmovcs      d0, r8, r9
-+          it          eq
-+          biceq       r11, #1
-+          vdup.16     d2, r6
-+          it          cs
-+          lsrcs       r6, r8, #16
-+          vdup.16     d3, r12
-+        vext.16     q8, q8, q9, #4
-+          itt         cs
-+          orrcs       r9, r6, r9, lsl #16
-+          ldrhcs      r11, [r1, r11]
-+        vmov        d18, d19
-+          it          hi
-+          ldrhhi      r11, [r2, #-2]!
-+        @ Round the previously computed sum into d19 (newest vector)
-+        vrshr.u16   d19, d4, #5
-+          itt         cs
-+          orrcs       r8, r11, r8, lsl #16
-+          vmovcs      d1, r8, r9
-+          vmul.u16    d4, d0, d2
-+        subs        r5, #1
-+          vmla.u16    d4, d1, d3
-+        bne         1b
-+
-+        b           store_tran_4x4_10
-+
-+
-+@ ff_hevc_rpi_pred_angular_4_neon_10
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride        [r3]
-+@       unsigned int mode       [sp, #0]  2..34
-+@
-+@ Writes a 4x4 block of 16-bit pels.  Dispatch on mode: 2..9 fall through,
-+@ 10..17 -> 10f, 18..25 -> 18f, 26..34 -> 26f.
-+
-+function ff_hevc_rpi_pred_angular_4_neon_10, export=1
-+        @ mode is read from the stack before the push moves sp
-+        ldr         r12, [sp]
-+        push        {r4-r11, lr}
-+        @ Table bases are biased so that mode indexes them directly
-+        ADRT        r4, angle_2 - 2
-+        ADRT        r7, inv_angle - 11*2
-+        add         r7, r7, r12, lsl #1
-+        @ stride is doubled here (presumably pel units -> bytes; confirm)
-+        lsl         r3, #1
-+        @ r6 = initial fraction, r4 = per-row increment (same table byte)
-+        ldrsb       r6, [r4, r12]
-+        cmp         r12, #26
-+        ldrsb       r4, [r4, r12]
-+        bge         26f
-+        cmp         r12, #18
-+        bge         18f
-+        cmp         r12, #10
-+        bge         10f
-+
-+@ Down of Horizontal - works down left
-+        @ A single 4x4 patch covers the whole block
-+        bl          patch_h_down_4x4_10
-+        pop         {r4-r11, pc}
-+
-+@ Up of Horizontal - works down up
-+10:
-+        @ r7 = inverse-angle entry, r10 = accumulator start value
-+        ldrh        r7, [r7]
-+        mov         r10, #-128
-+        bl          patch_h_up_4x4_10
-+        pop         {r4-r11, pc}
-+
-+@ Left of vertical - works down left
-+18:
-+        ldrd        r8, r9, [r1]        @ Top
-+        rsb         r12, r6, #32
-+        ldrh        lr, [r2, #-2]       @ Top-left
-+        ldrh        r7, [r7]
-+        @ d0 = top[0..3]; d1 = { top-left, top[0..2] }; d2 = 32 - fraction
-+        vmov        d0, r8, r9
-+        lsl         r9, r9, #16
-+        vdup.16     d2, r12
-+        orr         r9, r9, r8, lsr #16
-+        orr         r8, lr, r8, lsl #16
-+        vmov        d1, r8, r9
-+        @ r1 is reused as the inverse-angle accumulator (top is now in regs)
-+        sub         r1, r7, #128
-+        @ 3 rows in the loop, the 4th is emitted after it
-+        mov         r5, #3
-+1:
-+        sel         lr, lr, lr          @ force pipeline 0 on Cortex-A53
-+        vdup.16     d3, r6
-+        vmul.u16    d4, d0, d2
-+          subs        r12, r12, r4
-+        vmla.u16    d4, d1, d3
-+        @ On wrap (mi): fetch the next pel from left at byte offset
-+        @ (r1 >> 7) & ~1, shift it into the front of the window and advance
-+        @ the accumulator by r7
-+          itttt       mi
-+          addmi       lr, r2, r1, asr #7
-+          bicmi       lr, #1
-+          addmi       r12, r12, #32
-+          vmovmi      d0, r8, r9
-+          rsb         r6, r12, #32
-+          itt         mi
-+          lslmi       r9, r9, #16
-+          ldrhmi      lr, [lr]
-+          vdup.16     d2, r12
-+        vrshr.u16   d4, d4, #5
-+          itttt       mi
-+          orrmi       r9, r9, r8, lsr #16
-+          orrmi       r8, lr, r8, lsl #16
-+          vmovmi      d1, r8, r9
-+          addmi       r1, r1, r7
-+        subs        r5, r5, #1
-+        vst1.16     {d4}, [r0], r3
-+        bne         1b
-+
-+          vdup.16     d3, r6
-+          nop                           @ force next insn into pipeline 0 to enable
-+          vmul.u16    d4, d0, d2        @ vmla to execute back-to-back on Cortex-A53
-+          vmla.u16    d4, d1, d3
-+          vrshr.u16   d4, d4, #5
-+          vst1.16     {d4}, [r0]
-+
-+        pop         {r4-r11, pc}
-+
-+@ Right of vertical - works along top - left unused
-+26:
-+        ldrd        r8, r9, [r1]        @ Top
-+        rsb         r12, r6, #32
-+        @ d0 = top[0..3]; d1 = top[1..4]; d3 = fraction, d2 = 32 - fraction
-+        vmov        d0, r8, r9
-+        vdup.16     d3, r6
-+        lsr         r8, #16
-+        vdup.16     d2, r12
-+        orr         r8, r8, r9, lsl #16
-+        @ Pre-indexed with writeback: r1 += 6, then load top[3..4]
-+        ldr         r9, [r1, #6]!
-+        vmov        d1, r8, r9
-+        @ 3 rows in the loop, the 4th is emitted after it
-+        mov         r5, #3
-+1:
-+        vmul.u16    d4, d0, d2
-+          subs        r12, r4
-+        vmla.u16    d4, d1, d3
-+        @ On wrap (mi): fraction += 32 and the window slides one pel along top
-+          it          mi
-+          addmi       r12, #32
-+          rsb         r6, r12, #32
-+          itt         mi
-+          vmovmi      d0, r8, r9
-+          lsrmi       r8, #16
-+          vdup.16     d2, r12
-+          itt         mi
-+          orrmi       r8, r8, r9, lsl #16
-+          ldrmi       r9, [r1, #2]!
-+        vrshr.u16   d4, d4, #5
-+          it          mi
-+          vmovmi      d1, r8, r9
-+          vdup.16     d3, r6
-+        subs        r5, #1
-+        vst1.16     {d4}, [r0], r3
-+        bne         1b
-+
-+          vmul.u16    d4, d0, d2
-+          vmla.u16    d4, d1, d3
-+          vrshr.u16   d4, d4, #5
-+          vst1.16     {d4}, [r0]
-+
-+        pop         {r4-r11, pc}
-+
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_angular_8_neon_10
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride        [r3]
-+@       unsigned int mode       [sp, #0]  2..34
-+@
-+@ Writes an 8x8 block of 16-bit pels.  Dispatch on mode: 2..9 fall through,
-+@ 10..17 -> 10f, 18..25 -> 18f, 26..34 -> 26f.
-+
-+function ff_hevc_rpi_pred_angular_8_neon_10, export=1
-+        @ mode is read from the stack before the push moves sp
-+        ldr         r12, [sp]
-+        push        {r4-r11, lr}
-+        @ Table bases are biased so that mode indexes them directly
-+        ADRT        r4, angle_2 - 2
-+        ADRT        r7, inv_angle - 11*2
-+        add         r7, r7, r12, lsl #1
-+        @ stride is doubled here (presumably pel units -> bytes; confirm)
-+        lsl         r3, #1
-+        @ r6 = initial fraction, r4 = per-row increment (same table byte)
-+        ldrsb       r6, [r4, r12]
-+        cmp         r12, #26
-+        ldrsb       r4, [r4, r12]
-+        bge         26f
-+        cmp         r12, #18
-+        bge         18f
-+        cmp         r12, #10
-+        bge         10f
-+
-+@ Down of Horizontal - works down left
-+        mov         r1,  r2             @ save r2 - r1 unused by patch_down
-+
-+        @ Upper band: two 4x4 patches side by side
-+        bl          patch_h_down_4x4_10
-+        bl          patch_h_down_4x4_10_continue
-+
-+        add         r2, r1, #4*2        @ restore r2, but 4 rows further down left
-+        @ Undo the 16 bytes of horizontal advance, restart the fraction and
-+        @ step down 4 rows
-+        sub         r0, #16
-+        mov         r6, r4
-+        add         r0, r0, r3, lsl #2
-+
-+        @ Lower band
-+        bl          patch_h_down_4x4_10
-+        bl          patch_h_down_4x4_10_continue
-+
-+        pop         {r4-r11, pc}
-+
-+@ Up of Horizontal - works down up
-+10:
-+        @ r7 = inverse-angle entry, r10 = accumulator start value
-+        ldrh        r7, [r7]
-+        mov         r10, #-128
-+
-+        @ Upper band; r2 is preserved across the helper calls
-+        push        {r2}
-+        bl          patch_h_up_4x4_10
-+        bl          patch_h_up_4x4_10_continue
-+        pop         {r2}
-+
-+        @ Lower band: left pointer 4 pels further on, accumulator restarted
-+        @ at -128 - 4 * r7
-+        sub         r0, #16
-+        mov         r10, #-128
-+        add         r2, #8
-+        add         r0, r0, r3, lsl #2
-+        sub         r10, r10, r7, lsl #2
-+
-+        bl          patch_h_up_4x4_10
-+        bl          patch_h_up_4x4_10_continue
-+
-+        pop         {r4-r11, pc}
-+
-+@ Left of vertical - works down left
-+18:
-+        @ q9 = top[0..7]; q8 = the same row moved along by one pel with the
-+        @ pel at left - 2 bytes inserted at the front
-+        vld1.16     {q9}, [r1]
-+        sub         r1, r2, #2
-+        rsb         r12, r6, #32
-+        ldrh        r7, [r7]
-+        vdup.16     q2, r6
-+        vext.16     q8, q9, q9, #7
-+        sub         r8, r7, #128
-+        vld1.16     {d16[0]}, [r1]
-+        vdup.16     q3, r12
-+        @ 7 rows in the loops, the 8th is emitted at 3: or 5:
-+        mov         r5, #7
-+        @ Loops 1: and 2: are the same step with the register pairs
-+        @ (q8,q9) and (q10,q11) swapping roles, so no copy is needed when the
-+        @ window moves.  Each pass prepares the other pair and switches to it
-+        @ only when subs borrowed (C clear).
-+1:
-+        vmul.u16    q0, q9, q3
-+        subs        r12, r4
-+        vmla.u16    q0, q8, q2
-+        @ On borrow: next fetch address r1 = r2 + (r8 >> 8) * 2
-+        ittt        cc
-+        asrcc       r1, r8, #8
-+        addcc       r12, #32
-+        addcc       r1, r2, r1, lsl #1
-+        vext.16     q10, q8, q8, #7
-+        rsb         r6, r12, #32
-+        vmov        q11, q8
-+        sub         r5, #1
-+        vrshr.u16   q0, q0, #5
-+        it          cc
-+        addcc       r8, r7
-+        vld1.16     {d20[0]}, [r1]
-+        teq         r5, #0
-+        vdup.16     q2, r6
-+        vdup.16     q3, r12
-+        vst1.16     {q0}, [r0], r3
-+        @ bhi: rows remain, no borrow -> stay on (q8,q9)
-+        @ beq: loop rows done -> pick the final row at 4:
-+        bhi         1b
-+        beq         4f
-+2:
-+        vmul.u16    q0, q11, q3
-+        subs        r12, r4
-+        vmla.u16    q0, q10, q2
-+        ittt        cc
-+        asrcc       r1, r8, #8
-+        addcc       r12, #32
-+        addcc       r1, r2, r1, lsl #1
-+        vext.16     q8, q10, q10, #7
-+        rsb         r6, r12, #32
-+        vmov        q9, q10
-+        sub         r5, #1
-+        vrshr.u16   q0, q0, #5
-+        it          cc
-+        addcc       r8, r7
-+        vld1.16     {d16[0]}, [r1]
-+        teq         r5, #0
-+        vdup.16     q2, r6
-+        vdup.16     q3, r12
-+        vst1.16     {q0}, [r0], r3
-+        @ bhi: stay on (q10,q11); bne: borrow -> back to (q8,q9);
-+        @ otherwise rows are done: bcc picks the (q8,q9) final row at 5:
-+        bhi         2b
-+        bne         1b
-+        bcc         5f
-+3:
-+        @ Final row from (q10,q11)
-+        vmul.u16    q0, q11, q3
-+        vmla.u16    q0, q10, q2
-+        vrshr.u16   q0, q0, #5
-+        vst1.16     {q0}, [r0]
-+
-+        pop         {r4-r11, pc}
-+4:
-+        bcc         3b
-+5:
-+        @ Final row from (q8,q9)
-+        vmul.u16    q0, q9, q3
-+        vmla.u16    q0, q8, q2
-+        vrshr.u16   q0, q0, #5
-+        vst1.16     {q0}, [r0]
-+
-+        pop         {r4-r11, pc}
-+
-+@ Right of vertical - works along top - left unused
-+26:
-+        @ q9 = top[0..7] (r1 post-incremented); q8 = top[1..8]
-+        vld1.16     {q9}, [r1]!
-+        rsb         r12, r6, #32
-+        vdup.16     q2, r6
-+        vdup.16     q3, r12
-+        vext.16     q8, q9, q9, #1
-+        vld1.16     {d17[3]}, [r1]!
-+        @ 7 rows in the loops, the 8th is emitted at 3: or 5:
-+        mov         r5, #7
-+        @ Same ping-pong scheme as the 18: path, sliding along top instead
-+1:
-+        vmul.u16    q0, q8, q2
-+        subs        r12, r4
-+        vmla.u16    q0, q9, q3
-+        it          cc
-+        addcc       r12, #32
-+        vext.16     q10, q8, q8, #1
-+        rsb         r6, r12, #32
-+        vld1.16     {d21[3]}, [r1]
-+        sub         r5, #1
-+        vmov        q11, q8
-+        teq         r5, #0
-+        vrshr.u16   q0, q0, #5
-+        @ The top pointer only advances when the window actually moved
-+        it          cc
-+        addcc       r1, #2
-+        vdup.16     q2, r6
-+        vdup.16     q3, r12
-+        vst1.16     {q0}, [r0], r3
-+        bhi         1b
-+        beq         4f
-+2:
-+        vmul.u16    q0, q10, q2
-+        subs        r12, r4
-+        vmla.u16    q0, q11, q3
-+        it          cc
-+        addcc       r12, #32
-+        vext.16     q8, q10, q10, #1
-+        rsb         r6, r12, #32
-+        vld1.16     {d17[3]}, [r1]
-+        sub         r5, #1
-+        vmov        q9, q10
-+        teq         r5, #0
-+        vrshr.u16   q0, q0, #5
-+        it          cc
-+        addcc       r1, #2
-+        vdup.16     q2, r6
-+        vdup.16     q3, r12
-+        vst1.16     {q0}, [r0], r3
-+        bhi         2b
-+        bne         1b
-+        bcc         5f
-+3:
-+        @ Final row from (q10,q11)
-+        vmul.u16    q0, q10, q2
-+        vmla.u16    q0, q11, q3
-+        vrshr.u16   q0, q0, #5
-+        vst1.16     {q0}, [r0]
-+
-+        pop         {r4-r11, pc}
-+4:
-+        bcc         3b
-+5:
-+        @ Final row from (q8,q9)
-+        vmul.u16    q0, q8, q2
-+        vmla.u16    q0, q9, q3
-+        vrshr.u16   q0, q0, #5
-+        vst1.16     {q0}, [r0]
-+
-+        pop         {r4-r11, pc}
-+
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_angular_16_neon_10
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride        [r3]
-+@       unsigned int mode       [sp, #0]  2..34
-+@
-+@ Writes a 16x16 block of 16-bit pels.  Dispatch on mode: 2..9 fall through,
-+@ 10..17 -> 10f, 18..25 -> 18f, 26..34 -> 26f.
-+
-+function ff_hevc_rpi_pred_angular_16_neon_10, export=1
-+        @ mode is read from the stack before the push moves sp
-+        ldr         r12, [sp]
-+        push        {r4-r11, lr}
-+        @ Table bases are biased so that mode indexes them directly
-+        ADRT        r4, angle_2 - 2
-+        ADRT        r7, inv_angle - 11*2
-+        add         r7, r7, r12, lsl #1
-+        @ stride is doubled here (presumably pel units -> bytes; confirm)
-+        lsl         r3, #1
-+        @ r6 = initial fraction, r4 = per-row increment (same table byte)
-+        ldrsb       r6, [r4, r12]
-+        cmp         r12, #26
-+        ldrsb       r4, [r4, r12]
-+        bge         26f
-+        cmp         r12, #18
-+        bge         18f
-+        cmp         r12, #10
-+        bge         10f
-+
-+@ Down of Horizontal - works down left
-+        @ r10 counts 4 bands of 4 rows; r1 keeps a copy of the left pointer
-+        mov         r10, #4
-+        mov         r1, r2
-+1:
-+        @ Four 4x4 patches side by side make one 4-row band
-+        bl          patch_h_down_4x4_10
-+        bl          patch_h_down_4x4_10_continue
-+        bl          patch_h_down_4x4_10_continue
-+        bl          patch_h_down_4x4_10_continue
-+
-+        add         r2, r1, #4*2         @ restore r2, but 4 rows further down left
-+        add         r1, r1, #4*2
-+        @ Restart the fraction for the next band
-+        mov         r6, r4
-+        @ Undo the 32 bytes of horizontal advance, then step down 4 rows
-+        sub         r0, #32
-+        subs        r10, #1
-+        add         r0, r0, r3, lsl #2
-+        bne         1b
-+
-+        pop         {r4-r11, pc}
-+
-+@ Up of Horizontal - works down up
-+10:
-+        @ r7 = inverse-angle entry, r10 = accumulator start value
-+        ldrh        r7, [r7]
-+        mov         r10, #-128
-+        @ d6 doubles as the band counter (the 10-bit patch helpers do not
-+        @ write it): shifted right once per band, loop ends when the value
-+        @ read back before the shift is zero, i.e. after 4 bands
-+        vmov.i8     d6, #1<<2
-+1:
-+        @ r2 and r10 are preserved across the helper calls
-+        push        {r2, r10}
-+        bl          patch_h_up_4x4_10
-+        bl          patch_h_up_4x4_10_continue
-+        bl          patch_h_up_4x4_10_continue
-+        bl          patch_h_up_4x4_10_continue
-+        pop         {r2, r10}
-+
-+        @ s12 is the low word of d6 (the band counter)
-+        vmov        r8, s12
-+        sub         r0, #32
-+        add         r2, #8
-+        add         r0, r0, r3, lsl #2
-+        @ Accumulator for the next band starts 4 * r7 lower
-+        sub         r10, r10, r7, lsl #2
-+        vshr.u8     d6, #1
-+        teq         r8, #0
-+        bne         1b
-+
-+        pop         {r4-r11, pc}
-+
-+@ Left of vertical - works down left
-+18:
-+        @ q0-q1 = top[0..15]; r9 = address 2 bytes before left
-+        vld1.16     {q0-q1}, [r1]
-+        sub         r9, r2, #2
-+        rsb         r12, r6, #32
-+        ldrh        r7, [r7]
-+        mov         r8, #-128
-+        @ q9 = fraction, q10 = 32 - fraction
-+        vdup.16     q9, r6
-+        vdup.16     q10, r12
-+        mov         r5, #16
-+1:
-+        @ Shift step: keep the old row in q2/q3 and slide q0/q1 along by one
-+        @ pel, feeding in the pel loaded from [r9]; then compute the next
-+        @ fetch address r9 = r2 + (r8 >> 8) * 2
-+        vld1.16     {d17[3]}, [r9]
-+        add         r8, r7
-+        vmov        q2, q0
-+        vmov        q3, q1
-+        asr         r9, r8, #8
-+        vext.16     q1, q0, q1, #7
-+        add         r9, r2, r9, lsl #1
-+        vext.16     q0, q8, q0, #7
-+2:
-+        @ Row step: out = (old * (32 - frac) + shifted * frac + 16) >> 5.
-+        @ subs leaves C, teq #0 leaves Z; bhi/bne below read both.
-+        vmul.u16    q11, q2, q10
-+        subs        r12, r4
-+        vmla.u16    q11, q0, q9
-+        it          cc
-+        addcc       r12, #32
-+        vmul.u16    q12, q3, q10
-+        rsb         r6, r12, #32
-+        vmla.u16    q12, q1, q9
-+        sub         r5, #1
-+        teq         r5, #0
-+        vdup.16     q9, r6
-+        vdup.16     q10, r12
-+        vrshr.u16   q11, q11, #5
-+        vrshr.u16   q12, q12, #5
-+        vst1.16     {q11-q12}, [r0], r3
-+        @ bhi: rows remain and no borrow -> same source rows again
-+        @ bne: rows remain and borrow    -> do a shift step first
-+        bhi         2b
-+        bne         1b
-+
-+        pop         {r4-r11, pc}
-+
-+@ Right of vertical - works along top - left unused
-+26:
-+        @ q0-q1 = top[0..15] (r1 post-incremented); d16[0] = top[16]
-+        add         r5, r1, #32
-+        vld1.16     {q0-q1}, [r1]!
-+        rsb         r12, r6, #32
-+        vld1.16     {d16[0]}, [r5]
-+        mov         r5, #16
-+        vdup.16     q9, r6
-+        vdup.16     q10, r12
-+1:
-+        @ Shift step: keep the old row in q2/q3 and slide q0/q1 along by one
-+        @ pel, pulling the next pel in from q8
-+        vmov        q2, q0
-+        add         r1, #2
-+        vmov        q3, q1
-+        vext.16     q0, q0, q1, #1
-+        vext.16     q1, q1, q8, #1
-+2:
-+        @ Row step: same weighting and flag protocol as the 18: path above
-+        vmul.u16    q11, q0, q9
-+        subs        r12, r4
-+        vmla.u16    q11, q2, q10
-+        it          cc
-+        addcc       r12, #32
-+        vmul.u16    q12, q1, q9
-+        rsb         r6, r12, #32
-+        vmla.u16    q12, q3, q10
-+        sub         r5, #1
-+        vld1.16     {d16[0]}, [r1]
-+        teq         r5, #0
-+        vdup.16     q9, r6
-+        vdup.16     q10, r12
-+        vrshr.u16   q11, q11, #5
-+        vrshr.u16   q12, q12, #5
-+        vst1.16     {q11-q12}, [r0], r3
-+        bhi         2b
-+        bne         1b
-+
-+        pop         {r4-r11, pc}
-+
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_angular_32_neon_10
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride        [r3]
-+@       unsigned int mode       [sp, #0]  2..34
-+@
-+@ Writes a 32x32 block of 16-bit pels.  Dispatch on mode: 2..9 fall through,
-+@ 10..17 -> 10f, 18..25 -> 18f, 26..34 -> 26f.
-+@ The 18: and 26: paths use q4 (d8/d9): d8 is saved on the stack by the
-+@ vpush below and d9 is parked in a spare D register for the duration.
-+
-+function ff_hevc_rpi_pred_angular_32_neon_10, export=1
-+        @ mode is read from the stack before the push moves sp
-+        ldr         r12, [sp]
-+        push        {r4-r11, lr}
-+        @ Table bases are biased so that mode indexes them directly
-+        ADRT        r4, angle_2 - 2
-+        ADRT        r7, inv_angle - 11*2
-+        add         r7, r7, r12, lsl #1
-+        @ stride is doubled here (presumably pel units -> bytes; confirm)
-+        lsl         r3, #1
-+        vpush       {d8}
-+        @ r6 = initial fraction, r4 = per-row increment (same table byte)
-+        ldrsb       r6, [r4, r12]
-+        cmp         r12, #26
-+        ldrsb       r4, [r4, r12]
-+        bge         26f
-+        cmp         r12, #18
-+        bge         18f
-+        cmp         r12, #10
-+        bge         10f
-+
-+@ Down of Horizontal - works down left
-+        @ Drop the saved d8 without reloading it: this path only calls the
-+        @ 4x4 patch helpers, which do not write d8
-+        add         sp, #8
-+        @ r10 counts 8 bands of 4 rows; r1 keeps a copy of the left pointer
-+        mov         r10, #8
-+        mov         r1, r2
-+1:
-+        @ Eight 4x4 patches side by side make one 4-row band
-+        bl          patch_h_down_4x4_10
-+        bl          patch_h_down_4x4_10_continue
-+        bl          patch_h_down_4x4_10_continue
-+        bl          patch_h_down_4x4_10_continue
-+        bl          patch_h_down_4x4_10_continue
-+        bl          patch_h_down_4x4_10_continue
-+        bl          patch_h_down_4x4_10_continue
-+        bl          patch_h_down_4x4_10_continue
-+
-+        add         r2, r1, #4*2         @ restore r2, but 4 rows further down left
-+        add         r1, r1, #4*2
-+        @ Restart the fraction for the next band
-+        mov         r6, r4
-+        @ Undo the 64 bytes of horizontal advance, then step down 4 rows
-+        sub         r0, #64
-+        subs        r10, #1
-+        add         r0, r0, r3, lsl #2
-+        bne         1b
-+
-+        pop         {r4-r11, pc}
-+
-+@ Up of Horizontal - works down up
-+10:
-+        @ Drop the saved d8 (not written on this path either)
-+        add         sp, #8
-+        ldrh        r7, [r7]
-+        mov         r10, #-128
-+        @ d6 doubles as the band counter: starting from 1<<6 and shifted
-+        @ right once per band, the value read back before the shift reaches
-+        @ zero after 8 bands
-+        vmov.i8     d6, #1<<6
-+1:
-+        @ r2 and r10 are preserved across the helper calls
-+        push        {r2, r10}
-+        bl          patch_h_up_4x4_10
-+        bl          patch_h_up_4x4_10_continue
-+        bl          patch_h_up_4x4_10_continue
-+        bl          patch_h_up_4x4_10_continue
-+        bl          patch_h_up_4x4_10_continue
-+        bl          patch_h_up_4x4_10_continue
-+        bl          patch_h_up_4x4_10_continue
-+        bl          patch_h_up_4x4_10_continue
-+        pop         {r2, r10}
-+
-+        @ s12 is the low word of d6 (the band counter)
-+        vmov        r8, s12
-+        sub         r0, #64
-+        add         r2, #8
-+        add         r0, r0, r3, lsl #2
-+        @ Accumulator for the next band starts 4 * r7 lower
-+        sub         r10, r10, r7, lsl #2
-+        vshr.u8     d6, #1
-+        teq         r8, #0
-+        bne         1b
-+
-+        pop         {r4-r11, pc}
-+
-+@ Left of vertical - works down left
-+18:
-+        @ q1-q4 = top[0..31].  Both weights are packed into r12:
-+        @ low half = fraction, high half = 32 - fraction.  r4 becomes the
-+        @ packed per-row delta (low half += step, high half -= step).
-+        add         r5, r1, #32
-+        vld1.16     {q1-q2}, [r1]
-+        rsb         r12, r6, r6, lsl #16
-+        vld1.16     {q3-q4}, [r5]
-+        sub         r9, r2, #2
-+        rsb         r4, r12, #0
-+        rsb         r12, r12, #32 << 16
-+        ldrh        r7, [r7]
-+        mov         r8, #-128
-+        @ Park d9 in d0 (restored before return); s2 (low word of d1) holds
-+        @ the packed weights, so d1[0] = fraction and d1[1] = 32 - fraction
-+        vmov        d0, d9
-+        vmov        s2, r12
-+        @ r10 = output pointer for the right-hand half of each row
-+        add         r10, r0, #32
-+        mov         r5, #32
-+1:
-+        @ Shift step: keep the old row in q8-q11 and slide q1-q4 along by one
-+        @ pel, feeding in the pel loaded into d1[3] from [r9]; then compute
-+        @ the next fetch address r9 = r2 + (r8 >> 8) * 2
-+        vld1.16     {d1[3]}, [r9]
-+        add         r8, r7
-+        vmov        q11, q4
-+        vmov        q10, q3
-+        asr         r9, r8, #8
-+        vmov        q9, q2
-+        add         r9, r2, r9, lsl #1
-+        vmov        q8, q1
-+        vext.16     q4, q3, q4, #7
-+        vext.16     q3, q2, q3, #7
-+        vext.16     q2, q1, q2, #7
-+        vext.16     q1, q0, q1, #7
-+2:
-+        @ Row step: out = (old * d1[1] + shifted * d1[0] + 16) >> 5.
-+        @ adds updates both packed weights at once; when it does not carry
-+        @ (cc) the halves are corrected by +32 / -32.  C from adds and Z from
-+        @ teq drive bhi/bne below.
-+        vmul.u16    q12, q8, d1[1]
-+        adds        r12, r4
-+        vmla.u16    q12, q1, d1[0]
-+        it          cc
-+        addcc       r12, #32 << 16
-+        vmul.u16    q13, q9, d1[1]
-+        it          cc
-+        subcc       r12, #32
-+        vmla.u16    q13, q2, d1[0]
-+        sub         r5, #1
-+        vmul.u16    q14, q10, d1[1]
-+        teq         r5, #0
-+        vmla.u16    q14, q3, d1[0]
-+        vmul.u16    q15, q11, d1[1]
-+        vmla.u16    q15, q4, d1[0]
-+        vmov        s2, r12
-+        vrshr.u16   q12, q12, #5
-+        vrshr.u16   q13, q13, #5
-+        vrshr.u16   q14, q14, #5
-+        vrshr.u16   q15, q15, #5
-+        vst1.16     {q12-q13}, [r0], r3
-+        vst1.16     {q14-q15}, [r10], r3
-+        bhi         2b
-+        bne         1b
-+
-+        @ Restore the callee-saved halves of q4
-+        vpop        {d8}
-+        vmov        d9, d0
-+        pop         {r4-r11, pc}
-+
-+@ Right of vertical - works along top - left unused
-+26:
-+        @ q1-q4 = top[0..31]; r1 then points at top[32].  Weights are packed
-+        @ into r12 / r4 exactly as in the 18: path.
-+        add         r5, r1, #32
-+        vld1.16     {q1-q2}, [r1]
-+        rsb         r12, r6, r6, lsl #16
-+        vld1.16     {q3-q4}, [r5]
-+        add         r1, r1, #64
-+        rsb         r4, r12, #0
-+        rsb         r12, r12, #32 << 16
-+        @ Park d9 in d1 (restored before return); s1 (high word of d0) holds
-+        @ the packed weights, so d0[2] = fraction and d0[3] = 32 - fraction
-+        vmov        d1, d9
-+        vmov        s1, r12
-+        @ r10 = output pointer for the right-hand half of each row
-+        add         r10, r0, #32
-+        mov         r5, #32
-+1:
-+        @ Shift step: keep the old row in q8-q11 and slide q1-q4 along by one
-+        @ pel, pulling the next top pel in through d0[0]
-+        vld1.16     {d0[0]}, [r1]!
-+        vmov        q8, q1
-+        vmov        q9, q2
-+        vmov        q10, q3
-+        vmov        q11, q4
-+        vext.16     q1, q1, q2, #1
-+        vext.16     q2, q2, q3, #1
-+        vext.16     q3, q3, q4, #1
-+        vext.16     q4, q4, q0, #1
-+2:
-+        @ Row step: out = (shifted * d0[2] + old * d0[3] + 16) >> 5, with the
-+        @ same packed-weight update and flag protocol as the 18: path
-+        vmul.u16    q12, q1, d0[2]
-+        adds        r12, r4
-+        vmla.u16    q12, q8, d0[3]
-+        it          cc
-+        addcc       r12, #32 << 16
-+        vmul.u16    q13, q2, d0[2]
-+        it          cc
-+        subcc       r12, #32
-+        vmla.u16    q13, q9, d0[3]
-+        sub         r5, #1
-+        vmul.u16    q14, q3, d0[2]
-+        teq         r5, #0
-+        vmla.u16    q14, q10, d0[3]
-+        vmul.u16    q15, q4, d0[2]
-+        vmla.u16    q15, q11, d0[3]
-+        vmov        s1, r12
-+        vrshr.u16   q12, q12, #5
-+        vrshr.u16   q13, q13, #5
-+        vrshr.u16   q14, q14, #5
-+        vrshr.u16   q15, q15, #5
-+        vst1.16     {q12-q13}, [r0], r3
-+        vst1.16     {q14-q15}, [r10], r3
-+        bhi         2b
-+        bne         1b
-+
-+        @ Restore the callee-saved halves of q4
-+        vpop        {d8}
-+        vmov        d9, d1
-+        pop         {r4-r11, pc}
-+
-+endfunc
-+
-+
-+
-+@ Generate 4x4 chroma patch
-+@
-+@ In (const)
-+@ r1   Up ptr (_up only)
-+@ r3   Out stride
-+@ r4   Angle add
-+@ r7   Inv angle (_up only)
-+@
-+@ In/Out (updated)
-+@ r0   Out pointer - on exit point to start of next patch horizontally (i.e. r0 + patch width)
-+@ r2   Left ptr - updated
-+@ r6   Angle frac (init to r4 + 32)
-+@ r8   Inv angle accumulator
-+@ q2   Cur Line - load before 1st call for down - set by _up
-+@ q8   Cur Line - load before 1st call for up   - set by _down
-+@
-+@ Temps
-+@ r5   Loop counter
-+@ r12
-+@ d0, q1, q12-q15
-+
-+@ Three labels share this code:
-+@   patch_h_down_c_4x4_10           loads the source and sets up weights
-+@   patch_h_down_c_4x4_10_continue  re-enters the row loop for the next
-+@                                   patch (expects r5 = 4, which the store
-+@                                   code below leaves behind)
-+@   store_tran_c_4x4_10             writes q8-q11 out as 4 rows of 16 bytes
-+patch_h_down_c_4x4_10:
-+        @ q12 = 16 bytes from left (r2 post-incremented)
-+        vld1.16     {q12}, [r2]!
-+        rsb         r12, r6, #32
-+        @ q2 = fraction, q3 = 32 - fraction
-+        vdup.16     q2, r6
-+        vdup.16     q3, r12
-+        mov         r5, #4
-+1:
-+        @ Shift step: keep the old vector in q13, rotate q12 by two halfwords
-+        @ and overwrite its last 32 bits with the next word from left
-+        vmov        q13, q12
-+        vext.16     q12, q12, q12, #2
-+        vld1.32     {d25[1]}, [r2]!
-+patch_h_down_c_4x4_10_continue:
-+2:
-+        @ Row step: q0 = q13 * (32 - frac) + q12 * frac, rounded >> 5 into
-+        @ q11 while earlier results move q9->q8, q10->q9, q11->q10.
-+        @ subs leaves C, teq #0 leaves Z; bhi/bne below read both.
-+        vmov        q8, q9
-+        subs        r12, r4
-+        vmul.u16    q0, q13, q3
-+        it          cc
-+        addcc       r12, #32
-+        vmla.u16    q0, q12, q2
-+        rsb         r6, r12, #32
-+        vmov        q9, q10
-+        sub         r5, #1
-+        vmov        q10, q11
-+        teq         r5, #0
-+        vdup.16     q2, r6
-+        vdup.16     q3, r12
-+        vrshr.u16   q11, q0, #5
-+        bhi         2b
-+        bne         1b
-+
-+        @ Patch complete.  If the last subs borrowed (C clear), do the
-+        @ pending shift step now so a following _continue call starts from
-+        @ the right source vectors.
-+        bcs         3f
-+        vmov        q13, q12
-+        vext.16     q12, q12, q12, #2
-+        vld1.32     {d25[1]}, [r2]!
-+3:
-+
-+store_tran_c_4x4_10:
-+        @ Lines prefixed T / A are alternative sequences, presumably selected
-+        @ by macros for Thumb / ARM builds (defined outside this section).
-+        @ Rows go to r0, r0+r3, r0+2*r3, r0+3*r3; r0 advances by 16 bytes,
-+        @ r3 is doubled temporarily and restored, and r5 is reset to 4.
-+T       add         r6, r0, r3
-+        vzip.32     q8, q10
-+A       add         r6, r0, r3
-+T       lsl         r3, #1
-+        vzip.32     q9, q11
-+A       add         r5, r0, r3, lsl #1
-+T       add         r5, r0, r3
-+        vst2.32     {d16,d18}, [r0]!
-+A       lsl         r3, #1
-+        vst2.32     {d17,d19}, [r6], r3
-+        asr         r3, #1
-+        vst2.32     {d20,d22}, [r5]
-+        mov         r5, #4
-+        vst2.32     {d21,d23}, [r6]
-+        bx          lr
-+
-+@ "Up" counterpart of patch_h_down_c_4x4_10: same row loop, but each new
-+@ source word is chosen by the sign of the accumulator r8 after adding r7:
-+@   negative     -> next word from the left column, r2 pre-decremented by 4
-+@   non-negative -> word at r1 + (r8 >> 8) * 4
-+@ Finishes by branching to store_tran_c_4x4_10.
-+@ patch_h_up_c_4x4_10_continue re-enters the row loop for the next patch.
-+patch_h_up_c_4x4_10:
-+        @ q1 = 16 bytes from left
-+        vld1.16     {q1}, [r2]
-+        rsb         r12, r6, #32
-+        @ q2 = fraction, q3 = 32 - fraction
-+        vdup.16     q2, r6
-+        vdup.16     q3, r12
-+        mov         r5, #4
-+1:
-+        @ Shift step: keep the old vector in q12, rotate q1 by two halfwords
-+        @ and overwrite its first 32 bits (s4) with the newly fetched word
-+        adds        r8, r7
-+        vmov        q12, q1
-+        it          mi
-+        ldrmi       r6, [r2, #-4]!
-+        vext.16     q1, q1, q1, #6
-+        itt         pl
-+        asrpl       r6, r8, #8
-+        ldrpl       r6, [r1, r6, lsl #2]
-+        vmov        s4, r6
-+patch_h_up_c_4x4_10_continue:
-+2:
-+        @ Row step: q0 = q12 * (32 - frac) + q1 * frac, rounded >> 5 into
-+        @ q11 while earlier results move q9->q8, q10->q9, q11->q10.
-+        @ subs leaves C, teq #0 leaves Z; bhi/bne below read both.
-+        vmov        q8, q9
-+        subs        r12, r4
-+        vmul.u16    q0, q12, q3
-+        it          cc
-+        addcc       r12, #32
-+        vmla.u16    q0, q1, q2
-+        rsb         r6, r12, #32
-+        vmov        q9, q10
-+        sub         r5, #1
-+        vmov        q10, q11
-+        teq         r5, #0
-+        vdup.16     q2, r6
-+        vdup.16     q3, r12
-+        vrshr.u16   q11, q0, #5
-+        bhi         2b
-+        bne         1b
-+
-+        @ Patch complete.  If the last subs borrowed (C clear), do the
-+        @ pending shift step before storing so a following _continue call
-+        @ starts from the right source vectors.
-+        bcs         store_tran_c_4x4_10
-+        adds        r8, r7
-+        vmov        q12, q1
-+        it          mi
-+        ldrmi       r6, [r2, #-4]!
-+        vext.16     q1, q1, q1, #6
-+        itt         pl
-+        asrpl       r6, r8, #8
-+        ldrpl       r6, [r1, r6, lsl #2]
-+        vmov        s4, r6
-+        b           store_tran_c_4x4_10
-+
-+
-+@ ff_hevc_rpi_pred_angular_c_4_neon_10
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride        [r3]
-+@       unsigned int mode       [sp, #0]  2..34
-+@
-+@ Angular prediction of a 4x4 block whose elements are 32 bits wide
-+@ (processed as pairs of u16 lanes).  The mode selects one of four code
-+@ paths: 2..9, 10..17, 18..25 and 26..34.
-+
-+function ff_hevc_rpi_pred_angular_c_4_neon_10, export=1
-+        @ r12 = mode (read before the push moves sp)
-+        ldr         r12, [sp]
-+        push        {r4-r8, lr}
-+        @ r4 -> table indexed directly by mode (base biased by the first mode, 2)
-+        @ r7 -> 16-bit inv_angle entry for this mode (table biased by 11)
-+        ADRT        r4, angle_2 - 2
-+        ADRT        r7, inv_angle - 11*2
-+        add         r7, r7, r12, lsl #1
-+        @ scale stride by the 4-byte element size
-+        lsl         r3, #2
-+        @ r6 = r4 = signed angle for this mode: r6 is the initial weight,
-+        @ r4 the per-row weight step
-+        ldrsb       r6, [r4, r12]
-+        cmp         r12, #26
-+        ldrsb       r4, [r4, r12]
-+        bge         26f
-+        cmp         r12, #18
-+        bge         18f
-+        cmp         r12, #10
-+        bge         10f
-+
-+@ Down of Horizontal - works down left
-+        @ one 4x4 patch is the whole block
-+        bl          patch_h_down_c_4x4_10
-+        pop         {r4-r8, pc}
-+
-+@ Up of Horizontal - works down up
-+10:
-+        @ r7 = inv_angle, r8 = -128 - inv_angle: the first add of r7 inside
-+        @ the patch routine brings the index accumulator to -128
-+        ldrh        r7, [r7]
-+        mov         r8, #-128
-+        sub         r8, r7
-+        bl          patch_h_up_c_4x4_10
-+        pop         {r4-r8, pc}
-+
-+@ Left of vertical - works down left
-+18:
-+        @ q9 = top[0..3]; q8 = the same row moved up one element with the
-+        @ element just before left[0] (at r2 - 4) inserted in lane 0.
-+        @ q2 / q3 = weights r6 and 32 - r6.  r8 = inv_angle - 128 is the 8.8
-+        @ fixed-point index of the next left element to pull in.
-+        vld1.16     {q9}, [r1]
-+        sub         r1, r2, #4
-+        rsb         r12, r6, #32
-+        ldrh        r7, [r7]
-+        vdup.16     q2, r6
-+        vext.16     q8, q9, q9, #6
-+        sub         r8, r7, #128
-+        vld1.32     {d16[0]}, [r1]
-+        vdup.16     q3, r12
-+        @ three rows are stored inside the loops, the fourth at 3: or 5:
-+        mov         r5, #3
-+        @ The loop body exists twice with the roles of (q8,q9) and (q10,q11)
-+        @ exchanged so that advancing the reference needs no register moves:
-+        @ loop 1 consumes q8/q9 and prepares q10/q11, loop 2 the reverse.
-+1:
-+        vmul.u16    q0, q9, q3
-+        subs        r12, r4
-+        vmla.u16    q0, q8, q2
-+        @ on borrow: wrap the weight and point r1 at left[r8 >> 8]
-+        ittt        cc
-+        asrcc       r1, r8, #8
-+        addcc       r12, #32
-+        addcc       r1, r2, r1, lsl #2
-+        vext.16     q10, q8, q8, #6
-+        rsb         r6, r12, #32
-+        vmov        q11, q8
-+        sub         r5, #1
-+        vrshr.u16   q0, q0, #5
-+        @ flags are still those of the subs above
-+        it          cc
-+        addcc       r8, r7
-+        vld1.32     {d20[0]}, [r1]
-+        @ teq with #0 sets Z from r5 and leaves C alone
-+        teq         r5, #0
-+        vdup.16     q2, r6
-+        vdup.16     q3, r12
-+        vst1.16     {q0}, [r0], r3
-+        @ hi: rows left, no borrow -> repeat with the same reference
-+        @ eq: row count exhausted  -> final row via 4:
-+        @ otherwise (borrowed)     -> fall into loop 2 with the advanced ref
-+        bhi         1b
-+        beq         4f
-+2:
-+        vmul.u16    q0, q11, q3
-+        subs        r12, r4
-+        vmla.u16    q0, q10, q2
-+        ittt        cc
-+        asrcc       r1, r8, #8
-+        addcc       r12, #32
-+        addcc       r1, r2, r1, lsl #2
-+        vext.16     q8, q10, q10, #6
-+        rsb         r6, r12, #32
-+        vmov        q9, q10
-+        sub         r5, #1
-+        vrshr.u16   q0, q0, #5
-+        it          cc
-+        addcc       r8, r7
-+        vld1.32     {d16[0]}, [r1]
-+        teq         r5, #0
-+        vdup.16     q2, r6
-+        vdup.16     q3, r12
-+        vst1.16     {q0}, [r0], r3
-+        bhi         2b
-+        bne         1b
-+        @ count exhausted: borrow on the last step means the final row uses
-+        @ the advanced pair (q8,q9), otherwise the current pair (q10,q11)
-+        bcc         5f
-+3:
-+        @ final row from (q10,q11)
-+        vmul.u16    q0, q11, q3
-+        vmla.u16    q0, q10, q2
-+        vrshr.u16   q0, q0, #5
-+        vst1.16     {q0}, [r0]
-+
-+        pop         {r4-r8, pc}
-+4:
-+        @ left loop 1 with the count exhausted: borrow -> (q10,q11) at 3:
-+        bcc         3b
-+5:
-+        @ final row from (q8,q9)
-+        vmul.u16    q0, q9, q3
-+        vmla.u16    q0, q8, q2
-+        vrshr.u16   q0, q0, #5
-+        vst1.16     {q0}, [r0]
-+
-+        pop         {r4-r8, pc}
-+
-+@ Right of vertical - works along top - left unused
-+26:
-+        @ q9 = top[0..3]; q8 = top[1..4] (rotate down one element and load
-+        @ the next top element into the high lane).  r1 is left pointing at
-+        @ the next unread top element.
-+        vld1.16     {q9}, [r1]!
-+        rsb         r12, r6, #32
-+        vdup.16     q2, r6
-+        vdup.16     q3, r12
-+        vext.16     q8, q9, q9, #2
-+        vld1.32     {d17[1]}, [r1]!
-+        @ three rows are stored inside the loops, the fourth at 3: or 5:
-+        mov         r5, #3
-+        @ Same double-loop structure as the 18: path above
-+1:
-+        vmul.u16    q0, q8, q2
-+        subs        r12, r4
-+        vmla.u16    q0, q9, q3
-+        it          cc
-+        addcc       r12, #32
-+        vext.16     q10, q8, q8, #2
-+        rsb         r6, r12, #32
-+        @ speculatively build the advanced reference; r1 only moves on if
-+        @ the weight actually wrapped (addcc below)
-+        vld1.32     {d21[1]}, [r1]
-+        sub         r5, #1
-+        vmov        q11, q8
-+        teq         r5, #0
-+        vrshr.u16   q0, q0, #5
-+        it          cc
-+        addcc       r1, #4
-+        vdup.16     q2, r6
-+        vdup.16     q3, r12
-+        vst1.16     {q0}, [r0], r3
-+        bhi         1b
-+        beq         4f
-+2:
-+        vmul.u16    q0, q10, q2
-+        subs        r12, r4
-+        vmla.u16    q0, q11, q3
-+        it          cc
-+        addcc       r12, #32
-+        vext.16     q8, q10, q10, #2
-+        rsb         r6, r12, #32
-+        vld1.32     {d17[1]}, [r1]
-+        sub         r5, #1
-+        vmov        q9, q10
-+        teq         r5, #0
-+        vrshr.u16   q0, q0, #5
-+        it          cc
-+        addcc       r1, #4
-+        vdup.16     q2, r6
-+        vdup.16     q3, r12
-+        vst1.16     {q0}, [r0], r3
-+        bhi         2b
-+        bne         1b
-+        bcc         5f
-+3:
-+        @ final row from (q10,q11)
-+        vmul.u16    q0, q10, q2
-+        vmla.u16    q0, q11, q3
-+        vrshr.u16   q0, q0, #5
-+        vst1.16     {q0}, [r0]
-+
-+        pop         {r4-r8, pc}
-+4:
-+        bcc         3b
-+5:
-+        @ final row from (q8,q9)
-+        vmul.u16    q0, q8, q2
-+        vmla.u16    q0, q9, q3
-+        vrshr.u16   q0, q0, #5
-+        vst1.16     {q0}, [r0]
-+
-+        pop         {r4-r8, pc}
-+
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_angular_c_8_neon_10
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride        [r3]
-+@       unsigned int mode       [sp, #0]  2..34
-+@
-+@ Angular prediction of an 8x8 block whose elements are 32 bits wide
-+@ (processed as pairs of u16 lanes).  The mode selects one of four code
-+@ paths: 2..9, 10..17, 18..25 and 26..34.
-+@ Only r4-r8 and lr are saved, so no path may write r9-r11.
-+
-+function ff_hevc_rpi_pred_angular_c_8_neon_10, export=1
-+        @ r12 = mode (read before the push moves sp)
-+        ldr         r12, [sp]
-+        push        {r4-r8, lr}
-+        @ r4 -> table indexed directly by mode (base biased by the first mode, 2)
-+        @ r7 -> 16-bit inv_angle entry for this mode (table biased by 11)
-+        ADRT        r4, angle_2 - 2
-+        ADRT        r7, inv_angle - 11*2
-+        add         r7, r7, r12, lsl #1
-+        @ scale stride by the 4-byte element size
-+        lsl         r3, #2
-+        @ r6 = r4 = signed angle for this mode: r6 is the initial weight,
-+        @ r4 the per-step weight change
-+        ldrsb       r6, [r4, r12]
-+        cmp         r12, #26
-+        ldrsb       r4, [r4, r12]
-+        bge         26f
-+        cmp         r12, #18
-+        bge         18f
-+        cmp         r12, #10
-+        bge         10f
-+
-+@ Down of Horizontal - works down left
-+        mov         r1,  r2             @ save r2 - r1 unused by patch_down
-+
-+        @ upper half: two 4x4 patches side by side (r0 advances 16 bytes each)
-+        bl          patch_h_down_c_4x4_10
-+        bl          patch_h_down_c_4x4_10_continue
-+
-+        add         r2, r1, #4*4        @ restore r2, but 4 rows further down left
-+        @ back to the left edge (2 patches * 16 bytes) and 4 rows down;
-+        @ r6 is reset to the initial weight
-+        sub         r0, #32
-+        mov         r6, r4
-+        add         r0, r0, r3, lsl #2
-+
-+        @ lower half
-+        bl          patch_h_down_c_4x4_10
-+        bl          patch_h_down_c_4x4_10_continue
-+
-+        pop         {r4-r8, pc}
-+
-+@ Up of Horizontal - works down up
-+10:
-+        @ r7 = inv_angle, r8 = -128 - inv_angle: the first add of r7 inside
-+        @ the patch routine brings the index accumulator to -128
-+        ldrh        r7, [r7]
-+        mov         r8, #-128
-+        sub         r8, r7
-+
-+        @ the patch routine modifies r2 and r8; keep the originals for the
-+        @ lower half
-+        push        {r2, r8}
-+        bl          patch_h_up_c_4x4_10
-+        bl          patch_h_up_c_4x4_10_continue
-+        pop         {r2, r8}
-+
-+        @ lower half: back to the left edge and 4 rows down, left pointer
-+        @ 4 elements further on, index accumulator moved back by 4 steps
-+        sub         r0, #32
-+        mov         r6, r4
-+        add         r2, #16
-+        sub         r8, r8, r7, lsl #2
-+        add         r0, r0, r3, lsl #2
-+
-+        bl          patch_h_up_c_4x4_10
-+        bl          patch_h_up_c_4x4_10_continue
-+
-+        pop         {r4-r8, pc}
-+
-+@ Left of vertical - works down left
-+18:
-+        @ q0-q1 = top[0..7].  r1 is dead once the top row has been loaded, so
-+        @ it is reused as the pointer to the next left element to pull in.
-+        @ (This path previously used r9 for that, which is callee-saved but
-+        @ is not in the push / pop lists of this function.)
-+        vld1.16     {q0-q1}, [r1]
-+        sub         r1, r2, #4
-+        rsb         r12, r6, #32
-+        ldrh        r7, [r7]
-+        @ r8 = 8.8 fixed-point index into left, stepped by inv_angle (r7)
-+        mov         r8, #-128
-+        vdup.16     q9, r6
-+        vdup.16     q10, r12
-+        @ r5 = rows remaining
-+        mov         r5, #8
-+1:
-+        @ Advance the reference: keep the old row in q2-q3, fetch the next
-+        @ left element into the top lane of q8 and shift it into lane 0 of
-+        @ the q0-q1 row.  r1 is then pointed at left[r8 >> 8] for next time.
-+        vld1.32     {d17[1]}, [r1]
-+        add         r8, r7
-+        vmov        q2, q0
-+        vmov        q3, q1
-+        asr         r1, r8, #8
-+        vext.16     q1, q0, q1, #6
-+        add         r1, r2, r1, lsl #2
-+        vext.16     q0, q8, q0, #6
-+2:
-+        @ row = (old * q10 + new * q9 + 16) >> 5, weights stepped by r4 with
-+        @ a +32 wrap on borrow
-+        vmul.u16    q11, q2, q10
-+        subs        r12, r4
-+        vmla.u16    q11, q0, q9
-+        it          cc
-+        addcc       r12, #32
-+        vmul.u16    q12, q3, q10
-+        rsb         r6, r12, #32
-+        vmla.u16    q12, q1, q9
-+        sub         r5, #1
-+        @ teq with #0 sets Z from r5 while leaving C from the subs above
-+        teq         r5, #0
-+        vdup.16     q9, r6
-+        vdup.16     q10, r12
-+        vrshr.u16   q11, q11, #5
-+        vrshr.u16   q12, q12, #5
-+        vst1.16     {q11-q12}, [r0], r3
-+        @ hi: rows left, no borrow -> same reference; ne: borrowed -> advance
-+        bhi         2b
-+        bne         1b
-+
-+        pop         {r4-r8, pc}
-+
-+@ Right of vertical - works along top - left unused
-+26:
-+        @ q0-q1 = top[0..7]; lane 0 of q8 = top[8], the next element to be
-+        @ shifted in.  r1 then tracks the element to load after that.
-+        add         r5, r1, #32
-+        vld1.16     {q0-q1}, [r1]!
-+        rsb         r12, r6, #32
-+        vld1.32     {d16[0]}, [r5]
-+        @ r5 = rows remaining
-+        mov         r5, #8
-+        vdup.16     q9, r6
-+        vdup.16     q10, r12
-+1:
-+        @ Advance the reference by one element: old row kept in q2-q3
-+        vmov        q2, q0
-+        add         r1, #4
-+        vmov        q3, q1
-+        vext.16     q0, q0, q1, #2
-+        vext.16     q1, q1, q8, #2
-+2:
-+        vmul.u16    q11, q0, q9
-+        subs        r12, r4
-+        vmla.u16    q11, q2, q10
-+        it          cc
-+        addcc       r12, #32
-+        vmul.u16    q12, q1, q9
-+        rsb         r6, r12, #32
-+        vmla.u16    q12, q3, q10
-+        sub         r5, #1
-+        @ reload the pending element every row; r1 only moves at label 1
-+        vld1.32     {d16[0]}, [r1]
-+        teq         r5, #0
-+        vdup.16     q9, r6
-+        vdup.16     q10, r12
-+        vrshr.u16   q11, q11, #5
-+        vrshr.u16   q12, q12, #5
-+        vst1.16     {q11-q12}, [r0], r3
-+        bhi         2b
-+        bne         1b
-+
-+        pop         {r4-r8, pc}
-+
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_angular_c_16_neon_10
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride        [r3]
-+@       unsigned int mode       [sp, #0]  2..34
-+@
-+@ Angular prediction of a 16x16 block whose elements are 32 bits wide
-+@ (processed as pairs of u16 lanes).  The mode selects one of four code
-+@ paths: 2..9, 10..17, 18..25 and 26..34.
-+
-+function ff_hevc_rpi_pred_angular_c_16_neon_10, export=1
-+        @ r12 = mode (read before the push moves sp)
-+        ldr         r12, [sp]
-+        push        {r4-r10, lr}
-+        @ r4 -> table indexed directly by mode (base biased by the first mode, 2)
-+        @ r7 -> 16-bit inv_angle entry for this mode (table biased by 11)
-+        ADRT        r4, angle_2 - 2
-+        ADRT        r7, inv_angle - 11*2
-+        add         r7, r7, r12, lsl #1
-+        @ scale stride by the 4-byte element size
-+        lsl         r3, #2
-+        @ d8 is saved for the 18: and 26: paths, which use q4 (d8 / d9);
-+        @ those paths preserve d9 by parking it in a spare D register
-+        vpush       {d8}
-+        ldrsb       r6, [r4, r12]
-+        cmp         r12, #26
-+        ldrsb       r4, [r4, r12]
-+        bge         26f
-+        cmp         r12, #18
-+        bge         18f
-+        cmp         r12, #10
-+        bge         10f
-+
-+@ Down of Horizontal - works down left
-+        @ this path never writes d8: drop the saved copy from the stack so
-+        @ the plain pop below finds r4-r10
-+        add         sp, #8
-+        @ r10 = number of 4-row bands
-+        mov         r10, #4
-+        mov         r1, r2
-+1:
-+        @ one band = four 4x4 patches side by side (r0 advances 16 bytes each)
-+        bl          patch_h_down_c_4x4_10
-+        bl          patch_h_down_c_4x4_10_continue
-+        bl          patch_h_down_c_4x4_10_continue
-+        bl          patch_h_down_c_4x4_10_continue
-+
-+        add         r2, r1, #4*4         @ restore r2, but 4 rows further down left
-+        add         r1, r1, #4*4
-+        @ reset the weight, return to the left edge (4 * 16 bytes) and move
-+        @ 4 rows down
-+        mov         r6, r4
-+        sub         r0, #64
-+        subs        r10, #1
-+        add         r0, r0, r3, lsl #2
-+        bne         1b
-+
-+        pop         {r4-r10, pc}
-+
-+@ Up of Horizontal - works down up
-+10:
-+        @ d8 is not written on this path either: drop the saved copy
-+        add         sp, #8
-+        @ r10 = number of 4-row bands
-+        mov         r10, #4
-+        @ r7 = inv_angle, r8 = -128 - inv_angle
-+        ldrh        r7, [r7]
-+        mov         r8, #-128
-+        sub         r8, r7
-+2:
-+        @ the patch routine modifies r2 and r8; keep the band's start values
-+        push        {r2, r8}
-+        bl          patch_h_up_c_4x4_10
-+        bl          patch_h_up_c_4x4_10_continue
-+        bl          patch_h_up_c_4x4_10_continue
-+        bl          patch_h_up_c_4x4_10_continue
-+        pop         {r2, r8}
-+
-+        @ next band: left edge, 4 rows down, left pointer 4 elements on,
-+        @ index accumulator moved back by 4 steps
-+        sub         r0, #64
-+        mov         r6, r4
-+        add         r2, #16
-+        sub         r8, r8, r7, lsl #2
-+        add         r0, r0, r3, lsl #2
-+        subs        r10, #1
-+        bne         2b
-+
-+        pop         {r4-r10, pc}
-+
-+@ Left of vertical - works down left
-+18:
-+        @ q1-q4 = top[0..15].  Both weights live in one register:
-+        @ r12 = ((32 - angle) << 16) | angle, copied to s2 so that d1[0] and
-+        @ d1[1] can be used as the two multiplier scalars.  r4 = -(r12 before
-+        @ the 32 << 16 bias) is the per-row delta that steps both halves with
-+        @ a single adds.
-+        add         r5, r1, #32
-+        vld1.16     {q1-q2}, [r1]
-+        rsb         r12, r6, r6, lsl #16
-+        vld1.16     {q3-q4}, [r5]
-+        sub         r9, r2, #4
-+        rsb         r4, r12, #0
-+        rsb         r12, r12, #32 << 16
-+        ldrh        r7, [r7]
-+        mov         r8, #-128
-+        @ park callee-saved d9 in d0 (restored before returning)
-+        vmov        d0, d9
-+        vmov        s2, r12
-+        @ r10 = destination of the second 32 bytes of each row
-+        add         r10, r0, #32
-+        @ r5 = rows remaining
-+        mov         r5, #16
-+1:
-+        @ Advance the reference: the next left element goes into the top
-+        @ lane of q0 (d1[1] as a 32-bit lane), the old row is copied to
-+        @ q8-q11 and q1-q4 are shifted up by one element.  r9 is then pointed
-+        @ at left[r8 >> 8] for the next advance.
-+        vld1.32     {d1[1]}, [r9]
-+        add         r8, r7
-+        vmov        q11, q4
-+        vmov        q10, q3
-+        asr         r9, r8, #8
-+        vmov        q9, q2
-+        add         r9, r2, r9, lsl #2
-+        vmov        q8, q1
-+        vext.16     q4, q3, q4, #6
-+        vext.16     q3, q2, q3, #6
-+        vext.16     q2, q1, q2, #6
-+        vext.16     q1, q0, q1, #6
-+2:
-+        @ row = (old * d1[1] + new * d1[0] + 16) >> 5 over 4 q registers
-+        vmul.u16    q12, q8, d1[1]
-+        adds        r12, r4
-+        vmla.u16    q12, q1, d1[0]
-+        @ no carry out of the packed add: wrap both halves of the weight pair
-+        it          cc
-+        addcc       r12, #32 << 16
-+        vmul.u16    q13, q9, d1[1]
-+        it          cc
-+        subcc       r12, #32
-+        vmla.u16    q13, q2, d1[0]
-+        sub         r5, #1
-+        vmul.u16    q14, q10, d1[1]
-+        @ teq with #0 sets Z from r5 while leaving C from the adds above
-+        teq         r5, #0
-+        vmla.u16    q14, q3, d1[0]
-+        vmul.u16    q15, q11, d1[1]
-+        vmla.u16    q15, q4, d1[0]
-+        @ publish the next row's weights only after all multiplies are issued
-+        vmov        s2, r12
-+        vrshr.u16   q12, q12, #5
-+        vrshr.u16   q13, q13, #5
-+        vrshr.u16   q14, q14, #5
-+        vrshr.u16   q15, q15, #5
-+        vst1.16     {q12-q13}, [r0], r3
-+        vst1.16     {q14-q15}, [r10], r3
-+        bhi         2b
-+        bne         1b
-+
-+        @ restore d8 from the stack and d9 from its parking place
-+        vpop        {d8}
-+        vmov        d9, d0
-+        pop         {r4-r10, pc}
-+
-+@ Right of vertical - works along top - left unused
-+26:
-+        @ q1-q4 = top[0..15], r1 -> top[16].  Weights are packed in r12 as on
-+        @ the 18: path but kept in s1, i.e. d0[2] / d0[3] as 16-bit scalars;
-+        @ lane 0 of d0 (32-bit) receives the next top element.
-+        add         r5, r1, #32
-+        vld1.16     {q1-q2}, [r1]
-+        rsb         r12, r6, r6, lsl #16
-+        vld1.16     {q3-q4}, [r5]
-+        add         r1, r1, #64
-+        rsb         r4, r12, #0
-+        rsb         r12, r12, #32 << 16
-+        @ park callee-saved d9 in d1 (restored before returning)
-+        vmov        d1, d9
-+        vmov        s1, r12
-+        @ r10 = destination of the second 32 bytes of each row
-+        add         r10, r0, #32
-+        @ r5 = rows remaining
-+        mov         r5, #16
-+1:
-+        @ Advance the reference: old row copied to q8-q11, q1-q4 shifted
-+        @ down by one element with the newly loaded element entering at the
-+        @ top of q4
-+        vld1.32     {d0[0]}, [r1]!
-+        vmov        q8, q1
-+        vmov        q9, q2
-+        vmov        q10, q3
-+        vmov        q11, q4
-+        vext.16     q1, q1, q2, #2
-+        vext.16     q2, q2, q3, #2
-+        vext.16     q3, q3, q4, #2
-+        vext.16     q4, q4, q0, #2
-+2:
-+        @ row = (new * d0[2] + old * d0[3] + 16) >> 5 over 4 q registers
-+        vmul.u16    q12, q1, d0[2]
-+        adds        r12, r4
-+        vmla.u16    q12, q8, d0[3]
-+        it          cc
-+        addcc       r12, #32 << 16
-+        vmul.u16    q13, q2, d0[2]
-+        it          cc
-+        subcc       r12, #32
-+        vmla.u16    q13, q9, d0[3]
-+        sub         r5, #1
-+        vmul.u16    q14, q3, d0[2]
-+        teq         r5, #0
-+        vmla.u16    q14, q10, d0[3]
-+        vmul.u16    q15, q4, d0[2]
-+        vmla.u16    q15, q11, d0[3]
-+        vmov        s1, r12
-+        vrshr.u16   q12, q12, #5
-+        vrshr.u16   q13, q13, #5
-+        vrshr.u16   q14, q14, #5
-+        vrshr.u16   q15, q15, #5
-+        vst1.16     {q12-q13}, [r0], r3
-+        vst1.16     {q14-q15}, [r10], r3
-+        bhi         2b
-+        bne         1b
-+
-+        @ restore d8 from the stack and d9 from its parking place
-+        vpop        {d8}
-+        vmov        d9, d1
-+        pop         {r4-r10, pc}
-+
-+endfunc
---- /dev/null
-+++ b/libavcodec/arm/rpi_hevcpred_intra_dc_neon.S
-@@ -0,0 +1,705 @@
-+/*
-+Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
-+All rights reserved.
-+
-+Redistribution and use in source and binary forms, with or without
-+modification, are permitted provided that the following conditions are met:
-+    * Redistributions of source code must retain the above copyright
-+      notice, this list of conditions and the following disclaimer.
-+    * Redistributions in binary form must reproduce the above copyright
-+      notice, this list of conditions and the following disclaimer in the
-+      documentation and/or other materials provided with the distribution.
-+    * Neither the name of the copyright holder nor the
-+      names of its contributors may be used to endorse or promote products
-+      derived from this software without specific prior written permission.
-+
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
-+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+
-+Authors: John Cox, Ben Avison
-+*/
-+
-+
-+#include "libavutil/arm/asm.S"
-+#include "neon.S"
-+
-+
-+@ ff_hevc_rpi_pred_dc_4_neon_8
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]
-+@
-+@ DC prediction of a 4x4 block of 8-bit samples: the block is filled with
-+@ the rounded mean of the 4 top and 4 left samples, and the first row and
-+@ first column are smoothed towards their neighbours.
-+
-+function ff_hevc_rpi_pred_dc_4_neon_8, export=1
-+
-+        @ Average the els of top & left
-+        @ d0 = top[0..3], left[0..3]; low half of d1 = left[0..3] again so
-+        @ that the widening add below forms top[i] + left[i] in d2
-+        ldr         r2, [r2]
-+        vld1.32     {d0[0]}, [r1]
-+        mov         r1, #2
-+        vmov        s1, r2
-+        vmov        s2, r2
-+        vmov.i16    q2, #3
-+        @ r2 = row 1, r3 = 2 * stride: rows are written alternately via
-+        @ r0 (rows 0, 2) and r2 (rows 1, 3)
-+        add         r2, r0, r3
-+        vaddl.u8    q1, d0, d1    @ d2[0] = top[0] + left[0]
-+        lsl         r3, #1
-+        vmovl.u8    q0, d0
-+        vmov.i64    d7, #0xffff
-+        vmov.16     d4[0], r1     @ 2, 3, 3, 3...
-+        vpadd.i16   d6, d2, d2    @ 2 (top & bottom of vector the same)
-+        vbit        d0, d2, d7    @ q0 = top[0]+left[0], top[1..3], left[0..3]
-+
-+        @ top line gets some smoothing
-+        @ (top[i] + 3*dc + 2) >> 2
-+        @ as does left
-+        @ top_line[0] is extra special
-+        @ (top[0] + left[0] + 2*dc + 2) >> 2
-+
-+        @ d7 becomes a mask selecting byte 0 only (used for the left column)
-+        vmov.i64    d7, #0xff
-+        vpadd.i16   d6, d6        @ 1 (all the same)
-+        @ dc = (sum of 8 samples + 4) >> 3
-+        vrshr.u16   d6, #3
-+        vmla.i16    q0, q2, d6[0]
-+        @ d6 = dc replicated into every byte
-+        vdup.8      d6, d6[0]
-+        @ d0 = smoothed top row (bytes 0-3) and smoothed left column (bytes 4-7)
-+        vrshrn.i16  d0, q0, #2
-+
-+        @ Store top line
-+        vst1.32     {d0[0]}, [r0], r3
-+
-+        @ Store the rest
-+        @ bring smoothed left[1], left[2], left[3] down to byte 0 and fill
-+        @ every other byte with dc (vbif keeps byte 0, inserts d6 elsewhere)
-+        vshr.u64    d1, d0, #5*8
-+        vshr.u64    d2, d0, #6*8
-+        vshr.u64    d3, d0, #7*8
-+        vbif        d1, d6, d7
-+        vbif        d2, d6, d7
-+        vst1.32     {d1[0]}, [r2], r3
-+        vbif        d3, d6, d7
-+        vst1.32     {d2[0]}, [r0]
-+        vst1.32     {d3[0]}, [r2]
-+
-+        bx          lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_dc_c_4_neon_8
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]
-+@
-+@ DC prediction of a 4x4 block of interleaved 8-bit U/V pairs: every pair
-+@ in the block is set to the per-component rounded mean of the 4 top and
-+@ 4 left pairs.  No edge smoothing is applied.
-+
-+function ff_hevc_rpi_pred_dc_c_4_neon_8, export=1
-+
-+        @ Average the els of top & left
-+        vld1.8      {d0}, [r1]
-+        vld1.8      {d1}, [r2]
-+        @ Both the A and T sequences leave r2 = r0 + 2 * r3 and r3 = 4 * r3
-+        @ (original r3), so rows are written alternately via r0 and r2.
-+        @ NOTE(review): A / T are presumably the ARM-only / Thumb-only line
-+        @ macros from libavutil/arm/asm.S.
-+A       add         r2, r0, r3, lsl #1
-+A       lsl         r3, #2
-+T       lsl         r3, #1
-+T       add         r2, r0, r3
-+T       lsl         r3, #1
-+        vaddl.u8    q0, d0, d1
-+        vadd.i16    d0, d1       @ d0 has 2 val pairs
-+        vpadd.i32   d2, d0, d0   @ This adds U & V separately
-+        vpadd.i32   d3, d0, d0
-+        @ (sum of 8 + 4) >> 3, narrowed back to bytes: d0 = 4 U/V pairs
-+        vrshrn.u16  d0, q1, #3
-+
-+        @ Store
-+        vst1.8      {d0}, [r0], r3
-+        vst1.8      {d0}, [r2], r3
-+        vst1.8      {d0}, [r0]
-+        vst1.8      {d0}, [r2]
-+
-+        bx          lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_dc_8_neon_8
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]
-+@
-+@ DC prediction of an 8x8 block of 8-bit samples: the block is filled with
-+@ the rounded mean of the 8 top and 8 left samples, and the first row and
-+@ first column are smoothed towards their neighbours.
-+
-+function ff_hevc_rpi_pred_dc_8_neon_8, export=1
-+
-+        @ Average the els of top & left
-+        vld1.8      {d0}, [r1]
-+        mov         r1, #2
-+        vld1.8      {d16}, [r2]
-+        vmov.i16    q2, #3
-+        vmov.i64    d7, #0xffff
-+        vaddl.u8    q1, d0, d16   @ d2[0] = top[0] + left[0]
-+        vmovl.u8    q0, d0
-+        vadd.i16    d6, d2, d3    @ d6 has 4 vals
-+        vmov.16     d4[0], r1     @ 2, 3, 3, 3...
-+        vbit        d0, d2, d7    @ q0 = top[0]+left[0], top[1..7]
-+
-+        @ top line gets some smoothing
-+        @ (top[i] + 3*dc + 2) >> 2
-+        @ as does left
-+        @ top_line[0] is extra special
-+        @ (top[0] + left[0] + 2*dc + 2) >> 2
-+
-+        @ d7 becomes a mask selecting byte 0 only (used for the left column)
-+        vmov.i64    d7, #0xff
-+        @ q1 = left[0..7] widened to 16 bits
-+        vmovl.u8    q1, d16
-+        vpadd.i16   d6, d6        @ 2 (top & bottom of vector the same)
-+        vpadd.i16   d6, d6        @ 1 (all the same)
-+        @ dc = (sum of 16 samples + 8) >> 4
-+        vrshr.u16   d6, #4
-+        vmla.i16    q1, q2, d6[0]
-+        vmla.i16    q0, q2, d6[0]
-+        @ d6 = dc replicated into every byte
-+        vdup.8      d6, d6[0]
-+        @ d2 = smoothed left column, d0 = smoothed top row (as bytes)
-+        vrshrn.i16  d2, q1, #2
-+        vrshrn.i16  d0, q0, #2
-+
-+        @ Store top line
-+        vst1.8      {d0}, [r0], r3
-+
-+        @ Store the rest
-+        @ Each row is the dc vector with byte 0 replaced by the next smoothed
-+        @ left sample; d2 is shifted down one byte per row to expose it.
-+        vshr.u64    d2, #8
-+        vbit        d6, d2, d7
-+        vshr.u64    d2, #8
-+        vst1.8      {d6}, [r0], r3
-+        @ 6 rows remain, written two per iteration
-+        mov         r1, #6
-+1:
-+        vbit        d6, d2, d7
-+        vshr.u64    d2, #8
-+        vst1.8      {d6}, [r0], r3
-+        subs        r1, #2
-+        vbit        d6, d2, d7
-+        vshr.u64    d2, #8
-+        vst1.8      {d6}, [r0], r3
-+        bne         1b
-+
-+        bx          lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_dc_c_8_neon_8
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]
-+@
-+@ DC prediction of an 8x8 block of interleaved 8-bit U/V pairs: every pair
-+@ in the block is set to the per-component rounded mean of the 8 top and
-+@ 8 left pairs.  No edge smoothing is applied.
-+
-+function ff_hevc_rpi_pred_dc_c_8_neon_8, export=1
-+
-+        @ Average the els of top & left
-+        vld1.8      {q0}, [r1]
-+        @ r1 = rows remaining
-+        mov         r1, #8
-+        vld1.8      {q1}, [r2]
-+        @ Both the A and T sequences leave r2 = r0 + 2 * r3 and r3 = 4 * r3
-+        @ (original r3), so rows are written alternately via r0 and r2.
-+T       lsl         r3, #1
-+        vaddl.u8    q0, d0, d1
-+A       add         r2, r0, r3, lsl #1
-+A       lsl         r3, #2
-+T       add         r2, r0, r3
-+T       lsl         r3, #1
-+        vaddl.u8    q1, d2, d3
-+        vadd.i16    q1, q0
-+        vadd.i16    d3, d2        @ d3 has 2 val pairs
-+        vpadd.i32   d2, d3, d3    @ This adds U & V separately
-+        vpadd.i32   d3, d3, d3
-+        @ (sum of 16 + 8) >> 4, narrowed to bytes: q0 = 8 identical U/V pairs
-+        vrshrn.u16  d0, q1, #4
-+        vrshrn.u16  d1, q1, #4
-+
-+        @ Store
-+        @ four rows per iteration
-+1:
-+        vst1.8      {q0}, [r0], r3
-+        subs        r1, #4
-+        vst1.8      {q0}, [r2], r3
-+        vst1.8      {q0}, [r0], r3
-+        vst1.8      {q0}, [r2], r3
-+        bne         1b
-+
-+        bx          lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_dc_16_neon_8
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]
-+@
-+@ DC prediction of a 16x16 block of 8-bit samples: the block is filled
-+@ with the rounded mean of the 16 top and 16 left samples, and the first
-+@ row and first column are smoothed towards their neighbours.
-+
-+function ff_hevc_rpi_pred_dc_16_neon_8, export=1
-+
-+        @ Average the els of top & left
-+        @ q8 = top[0..15], q9 = left[0..15]
-+        vld1.8      {q8}, [r1]
-+        mov         r1, #2
-+        vld1.8      {q9}, [r2]
-+        @ q10 accumulates the total; q11 lane 0 = top[0] + left[0]
-+        vaddl.u8    q10, d16, d17
-+        vaddl.u8    q11, d16, d18
-+        vaddl.u8    q0, d18, d19
-+        vmov.i16    q1, #3
-+        vadd.i16    q10, q0
-+        @ q0 = left[0..7] widened to 16 bits
-+        vmovl.u8    q0, d18
-+        vadd.i16    d20, d21
-+        vmov.i16    d2[0], r1     @ 2, 3, 3, 3...
-+
-+        @ top line gets some smoothing
-+        @ (top[i] + 3*dc + 2) >> 2
-+        @ as does left
-+        @ top_line[0] is extra special
-+        @ (top[0] + left[0] + 2*dc + 2) >> 2
-+
-+        @ q2 = top[0..7], q9 = left[8..15], q8 = top[8..15], all widened
-+        vmovl.u8    q2, d16
-+        vmovl.u8    q9, d19
-+        vpadd.i16   d20, d20      @ 2 (top & bottom of vector the same)
-+        vmov.i64    d7, #0xffff
-+        vmovl.u8    q8, d17
-+        vbit        d4, d22, d7   @ q2 = top[0]+left[0], top[1..7]
-+        @ d7 becomes a mask selecting byte 0 only (used for the left column)
-+        vmov.i64    d7, #0xff
-+        vpadd.i16   d20, d20      @ 1 (all the same)
-+        @ dc = (sum of 32 samples + 16) >> 5, in every lane of q10
-+        vrshr.u16   d21, d20, #5
-+        vrshr.u16   d20, d20, #5
-+        @ d2[1] = 3; q1 = 2, 3, 3, ... gives top[0] its special weighting
-+        vmla.i16    q0, q10, d2[1]
-+        vmla.i16    q9, q10, d2[1]
-+        vmla.i16    q2, q10, q1
-+        vmla.i16    q8, q10, d2[1]
-+        @ q1 = dc replicated into every byte
-+        vdup.8      q1, d20[0]
-+        @ q0 = smoothed left column, q2 = smoothed top row (as bytes)
-+        vrshrn.i16  d0, q0, #2
-+        vrshrn.i16  d1, q9, #2
-+        vrshrn.i16  d4, q2, #2
-+        vrshrn.i16  d5, q8, #2
-+        @ rotate so that left[1] is in byte 0 ready for row 1
-+        vext.8      q0, q0, q0, #1
-+
-+        @ Store top line
-+        vst1.8      {q2}, [r0], r3
-+
-+        @ Store the rest
-+        @ each row is the dc vector with byte 0 replaced by the next smoothed
-+        @ left sample
-+        mov         r1, #15
-+1:
-+        vbit        d2, d0, d7
-+        vext.8      q0, q0, q0, #1
-+        subs        r1, #1
-+        vst1.8      {q1}, [r0], r3
-+        bne         1b
-+
-+        bx          lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_dc_c_16_neon_8
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]
-+@
-+@ DC prediction of a 16x16 block of interleaved 8-bit U/V pairs: every
-+@ pair in the block is set to the per-component rounded mean of the 16 top
-+@ and 16 left pairs.  No edge smoothing is applied.
-+
-+function ff_hevc_rpi_pred_dc_c_16_neon_8, export=1
-+
-+        @ Average the els of top & left
-+        vld1.8      {q0-q1}, [r1]
-+        @ r1 = rows remaining
-+        mov         r1, #16
-+        vld1.8      {q2-q3}, [r2]
-+        @ Both the A and T sequences leave r2 = r0 + 2 * r3 and r3 = 4 * r3
-+        @ (original r3), so rows are written alternately via r0 and r2.
-+T       lsl         r3, #1
-+        vaddl.u8    q0, d0, d1
-+A       add         r2, r0, r3, lsl #1
-+T       add         r2, r0, r3
-+        vaddl.u8    q1, d2, d3
-+A       lsl         r3, #2
-+T       lsl         r3, #1
-+        vaddl.u8    q2, d4, d5
-+        vaddl.u8    q3, d6, d7
-+        vadd.i16    q0, q1
-+        vadd.i16    q2, q3
-+        vadd.i16    q0, q2
-+        vadd.i16    d0, d1        @ d0 has 2 val pairs
-+        vpadd.i32   d4, d0, d0    @ This adds U & V separately
-+        vpadd.i32   d5, d0, d0
-+        @ (sum of 32 + 16) >> 5, narrowed to bytes and replicated so that
-+        @ q0-q1 hold one full 32-byte row
-+        vrshrn.u16  d0, q2, #5
-+        vrshrn.u16  d1, q2, #5
-+        vrshrn.u16  d2, q2, #5
-+        vrshrn.u16  d3, q2, #5
-+
-+        @ Store
-+        @ two rows per iteration
-+1:
-+        vst1.8      {q0-q1}, [r0], r3
-+        subs        r1, #2
-+        vst1.8      {q0-q1}, [r2], r3
-+        bne         1b
-+
-+        bx          lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_dc_32_neon_8
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]
-+@
-+@ DC prediction of a 32x32 block of 8-bit samples: every sample is set to
-+@ the rounded mean of the 32 top and 32 left samples.  Unlike the smaller
-+@ luma sizes this routine applies no edge smoothing.
-+
-+function ff_hevc_rpi_pred_dc_32_neon_8, export=1
-+
-+        @ Average the els of top & left
-+        vld1.8      {q0-q1}, [r1]
-+        @ r1 = rows remaining
-+        mov         r1, #32
-+        vld1.8      {q2-q3}, [r2]
-+        @ r2 = row 1, r3 = 2 * stride: even rows via r0, odd rows via r2
-+        add         r2, r0, r3
-+        vaddl.u8    q0, d0, d1
-+        lsl         r3, #1
-+        vaddl.u8    q1, d2, d3
-+        vaddl.u8    q2, d4, d5
-+        vaddl.u8    q3, d6, d7
-+        vadd.i16    q0, q1
-+        vadd.i16    q2, q3
-+        vadd.i16    q0, q2
-+        vadd.i16    d0, d1        @ d0 has 4 vals
-+        vpadd.i16   d0, d0        @ 2 (top & bottom the same)
-+        vpadd.i16   d4, d0, d0    @ 1 (all the same)
-+        vpadd.i16   d5, d0, d0
-+        @ (sum of 64 + 32) >> 6, narrowed to bytes and replicated so that
-+        @ q0-q1 hold one full 32-byte row
-+        vrshrn.u16  d0, q2, #6
-+        vrshrn.u16  d1, q2, #6
-+        vrshrn.u16  d2, q2, #6
-+        vrshrn.u16  d3, q2, #6
-+
-+        @ Store
-+        @ two rows per iteration
-+1:
-+        vst1.8      {q0-q1}, [r0], r3
-+        subs        r1, #2
-+        vst1.8      {q0-q1}, [r2], r3
-+        bne         1b
-+
-+        bx          lr
-+endfunc
-+
-+
-+@ -----------------------------------------------------------------------------
-+@
-+@ 10 Bit versions
-+@
-+@ There is no actual bit depth dependency in this code except that our
-+@ intermediate results would overflow the 16 bits they are stored in at
-+@ higher bit depths. All these functions are good to 10 bits - with the
-+@ worst case being in dc_32 where we use all 16 bits.
-+
-+
-+@ ff_hevc_rpi_pred_dc_4_neon_10
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]
-+@
-+@ DC prediction of a 4x4 block of 16-bit samples: the block is filled with
-+@ the rounded mean of the 4 top and 4 left samples, and the first row and
-+@ first column are smoothed towards their neighbours.
-+
-+function ff_hevc_rpi_pred_dc_4_neon_10, export=1
-+
-+        @ Average the els of top & left
-+        @ d0 = top[0..3], d1 = left[0..3]
-+        vld1.16     {d0}, [r1]
-+        mov         r1, #2
-+        vld1.16     {d1}, [r2]
-+        @ Both the A and T sequences leave r2 = r0 + 2 * r3 and r3 = 4 * r3
-+        @ (original r3), so rows are written alternately via r0 and r2.
-+T       lsl         r3, #1
-+        vmov.i16    q2, #3
-+A       add         r2, r0, r3, lsl #1
-+T       add         r2, r0, r3
-+        vadd.u16    d2, d0, d1    @ d2[0] = top[0] + left[0]
-+A       lsl         r3, #2
-+T       lsl         r3, #1
-+        vmov.16     d4[0], r1     @ 2, 3, 3, 3...
-+        @ d7 = mask selecting 16-bit lane 0
-+        vmov.i64    d7, #0xffff
-+        vbit        d0, d2, d7    @ q0 = top[0]+left[0], top[1..3], left[0..3]
-+
-+        @ top line gets some smoothing
-+        @ (top[i] + 3*dc + 2) >> 2
-+        @ as does left
-+        @ top_line[0] is extra special
-+        @ (top[0] + left[0] + 2*dc + 2) >> 2
-+
-+        vpadd.i16   d6, d2, d2    @ 2 (top & bottom of vector the same)
-+        vpadd.i16   d6, d6        @ 1 (all the same)
-+        @ dc = (sum of 8 samples + 4) >> 3, in every lane of d6
-+        vrshr.u16   d6, #3
-+        vmla.i16    q0, q2, d6[0]
-+        @ d0 = smoothed top row, d1 = smoothed left column
-+        vrshr.u16   q0, #2
-+
-+        @ Store top line
-+        vst1.16     {d0}, [r0], r3
-+
-+        @ Store the rest
-+        @ bring smoothed left[1], left[2], left[3] down to lane 0 and fill
-+        @ the other lanes with dc (vbif keeps lane 0, inserts d6 elsewhere)
-+        vshr.u64    d3, d1, #1*16
-+        vshr.u64    d4, d1, #2*16
-+        vshr.u64    d5, d1, #3*16
-+        vbif        d3, d6, d7
-+        vbif        d4, d6, d7
-+        vst1.16     {d3}, [r2], r3
-+        vbif        d5, d6, d7
-+        vst1.16     {d4}, [r0]
-+        vst1.16     {d5}, [r2]
-+
-+        bx          lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_dc_c_4_neon_10
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]  (In pels - needs * 4)
-+@
-+@ DC prediction of a 4x4 block of interleaved 16-bit U/V pairs: every pair
-+@ in the block is set to the per-component rounded mean of the 4 top and
-+@ 4 left pairs.  No edge smoothing is applied.
-+
-+function ff_hevc_rpi_pred_dc_c_4_neon_10, export=1
-+
-+        @ Average the els of top & left
-+        vld1.8      {q0}, [r1]
-+        vld1.8      {q1}, [r2]
-+        @ Both the A and T sequences leave r2 = r0 + 4 * r3 and r3 = 8 * r3
-+        @ (original r3, in pels): r2 is row 1 and r3 steps two rows.
-+A       add         r2, r0, r3, lsl #2
-+A       lsl         r3, #3
-+T       lsl         r3, #2
-+T       add         r2, r0, r3
-+T       lsl         r3, #1
-+        vadd.i16    q0, q1
-+        vadd.i16    d0, d1       @ d0 has 2 val pairs
-+        vpadd.i32   d2, d0, d0   @ This adds U & V separately
-+        vpadd.i32   d3, d0, d0
-+        @ (sum of 8 + 4) >> 3: q0 = 4 identical U/V pairs
-+        vrshr.u16   q0, q1, #3
-+
-+        @ Store
-+        vst1.16     {q0}, [r0], r3
-+        vst1.16     {q0}, [r2], r3
-+        vst1.16     {q0}, [r0]
-+        vst1.16     {q0}, [r2]
-+
-+        bx          lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_dc_8_neon_10
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]
-+@
-+@ DC prediction of an 8x8 block of 16-bit samples: the block is filled
-+@ with the rounded mean of the 8 top and 8 left samples, and the first row
-+@ and first column are smoothed towards their neighbours.
-+
-+function ff_hevc_rpi_pred_dc_8_neon_10, export=1
-+
-+        @ Average the els of top & left
-+        @ q0 = top[0..7], q8 = left[0..7]
-+        vld1.16     {q0}, [r1]
-+        mov         r1, #2
-+        vld1.16     {q8}, [r2]
-+        @ Both the A and T sequences leave r2 = r0 + 2 * r3 and r3 = 4 * r3
-+        @ (original r3), so rows are written alternately via r0 and r2.
-+T       lsl         r3, #1
-+        vmov.i16    q2, #3
-+A       add         r2, r0, r3, lsl #1
-+T       add         r2, r0, r3
-+        vadd.i16    q1, q0, q8    @ q1[0] = top[0] + left[0]
-+A       lsl         r3, #2
-+T       lsl         r3, #1
-+        @ d7 = mask selecting 16-bit lane 0
-+        vmov.i64    d7, #0xffff
-+        vmov.16     d4[0], r1     @ 2, 3, 3, 3...
-+        vadd.i16    d6, d2, d3    @ d6 has 4 vals
-+        vbit        d0, d2, d7    @ q0 = top[0]+left[0], top[1..7]
-+
-+        @ top line gets some smoothing
-+        @ (top[i] + 3*dc + 2) >> 2
-+        @ as does left
-+        @ top_line[0] is extra special
-+        @ (top[0] + left[0] + 2*dc + 2) >> 2
-+
-+        vpadd.i16   d6, d6        @ 2 (top & bottom of vector the same)
-+        vpadd.i16   d6, d6        @ 1 (all the same)
-+        @ dc = (sum of 16 samples + 8) >> 4
-+        vrshr.u16   d6, #4
-+        vmla.i16    q8, q2, d6[0]
-+        vmla.i16    q0, q2, d6[0]
-+        @ q2 and q9 = dc in every lane: two row buffers, one per store pointer
-+        vdup.16     q2, d6[0]
-+        vdup.16     q9, d6[0]
-+        @ q8 = smoothed left column, q0 = smoothed top row
-+        vrshr.u16   q8, q8, #2
-+        vrshr.u16   q0, q0, #2
-+        @ q1 = left column rotated so that left[1] is in lane 0
-+        vext.16     q1, q8, q8, #1
-+
-+        @ Store top line
-+        vst1.16     {q0}, [r0], r3
-+
-+        @ Store the rest
-+        @ Even rows (via r0) take lane 0 from q8, odd rows (via r2) from q1;
-+        @ both are rotated by two lanes per iteration.
-+        vbit        d18, d2, d7
-+        vst1.16     {q9}, [r2], r3
-+        @ 6 rows remain, written two per iteration
-+        mov         r1, #6
-+1:
-+        vext.16     q8, q8, q8, #2
-+        subs        r1, #2
-+        vext.16     q1, q1, q1, #2
-+        vbit        d4, d16, d7
-+        vst1.16     {q2}, [r0], r3
-+        vbit        d18, d2, d7
-+        vst1.16     {q9}, [r2], r3
-+        bne         1b
-+
-+        bx          lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_dc_c_8_neon_10
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]  (In pels - needs * 4)
-+
-+function ff_hevc_rpi_pred_dc_c_8_neon_10, export=1
-+        @ Fills 8 rows of 32 bytes at [r0] with the per-component (U, V) DC value
-+        @ Average the els of top & left
-+        vld1.16     {q0-q1}, [r1]
-+        mov         r1, #8                @ row counter
-+        vld1.16     {q2-q3}, [r2]
-+T       lsl         r3, #2
-+        vadd.i16    q1, q0
-+A       add         r2, r0, r3, lsl #2    @ r2 = r0 + 4 * stride (bytes)
-+A       lsl         r3, #3                @ r3 = 8 * stride (bytes)
-+T       add         r2, r0, r3
-+T       lsl         r3, #1
-+        vadd.i16    q2, q3
-+        vadd.i16    q1, q2
-+        vadd.i16    d3, d2        @ d3 has 2 val pairs
-+        vpadd.i32   d2, d3, d3    @ This adds U & V separately
-+        vpadd.i32   d3, d3, d3
-+        vrshr.u16   q0, q1, #4            @ (sum of 16 + 8) >> 4, in both q0 and q1
-+        vrshr.u16   q1, q1, #4
-+
-+        @ Store
-+1:
-+        vst1.8      {q0-q1}, [r0], r3
-+        subs        r1, #2                @ two rows written per iteration
-+        vst1.8      {q0-q1}, [r2], r3
-+        bne         1b
-+
-+        bx          lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_dc_16_neon_10
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]
-+
-+function ff_hevc_rpi_pred_dc_16_neon_10, export=1
-+        @ 16x16 DC prediction on 16-bit pels; the first row and column are smoothed
-+        @ Average the els of top & left
-+        vld1.16     {q8-q9}, [r1]
-+        mov         r1, #2
-+        vld1.16     {q10-q11}, [r2]
-+        lsl         r3, #1        @ stride given in pels
-+        vadd.i16    q0, q8, q9
-+        vadd.i16    q1, q10, q11
-+        vmov.i16    q3, #3
-+        vadd.i16    q1, q0
-+        vadd.i16    d0, d16, d20          @ lane 0 = top[0] + left[0]
-+        vmov.i64    d31, #0xffff          @ mask: 16-bit lane 0 only
-+        vadd.i16    d3, d2
-+        vmov.16     d6[0], r1     @ 2, 3, 3, 3...
-+
-+        @ top line gets some smoothing
-+        @ (top[i] + 3*dc + 2) >> 2
-+        @ as does left
-+        @ topline[0] is extra special
-+        @ (top[0] + left[0] + 2*dc + 2) >> 2
-+
-+        vbit        d16, d0, d31  @ q8 = top[0]+left[0], top[1..7]
-+        vpadd.i16   d3, d3        @ 2 (top & bottom of vector the same)
-+        vpadd.i16   d3, d3        @ 1 (all the same)
-+        vrshr.u16   d2, d3, #5            @ dc = (sum of 32 pels + 16) >> 5
-+        vrshr.u16   d3, d3, #5
-+        vmov        q0, q1                @ q0-q1 = one full row of dc
-+        vmla.i16    q10, q1, d6[1]        @ d6[1] = 3
-+        vmla.i16    q11, q1, d6[1]
-+        vmla.i16    q8, q1, q3            @ lane 0 uses weight 2, the rest 3
-+        vmla.i16    q9, q1, d6[1]
-+        vrshr.u16   q2, q10, #2           @ q2, q3 = smoothed left[0..7], left[8..15]
-+        vrshr.u16   q3, q11, #2
-+        vrshr.u16   q8, #2
-+        vrshr.u16   q9, #2
-+        vext.16     q2, q2, q2, #1        @ lane 0 = smoothed left[1]
-+        mov         r1, #7<<29            @ row counter held in the top 3 bits of r1
-+
-+        @ Store top line
-+        vst1.16     {q8-q9}, [r0], r3
-+
-+        @ Store the rest
-+1:
-+        vbit        d0, d4, d31           @ lane 0 of the dc row = smoothed left[1..7]
-+        vext.16     q2, q2, q2, #1
-+        subs        r1, #1<<29            @ reaches 0 after 7 rows
-+        vst1.16     {q0-q1}, [r0], r3
-+        bne         1b
-+1:
-+        vbit        d0, d6, d31           @ lane 0 of the dc row = smoothed left[8..15]
-+        vext.16     q3, q3, q3, #1
-+        subs        r1, #1<<29            @ starts at 0, wraps, and is 0 again after 8 rows
-+        vst1.16     {q0-q1}, [r0], r3
-+        bne         1b
-+
-+        bx          lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_dc_c_16_neon_10
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]  (In pels - needs * 4)
-+
-+function ff_hevc_rpi_pred_dc_c_16_neon_10, export=1
-+        @ Fills 16 rows of 64 bytes at [r0] with the per-component (U, V) DC value
-+        @ Average the els of top & left
-+        vldm        r1, {q0-q3}
-+        vldm        r2, {q8-q11}
-+        vadd.i16    q0, q1
-+        mov         r1, #16               @ row counter
-+        vadd.i16    q2, q3
-+        add         r2, r0, #32           @ r2 = second 32 bytes of each row
-+        vadd.i16    q8, q9
-+        lsl         r3, #2                @ r3 = 4 * stride (bytes)
-+        vadd.i16    q10, q11
-+        vadd.u16    q0, q2
-+        vadd.u16    q8, q10
-+        vadd.i16    q0, q8
-+        vadd.i16    d0, d1        @ d0 has 2 val pairs
-+        vpadd.i32   d4, d0, d0    @ This adds U & V separately
-+        vpadd.i32   d5, d0, d0
-+        vrshr.u16   q0, q2, #5            @ (sum of 32 + 16) >> 5, in both q0 and q1
-+        vrshr.u16   q1, q2, #5
-+
-+        @ Store
-+1:
-+        vst1.16     {q0-q1}, [r0], r3
-+        subs        r1, #1
-+        vst1.16     {q0-q1}, [r2], r3
-+        bne         1b
-+
-+        bx           lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_dc_32_neon_10
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]  (In pels)
-+
-+function ff_hevc_rpi_pred_dc_32_neon_10, export=1
-+        @ Fills 32 rows of 64 bytes at [r0] with the single DC value of [r1] and [r2]
-+        @ Average the els of top & left
-+        @ With 10 bits we are (just) safe from overflow in i16
-+        vldm        r1, {q0-q3}
-+        vldm        r2, {q8-q11}
-+        vadd.i16    q0, q1
-+        mov         r1, #32               @ row counter
-+        vadd.i16    q2, q3
-+        add         r2, r0, #32           @ r2 = second 32 bytes of each row
-+        vadd.i16    q8, q9
-+        lsl         r3, #1                @ r3 = 2 * stride (bytes)
-+        vadd.i16    q10, q11
-+        vadd.u16    q0, q2
-+        vadd.u16    q8, q10
-+        vadd.i16    q0, q8
-+        vadd.i16    d0, d1        @ d0 has 4 vals
-+        vpadd.i16   d0, d0        @ 2 (top & bottom the same)
-+        vpadd.i16   d4, d0, d0    @ 1 (all the same)
-+        vpadd.i16   d5, d0, d0
-+        vrshr.u16   q0, q2, #6            @ (sum of 64 + 32) >> 6, in both q0 and q1
-+        vrshr.u16   q1, q2, #6
-+
-+        @ Store
-+1:
-+        vst1.16     {q0-q1}, [r0], r3
-+        subs        r1, #1
-+        vst1.16     {q0-q1}, [r2], r3
-+        bne         1b
-+
-+        bx           lr
-+endfunc
-+
-+
---- /dev/null
-+++ b/libavcodec/arm/rpi_hevcpred_intra_filter_neon.S
-@@ -0,0 +1,881 @@
-+/*
-+Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
-+All rights reserved.
-+
-+Redistribution and use in source and binary forms, with or without
-+modification, are permitted provided that the following conditions are met:
-+    * Redistributions of source code must retain the above copyright
-+      notice, this list of conditions and the following disclaimer.
-+    * Redistributions in binary form must reproduce the above copyright
-+      notice, this list of conditions and the following disclaimer in the
-+      documentation and/or other materials provided with the distribution.
-+    * Neither the name of the copyright holder nor the
-+      names of its contributors may be used to endorse or promote products
-+      derived from this software without specific prior written permission.
-+
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
-+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+
-+Authors: John Cox, Ben Avison
-+*/
-+
-+#include "libavutil/arm/asm.S"
-+#include "neon.S"
-+
-+@ All functions have the call
-+@
-+@ int ff_hevc_rpi_intra_filter_N_neon_PW(
-+@    pixel * const left,                   [r0]
-+@    pixel * const top,                    [r1]
-+@    const unsigned int req,               [r2]
-+@    const unsigned int avail,             [r3]
-+@    const pixel * const src_l,            [sp, #0]
-+@    const pixel * const src_u,            [sp, #4]
-+@    const pixel * const src_ur,           [sp, #8]
-+@    const unsigned int stride,            [sp, #12] (pels)
-+@    const unsigned int top_right_size,    [sp, #16]
-+@    const unsigned int down_left_size)    [sp, #20]
-+@
-+@ Assumptions:
-+@ (that wouldn't apply to all frame layouts but do apply to sand, so beware
-+@  if reusing this code)
-+@
-+@ Min ctb size is 8 so we don't need to worry about tr_size or dl_size for
-+@ N==4, but do for chroma N>=8.  As we share Y/C fns that means we can ignore
-+@ N==8,PW=8 (chroma always PW>8) but have to cope for larger
-+@
-+@ We always have at least 64 pixel H frame width rounding - this lets us
-+@ load UR without having to worry about exactly how many pixels are actually
-+@ within the frame.  As partial loads will only occur very occasionally this
-+@ should be a win in nearly all cases.
-+@
-+@ 16 bit fns can be used as 8 bit chroma fns as chroma never filters
-+@ so we do no maths on the contents
-+@
-+@ No filtering in 32bit fns as they are chroma only
-+
-+
-+.equ    AVAIL_UR, 1
-+.equ    AVAIL_U,  2
-+.equ    AVAIL_UL, 4
-+.equ    AVAIL_L,  8
-+.equ    AVAIL_DL, 16
-+
-+.equ    FILTER_LIGHT, 0x40
-+.equ    FILTER_STRONG, 0x80
-+
-+.equ    AVAIL_S_UR_N_U_C, 32 - 1
-+.equ    AVAIL_S_U_N_UL_C, 32 - 2
-+.equ    AVAIL_S_UL_N_L_C, 32 - 3
-+.equ    AVAIL_S_L_N_DL_C, 32 - 4
-+
-+.equ    AVAIL_S_U_DL_CPSR, 31 - 4  @ Shift for u..dl to go into flags via cpsr
-+
-+@ On entry
-+@  r2   req
-+@  r3   avail
-+@ [sp, #sp_offset...]  args
-+@
-+@ On Exit:
-+@
-+@ Extend values:
-+@  d_l  scalar contains value for L & DL
-+@       if DL avail then this is DL[0] so we don't need to load that
-+@  d_ul scalar containing value for UL
-+@  d_u  scalar containing value for U
-+@  d_ur scalar containing value for UR
-+@ If DL avail then d_l == b_dl elif L avail then d_l == a_l else...
-+@ This means that L-light-filter works even if nreq DL (we never filter
-+@ req-DL without req-L, but we do filter req-L without req-DL)
-+@ If UR avail then d_ur == a_ur so U-filter good too
-+@
-+@ Data load pointers (only load if req & avail):
-+@  r4   DL + stride
-+@  r10  L
-+@  r6   U
-+@  r5   UR
-+@
-+@ Others:
-+@  r2   req
-+@  r7   req & avail
-+@  r3   L + stride
-+@  r8   DL + stride * 2
-+@  r9   stride * 2
-+@  cs   Load U
-+@  mi   Load UR
-+@
-+@ Clobbered:
-+@  r12
-+
-+.macro  load_pointers pw_s, log2_s, sp_offset, d_type, d_l, d_ul, d_u, d_ur
-+        @ sp_offset is added to the stack-argument offsets; d_type is the vld1 element size
-+.equ    src_l\@,   \sp_offset + 0
-+.equ    src_u\@,   \sp_offset + 4
-+.equ    src_ur\@,  \sp_offset + 8
-+.equ    stride\@,  \sp_offset + 12
-+.equ    pw\@,      (1 << \pw_s)                 @ pel width in bytes
-+.equ    b_size\@,  (1 << (\pw_s + \log2_s))     @ size in bytes
-+
-+@ r9    stride
-+@                       r7 = ab_ul, r6 = a_u, r5 = a_ur
-+@ r4 = b_dl, r10 = b_l,             r8 = b_u
-+
-+        ldr        r5,  [sp, #src_ur\@]
-+        lsl        r12, r3,  #AVAIL_S_U_DL_CPSR     @ avail bits 4..0 move to bits 31..27
-+        ldr        r10, [sp, #src_l\@]
-+        ldr        r9,  [sp, #stride\@]
-+        ldr        r6,  [sp, #src_u\@]
-+
-+        @ This is quite a slow instruction but it replaces
-+        @ a decent number of tests that yield a max of 2 flags/op
-+        @ It is annoying we can't branch on Q!
-+        @ If L navail (ne) then DL must be navail (pl)
-+        msr        APSR_nzcvq, r12      @ n=dl, z=l, c=ul, v=u, q=ur
-+
-+        mov        r4,  r5              @ fallback source starts as src_ur
-+        sub        r7,  r10, r9         @ r7 = src_l - stride (UL source)
-+        it vs
-+        movvs      r4,  r6              @ U avail: fallback = src_u
-+        add        r8,  r6,  #b_size\@ - pw\@       @ r8 = last pel of U
-+        it cs
-+        movcs      r4,  r7              @ UL avail: fallback = UL source
-+        ite ne
-+        movne      r10, r4              @ L not avail: L pointer = fallback
-+        addeq      r4,  r7,  r9,  lsl #\log2_s      @ L avail: r4 = src_l + (size - 1) * stride
-+        it cc
-+        movcc      r7,  r10             @ UL not avail: take UL value from r10
-+        it mi
-+        addmi      r4,  r10, r9,  lsl #\log2_s      @ DL avail: r4 = r10 + size * stride
-+        vld1.\d_type {\d_ul}, [r7]
-+        itt vc
-+        movvc      r8,  r7              @ U not avail: U and last-of-U both use r7
-+        movvc      r6,  r7
-+        vld1.\d_type {\d_l }, [r4], r9  @ r4 advances by one stride
-+        tst        r3,  #AVAIL_UR
-+        vld1.\d_type {\d_u }, [r6]
-+        it eq
-+        moveq      r5,  r8              @ UR not avail: UR value comes from r8
-+        and        r7,  r2,  r3         @ r7 = req & avail
-+        add        r8,  r4,  r9         @ r8 = r4 + stride
-+        vld1.\d_type {\d_ur}, [r5]
-+        lsls       r12, r7,  #AVAIL_S_UR_N_U_C      @ N (mi) = load UR, C (cs) = load U
-+        add        r3,  r10, r9         @ r3 = L + stride
-+        lsl        r9,  #1              @ r9 = stride * 2
-+.endm
-+
-+
-+
-+@ int ff_hevc_rpi_intra_filter_4_neon_8(
-+@    pixel * const left,                   [r0]
-+@    pixel * const top,                    [r1]
-+@    const unsigned int req,               [r2]
-+@    const unsigned int avail,             [r3]
-+@    const pixel * const src_l,            [sp, #0]
-+@    const pixel * const src_u,            [sp, #4]
-+@    const pixel * const src_ur,           [sp, #8]
-+@    const unsigned int stride,            [sp, #12] (pels)
-+@    const unsigned int top_right_size,    [sp, #16]
-+@    const unsigned int down_left_size)    [sp, #20]
-+
-+.set    sp_base, 8*4
-+.set    pw_s,    0
-+.set    pw,      (1 << pw_s)
-+.set    log2_s,  2
-+
-+function ff_hevc_rpi_intra_filter_4_neon_8, export=1
-+        push       {r4-r10, lr}         @ 8 words pushed, matching sp_base
-+        load_pointers pw_s, log2_s, sp_base, 8, d0[], d31[7], d1[], d2[]
-+        @ Now d0 = L/DL fill, d31[7] = UL, d1 = U fill, d2 = UR fill; mi = load UR, cs = load U
-+        it cs
-+        vldrcs     s2,  [r6]            @ d1 low half = 4 pels of U
-+        ite pl
-+        vmovpl     s3,  s4              @ no UR load: d1 high half = UR fill value
-+        vldrmi     s3,  [r5]            @ d1 high half = 4 pels of UR
-+
-+        lsls       r7,  #AVAIL_S_L_N_DL_C   @ N (mi) = load L, C (cs) = load DL
-+        add        r12, r0,  #-pw       @ r12 = one pel before left[0]
-+        bpl        1f
-+
-+        vld1.8    {d0[0]}, [r10], r9
-+        vld1.8    {d0[1]}, [r3],  r9
-+        vld1.8    {d0[2]}, [r10]
-+        vld1.8    {d0[3]}, [r3]
-+1:
-+        bcc        1f
-+        vld1.8    {d0[5]}, [r4],  r9    @ d0[4] already holds the value loaded by the macro
-+        vld1.8    {d0[6]}, [r8]
-+        vld1.8    {d0[7]}, [r4]
-+1:
-+        vstr       d1,  [r1]            @ Up
-+        vst1.8    {d31[7]}, [r12]
-+        vstr       d0,  [r0]            @ Left
-+        pop       {r4-r10, pc}
-+endfunc
-+
-+
-+@ int ff_hevc_rpi_intra_filter_4_neon_16(
-+@    pixel * const left,                   [r0]
-+@    pixel * const top,                    [r1]
-+@    const unsigned int req,               [r2]
-+@    const unsigned int avail,             [r3]
-+@    const pixel * const src_l,            [sp, #0]
-+@    const pixel * const src_u,            [sp, #4]
-+@    const pixel * const src_ur,           [sp, #8]
-+@    const unsigned int stride,            [sp, #12] (pels)
-+@    const unsigned int top_right_size,    [sp, #16]
-+@    const unsigned int down_left_size)    [sp, #20]
-+
-+.set    sp_base, 8*4
-+.set    pw_s,    1
-+.set    pw,      (1 << pw_s)
-+.set    log2_s,  2
-+
-+function ff_hevc_rpi_intra_filter_4_neon_16, export=1
-+        push       {r4-r10, lr}         @ 8 words pushed, matching sp_base
-+        load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], d2[], d3[]
-+        @ Now q0 = L/DL fill, d31[3] = UL, d2 = U fill, d3 = UR fill; mi = load UR, cs = load U
-+        it cs
-+        vldrcs     d2,  [r6]            @ 4 pels of U
-+        it mi
-+        vldrmi     d3,  [r5]            @ 4 pels of UR
-+        lsls       r7,  #AVAIL_S_L_N_DL_C   @ N (mi) = load L, C (cs) = load DL
-+        add        r12, r0, #-pw        @ r12 = one pel before left[0]
-+        bpl        1f
-+        vld1.16   {d0[0]}, [r10], r9
-+        vld1.16   {d0[1]}, [r3],  r9
-+        vld1.16   {d0[2]}, [r10]
-+        vld1.16   {d0[3]}, [r3]
-+1:
-+        bcc        1f
-+        vld1.16   {d1[1]}, [r4],  r9    @ d1[0] already holds the value loaded by the macro
-+        vld1.16   {d1[2]}, [r8]
-+        vld1.16   {d1[3]}, [r4]
-+1:
-+        vst1.16   {q1}, [r1]           @ Up
-+        vst1.16   {d31[3]}, [r12]
-+        vst1.16   {q0}, [r0]           @ Left
-+        pop       {r4-r10, pc}
-+endfunc
-+
-+
-+@ int ff_hevc_rpi_intra_filter_8_neon_8(
-+@    pixel * const left,                   [r0]
-+@    pixel * const top,                    [r1]
-+@    const unsigned int req,               [r2]
-+@    const unsigned int avail,             [r3]
-+@    const pixel * const src_l,            [sp, #0]
-+@    const pixel * const src_u,            [sp, #4]
-+@    const pixel * const src_ur,           [sp, #8]
-+@    const unsigned int stride,            [sp, #12] (pels)
-+@    const unsigned int top_right_size,    [sp, #16]
-+@    const unsigned int down_left_size)    [sp, #20]
-+
-+.set    sp_base, 8*4
-+.set    pw_s,    0
-+.set    pw,      (1 << pw_s)
-+.set    log2_s,  3
-+
-+function ff_hevc_rpi_intra_filter_8_neon_8, export=1
-+        push      {r4-r10, lr}          @ 8 words pushed, matching sp_base
-+        load_pointers pw_s, log2_s, sp_base, 8, "d0[],d1[]", d31[7], d4[], d5[]
-+        @ Now q0 = L/DL fill, d31[7] = UL, d4 = U fill, d5 = UR fill; mi = load UR, cs = load U
-+        it cs
-+        vldrcs     d4,  [r6]            @ 8 pels of U
-+        it mi
-+        vldrmi     d5,  [r5]            @ 8 pels of UR
-+
-+        lsls       r7,  #AVAIL_S_L_N_DL_C   @ N (mi) = load L, C (cs) = load DL
-+        bpl        1f
-+        vld1.8    {d0[0]}, [r10], r9
-+        vld1.8    {d0[1]}, [r3],  r9
-+        vld1.8    {d0[2]}, [r10], r9
-+        vld1.8    {d0[3]}, [r3],  r9
-+        vld1.8    {d0[4]}, [r10], r9
-+        vld1.8    {d0[5]}, [r3],  r9
-+        vld1.8    {d0[6]}, [r10]
-+        vld1.8    {d0[7]}, [r3]
-+1:
-+        bcc        1f
-+        vld1.8    {d1[1]}, [r4],  r9    @ d1[0] already holds the value loaded by the macro
-+        vld1.8    {d1[2]}, [r8],  r9
-+        vld1.8    {d1[3]}, [r4],  r9
-+        vld1.8    {d1[4]}, [r8],  r9
-+        vld1.8    {d1[5]}, [r4],  r9
-+        vld1.8    {d1[6]}, [r8]
-+        vld1.8    {d1[7]}, [r4]
-+1:
-+        tst        r2,  #FILTER_LIGHT
-+        add        r12, r0,  #-pw       @ r12 = one pel before left[0]
-+        beq        10f
-+
-+        @ Luma light filter
-+        vext.8     q8,  q15, q2,  #15   @ q8 = UL followed by up[0..14]
-+        vext.8     q12, q15, q0,  #15   @ q12 = UL followed by left[0..14]
-+        vaddl.u8   q9,  d17, d5         @ widened sums p[i-1] + p[i]
-+        vaddl.u8   q8,  d16, d4
-+        vaddl.u8   q13, d25, d1
-+        vaddl.u8   q12, d24, d0
-+        vmov.u8    r3,  d5[7]           @ Save final pel
-+        vmov.u8    r2,  d1[7]           @ Save final pel
-+
-+        vext.16    q2,  q8,  q9,  #1    @ sums shifted by one: p[i] + p[i+1]
-+        vext.16    q3,  q9,  q9,  #1
-+        vext.16    q0,  q12, q13, #1
-+        vext.16    q1,  q13, q13, #1
-+        vadd.u16   d30, d16, d24        @ d30[0] = l[0] + 2ul + u[0]
-+        vadd.u16   q2,  q8              @ p[i-1] + 2*p[i] + p[i+1]
-+        vadd.u16   q3,  q9
-+        vadd.u16   q0,  q12
-+        vadd.u16   q1,  q13
-+
-+        vrshrn.u16 d4,  q2,  #2         @ (sum + 2) >> 2, narrowed back to 8 bits
-+        vrshrn.u16 d5,  q3,  #2
-+        vrshrn.u16 d0,  q0,  #2
-+        vrshrn.u16 d1,  q1,  #2
-+        vrshr.u16  d30, #2
-+        vmov.u8    d5[7], r3            @ Restore final pel
-+        vmov.u8    d1[7], r2            @ Restore final pel
-+        vdup.u8    d31, d30[0]          @ all lanes of d31 (incl. [7]) = d30[0]
-+
-+10:
-+        vst1.8    {q2 }, [r1]           @ Up
-+        vst1.8    {d31[7]}, [r12]       @ Up-left
-+        vst1.8    {q0 }, [r0]           @ Left
-+        pop       {r4-r10, pc}
-+endfunc
-+
-+
-+@ int ff_hevc_rpi_intra_filter_8_neon_16(
-+@    pixel * const left,                   [r0]
-+@    pixel * const top,                    [r1]
-+@    const unsigned int req,               [r2]
-+@    const unsigned int avail,             [r3]
-+@    const pixel * const src_l,            [sp, #0]
-+@    const pixel * const src_u,            [sp, #4]
-+@    const pixel * const src_ur,           [sp, #8]
-+@    const unsigned int stride,            [sp, #12] (pels)
-+@    const unsigned int top_right_size,    [sp, #16]
-+@    const unsigned int down_left_size)    [sp, #20]
-+
-+.set    sp_base, 8*4
-+.set    ur_size, sp_base + 16
-+.set    dl_size, sp_base + 20
-+.set    pw_s,    1
-+.set    pw,      (1 << pw_s)
-+.set    log2_s,  3
-+.set    p_size,  (1 << log2_s)          @ size in pels
-+
-+function ff_hevc_rpi_intra_filter_8_neon_16, export=1
-+        push      {r4-r10, lr}          @ 8 words pushed, matching sp_base
-+        load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], "d4[],d5[]", "d6[],d7[]"
-+        @ Now q0 = L/DL fill, d31[3] = UL, q2 = U fill, q3 = UR fill; mi = load UR, cs = load U
-+        it cs
-+        vldmcs     r6,  {d4, d5}        @ 8 pels of U
-+        ldr        r12, [sp, #ur_size]  @ r12 = top_right_size
-+        bpl        1f
-+        cmp        r12, #4
-+        vldm       r5,  {d6, d7}        @ 8 pels of UR
-+        bgt        1f
-+        vdup.16    d7,  d6[3]           @ top_right_size <= 4: pad pels 4..7 with pel 3
-+1:
-+        lsls       r12, r7,  #AVAIL_S_L_N_DL_C      @ N (mi) = load L, C (cs) = load DL
-+        vdup.16    q1,  d0[0]           @ q1 (DL half of left) = L/DL fill value
-+        bpl        1f
-+        vld1.16   {d0[0]}, [r10], r9
-+        vld1.16   {d0[1]}, [r3],  r9
-+        vld1.16   {d0[2]}, [r10], r9
-+        vld1.16   {d0[3]}, [r3],  r9
-+        vld1.16   {d1[0]}, [r10], r9
-+        vld1.16   {d1[1]}, [r3],  r9
-+        vld1.16   {d1[2]}, [r10]
-+        vld1.16   {d1[3]}, [r3]
-+1:
-+        bcc        1f
-+        ldr        r12, [sp, #dl_size]  @ r12 = down_left_size
-+        vld1.16   {d2[1]}, [r4],  r9
-+        cmp        r12, #p_size
-+        vld1.16   {d2[2]}, [r8],  r9
-+        vld1.16   {d2[3]}, [r4],  r9
-+        blt        2f                   @ down_left_size < 8: pad instead of loading
-+        vld1.16   {d3[0]}, [r8],  r9
-+        vld1.16   {d3[1]}, [r4],  r9
-+        vld1.16   {d3[2]}, [r8]
-+        vld1.16   {d3[3]}, [r4]
-+        b          1f
-+2:
-+        vdup.16    d3,  d2[3]           @ pad DL pels 4..7 with pel 3
-+1:
-+        tst        r2,  #FILTER_LIGHT
-+        add        r12, r0,  #-pw       @ r12 = one pel before left[0]
-+        beq        10f
-+
-+        @ Luma light filter
-+        vext.16    q9,  q2,  q3,  #7    @ q8-q9 = UL followed by up[0..14]
-+        vext.16    q8,  q15, q2,  #7
-+        vext.16    q13, q0,  q1,  #7    @ q12-q13 = UL followed by left[0..14]
-+        vext.16    q12, q15, q0,  #7
-+        vadd.u16   q9,  q3              @ sums p[i-1] + p[i]
-+        vadd.u16   q8,  q2
-+        vadd.u16   q13, q1
-+        vadd.u16   q12, q0
-+        vmov.u16   r3,  d7[3]           @ Save final pel
-+        vmov.u16   r2,  d3[3]           @ Save final pel
-+
-+        vext.16    q2,  q8,  q9,  #1    @ sums shifted by one: p[i] + p[i+1]
-+        vext.16    q3,  q9,  q9,  #1
-+        vext.16    q0,  q12, q13, #1
-+        vext.16    q1,  q13, q13, #1
-+        vadd.u16   d30, d16, d24        @ d30[0] = l[0] + 2ul + u[0]
-+        vadd.u16   q2,  q8              @ p[i-1] + 2*p[i] + p[i+1]
-+        vadd.u16   q3,  q9
-+        vadd.u16   q0,  q12
-+        vadd.u16   q1,  q13
-+
-+        vrshr.u16  q2,  #2              @ (sum + 2) >> 2
-+        vrshr.u16  q3,  #2
-+        vrshr.u16  q0,  #2
-+        vrshr.u16  q1,  #2
-+        vrshr.u16  d30, #2
-+        vmov.u16   d7[3], r3            @ Restore final pel
-+        vmov.u16   d3[3], r2            @ Restore final pel
-+        vdup.u16   d31, d30[0]          @ d31[3] = d30[0]
-+
-+10:
-+        vst1.16   {q2,  q3}, [r1]       @ Up
-+        vst1.16   {d31[3]}, [r12]       @ Up-left
-+        vst1.16   {q0,  q1}, [r0]       @ Left
-+        pop       {r4-r10, pc}
-+endfunc
-+
-+@ int ff_hevc_rpi_intra_filter_16_neon_16(
-+@    pixel * const left,                   [r0]
-+@    pixel * const top,                    [r1]
-+@    const unsigned int req,               [r2]
-+@    const unsigned int avail,             [r3]
-+@    const pixel * const src_l,            [sp, #0]
-+@    const pixel * const src_u,            [sp, #4]
-+@    const pixel * const src_ur,           [sp, #8]
-+@    const unsigned int stride,            [sp, #12] (pels)
-+@    const unsigned int top_right_size,    [sp, #16]
-+@    const unsigned int down_left_size)    [sp, #20]
-+
-+.set    sp_base, 8*4
-+.set    ur_size, sp_base + 16
-+.set    dl_size, sp_base + 20
-+.set    pw_s,    1
-+.set    pw,      (1 << pw_s)
-+.set    log2_s,  4
-+.set    p_size,  (1 << log2_s)          @ size in pels
-+
-+function ff_hevc_rpi_intra_filter_16_neon_16, export=1
-+        push      {r4-r10, lr}          @ 8 words pushed, matching sp_base
-+        load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], "d16[],d17[]", "d20[],d21[]"
-+        @ Now q0 = L/DL fill, d31[3] = UL, q8 = U fill, q10 = UR fill; mi = load UR, cs = load U
-+        vdup.16    q9,  d16[0]          @ extend the U fill value to q9
-+        vdup.16    q11, d20[0]          @ extend the UR fill value to q11
-+
-+        it cs
-+        vldmcs     r6,  {d16-d19}       @ 16 pels of U
-+        ldr        r12, [sp, #ur_size]  @ r12 = top_right_size
-+        bpl        1f
-+        cmp        r12, #12
-+        @ Given chroma frame layout, if UR exists then it is always legit to
-+        @ load all of it even if most of it is outside the frame.
-+        vldm       r5,  {d20-d23}
-+        bgt        1f                   @ more than 12 pels valid: keep all 16
-+        bge        4f                   @ exactly 12: pad d23 only
-+        cmp        r12,  #8
-+        bge        3f                   @ 8..11: pad d22 and d23
-+        vdup.16    d21, d20[3]          @ fewer than 8: pad from pel 3 upwards
-+3:      vdup.16    d22, d21[3]
-+4:      vdup.16    d23, d22[3]
-+
-+1:
-+        lsls       r7,  #AVAIL_S_L_N_DL_C   @ N (mi) = load L, C (cs) = load DL
-+        ldr        r12, [sp, #dl_size]  @ r12 = down_left_size
-+        vdup.16    q1,  d0[0]           @ extend the L/DL fill value to q1-q3
-+        vdup.16    q2,  d0[0]
-+        vdup.16    q3,  d0[0]
-+        bpl        1f
-+        vld1.16   {d0[0]}, [r10], r9
-+        vld1.16   {d0[1]}, [r3],  r9
-+        vld1.16   {d0[2]}, [r10], r9
-+        vld1.16   {d0[3]}, [r3],  r9
-+        vld1.16   {d1[0]}, [r10], r9
-+        vld1.16   {d1[1]}, [r3],  r9
-+        vld1.16   {d1[2]}, [r10], r9
-+        vld1.16   {d1[3]}, [r3],  r9
-+        vld1.16   {d2[0]}, [r10], r9
-+        vld1.16   {d2[1]}, [r3],  r9
-+        vld1.16   {d2[2]}, [r10], r9
-+        vld1.16   {d2[3]}, [r3],  r9
-+        vld1.16   {d3[0]}, [r10], r9
-+        vld1.16   {d3[1]}, [r3],  r9
-+        vld1.16   {d3[2]}, [r10]
-+        vld1.16   {d3[3]}, [r3]
-+1:
-+        bcc        1f
-+        vld1.16   {d4[1]}, [r4],  r9    @ d4[0] already holds the fill value
-+        cmp        r12, #4
-+        vld1.16   {d4[2]}, [r8],  r9
-+        vld1.16   {d4[3]}, [r4],  r9
-+        ble        2f                   @ down_left_size <= 4: pad pels 4..15
-+        vld1.16   {d5[0]}, [r8],  r9
-+        vld1.16   {d5[1]}, [r4],  r9
-+        cmp        r12, #12
-+        vld1.16   {d5[2]}, [r8],  r9
-+        vld1.16   {d5[3]}, [r4],  r9
-+        blt        3f                   @ down_left_size < 12: pad pels 8..15
-+        vld1.16   {d6[0]}, [r8],  r9
-+        vld1.16   {d6[1]}, [r4],  r9
-+        vld1.16   {d6[2]}, [r8],  r9
-+        vld1.16   {d6[3]}, [r4],  r9
-+        ble        4f                   @ exactly 12 (flags from the cmp above): pad pels 12..15
-+        vld1.16   {d7[0]}, [r8],  r9
-+        vld1.16   {d7[1]}, [r4],  r9
-+        vld1.16   {d7[2]}, [r8]
-+        vld1.16   {d7[3]}, [r4]
-+        b          1f
-+2:      vdup.16    d5,  d4[3]
-+3:      vdup.16    d6,  d5[3]
-+4:      vdup.16    d7,  d6[3]
-+1:
-+        tst        r2,  #FILTER_LIGHT
-+        add        r12, r0,  #-pw       @ r12 = one pel before left[0]
-+        beq        10f
-+
-+        vpush     {q5}                  @ q5 is used as scratch below and restored by vpop
-+        @ Luma light filter
-+        @ Left
-+        vext.16    q5,  q2,  q3,  #7    @ q12,q13,q14,q5 = UL followed by left[0..30]
-+        vext.16    q14, q1,  q2,  #7
-+        vext.16    q13, q0,  q1,  #7
-+        vext.16    q12, q15, q0,  #7
-+
-+        vadd.u16   q5,  q3              @ sums p[i-1] + p[i]
-+        vadd.u16   q14, q2
-+        vadd.u16   q13, q1
-+        vadd.u16   q12, q0
-+        vmov.u16   r2,  d7[3]           @ Save final pel
-+
-+        vext.16    q0,  q12, q13, #1    @ sums shifted by one: p[i] + p[i+1]
-+        vext.16    q1,  q13, q14, #1
-+        vext.16    q2,  q14, q5,  #1
-+        vext.16    q3,  q5,  q5,  #1
-+
-+        vmov       d30, d24             @ d30[0] = l[0] + ul
-+        vadd.u16   q0,  q12             @ p[i-1] + 2*p[i] + p[i+1]
-+        vadd.u16   q1,  q13
-+        vadd.u16   q2,  q14
-+        vadd.u16   q3,  q5
-+
-+        vrshr.u16  q0,  #2              @ (sum + 2) >> 2
-+        vrshr.u16  q1,  #2
-+        vrshr.u16  q2,  #2
-+        vrshr.u16  q3,  #2
-+
-+        @ Up
-+        vext.16    q5,  q10, q11, #7    @ q12,q13,q14,q5 = UL followed by up[0..30]
-+        vext.16    q14, q9,  q10, #7
-+        vext.16    q13, q8,  q9,  #7
-+        vext.16    q12, q15, q8,  #7
-+
-+        vadd.u16   q5,  q11
-+        vadd.u16   q14, q10
-+        vadd.u16   q13, q9
-+        vadd.u16   q12, q8
-+        vmov.u16   r3,  d23[3]          @ Save final pel
-+
-+        vext.16    q8,  q12, q13, #1
-+        vext.16    q9,  q13, q14, #1
-+        vext.16    q10, q14, q5,  #1
-+        vext.16    q11, q5,  q5,  #1
-+
-+        vadd.u16   d30, d24             @ d30[0] = l[0] + 2ul + u[0]
-+        vadd.u16   q8,  q12
-+        vadd.u16   q9,  q13
-+        vadd.u16   q10, q14
-+        vadd.u16   q11, q5
-+
-+        vrshr.u16  q8,  #2
-+        vrshr.u16  q9,  #2
-+        vrshr.u16  q10, #2
-+        vrshr.u16  q11, #2
-+
-+        @ Misc
-+        vrshr.u16  d30, #2
-+        vmov.u16   d7[3], r2            @ Restore final pel
-+        vmov.u16   d23[3], r3           @ Restore final pel
-+        vdup.u16   d31, d30[0]          @ d31[3] = d30[0]
-+        vpop      {q5}
-+
-+10:
-+        vstm       r1, {d16-d23}        @ Up
-+        vst1.16   {d31[3]}, [r12]       @ Up-left
-+        vstm       r0, { d0-d7 }        @ Left
-+        pop       {r4-r10, pc}
-+endfunc
-+
-+@ int ff_hevc_rpi_intra_filter_4_neon_32(
-+@    pixel * const left,                   [r0]
-+@    pixel * const top,                    [r1]
-+@    const unsigned int req,               [r2]
-+@    const unsigned int avail,             [r3]
-+@    const pixel * const src_l,            [sp, #0]
-+@    const pixel * const src_u,            [sp, #4]
-+@    const pixel * const src_ur,           [sp, #8]
-+@    const unsigned int stride,            [sp, #12] (pels)
-+@    const unsigned int top_right_size,    [sp, #16]
-+@    const unsigned int down_left_size)    [sp, #20]
-+
-+.set    sp_base, 8*4
-+.set    pw_s,    2
-+.set    pw,      (1 << pw_s)
-+.set    log2_s,  2
-+
-+function ff_hevc_rpi_intra_filter_4_neon_32, export=1
-+        push       {r4-r10, lr}         @ 8 words pushed, matching sp_base
-+        load_pointers pw_s, log2_s, sp_base, 32, "d0[],d1[]", d31[1], "d4[],d5[]", "d6[],d7[]"
-+        @ Now q0 = L/DL fill, d31[1] = UL, q2 = U fill, q3 = UR fill; mi = load UR, cs = load U
-+        it cs
-+        vldmcs     r6,  {d4, d5}        @ 4 pels of U
-+        it mi
-+        vldmmi     r5,  {d6, d7}        @ 4 pels of UR
-+        lsls       r7,  #AVAIL_S_L_N_DL_C   @ N (mi) = load L, C (cs) = load DL
-+        vdup.32    q1,  d0[0]           @ q1 (DL half of left) = L/DL fill value
-+        add        r12, r0,  #-pw       @ r12 = one pel before left[0]
-+        bpl        1f
-+        vld1.32   {d0[0]}, [r10], r9
-+        vld1.32   {d0[1]}, [r3],  r9
-+        vld1.32   {d1[0]}, [r10]
-+        vld1.32   {d1[1]}, [r3]
-+1:
-+        bcc        1f
-+        vld1.32   {d2[1]}, [r4],  r9    @ d2[0] already holds the fill value
-+        vld1.32   {d3[0]}, [r8]
-+        vld1.32   {d3[1]}, [r4]
-+1:
-+        vst1.32    {q2,  q3 }, [r1]     @ Up
-+        vst1.32    {d31[1]}, [r12]
-+        vst1.32    {q0,  q1 }, [r0]     @ Left
-+        pop        {r4-r10, pc}
-+endfunc
-+
-+
-+@ int ff_hevc_rpi_intra_filter_8_neon_32(
-+@    pixel * const left,                   [r0]
-+@    pixel * const top,                    [r1]
-+@    const unsigned int req,               [r2]
-+@    const unsigned int avail,             [r3]
-+@    const pixel * const src_l,            [sp, #0]
-+@    const pixel * const src_u,            [sp, #4]
-+@    const pixel * const src_ur,           [sp, #8]
-+@    const unsigned int stride,            [sp, #12] (pels)
-+@    const unsigned int top_right_size,    [sp, #16]
-+@    const unsigned int down_left_size)    [sp, #20]
-+
-+.set    sp_base, 8*4
-+.set    ur_size, sp_base + 16
-+.set    dl_size, sp_base + 20
-+.set    pw_s,    2
-+.set    pw,      (1 << pw_s)
-+.set    log2_s,  3
-+.set    p_size,  (1 << log2_s)          @ size in pels
-+
-+function ff_hevc_rpi_intra_filter_8_neon_32, export=1
-+        push       {r4-r10, lr}         @ 8 words pushed, matching sp_base
-+        load_pointers pw_s, log2_s, sp_base, 32, "d0[],d1[]", d31[1], "d16[],d17[]", "d20[],d21[]"
-+        @ Now q0 = L/DL fill, d31[1] = UL, q8 = U fill, q10 = UR fill; mi = load UR, cs = load U
-+        vdup.32    q9,  d16[0]          @ extend the U fill value to q9
-+        vdup.32    q11, d20[0]          @ extend the UR fill value to q11
-+
-+        it cs
-+        vldmcs     r6,  {q8,  q9 }      @ 8 pels of U
-+        ldr        r12, [sp, #ur_size]  @ r12 = top_right_size
-+        bpl        1f
-+        cmp        r12, #p_size
-+        vldm       r5,  {q10, q11}      @ 8 pels of UR
-+        bge        1f
-+        vdup.32    q11, d21[1]          @ top_right_size < 8: pad pels 4..7 with pel 3
-+1:
-+        lsls       r7,  #AVAIL_S_L_N_DL_C   @ N (mi) = load L, C (cs) = load DL
-+        vdup.32    q1,  d0[0]           @ extend the L/DL fill value to q1-q3
-+        vdup.32    q2,  d0[0]
-+        vdup.32    q3,  d0[0]
-+        bpl        1f
-+        vld1.32   {d0[0]}, [r10], r9
-+        vld1.32   {d0[1]}, [r3],  r9
-+        vld1.32   {d1[0]}, [r10], r9
-+        vld1.32   {d1[1]}, [r3],  r9
-+        vld1.32   {d2[0]}, [r10], r9
-+        vld1.32   {d2[1]}, [r3],  r9
-+        vld1.32   {d3[0]}, [r10]
-+        vld1.32   {d3[1]}, [r3]
-+1:
-+        bcc        1f
-+        ldr        r12, [sp, #dl_size]  @ r12 = down_left_size
-+        vld1.32   {d4[1]}, [r4],  r9    @ d4[0] already holds the fill value
-+        cmp        r12, #p_size
-+        vld1.32   {d5[0]}, [r8],  r9
-+        vld1.32   {d5[1]}, [r4],  r9
-+        blt        2f                   @ down_left_size < 8: pad instead of loading
-+        vld1.32   {d6[0]}, [r8],  r9
-+        vld1.32   {d6[1]}, [r4],  r9
-+        vld1.32   {d7[0]}, [r8]
-+        vld1.32   {d7[1]}, [r4]
-+        b          1f
-+2:
-+        vdup.32    q3,  d5[1]           @ pad DL pels 4..7 with pel 3
-+1:
-+        add        r12, r0,  #-pw       @ r12 = one pel before left[0]
-+        vstm       r1,  { q8-q11}       @ Up
-+        vst1.32   {d31[1]}, [r12]
-+        vstm       r0,  { q0-q3 }       @ Left
-+        pop       {r4-r10, pc}
-+endfunc
-+
-+
-+@ int ff_hevc_rpi_intra_filter_16_neon_32(
-+@    pixel * const left,                   [r0]
-+@    pixel * const top,                    [r1]
-+@    const unsigned int req,               [r2]
-+@    const unsigned int avail,             [r3]
-+@    const pixel * const src_l,            [sp, #0]
-+@    const pixel * const src_u,            [sp, #4]
-+@    const pixel * const src_ur,           [sp, #8]
-+@    const unsigned int stride,            [sp, #12] (pels)
-+@    const unsigned int top_right_size,    [sp, #16]
-+@    const unsigned int down_left_size)    [sp, #20]
-+@
-+@ Fills the reference arrays for a 16-pel block of 32-bit pels:
-+@    top[0..15]   up            top[16..31]   up-right
-+@    left[0..15]  left          left[16..31]  down-left
-+@    left[-1]     up-left
-+@ Edges that are present are copied in first ("have"); when a present
-+@ up-right / down-left edge is shorter than 16 pels its last valid pel is
-+@ replicated over the missing 4-pel groups.  The tail of the function then
-+@ writes default values to the edges selected by r7 ^ req.
-+@ NOTE(review): load_pointers is defined outside this excerpt; r3-r10, d30,
-+@ d31 and the N/C flags tested first are presumably set up by it - confirm.
-+
-+@ Stack arguments sit above the 8 registers pushed on entry
-+.set    sp_base, 8*4
-+.set    ur_size, sp_base + 16
-+.set    dl_size, sp_base + 20
-+.set    pw_s,    2
-+.set    pw,      (1 << pw_s)
-+.set    log2_s,  4
-+.set    p_size,  (1 << log2_s)          @ size in pels
-+
-+function ff_hevc_rpi_intra_filter_16_neon_32, export=1
-+        push       {r4-r10, lr}
-+        load_pointers pw_s, log2_s, sp_base, 32, d30[0], d30[1], d31[0], d31[1]
-+
-+        @ Once we get this big we have run out of neon regs to store
-+        @ everything at once so do in pieces
-+
-+        @ Up (have)
-+        it cs
-+        vldmcs     r6,  { q0-q3 }
-+        ldr        r12, [sp, #ur_size]
-+        it mi
-+        vldmmi     r5,  { q8-q11}
-+        it cs
-+        vstmcs     r1,  { q0-q3 }
-+        bpl        1f
-+        @ Up-right is present; r12 = top_right_size.  Each q register holds
-+        @ 4 pels, so pad from the last valid pel exactly as the down-left
-+        @ path below does:
-+        @    > 12: nothing    == 12: q11    8..11: q10-q11    < 8: q9-q11
-+        @ The pels are 32-bit, so the replication must be vdup.32 and the
-+        @ middle step must write q10 (d10 is callee-saved and not part of
-+        @ the q8-q11 store).  The add below does not alter the flags.
-+        cmp        r12, #12
-+        add        lr,  r1,  #(pw << log2_s)
-+        bgt        2f
-+        beq        3f
-+        cmp        r12, #8
-+        bge        4f
-+        vdup.32    q9,  d17[1]
-+4:      vdup.32    q10, d19[1]
-+3:      vdup.32    q11, d21[1]
-+2:      vstm       lr, { q8-q11}
-+1:
-+
-+        @ Left (have)
-+        add        lr,  r0,  #-pw
-+        lsls       r12, r7,  #AVAIL_S_L_N_DL_C
-+        vst1.32   {d30[1]}, [lr]        @ UL
-+        bpl        1f
-+        @ Gather 16 pels, alternating between the two pointers r10 and r3,
-+        @ each advanced by r9 after every load
-+        vld1.32   { d0[0]}, [r10], r9
-+        vld1.32   { d0[1]}, [r3],  r9
-+        vld1.32   { d1[0]}, [r10], r9
-+        vld1.32   { d1[1]}, [r3],  r9
-+        vld1.32   { d2[0]}, [r10], r9
-+        vld1.32   { d2[1]}, [r3],  r9
-+        vld1.32   { d3[0]}, [r10], r9
-+        vld1.32   { d3[1]}, [r3],  r9
-+        vld1.32   { d4[0]}, [r10], r9
-+        vld1.32   { d4[1]}, [r3],  r9
-+        vld1.32   { d5[0]}, [r10], r9
-+        vld1.32   { d5[1]}, [r3],  r9
-+        vld1.32   { d6[0]}, [r10], r9
-+        vld1.32   { d6[1]}, [r3],  r9
-+        vld1.32   { d7[0]}, [r10]
-+        vld1.32   { d7[1]}, [r3]
-+        vstm       r0,  { q0-q3 }
-+1:
-+        bcc        1f
-+        @ Down-left is present; r12 = down_left_size.  The NEON loads do not
-+        @ alter the flags, so each branch tests the most recent cmp.
-+        ldr        r12, [sp, #dl_size]
-+        vdup.32    d16, d30[0]          @ d16[0] = d30[0]
-+        add        lr,  r0,  #(pw << log2_s)
-+        vld1.32   {d16[1]}, [r4],  r9
-+        cmp        r12, #4
-+        vld1.32   {d17[0]}, [r8],  r9
-+        vld1.32   {d17[1]}, [r4],  r9
-+        ble        2f
-+        vld1.32   {d18[0]}, [r8],  r9
-+        vld1.32   {d18[1]}, [r4],  r9
-+        cmp        r12, #12
-+        vld1.32   {d19[0]}, [r8],  r9
-+        vld1.32   {d19[1]}, [r4],  r9
-+        blt        3f
-+        vld1.32   {d20[0]}, [r8],  r9
-+        vld1.32   {d20[1]}, [r4],  r9
-+        vld1.32   {d21[0]}, [r8],  r9
-+        vld1.32   {d21[1]}, [r4],  r9
-+        ble        4f
-+        vld1.32   {d22[0]}, [r8],  r9
-+        vld1.32   {d22[1]}, [r4],  r9
-+        vld1.32   {d23[0]}, [r8]
-+        vld1.32   {d23[1]}, [r4]
-+        b          5f
-+2:      vdup.32    q9,  d17[1]
-+3:      vdup.32    q10, d19[1]
-+4:      vdup.32    q11, d21[1]
-+5:      vstm       lr,  { q8-q11}
-+1:
-+        @ r7 ^= req; a zero result means there is nothing left to fill
-+        eors       r7,  r2
-+        beq        99f
-+
-+        @ Default fill for top: d31[0] over top[0..15] when C is set,
-+        @ d31[1] over top[16..31] when N is set
-+        lsls       r12, r7,  #AVAIL_S_UR_N_U_C
-+        vdup.32    q0,  d31[0]
-+        vdup.32    q1,  d31[0]
-+        vdup.32    q2,  d31[0]
-+        vdup.32    q3,  d31[0]
-+        add        lr,  r1,  #(pw << log2_s)
-+        vdup.32    q8,  d31[1]
-+        vdup.32    q9,  d31[1]
-+        vdup.32    q10, d31[1]
-+        vdup.32    q11, d31[1]
-+        it cs
-+        vstmcs     r1,  { q0-q3 }
-+        it mi
-+        vstmmi     lr,  { q8-q11}
-+
-+        @ Default fill for left: d30[0] over left[0..15] when N is set and
-+        @ over left[16..31] when C is set
-+        lsls       r7,  #AVAIL_S_L_N_DL_C
-+        vdup.32    q0,  d30[0]
-+        vdup.32    q1,  d30[0]
-+        vdup.32    q2,  d30[0]
-+        vdup.32    q3,  d30[0]
-+        add        lr,  r0,  #(pw << log2_s)
-+        it mi
-+        vstmmi     r0, { q0-q3 }
-+        it cs
-+        vstmcs     lr, { q0-q3 }
-+
-+99:
-+        pop       {r4-r10, pc}
-+endfunc
-+
-+
-+
-+
---- /dev/null
-+++ b/libavcodec/arm/rpi_hevcpred_intra_hv_neon.S
-@@ -0,0 +1,920 @@
-+/*
-+Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
-+All rights reserved.
-+
-+Redistribution and use in source and binary forms, with or without
-+modification, are permitted provided that the following conditions are met:
-+    * Redistributions of source code must retain the above copyright
-+      notice, this list of conditions and the following disclaimer.
-+    * Redistributions in binary form must reproduce the above copyright
-+      notice, this list of conditions and the following disclaimer in the
-+      documentation and/or other materials provided with the distribution.
-+    * Neither the name of the copyright holder nor the
-+      names of its contributors may be used to endorse or promote products
-+      derived from this software without specific prior written permission.
-+
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
-+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+
-+Authors: John Cox, Ben Avison
-+*/
-+
-+/*
-+ * Horizontal & Vertical special cases of angular intra pred
-+ *
-+ * Split out because:
-+ *  Vertical, at least, is relatively common
-+ *  Much simpler code than the general angular case
-+ *  Luma with size < 32 has extra filtering that doesn't happen anywhere else
-+ *
-+ * *** Currently luma filtering is mandatory where it occurs, but there are
-+ *     cases where it should be turned off (rdpcm & an extension sps flag).
-+ *     These don't occur in the standard conformance suite for Main Profile
-+ */
-+
-+#include "libavutil/arm/asm.S"
-+#include "neon.S"
-+
-+@ ff_hevc_rpi_pred_vertical_4_neon_8
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]
-+@
-+@ Writes 4 rows of 4 bytes, `stride` bytes apart.  Row y is a copy of
-+@ top[0..3] with byte 0 replaced by
-+@       clip_u8(top[0] + ((left[y] - left[-1]) >> 1))
-+@ The clip comes from XORing with 128 (unsigned -> signed), a saturating
-+@ signed add, and XORing with 128 again.
-+
-+function ff_hevc_rpi_pred_vertical_4_neon_8, export=1
-+        ldrb        ip, [r2, #-1]       @ Top-left
-+        vld1.32     {d0[0]}, [r2 :32]   @ Left
-+        @ r2 is free once left is loaded: reuse it as the odd-row pointer
-+        add         r2, r0, r3
-+        vld1.8      {d1[]}, [r1]
-+        @ Both row pointers step two rows at a time
-+        lsl         r3, #1
-+        vdup.8      d4, ip
-+        vmov.i8     d2, #128
-+        vhsub.u8    d4, d0, d4
-+        veor        d1, d2
-+        vld1.32     {d0[0]}, [r1 :32]   @ Top
-+        vqadd.s8    d1, d4
-+        @ d3 = mask selecting byte 0 only
-+        vmov.i64    d3, #0xff
-+        vmov        d4, d0
-+        @ d1 and d5 both get the filtered column; they are shifted down
-+        @ separately so each row's value arrives in byte 0
-+        veor        d5, d1, d2
-+        veor        d1, d1, d2
-+        vbit        d0, d1, d3
-+        vshr.u64    d5, #8
-+        vst1.32     {d0[0]}, [r0], r3
-+        vshr.u64    d1, #16
-+        vbit        d4, d5, d3
-+        vshr.u64    d5, #16
-+        vst1.32     {d4[0]}, [r2], r3
-+        vbit        d0, d1, d3
-+        vst1.32     {d0[0]}, [r0]
-+        vbit        d4, d5, d3
-+        vst1.32     {d4[0]}, [r2]
-+
-+        bx          lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_vertical_8_neon_8
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]
-+@
-+@ Writes 8 rows of 8 bytes, `stride` bytes apart.  Row y is a copy of
-+@ top[0..7] with byte 0 replaced by
-+@       clip_u8(top[0] + ((left[y] - left[-1]) >> 1))
-+
-+function ff_hevc_rpi_pred_vertical_8_neon_8, export=1
-+        ldrb        ip, [r2, #-1]       @ Top-left
-+        vld1.8      {d0}, [r2 :64]      @ Left
-+        vmov.i8     d1, #128
-+        vld1.8      {d2[]}, [r1]
-+        vld1.8      {d3}, [r1 :64]      @ Top
-+        vdup.8      d4, ip
-+        vhsub.u8    d4, d0, d4
-+        @ Bias top[0] to signed so the saturating add performs the clip
-+        veor        d2, d1
-+        @ d0 = mask selecting byte 0 only
-+        vmov.i64    d0, #0xff
-+        @ r1 is reused as the row counter
-+        mov         r1, #8
-+        vqadd.s8    d2, d4, d2
-+        veor        d1, d2, d1
-+1:
-+        @ Two rows per pass: insert byte 0 of the filtered column into the
-+        @ top row, then shift the column down by one byte
-+        vbit        d3, d1, d0
-+        vshr.u64    d1, #8
-+        vst1.8      {d3}, [r0 :64], r3
-+        subs        r1, #2
-+        vbit        d3, d1, d0
-+        vshr.u64    d1, #8
-+        vst1.8      {d3}, [r0 :64], r3
-+        bne         1b
-+
-+        bx          lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_vertical_16_neon_8
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]
-+@
-+@ Writes 16 rows of 16 bytes, `stride` bytes apart.  Row y is a copy of
-+@ top[0..15] with byte 0 replaced by
-+@       clip_u8(top[0] + ((left[y] - left[-1]) >> 1))
-+
-+function ff_hevc_rpi_pred_vertical_16_neon_8, export=1
-+        ldrb        ip, [r2, #-1]       @ Top-left
-+        vld1.8      {q0}, [r2 :128]     @ Left
-+        vdup.8      q1, ip
-+        vld1.8      {d4[],d5[]}, [r1]
-+        vhsub.u8    q0, q1
-+        vmov.i8     q1, #128
-+        veor        q2, q1
-+        @ d16 = mask selecting byte 0 only
-+        vmov.i64    d16, #0xff
-+        vqadd.s8    q0, q2
-+        vld1.8      {q3}, [r1 :128]     @ Top
-+        @ r1 is reused as the row counter
-+        mov         r1, #16
-+        veor        q0, q1
-+        vmov        q1, q3
-+        @ q0 feeds the even rows, q2 (q0 rotated by one byte) the odd rows
-+        vext.8      q2, q0, q0, #1
-+1:
-+        vbit        d2, d0, d16
-+        vbit        d6, d4, d16
-+        @ Rotate both columns by two bytes ready for the next pair of rows
-+        vext.8      q0, q0, q0, #2
-+        subs        r1, #2
-+        vst1.8      {q1}, [r0 :128], r3
-+        vext.8      q2, q2, q2, #2
-+        vst1.8      {q3}, [r0 :128], r3
-+        bne         1b
-+
-+        bx          lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_vertical_32_neon_8
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]
-+@
-+@ Copies the 32 bytes at top into each of 32 rows, `stride` bytes apart.
-+@ left is not read and no filtering is applied.
-+
-+function ff_hevc_rpi_pred_vertical_32_neon_8, export=1
-+        vld1.8     {q0,  q1 }, [r1  :128]    @ Up
-+        @ r0 walks the even rows and r2 the odd rows, two rows per step
-+        add         r2,  r0,  r3
-+        lsl         r3,  #1
-+        @ 16 passes of 2 rows
-+        mov         r1,  #16
-+1:
-+        vst1.8     {q0,  q1 }, [r0  :128], r3
-+        subs        r1,  #1
-+        vst1.8     {q0,  q1 }, [r2  :128], r3
-+        bne         1b
-+
-+        bx          lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_vertical_c_4_neon_8
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]
-+@
-+@ Copies the 8 bytes at top (4 x 16-bit elements) into each of 4 rows.
-+@ Rows are 2 * stride bytes apart - presumably stride counts 2-byte
-+@ elements here; confirm against the caller.  left is not read.
-+
-+function ff_hevc_rpi_pred_vertical_c_4_neon_8, export=1
-+        vld1.16    {d0 }, [r1  :64]    @ Up
-+        @ r2 = second row; both pointers then step two rows at a time
-+        add         r2,  r0,  r3,  lsl #1
-+        lsl         r3,  #2
-+
-+        vst1.16    {d0 }, [r0  :64], r3
-+        vst1.16    {d0 }, [r2  :64], r3
-+        vst1.16    {d0 }, [r0  :64]
-+        vst1.16    {d0 }, [r2  :64]
-+
-+        bx          lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_vertical_c_8_neon_8
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]
-+@
-+@ Copies the 16 bytes at top (8 x 16-bit elements) into each of 8 rows.
-+@ Rows are 2 * stride bytes apart - presumably stride counts 2-byte
-+@ elements here; confirm against the caller.  left is not read.
-+
-+function ff_hevc_rpi_pred_vertical_c_8_neon_8, export=1
-+        vld1.16    {q0 }, [r1  :128]    @ Up
-+        @ r2 = second row; both pointers then step two rows at a time
-+        add         r2,  r0,  r3,  lsl #1
-+        lsl         r3,  #2
-+        @ The counter drops by 2 per pass, so 2 passes of 4 rows
-+        mov         r1,  #4
-+1:
-+        vst1.16    {q0 }, [r0  :128], r3
-+        subs        r1,  #2
-+        vst1.16    {q0 }, [r2  :128], r3
-+        vst1.16    {q0 }, [r0  :128], r3
-+        vst1.16    {q0 }, [r2  :128], r3
-+        bne         1b
-+
-+        bx          lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_vertical_c_16_neon_8
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]
-+@
-+@ Copies the 32 bytes at top (16 x 16-bit elements) into each of 16 rows.
-+@ Rows are 2 * stride bytes apart - presumably stride counts 2-byte
-+@ elements here; confirm against the caller.  left is not read.
-+
-+function ff_hevc_rpi_pred_vertical_c_16_neon_8, export=1
-+        vld1.16    {q0,  q1 }, [r1  :128]    @ Up
-+        @ r2 = second row; both pointers then step two rows at a time
-+        add         r2,  r0,  r3,  lsl #1
-+        lsl         r3,  #2
-+        @ 8 passes of 2 rows
-+        mov         r1,  #8
-+1:
-+        vst1.16    {q0,  q1 }, [r0  :128], r3
-+        subs        r1,  #1
-+        vst1.16    {q0,  q1 }, [r2  :128], r3
-+        bne         1b
-+
-+        bx          lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_horizontal_4_neon_8
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]
-+@
-+@ Writes 4 rows of 4 bytes, `stride` bytes apart.  Row 0 is
-+@       clip_u8(left[0] + ((top[x] - left[-1]) >> 1))     for x = 0..3
-+@ and rows 1..3 are left[y] replicated across the row.
-+
-+@ ? Might be faster as simple arm
-+
-+function ff_hevc_rpi_pred_horizontal_4_neon_8, export=1
-+        ldrb        ip, [r2, #-1]       @ Top-left
-+        vld1.32     {d0[0]}, [r1 :32]   @ Top
-+        @ r1 = &left[3], taken before r2 starts post-incrementing
-+        add         r1, r2, #3
-+        vld1.8      {d1[]}, [r2]!
-+        vdup.8      d2, ip
-+        vmov.i8     d3, #128
-+        vhsub.u8    d0, d2
-+        @ Bias left[0] to signed so the saturating add performs the clip
-+        veor        d1, d3
-+        vld1.8      {d2[]}, [r2]!
-+        @ ip is free once the top-left value is in d2: reuse it as the
-+        @ odd-row pointer
-+        add         ip, r0, r3
-+        vqadd.s8    d0, d0, d1
-+        lsl         r3, #1
-+        vld1.8      {d1[]}, [r2]
-+        vld1.8      {d4[]}, [r1]
-+        veor        d0, d3
-+        @ Rows 0 and 2 go through r0, rows 1 and 3 through ip
-+        vst1.32     {d0[0]}, [r0 :32], r3
-+        vst1.32     {d2[0]}, [ip :32], r3
-+        vst1.32     {d1[0]}, [r0 :32]
-+        vst1.32     {d4[0]}, [ip :32]
-+
-+        bx          lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_horizontal_8_neon_8
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]
-+@
-+@ Writes 8 rows of 8 bytes, `stride` bytes apart.  Row 0 is
-+@       clip_u8(left[0] + ((top[x] - left[-1]) >> 1))     for x = 0..7
-+@ and rows 1..7 are left[y] replicated across the row.
-+@ The load of each later row is issued ahead of the store of the previous
-+@ one; the extra indentation appears to mark which row an instruction
-+@ belongs to.
-+
-+function ff_hevc_rpi_pred_horizontal_8_neon_8, export=1
-+        ldrb        ip, [r2, #-1]       @ Top-left
-+        vld1.8      {d0}, [r1 :64]      @ Top
-+        vmov.i8     d1, #128
-+        vld1.8      {d2[]}, [r2]!
-+        @ Rows 0 and 7 are stored outside the loop, leaving 6 for it
-+        mov         r1, #8-2
-+        vdup.8      d3, ip
-+        vhsub.u8    d0, d3
-+        veor        d2, d1
-+        vqadd.s8    d0, d2
-+          vld1.8      {d2[]}, [r2]!
-+        veor        d0, d1
-+        vst1.8      {d0}, [r0], r3
-+1:
-+            vld1.8      {d0[]}, [r2]!
-+        subs        r1, #2
-+          vst1.8      {d2}, [r0 :64], r3
-+              vld1.8      {d2[]}, [r2]!
-+            vst1.8      {d0}, [r0 :64], r3
-+        bne         1b
-+
-+              vst1.8      {d2}, [r0 :64]
-+        bx          lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_horizontal_16_neon_8
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]
-+@
-+@ Writes 16 rows of 16 bytes, `stride` bytes apart.  Row 0 is
-+@       clip_u8(left[0] + ((top[x] - left[-1]) >> 1))     for x = 0..15
-+@ and rows 1..15 are left[y] replicated across the row.
-+@ The load of each later row is issued ahead of the store of the previous
-+@ one; the extra indentation appears to mark which row an instruction
-+@ belongs to.
-+
-+function ff_hevc_rpi_pred_horizontal_16_neon_8, export=1
-+        ldrb        ip, [r2, #-1]       @ Top-left
-+        vld1.8      {q0}, [r1 :64]      @ Top
-+        @ Rows 0 and 15 are stored outside the loop, leaving 14 for it
-+        mov         r1, #16-2
-+        vld1.8      {d4[],d5[]}, [r2]!
-+        vdup.8      q3, ip
-+        vhsub.u8    q0, q3
-+        vmov.i8     q1, #128
-+        @ Bias left[0] to signed so the saturating add performs the clip
-+        veor        q2, q1
-+        vqadd.s8    q0, q2
-+          vld1.8      {d4[],d5[]}, [r2]!
-+        veor        q0, q1
-+        vst1.8      {q0}, [r0], r3
-+1:
-+            vld1.8      {d0[],d1[]}, [r2]!
-+        subs        r1, #2
-+          vst1.8      {q2}, [r0 :64], r3
-+              vld1.8      {d4[],d5[]}, [r2]!
-+            vst1.8      {q0}, [r0 :64], r3
-+        bne         1b
-+
-+              vst1.8      {q2}, [r0 :64]
-+        bx          lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_horizontal_32_neon_8
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]
-+@
-+@ Writes 32 rows of 32 bytes, `stride` bytes apart; row y is left[y]
-+@ replicated across the row.  top is not read and no filtering is applied.
-+@ Each row is written as two 16-byte halves through r0 and ip.
-+
-+function ff_hevc_rpi_pred_horizontal_32_neon_8, export=1
-+        vld1.8      {d0[],d1[]}, [r2]!
-+        @ ip = second 16-byte half of the row
-+        add         ip, r0, #16
-+        @ Rows 0 and 31 are stored outside the loop, leaving 30 for it
-+        mov         r1, #32-2
-+          vld1.8      {d2[],d3[]}, [r2]!
-+        vst1.8      {q0}, [r0 :128], r3
-+        vst1.8      {q0}, [ip :128], r3
-+1:
-+            vld1.8      {d0[],d1[]}, [r2]!
-+        subs        r1, #2
-+          vst1.8      {q1}, [r0 :128], r3
-+          vst1.8      {q1}, [ip :128], r3
-+              vld1.8      {d2[],d3[]}, [r2]!
-+            vst1.8      {q0}, [r0 :128], r3
-+            vst1.8      {q0}, [ip :128], r3
-+        bne         1b
-+
-+              vst1.8      {q1}, [r0 :128]
-+              vst1.8      {q1}, [ip :128]
-+        bx          lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_horizontal_c_4_neon_8
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]
-+@
-+@ Writes 4 rows of 4 x 16-bit elements; row y is the 16-bit element
-+@ left[y] replicated across the row.  Rows are 2 * stride bytes apart.
-+@ top is not read.
-+@ NOTE(review): the A / T line prefixes come from the included asm.S (not
-+@ shown) and presumably select the ARM / Thumb variant of a line.  Both
-+@ variants leave r2 = r0 + 2 * stride and r3 = 4 * stride.
-+
-+function ff_hevc_rpi_pred_horizontal_c_4_neon_8, export=1
-+        @ r2 reads left[0] and left[2], r1 reads left[1] and left[3]
-+        add         r1, r2, #2
-+        vld1.16     {d0[]}, [r2]
-+        add         r2, #4
-+        vld1.16     {d1[]}, [r1]
-+        add         r1, #4
-+        vld1.16     {d2[]}, [r2]
-+        @ r2 is free once left[2] is loaded: reuse it as the odd-row pointer
-+A       add         r2, r0, r3, lsl #1
-+T       lsl         r3, #1
-+T       add         r2, r0, r3
-+        vld1.16     {d3[]}, [r1]
-+A       lsl         r3, #2
-+T       lsl         r3, #1
-+        vst1.16     {d0}, [r0 :64], r3
-+        vst1.16     {d1}, [r2 :64], r3
-+        vst1.16     {d2}, [r0 :64]
-+        vst1.16     {d3}, [r2 :64]
-+
-+        bx          lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_horizontal_c_8_neon_8
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]
-+@
-+@ Writes 8 rows of 8 x 16-bit elements; row y is the 16-bit element
-+@ left[y] replicated across the row.  Rows are 2 * stride bytes apart.
-+@ top is not read.
-+
-+function ff_hevc_rpi_pred_horizontal_c_8_neon_8, export=1
-+        vld1.16     {d0[],d1[]}, [r2]!
-+        @ r3 becomes the row pitch in bytes
-+        lsl         r3, #1
-+          vld1.16     {d2[],d3[]}, [r2]!
-+        @ Rows 0 and 7 are stored outside the loop, leaving 6 for it
-+        mov         r1, #8-2
-+        vst1.16     {q0}, [r0 :64], r3
-+1:
-+            vld1.16     {d0[],d1[]}, [r2]!
-+        subs        r1, #2
-+          vst1.16     {q1}, [r0 :64], r3
-+              vld1.16     {d2[],d3[]}, [r2]!
-+            vst1.16     {q0}, [r0 :64], r3
-+        bne         1b
-+
-+              vst1.16     {q1}, [r0 :64]
-+        bx          lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_horizontal_c_16_neon_8
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]
-+@
-+@ Writes 16 rows of 16 x 16-bit elements; row y is the 16-bit element
-+@ left[y] replicated across the row.  Rows are 2 * stride bytes apart and
-+@ each is written as two 16-byte halves through r0 and ip.  top is not read.
-+
-+function ff_hevc_rpi_pred_horizontal_c_16_neon_8, export=1
-+        vld1.16     {d0[],d1[]}, [r2]!
-+        @ r3 becomes the row pitch in bytes
-+        lsl         r3, #1
-+        @ ip = second 16-byte half of the row
-+        add         ip, r0, #16
-+        @ Rows 0 and 15 are stored outside the loop, leaving 14 for it
-+        mov         r1, #16-2
-+          vld1.16     {d2[],d3[]}, [r2]!
-+        vst1.16     {q0}, [r0 :128], r3
-+        vst1.16     {q0}, [ip :128], r3
-+1:
-+            vld1.16     {d0[],d1[]}, [r2]!
-+        subs        r1, #2
-+          vst1.16     {q1}, [r0 :128], r3
-+          vst1.16     {q1}, [ip :128], r3
-+              vld1.16     {d2[],d3[]}, [r2]!
-+            vst1.16     {q0}, [r0 :128], r3
-+            vst1.16     {q0}, [ip :128], r3
-+        bne         1b
-+
-+              vst1.16     {q1}, [r0 :128]
-+              vst1.16     {q1}, [ip :128]
-+        bx          lr
-+endfunc
-+
-+
-+@------------------------------------------------------------------------------
-+@
-+@ 10 Bit
-+@ Has clipping constants so 10-bit only but could easily be macroed up to
-+@ 14-bit before we run out of bits
-+
-+
-+@ ff_hevc_rpi_pred_vertical_4_neon_10
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]
-+@
-+@ Writes 4 rows of 4 x 16-bit elements, 2 * stride bytes apart.  Row y is
-+@ a copy of top[0..3] with element 0 replaced by
-+@       clamp(top[0] + ((left[y] - left[-1]) >> 1), 0, 0x3ff)
-+@ NOTE(review): the A / T line prefixes come from the included asm.S (not
-+@ shown) and presumably select the ARM / Thumb variant of a line.  Both
-+@ variants leave r2 = r0 + 2 * stride and r3 = 4 * stride.
-+
-+function ff_hevc_rpi_pred_vertical_4_neon_10, export=1
-+        ldrh        ip, [r2, #-2]       @ Top-left
-+        vld1.16     {d0}, [r2 :64]      @ Left
-+        vmov.i16    d2, #0
-+        vld1.16     {d1[]}, [r1]
-+T       lsl         r3, #1
-+        vdup.16     d4, ip
-+        vmov.i16    d3, #0x3ff
-+        vld1.16     {d5}, [r1 :64]      @ Top
-+        vhsub.u16   d4, d0, d4
-+        @ d0 (left) has been consumed: reuse it as the element-0 mask
-+        vmov.i64    d0, #0xffff
-+A       add         r2, r0, r3, lsl #1
-+T       add         r2, r0, r3
-+        vadd.i16    d1, d1, d4
-+        vmov        d6, d5
-+        @ Clamp to 0..0x3ff; d1 and d2 both get the filtered column and are
-+        @ shifted down separately so each row's value arrives in element 0
-+        vmax.s16    d1, d1, d2
-+        vmin.s16    d2, d1, d3
-+        vmin.s16    d1, d1, d3
-+        vbit        d5, d1, d0
-+A       lsl         r3, #2
-+T       lsl         r3, #1
-+        vshr.u64    d2, #16
-+        vshr.u64    d1, #32
-+        vbit        d6, d2, d0
-+        vst1.16     {d5}, [r0], r3
-+        vshr.u64    d2, #32
-+        vst1.16     {d6}, [r2], r3
-+        vbit        d5, d1, d0
-+        vst1.16     {d5}, [r0]
-+        vbit        d6, d2, d0
-+        vst1.16     {d6}, [r2]
-+        bx          lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_vertical_8_neon_10
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]
-+@
-+@ Writes 8 rows of 8 x 16-bit elements, 2 * stride bytes apart.  Row y is
-+@ a copy of top[0..7] with element 0 replaced by
-+@       clamp(top[0] + ((left[y] - left[-1]) >> 1), 0, 0x3ff)
-+
-+function ff_hevc_rpi_pred_vertical_8_neon_10, export=1
-+        ldrh        ip, [r2, #-2]       @ Top-left
-+        vld1.16     {q0}, [r2 :128]     @ Left
-+        @ r3 becomes the row pitch in bytes
-+        lsl         r3, #1
-+        vdup.16     q1, ip
-+        vld1.16     {d4[],d5[]}, [r1]
-+        vhsub.u16   q0, q0, q1
-+        vmov.i16    q1, #0
-+        vadd.i16    q0, q2
-+        vmov.i16    q2, #0x3ff
-+        vld1.16     {q3}, [r1 :128]     @ Top
-+        @ r1 is reused as the row counter
-+        mov         r1, #8
-+        vmax.s16    q0, q1
-+        vmov        q1, q3
-+        vmin.s16    q0, q2
-+        @ d16 = mask selecting element 0 only
-+        vmov.i64    d16, #0xffff
-+        @ q0 feeds the even rows, q2 (q0 rotated by one element) the odd rows
-+        vext.16     q2, q0, q0, #1
-+1:
-+        vbit        d2, d0, d16
-+        vbit        d6, d4, d16
-+        vext.16     q0, q0, q0, #2
-+        subs        r1, #2
-+        vst1.16     {q1}, [r0 :128], r3
-+        vext.16     q2, q2, q2, #2
-+        vst1.16     {q3}, [r0 :128], r3
-+        bne         1b
-+
-+        bx          lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_vertical_16_neon_10
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]
-+@
-+@ Writes 16 rows of 16 x 16-bit elements, 2 * stride bytes apart.  Row y
-+@ is a copy of top[0..15] with element 0 replaced by
-+@       clamp(top[0] + ((left[y] - left[-1]) >> 1), 0, 0x3ff)
-+@ NOTE(review): the A / T line prefixes come from the included asm.S (not
-+@ shown) and presumably select the ARM / Thumb variant of a line.  Both
-+@ variants leave r2 = r0 + 2 * stride and r3 = 4 * stride.
-+
-+function ff_hevc_rpi_pred_vertical_16_neon_10, export=1
-+        ldrh        ip, [r2, #-2]       @ Top-left
-+        vld1.16     {q0-q1}, [r2 :128]  @ Left
-+T       lsl         r3, #1
-+        vdup.16     q2, ip
-+A       add         r2, r0, r3, lsl #1
-+T       add         r2, r0, r3
-+        vld1.16     {d6[],d7[]}, [r1]
-+A       lsl         r3, #2
-+T       lsl         r3, #1
-+        vhsub.u16   q0, q2
-+        vhsub.u16   q1, q2
-+        vadd.i16    q0, q3
-+        vadd.i16    q1, q3
-+        vmov.i16    q2, #0
-+        vld1.16     {q8-q9}, [r1 :128]  @ Top
-+        @ r1 is the loop counter: subtracting 1<<30 four times wraps it back
-+        @ to 0, so each loop below runs 4 passes without reloading it
-+        mov         r1, #0
-+        vmov.i16    q3, #0x3ff
-+        vmax.s16    q0, q2
-+        vmax.s16    q1, q2
-+        vmin.s16    q0, q3
-+        vmin.s16    q1, q3
-+        @ q8-q9 is the even-row image, q10-q11 the odd-row image
-+        vmov        q10, q8
-+        vmov        q11, q9
-+        @ q2 / q3 = filtered column shifted by one element, for the odd rows
-+        vext.16     q2, q0, q1, #1
-+        vext.16     q3, q1, q1, #1
-+        @ d24 = mask selecting element 0 only
-+        vmov.i64    d24, #0xffff
-+1:
-+        @ Rows 0..7: filtered values come from q0 (even) and q2 (odd)
-+        vbit        d16, d0, d24
-+        vbit        d20, d4, d24
-+        vext.16     q0, q0, q0, #2
-+        subs        r1, #1<<30
-+        vst1.16     {q8-q9}, [r0 :128], r3
-+        vext.16     q2, q2, q2, #2
-+        vst1.16     {q10-q11}, [r2 :128], r3
-+        bne         1b
-+1:
-+        @ Rows 8..15: filtered values come from q1 (even) and q3 (odd)
-+        vbit        d16, d2, d24
-+        vbit        d20, d6, d24
-+        vext.16     q1, q1, q1, #2
-+        subs        r1, #1<<30
-+        vst1.16     {q8-q9}, [r0 :128], r3
-+        vext.16     q3, q3, q3, #2
-+        vst1.16     {q10-q11}, [r2 :128], r3
-+        bne         1b
-+
-+        bx          lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_vertical_32_neon_10
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]
-+@
-+@ Copies the 64 bytes at top (32 x 16-bit elements) into each of 32 rows,
-+@ 2 * stride bytes apart.  left is not read and no filtering is applied.
-+
-+function ff_hevc_rpi_pred_vertical_32_neon_10, export=1
-+        vldm        r1, { q0-q3 }    @ Up
-+        @ r3 becomes the row pitch in bytes
-+        lsl         r3, #1
-+        mov         r1, #32
-+        @ r0 writes the first 32 bytes of each row, r2 the second 32
-+        add         r2, r0, #32
-+1:
-+        vst1.16     {q0-q1}, [r0 :128], r3
-+        subs        r1, #1
-+        vst1.16     {q2-q3}, [r2 :128], r3
-+        bne         1b
-+
-+        bx          lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_vertical_c_4_neon_10
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]
-+@
-+@ Copies the 16 bytes at top (4 x 32-bit elements) into each of 4 rows.
-+@ Rows are 4 * stride bytes apart - presumably stride counts 4-byte
-+@ elements here; confirm against the caller.  left is not read.
-+
-+function ff_hevc_rpi_pred_vertical_c_4_neon_10, export=1
-+        vld1.16    {q0 }, [r1  :128]    @ Up
-+        @ r2 = second row; both pointers then step two rows at a time
-+        add         r2,  r0,  r3,  lsl #2
-+        lsl         r3,  #3
-+
-+        vst1.16    {q0 }, [r0  :128], r3
-+        vst1.16    {q0 }, [r2  :128], r3
-+        vst1.16    {q0 }, [r0  :128]
-+        vst1.16    {q0 }, [r2  :128]
-+
-+        bx          lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_vertical_c_8_neon_10
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]
-+@
-+@ Copies the 32 bytes at top (8 x 32-bit elements) into each of 8 rows.
-+@ Rows are 4 * stride bytes apart - presumably stride counts 4-byte
-+@ elements here; confirm against the caller.  left is not read.
-+
-+function ff_hevc_rpi_pred_vertical_c_8_neon_10, export=1
-+        vld1.16    {q0,  q1 }, [r1  :128]    @ Up
-+        @ r2 = second row; both pointers then step two rows at a time
-+        add         r2,  r0,  r3,  lsl #2
-+        lsl         r3,  #3
-+        @ 4 passes of 2 rows
-+        mov         r1,  #4
-+1:
-+        vst1.16    {q0,  q1 }, [r0  :128], r3
-+        subs        r1,  #1
-+        vst1.16    {q0,  q1 }, [r2  :128], r3
-+        bne         1b
-+
-+        bx          lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_vertical_c_16_neon_10
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]
-+@
-+@ Copies the 64 bytes at top (16 x 32-bit elements) into each of 16 rows.
-+@ Rows are 4 * stride bytes apart - presumably stride counts 4-byte
-+@ elements here; confirm against the caller.  left is not read.
-+
-+function ff_hevc_rpi_pred_vertical_c_16_neon_10, export=1
-+        vldm        r1, { q0-q3 }    @ Up
-+        @ r3 becomes the row pitch in bytes
-+        lsl         r3, #2
-+        mov         r1, #16
-+        @ r0 writes the first 32 bytes of each row, r2 the second 32
-+        add         r2, r0, #32
-+1:
-+        vst1.16     {q0-q1}, [r0 :128], r3
-+        subs        r1, #1
-+        vst1.16     {q2-q3}, [r2 :128], r3
-+        bne         1b
-+
-+        bx          lr
-+endfunc
-+
-+@ ff_hevc_rpi_pred_horizontal_4_neon_10
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]
-+@
-+@ Writes 4 rows of 4 x 16-bit elements, 2 * stride bytes apart.  Row 0 is
-+@       clamp(left[0] + ((top[x] - left[-1]) >> 1), 0, 0x3ff)   for x = 0..3
-+@ and rows 1..3 are left[y] replicated across the row.
-+@ NOTE(review): the A / T line prefixes come from the included asm.S (not
-+@ shown) and presumably select the ARM / Thumb variant of a line.  Both
-+@ variants leave ip = r0 + 2 * stride and r3 = 4 * stride.
-+
-+function ff_hevc_rpi_pred_horizontal_4_neon_10, export=1
-+        ldrh        ip, [r2, #-2]       @ Top-left
-+        vld1.16     {d0}, [r1 :64]      @ Top
-+        vmov.i16    d1, #0
-+        vld1.16     {d2[]}, [r2]!
-+T       lsl         r3, #1
-+        vdup.16     d3, ip
-+        vmov.i16    d4, #0x3ff
-+        vhsub.u16   d0, d3
-+        @ ip is free once the top-left value is in d3: reuse it as the
-+        @ odd-row pointer
-+A       add         ip, r0, r3, lsl #1
-+T       add         ip, r0, r3
-+        vld1.16     {d3[]}, [r2]!
-+A       lsl         r3, #2
-+T       lsl         r3, #1
-+        vadd.i16    d0, d2
-+        vld1.16     {d2[]}, [r2]!
-+        vmax.s16    d0, d1
-+        @ d1 (the zero constant) is no longer needed: reuse it for left[3]
-+        vld1.16     {d1[]}, [r2]
-+        vmin.s16    d0, d4
-+        @ Rows 0 and 2 go through r0, rows 1 and 3 through ip
-+        vst1.16     {d0}, [r0 :64], r3
-+        vst1.16     {d3}, [ip :64], r3
-+        vst1.16     {d2}, [r0 :64]
-+        vst1.16     {d1}, [ip :64]
-+
-+        bx          lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_horizontal_8_neon_10
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]
-+@
-+@ Writes 8 rows of 8 x 16-bit elements, 2 * stride bytes apart.  Row 0 is
-+@       clamp(left[0] + ((top[x] - left[-1]) >> 1), 0, 0x3ff)   for x = 0..7
-+@ and rows 1..7 are left[y] replicated across the row.
-+@ The load of each later row is issued ahead of the store of the previous
-+@ one; the extra indentation appears to mark which row an instruction
-+@ belongs to.
-+
-+function ff_hevc_rpi_pred_horizontal_8_neon_10, export=1
-+        ldrh        ip, [r2, #-2]       @ Top-left
-+        vld1.16     {q0}, [r1 :128]     @ Top
-+        @ r3 becomes the row pitch in bytes
-+        lsl         r3, #1
-+        vdup.16     q1, ip
-+        @ Rows 0 and 7 are stored outside the loop, leaving 6 for it
-+        mov         r1, #8-2
-+        vhsub.u16   q0, q1
-+        vld1.16     {d2[],d3[]}, [r2]!
-+        vmov.i16    q2, #0
-+        vadd.i16    q0, q1
-+        vmov.i16    q1, #0x3ff
-+        vmax.s16    q0, q2
-+          vld1.16     {d4[],d5[]}, [r2]!
-+        vmin.s16    q0, q1
-+        vst1.16     {q0}, [r0 :128], r3
-+1:
-+            vld1.16     {d0[],d1[]}, [r2]!
-+        subs        r1, #2
-+          vst1.16     {q2}, [r0 :128], r3
-+              vld1.16     {d4[],d5[]}, [r2]!
-+            vst1.16     {q0}, [r0 :128], r3
-+        bne         1b
-+
-+              vst1.16     {q2}, [r0 :128]
-+        bx          lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_horizontal_16_neon_10
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]
-+@
-+@ Writes 16 rows of 16 x 16-bit elements, 2 * stride bytes apart.  Row 0 is
-+@       clamp(left[0] + ((top[x] - left[-1]) >> 1), 0, 0x3ff)   for x = 0..15
-+@ and rows 1..15 are left[y] replicated across the row.
-+@ Row 0 is stored in one piece through r0; later rows are written as two
-+@ 16-byte halves through r0 and ip.
-+
-+function ff_hevc_rpi_pred_horizontal_16_neon_10, export=1
-+        ldrh        ip, [r2, #-2]       @ Top-left
-+        vld1.16     {q0-q1}, [r1 :128]  @ Top
-+        @ r3 becomes the row pitch in bytes
-+        lsl         r3, #1
-+        vdup.16     q2, ip
-+        @ ip = second 16-byte half of row 1 (completed by the add #16 below)
-+        add         ip, r0, r3
-+        vhsub.u16   q0, q2
-+        add         ip, #16
-+        vhsub.u16   q1, q2
-+        @ Rows 0 and 15 are stored outside the loop, leaving 14 for it
-+        mov         r1, #16-2
-+        vld1.16     {d4[],d5[]}, [r2]!
-+        vmov.i16    q3, #0
-+        vadd.u16    q0, q2
-+        vadd.i16    q1, q2
-+        vmov.i16    q2, #0x3ff
-+        vmax.s16    q0, q3
-+        vmax.s16    q1, q3
-+          vld1.16     {d6[],d7[]}, [r2]!
-+        vmin.s16    q0, q2
-+        vmin.s16    q1, q2
-+        vst1.16     {q0-q1}, [r0 :128], r3
-+1:
-+            vld1.16     {d0[],d1[]}, [r2]!
-+        subs        r1, #2
-+          vst1.16     {q3}, [r0 :128], r3
-+          vst1.16     {q3}, [ip :128], r3
-+              vld1.16     {d6[],d7[]}, [r2]!
-+            vst1.16     {q0}, [r0 :128], r3
-+            vst1.16     {q0}, [ip :128], r3
-+        bne         1b
-+
-+              vst1.16     {q3}, [r0 :128]
-+              vst1.16     {q3}, [ip :128]
-+        bx          lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_horizontal_32_neon_10
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]
-+@
-+@ Writes 32 rows of 32 x 16-bit elements (64 bytes), 2 * stride bytes
-+@ apart; row y is left[y] replicated across the row.  top is not read and
-+@ no filtering is applied.
-+@ Each row takes four 16-byte stores: r0 and ip (= r0 + 16) each write one
-+@ quarter, advance by 32 (lr) to write another, then advance by r3
-+@ (row pitch - 32) to reach the next row.
-+
-+function ff_hevc_rpi_pred_horizontal_32_neon_10, export=1
-+        vld1.16     {d0[],d1[]}, [r2]!
-+        add         ip, r0, #16
-+        @ lr is used as a scratch register, so it is saved here and the
-+        @ function returns with pop {pc}
-+        push        {lr}
-+        mov         lr, #32
-+          vld1.16     {d2[],d3[]}, [r2]!
-+        lsl         r3, #1
-+        vst1.16     {q0}, [r0 :128], lr
-+        sub         r3, #32
-+        vst1.16     {q0}, [ip :128], lr
-+        @ Rows 0 and 31 are stored outside the loop, leaving 30 for it
-+        mov         r1, #32-2
-+        vst1.16     {q0}, [r0 :128], r3
-+        vst1.16     {q0}, [ip :128], r3
-+1:
-+            vld1.16     {d0[],d1[]}, [r2]!
-+        subs        r1, #2
-+          vst1.16     {q1}, [r0 :128], lr
-+          vst1.16     {q1}, [ip :128], lr
-+          vst1.16     {q1}, [r0 :128], r3
-+          vst1.16     {q1}, [ip :128], r3
-+              vld1.16     {d2[],d3[]}, [r2]!
-+            vst1.16     {q0}, [r0 :128], lr
-+            vst1.16     {q0}, [ip :128], lr
-+            vst1.16     {q0}, [r0 :128], r3
-+            vst1.16     {q0}, [ip :128], r3
-+        bne         1b
-+
-+              vst1.16     {q1}, [r0 :128], lr
-+              vst1.16     {q1}, [ip :128], lr
-+              vst1.16     {q1}, [r0 :128]
-+              vst1.16     {q1}, [ip :128]
-+        pop         {pc}
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_horizontal_c_4_neon_10
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]
-+@
-+@ Writes 4 rows of 4 x 32-bit elements; row y is the 32-bit element
-+@ left[y] replicated across the row.  Rows are 4 * stride bytes apart.
-+@ top is not read.
-+@ NOTE(review): the A / T line prefixes come from the included asm.S (not
-+@ shown) and presumably select the ARM / Thumb variant of a line.  Both
-+@ variants leave r2 = r0 + 4 * stride and r3 = 8 * stride.
-+
-+function ff_hevc_rpi_pred_horizontal_c_4_neon_10, export=1
-+        @ r2 reads left[0] and left[2], r1 reads left[1] and left[3]
-+        add         r1, r2, #4
-+        vld1.32     {d0[],d1[]}, [r2]
-+        add         r2, #8
-+        vld1.32     {d2[],d3[]}, [r1]
-+        add         r1, #8
-+        vld1.32     {d4[],d5[]}, [r2]
-+        @ r2 is free once left[2] is loaded: reuse it as the odd-row pointer
-+A       add         r2, r0, r3, lsl #2
-+T       lsl         r3, #2
-+T       add         r2, r0, r3
-+        vld1.32     {d6[],d7[]}, [r1]
-+A       lsl         r3, #3
-+T       lsl         r3, #1
-+        vst1.32     {q0}, [r0 :128], r3
-+        vst1.32     {q1}, [r2 :128], r3
-+        vst1.32     {q2}, [r0 :128]
-+        vst1.32     {q3}, [r2 :128]
-+
-+        bx          lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_horizontal_c_8_neon_10
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]
-+@
-+@ Writes 8 rows of 8 x 32-bit elements; row y is the 32-bit element
-+@ left[y] replicated across the row.  Rows are 4 * stride bytes apart and
-+@ each is written as two 16-byte halves through r0 and ip.  top is not read.
-+
-+function ff_hevc_rpi_pred_horizontal_c_8_neon_10, export=1
-+        vld1.32     {d0[],d1[]}, [r2]!
-+        @ r3 becomes the row pitch in bytes
-+        lsl         r3, #2
-+        @ ip = second 16-byte half of the row
-+        add         ip, r0, #16
-+        @ Rows 0 and 7 are stored outside the loop, leaving 6 for it
-+        mov         r1, #8-2
-+          vld1.32     {d2[],d3[]}, [r2]!
-+        vst1.32     {q0}, [r0 :128], r3
-+        vst1.32     {q0}, [ip :128], r3
-+1:
-+            vld1.32     {d0[],d1[]}, [r2]!
-+        subs        r1, #2
-+          vst1.32     {q1}, [r0 :128], r3
-+          vst1.32     {q1}, [ip :128], r3
-+              vld1.32     {d2[],d3[]}, [r2]!
-+            vst1.32     {q0}, [r0 :128], r3
-+            vst1.32     {q0}, [ip :128], r3
-+        bne         1b
-+
-+              vst1.32     {q1}, [r0 :128]
-+              vst1.32     {q1}, [ip :128]
-+        bx          lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_horizontal_c_16_neon_10
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]
-+@
-+@ Writes 16 rows of 64 bytes to _src; row y is the 32-bit element left[y]
-+@ replicated across the row.  Within a row r0 writes bytes 0-15 and 32-47,
-+@ ip (= r0 + 16) writes bytes 16-31 and 48-63.  _top is never read: r1 is
-+@ reused as the loop counter.  The row pitch in bytes is 4 * stride.
-+
-+function ff_hevc_rpi_pred_horizontal_c_16_neon_10, export=1
-+        vld1.32     {d0[],d1[]}, [r2]!
-+        add         ip, r0, #16
-+        @ lr is borrowed to hold the constant 32 (the step between the two
-+        @ stores a pointer makes within one row), so it is saved here and the
-+        @ function returns with pop {pc}
-+        push        {lr}
-+        mov         lr, #32
-+          vld1.32     {d2[],d3[]}, [r2]!
-+        lsl         r3, #2
-+        vst1.32     {q0}, [r0 :128], lr
-+        @ r3 = row pitch minus the 32 bytes already stepped via lr
-+        sub         r3, #32
-+        vst1.32     {q0}, [ip :128], lr
-+        @ 14 rows are stored inside the loop (2 per pass, 7 passes); row 0 is
-+        @ stored before the loop and row 15 after it
-+        mov         r1, #16-2
-+        vst1.32     {q0}, [r0 :128], r3
-+        vst1.32     {q0}, [ip :128], r3
-+1:
-+            vld1.32     {d0[],d1[]}, [r2]!
-+        subs        r1, #2
-+          vst1.32     {q1}, [r0 :128], lr
-+          vst1.32     {q1}, [ip :128], lr
-+          vst1.32     {q1}, [r0 :128], r3
-+          vst1.32     {q1}, [ip :128], r3
-+              vld1.32     {d2[],d3[]}, [r2]!
-+            vst1.32     {q0}, [r0 :128], lr
-+            vst1.32     {q0}, [ip :128], lr
-+            vst1.32     {q0}, [r0 :128], r3
-+            vst1.32     {q0}, [ip :128], r3
-+        bne         1b
-+
-+        @ Last row
-+              vst1.32     {q1}, [r0 :128], lr
-+              vst1.32     {q1}, [ip :128], lr
-+              vst1.32     {q1}, [r0 :128]
-+              vst1.32     {q1}, [ip :128]
-+        pop         {pc}
-+endfunc
-+
-+
-+
---- /dev/null
-+++ b/libavcodec/arm/rpi_hevcpred_intra_planar_neon.S
-@@ -0,0 +1,1043 @@
-+/*
-+Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
-+All rights reserved.
-+
-+Redistribution and use in source and binary forms, with or without
-+modification, are permitted provided that the following conditions are met:
-+    * Redistributions of source code must retain the above copyright
-+      notice, this list of conditions and the following disclaimer.
-+    * Redistributions in binary form must reproduce the above copyright
-+      notice, this list of conditions and the following disclaimer in the
-+      documentation and/or other materials provided with the distribution.
-+    * Neither the name of the copyright holder nor the
-+      names of its contributors may be used to endorse or promote products
-+      derived from this software without specific prior written permission.
-+
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
-+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+
-+Authors: John Cox, Ben Avison
-+*/
-+
-+#include "libavutil/arm/asm.S"
-+#include "neon.S"
-+
-+@ Planar intra pred (8.4.4.2.4)
-+@
-+@ predSamples[ x ][ y ] =
-+@ ( ( nTbS - 1 - x ) * p[ -1 ][ y ] +
-+@   ( x + 1 ) * p[ nTbS ][ -1 ] +
-+@   ( nTbS - 1 - y ) * p[ x ][ -1 ] +
-+@   ( y + 1 ) * p[ -1 ][ nTbS ] + nTbS ) >> ( Log2( nTbS ) + 1 )
-+
-+@ All 10-bit functions would work with 9
-+
-+
-+@ ff_hevc_rpi_pred_planar_4_neon_8
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]
-+@
-+@ Planar prediction of a 4x4 block of bytes.  Two rows are computed per
-+@ q register (one row in each 64-bit half), so no loop is needed.
-+@ The result is narrowed with a rounding shift right by 3.
-+
-+function ff_hevc_rpi_pred_planar_4_neon_8, export=1
-+
-+        vld1.8      {d0}, [r1]          @ Top
-+        adr         ip, nb_3_0_1_4
-+        vld1.8      {d1}, [r2]          @ Left
-+        @ d2 = mask with the low 32 bits set; used by vbif below to merge
-+        @ two replicated left samples into one register
-+        vmov.i64    d2, #0xffffffff
-+        vldr        d3, [ip, #8]        @ {1,2,3,4,1,2,3,4}
-+        add         r1, r0, r3          @ r1 -> row 1 (r0 -> row 0)
-+        vdup.32     d4, d0[0]           @ {t0,t1,t2,t3,t0,t1,t2,t3}
-+        vdup.8      d0, d0[4]           @ {t4,t4,t4,t4,t4,t4,t4,t4}
-+        vdup.8      d5, d1[4]           @ {l4,l4,l4,l4,l4,l4,l4,l4}
-+        vdup.8      d6, d1[0]           @ {l0,l0,l0,l0,l0,l0,l0,l0}
-+        vshll.u8    q8, d4, #2
-+        lsl         r3, #1              @ r3 = 2 rows: r0 and r1 each step 2 rows
-+        vsubl.u8    q2, d5, d4
-+        vmlal.u8    q8, d0, d3
-+        vld1.8      {d0}, [ip]          @ {3,2,1,0,3,2,1,0}
-+        vdup.8      d7, d1[1]           @ {l1,l1,l1,l1,l1,l1,l1,l1}
-+        vshl.s16    q9, q2, #1
-+        vbif        d6, d7, d2          @ {l0,l0,l0,l0,l1,l1,l1,l1}
-+        vadd.i16    d16, d4
-+        vdup.8      d7, d1[2]           @ {l2,l2,l2,l2,l2,l2,l2,l2}
-+        vadd.i16    d17, d18
-+        vdup.8      d1, d1[3]           @ {l3,l3,l3,l3,l3,l3,l3,l3}
-+        @ q8 = rows 0 (low half) and 1 (high half); q2 = rows 2 and 3
-+        vadd.i16    q2, q8, q9
-+        vmlal.u8    q8, d0, d6
-+        vbif        d7, d1, d2          @ {l2,l2,l2,l2,l3,l3,l3,l3}
-+        vmlal.u8    q2, d0, d7
-+        vrshrn.i16  d0, q8, #3
-+        vst1.32     d0[0], [r0 :32], r3 @ row 0
-+        vst1.32     d0[1], [r1 :32], r3 @ row 1
-+        vrshrn.i16  d0, q2, #3
-+        vst1.32     d0[0], [r0 :32]     @ row 2
-+        vst1.32     d0[1], [r1 :32]     @ row 3
-+
-+        bx          lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_planar_4_neon_10
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]
-+@
-+@ Planar prediction of a 4x4 block of 16-bit samples.  All four rows are
-+@ computed without a loop and rounded with a shift right by 3.
-+@ The row pitch in bytes is 2 * stride.
-+
-+function ff_hevc_rpi_pred_planar_4_neon_10, export=1
-+        @ Load from bytes & expand later - at the very least this uses less
-+        @ memory than having a short table
-+        vld1.16     {q0}, [r1 :64]      @ Top
-+        adr         ip, nbh_3_0_1_4
-+        vldr        d2, [r2, #8]        @ Left (lower)
-+        vldr        d3, [ip, #8]        @ {1,2,3,4}
-+        @ A-/T-prefixed lines are alternatives for ARM / Thumb mode
-+        @ (presumably the asm.S mode macros).  Thumb pre-scales r3 here;
-+        @ ARM folds the scaling into the shifted operands further down.
-+T       lsl         r3, #1
-+        vshl.s16    d4, d0, #2
-+        vdup.16     d1, d1[0]           @ {t4,t4,t4,t4}
-+        vldr        d5, [r2]            @ Left (upper)
-+        vdup.16     d2, d2[0]           @ {l4,l4,l4,l4}
-+        vldr        d6, [ip]            @ {3,2,1,0}
-+        vmla.i16    d4, d3, d1          @ Acc set up
-+        vsub.i16    d0, d2, d0          @ Add set up
-+        vmov        d7, d6              @ q3 = {3,2,1,0,3,2,1,0}
-+        vdup.16     d2, d5[0]           @ q1 = {l0 x4, l1 x4}
-+        vdup.16     d3, d5[1]
-+        vdup.16     d16, d5[2]          @ q8 = {l2 x4, l3 x4}
-+        vadd.i16    d18, d0, d4         @ row 0: acc + 1*add
-+        vshl.s16    d0, #1              @ x2
-+        vadd.i16    d19, d0, d4         @ row 1: acc + 2*add
-+        vdup.16     d17, d5[3]
-+        vadd.i16    d4, d0, d18         @ row 2: acc + 3*add
-+        @ r1 -> row 1; afterwards r3 becomes the pitch of two rows in bytes
-+A       add         r1, r0, r3, lsl #1
-+T       add         r1, r0, r3
-+        vadd.i16    d5, d0, d19         @ row 3: acc + 4*add
-+A       lsl         r3, #2
-+T       lsl         r3, #1
-+        vmla.i16    q9, q1, q3
-+        vmla.i16    q2, q8, q3
-+        vrshr.u16   q0, q9, #3
-+        vst1.16     {d0}, [r0], r3      @ row 0
-+        vrshr.u16   d2, d4, #3
-+        vst1.16     {d1}, [r1], r3      @ row 1
-+        vrshr.u16   d3, d5, #3
-+        vst1.16     {d2}, [r0]          @ row 2
-+        vst1.16     {d3}, [r1]          @ row 3
-+
-+        bx         lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_planar_8_neon_8
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]
-+@
-+@ Planar prediction of an 8x8 block of bytes.  The loop produces two
-+@ rows per pass (4 passes); each row is narrowed with a rounding shift
-+@ right by 4 and stored as 8 bytes.
-+
-+function ff_hevc_rpi_pred_planar_8_neon_8, export=1
-+
-+        vld1.8      {q0}, [r1]          @ Top
-+        adr         ip, nb_7_0_1_8
-+        vldr        d2, [r2, #8]        @ Left (lower)
-+        mov         r1, #8              @ r1 reused as the row counter
-+        vldr        d3, [ip, #8]        @ {1,2,3,4,5,6,7,8}
-+        vshll.u8    q2, d0, #3
-+        vdup.8      d1, d1[0]           @ {t8,t8,t8,t8,t8,t8,t8,t8}
-+        vdup.8      d2, d2[0]           @ {l8,l8,l8,l8,l8,l8,l8,l8}
-+        vldr        d6, [r2]            @ Left (upper)
-+        vmlal.u8    q2, d3, d1
-+        vsubl.u8    q0, d2, d0
-+        vldr        d7, [ip]            @ {7,6,5,4,3,2,1,0}
-+
-+@ u8   7..0    [1]  d7
-+@ u8  left[y]  [1]  d6
-+@ u16 acc      [2]  q2 (even rows) or q8 (odd rows) = (x+1)*p[nTbS][-1] + 8*p[x][-1] initially
-+@ u16 add      [2]  q0 = p[-1][nTbs] - p[x][-1]
-+
-+        vdup.8      d2, d6[0]
-+        vadd.i16    q2, q0
-+        vdup.8      d3, d6[1]
-+        vadd.i16    q8, q2, q0
-+1:
-+        vmlal.u8    q2, d7, d2
-+        subs        r1, #2
-+        vadd.i16    q9, q8, q0          @ q9 = acc for the next even row
-+        vmlal.u8    q8, d7, d3
-+        @ Fetch the left samples for the next two rows, then shift d6 down
-+        @ by two bytes so lanes 2 and 3 again hold the following pair
-+        vdup.8      d2, d6[2]
-+        vdup.8      d3, d6[3]
-+        vrshrn.i16  d20, q2, #4
-+        vshr.u64    d6, #16
-+        vmov        q2, q9
-+        vst1.8      {d20}, [r0], r3
-+        vrshrn.i16  d20, q8, #4
-+        vadd.i16    q8, q2, q0
-+        vst1.8      {d20}, [r0], r3
-+        bne         1b
-+
-+        bx          lr
-+
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_planar_8_neon_10
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]
-+@
-+@ Planar prediction of an 8x8 block of 16-bit samples.  Two rows are
-+@ produced per loop pass; the work for the next pair (extra indentation)
-+@ is interleaved with the stores of the current pair.  Each row is rounded
-+@ with a shift right by 4.  The row pitch in bytes is 2 * stride.
-+
-+function ff_hevc_rpi_pred_planar_8_neon_10, export=1
-+
-+        adr         ip, nb_7_0_1_8
-+        vld1.16     {q0}, [r1 :128]!    @ Top (left)
-+        lsl         r3, #1
-+        @ The weight table is stored as bytes; it is widened to 16 bits by
-+        @ the vmovl.u8 instructions below
-+        vld1.16     {q1}, [ip :128]     @ {7,6,5,4,3,2,1,0,1,2,3,4,5,6,7,8}
-+        add         ip, r2, #16
-+        vld1.16     {d4[],d5[]}, [r1]   @ Top (right)
-+        @ r1 reused as the row counter: 6 rows are stored in the loop
-+        @ (3 passes), the last 2 rows after it
-+        mov         r1, #8-2
-+        vshl.s16    q3, q0, #3
-+        vmovl.u8    q8, d3              @ {1,2,3,4,5,6,7,8}
-+        vld1.16     {d18[],d19[]}, [ip] @ Left (lower)
-+        vmla.i16    q3, q8, q2          @ Acc set up
-+        vsub.i16    q0, q9, q0          @ Add set up
-+        vmovl.u8    q1, d2              @ {7,6,5,4,3,2,1,0}
-+        vadd.i16    q2, q3, q0
-+
-+@ u16  7..0        [1]  q1
-+@ u16 left[y]      [1]  [r2]
-+@ u16 acc          [1]  q3 = (x+1)*p[nTbS][-1] + 8*p[x][-1] initially
-+@ u16 add          [1]  q0 = p[-1][nTbs] - p[x][-1]
-+
-+        vld1.16     {d6[],d7[]}, [r2]!
-+        vadd.i16    q8, q2, q0
-+        vld1.16     {d18[],d19[]}, [r2]!
-+        vmla.i16    q2, q1, q3
-+        vadd.i16    q3, q8, q0
-+        vmla.i16    q8, q1, q9
-+1:
-+        vrshr.u16   q9, q2, #4
-+        subs        r1, #2
-+        vmov        q2, q3
-+        vrshr.u16   q10, q8, #4
-+          vld1.16     {d6[],d7[]}, [r2]!
-+        vst1.16     {q9}, [r0 :128], r3
-+          vadd.i16    q8, q2, q0
-+          vld1.16     {d18[],d19[]}, [r2]!
-+          vmla.i16    q2, q1, q3
-+          vadd.i16    q3, q8, q0
-+          vmla.i16    q8, q1, q9
-+        vst1.16     {q10}, [r0 :128], r3
-+        bne         1b
-+
-+        @ Final two rows: r3 becomes the address of the last row
-+        vrshr.u16   q9, q2, #4
-+        add         r3, r0
-+        vrshr.u16   q10, q8, #4
-+        vst1.16     {q9}, [r0 :128]
-+        vst1.16     {q10}, [r3 :128]
-+
-+        bx         lr
-+endfunc
-+
-+
-+@------------------------------------------------------------------------------
-+@
-+@ Data - has to be in two lumps to ensure we can always reach using adr
-+
-+        .balign 64
-+
-+@ Byte weight tables.  Each label names a descending run ending in 0
-+@ followed by an ascending run starting at 1.
-+@ nb_31_0_1_32 and nb_15_0_1_16 share one 64-byte run: nb_15_0_1_16 labels
-+@ the point 16 bytes in, so {15..0} is followed directly by {1..16} (and
-+@ then {17..32}, which completes the ascending half for nb_31_0_1_32).
-+nb_31_0_1_32:
-+        .byte   31, 30, 29, 28, 27, 26, 25, 24
-+        .byte   23, 22, 21, 20, 19, 18, 17, 16
-+nb_15_0_1_16:
-+        .byte   15, 14, 13, 12, 11, 10,  9,  8
-+        .byte    7,  6,  5,  4,  3,  2,  1,  0
-+        .byte    1,  2,  3,  4,  5,  6,  7,  8
-+        .byte    9, 10, 11, 12, 13, 14, 15, 16
-+        .byte   17, 18, 19, 20, 21, 22, 23, 24
-+        .byte   25, 26, 27, 28, 29, 30, 31, 32
-+
-+        @ should be back on a 64-byte boundary here
-+
-+        @ These could be extracted from the above array, but are separated
-+        @ out for better (16 byte) alignment
-+nb_3_0_1_4:
-+        .byte    3,  2,  1,  0,  3,  2,  1,  0
-+        .byte    1,  2,  3,  4,  1,  2,  3,  4
-+nb_7_0_1_8:
-+        .byte    7,  6,  5,  4,  3,  2,  1,  0
-+        .byte    1,  2,  3,  4,  5,  6,  7,  8
-+        @ 16-bit variant of the 4-wide table: {3..0} then {1..4}
-+nbh_3_0_1_4:
-+        .short   3,  2,  1,  0,  1,  2,  3,  4
-+
-+@------------------------------------------------------------------------------
-+
-+
-+@ ff_hevc_rpi_pred_planar_16_neon_8
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]
-+@
-+@ Planar prediction of a 16x16 block of bytes.  The loop produces two
-+@ rows per pass (8 passes); each row is narrowed with a rounding shift
-+@ right by 5 and stored as 16 bytes.
-+
-+function ff_hevc_rpi_pred_planar_16_neon_8, export=1
-+
-+        @ ip first points at the ascending half of the table, then is moved
-+        @ back 16 bytes to the descending half
-+        adr         ip, nb_15_0_1_16 + 16
-+        vld1.8      {q0}, [r1 :128]!    @ Top (left)
-+        @ r2 is advanced to left[16] for the single "lower" sample and
-+        @ restored afterwards
-+        add         r2, #16
-+        vld1.8      {q1}, [ip: 128]     @ {1,2,3...16}
-+        vld1.8      {d4[]}, [r1]        @ Top (right)
-+        sub         ip, #16
-+        vshll.u8    q3, d0, #4
-+        mov         r1, #16             @ r1 reused as the row counter
-+        vshll.u8    q8, d1, #4
-+        vld1.8      {d5[]}, [r2]        @ Left (lower)
-+        sub         r2, #16
-+        vmlal.u8    q3, d2, d4
-+        vmlal.u8    q8, d3, d4          @ Acc set up
-+        vsubl.u8    q1, d5, d0
-+        vsubl.u8    q0, d5, d1          @ Add set up
-+        vld1.8      {q2}, [ip :128]     @ {15,14,13...0}
-+
-+@ u8  15..0    [1]  q2
-+@ u8  left[y]  [1]  [r2]
-+@ u16 acc      [2]  q3,q8 = (x+1)*p[nTbS][-1] + 16*p[x][-1] initially
-+@ u16 add      [2]  q1,q0 = p[-1][nTbs] - p[x][-1]
-+
-+        vadd.i16    q3, q1
-+        vadd.i16    q8, q0
-+1:
-+        @ q3,q8 = current row; q10,q11 = next row; q12,q13 = row after that
-+        vadd.i16    q10, q3, q1
-+        subs        r1, #2
-+        vld1.8      {d18[]}, [r2]!
-+        vadd.i16    q11, q8, q0
-+        vld1.8      {d19[]}, [r2]!
-+        vmlal.u8    q3, d4, d18
-+        vmlal.u8    q8, d5, d18
-+        vadd.i16    q12, q10, q1
-+        vmlal.u8    q10, d4, d19
-+        vadd.i16    q13, q11, q0
-+        vmlal.u8    q11, d5, d19
-+        vrshrn.u16  d18, q3, #5
-+        vrshrn.u16  d19, q8, #5
-+        vmov        q3, q12
-+        vst1.8      {q9}, [r0 :128], r3
-+        vrshrn.u16  d18, q10, #5
-+        vrshrn.u16  d19, q11, #5
-+        vmov        q8, q13
-+        vst1.8      {q9}, [r0 :128], r3
-+        bne         1b
-+
-+        bx          lr
-+
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_planar_16_neon_10
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]
-+@
-+@ Planar prediction of a 16x16 block of 16-bit samples.  The loop
-+@ produces two rows per pass (8 passes); each row is rounded with a shift
-+@ right by 5 and stored as 32 bytes.  The row pitch in bytes is 2 * stride.
-+
-+function ff_hevc_rpi_pred_planar_16_neon_10, export=1
-+
-+        @ Load from bytes & expand later - at the very least this uses less
-+        @ memory than having a short table
-+        adr         ip, nb_15_0_1_16 + 16
-+        vld1.16     {q0-q1}, [r1 :128]! @ Top (left)
-+        @ r2 is advanced 32 bytes to left[16] for the single "lower" sample
-+        @ and restored afterwards
-+        add         r2, #32
-+        vld1.8      {q2}, [ip :128]     @ {1,2,3...16}
-+        lsl         r3, #1
-+        vld1.16     {d6[],d7[]}, [r1]   @ Top (right)
-+        sub         ip, #16
-+        vmovl.u8    q8, d4
-+        mov         r1, #16             @ r1 reused as the row counter
-+        vshl.i16    q9, q0, #4
-+        vmovl.u8    q2, d5
-+        vshl.i16    q10, q1, #4
-+        vld1.16     {d22[],d23[]}, [r2] @ Left (lower)
-+        sub         r2, #32
-+        vld1.8      {q12}, [ip]         @ {15,14,13...0}
-+        vmla.i16    q9, q8, q3
-+        vmla.i16    q10, q2, q3         @ Acc set up
-+        vsub.i16    q0, q11, q0
-+        vsub.i16    q1, q11, q1         @ Add set up
-+        vadd.i16    q2, q9, q0
-+        vadd.i16    q3, q10, q1
-+        vmovl.u8    q8, d24
-+        vmovl.u8    q9, d25
-+
-+@ u16  15..0       [2]  q8,q9
-+@ u16 left[y]      [2]  [r2]
-+@ u16 acc          [2]  q2,q3 = (x+1)*p[nTbS][-1] + 16*p[x][-1] initially
-+@ u16 add          [2]  q0,q1 = p[-1][nTbs] - p[x][-1]
-+
-+1:
-+        @ q2,q3 = current row; q10,q11 = next row; q12,q13 = row after that.
-+        @ q12 first carries left[y] and is then overwritten with the
-+        @ accumulator for the following pass.
-+        vadd.i16    q10, q2, q0
-+        subs        r1, #2
-+        vld1.16     {d24[],d25[]}, [r2]!
-+        vadd.i16    q11, q3, q1
-+        vld1.16     {d28[],d29[]}, [r2]!
-+        vmla.i16    q2, q8, q12
-+        vmla.i16    q3, q9, q12
-+        vadd.i16    q12, q10, q0
-+        vmla.i16    q10, q8, q14
-+        vadd.i16    q13, q11, q1
-+        vmla.i16    q11, q9, q14
-+        vrshr.u16   q14, q2, #5
-+        vrshr.u16   q15, q3, #5
-+        vmov        q2, q12
-+        vst1.16     {q14-q15}, [r0 :128], r3
-+        vrshr.u16   q14, q10, #5
-+        vrshr.u16   q15, q11, #5
-+        vmov        q3, q13
-+        vst1.16     {q14-q15}, [r0 :128], r3
-+        bne         1b
-+
-+        bx         lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_planar_32_neon_8
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]
-+@
-+@ Planar prediction of a 32x32 block of bytes.  Rows alternate between
-+@ two accumulator sets (q8-q11 and q0-q3); each row is narrowed with a
-+@ rounding shift right by 6 and stored as 32 bytes.  The loop is entered
-+@ in the middle (label 2); the last row is stored after the loop.
-+
-+function ff_hevc_rpi_pred_planar_32_neon_8, export=1
-+
-+        vld1.8      {q0-q1}, [r1 :128]! @ Top (left)
-+        adr         ip, nb_31_0_1_32 + 32
-+        @ d8-d12 are used as scratch below; they are saved here and
-+        @ restored by the vpop before returning
-+        vpush       {d8-d12}
-+        vld1.8      {q2-q3}, [ip :128]  @ {1,2,3...32}
-+        @ r2 is advanced to left[32] for the single "lower" sample and
-+        @ restored afterwards
-+        add         r2, #32
-+        vld1.8      {d8[]}, [r1]        @ Top (right)
-+        sub         ip, #32
-+        vshll.u8    q8, d0, #5
-+        mov         r1, #32             @ r1 reused as the row counter
-+        vld1.8      {d9[]}, [r2]        @ Left (lower)
-+        sub         r2, #32
-+        vshll.u8    q9, d1, #5
-+        vshll.u8    q10, d2, #5
-+        vshll.u8    q11, d3, #5
-+        vmlal.u8    q8, d4, d8
-+        vsubl.u8    q12, d9, d0
-+        vmlal.u8    q9, d5, d8
-+        vsubl.u8    q13, d9, d1
-+        vmlal.u8    q10, d6, d8
-+        vsubl.u8    q14, d9, d2
-+        vmlal.u8    q11, d7, d8         @ Acc set up
-+        vsubl.u8    q15, d9, d3         @ Add set up
-+        vadd.i16    q8, q12
-+        vadd.i16    q9, q13
-+        vadd.i16    q10, q14
-+        vadd.i16    q11, q15
-+        vld1.8      {q4-q5}, [ip :128]  @ {31,30,29...0}
-+
-+@ u8  31..0    [2]  q4,q5
-+@ u8  left[y]  [2]  [r2]
-+@ u16 acc      [4]  q8-q11  = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially
-+@ u16 add      [4]  q12-q15 = p[-1][nTbs] - p[x][-1]
-+
-+        vld1.8      {d12[]}, [r2]!
-+        vadd.i16    q0, q8, q12
-+        b           2f
-+1:
-+        @ Narrow and store the row held in q0-q3 while starting on the
-+        @ accumulators for the row after next
-+          vld1.8      {d12[]}, [r2]!
-+        vrshrn.u16  d3, q1, #6
-+        vrshrn.u16  d2, q0, #6
-+          vadd.i16    q0, q8, q12
-+        vrshrn.u16  d4, q2, #6
-+        vrshrn.u16  d5, q3, #6
-+        vst1.8      {q1-q2}, [r0 :128], r3
-+        @ Loop entry point: finish, narrow and store the row held in q8-q11
-+2:        vadd.i16    q1, q9, q13
-+          subs        r1, #2
-+          vadd.i16    q2, q10, q14
-+          vadd.i16    q3, q11, q15
-+          vmlal.u8    q8, d8, d12
-+          vmlal.u8    q9, d9, d12
-+          vmlal.u8    q10, d10, d12
-+          vmlal.u8    q11, d11, d12
-+            vld1.8      {d12[]}, [r2]!
-+          vrshrn.u16  d19, q9, #6
-+          vrshrn.u16  d18, q8, #6
-+            vadd.i16    q8, q0, q12
-+          vrshrn.u16  d20, q10, #6
-+          vrshrn.u16  d21, q11, #6
-+          vst1.8      {q9-q10}, [r0 :128], r3
-+            vadd.i16    q9, q1, q13
-+            vadd.i16    q10, q2, q14
-+            vadd.i16    q11, q3, q15
-+            vmlal.u8    q0, d8, d12
-+            vmlal.u8    q1, d9, d12
-+            vmlal.u8    q2, d10, d12
-+            vmlal.u8    q3, d11, d12
-+
-+        bne         1b
-+
-+        @ Only q0-q3 are needed from here on, so the saved registers can be
-+        @ restored before the final row is narrowed and stored
-+        vpop        {d8-d12}
-+
-+        vrshrn.u16  d3, q1, #6
-+        vrshrn.u16  d2, q0, #6
-+        vrshrn.u16  d4, q2, #6
-+        vrshrn.u16  d5, q3, #6
-+        vst1.8      {q1-q2}, [r0 :128]
-+
-+        bx          lr
-+
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_planar_32_neon_10
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]
-+@
-+@ Planar prediction of a 32x32 block of 16-bit samples.  The loop
-+@ produces two rows per pass (16 passes), alternating between the
-+@ accumulator sets q8-q11 and q12-q15; each row is rounded with a shift
-+@ right by 6 and stored as 64 bytes.  The row pitch in bytes is 2 * stride.
-+
-+function ff_hevc_rpi_pred_planar_32_neon_10, export=1
-+
-+        @ Load from bytes & expand later - at the very least this uses less
-+        @ memory than having a short table
-+        vld1.16     {q0-q1}, [r1 :128]!  @ Top (left)
-+        adr         ip, nb_31_0_1_32 + 32
-+        @ q4-q7 are used as scratch below; they are saved here and restored
-+        @ by the vpop before returning
-+        vpush       {q4-q7}
-+        vld1.16     {q2-q3}, [r1 :128]!  @ Top (centre)
-+        @ r2 is advanced 64 bytes to left[32] for the single "lower" sample
-+        @ and restored afterwards
-+        add         r2, #64
-+        vld1.8      {q14-q15}, [ip :128] @ {1,2,3...32}
-+        @ A-/T-prefixed lines are alternatives for ARM / Thumb mode
-+        @ (presumably the asm.S mode macros).  Thumb pre-scales r3 to bytes
-+        @ here; ARM applies "lsl #1" at each use instead.
-+T       lsl         r3, #1
-+        vld1.16     {d8[],d9[]}, [r1]    @ Top (right)
-+        sub         ip, #32
-+        vmovl.u8    q12, d28
-+        mov         r1, #32              @ r1 reused as the row counter
-+        vmovl.u8    q13, d29
-+        vld1.8      {q6-q7}, [ip :128]   @ {31,30,29...0}
-+        vmovl.u8    q14, d30
-+        vmovl.u8    q15, d31
-+        vld1.16     {d10[],d11[]}, [r2]  @ Left (lower)
-+        sub         r2, #64
-+        vshl.i16    q8, q0, #5
-+        vshl.i16    q9, q1, #5
-+        vshl.i16    q10, q2, #5
-+        vshl.i16    q11, q3, #5
-+        vmla.i16    q8, q12, q4
-+        vsub.i16    q0, q5, q0
-+        vmla.i16    q9, q13, q4
-+        vsub.i16    q1, q5, q1
-+        vmla.i16    q10, q14, q4
-+        @ Keep a copy of add[0] in ip: lane d0[0] is borrowed inside the
-+        @ loop to hold left[y] and restored from ip after each use
-+        vmov.u16    ip, d0[0]
-+        vsub.i16    q2, q5, q2
-+        vmla.i16    q11, q15, q4         @ Acc set up
-+        vsub.i16    q3, q5, q3           @ Add set up
-+        vadd.i16    q8, q0
-+        vadd.i16    q9, q1
-+        vadd.i16    q10, q2
-+        vadd.i16    q11, q3
-+        vmovl.u8    q4, d12
-+        vmovl.u8    q5, d13
-+        vmovl.u8    q6, d14
-+        vmovl.u8    q7, d15
-+
-+@ u16 31..0    [4]  q4-q7
-+@ u16 left[y]  [4]  [r2]
-+@ u16 acc      [4]  q8-q11 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially
-+@ u16 add      [4]  q0-q3  = p[-1][nTbs] - p[x][-1]
-+
-+        vadd.i16    q12, q8, q0
-+        @ Step r0 back one row: the loop advances r0 before each store
-+A       sub         r0, r0, r3, lsl #1
-+T       sub         r0, r3
-+1:
-+        vld1.16     {d0[0]}, [r2]!       @ d0[0] = left[y] (temporarily)
-+A       add         r0, r0, r3, lsl #1
-+T       add         r0, r3
-+        vadd.i16    q13, q9, q1
-+        subs        r1, #2
-+        vadd.i16    q14, q10, q2
-+        vadd.i16    q15, q11, q3
-+        vmla.i16    q8, q4, d0[0]
-+        vmla.i16    q9, q5, d0[0]
-+        vmla.i16    q10, q6, d0[0]
-+        vmla.i16    q11, q7, d0[0]
-+        vmov.16     d0[0], ip            @ restore add[0]
-+        vrshr.u16   q8, #6
-+        vrshr.u16   q9, #6
-+        vrshr.u16   q10, #6
-+        vrshr.u16   q11, #6
-+        vstm        r0, {q8-q11}
-+        vadd.i16    q8, q12, q0
-+A       add         r0, r0, r3, lsl #1
-+T       add         r0, r3
-+        vld1.16     {d0[0]}, [r2]!       @ d0[0] = left[y+1] (temporarily)
-+        vadd.i16    q9, q13, q1
-+        vadd.i16    q10, q14, q2
-+        vadd.i16    q11, q15, q3
-+        vmla.i16    q12, q4, d0[0]
-+        vmla.i16    q13, q5, d0[0]
-+        vmla.i16    q14, q6, d0[0]
-+        vmla.i16    q15, q7, d0[0]
-+        vmov.16     d0[0], ip            @ restore add[0]
-+        vrshr.u16   q12, #6
-+        vrshr.u16   q13, #6
-+        vrshr.u16   q14, #6
-+        vrshr.u16   q15, #6
-+        vstm        r0, {q12-q15}
-+        vadd.i16    q12, q8, q0
-+        bne         1b
-+
-+        vpop        {q4-q7}
-+        bx          lr
-+
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_planar_c_4_neon_8
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]
-+@
-+@ Planar prediction of a block 4 elements wide and 4 rows high in which
-+@ every element is a pair of bytes (presumably interleaved U/V chroma -
-+@ TODO confirm).  Each weight is applied to both bytes of a pair.  The
-+@ loop produces two rows per pass (2 passes); each row is narrowed with a
-+@ rounding shift right by 3 and stored as 8 bytes.  The row pitch in bytes
-+@ is 2 * stride.
-+
-+function ff_hevc_rpi_pred_planar_c_4_neon_8, export=1
-+
-+        vld1.8      {q0}, [r1]          @ Top
-+        adr         ip, nbx2_3_0_1_4
-+        vldr        d2, [r2, #8]        @ Left (lower)
-+        mov         r1, #4              @ r1 reused as the row counter
-+        vldr        d3, [ip, #8]        @ {1,1,2,2,3,3,4,4}
-+        lsl         r3, #1
-+        vshll.u8    q2, d0, #2
-+        vdup.16     d1, d1[0]           @ {t4,t4,t4,t4,t4,t4,t4,t4}
-+        vdup.16     d2, d2[0]           @ {l4,l4,l4,l4,l4,l4,l4,l4}
-+        vldr        d6, [r2]            @ Left (upper)
-+        vmlal.u8    q2, d3, d1
-+        vsubl.u8    q0, d2, d0
-+        vldr        d7, [ip]            @ {3,3,2,2,1,1,0,0}
-+
-+@ u8   3..0    [1]  d7
-+@ u8  left[y]  [1]  d6
-+@ u16 acc      [2]  q2 (even rows) or q8 (odd rows) = (x+1)*p[nTbS][-1] + 4*p[x][-1] initially
-+@ u16 add      [2]  q0 = p[-1][nTbs] - p[x][-1]
-+
-+        vdup.16     d2, d6[0]
-+        vadd.i16    q2, q0
-+        vdup.16     d3, d6[1]
-+        vadd.i16    q8, q2, q0
-+1:
-+        vmlal.u8    q2, d7, d2
-+        subs        r1, #2
-+        vadd.i16    q9, q8, q0          @ q9 = acc for the next even row
-+        vmlal.u8    q8, d7, d3
-+        @ Left pairs for rows 2 and 3, consumed by the second pass
-+        vdup.16     d2, d6[2]
-+        vdup.16     d3, d6[3]
-+        vrshrn.i16  d20, q2, #3
-+        vmov        q2, q9
-+        vst1.8      {d20}, [r0], r3
-+        vrshrn.i16  d20, q8, #3
-+        vadd.i16    q8, q2, q0
-+        vst1.8      {d20}, [r0], r3
-+        bne         1b
-+
-+        bx          lr
-+
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_planar_c_4_neon_10
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]
-+@
-+@ Planar prediction of a block 4 elements wide and 4 rows high in which
-+@ every element is a pair of 16-bit samples (presumably interleaved U/V
-+@ chroma - TODO confirm).  Fully unrolled: rows 0-1 are stored while rows
-+@ 2-3 (extra indentation) are being computed.  Each row is rounded with a
-+@ shift right by 3 and stored as 16 bytes.  The row pitch in bytes is
-+@ 4 * stride.
-+
-+function ff_hevc_rpi_pred_planar_c_4_neon_10, export=1
-+
-+        adr         ip, nbx2_3_0_1_4
-+        vld1.16     {q0}, [r1 :128]!    @ Top (left)
-+        lsl         r3, #2
-+        @ The weight table is stored as bytes; it is widened to 16 bits by
-+        @ the vmovl.u8 instructions below
-+        vld1.16     {q1}, [ip :128]     @ {3,3,2,2,1,1,0,0,1,1,2,2,3,3,4,4}
-+        add         ip, r2, #16
-+        vld1.32     {d4[],d5[]}, [r1]   @ Top (right)
-+        vshl.s16    q3, q0, #2
-+        vmovl.u8    q8, d3              @ {1,1,2,2,3,3,4,4}
-+        vld1.32     {d18[],d19[]}, [ip] @ Left (lower)
-+        vmla.i16    q3, q8, q2          @ Acc set up
-+        vsub.i16    q0, q9, q0          @ Add set up
-+        vmovl.u8    q1, d2              @ {3,3,2,2,1,1,0,0}
-+        vadd.i16    q2, q3, q0
-+
-+@ u16  3..0        [1]  q1
-+@ u32 left[y]      [1]  [r2]
-+@ u16 acc          [1]  q3 = (x+1)*p[nTbS][-1] + 4*p[x][-1] initially
-+@ u16 add          [1]  q0 = p[-1][nTbs] - p[x][-1]
-+
-+        vld1.32     {d6[],d7[]}, [r2]!
-+        vadd.i16    q8, q2, q0
-+        vld1.32     {d18[],d19[]}, [r2]!
-+        vmla.i16    q2, q1, q3
-+        vadd.i16    q3, q8, q0
-+        vmla.i16    q8, q1, q9
-+
-+        vrshr.u16   q9, q2, #3
-+        vmov        q2, q3
-+        vrshr.u16   q10, q8, #3
-+          vld1.32     {d6[],d7[]}, [r2]!
-+        vst1.16     {q9}, [r0 :128], r3
-+          vadd.i16    q8, q2, q0
-+          vld1.32     {d18[],d19[]}, [r2]!
-+          vmla.i16    q2, q1, q3
-+          vadd.i16    q3, q8, q0
-+          vmla.i16    q8, q1, q9
-+        vst1.16     {q10}, [r0 :128], r3
-+
-+          @ Final two rows: r3 becomes the address of the last row
-+          vrshr.u16   q9, q2, #3
-+          add         r3, r0
-+          vrshr.u16   q10, q8, #3
-+          vst1.16     {q9}, [r0 :128]
-+          vst1.16     {q10}, [r3 :128]
-+
-+          bx         lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_planar_c_8_neon_8
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]
-+@
-+@ Planar prediction of a block 8 elements wide and 8 rows high in which
-+@ every element is a pair of bytes (presumably interleaved U/V chroma -
-+@ TODO confirm).  The loop produces two rows per pass (4 passes); each
-+@ row is narrowed with a rounding shift right by 4 and stored as 16 bytes.
-+@ The row pitch in bytes is 2 * stride.
-+
-+function ff_hevc_rpi_pred_planar_c_8_neon_8, export=1
-+
-+        @ ip first points at the ascending half of the table, then is moved
-+        @ back 16 bytes to the descending half
-+        adr         ip, nbx2_7_0_1_8 + 16
-+        vld1.8      {q0}, [r1 :128]!    @ Top (left)
-+        @ r2 is advanced 16 bytes to left[8] for the single "lower" pair and
-+        @ restored afterwards
-+        add         r2, #16
-+        vld1.8      {q1}, [ip: 128]     @ {1,1,2,2,3,3...8,8}
-+        lsl         r3, #1
-+        vld1.16     {d4[]}, [r1]        @ Top (right)
-+        sub         ip, #16
-+        vshll.u8    q3, d0, #3
-+        mov         r1, #8              @ r1 reused as the row counter
-+        vshll.u8    q8, d1, #3
-+        vld1.16     {d5[]}, [r2]        @ Left (lower)
-+        sub         r2, #16
-+        vmlal.u8    q3, d2, d4
-+        vmlal.u8    q8, d3, d4          @ Acc set up
-+        vsubl.u8    q1, d5, d0
-+        vsubl.u8    q0, d5, d1          @ Add set up
-+        vld1.8      {q2}, [ip :128]     @ {7,7,6,6,5,5...0,0}
-+
-+@ u8  7..0     [1]  q2
-+@ u8  left[y]  [1]  [r2]
-+@ u16 acc      [2]  q3,q8 = (x+1)*p[nTbS][-1] + 8*p[x][-1] initially
-+@ u16 add      [2]  q1,q0 = p[-1][nTbs] - p[x][-1]
-+
-+        vadd.i16    q3, q1
-+        vadd.i16    q8, q0
-+1:
-+        @ q3,q8 = current row; q10,q11 = next row; q12,q13 = row after that
-+        vadd.i16    q10, q3, q1
-+        subs        r1, #2
-+        vld1.16     {d18[]}, [r2]!
-+        vadd.i16    q11, q8, q0
-+        vld1.16     {d19[]}, [r2]!
-+        vmlal.u8    q3, d4, d18
-+        vmlal.u8    q8, d5, d18
-+        vadd.i16    q12, q10, q1
-+        vmlal.u8    q10, d4, d19
-+        vadd.i16    q13, q11, q0
-+        vmlal.u8    q11, d5, d19
-+        vrshrn.u16  d18, q3, #4
-+        vrshrn.u16  d19, q8, #4
-+        vmov        q3, q12
-+        vst1.8      {q9}, [r0 :128], r3
-+        vrshrn.u16  d18, q10, #4
-+        vrshrn.u16  d19, q11, #4
-+        vmov        q8, q13
-+        vst1.8      {q9}, [r0 :128], r3
-+        bne         1b
-+
-+        bx          lr
-+
-+endfunc
-+
-+
-+@------------------------------------------------------------------------------
-+@
-+@ Data - has to be in two lumps to ensure we can always reach using adr
-+
-+        .balign 64
-+
-+@ Byte weight tables with every weight stored twice in succession.
-+@ nbx2_15_0_1_16 and nbx2_7_0_1_8 share one 64-byte run: nbx2_7_0_1_8 labels
-+@ the point 16 bytes in, so {7,7..0,0} is followed directly by {1,1..8,8}
-+@ (and then {9,9..16,16}, which completes the ascending half for
-+@ nbx2_15_0_1_16).
-+nbx2_15_0_1_16:
-+        .byte   15, 15, 14, 14, 13, 13, 12, 12
-+        .byte   11, 11, 10, 10,  9,  9,  8,  8
-+nbx2_7_0_1_8:
-+        .byte    7,  7,  6,  6,  5,  5,  4,  4
-+        .byte    3,  3,  2,  2,  1,  1,  0,  0
-+        .byte    1,  1,  2,  2,  3,  3,  4,  4
-+        .byte    5,  5,  6,  6,  7,  7,  8,  8
-+        .byte    9,  9, 10, 10, 11, 11, 12, 12
-+        .byte   13, 13, 14, 14, 15, 15, 16, 16
-+
-+        @ should be back on a 64-byte boundary here
-+
-+nbx2_3_0_1_4:
-+        .byte    3,  3,  2,  2,  1,  1,  0,  0
-+        .byte    1,  1,  2,  2,  3,  3,  4,  4
-+
-+@------------------------------------------------------------------------------
-+
-+
-+@ ff_hevc_rpi_pred_planar_c_8_neon_10
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]
-+@
-+@ Planar prediction of a block 8 elements wide and 8 rows high in which
-+@ every element is a pair of 16-bit samples (presumably interleaved U/V
-+@ chroma - TODO confirm).  The loop produces two rows per pass (4 passes);
-+@ each row is rounded with a shift right by 4 and stored as 32 bytes.
-+@ The row pitch in bytes is 4 * stride.
-+
-+function ff_hevc_rpi_pred_planar_c_8_neon_10, export=1
-+
-+        @ Load from bytes & expand later - at the very least this uses less
-+        @ memory than having a short table
-+        adr         ip, nbx2_7_0_1_8 + 16
-+        vld1.16     {q0-q1}, [r1 :128]! @ Top (left)
-+        @ r2 is advanced 32 bytes to left[8] for the single "lower" pair and
-+        @ restored afterwards
-+        add         r2, #32
-+        vld1.8      {q2}, [ip :128]     @ {1,1,2,2,3,3...8,8}
-+        lsl         r3, #2
-+        vld1.32     {d6[],d7[]}, [r1]   @ Top (right)
-+        sub         ip, #16
-+        vmovl.u8    q8, d4
-+        mov         r1, #8              @ r1 reused as the row counter
-+        vshl.i16    q9, q0, #3
-+        vmovl.u8    q2, d5
-+        vshl.i16    q10, q1, #3
-+        vld1.32     {d22[],d23[]}, [r2] @ Left (lower)
-+        sub         r2, #32
-+        vld1.8      {q12}, [ip]         @ {7,7,6,6,5,5...0,0}
-+        vmla.i16    q9, q8, q3
-+        vmla.i16    q10, q2, q3         @ Acc set up
-+        vsub.i16    q0, q11, q0
-+        vsub.i16    q1, q11, q1         @ Add set up
-+        vadd.i16    q2, q9, q0
-+        vadd.i16    q3, q10, q1
-+        vmovl.u8    q8, d24
-+        vmovl.u8    q9, d25
-+
-+@ u16  7..0        [2]  q8,q9
-+@ u32 left[y]      [2]  [r2]
-+@ u16 acc          [2]  q2,q3 = (x+1)*p[nTbS][-1] + 8*p[x][-1] initially
-+@ u16 add          [2]  q0,q1 = p[-1][nTbs] - p[x][-1]
-+
-+1:
-+        @ q2,q3 = current row; q10,q11 = next row; q12,q13 = row after that.
-+        @ q12 first carries left[y] and is then overwritten with the
-+        @ accumulator for the following pass.
-+        vadd.i16    q10, q2, q0
-+        subs        r1, #2
-+        vld1.32     {d24[],d25[]}, [r2]!
-+        vadd.i16    q11, q3, q1
-+        vld1.32     {d28[],d29[]}, [r2]!
-+        vmla.i16    q2, q8, q12
-+        vmla.i16    q3, q9, q12
-+        vadd.i16    q12, q10, q0
-+        vmla.i16    q10, q8, q14
-+        vadd.i16    q13, q11, q1
-+        vmla.i16    q11, q9, q14
-+        vrshr.u16   q14, q2, #4
-+        vrshr.u16   q15, q3, #4
-+        vmov        q2, q12
-+        vst1.16     {q14-q15}, [r0 :128], r3
-+        vrshr.u16   q14, q10, #4
-+        vrshr.u16   q15, q11, #4
-+        vmov        q3, q13
-+        vst1.16     {q14-q15}, [r0 :128], r3
-+        bne         1b
-+
-+        bx         lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_planar_c_16_neon_8
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]
-+
-+function ff_hevc_rpi_pred_planar_c_16_neon_8, export=1
-+
-+        vld1.8      {q0-q1}, [r1 :128]! @ Top (left)
-+        adr         ip, nbx2_15_0_1_16 + 32     @ ip -> table entry 32 bytes past the label
-+        vpush       {d8-d12}                    @ d8-d12 are overwritten below; restored by the vpop after the loop
-+        vld1.8      {q2-q3}, [ip :128]  @ {1,1,2,2,3,3...16,16}
-+        add         r2, #32                     @ r2 -> 32 bytes past the start of left[]
-+        vld1.16     {d8[]}, [r1]        @ Top (right)
-+        sub         ip, #32                     @ ip -> table at the label itself
-+        vshll.u8    q8, d0, #4                  @ 16 * Top bytes 0-7, widened to u16
-+        mov         r1, #16                     @ r1 is reused as the row counter
-+        vld1.16     {d9[]}, [r2]        @ Left (lower)
-+        sub         r2, #32                     @ r2 back to the start of left[]
-+        vshll.u8    q9, d1, #4                  @ 16 * Top bytes 8-15
-+        lsl         r3, #1                      @ r3 = 2 * stride, used as the post-increment of each row store
-+        vshll.u8    q10, d2, #4                 @ 16 * Top bytes 16-23
-+        vshll.u8    q11, d3, #4                 @ 16 * Top bytes 24-31
-+        vmlal.u8    q8, d4, d8
-+        vsubl.u8    q12, d9, d0
-+        vmlal.u8    q9, d5, d8
-+        vsubl.u8    q13, d9, d1
-+        vmlal.u8    q10, d6, d8
-+        vsubl.u8    q14, d9, d2
-+        vmlal.u8    q11, d7, d8         @ Acc set up
-+        vsubl.u8    q15, d9, d3         @ Add set up
-+        vadd.i16    q8, q12
-+        vadd.i16    q9, q13
-+        vadd.i16    q10, q14
-+        vadd.i16    q11, q15
-+        vld1.8      {q4-q5}, [ip :128]  @ {15,15,14,14,13,13...0,0}
-+
-+@ u8  15..0    [2]  q4,q5
-+@ u8  left[y]  [2]  [r2]
-+@ u16 acc      [4]  q8-q11  = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially
-+@ u16 add      [4]  q12-q15 = p[-1][nTbs] - p[x][-1]
-+
-+        vld1.16     {d12[]}, [r2]!              @ Broadcast the next 2 bytes of left[] to every u16 lane of d12
-+        vadd.i16    q0, q8, q12
-+        b           2f                          @ Enter the loop part-way: there is no previous row to store yet
-+1:
-+          vld1.16     {d12[]}, [r2]!
-+        vrshrn.u16  d3, q1, #5
-+        vrshrn.u16  d2, q0, #5
-+          vadd.i16    q0, q8, q12
-+        vrshrn.u16  d4, q2, #5
-+        vrshrn.u16  d5, q3, #5
-+        vst1.8      {q1-q2}, [r0 :128], r3      @ Store the row finished by the previous iteration
-+2:        vadd.i16    q1, q9, q13
-+          subs        r1, #2                    @ Two rows are produced per iteration
-+          vadd.i16    q2, q10, q14
-+          vadd.i16    q3, q11, q15
-+          vmlal.u8    q8, d8, d12
-+          vmlal.u8    q9, d9, d12
-+          vmlal.u8    q10, d10, d12
-+          vmlal.u8    q11, d11, d12
-+            vld1.16     {d12[]}, [r2]!
-+          vrshrn.u16  d19, q9, #5
-+          vrshrn.u16  d18, q8, #5
-+            vadd.i16    q8, q0, q12
-+          vrshrn.u16  d20, q10, #5
-+          vrshrn.u16  d21, q11, #5
-+          vst1.8      {q9-q10}, [r0 :128], r3
-+            vadd.i16    q9, q1, q13
-+            vadd.i16    q10, q2, q14
-+            vadd.i16    q11, q3, q15
-+            vmlal.u8    q0, d8, d12
-+            vmlal.u8    q1, d9, d12
-+            vmlal.u8    q2, d10, d12
-+            vmlal.u8    q3, d11, d12
-+
-+        bne         1b                          @ Flags are those set by the subs above
-+
-+        vpop        {d8-d12}                    @ Restore the registers saved on entry
-+
-+        vrshrn.u16  d3, q1, #5
-+        vrshrn.u16  d2, q0, #5
-+        vrshrn.u16  d4, q2, #5
-+        vrshrn.u16  d5, q3, #5
-+        vst1.8      {q1-q2}, [r0 :128]          @ Final row; no post-increment needed
-+
-+        bx          lr
-+
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_planar_c_16_neon_10
-+@       uint8_t *_src,          [r0]
-+@       const uint8_t *_top,    [r1]
-+@       const uint8_t *_left,   [r2]
-+@       ptrdiff_t stride)       [r3]
-+
-+function ff_hevc_rpi_pred_planar_c_16_neon_10, export=1
-+
-+        @ Load from bytes & expand later - at the very least this uses less
-+        @ memory than having a short table
-+        vld1.16     {q0-q1}, [r1 :128]!  @ Top (left)
-+        adr         ip, nbx2_15_0_1_16 + 32      @ ip -> table entry 32 bytes past the label
-+        vpush       {q4-q7}                      @ q4-q7 are overwritten below; restored by the vpop at the end
-+        vld1.16     {q2-q3}, [r1 :128]!  @ Top (centre)
-+        add         r2, #64                      @ r2 -> 64 bytes past the start of left[]
-+        vld1.8      {q14-q15}, [ip :128] @ {1,1,2,2,3,3...16,16}
-+T       lsl         r3, #2                       @ T-prefixed lines: presumably Thumb-only (A = ARM-only); pre-scales stride by 4
-+        vld1.32     {d8[],d9[]}, [r1]    @ Top (right)
-+        sub         ip, #32                      @ ip -> table at the label itself
-+        vmovl.u8    q12, d28
-+        mov         r1, #16                      @ r1 is reused as the row counter
-+        vmovl.u8    q13, d29
-+        vld1.8      {q6-q7}, [ip :128]   @ {15,15,14,14,13,13...0,0}
-+        vmovl.u8    q14, d30
-+        vmovl.u8    q15, d31
-+        vld1.32     {d10[],d11[]}, [r2]  @ Left (lower)
-+        sub         r2, #64                      @ r2 back to the start of left[]
-+        vshl.i16    q8, q0, #4
-+        vshl.i16    q9, q1, #4
-+        vshl.i16    q10, q2, #4
-+        vshl.i16    q11, q3, #4
-+        vmla.i16    q8, q12, q4
-+        vsub.i16    q0, q5, q0
-+        vmla.i16    q9, q13, q4
-+        vpush       {q0}                         @ Spill the first add vector: q0 is reused for left[y] inside the loop
-+        vsub.i16    q1, q5, q1
-+        vmla.i16    q10, q14, q4
-+        vsub.i16    q2, q5, q2
-+        vmla.i16    q11, q15, q4         @ Acc set up
-+        vsub.i16    q3, q5, q3           @ Add set up
-+        vadd.i16    q8, q0
-+        vadd.i16    q9, q1
-+        vadd.i16    q10, q2
-+        vadd.i16    q11, q3
-+        vmovl.u8    q4, d12
-+        vmovl.u8    q5, d13
-+        vmovl.u8    q6, d14
-+        vmovl.u8    q7, d15
-+
-+@ u16 31..0    [4]  q4-q7
-+@ u16 left[y]  [4]  [r2]
-+@ u16 acc      [4]  q8-q11 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially
-+@ u16 add      [4]  q0-q3  = p[-1][nTbs] - p[x][-1]
-+
-+        vadd.i16    q12, q8, q0
-+A       sub         r0, r0, r3, lsl #2           @ Pre-bias r0 by one row step; the loop adds it back before each store
-+T       sub         r0, r3
-+1:
-+        vld1.32     {d0[],d1[]}, [r2]!           @ Broadcast the next 4 bytes of left[] across q0
-+A       add         r0, r0, r3, lsl #2
-+T       add         r0, r3
-+        vadd.i16    q13, q9, q1
-+        subs        r1, #2                       @ Two rows are produced per iteration
-+        vadd.i16    q14, q10, q2
-+        vadd.i16    q15, q11, q3
-+        vmla.i16    q8, q4, q0
-+        vmla.i16    q9, q5, q0
-+        vmla.i16    q10, q6, q0
-+        vmla.i16    q11, q7, q0
-+        vld1.16     {q0}, [sp]                   @ Reload the add vector spilled by the vpush {q0} above
-+        vrshr.u16   q8, #5
-+        vrshr.u16   q9, #5
-+        vrshr.u16   q10, #5
-+        vrshr.u16   q11, #5
-+        vstm        r0, {q8-q11}                 @ Store one 64-byte row (no writeback of r0)
-+        vadd.i16    q8, q12, q0
-+A       add         r0, r0, r3, lsl #2
-+T       add         r0, r3
-+        vld1.32     {d0[],d1[]}, [r2]!
-+        vadd.i16    q9, q13, q1
-+        vadd.i16    q10, q14, q2
-+        vadd.i16    q11, q15, q3
-+        vmla.i16    q12, q4, q0
-+        vmla.i16    q13, q5, q0
-+        vmla.i16    q14, q6, q0
-+        vmla.i16    q15, q7, q0
-+        vld1.16     {q0}, [sp]
-+        vrshr.u16   q12, #5
-+        vrshr.u16   q13, #5
-+        vrshr.u16   q14, #5
-+        vrshr.u16   q15, #5
-+        vstm        r0, {q12-q15}
-+        vadd.i16    q12, q8, q0
-+        bne         1b                           @ Flags are those set by the subs above
-+
-+        vpop        {q3-q7}                      @ Pops the spilled q0 copy into q3 (discarded) and restores q4-q7
-+        bx          lr
-+
-+endfunc
---- a/libavcodec/arm/vc1dsp_init_neon.c
-+++ b/libavcodec/arm/vc1dsp_init_neon.c
-@@ -19,6 +19,7 @@
- #include <stdint.h>
- 
- #include "libavutil/attributes.h"
-+#include "libavutil/intreadwrite.h"
- #include "libavcodec/vc1dsp.h"
- #include "vc1dsp.h"
- 
-@@ -32,6 +33,13 @@ void ff_vc1_inv_trans_4x8_dc_neon(uint8_
- void ff_vc1_inv_trans_8x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
- void ff_vc1_inv_trans_4x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
- 
-+void ff_vc1_v_loop_filter4_neon(uint8_t *src, int stride, int pq);
-+void ff_vc1_h_loop_filter4_neon(uint8_t *src, int stride, int pq);
-+void ff_vc1_v_loop_filter8_neon(uint8_t *src, int stride, int pq);
-+void ff_vc1_h_loop_filter8_neon(uint8_t *src, int stride, int pq);
-+void ff_vc1_v_loop_filter16_neon(uint8_t *src, int stride, int pq);
-+void ff_vc1_h_loop_filter16_neon(uint8_t *src, int stride, int pq);
-+
- void ff_put_pixels8x8_neon(uint8_t *block, const uint8_t *pixels,
-                            ptrdiff_t line_size, int rnd);
- 
-@@ -77,6 +85,64 @@ void ff_put_vc1_chroma_mc4_neon(uint8_t
- void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
-                                 int h, int x, int y);
- 
-+int ff_vc1_unescape_buffer_helper_neon(const uint8_t *src, int size, uint8_t *dst);
-+
-+static int vc1_unescape_buffer_neon(const uint8_t *src, int size, uint8_t *dst)
-+{ /* Copies size bytes from src to dst, dropping the third byte of every matched escape sequence; returns the byte count written. */
-+    /* Dealing with starting and stopping, and removing escape bytes, are
-+     * comparatively less time-sensitive, so are more clearly expressed using
-+     * a C wrapper around the assembly inner loop. Note that we assume a
-+     * little-endian machine that supports unaligned loads. */
-+    int dsize = 0; /* bytes written to dst so far */
-+    while (size >= 4) /* the escape test below reads 4 bytes at src */
-+    {
-+        int found = 0; /* non-zero once src points at an escape sequence */
-+        while (!found && (((uintptr_t) dst) & 7) && size >= 4) /* byte-wise until dst is 8-byte aligned */
-+        {
-+            found = (AV_RL32(src) &~ 0x03000000) == 0x00030000; /* with a little-endian read: bytes 00 00 03 followed by a byte <= 3 */
-+            if (!found)
-+            {
-+                *dst++ = *src++;
-+                --size;
-+                ++dsize;
-+            }
-+        }
-+        if (!found)
-+        {
-+            int skip = size - ff_vc1_unescape_buffer_helper_neon(src, size, dst); /* presumably the helper returns the count it left unprocessed - confirm against the assembly */
-+            dst += skip; /* src and dst advance equally, so nothing is removed within this span */
-+            src += skip;
-+            size -= skip;
-+            dsize += skip;
-+            while (!found && size >= 4) /* byte-wise from where the helper stopped, up to the next escape */
-+            {
-+                found = (AV_RL32(src) &~ 0x03000000) == 0x00030000;
-+                if (!found)
-+                {
-+                    *dst++ = *src++;
-+                    --size;
-+                    ++dsize;
-+                }
-+            }
-+        }
-+        if (found)
-+        {
-+            *dst++ = *src++; /* keep the first two bytes of the sequence... */
-+            *dst++ = *src++;
-+            ++src; /* ...and drop the third (the 0x03 byte) */
-+            size -= 3;
-+            dsize += 2;
-+        }
-+    }
-+    while (size > 0) /* fewer than 4 bytes left: copied unchanged */
-+    {
-+        *dst++ = *src++;
-+        --size;
-+        ++dsize;
-+    }
-+    return dsize;
-+}
-+
- #define FN_ASSIGN(X, Y) \
-     dsp->put_vc1_mspel_pixels_tab[0][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_16_neon; \
-     dsp->put_vc1_mspel_pixels_tab[1][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_neon
-@@ -92,6 +158,13 @@ av_cold void ff_vc1dsp_init_neon(VC1DSPC
-     dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_neon;
-     dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_neon;
- 
-+    dsp->vc1_v_loop_filter4  = ff_vc1_v_loop_filter4_neon;
-+    dsp->vc1_h_loop_filter4  = ff_vc1_h_loop_filter4_neon;
-+    dsp->vc1_v_loop_filter8  = ff_vc1_v_loop_filter8_neon;
-+    dsp->vc1_h_loop_filter8  = ff_vc1_h_loop_filter8_neon;
-+    dsp->vc1_v_loop_filter16 = ff_vc1_v_loop_filter16_neon;
-+    dsp->vc1_h_loop_filter16 = ff_vc1_h_loop_filter16_neon;
-+
-     dsp->put_vc1_mspel_pixels_tab[1][ 0] = ff_put_pixels8x8_neon;
-     FN_ASSIGN(1, 0);
-     FN_ASSIGN(2, 0);
-@@ -116,4 +189,6 @@ av_cold void ff_vc1dsp_init_neon(VC1DSPC
-     dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon;
-     dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon;
-     dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = ff_avg_vc1_chroma_mc4_neon;
-+
-+    dsp->vc1_unescape_buffer = vc1_unescape_buffer_neon;
- }
---- a/libavcodec/arm/vc1dsp_neon.S
-+++ b/libavcodec/arm/vc1dsp_neon.S
-@@ -1161,3 +1161,764 @@ function ff_vc1_inv_trans_4x4_dc_neon, e
-         vst1.32         {d1[1]},  [r0,:32]
-         bx              lr
- endfunc
-+
-+@ VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of vertically-neighbouring blocks
-+@ On entry:
-+@   r0 -> top-left pel of lower block
-+@   r1 = row stride, bytes
-+@   r2 = PQUANT bitstream parameter
-+function ff_vc1_v_loop_filter4_neon, export=1
-+        sub             r3, r0, r1, lsl #2      @ r3 -> row 4 strides above r0 (P1)
-+        vldr            d0, .Lcoeffs            @ 16-bit lanes used below: d0[0] = 2, d0[1] = 5
-+        vld1.32         {d1[0]}, [r0], r1       @ P5
-+        vld1.32         {d2[0]}, [r3], r1       @ P1
-+        vld1.32         {d3[0]}, [r3], r1       @ P2
-+        vld1.32         {d4[0]}, [r0], r1       @ P6
-+        vld1.32         {d5[0]}, [r3], r1       @ P3
-+        vld1.32         {d6[0]}, [r0], r1       @ P7
-+        vld1.32         {d7[0]}, [r3]           @ P4
-+        vld1.32         {d16[0]}, [r0]          @ P8
-+        vshll.u8        q9, d1, #1              @ 2*P5
-+        vdup.16         d17, r2                 @ pq
-+        vshll.u8        q10, d2, #1             @ 2*P1
-+        vmovl.u8        q11, d3                 @ P2
-+        vmovl.u8        q1, d4                  @ P6
-+        vmovl.u8        q12, d5                 @ P3
-+        vmls.i16        d20, d22, d0[1]         @ 2*P1-5*P2
-+        vmovl.u8        q11, d6                 @ P7
-+        vmls.i16        d18, d2, d0[1]          @ 2*P5-5*P6
-+        vshll.u8        q2, d5, #1              @ 2*P3
-+        vmovl.u8        q3, d7                  @ P4
-+        vmla.i16        d18, d22, d0[1]         @ 2*P5-5*P6+5*P7
-+        vmovl.u8        q11, d16                @ P8
-+        vmla.u16        d20, d24, d0[1]         @ 2*P1-5*P2+5*P3
-+        vmovl.u8        q12, d1                 @ P5
-+        vmls.u16        d4, d6, d0[1]           @ 2*P3-5*P4
-+        vmls.u16        d18, d22, d0[0]         @ 2*P5-5*P6+5*P7-2*P8
-+        vsub.i16        d1, d6, d24             @ P4-P5
-+        vmls.i16        d20, d6, d0[0]          @ 2*P1-5*P2+5*P3-2*P4
-+        vmla.i16        d4, d24, d0[1]          @ 2*P3-5*P4+5*P5
-+        vmls.i16        d4, d2, d0[0]           @ 2*P3-5*P4+5*P5-2*P6
-+        vabs.s16        d2, d1                  @ |P4-P5|
-+        vrshr.s16       d3, d18, #3             @ rounding >>3 of the P5..P8 sum
-+        vrshr.s16       d5, d20, #3             @ rounding >>3 of the P1..P4 sum
-+        vshr.s16        d2, d2, #1              @ clip
-+        vrshr.s16       d4, d4, #3              @ rounding >>3 of the P3..P6 sum
-+        vabs.s16        d3, d3                  @ a2
-+        vshr.s16        d1, d1, #8              @ clip_sign
-+        vabs.s16        d5, d5                  @ a1
-+        vceq.i16        d7, d2, #0              @ test clip == 0
-+        vabs.s16        d16, d4                 @ a0
-+        vshr.s16        d4, d4, #8              @ a0_sign
-+        vcge.s16        d18, d5, d3             @ test a1 >= a2
-+        vcge.s16        d17, d16, d17           @ test a0 >= pq
-+        vbsl            d18, d3, d5             @ a3
-+        vsub.i16        d1, d1, d4              @ clip_sign - a0_sign
-+        vorr            d3, d7, d17             @ test clip == 0 || a0 >= pq
-+        vqsub.u16       d4, d16, d18            @ a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
-+        vcge.s16        d5, d18, d16            @ test a3 >= a0
-+        vmul.i16        d0, d4, d0[1]           @ a0 >= a3 ? 5*(a0-a3) : 0
-+        vorr            d4, d3, d5              @ test clip == 0 || a0 >= pq || a3 >= a0
-+        vmov.32         r0, d4[1]               @ move to gp reg (mask lanes 2-3; bit 0 tested below belongs to lane 2, the third pixel pair)
-+        vshr.u16        d0, d0, #3              @ a0 >= a3 ? (5*(a0-a3))>>3 : 0
-+        vcge.s16        d4, d0, d2              @ test d >= clip
-+        tst             r0, #1
-+        bne             1f                      @ none of the 4 pixel pairs should be updated if this one is not filtered
-+        vbsl            d4, d2, d0              @ FFMIN(d, clip)
-+        vbic            d0, d4, d3              @ set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
-+        vmls.i16        d6, d0, d1              @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
-+        vmla.i16        d24, d0, d1             @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
-+        vqmovun.s16     d0, q3
-+        vqmovun.s16     d1, q12
-+        vst1.32         {d0[0]}, [r3], r1       @ P4 row: r3 was left pointing there by the loads above
-+        vst1.32         {d1[0]}, [r3]           @ P5 row
-+1:      bx              lr
-+endfunc
-+
-+@ VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of horizontally-neighbouring blocks
-+@ On entry:
-+@   r0 -> top-left pel of right block
-+@   r1 = row stride, bytes
-+@   r2 = PQUANT bitstream parameter
-+function ff_vc1_h_loop_filter4_neon, export=1
-+        sub             r3, r0, #4              @ where to start reading
-+        vldr            d0, .Lcoeffs            @ 16-bit lanes used below: d0[0] = 2, d0[1] = 5
-+        vld1.32         {d2}, [r3], r1          @ row 0: 8 bytes, P1..P8
-+        sub             r0, r0, #1              @ where to start writing
-+        vld1.32         {d4}, [r3], r1          @ row 1
-+        vld1.32         {d3}, [r3], r1          @ row 2
-+        vld1.32         {d5}, [r3]              @ row 3
-+        vdup.16         d1, r2                  @ pq
-+        vtrn.8          q1, q2
-+        vtrn.16         d2, d3                  @ P1, P5, P3, P7
-+        vtrn.16         d4, d5                  @ P2, P6, P4, P8
-+        vshll.u8        q3, d2, #1              @ 2*P1, 2*P5
-+        vmovl.u8        q8, d4                  @ P2, P6
-+        vmovl.u8        q9, d3                  @ P3, P7
-+        vmovl.u8        q2, d5                  @ P4, P8
-+        vmls.i16        q3, q8, d0[1]           @ 2*P1-5*P2, 2*P5-5*P6
-+        vshll.u8        q10, d3, #1             @ 2*P3, 2*P7
-+        vmovl.u8        q1, d2                  @ P1, P5
-+        vmla.i16        q3, q9, d0[1]           @ 2*P1-5*P2+5*P3, 2*P5-5*P6+5*P7
-+        vmls.i16        q3, q2, d0[0]           @ 2*P1-5*P2+5*P3-2*P4, 2*P5-5*P6+5*P7-2*P8
-+        vmov            d2, d3                  @ needs to be in an even-numbered vector for when we come to narrow it later
-+        vmls.i16        d20, d4, d0[1]          @ 2*P3-5*P4
-+        vmla.i16        d20, d3, d0[1]          @ 2*P3-5*P4+5*P5
-+        vsub.i16        d3, d4, d2              @ P4-P5
-+        vmls.i16        d20, d17, d0[0]         @ 2*P3-5*P4+5*P5-2*P6
-+        vrshr.s16       q3, q3, #3              @ rounding >>3 of both outer sums
-+        vabs.s16        d5, d3                  @ |P4-P5|
-+        vshr.s16        d3, d3, #8              @ clip_sign
-+        vrshr.s16       d16, d20, #3            @ rounding >>3 of the P3..P6 sum
-+        vabs.s16        q3, q3                  @ a1, a2
-+        vshr.s16        d5, d5, #1              @ clip
-+        vabs.s16        d17, d16                @ a0
-+        vceq.i16        d18, d5, #0             @ test clip == 0
-+        vshr.s16        d16, d16, #8            @ a0_sign
-+        vcge.s16        d19, d6, d7             @ test a1 >= a2
-+        vcge.s16        d1, d17, d1             @ test a0 >= pq
-+        vsub.i16        d16, d3, d16            @ clip_sign - a0_sign
-+        vbsl            d19, d7, d6             @ a3
-+        vorr            d1, d18, d1             @ test clip == 0 || a0 >= pq
-+        vqsub.u16       d3, d17, d19            @ a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
-+        vcge.s16        d6, d19, d17            @ test a3 >= a0
-+        vmul.i16        d0, d3, d0[1]           @ a0 >= a3 ? 5*(a0-a3) : 0
-+        vorr            d3, d1, d6              @ test clip == 0 || a0 >= pq || a3 >= a0
-+        vmov.32         r2, d3[1]               @ move to gp reg (mask lanes 2-3; bit 0 tested below belongs to lane 2, the third pixel pair)
-+        vshr.u16        d0, d0, #3              @ a0 >= a3 ? (5*(a0-a3))>>3 : 0
-+        vcge.s16        d3, d0, d5              @ test d >= clip
-+        tst             r2, #1
-+        bne             1f                      @ none of the 4 pixel pairs should be updated if this one is not filtered
-+        vbsl            d3, d5, d0              @ FFMIN(d, clip)
-+        vbic            d0, d3, d1              @ set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
-+        vmla.i16        d2, d0, d16             @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
-+        vmls.i16        d4, d0, d16             @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
-+        vqmovun.s16     d1, q1
-+        vqmovun.s16     d0, q2
-+        vst2.8          {d0[0], d1[0]}, [r0], r1        @ row 0: P4,P5 byte pair
-+        vst2.8          {d0[1], d1[1]}, [r0], r1        @ row 1
-+        vst2.8          {d0[2], d1[2]}, [r0], r1        @ row 2
-+        vst2.8          {d0[3], d1[3]}, [r0]            @ row 3
-+1:      bx              lr
-+endfunc
-+
-+@ VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of vertically-neighbouring blocks
-+@ On entry:
-+@   r0 -> top-left pel of lower block
-+@   r1 = row stride, bytes
-+@   r2 = PQUANT bitstream parameter
-+function ff_vc1_v_loop_filter8_neon, export=1
-+        sub             r3, r0, r1, lsl #2      @ r3 -> row 4 strides above r0 (P1)
-+        vldr            d0, .Lcoeffs            @ 16-bit lanes used below: d0[0] = 2, d0[1] = 5
-+        vld1.32         {d1}, [r0 :64], r1      @ P5
-+        vld1.32         {d2}, [r3 :64], r1      @ P1
-+        vld1.32         {d3}, [r3 :64], r1      @ P2
-+        vld1.32         {d4}, [r0 :64], r1      @ P6
-+        vld1.32         {d5}, [r3 :64], r1      @ P3
-+        vld1.32         {d6}, [r0 :64], r1      @ P7
-+        vshll.u8        q8, d1, #1              @ 2*P5
-+        vshll.u8        q9, d2, #1              @ 2*P1
-+        vld1.32         {d7}, [r3 :64]          @ P4
-+        vmovl.u8        q1, d3                  @ P2
-+        vld1.32         {d20}, [r0 :64]         @ P8
-+        vmovl.u8        q11, d4                 @ P6
-+        vdup.16         q12, r2                 @ pq
-+        vmovl.u8        q13, d5                 @ P3
-+        vmls.i16        q9, q1, d0[1]           @ 2*P1-5*P2
-+        vmovl.u8        q1, d6                  @ P7
-+        vshll.u8        q2, d5, #1              @ 2*P3
-+        vmls.i16        q8, q11, d0[1]          @ 2*P5-5*P6
-+        vmovl.u8        q3, d7                  @ P4
-+        vmovl.u8        q10, d20                @ P8
-+        vmla.i16        q8, q1, d0[1]           @ 2*P5-5*P6+5*P7
-+        vmovl.u8        q1, d1                  @ P5
-+        vmla.i16        q9, q13, d0[1]          @ 2*P1-5*P2+5*P3
-+        vsub.i16        q13, q3, q1             @ P4-P5
-+        vmls.i16        q2, q3, d0[1]           @ 2*P3-5*P4
-+        vmls.i16        q8, q10, d0[0]          @ 2*P5-5*P6+5*P7-2*P8
-+        vabs.s16        q10, q13                @ |P4-P5|
-+        vshr.s16        q13, q13, #8            @ clip_sign
-+        vmls.i16        q9, q3, d0[0]           @ 2*P1-5*P2+5*P3-2*P4
-+        vshr.s16        q10, q10, #1            @ clip
-+        vmla.i16        q2, q1, d0[1]           @ 2*P3-5*P4+5*P5
-+        vrshr.s16       q8, q8, #3              @ rounding >>3 of the P5..P8 sum
-+        vmls.i16        q2, q11, d0[0]          @ 2*P3-5*P4+5*P5-2*P6
-+        vceq.i16        q11, q10, #0            @ test clip == 0
-+        vrshr.s16       q9, q9, #3              @ rounding >>3 of the P1..P4 sum
-+        vabs.s16        q8, q8                  @ a2
-+        vabs.s16        q9, q9                  @ a1
-+        vrshr.s16       q2, q2, #3              @ rounding >>3 of the P3..P6 sum
-+        vcge.s16        q14, q9, q8             @ test a1 >= a2
-+        vabs.s16        q15, q2                 @ a0
-+        vshr.s16        q2, q2, #8              @ a0_sign
-+        vbsl            q14, q8, q9             @ a3
-+        vcge.s16        q8, q15, q12            @ test a0 >= pq
-+        vsub.i16        q2, q13, q2             @ clip_sign - a0_sign
-+        vqsub.u16       q9, q15, q14            @ a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
-+        vcge.s16        q12, q14, q15           @ test a3 >= a0
-+        vorr            q8, q11, q8             @ test clip == 0 || a0 >= pq
-+        vmul.i16        q0, q9, d0[1]           @ a0 >= a3 ? 5*(a0-a3) : 0
-+        vorr            q9, q8, q12             @ test clip == 0 || a0 >= pq || a3 >= a0
-+        vshl.i64        q11, q9, #16            @ per group of 4 pairs: move mask lane 2 into the top 16 bits
-+        vmov.32         r0, d18[1]              @ move to gp reg (first group, mask lanes 2-3)
-+        vshr.u16        q0, q0, #3              @ a0 >= a3 ? (5*(a0-a3))>>3 : 0
-+        vmov.32         r2, d19[1]              @ second group, mask lanes 2-3
-+        vshr.s64        q9, q11, #48            @ ...then sign-extend it across all 4 lanes of the group
-+        vcge.s16        q11, q0, q10            @ test d >= clip
-+        vorr            q8, q8, q9              @ a group is left unfiltered when its third pair is not filtered
-+        and             r0, r0, r2              @ bit 0 stays set only if both groups' third pairs are unfiltered
-+        vbsl            q11, q10, q0            @ FFMIN(d, clip)
-+        tst             r0, #1
-+        bne             1f                      @ none of the 8 pixel pairs should be updated in this case
-+        vbic            q0, q11, q8             @ set each d to zero if it should not be filtered
-+        vmls.i16        q3, q0, q2              @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
-+        vmla.i16        q1, q0, q2              @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
-+        vqmovun.s16     d0, q3
-+        vqmovun.s16     d1, q1
-+        vst1.32         {d0}, [r3 :64], r1      @ P4 row: r3 was left pointing there by the loads above
-+        vst1.32         {d1}, [r3 :64]          @ P5 row
-+1:      bx              lr
-+endfunc
-+
-+.align  5
-+.Lcoeffs:                                       @ filter coefficients loaded into d0 with vldr by the loop filter functions
-+.quad   0x00050002                              @ as 16-bit lanes (little-endian layout assumed): [0] = 2, [1] = 5, [2] = [3] = 0
-+
-+@ VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of horizontally-neighbouring blocks
-+@ On entry:
-+@   r0 -> top-left pel of right block
-+@   r1 = row stride, bytes
-+@   r2 = PQUANT bitstream parameter
-+function ff_vc1_h_loop_filter8_neon, export=1
-+        push            {lr}                    @ lr (r14) is used as scratch below; the function returns via pop {pc}
-+        sub             r3, r0, #4              @ where to start reading
-+        vldr            d0, .Lcoeffs            @ 16-bit lanes used below: d0[0] = 2, d0[1] = 5
-+        vld1.32         {d2}, [r3], r1          @ P1[0], P2[0]...
-+        sub             r0, r0, #1              @ where to start writing
-+        vld1.32         {d4}, [r3], r1
-+        add             r12, r0, r1, lsl #2     @ where to start writing rows 4-7
-+        vld1.32         {d3}, [r3], r1
-+        vld1.32         {d5}, [r3], r1
-+        vld1.32         {d6}, [r3], r1
-+        vld1.32         {d16}, [r3], r1
-+        vld1.32         {d7}, [r3], r1
-+        vld1.32         {d17}, [r3]
-+        vtrn.8          q1, q2                  @ P1[0], P1[1], P3[0]... P1[2], P1[3], P3[2]... P2[0], P2[1], P4[0]... P2[2], P2[3], P4[2]...
-+        vdup.16         q9, r2                  @ pq
-+        vtrn.16         d2, d3                  @ P1[0], P1[1], P1[2], P1[3], P5[0]... P3[0], P3[1], P3[2], P3[3], P7[0]...
-+        vtrn.16         d4, d5                  @ P2[0], P2[1], P2[2], P2[3], P6[0]... P4[0], P4[1], P4[2], P4[3], P8[0]...
-+        vtrn.8          q3, q8                  @ P1[4], P1[5], P3[4]... P1[6], P1[7], P3[6]... P2[4], P2[5], P4[4]... P2[6], P2[7], P4[6]...
-+        vtrn.16         d6, d7                  @ P1[4], P1[5], P1[6], P1[7], P5[4]... P3[4], P3[5], P3[6], P3[7], P7[4]...
-+        vtrn.16         d16, d17                @ P2[4], P2[5], P2[6], P2[7], P6[4]... P4[4], P4[5], P4[6], P4[7], P8[4]...
-+        vtrn.32         d2, d6                  @ P1, P5
-+        vtrn.32         d4, d16                 @ P2, P6
-+        vtrn.32         d3, d7                  @ P3, P7
-+        vtrn.32         d5, d17                 @ P4, P8
-+        vshll.u8        q10, d2, #1             @ 2*P1
-+        vshll.u8        q11, d6, #1             @ 2*P5
-+        vmovl.u8        q12, d4                 @ P2
-+        vmovl.u8        q13, d16                @ P6
-+        vmovl.u8        q14, d3                 @ P3
-+        vmls.i16        q10, q12, d0[1]         @ 2*P1-5*P2
-+        vmovl.u8        q12, d7                 @ P7
-+        vshll.u8        q1, d3, #1              @ 2*P3
-+        vmls.i16        q11, q13, d0[1]         @ 2*P5-5*P6
-+        vmovl.u8        q2, d5                  @ P4
-+        vmovl.u8        q8, d17                 @ P8
-+        vmla.i16        q11, q12, d0[1]         @ 2*P5-5*P6+5*P7
-+        vmovl.u8        q3, d6                  @ P5
-+        vmla.i16        q10, q14, d0[1]         @ 2*P1-5*P2+5*P3
-+        vsub.i16        q12, q2, q3             @ P4-P5
-+        vmls.i16        q1, q2, d0[1]           @ 2*P3-5*P4
-+        vmls.i16        q11, q8, d0[0]          @ 2*P5-5*P6+5*P7-2*P8
-+        vabs.s16        q8, q12                 @ |P4-P5|
-+        vshr.s16        q12, q12, #8            @ clip_sign
-+        vmls.i16        q10, q2, d0[0]          @ 2*P1-5*P2+5*P3-2*P4
-+        vshr.s16        q8, q8, #1              @ clip
-+        vmla.i16        q1, q3, d0[1]           @ 2*P3-5*P4+5*P5
-+        vrshr.s16       q11, q11, #3            @ rounding >>3 of the P5..P8 sum
-+        vmls.i16        q1, q13, d0[0]          @ 2*P3-5*P4+5*P5-2*P6
-+        vceq.i16        q13, q8, #0             @ test clip == 0
-+        vrshr.s16       q10, q10, #3            @ rounding >>3 of the P1..P4 sum
-+        vabs.s16        q11, q11                @ a2
-+        vabs.s16        q10, q10                @ a1
-+        vrshr.s16       q1, q1, #3              @ rounding >>3 of the P3..P6 sum
-+        vcge.s16        q14, q10, q11           @ test a1 >= a2
-+        vabs.s16        q15, q1                 @ a0
-+        vshr.s16        q1, q1, #8              @ a0_sign
-+        vbsl            q14, q11, q10           @ a3
-+        vcge.s16        q9, q15, q9             @ test a0 >= pq
-+        vsub.i16        q1, q12, q1             @ clip_sign - a0_sign
-+        vqsub.u16       q10, q15, q14           @ a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
-+        vcge.s16        q11, q14, q15           @ test a3 >= a0
-+        vorr            q9, q13, q9             @ test clip == 0 || a0 >= pq
-+        vmul.i16        q0, q10, d0[1]          @ a0 >= a3 ? 5*(a0-a3) : 0
-+        vorr            q10, q9, q11            @ test clip == 0 || a0 >= pq || a3 >= a0
-+        vmov.32         r2, d20[1]              @ move to gp reg (rows 0-3, mask lanes 2-3; bit 0 belongs to lane 2)
-+        vshr.u16        q0, q0, #3              @ a0 >= a3 ? (5*(a0-a3))>>3 : 0
-+        vmov.32         r3, d21[1]              @ rows 4-7, mask lanes 2-3
-+        vcge.s16        q10, q0, q8             @ test d >= clip
-+        and             r14, r2, r3             @ bit 0 stays set only if both groups' third pairs are unfiltered
-+        vbsl            q10, q8, q0             @ FFMIN(d, clip)
-+        tst             r14, #1
-+        bne             2f                      @ none of the 8 pixel pairs should be updated in this case
-+        vbic            q0, q10, q9             @ set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
-+        vmla.i16        q3, q0, q1              @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
-+        vmls.i16        q2, q0, q1              @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
-+        vqmovun.s16     d1, q3
-+        vqmovun.s16     d0, q2
-+        tst             r2, #1
-+        bne             1f                      @ none of the first 4 pixel pairs should be updated if so
-+        vst2.8          {d0[0], d1[0]}, [r0], r1
-+        vst2.8          {d0[1], d1[1]}, [r0], r1
-+        vst2.8          {d0[2], d1[2]}, [r0], r1
-+        vst2.8          {d0[3], d1[3]}, [r0]
-+1:      tst             r3, #1
-+        bne             2f                      @ none of the second 4 pixel pairs should be updated if so
-+        vst2.8          {d0[4], d1[4]}, [r12], r1
-+        vst2.8          {d0[5], d1[5]}, [r12], r1
-+        vst2.8          {d0[6], d1[6]}, [r12], r1
-+        vst2.8          {d0[7], d1[7]}, [r12]
-+2:      pop             {pc}
-+endfunc
-+
-+@ VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of vertically-neighbouring blocks
-+@ On entry:
-+@   r0 -> top-left pel of lower block
-+@   r1 = row stride, bytes
-+@   r2 = PQUANT bitstream parameter
-+function ff_vc1_v_loop_filter16_neon, export=1
-+        vpush           {d8-d15}
-+        sub             r3, r0, r1, lsl #2
-+        vldr            d0, .Lcoeffs
-+        vld1.64         {q1}, [r0 :128], r1     @ P5
-+        vld1.64         {q2}, [r3 :128], r1     @ P1
-+        vld1.64         {q3}, [r3 :128], r1     @ P2
-+        vld1.64         {q4}, [r0 :128], r1     @ P6
-+        vld1.64         {q5}, [r3 :128], r1     @ P3
-+        vld1.64         {q6}, [r0 :128], r1     @ P7
-+        vshll.u8        q7, d2, #1              @ 2*P5[0..7]
-+        vshll.u8        q8, d4, #1              @ 2*P1[0..7]
-+        vld1.64         {q9}, [r3 :128]         @ P4
-+        vmovl.u8        q10, d6                 @ P2[0..7]
-+        vld1.64         {q11}, [r0 :128]        @ P8
-+        vmovl.u8        q12, d8                 @ P6[0..7]
-+        vdup.16         q13, r2                 @ pq
-+        vshll.u8        q2, d5, #1              @ 2*P1[8..15]
-+        vmls.i16        q8, q10, d0[1]          @ 2*P1[0..7]-5*P2[0..7]
-+        vshll.u8        q10, d3, #1             @ 2*P5[8..15]
-+        vmovl.u8        q3, d7                  @ P2[8..15]
-+        vmls.i16        q7, q12, d0[1]          @ 2*P5[0..7]-5*P6[0..7]
-+        vmovl.u8        q4, d9                  @ P6[8..15]
-+        vmovl.u8        q14, d10                @ P3[0..7]
-+        vmovl.u8        q15, d12                @ P7[0..7]
-+        vmls.i16        q2, q3, d0[1]           @ 2*P1[8..15]-5*P2[8..15]
-+        vshll.u8        q3, d10, #1             @ 2*P3[0..7]
-+        vmls.i16        q10, q4, d0[1]          @ 2*P5[8..15]-5*P6[8..15]
-+        vmovl.u8        q6, d13                 @ P7[8..15]
-+        vmla.i16        q8, q14, d0[1]          @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]
-+        vmovl.u8        q14, d18                @ P4[0..7]
-+        vmovl.u8        q9, d19                 @ P4[8..15]
-+        vmla.i16        q7, q15, d0[1]          @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]
-+        vmovl.u8        q15, d11                @ P3[8..15]
-+        vshll.u8        q5, d11, #1             @ 2*P3[8..15]
-+        vmls.i16        q3, q14, d0[1]          @ 2*P3[0..7]-5*P4[0..7]
-+        vmla.i16        q2, q15, d0[1]          @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]
-+        vmovl.u8        q15, d22                @ P8[0..7]
-+        vmovl.u8        q11, d23                @ P8[8..15]
-+        vmla.i16        q10, q6, d0[1]          @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]
-+        vmovl.u8        q6, d2                  @ P5[0..7]
-+        vmovl.u8        q1, d3                  @ P5[8..15]
-+        vmls.i16        q5, q9, d0[1]           @ 2*P3[8..15]-5*P4[8..15]
-+        vmls.i16        q8, q14, d0[0]          @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7]
-+        vmls.i16        q7, q15, d0[0]          @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7]
-+        vsub.i16        q15, q14, q6            @ P4[0..7]-P5[0..7]
-+        vmla.i16        q3, q6, d0[1]           @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]
-+        vrshr.s16       q8, q8, #3
-+        vmls.i16        q2, q9, d0[0]           @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15]
-+        vrshr.s16       q7, q7, #3
-+        vmls.i16        q10, q11, d0[0]         @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15]
-+        vabs.s16        q11, q15
-+        vabs.s16        q8, q8                  @ a1[0..7]
-+        vmla.i16        q5, q1, d0[1]           @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]
-+        vshr.s16        q15, q15, #8            @ clip_sign[0..7]
-+        vrshr.s16       q2, q2, #3
-+        vmls.i16        q3, q12, d0[0]          @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7]
-+        vabs.s16        q7, q7                  @ a2[0..7]
-+        vrshr.s16       q10, q10, #3
-+        vsub.i16        q12, q9, q1             @ P4[8..15]-P5[8..15]
-+        vshr.s16        q11, q11, #1            @ clip[0..7]
-+        vmls.i16        q5, q4, d0[0]           @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15]
-+        vcge.s16        q4, q8, q7              @ test a1[0..7] >= a2[0..7]
-+        vabs.s16        q2, q2                  @ a1[8..15]
-+        vrshr.s16       q3, q3, #3
-+        vabs.s16        q10, q10                @ a2[8..15]
-+        vbsl            q4, q7, q8              @ a3[0..7]
-+        vabs.s16        q7, q12
-+        vshr.s16        q8, q12, #8             @ clip_sign[8..15]
-+        vrshr.s16       q5, q5, #3
-+        vcge.s16        q12, q2, q10            @ test a1[8..15] >= a2[8.15]
-+        vshr.s16        q7, q7, #1              @ clip[8..15]
-+        vbsl            q12, q10, q2            @ a3[8..15]
-+        vabs.s16        q2, q3                  @ a0[0..7]
-+        vceq.i16        q10, q11, #0            @ test clip[0..7] == 0
-+        vshr.s16        q3, q3, #8              @ a0_sign[0..7]
-+        vsub.i16        q3, q15, q3             @ clip_sign[0..7] - a0_sign[0..7]
-+        vcge.s16        q15, q2, q13            @ test a0[0..7] >= pq
-+        vorr            q10, q10, q15           @ test clip[0..7] == 0 || a0[0..7] >= pq
-+        vqsub.u16       q15, q2, q4             @ a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
-+        vcge.s16        q2, q4, q2              @ test a3[0..7] >= a0[0..7]
-+        vabs.s16        q4, q5                  @ a0[8..15]
-+        vshr.s16        q5, q5, #8              @ a0_sign[8..15]
-+        vmul.i16        q15, q15, d0[1]         @ a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0
-+        vcge.s16        q13, q4, q13            @ test a0[8..15] >= pq
-+        vorr            q2, q10, q2             @ test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7]
-+        vsub.i16        q5, q8, q5              @ clip_sign[8..15] - a0_sign[8..15]
-+        vceq.i16        q8, q7, #0              @ test clip[8..15] == 0
-+        vshr.u16        q15, q15, #3            @ a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0
-+        vmov.32         r0, d4[1]               @ move to gp reg
-+        vorr            q8, q8, q13             @ test clip[8..15] == 0 || a0[8..15] >= pq
-+        vqsub.u16       q13, q4, q12            @ a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
-+        vmov.32         r2, d5[1]
-+        vcge.s16        q4, q12, q4             @ test a3[8..15] >= a0[8..15]
-+        vshl.i64        q2, q2, #16
-+        vcge.s16        q12, q15, q11
-+        vmul.i16        q0, q13, d0[1]          @ a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0
-+        vorr            q4, q8, q4              @ test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15]
-+        vshr.s64        q2, q2, #48
-+        and             r0, r0, r2
-+        vbsl            q12, q11, q15           @ FFMIN(d[0..7], clip[0..7])
-+        vshl.i64        q11, q4, #16
-+        vmov.32         r2, d8[1]
-+        vshr.u16        q0, q0, #3              @ a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0
-+        vorr            q2, q10, q2
-+        vmov.32         r12, d9[1]
-+        vshr.s64        q4, q11, #48
-+        vcge.s16        q10, q0, q7
-+        vbic            q2, q12, q2             @ set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub)
-+        vorr            q4, q8, q4
-+        and             r2, r2, r12
-+        vbsl            q10, q7, q0             @ FFMIN(d[8..15], clip[8..15])
-+        vmls.i16        q14, q2, q3             @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4[0..7]
-+        and             r0, r0, r2
-+        vbic            q0, q10, q4             @ set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub)
-+        tst             r0, #1
-+        bne             1f                      @ none of the 16 pixel pairs should be updated in this case
-+        vmla.i16        q6, q2, q3              @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5[0..7]
-+        vmls.i16        q9, q0, q5              @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4[8..15]
-+        vqmovun.s16     d4, q14
-+        vmla.i16        q1, q0, q5              @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5[8..15]
-+        vqmovun.s16     d0, q6
-+        vqmovun.s16     d5, q9
-+        vqmovun.s16     d1, q1
-+        vst1.64         {q2}, [r3 :128], r1
-+        vst1.64         {q0}, [r3 :128]
-+1:      vpop            {d8-d15}
-+        bx              lr
-+endfunc
-+
-+@ VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of horizontally-neighbouring blocks
-+@ On entry:
-+@   r0 -> top-left pel of right block
-+@   r1 = row stride, bytes
-+@   r2 = PQUANT bitstream parameter
-+function ff_vc1_h_loop_filter16_neon, export=1
-+        push            {r4-r6,lr}
-+        vpush           {d8-d15}
-+        sub             r3, r0, #4              @ where to start reading
-+        vldr            d0, .Lcoeffs
-+        vld1.32         {d2}, [r3], r1          @ P1[0], P2[0]...
-+        sub             r0, r0, #1              @ where to start writing
-+        vld1.32         {d3}, [r3], r1
-+        add             r4, r0, r1, lsl #2
-+        vld1.32         {d10}, [r3], r1
-+        vld1.32         {d11}, [r3], r1
-+        vld1.32         {d16}, [r3], r1
-+        vld1.32         {d4}, [r3], r1
-+        vld1.32         {d8}, [r3], r1
-+        vtrn.8          d2, d3                  @ P1[0], P1[1], P3[0]... P2[0], P2[1], P4[0]...
-+        vld1.32         {d14}, [r3], r1
-+        vld1.32         {d5}, [r3], r1
-+        vtrn.8          d10, d11                @ P1[2], P1[3], P3[2]... P2[2], P2[3], P4[2]...
-+        vld1.32         {d6}, [r3], r1
-+        vld1.32         {d12}, [r3], r1
-+        vtrn.8          d16, d4                 @ P1[4], P1[5], P3[4]... P2[4], P2[5], P4[4]...
-+        vld1.32         {d13}, [r3], r1
-+        vtrn.16         d2, d10                 @ P1[0], P1[1], P1[2], P1[3], P5[0]... P3[0], P3[1], P3[2], P3[3], P7[0]...
-+        vld1.32         {d1}, [r3], r1
-+        vtrn.8          d8, d14                 @ P1[6], P1[7], P3[6]... P2[6], P2[7], P4[6]...
-+        vld1.32         {d7}, [r3], r1
-+        vtrn.16         d3, d11                 @ P2[0], P2[1], P2[2], P2[3], P6[0]... P4[0], P4[1], P4[2], P4[3], P8[0]...
-+        vld1.32         {d9}, [r3], r1
-+        vtrn.8          d5, d6                  @ P1[8], P1[9], P3[8]... P2[8], P2[9], P4[8]...
-+        vld1.32         {d15}, [r3]
-+        vtrn.16         d16, d8                 @ P1[4], P1[5], P1[6], P1[7], P5[4]... P3[4], P3[5], P3[6], P3[7], P7[4]...
-+        vtrn.16         d4, d14                 @ P2[4], P2[5], P2[6], P2[7], P6[4]... P4[4], P4[5], P4[6], P4[7], P8[4]...
-+        vtrn.8          d12, d13                @ P1[10], P1[11], P3[10]... P2[10], P2[11], P4[10]...
-+        vdup.16         q9, r2                  @ pq
-+        vtrn.8          d1, d7                  @ P1[12], P1[13], P3[12]... P2[12], P2[13], P4[12]...
-+        vtrn.32         d2, d16                 @ P1[0..7], P5[0..7]
-+        vtrn.16         d5, d12                 @ P1[8], P1[7], P1[10], P1[11], P5[8]... P3[8], P3[9], P3[10], P3[11], P7[8]...
-+        vtrn.16         d6, d13                 @ P2[8], P2[7], P2[10], P2[11], P6[8]... P4[8], P4[9], P4[10], P4[11], P8[8]...
-+        vtrn.8          d9, d15                 @ P1[14], P1[15], P3[14]... P2[14], P2[15], P4[14]...
-+        vtrn.32         d3, d4                  @ P2[0..7], P6[0..7]
-+        vshll.u8        q10, d2, #1             @ 2*P1[0..7]
-+        vtrn.32         d10, d8                 @ P3[0..7], P7[0..7]
-+        vshll.u8        q11, d16, #1            @ 2*P5[0..7]
-+        vtrn.32         d11, d14                @ P4[0..7], P8[0..7]
-+        vtrn.16         d1, d9                  @ P1[12], P1[13], P1[14], P1[15], P5[12]... P3[12], P3[13], P3[14], P3[15], P7[12]...
-+        vtrn.16         d7, d15                 @ P2[12], P2[13], P2[14], P2[15], P6[12]... P4[12], P4[13], P4[14], P4[15], P8[12]...
-+        vmovl.u8        q1, d3                  @ P2[0..7]
-+        vmovl.u8        q12, d4                 @ P6[0..7]
-+        vtrn.32         d5, d1                  @ P1[8..15], P5[8..15]
-+        vtrn.32         d6, d7                  @ P2[8..15], P6[8..15]
-+        vtrn.32         d12, d9                 @ P3[8..15], P7[8..15]
-+        vtrn.32         d13, d15                @ P4[8..15], P8[8..15]
-+        vmls.i16        q10, q1, d0[1]          @ 2*P1[0..7]-5*P2[0..7]
-+        vmovl.u8        q1, d10                 @ P3[0..7]
-+        vshll.u8        q2, d5, #1              @ 2*P1[8..15]
-+        vshll.u8        q13, d1, #1             @ 2*P5[8..15]
-+        vmls.i16        q11, q12, d0[1]         @ 2*P5[0..7]-5*P6[0..7]
-+        vmovl.u8        q14, d6                 @ P2[8..15]
-+        vmovl.u8        q3, d7                  @ P6[8..15]
-+        vmovl.u8        q15, d8                 @ P7[0..7]
-+        vmla.i16        q10, q1, d0[1]          @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]
-+        vmovl.u8        q1, d12                 @ P3[8..15]
-+        vmls.i16        q2, q14, d0[1]          @ 2*P1[8..15]-5*P2[8..15]
-+        vmovl.u8        q4, d9                  @ P7[8..15]
-+        vshll.u8        q14, d10, #1            @ 2*P3[0..7]
-+        vmls.i16        q13, q3, d0[1]          @ 2*P5[8..15]-5*P6[8..15]
-+        vmovl.u8        q5, d11                 @ P4[0..7]
-+        vmla.i16        q11, q15, d0[1]         @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]
-+        vshll.u8        q15, d12, #1            @ 2*P3[8..15]
-+        vmovl.u8        q6, d13                 @ P4[8..15]
-+        vmla.i16        q2, q1, d0[1]           @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]
-+        vmovl.u8        q1, d14                 @ P8[0..7]
-+        vmovl.u8        q7, d15                 @ P8[8..15]
-+        vmla.i16        q13, q4, d0[1]          @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]
-+        vmovl.u8        q4, d16                 @ P5[0..7]
-+        vmovl.u8        q8, d1                  @ P5[8..15]
-+        vmls.i16        q14, q5, d0[1]          @ 2*P3[0..7]-5*P4[0..7]
-+        vmls.i16        q15, q6, d0[1]          @ 2*P3[8..15]-5*P4[8..15]
-+        vmls.i16        q10, q5, d0[0]          @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7]
-+        vmls.i16        q11, q1, d0[0]          @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7]
-+        vsub.i16        q1, q5, q4              @ P4[0..7]-P5[0..7]
-+        vmls.i16        q2, q6, d0[0]           @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15]
-+        vrshr.s16       q10, q10, #3
-+        vmls.i16        q13, q7, d0[0]          @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15]
-+        vsub.i16        q7, q6, q8              @ P4[8..15]-P5[8..15]
-+        vrshr.s16       q11, q11, #3
-+        vmla.s16        q14, q4, d0[1]          @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]
-+        vrshr.s16       q2, q2, #3
-+        vmla.i16        q15, q8, d0[1]          @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]
-+        vabs.s16        q10, q10                @ a1[0..7]
-+        vrshr.s16       q13, q13, #3
-+        vmls.i16        q15, q3, d0[0]          @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15]
-+        vabs.s16        q3, q11                 @ a2[0..7]
-+        vabs.s16        q2, q2                  @ a1[8..15]
-+        vmls.i16        q14, q12, d0[0]         @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7]
-+        vabs.s16        q11, q1
-+        vabs.s16        q12, q13                @ a2[8..15]
-+        vcge.s16        q13, q10, q3            @ test a1[0..7] >= a2[0..7]
-+        vshr.s16        q1, q1, #8              @ clip_sign[0..7]
-+        vrshr.s16       q15, q15, #3
-+        vshr.s16        q11, q11, #1            @ clip[0..7]
-+        vrshr.s16       q14, q14, #3
-+        vbsl            q13, q3, q10            @ a3[0..7]
-+        vcge.s16        q3, q2, q12             @ test a1[8..15] >= a2[8.15]
-+        vabs.s16        q10, q15                @ a0[8..15]
-+        vshr.s16        q15, q15, #8            @ a0_sign[8..15]
-+        vbsl            q3, q12, q2             @ a3[8..15]
-+        vabs.s16        q2, q14                 @ a0[0..7]
-+        vabs.s16        q12, q7
-+        vshr.s16        q7, q7, #8              @ clip_sign[8..15]
-+        vshr.s16        q14, q14, #8            @ a0_sign[0..7]
-+        vshr.s16        q12, q12, #1            @ clip[8..15]
-+        vsub.i16        q7, q7, q15             @ clip_sign[8..15] - a0_sign[8..15]
-+        vqsub.u16       q15, q10, q3            @ a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
-+        vcge.s16        q3, q3, q10             @ test a3[8..15] >= a0[8..15]
-+        vcge.s16        q10, q10, q9            @ test a0[8..15] >= pq
-+        vcge.s16        q9, q2, q9              @ test a0[0..7] >= pq
-+        vsub.i16        q1, q1, q14             @ clip_sign[0..7] - a0_sign[0..7]
-+        vqsub.u16       q14, q2, q13            @ a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
-+        vcge.s16        q2, q13, q2             @ test a3[0..7] >= a0[0..7]
-+        vmul.i16        q13, q15, d0[1]         @ a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0
-+        vceq.i16        q15, q11, #0            @ test clip[0..7] == 0
-+        vmul.i16        q0, q14, d0[1]          @ a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0
-+        vorr            q9, q15, q9             @ test clip[0..7] == 0 || a0[0..7] >= pq
-+        vceq.i16        q14, q12, #0            @ test clip[8..15] == 0
-+        vshr.u16        q13, q13, #3            @ a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0
-+        vorr            q2, q9, q2              @ test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7]
-+        vshr.u16        q0, q0, #3              @ a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0
-+        vorr            q10, q14, q10           @ test clip[8..15] == 0 || a0[8..15] >= pq
-+        vcge.s16        q14, q13, q12
-+        vmov.32         r2, d4[1]               @ move to gp reg
-+        vorr            q3, q10, q3             @ test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15]
-+        vmov.32         r3, d5[1]
-+        vcge.s16        q2, q0, q11
-+        vbsl            q14, q12, q13           @ FFMIN(d[8..15], clip[8..15])
-+        vbsl            q2, q11, q0             @ FFMIN(d[0..7], clip[0..7])
-+        vmov.32         r5, d6[1]
-+        vbic            q0, q14, q10            @ set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub)
-+        vmov.32         r6, d7[1]
-+        and             r12, r2, r3
-+        vbic            q2, q2, q9              @ set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub)
-+        vmls.i16        q6, q0, q7              @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4
-+        vmls.i16        q5, q2, q1              @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4
-+        and             r14, r5, r6
-+        vmla.i16        q4, q2, q1              @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5
-+        and             r12, r12, r14
-+        vqmovun.s16     d4, q6
-+        vmla.i16        q8, q0, q7              @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5
-+        tst             r12, #1
-+        bne             4f                      @ none of the 16 pixel pairs should be updated in this case
-+        vqmovun.s16     d2, q5
-+        vqmovun.s16     d3, q4
-+        vqmovun.s16     d5, q8
-+        tst             r2, #1
-+        bne             1f
-+        vst2.8          {d2[0], d3[0]}, [r0], r1
-+        vst2.8          {d2[1], d3[1]}, [r0], r1
-+        vst2.8          {d2[2], d3[2]}, [r0], r1
-+        vst2.8          {d2[3], d3[3]}, [r0]
-+1:      add             r0, r4, r1, lsl #2
-+        tst             r3, #1
-+        bne             2f
-+        vst2.8          {d2[4], d3[4]}, [r4], r1
-+        vst2.8          {d2[5], d3[5]}, [r4], r1
-+        vst2.8          {d2[6], d3[6]}, [r4], r1
-+        vst2.8          {d2[7], d3[7]}, [r4]
-+2:      add             r4, r0, r1, lsl #2
-+        tst             r5, #1
-+        bne             3f
-+        vst2.8          {d4[0], d5[0]}, [r0], r1
-+        vst2.8          {d4[1], d5[1]}, [r0], r1
-+        vst2.8          {d4[2], d5[2]}, [r0], r1
-+        vst2.8          {d4[3], d5[3]}, [r0]
-+3:      tst             r6, #1
-+        bne             4f
-+        vst2.8          {d4[4], d5[4]}, [r4], r1
-+        vst2.8          {d4[5], d5[5]}, [r4], r1
-+        vst2.8          {d4[6], d5[6]}, [r4], r1
-+        vst2.8          {d4[7], d5[7]}, [r4]
-+4:      vpop            {d8-d15}
-+        pop             {r4-r6,pc}
-+endfunc
-+
-+@ Copy at most the specified number of bytes from source to destination buffer,
-+@ stopping at a multiple of 16 bytes, none of which are the start of an escape sequence
-+@ On entry:
-+@   r0 -> source buffer
-+@   r1 = max number of bytes to copy
-+@   r2 -> destination buffer, optimally 8-byte aligned
-+@ On exit:
-+@   r0 = number of bytes not copied
-+function ff_vc1_unescape_buffer_helper_neon, export=1
-+        @ Offset by 48 to screen out cases that are too short for us to handle,
-+        @ and also make it easy to test for loop termination, or to determine
-+        @ whether we need an odd number of half-iterations of the loop.
-+        subs    r1, r1, #48
-+        bmi     90f
-+
-+        @ Set up useful constants
-+        vmov.i32        q0, #0x3000000
-+        vmov.i32        q1, #0x30000
-+
-+        tst             r1, #16
-+        bne             1f
-+
-+          vld1.8          {q8, q9}, [r0]!
-+          vbic            q12, q8, q0
-+          vext.8          q13, q8, q9, #1
-+          vext.8          q14, q8, q9, #2
-+          vext.8          q15, q8, q9, #3
-+          veor            q12, q12, q1
-+          vbic            q13, q13, q0
-+          vbic            q14, q14, q0
-+          vbic            q15, q15, q0
-+          vceq.i32        q12, q12, #0
-+          veor            q13, q13, q1
-+          veor            q14, q14, q1
-+          veor            q15, q15, q1
-+          vceq.i32        q13, q13, #0
-+          vceq.i32        q14, q14, #0
-+          vceq.i32        q15, q15, #0
-+          add             r1, r1, #16
-+          b               3f
-+
-+1:      vld1.8          {q10, q11}, [r0]!
-+        vbic            q12, q10, q0
-+        vext.8          q13, q10, q11, #1
-+        vext.8          q14, q10, q11, #2
-+        vext.8          q15, q10, q11, #3
-+        veor            q12, q12, q1
-+        vbic            q13, q13, q0
-+        vbic            q14, q14, q0
-+        vbic            q15, q15, q0
-+        vceq.i32        q12, q12, #0
-+        veor            q13, q13, q1
-+        veor            q14, q14, q1
-+        veor            q15, q15, q1
-+        vceq.i32        q13, q13, #0
-+        vceq.i32        q14, q14, #0
-+        vceq.i32        q15, q15, #0
-+        @ Drop through...
-+2:        vmov            q8, q11
-+          vld1.8          {q9}, [r0]!
-+        vorr            q13, q12, q13
-+        vorr            q15, q14, q15
-+          vbic            q12, q8, q0
-+        vorr            q3, q13, q15
-+          vext.8          q13, q8, q9, #1
-+          vext.8          q14, q8, q9, #2
-+          vext.8          q15, q8, q9, #3
-+          veor            q12, q12, q1
-+        vorr            d6, d6, d7
-+          vbic            q13, q13, q0
-+          vbic            q14, q14, q0
-+          vbic            q15, q15, q0
-+          vceq.i32        q12, q12, #0
-+        vmov            r3, r12, d6
-+          veor            q13, q13, q1
-+          veor            q14, q14, q1
-+          veor            q15, q15, q1
-+          vceq.i32        q13, q13, #0
-+          vceq.i32        q14, q14, #0
-+          vceq.i32        q15, q15, #0
-+        orrs            r3, r3, r12
-+        bne             90f
-+        vst1.64         {q10}, [r2]!
-+3:          vmov            q10, q9
-+            vld1.8          {q11}, [r0]!
-+          vorr            q13, q12, q13
-+          vorr            q15, q14, q15
-+            vbic            q12, q10, q0
-+          vorr            q3, q13, q15
-+            vext.8          q13, q10, q11, #1
-+            vext.8          q14, q10, q11, #2
-+            vext.8          q15, q10, q11, #3
-+            veor            q12, q12, q1
-+          vorr            d6, d6, d7
-+            vbic            q13, q13, q0
-+            vbic            q14, q14, q0
-+            vbic            q15, q15, q0
-+            vceq.i32        q12, q12, #0
-+          vmov            r3, r12, d6
-+            veor            q13, q13, q1
-+            veor            q14, q14, q1
-+            veor            q15, q15, q1
-+            vceq.i32        q13, q13, #0
-+            vceq.i32        q14, q14, #0
-+            vceq.i32        q15, q15, #0
-+          orrs            r3, r3, r12
-+          bne             91f
-+          vst1.64         {q8}, [r2]!
-+        subs            r1, r1, #32
-+        bpl             2b
-+
-+90:     add             r0, r1, #48
-+        bx              lr
-+
-+91:     sub             r1, r1, #16
-+        b               90b
-+endfunc
---- a/libavcodec/avcodec.h
-+++ b/libavcodec/avcodec.h
-@@ -2567,6 +2567,17 @@ typedef struct AVHWAccel {
-      * that avctx->hwaccel_priv_data is invalid.
-      */
-     int (*frame_params)(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx);
-+
-+    /**
-+     * Called if parsing fails
-+     *
-+     * An error has occured, end_frame will not be called
-+     * start_frame & decode_slice may or may not have been called
-+     * Optional
-+     *
-+     * @param avctx the codec context
-+     */
-+    void (*abort_frame)(AVCodecContext *avctx);
- } AVHWAccel;
- 
- /**
---- a/libavcodec/cabac.h
-+++ b/libavcodec/cabac.h
-@@ -43,7 +43,14 @@ extern const uint8_t ff_h264_cabac_table
- typedef struct CABACContext{
-     int low;
-     int range;
--    int outstanding_count;
-+    union
-+    {
-+        int outstanding_count;
-+        struct {
-+            uint16_t bits;
-+            uint16_t range;
-+        } by22;
-+    };
-     const uint8_t *bytestream_start;
-     const uint8_t *bytestream;
-     const uint8_t *bytestream_end;
---- a/libavcodec/codec.h
-+++ b/libavcodec/codec.h
-@@ -350,6 +350,17 @@ const AVCodec *av_codec_iterate(void **o
- AVCodec *avcodec_find_decoder(enum AVCodecID id);
- 
- /**
-+ * Find a registered decoder with a matching codec ID and pix_fmt.
-+ * A decoder will pix_fmt set to NULL will match any fmt.
-+ * A fmt of AV_PIX_FMT_NONE will only match a decoder will px_fmt NULL.
-+ *
-+ * @param id AVCodecID of the requested decoder
-+ * @param fmt AVPixelForma that msut be supported by decoder
-+ * @return A decoder if one was found, NULL otherwise.
-+ */
-+AVCodec *avcodec_find_decoder_by_id_and_fmt(enum AVCodecID id, enum AVPixelFormat fmt);
-+
-+/**
-  * Find a registered decoder with the specified name.
-  *
-  * @param name name of the requested decoder
---- /dev/null
-+++ b/libavcodec/hevc-ctrls-v1.h
-@@ -0,0 +1,229 @@
-+/* SPDX-License-Identifier: GPL-2.0 */
-+/*
-+ * These are the HEVC state controls for use with stateless HEVC
-+ * codec drivers.
-+ *
-+ * It turns out that these structs are not stable yet and will undergo
-+ * more changes. So keep them private until they are stable and ready to
-+ * become part of the official public API.
-+ */
-+
-+#ifndef _HEVC_CTRLS_H_
-+#define _HEVC_CTRLS_H_
-+
-+#include <linux/videodev2.h>
-+
-+/* The pixel format isn't stable at the moment and will likely be renamed. */
-+#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */
-+
-+#define V4L2_CID_MPEG_VIDEO_HEVC_SPS		(V4L2_CID_MPEG_BASE + 1008)
-+#define V4L2_CID_MPEG_VIDEO_HEVC_PPS		(V4L2_CID_MPEG_BASE + 1009)
-+#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS	(V4L2_CID_MPEG_BASE + 1010)
-+#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX	(V4L2_CID_MPEG_BASE + 1011)
-+#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE	(V4L2_CID_MPEG_BASE + 1015)
-+#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE	(V4L2_CID_MPEG_BASE + 1016)
-+
-+/* enum v4l2_ctrl_type type values */
-+#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120
-+#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121
-+#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122
-+#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123
-+
-+enum v4l2_mpeg_video_hevc_decode_mode {
-+	V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED,
-+	V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED,
-+};
-+
-+enum v4l2_mpeg_video_hevc_start_code {
-+	V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE,
-+	V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B,
-+};
-+
-+#define V4L2_HEVC_SLICE_TYPE_B	0
-+#define V4L2_HEVC_SLICE_TYPE_P	1
-+#define V4L2_HEVC_SLICE_TYPE_I	2
-+
-+#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE		(1ULL << 0)
-+#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED			(1ULL << 1)
-+#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED				(1ULL << 2)
-+#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET		(1ULL << 3)
-+#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED				(1ULL << 4)
-+#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED		(1ULL << 5)
-+#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT		(1ULL << 6)
-+#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED		(1ULL << 7)
-+#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED	(1ULL << 8)
-+
-+/* The controls are not stable at the moment and will likely be reworked. */
-+struct v4l2_ctrl_hevc_sps {
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */
-+	__u16	pic_width_in_luma_samples;
-+	__u16	pic_height_in_luma_samples;
-+	__u8	bit_depth_luma_minus8;
-+	__u8	bit_depth_chroma_minus8;
-+	__u8	log2_max_pic_order_cnt_lsb_minus4;
-+	__u8	sps_max_dec_pic_buffering_minus1;
-+	__u8	sps_max_num_reorder_pics;
-+	__u8	sps_max_latency_increase_plus1;
-+	__u8	log2_min_luma_coding_block_size_minus3;
-+	__u8	log2_diff_max_min_luma_coding_block_size;
-+	__u8	log2_min_luma_transform_block_size_minus2;
-+	__u8	log2_diff_max_min_luma_transform_block_size;
-+	__u8	max_transform_hierarchy_depth_inter;
-+	__u8	max_transform_hierarchy_depth_intra;
-+	__u8	pcm_sample_bit_depth_luma_minus1;
-+	__u8	pcm_sample_bit_depth_chroma_minus1;
-+	__u8	log2_min_pcm_luma_coding_block_size_minus3;
-+	__u8	log2_diff_max_min_pcm_luma_coding_block_size;
-+	__u8	num_short_term_ref_pic_sets;
-+	__u8	num_long_term_ref_pics_sps;
-+	__u8	chroma_format_idc;
-+	__u8	sps_max_sub_layers_minus1;
-+
-+	__u64	flags;
-+};
-+
-+#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT		(1ULL << 0)
-+#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT			(1ULL << 1)
-+#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED		(1ULL << 2)
-+#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT			(1ULL << 3)
-+#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED		(1ULL << 4)
-+#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED		(1ULL << 5)
-+#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED			(1ULL << 6)
-+#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT	(1ULL << 7)
-+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED			(1ULL << 8)
-+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED			(1ULL << 9)
-+#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED		(1ULL << 10)
-+#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED			(1ULL << 11)
-+#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED		(1ULL << 12)
-+#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED	(1ULL << 13)
-+#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14)
-+#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED	(1ULL << 15)
-+#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER	(1ULL << 16)
-+#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT		(1ULL << 17)
-+#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18)
-+
-+struct v4l2_ctrl_hevc_pps {
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */
-+	__u8	num_extra_slice_header_bits;
-+	__s8	init_qp_minus26;
-+	__u8	diff_cu_qp_delta_depth;
-+	__s8	pps_cb_qp_offset;
-+	__s8	pps_cr_qp_offset;
-+	__u8	num_tile_columns_minus1;
-+	__u8	num_tile_rows_minus1;
-+	__u8	column_width_minus1[20];
-+	__u8	row_height_minus1[22];
-+	__s8	pps_beta_offset_div2;
-+	__s8	pps_tc_offset_div2;
-+	__u8	log2_parallel_merge_level_minus2;
-+
-+	__u8	padding[4];
-+	__u64	flags;
-+};
-+
-+#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE	0x01
-+#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER	0x02
-+#define V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR		0x03
-+
-+#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX		16
-+
-+struct v4l2_hevc_dpb_entry {
-+	__u64	timestamp;
-+	__u8	rps;
-+	__u8	field_pic;
-+	__u16	pic_order_cnt[2];
-+	__u8	padding[2];
-+};
-+
-+struct v4l2_hevc_pred_weight_table {
-+	__s8	delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__s8	luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__s8	delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
-+	__s8	chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
-+
-+	__s8	delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__s8	luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__s8	delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
-+	__s8	chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
-+
-+	__u8	padding[6];
-+
-+	__u8	luma_log2_weight_denom;
-+	__s8	delta_chroma_log2_weight_denom;
-+};
-+
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA		(1ULL << 0)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA		(1ULL << 1)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED	(1ULL << 2)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO			(1ULL << 3)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT			(1ULL << 4)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0		(1ULL << 5)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV		(1ULL << 6)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT		(1ULL << 9)
-+
-+struct v4l2_ctrl_hevc_slice_params {
-+	__u32	bit_size;
-+	__u32	data_bit_offset;
-+
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
-+	__u32	slice_segment_addr;
-+	__u32	num_entry_point_offsets;
-+
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */
-+	__u8	nal_unit_type;
-+	__u8	nuh_temporal_id_plus1;
-+
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
-+	__u8	slice_type;
-+	__u8	colour_plane_id;
-+	__u16	slice_pic_order_cnt;
-+	__u8	num_ref_idx_l0_active_minus1;
-+	__u8	num_ref_idx_l1_active_minus1;
-+	__u8	collocated_ref_idx;
-+	__u8	five_minus_max_num_merge_cand;
-+	__s8	slice_qp_delta;
-+	__s8	slice_cb_qp_offset;
-+	__s8	slice_cr_qp_offset;
-+	__s8	slice_act_y_qp_offset;
-+	__s8	slice_act_cb_qp_offset;
-+	__s8	slice_act_cr_qp_offset;
-+	__s8	slice_beta_offset_div2;
-+	__s8	slice_tc_offset_div2;
-+
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */
-+	__u8	pic_struct;
-+
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
-+	__u8	num_active_dpb_entries;
-+	__u8	ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__u8	ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+
-+	__u8	num_rps_poc_st_curr_before;
-+	__u8	num_rps_poc_st_curr_after;
-+	__u8	num_rps_poc_lt_curr;
-+
-+	__u8	padding;
-+
-+	__u32	entry_point_offset_minus1[256];
-+
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
-+	struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */
-+	struct v4l2_hevc_pred_weight_table pred_weight_table;
-+
-+	__u64	flags;
-+};
-+
-+struct v4l2_ctrl_hevc_scaling_matrix {
-+	__u8	scaling_list_4x4[6][16];
-+	__u8	scaling_list_8x8[6][64];
-+	__u8	scaling_list_16x16[6][64];
-+	__u8	scaling_list_32x32[2][64];
-+	__u8	scaling_list_dc_coef_16x16[6];
-+	__u8	scaling_list_dc_coef_32x32[2];
-+};
-+
-+#endif
---- /dev/null
-+++ b/libavcodec/hevc-ctrls-v2.h
-@@ -0,0 +1,257 @@
-+/* SPDX-License-Identifier: GPL-2.0 */
-+/*
-+ * These are the HEVC state controls for use with stateless HEVC
-+ * codec drivers.
-+ *
-+ * It turns out that these structs are not stable yet and will undergo
-+ * more changes. So keep them private until they are stable and ready to
-+ * become part of the official public API.
-+ */
-+
-+#ifndef _HEVC_CTRLS_H_
-+#define _HEVC_CTRLS_H_
-+
-+#include <linux/videodev2.h>
-+
-+/* The pixel format isn't stable at the moment and will likely be renamed. */
-+#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */
-+
-+#define V4L2_CID_MPEG_VIDEO_HEVC_SPS		(V4L2_CID_CODEC_BASE + 1008)
-+#define V4L2_CID_MPEG_VIDEO_HEVC_PPS		(V4L2_CID_CODEC_BASE + 1009)
-+#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS	(V4L2_CID_CODEC_BASE + 1010)
-+#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX	(V4L2_CID_CODEC_BASE + 1011)
-+#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS	(V4L2_CID_CODEC_BASE + 1012)
-+#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE	(V4L2_CID_CODEC_BASE + 1015)
-+#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE	(V4L2_CID_CODEC_BASE + 1016)
-+
-+/* enum v4l2_ctrl_type type values */
-+#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120
-+#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121
-+#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122
-+#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123
-+#define V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS 0x0124
-+
-+enum v4l2_mpeg_video_hevc_decode_mode {
-+	V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED,
-+	V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED,
-+};
-+
-+enum v4l2_mpeg_video_hevc_start_code {
-+	V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE,
-+	V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B,
-+};
-+
-+#define V4L2_HEVC_SLICE_TYPE_B	0
-+#define V4L2_HEVC_SLICE_TYPE_P	1
-+#define V4L2_HEVC_SLICE_TYPE_I	2
-+
-+#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE		(1ULL << 0)
-+#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED			(1ULL << 1)
-+#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED				(1ULL << 2)
-+#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET		(1ULL << 3)
-+#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED				(1ULL << 4)
-+#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED		(1ULL << 5)
-+#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT		(1ULL << 6)
-+#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED		(1ULL << 7)
-+#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED	(1ULL << 8)
-+
-+/* The controls are not stable at the moment and will likely be reworked. */
-+struct v4l2_ctrl_hevc_sps {
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */
-+	__u16	pic_width_in_luma_samples;
-+	__u16	pic_height_in_luma_samples;
-+	__u8	bit_depth_luma_minus8;
-+	__u8	bit_depth_chroma_minus8;
-+	__u8	log2_max_pic_order_cnt_lsb_minus4;
-+	__u8	sps_max_dec_pic_buffering_minus1;
-+	__u8	sps_max_num_reorder_pics;
-+	__u8	sps_max_latency_increase_plus1;
-+	__u8	log2_min_luma_coding_block_size_minus3;
-+	__u8	log2_diff_max_min_luma_coding_block_size;
-+	__u8	log2_min_luma_transform_block_size_minus2;
-+	__u8	log2_diff_max_min_luma_transform_block_size;
-+	__u8	max_transform_hierarchy_depth_inter;
-+	__u8	max_transform_hierarchy_depth_intra;
-+	__u8	pcm_sample_bit_depth_luma_minus1;
-+	__u8	pcm_sample_bit_depth_chroma_minus1;
-+	__u8	log2_min_pcm_luma_coding_block_size_minus3;
-+	__u8	log2_diff_max_min_pcm_luma_coding_block_size;
-+	__u8	num_short_term_ref_pic_sets;
-+	__u8	num_long_term_ref_pics_sps;
-+	__u8	chroma_format_idc;
-+	__u8	sps_max_sub_layers_minus1;
-+
-+	__u64	flags;
-+};
-+
-+#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED	(1ULL << 0)
-+#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT			(1ULL << 1)
-+#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED		(1ULL << 2)
-+#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT			(1ULL << 3)
-+#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED		(1ULL << 4)
-+#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED		(1ULL << 5)
-+#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED			(1ULL << 6)
-+#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT	(1ULL << 7)
-+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED			(1ULL << 8)
-+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED			(1ULL << 9)
-+#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED		(1ULL << 10)
-+#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED			(1ULL << 11)
-+#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED		(1ULL << 12)
-+#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED	(1ULL << 13)
-+#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14)
-+#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED	(1ULL << 15)
-+#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER	(1ULL << 16)
-+#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT		(1ULL << 17)
-+#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18)
-+#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT	(1ULL << 19)
-+#define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING			(1ULL << 20)
-+
-+struct v4l2_ctrl_hevc_pps {
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */
-+	__u8	num_extra_slice_header_bits;
-+	__u8	num_ref_idx_l0_default_active_minus1;
-+	__u8	num_ref_idx_l1_default_active_minus1;
-+	__s8	init_qp_minus26;
-+	__u8	diff_cu_qp_delta_depth;
-+	__s8	pps_cb_qp_offset;
-+	__s8	pps_cr_qp_offset;
-+	__u8	num_tile_columns_minus1;
-+	__u8	num_tile_rows_minus1;
-+	__u8	column_width_minus1[20];
-+	__u8	row_height_minus1[22];
-+	__s8	pps_beta_offset_div2;
-+	__s8	pps_tc_offset_div2;
-+	__u8	log2_parallel_merge_level_minus2;
-+
-+	__u8	padding[4];
-+	__u64	flags;
-+};
-+
-+#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE	0x01
-+#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER	0x02
-+#define V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR		0x03
-+
-+#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX		16
-+
-+struct v4l2_hevc_dpb_entry {
-+	__u64	timestamp;
-+	__u8	rps;
-+	__u8	field_pic;
-+	__u16	pic_order_cnt[2];
-+	__u8	padding[2];
-+};
-+
-+struct v4l2_hevc_pred_weight_table {
-+	__s8	delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__s8	luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__s8	delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
-+	__s8	chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
-+
-+	__s8	delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__s8	luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__s8	delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
-+	__s8	chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
-+
-+	__u8	padding[6];
-+
-+	__u8	luma_log2_weight_denom;
-+	__s8	delta_chroma_log2_weight_denom;
-+};
-+
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA		(1ULL << 0)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA		(1ULL << 1)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED	(1ULL << 2)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO			(1ULL << 3)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT			(1ULL << 4)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0		(1ULL << 5)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV		(1ULL << 6)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT	(1ULL << 9)
-+
-+struct v4l2_ctrl_hevc_slice_params {
-+	__u32	bit_size;
-+	__u32	data_bit_offset;
-+
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
-+	__u32	slice_segment_addr;
-+	__u32	num_entry_point_offsets;
-+
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */
-+	__u8	nal_unit_type;
-+	__u8	nuh_temporal_id_plus1;
-+
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
-+	__u8	slice_type;
-+	__u8	colour_plane_id;
-+	__u16	slice_pic_order_cnt;
-+	__u8	num_ref_idx_l0_active_minus1;
-+	__u8	num_ref_idx_l1_active_minus1;
-+	__u8	collocated_ref_idx;
-+	__u8	five_minus_max_num_merge_cand;
-+	__s8	slice_qp_delta;
-+	__s8	slice_cb_qp_offset;
-+	__s8	slice_cr_qp_offset;
-+	__s8	slice_act_y_qp_offset;
-+	__s8	slice_act_cb_qp_offset;
-+	__s8	slice_act_cr_qp_offset;
-+	__s8	slice_beta_offset_div2;
-+	__s8	slice_tc_offset_div2;
-+
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */
-+	__u8	pic_struct;
-+
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
-+	__u8	ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__u8	ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+
-+	__u8	padding[5];
-+
-+	__u32	entry_point_offset_minus1[256];
-+
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */
-+	struct v4l2_hevc_pred_weight_table pred_weight_table;
-+
-+	__u64	flags;
-+};
-+
-+#define V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC		0x1
-+#define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC		0x2
-+#define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR  0x4
-+
-+struct v4l2_ctrl_hevc_decode_params {
-+	__s32	pic_order_cnt_val;
-+	__u8	num_active_dpb_entries;
-+	struct	v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__u8	num_poc_st_curr_before;
-+	__u8	num_poc_st_curr_after;
-+	__u8	num_poc_lt_curr;
-+	__u8	poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__u8	poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__u8	poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__u64	flags;
-+};
-+
-+/*  MPEG-class control IDs specific to the Hantro driver as defined by V4L2 */
-+#define V4L2_CID_CODEC_HANTRO_BASE				(V4L2_CTRL_CLASS_CODEC | 0x1200)
-+/*
-+ * V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP -
-+ * the number of data (in bits) to skip in the
-+ * slice segment header.
-+ * If non-IDR, the bits to be skipped go from syntax element "pic_output_flag"
-+ * to before syntax element "slice_temporal_mvp_enabled_flag".
-+ * If IDR, the skipped bits are just "pic_output_flag"
-+ * (separate_colour_plane_flag is not supported).
-+ */
-+#define V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP	(V4L2_CID_CODEC_HANTRO_BASE + 0)
-+
-+struct v4l2_ctrl_hevc_scaling_matrix {
-+	__u8	scaling_list_4x4[6][16];
-+	__u8	scaling_list_8x8[6][64];
-+	__u8	scaling_list_16x16[6][64];
-+	__u8	scaling_list_32x32[2][64];
-+	__u8	scaling_list_dc_coef_16x16[6];
-+	__u8	scaling_list_dc_coef_32x32[2];
-+};
-+
-+#endif
---- /dev/null
-+++ b/libavcodec/hevc-ctrls-v3.h
-@@ -0,0 +1,255 @@
-+/* SPDX-License-Identifier: GPL-2.0 */
-+/*
-+ * These are the HEVC state controls for use with stateless HEVC
-+ * codec drivers.
-+ *
-+ * It turns out that these structs are not stable yet and will undergo
-+ * more changes. So keep them private until they are stable and ready to
-+ * become part of the official public API.
-+ */
-+
-+#ifndef _HEVC_CTRLS_H_
-+#define _HEVC_CTRLS_H_
-+
-+#include <linux/videodev2.h>
-+
-+/* The pixel format isn't stable at the moment and will likely be renamed. */
-+#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */
-+
-+#define V4L2_CID_MPEG_VIDEO_HEVC_SPS		(V4L2_CID_CODEC_BASE + 1008)
-+#define V4L2_CID_MPEG_VIDEO_HEVC_PPS		(V4L2_CID_CODEC_BASE + 1009)
-+#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS	(V4L2_CID_CODEC_BASE + 1010)
-+#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX	(V4L2_CID_CODEC_BASE + 1011)
-+#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS	(V4L2_CID_CODEC_BASE + 1012)
-+#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE	(V4L2_CID_CODEC_BASE + 1015)
-+#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE	(V4L2_CID_CODEC_BASE + 1016)
-+
-+/* enum v4l2_ctrl_type type values */
-+#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120
-+#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121
-+#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122
-+#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123
-+#define V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS 0x0124
-+
-+enum v4l2_mpeg_video_hevc_decode_mode {
-+	V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED,
-+	V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED,
-+};
-+
-+enum v4l2_mpeg_video_hevc_start_code {
-+	V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE,
-+	V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B,
-+};
-+
-+#define V4L2_HEVC_SLICE_TYPE_B	0
-+#define V4L2_HEVC_SLICE_TYPE_P	1
-+#define V4L2_HEVC_SLICE_TYPE_I	2
-+
-+#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE		(1ULL << 0)
-+#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED			(1ULL << 1)
-+#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED				(1ULL << 2)
-+#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET		(1ULL << 3)
-+#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED				(1ULL << 4)
-+#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED		(1ULL << 5)
-+#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT		(1ULL << 6)
-+#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED		(1ULL << 7)
-+#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED	(1ULL << 8)
-+
-+/* The controls are not stable at the moment and will likely be reworked. */
-+struct v4l2_ctrl_hevc_sps {
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */
-+	__u16	pic_width_in_luma_samples;
-+	__u16	pic_height_in_luma_samples;
-+	__u8	bit_depth_luma_minus8;
-+	__u8	bit_depth_chroma_minus8;
-+	__u8	log2_max_pic_order_cnt_lsb_minus4;
-+	__u8	sps_max_dec_pic_buffering_minus1;
-+	__u8	sps_max_num_reorder_pics;
-+	__u8	sps_max_latency_increase_plus1;
-+	__u8	log2_min_luma_coding_block_size_minus3;
-+	__u8	log2_diff_max_min_luma_coding_block_size;
-+	__u8	log2_min_luma_transform_block_size_minus2;
-+	__u8	log2_diff_max_min_luma_transform_block_size;
-+	__u8	max_transform_hierarchy_depth_inter;
-+	__u8	max_transform_hierarchy_depth_intra;
-+	__u8	pcm_sample_bit_depth_luma_minus1;
-+	__u8	pcm_sample_bit_depth_chroma_minus1;
-+	__u8	log2_min_pcm_luma_coding_block_size_minus3;
-+	__u8	log2_diff_max_min_pcm_luma_coding_block_size;
-+	__u8	num_short_term_ref_pic_sets;
-+	__u8	num_long_term_ref_pics_sps;
-+	__u8	chroma_format_idc;
-+	__u8	sps_max_sub_layers_minus1;
-+
-+	__u64	flags;
-+};
-+
-+#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED	(1ULL << 0)
-+#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT			(1ULL << 1)
-+#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED		(1ULL << 2)
-+#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT			(1ULL << 3)
-+#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED		(1ULL << 4)
-+#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED		(1ULL << 5)
-+#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED			(1ULL << 6)
-+#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT	(1ULL << 7)
-+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED			(1ULL << 8)
-+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED			(1ULL << 9)
-+#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED		(1ULL << 10)
-+#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED			(1ULL << 11)
-+#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED		(1ULL << 12)
-+#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED	(1ULL << 13)
-+#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14)
-+#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED	(1ULL << 15)
-+#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER	(1ULL << 16)
-+#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT		(1ULL << 17)
-+#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18)
-+#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT	(1ULL << 19)
-+#define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING			(1ULL << 20)
-+
-+struct v4l2_ctrl_hevc_pps {
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */
-+	__u8	num_extra_slice_header_bits;
-+	__u8	num_ref_idx_l0_default_active_minus1;
-+	__u8	num_ref_idx_l1_default_active_minus1;
-+	__s8	init_qp_minus26;
-+	__u8	diff_cu_qp_delta_depth;
-+	__s8	pps_cb_qp_offset;
-+	__s8	pps_cr_qp_offset;
-+	__u8	num_tile_columns_minus1;
-+	__u8	num_tile_rows_minus1;
-+	__u8	column_width_minus1[20];
-+	__u8	row_height_minus1[22];
-+	__s8	pps_beta_offset_div2;
-+	__s8	pps_tc_offset_div2;
-+	__u8	log2_parallel_merge_level_minus2;
-+
-+	__u8	padding[4];
-+	__u64	flags;
-+};
-+
-+#define V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE	0x01
-+
-+#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX		16
-+
-+struct v4l2_hevc_dpb_entry {
-+	__u64	timestamp;
-+	__u8	flags;
-+	__u8	field_pic;
-+	__u16	pic_order_cnt[2];
-+	__u8	padding[2];
-+};
-+
-+struct v4l2_hevc_pred_weight_table {
-+	__s8	delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__s8	luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__s8	delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
-+	__s8	chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
-+
-+	__s8	delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__s8	luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__s8	delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
-+	__s8	chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
-+
-+	__u8	padding[6];
-+
-+	__u8	luma_log2_weight_denom;
-+	__s8	delta_chroma_log2_weight_denom;
-+};
-+
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA		(1ULL << 0)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA		(1ULL << 1)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED	(1ULL << 2)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO			(1ULL << 3)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT			(1ULL << 4)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0		(1ULL << 5)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV		(1ULL << 6)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT	(1ULL << 9)
-+
-+struct v4l2_ctrl_hevc_slice_params {
-+	__u32	bit_size;
-+	__u32	data_bit_offset;
-+
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
-+	__u32	slice_segment_addr;
-+	__u32	num_entry_point_offsets;
-+
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */
-+	__u8	nal_unit_type;
-+	__u8	nuh_temporal_id_plus1;
-+
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
-+	__u8	slice_type;
-+	__u8	colour_plane_id;
-+	__u16	slice_pic_order_cnt;
-+	__u8	num_ref_idx_l0_active_minus1;
-+	__u8	num_ref_idx_l1_active_minus1;
-+	__u8	collocated_ref_idx;
-+	__u8	five_minus_max_num_merge_cand;
-+	__s8	slice_qp_delta;
-+	__s8	slice_cb_qp_offset;
-+	__s8	slice_cr_qp_offset;
-+	__s8	slice_act_y_qp_offset;
-+	__s8	slice_act_cb_qp_offset;
-+	__s8	slice_act_cr_qp_offset;
-+	__s8	slice_beta_offset_div2;
-+	__s8	slice_tc_offset_div2;
-+
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */
-+	__u8	pic_struct;
-+
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
-+	__u8	ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__u8	ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+
-+	__u8	padding[5];
-+
-+	__u32	entry_point_offset_minus1[256];
-+
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */
-+	struct v4l2_hevc_pred_weight_table pred_weight_table;
-+
-+	__u64	flags;
-+};
-+
-+#define V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC		0x1
-+#define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC		0x2
-+#define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR  0x4
-+
-+struct v4l2_ctrl_hevc_decode_params {
-+	__s32	pic_order_cnt_val;
-+	__u8	num_active_dpb_entries;
-+	struct	v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__u8	num_poc_st_curr_before;
-+	__u8	num_poc_st_curr_after;
-+	__u8	num_poc_lt_curr;
-+	__u8	poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__u8	poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__u8	poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__u64	flags;
-+};
-+
-+struct v4l2_ctrl_hevc_scaling_matrix {
-+	__u8	scaling_list_4x4[6][16];
-+	__u8	scaling_list_8x8[6][64];
-+	__u8	scaling_list_16x16[6][64];
-+	__u8	scaling_list_32x32[2][64];
-+	__u8	scaling_list_dc_coef_16x16[6];
-+	__u8	scaling_list_dc_coef_32x32[2];
-+};
-+
-+/*  MPEG-class control IDs specific to the Hantro driver as defined by V4L2 */
-+#define V4L2_CID_CODEC_HANTRO_BASE				(V4L2_CTRL_CLASS_CODEC | 0x1200)
-+/*
-+ * V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP -
-+ * the number of data (in bits) to skip in the
-+ * slice segment header.
-+ * If non-IDR, the bits to be skipped go from syntax element "pic_output_flag"
-+ * to before syntax element "slice_temporal_mvp_enabled_flag".
-+ * If IDR, the skipped bits are just "pic_output_flag"
-+ * (separate_colour_plane_flag is not supported).
-+ */
-+#define V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP	(V4L2_CID_CODEC_HANTRO_BASE + 0)
-+
-+#endif
---- a/libavcodec/hevc_parser.c
-+++ b/libavcodec/hevc_parser.c
-@@ -98,6 +98,19 @@ static int hevc_parse_slice_header(AVCod
-     avctx->profile  = ps->sps->ptl.general_ptl.profile_idc;
-     avctx->level    = ps->sps->ptl.general_ptl.level_idc;
- 
-+    if (ps->sps->chroma_format_idc == 1) {
-+        avctx->chroma_sample_location = ps->sps->vui.chroma_loc_info_present_flag ?
-+            ps->sps->vui.chroma_sample_loc_type_top_field + 1 :
-+            AVCHROMA_LOC_LEFT;
-+    }
-+    else if (ps->sps->chroma_format_idc == 2 ||
-+             ps->sps->chroma_format_idc == 3) {
-+        avctx->chroma_sample_location = AVCHROMA_LOC_TOPLEFT;;
-+    }
-+    else {
-+        avctx->chroma_sample_location = AVCHROMA_LOC_UNSPECIFIED;
-+    }
-+
-     if (ps->vps->vps_timing_info_present_flag) {
-         num = ps->vps->vps_num_units_in_tick;
-         den = ps->vps->vps_time_scale;
---- a/libavcodec/hevc_refs.c
-+++ b/libavcodec/hevc_refs.c
-@@ -96,18 +96,22 @@ static HEVCFrame *alloc_frame(HEVCContex
-         if (!frame->rpl_buf)
-             goto fail;
- 
--        frame->tab_mvf_buf = av_buffer_pool_get(s->tab_mvf_pool);
--        if (!frame->tab_mvf_buf)
--            goto fail;
--        frame->tab_mvf = (MvField *)frame->tab_mvf_buf->data;
-+        if (s->tab_mvf_pool) {
-+            frame->tab_mvf_buf = av_buffer_pool_get(s->tab_mvf_pool);
-+            if (!frame->tab_mvf_buf)
-+                goto fail;
-+            frame->tab_mvf = (MvField *)frame->tab_mvf_buf->data;
-+        }
- 
--        frame->rpl_tab_buf = av_buffer_pool_get(s->rpl_tab_pool);
--        if (!frame->rpl_tab_buf)
--            goto fail;
--        frame->rpl_tab   = (RefPicListTab **)frame->rpl_tab_buf->data;
--        frame->ctb_count = s->ps.sps->ctb_width * s->ps.sps->ctb_height;
--        for (j = 0; j < frame->ctb_count; j++)
--            frame->rpl_tab[j] = (RefPicListTab *)frame->rpl_buf->data;
-+        if (s->rpl_tab_pool) {
-+            frame->rpl_tab_buf = av_buffer_pool_get(s->rpl_tab_pool);
-+            if (!frame->rpl_tab_buf)
-+                goto fail;
-+            frame->rpl_tab   = (RefPicListTab **)frame->rpl_tab_buf->data;
-+            frame->ctb_count = s->ps.sps->ctb_width * s->ps.sps->ctb_height;
-+            for (j = 0; j < frame->ctb_count; j++)
-+                frame->rpl_tab[j] = (RefPicListTab *)frame->rpl_buf->data;
-+        }
- 
-         frame->frame->top_field_first  = s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD;
-         frame->frame->interlaced_frame = (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD) || (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_BOTTOM_FIELD);
-@@ -276,14 +280,17 @@ static int init_slice_rpl(HEVCContext *s
-     int ctb_count    = frame->ctb_count;
-     int ctb_addr_ts  = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr];
-     int i;
-+    RefPicListTab * const tab = (RefPicListTab *)frame->rpl_buf->data + s->slice_idx;
- 
-     if (s->slice_idx >= frame->rpl_buf->size / sizeof(RefPicListTab))
-         return AVERROR_INVALIDDATA;
- 
--    for (i = ctb_addr_ts; i < ctb_count; i++)
--        frame->rpl_tab[i] = (RefPicListTab *)frame->rpl_buf->data + s->slice_idx;
-+    if (frame->rpl_tab) {
-+        for (i = ctb_addr_ts; i < ctb_count; i++)
-+            frame->rpl_tab[i] = tab;
-+    }
- 
--    frame->refPicList = (RefPicList *)frame->rpl_tab[ctb_addr_ts];
-+    frame->refPicList = tab->refPicList;
- 
-     return 0;
- }
---- a/libavcodec/hevcdec.c
-+++ b/libavcodec/hevcdec.c
-@@ -332,6 +332,19 @@ static void export_stream_params(HEVCCon
- 
-     ff_set_sar(avctx, sps->vui.sar);
- 
-+    if (sps->chroma_format_idc == 1) {
-+        avctx->chroma_sample_location = sps->vui.chroma_loc_info_present_flag ?
-+            sps->vui.chroma_sample_loc_type_top_field + 1 :
-+            AVCHROMA_LOC_LEFT;
-+    }
-+    else if (sps->chroma_format_idc == 2 ||
-+             sps->chroma_format_idc == 3) {
-+        avctx->chroma_sample_location = AVCHROMA_LOC_TOPLEFT;;
-+    }
-+    else {
-+        avctx->chroma_sample_location = AVCHROMA_LOC_UNSPECIFIED;
-+    }
-+
-     if (sps->vui.video_signal_type_present_flag)
-         avctx->color_range = sps->vui.video_full_range_flag ? AVCOL_RANGE_JPEG
-                                                             : AVCOL_RANGE_MPEG;
-@@ -372,14 +385,20 @@ static enum AVPixelFormat get_format(HEV
- #define HWACCEL_MAX (CONFIG_HEVC_DXVA2_HWACCEL + \
-                      CONFIG_HEVC_D3D11VA_HWACCEL * 2 + \
-                      CONFIG_HEVC_NVDEC_HWACCEL + \
-+                     CONFIG_HEVC_V4L2REQUEST_HWACCEL + \
-                      CONFIG_HEVC_VAAPI_HWACCEL + \
-                      CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL + \
-+                     CONFIG_HEVC_RPI4_8_HWACCEL + \
-+                     CONFIG_HEVC_RPI4_10_HWACCEL + \
-                      CONFIG_HEVC_VDPAU_HWACCEL)
-     enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmt = pix_fmts;
- 
-     switch (sps->pix_fmt) {
-     case AV_PIX_FMT_YUV420P:
-     case AV_PIX_FMT_YUVJ420P:
-+#if CONFIG_HEVC_RPI4_8_HWACCEL
-+        *fmt++ = AV_PIX_FMT_RPI4_8;
-+#endif
- #if CONFIG_HEVC_DXVA2_HWACCEL
-         *fmt++ = AV_PIX_FMT_DXVA2_VLD;
- #endif
-@@ -399,8 +418,14 @@ static enum AVPixelFormat get_format(HEV
- #if CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL
-         *fmt++ = AV_PIX_FMT_VIDEOTOOLBOX;
- #endif
-+#if CONFIG_HEVC_V4L2REQUEST_HWACCEL
-+        *fmt++ = AV_PIX_FMT_DRM_PRIME;
-+#endif
-         break;
-     case AV_PIX_FMT_YUV420P10:
-+#if CONFIG_HEVC_RPI4_10_HWACCEL
-+        *fmt++ = AV_PIX_FMT_RPI4_10;
-+#endif
- #if CONFIG_HEVC_DXVA2_HWACCEL
-         *fmt++ = AV_PIX_FMT_DXVA2_VLD;
- #endif
-@@ -417,6 +442,9 @@ static enum AVPixelFormat get_format(HEV
- #if CONFIG_HEVC_NVDEC_HWACCEL
-         *fmt++ = AV_PIX_FMT_CUDA;
- #endif
-+#if CONFIG_HEVC_V4L2REQUEST_HWACCEL
-+        *fmt++ = AV_PIX_FMT_DRM_PRIME;
-+#endif
-         break;
-     case AV_PIX_FMT_YUV444P:
- #if CONFIG_HEVC_VDPAU_HWACCEL
-@@ -459,6 +487,16 @@ static int set_sps(HEVCContext *s, const
-     if (!sps)
-         return 0;
- 
-+    // If hwaccel then we don't need all the s/w decode helper arrays
-+    if (s->avctx->hwaccel) {
-+        export_stream_params(s, sps);
-+
-+        s->avctx->pix_fmt = pix_fmt;
-+        s->ps.sps = sps;
-+        s->ps.vps = (HEVCVPS*) s->ps.vps_list[s->ps.sps->vps_id]->data;
-+        return 0;
-+    }
-+
-     ret = pic_arrays_init(s, sps);
-     if (ret < 0)
-         goto fail;
-@@ -2809,11 +2847,13 @@ static int hevc_frame_start(HEVCContext
-                            ((s->ps.sps->height >> s->ps.sps->log2_min_cb_size) + 1);
-     int ret;
- 
--    memset(s->horizontal_bs, 0, s->bs_width * s->bs_height);
--    memset(s->vertical_bs,   0, s->bs_width * s->bs_height);
--    memset(s->cbf_luma,      0, s->ps.sps->min_tb_width * s->ps.sps->min_tb_height);
--    memset(s->is_pcm,        0, (s->ps.sps->min_pu_width + 1) * (s->ps.sps->min_pu_height + 1));
--    memset(s->tab_slice_address, -1, pic_size_in_ctb * sizeof(*s->tab_slice_address));
-+    if (s->horizontal_bs) {
-+        memset(s->horizontal_bs, 0, s->bs_width * s->bs_height);
-+        memset(s->vertical_bs,   0, s->bs_width * s->bs_height);
-+        memset(s->cbf_luma,      0, s->ps.sps->min_tb_width * s->ps.sps->min_tb_height);
-+        memset(s->is_pcm,        0, (s->ps.sps->min_pu_width + 1) * (s->ps.sps->min_pu_height + 1));
-+        memset(s->tab_slice_address, -1, pic_size_in_ctb * sizeof(*s->tab_slice_address));
-+    }
- 
-     s->is_decoded        = 0;
-     s->first_nal_type    = s->nal_unit_type;
-@@ -3230,7 +3270,14 @@ static int hevc_decode_frame(AVCodecCont
-     s->ref = NULL;
-     ret    = decode_nal_units(s, avpkt->data, avpkt->size);
-     if (ret < 0)
-+    {
-+        // Ensure that hwaccel knows this frame is over
-+        if (s->avctx->hwaccel && s->avctx->hwaccel->abort_frame) {
-+            s->avctx->hwaccel->abort_frame(s->avctx);
-+        }
-+
-         return ret;
-+    }
- 
-     if (avctx->hwaccel) {
-         if (s->ref && (ret = avctx->hwaccel->end_frame(avctx)) < 0) {
-@@ -3273,15 +3320,19 @@ static int hevc_ref_frame(HEVCContext *s
-     if (ret < 0)
-         return ret;
- 
--    dst->tab_mvf_buf = av_buffer_ref(src->tab_mvf_buf);
--    if (!dst->tab_mvf_buf)
--        goto fail;
--    dst->tab_mvf = src->tab_mvf;
-+    if (src->tab_mvf_buf) {
-+        dst->tab_mvf_buf = av_buffer_ref(src->tab_mvf_buf);
-+        if (!dst->tab_mvf_buf)
-+            goto fail;
-+        dst->tab_mvf = src->tab_mvf;
-+    }
- 
--    dst->rpl_tab_buf = av_buffer_ref(src->rpl_tab_buf);
--    if (!dst->rpl_tab_buf)
--        goto fail;
--    dst->rpl_tab = src->rpl_tab;
-+    if (src->rpl_tab_buf) {
-+        dst->rpl_tab_buf = av_buffer_ref(src->rpl_tab_buf);
-+        if (!dst->rpl_tab_buf)
-+            goto fail;
-+        dst->rpl_tab = src->rpl_tab;
-+    }
- 
-     dst->rpl_buf = av_buffer_ref(src->rpl_buf);
-     if (!dst->rpl_buf)
-@@ -3585,6 +3636,15 @@ AVCodec ff_hevc_decoder = {
- #if CONFIG_HEVC_NVDEC_HWACCEL
-                                HWACCEL_NVDEC(hevc),
- #endif
-+#if CONFIG_HEVC_RPI4_8_HWACCEL
-+                               HWACCEL_RPI4_8(hevc),
-+#endif
-+#if CONFIG_HEVC_RPI4_10_HWACCEL
-+                               HWACCEL_RPI4_10(hevc),
-+#endif
-+#if CONFIG_HEVC_V4L2REQUEST_HWACCEL
-+                               HWACCEL_V4L2REQUEST(hevc),
-+#endif
- #if CONFIG_HEVC_VAAPI_HWACCEL
-                                HWACCEL_VAAPI(hevc),
- #endif
---- a/libavcodec/hwaccels.h
-+++ b/libavcodec/hwaccels.h
-@@ -34,6 +34,9 @@ extern const AVHWAccel ff_hevc_d3d11va_h
- extern const AVHWAccel ff_hevc_d3d11va2_hwaccel;
- extern const AVHWAccel ff_hevc_dxva2_hwaccel;
- extern const AVHWAccel ff_hevc_nvdec_hwaccel;
-+extern const AVHWAccel ff_hevc_rpi4_8_hwaccel;
-+extern const AVHWAccel ff_hevc_rpi4_10_hwaccel;
-+extern const AVHWAccel ff_hevc_v4l2request_hwaccel;
- extern const AVHWAccel ff_hevc_vaapi_hwaccel;
- extern const AVHWAccel ff_hevc_vdpau_hwaccel;
- extern const AVHWAccel ff_hevc_videotoolbox_hwaccel;
---- a/libavcodec/hwconfig.h
-+++ b/libavcodec/hwconfig.h
-@@ -24,6 +24,7 @@
- 
- 
- #define HWACCEL_CAP_ASYNC_SAFE      (1 << 0)
-+#define HWACCEL_CAP_MT_SAFE         (1 << 1)
- 
- 
- typedef struct AVCodecHWConfigInternal {
-@@ -70,6 +71,12 @@ typedef struct AVCodecHWConfigInternal {
-     HW_CONFIG_HWACCEL(1, 1, 0, D3D11,        D3D11VA,      ff_ ## codec ## _d3d11va2_hwaccel)
- #define HWACCEL_NVDEC(codec) \
-     HW_CONFIG_HWACCEL(1, 1, 0, CUDA,         CUDA,         ff_ ## codec ## _nvdec_hwaccel)
-+#define HWACCEL_RPI4_8(codec) \
-+    HW_CONFIG_HWACCEL(0, 0, 1, RPI4_8,       NONE,         ff_ ## codec ## _rpi4_8_hwaccel)
-+#define HWACCEL_RPI4_10(codec) \
-+    HW_CONFIG_HWACCEL(0, 0, 1, RPI4_10,      NONE,         ff_ ## codec ## _rpi4_10_hwaccel)
-+#define HWACCEL_V4L2REQUEST(codec) \
-+    HW_CONFIG_HWACCEL(1, 0, 0, DRM_PRIME,    DRM,          ff_ ## codec ## _v4l2request_hwaccel)
- #define HWACCEL_VAAPI(codec) \
-     HW_CONFIG_HWACCEL(1, 1, 1, VAAPI,        VAAPI,        ff_ ## codec ## _vaapi_hwaccel)
- #define HWACCEL_VDPAU(codec) \
---- a/libavcodec/mmaldec.c
-+++ b/libavcodec/mmaldec.c
-@@ -24,6 +24,9 @@
-  * MMAL Video Decoder
-  */
- 
-+#pragma GCC diagnostic push
-+// Many many redundant decls in the header files
-+#pragma GCC diagnostic ignored "-Wredundant-decls"
- #include <bcm_host.h>
- #include <interface/mmal/mmal.h>
- #include <interface/mmal/mmal_parameters_video.h>
-@@ -31,6 +34,7 @@
- #include <interface/mmal/util/mmal_util_params.h>
- #include <interface/mmal/util/mmal_default_components.h>
- #include <interface/mmal/vc/mmal_vc_api.h>
-+#pragma GCC diagnostic pop
- #include <stdatomic.h>
- 
- #include "avcodec.h"
---- a/libavcodec/pthread_frame.c
-+++ b/libavcodec/pthread_frame.c
-@@ -191,7 +191,8 @@ static attribute_align_arg void *frame_w
- 
-         /* if the previous thread uses hwaccel then we take the lock to ensure
-          * the threads don't run concurrently */
--        if (avctx->hwaccel) {
-+        if (avctx->hwaccel &&
-+            !(avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE)) {
-             pthread_mutex_lock(&p->parent->hwaccel_mutex);
-             p->hwaccel_serializing = 1;
-         }
-@@ -614,7 +615,9 @@ void ff_thread_finish_setup(AVCodecConte
- 
-     if (!(avctx->active_thread_type&FF_THREAD_FRAME)) return;
- 
--    if (avctx->hwaccel && !p->hwaccel_serializing) {
-+    if (avctx->hwaccel &&
-+        !(avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE) &&
-+        !p->hwaccel_serializing) {
-         pthread_mutex_lock(&p->parent->hwaccel_mutex);
-         p->hwaccel_serializing = 1;
-     }
---- a/libavcodec/raw.c
-+++ b/libavcodec/raw.c
-@@ -293,6 +293,12 @@ const PixelFormatTag ff_raw_pix_fmt_tags
-     { AV_PIX_FMT_RGB565LE,MKTAG( 3 ,  0 ,  0 ,  0 ) }, /* flipped RGB565LE */
-     { AV_PIX_FMT_YUV444P, MKTAG('Y', 'V', '2', '4') }, /* YUV444P, swapped UV */
- 
-+    /* RPI (Might as well define for everything) */
-+    { AV_PIX_FMT_SAND128,     MKTAG('S', 'A', 'N', 'D') },
-+    { AV_PIX_FMT_RPI4_8,      MKTAG('S', 'A', 'N', 'D') },
-+    { AV_PIX_FMT_SAND64_10,   MKTAG('S', 'N', 'D', 'A') },
-+    { AV_PIX_FMT_RPI4_10,     MKTAG('S', 'N', 'D', 'B') },
-+
-     { AV_PIX_FMT_NONE, 0 },
- };
- 
---- a/libavcodec/rawenc.c
-+++ b/libavcodec/rawenc.c
-@@ -24,6 +24,7 @@
-  * Raw Video Encoder
-  */
- 
-+#include "config.h"
- #include "avcodec.h"
- #include "raw.h"
- #include "internal.h"
-@@ -31,6 +32,10 @@
- #include "libavutil/intreadwrite.h"
- #include "libavutil/imgutils.h"
- #include "libavutil/internal.h"
-+#include "libavutil/avassert.h"
-+#if CONFIG_SAND
-+#include "libavutil/rpi_sand_fns.h"
-+#endif
- 
- static av_cold int raw_encode_init(AVCodecContext *avctx)
- {
-@@ -49,22 +54,114 @@ FF_ENABLE_DEPRECATION_WARNINGS
-     return 0;
- }
- 
-+#if CONFIG_SAND
-+static int raw_sand8_as_yuv420(AVCodecContext *avctx, AVPacket *pkt,
-+                      const AVFrame *frame)
-+{
-+    const int width = av_frame_cropped_width(frame);
-+    const int height = av_frame_cropped_height(frame);
-+    const int x0 = frame->crop_left;
-+    const int y0 = frame->crop_top;
-+    const int size = width * height * 3 / 2;
-+    uint8_t * dst;
-+    int ret;
-+
-+    if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0)
-+        return ret;
-+
-+    dst = pkt->data;
-+
-+    av_rpi_sand_to_planar_y8(dst, width, frame->data[0], frame->linesize[0], frame->linesize[3], x0, y0, width, height);
-+    dst += width * height;
-+    av_rpi_sand_to_planar_c8(dst, width / 2, dst + width * height / 4, width / 2,
-+                          frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0 / 2, y0 / 2, width / 2, height / 2);
-+    return 0;
-+}
-+
-+static int raw_sand16_as_yuv420(AVCodecContext *avctx, AVPacket *pkt,
-+                      const AVFrame *frame)
-+{
-+    const int width = av_frame_cropped_width(frame);
-+    const int height = av_frame_cropped_height(frame);
-+    const int x0 = frame->crop_left;
-+    const int y0 = frame->crop_top;
-+    const int size = width * height * 3;
-+    uint8_t * dst;
-+    int ret;
-+
-+    if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0)
-+        return ret;
-+
-+    dst = pkt->data;
-+
-+    av_rpi_sand_to_planar_y16(dst, width * 2, frame->data[0], frame->linesize[0], frame->linesize[3], x0 * 2, y0, width * 2, height);
-+    dst += width * height * 2;
-+    av_rpi_sand_to_planar_c16(dst, width, dst + width * height / 2, width,
-+                          frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0, y0 / 2, width, height / 2);
-+    return 0;
-+}
-+
-+static int raw_sand30_as_yuv420(AVCodecContext *avctx, AVPacket *pkt,
-+                      const AVFrame *frame)
-+{
-+    const int width = av_frame_cropped_width(frame);
-+    const int height = av_frame_cropped_height(frame);
-+    const int x0 = frame->crop_left;
-+    const int y0 = frame->crop_top;
-+    const int size = width * height * 3;
-+    uint8_t * dst;
-+    int ret;
-+
-+    if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0)
-+        return ret;
-+
-+    dst = pkt->data;
-+
-+    av_rpi_sand30_to_planar_y16(dst, width * 2, frame->data[0], frame->linesize[0], frame->linesize[3], x0, y0, width, height);
-+    dst += width * height * 2;
-+    av_rpi_sand30_to_planar_c16(dst, width, dst + width * height / 2, width,
-+                          frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0/2, y0 / 2, width/2, height / 2);
-+    return 0;
-+}
-+#endif
-+
-+
- static int raw_encode(AVCodecContext *avctx, AVPacket *pkt,
--                      const AVFrame *frame, int *got_packet)
-+                      const AVFrame *src_frame, int *got_packet)
- {
--    int ret = av_image_get_buffer_size(frame->format,
--                                       frame->width, frame->height, 1);
-+    int ret;
-+    AVFrame * frame = NULL;
- 
--    if (ret < 0)
-+#if CONFIG_SAND
-+    if (av_rpi_is_sand_frame(src_frame)) {
-+        ret = av_rpi_is_sand8_frame(src_frame) ? raw_sand8_as_yuv420(avctx, pkt, src_frame) :
-+            av_rpi_is_sand16_frame(src_frame) ? raw_sand16_as_yuv420(avctx, pkt, src_frame) :
-+            av_rpi_is_sand30_frame(src_frame) ? raw_sand30_as_yuv420(avctx, pkt, src_frame) : -1;
-+        *got_packet = (ret == 0);
-         return ret;
-+    }
-+#endif
-+
-+    if ((frame = av_frame_clone(src_frame)) == NULL) {
-+        ret = AVERROR(ENOMEM);
-+        goto fail;
-+    }
-+
-+    if ((ret = av_frame_apply_cropping(frame, AV_FRAME_CROP_UNALIGNED)) < 0)
-+        goto fail;
-+
-+    ret = av_image_get_buffer_size(frame->format,
-+                                       frame->width, frame->height, 1);
-+    if (ret < 0)
-+        goto fail;
- 
-     if ((ret = ff_alloc_packet2(avctx, pkt, ret, ret)) < 0)
--        return ret;
-+        goto fail;
-     if ((ret = av_image_copy_to_buffer(pkt->data, pkt->size,
-                                        (const uint8_t **)frame->data, frame->linesize,
-                                        frame->format,
-                                        frame->width, frame->height, 1)) < 0)
--        return ret;
-+        goto fail;
- 
-     if(avctx->codec_tag == AV_RL32("yuv2") && ret > 0 &&
-        frame->format   == AV_PIX_FMT_YUYV422) {
-@@ -81,8 +178,14 @@ static int raw_encode(AVCodecContext *av
-         }
-     }
-     pkt->flags |= AV_PKT_FLAG_KEY;
-+    av_frame_free(&frame);
-     *got_packet = 1;
-     return 0;
-+
-+fail:
-+    av_frame_free(&frame);
-+    *got_packet = 0;
-+    return ret;
- }
- 
- AVCodec ff_rawvideo_encoder = {
---- /dev/null
-+++ b/libavcodec/rpi_hevc_cabac.c
-@@ -0,0 +1,2257 @@
-+/*
-+ * HEVC CABAC decoding
-+ *
-+ * Copyright (C) 2012 - 2013 Guillaume Martres
-+ * Copyright (C) 2012 - 2013 Gildas Cocherel
-+ * Copyright (C) 2012 - 2013 Gildas Cocherel
-+ * Copyright (C) 2018 John Cox, Ben Avison, Peter de Rivaz for Raspberry Pi (Trading)
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#define UNCHECKED_BITSTREAM_READER 1
-+
-+#include "libavutil/attributes.h"
-+#include "libavutil/common.h"
-+
-+#include "cabac_functions.h"
-+#include "rpi_hevc_data.h"
-+#include "hevc.h"
-+#include "rpi_hevcdec.h"
-+#include "rpi_hevc_cabac_fns.h"
-+
-+#include "libavutil/rpi_sand_fns.h"
-+
-+// BY22 is probably faster than simple bypass if the processor has
-+// either a fast 32-bit divide or a fast 32x32->64[63:32] instruction
-+// x86 has fast int divide
-+// Arm doesn't have divide or general fast 64 bit, but does have the multiply
-+// * Beware: ARCH_xxx isn't set if configure --disable-asm is used
-+#define USE_BY22 (HAVE_FAST_64BIT || ARCH_ARM || ARCH_X86)
-+// Use native divide if we have a fast one - otherwise use mpy 1/x
-+// x86 has a fast integer divide - arm doesn't - unsure about other
-+// architectures
-+#define USE_BY22_DIV  ARCH_X86
-+
-+// Special case blocks with a single significant ceoff
-+// Decreases the complexity of the code for a common case but increases the
-+// code size.
-+#define USE_N_END_1 1
-+
-+#if !USE_BY22_DIV
-+// * 1/x @ 32 bits gets us 22 bits of accuracy
-+#define CABAC_BY22_PEEK_BITS  22
-+#else
-+// A real 32-bit divide gets us another bit
-+// If we have a 64 bit int & a unit time divider then we should get a lot
-+// of bits (55)  but that is untested and it is unclear if it would give
-+// us a large advantage
-+#define CABAC_BY22_PEEK_BITS  23
-+#endif
-+
-+#define CABAC_MAX_BIN 31
-+
-+
-+#if USE_BY22 && !USE_BY22_DIV
-+#define I(x) (uint32_t)((0x10000000000ULL / (uint64_t)(x)) + 1ULL)
-+
-+static const uint32_t cabac_by22_inv_range[256] = {
-+                                                    0,      I(257), I(258), I(259),
-+    I(260), I(261), I(262), I(263), I(264), I(265), I(266), I(267), I(268), I(269),
-+    I(270), I(271), I(272), I(273), I(274), I(275), I(276), I(277), I(278), I(279),
-+    I(280), I(281), I(282), I(283), I(284), I(285), I(286), I(287), I(288), I(289),
-+    I(290), I(291), I(292), I(293), I(294), I(295), I(296), I(297), I(298), I(299),
-+    I(300), I(301), I(302), I(303), I(304), I(305), I(306), I(307), I(308), I(309),
-+    I(310), I(311), I(312), I(313), I(314), I(315), I(316), I(317), I(318), I(319),
-+    I(320), I(321), I(322), I(323), I(324), I(325), I(326), I(327), I(328), I(329),
-+    I(330), I(331), I(332), I(333), I(334), I(335), I(336), I(337), I(338), I(339),
-+    I(340), I(341), I(342), I(343), I(344), I(345), I(346), I(347), I(348), I(349),
-+    I(350), I(351), I(352), I(353), I(354), I(355), I(356), I(357), I(358), I(359),
-+    I(360), I(361), I(362), I(363), I(364), I(365), I(366), I(367), I(368), I(369),
-+    I(370), I(371), I(372), I(373), I(374), I(375), I(376), I(377), I(378), I(379),
-+    I(380), I(381), I(382), I(383), I(384), I(385), I(386), I(387), I(388), I(389),
-+    I(390), I(391), I(392), I(393), I(394), I(395), I(396), I(397), I(398), I(399),
-+    I(400), I(401), I(402), I(403), I(404), I(405), I(406), I(407), I(408), I(409),
-+    I(410), I(411), I(412), I(413), I(414), I(415), I(416), I(417), I(418), I(419),
-+    I(420), I(421), I(422), I(423), I(424), I(425), I(426), I(427), I(428), I(429),
-+    I(430), I(431), I(432), I(433), I(434), I(435), I(436), I(437), I(438), I(439),
-+    I(440), I(441), I(442), I(443), I(444), I(445), I(446), I(447), I(448), I(449),
-+    I(450), I(451), I(452), I(453), I(454), I(455), I(456), I(457), I(458), I(459),
-+    I(460), I(461), I(462), I(463), I(464), I(465), I(466), I(467), I(468), I(469),
-+    I(470), I(471), I(472), I(473), I(474), I(475), I(476), I(477), I(478), I(479),
-+    I(480), I(481), I(482), I(483), I(484), I(485), I(486), I(487), I(488), I(489),
-+    I(490), I(491), I(492), I(493), I(494), I(495), I(496), I(497), I(498), I(499),
-+    I(500), I(501), I(502), I(503), I(504), I(505), I(506), I(507), I(508), I(509),
-+    I(510), I(511)
-+};
-+#undef I
-+#endif  // USE_BY22
-+
-+#if ARCH_ARM
-+#include "arm/rpi_hevc_cabac.h"
-+#endif
-+
-+/**
-+ * number of bin by SyntaxElement.
-+ */
-+static const int8_t num_bins_in_se[] = {
-+     1, // sao_merge_flag
-+     1, // sao_type_idx
-+     0, // sao_eo_class
-+     0, // sao_band_position
-+     0, // sao_offset_abs
-+     0, // sao_offset_sign
-+     0, // end_of_slice_flag
-+     3, // split_coding_unit_flag
-+     1, // cu_transquant_bypass_flag
-+     3, // skip_flag
-+     3, // cu_qp_delta
-+     1, // pred_mode
-+     4, // part_mode
-+     0, // pcm_flag
-+     1, // prev_intra_luma_pred_mode
-+     0, // mpm_idx
-+     0, // rem_intra_luma_pred_mode
-+     2, // intra_chroma_pred_mode
-+     1, // merge_flag
-+     1, // merge_idx
-+     5, // inter_pred_idc
-+     2, // ref_idx_l0
-+     2, // ref_idx_l1
-+     2, // abs_mvd_greater0_flag
-+     2, // abs_mvd_greater1_flag
-+     0, // abs_mvd_minus2
-+     0, // mvd_sign_flag
-+     1, // mvp_lx_flag
-+     1, // no_residual_data_flag
-+     3, // split_transform_flag
-+     2, // cbf_luma
-+     4, // cbf_cb, cbf_cr
-+     2, // transform_skip_flag[][]
-+     2, // explicit_rdpcm_flag[][]
-+     2, // explicit_rdpcm_dir_flag[][]
-+    18, // last_significant_coeff_x_prefix
-+    18, // last_significant_coeff_y_prefix
-+     0, // last_significant_coeff_x_suffix
-+     0, // last_significant_coeff_y_suffix
-+     4, // significant_coeff_group_flag
-+    44, // significant_coeff_flag
-+    24, // coeff_abs_level_greater1_flag
-+     6, // coeff_abs_level_greater2_flag
-+     0, // coeff_abs_level_remaining
-+     0, // coeff_sign_flag
-+     8, // log2_res_scale_abs
-+     2, // res_scale_sign_flag
-+     1, // cu_chroma_qp_offset_flag
-+     1, // cu_chroma_qp_offset_idx
-+};
-+
-+/**
-+ * Offset to ctxIdx 0 in init_values and states, indexed by SyntaxElement.
-+ */
-+static const int elem_offset[sizeof(num_bins_in_se)] = {
-+    0, // sao_merge_flag
-+    1, // sao_type_idx
-+    2, // sao_eo_class
-+    2, // sao_band_position
-+    2, // sao_offset_abs
-+    2, // sao_offset_sign
-+    2, // end_of_slice_flag
-+    2, // split_coding_unit_flag
-+    5, // cu_transquant_bypass_flag
-+    6, // skip_flag
-+    9, // cu_qp_delta
-+    12, // pred_mode
-+    13, // part_mode
-+    17, // pcm_flag
-+    17, // prev_intra_luma_pred_mode
-+    18, // mpm_idx
-+    18, // rem_intra_luma_pred_mode
-+    18, // intra_chroma_pred_mode
-+    20, // merge_flag
-+    21, // merge_idx
-+    22, // inter_pred_idc
-+    27, // ref_idx_l0
-+    29, // ref_idx_l1
-+    31, // abs_mvd_greater0_flag
-+    33, // abs_mvd_greater1_flag
-+    35, // abs_mvd_minus2
-+    35, // mvd_sign_flag
-+    35, // mvp_lx_flag
-+    36, // no_residual_data_flag
-+    37, // split_transform_flag
-+    40, // cbf_luma
-+    42, // cbf_cb, cbf_cr
-+    46, // transform_skip_flag[][]
-+    48, // explicit_rdpcm_flag[][]
-+    50, // explicit_rdpcm_dir_flag[][]
-+    52, // last_significant_coeff_x_prefix
-+    70, // last_significant_coeff_y_prefix
-+    88, // last_significant_coeff_x_suffix
-+    88, // last_significant_coeff_y_suffix
-+    88, // significant_coeff_group_flag
-+    92, // significant_coeff_flag
-+    136, // coeff_abs_level_greater1_flag
-+    160, // coeff_abs_level_greater2_flag
-+    166, // coeff_abs_level_remaining
-+    166, // coeff_sign_flag
-+    166, // log2_res_scale_abs
-+    174, // res_scale_sign_flag
-+    176, // cu_chroma_qp_offset_flag
-+    177, // cu_chroma_qp_offset_idx
-+};
-+
-+#define CNU 154
-+/**
-+ * Indexed by init_type
-+ */
-+static const uint8_t init_values[3][HEVC_CONTEXTS] = {
-+    { // sao_merge_flag
-+      153,
-+      // sao_type_idx
-+      200,
-+      // split_coding_unit_flag
-+      139, 141, 157,
-+      // cu_transquant_bypass_flag
-+      154,
-+      // skip_flag
-+      CNU, CNU, CNU,
-+      // cu_qp_delta
-+      154, 154, 154,
-+      // pred_mode
-+      CNU,
-+      // part_mode
-+      184, CNU, CNU, CNU,
-+      // prev_intra_luma_pred_mode
-+      184,
-+      // intra_chroma_pred_mode
-+      63, 139,
-+      // merge_flag
-+      CNU,
-+      // merge_idx
-+      CNU,
-+      // inter_pred_idc
-+      CNU, CNU, CNU, CNU, CNU,
-+      // ref_idx_l0
-+      CNU, CNU,
-+      // ref_idx_l1
-+      CNU, CNU,
-+      // abs_mvd_greater1_flag
-+      CNU, CNU,
-+      // abs_mvd_greater1_flag
-+      CNU, CNU,
-+      // mvp_lx_flag
-+      CNU,
-+      // no_residual_data_flag
-+      CNU,
-+      // split_transform_flag
-+      153, 138, 138,
-+      // cbf_luma
-+      111, 141,
-+      // cbf_cb, cbf_cr
-+      94, 138, 182, 154,
-+      // transform_skip_flag
-+      139, 139,
-+      // explicit_rdpcm_flag
-+      139, 139,
-+      // explicit_rdpcm_dir_flag
-+      139, 139,
-+      // last_significant_coeff_x_prefix
-+      110, 110, 124, 125, 140, 153, 125, 127, 140, 109, 111, 143, 127, 111,
-+       79, 108, 123,  63,
-+      // last_significant_coeff_y_prefix
-+      110, 110, 124, 125, 140, 153, 125, 127, 140, 109, 111, 143, 127, 111,
-+       79, 108, 123,  63,
-+      // significant_coeff_group_flag
-+      91, 171, 134, 141,
-+      // significant_coeff_flag
-+      111, 111, 125, 110, 110,  94, 124, 108, 124, 107, 125, 141, 179, 153,
-+      125, 107, 125, 141, 179, 153, 125, 107, 125, 141, 179, 153, 125, 140,
-+      139, 182, 182, 152, 136, 152, 136, 153, 136, 139, 111, 136, 139, 111,
-+      141, 111,
-+      // coeff_abs_level_greater1_flag
-+      140,  92, 137, 138, 140, 152, 138, 139, 153,  74, 149,  92, 139, 107,
-+      122, 152, 140, 179, 166, 182, 140, 227, 122, 197,
-+      // coeff_abs_level_greater2_flag
-+      138, 153, 136, 167, 152, 152,
-+      // log2_res_scale_abs
-+      154, 154, 154, 154, 154, 154, 154, 154,
-+      // res_scale_sign_flag
-+      154, 154,
-+      // cu_chroma_qp_offset_flag
-+      154,
-+      // cu_chroma_qp_offset_idx
-+      154,
-+    },
-+    { // sao_merge_flag
-+      153,
-+      // sao_type_idx
-+      185,
-+      // split_coding_unit_flag
-+      107, 139, 126,
-+      // cu_transquant_bypass_flag
-+      154,
-+      // skip_flag
-+      197, 185, 201,
-+      // cu_qp_delta
-+      154, 154, 154,
-+      // pred_mode
-+      149,
-+      // part_mode
-+      154, 139, 154, 154,
-+      // prev_intra_luma_pred_mode
-+      154,
-+      // intra_chroma_pred_mode
-+      152, 139,
-+      // merge_flag
-+      110,
-+      // merge_idx
-+      122,
-+      // inter_pred_idc
-+      95, 79, 63, 31, 31,
-+      // ref_idx_l0
-+      153, 153,
-+      // ref_idx_l1
-+      153, 153,
-+      // abs_mvd_greater1_flag
-+      140, 198,
-+      // abs_mvd_greater1_flag
-+      140, 198,
-+      // mvp_lx_flag
-+      168,
-+      // no_residual_data_flag
-+      79,
-+      // split_transform_flag
-+      124, 138, 94,
-+      // cbf_luma
-+      153, 111,
-+      // cbf_cb, cbf_cr
-+      149, 107, 167, 154,
-+      // transform_skip_flag
-+      139, 139,
-+      // explicit_rdpcm_flag
-+      139, 139,
-+      // explicit_rdpcm_dir_flag
-+      139, 139,
-+      // last_significant_coeff_x_prefix
-+      125, 110,  94, 110,  95,  79, 125, 111, 110,  78, 110, 111, 111,  95,
-+       94, 108, 123, 108,
-+      // last_significant_coeff_y_prefix
-+      125, 110,  94, 110,  95,  79, 125, 111, 110,  78, 110, 111, 111,  95,
-+       94, 108, 123, 108,
-+      // significant_coeff_group_flag
-+      121, 140, 61, 154,
-+      // significant_coeff_flag
-+      155, 154, 139, 153, 139, 123, 123,  63, 153, 166, 183, 140, 136, 153,
-+      154, 166, 183, 140, 136, 153, 154, 166, 183, 140, 136, 153, 154, 170,
-+      153, 123, 123, 107, 121, 107, 121, 167, 151, 183, 140, 151, 183, 140,
-+      140, 140,
-+      // coeff_abs_level_greater1_flag
-+      154, 196, 196, 167, 154, 152, 167, 182, 182, 134, 149, 136, 153, 121,
-+      136, 137, 169, 194, 166, 167, 154, 167, 137, 182,
-+      // coeff_abs_level_greater2_flag
-+      107, 167, 91, 122, 107, 167,
-+      // log2_res_scale_abs
-+      154, 154, 154, 154, 154, 154, 154, 154,
-+      // res_scale_sign_flag
-+      154, 154,
-+      // cu_chroma_qp_offset_flag
-+      154,
-+      // cu_chroma_qp_offset_idx
-+      154,
-+    },
-+    { // sao_merge_flag
-+      153,
-+      // sao_type_idx
-+      160,
-+      // split_coding_unit_flag
-+      107, 139, 126,
-+      // cu_transquant_bypass_flag
-+      154,
-+      // skip_flag
-+      197, 185, 201,
-+      // cu_qp_delta
-+      154, 154, 154,
-+      // pred_mode
-+      134,
-+      // part_mode
-+      154, 139, 154, 154,
-+      // prev_intra_luma_pred_mode
-+      183,
-+      // intra_chroma_pred_mode
-+      152, 139,
-+      // merge_flag
-+      154,
-+      // merge_idx
-+      137,
-+      // inter_pred_idc
-+      95, 79, 63, 31, 31,
-+      // ref_idx_l0
-+      153, 153,
-+      // ref_idx_l1
-+      153, 153,
-+      // abs_mvd_greater1_flag
-+      169, 198,
-+      // abs_mvd_greater1_flag
-+      169, 198,
-+      // mvp_lx_flag
-+      168,
-+      // no_residual_data_flag
-+      79,
-+      // split_transform_flag
-+      224, 167, 122,
-+      // cbf_luma
-+      153, 111,
-+      // cbf_cb, cbf_cr
-+      149, 92, 167, 154,
-+      // transform_skip_flag
-+      139, 139,
-+      // explicit_rdpcm_flag
-+      139, 139,
-+      // explicit_rdpcm_dir_flag
-+      139, 139,
-+      // last_significant_coeff_x_prefix
-+      125, 110, 124, 110,  95,  94, 125, 111, 111,  79, 125, 126, 111, 111,
-+       79, 108, 123,  93,
-+      // last_significant_coeff_y_prefix
-+      125, 110, 124, 110,  95,  94, 125, 111, 111,  79, 125, 126, 111, 111,
-+       79, 108, 123,  93,
-+      // significant_coeff_group_flag
-+      121, 140, 61, 154,
-+      // significant_coeff_flag
-+      170, 154, 139, 153, 139, 123, 123,  63, 124, 166, 183, 140, 136, 153,
-+      154, 166, 183, 140, 136, 153, 154, 166, 183, 140, 136, 153, 154, 170,
-+      153, 138, 138, 122, 121, 122, 121, 167, 151, 183, 140, 151, 183, 140,
-+      140, 140,
-+      // coeff_abs_level_greater1_flag
-+      154, 196, 167, 167, 154, 152, 167, 182, 182, 134, 149, 136, 153, 121,
-+      136, 122, 169, 208, 166, 167, 154, 152, 167, 182,
-+      // coeff_abs_level_greater2_flag
-+      107, 167, 91, 107, 107, 167,
-+      // log2_res_scale_abs
-+      154, 154, 154, 154, 154, 154, 154, 154,
-+      // res_scale_sign_flag
-+      154, 154,
-+      // cu_chroma_qp_offset_flag
-+      154,
-+      // cu_chroma_qp_offset_idx
-+      154,
-+    },
-+};
-+
-+static const uint8_t scan_1x1[1] = {
-+    0,
-+};
-+
-+static const uint8_t horiz_scan2x2_x[4] = {
-+    0, 1, 0, 1,
-+};
-+
-+static const uint8_t horiz_scan2x2_y[4] = {
-+    0, 0, 1, 1
-+};
-+
-+static const uint8_t horiz_scan4x4_x[16] = {
-+    0, 1, 2, 3,
-+    0, 1, 2, 3,
-+    0, 1, 2, 3,
-+    0, 1, 2, 3,
-+};
-+
-+static const uint8_t horiz_scan4x4_y[16] = {
-+    0, 0, 0, 0,
-+    1, 1, 1, 1,
-+    2, 2, 2, 2,
-+    3, 3, 3, 3,
-+};
-+
-+static const uint8_t horiz_scan8x8_inv[8][8] = {
-+    {  0,  1,  2,  3, 16, 17, 18, 19, },
-+    {  4,  5,  6,  7, 20, 21, 22, 23, },
-+    {  8,  9, 10, 11, 24, 25, 26, 27, },
-+    { 12, 13, 14, 15, 28, 29, 30, 31, },
-+    { 32, 33, 34, 35, 48, 49, 50, 51, },
-+    { 36, 37, 38, 39, 52, 53, 54, 55, },
-+    { 40, 41, 42, 43, 56, 57, 58, 59, },
-+    { 44, 45, 46, 47, 60, 61, 62, 63, },
-+};
-+
-+static const uint8_t diag_scan2x2_x[4] = {
-+    0, 0, 1, 1,
-+};
-+
-+static const uint8_t diag_scan2x2_y[4] = {
-+    0, 1, 0, 1,
-+};
-+
-+static const uint8_t diag_scan2x2_inv[2][2] = {
-+    { 0, 2, },
-+    { 1, 3, },
-+};
-+
-+static const uint8_t diag_scan4x4_inv[4][4] = {
-+    { 0,  2,  5,  9, },
-+    { 1,  4,  8, 12, },
-+    { 3,  7, 11, 14, },
-+    { 6, 10, 13, 15, },
-+};
-+
-+static const uint8_t diag_scan8x8_inv[8][8] = {
-+    {  0,  2,  5,  9, 14, 20, 27, 35, },
-+    {  1,  4,  8, 13, 19, 26, 34, 42, },
-+    {  3,  7, 12, 18, 25, 33, 41, 48, },
-+    {  6, 11, 17, 24, 32, 40, 47, 53, },
-+    { 10, 16, 23, 31, 39, 46, 52, 57, },
-+    { 15, 22, 30, 38, 45, 51, 56, 60, },
-+    { 21, 29, 37, 44, 50, 55, 59, 62, },
-+    { 28, 36, 43, 49, 54, 58, 61, 63, },
-+};
-+
-+
-+typedef struct
-+{
-+    uint16_t coeff;
-+    uint16_t scale;
-+} xy_off_t;
-+
-+#define XYT_C(x,y,t) ((x) + ((y) << (t)))
-+#define SCALE_TRAFO(t) ((t) > 3 ? 3 : (t))
-+#define SCALE_SHR(t) ((t) - SCALE_TRAFO(t))
-+#define XYT_S(x,y,t) (((x) >> SCALE_SHR(t)) + (((y) >> SCALE_SHR(t)) << SCALE_TRAFO(t)))
-+
-+#define XYT(x,y,t) {XYT_C(x,y,t), XYT_S(x,y,t)}
-+
-+#define OFF_DIAG(t) {\
-+    XYT(0,0,t), XYT(0,1,t), XYT(1,0,t), XYT(0,2,t),\
-+    XYT(1,1,t), XYT(2,0,t), XYT(0,3,t), XYT(1,2,t),\
-+    XYT(2,1,t), XYT(3,0,t), XYT(1,3,t), XYT(2,2,t),\
-+    XYT(3,1,t), XYT(2,3,t), XYT(3,2,t), XYT(3,3,t)\
-+}
-+
-+#define OFF_HORIZ(t) {\
-+    XYT(0,0,t), XYT(1,0,t), XYT(2,0,t), XYT(3,0,t),\
-+    XYT(0,1,t), XYT(1,1,t), XYT(2,1,t), XYT(3,1,t),\
-+    XYT(0,2,t), XYT(1,2,t), XYT(2,2,t), XYT(3,2,t),\
-+    XYT(0,3,t), XYT(1,3,t), XYT(2,3,t), XYT(3,3,t)\
-+}
-+
-+#define OFF_VERT(t) {\
-+    XYT(0,0,t), XYT(0,1,t), XYT(0,2,t), XYT(0,3,t),\
-+    XYT(1,0,t), XYT(1,1,t), XYT(1,2,t), XYT(1,3,t),\
-+    XYT(2,0,t), XYT(2,1,t), XYT(2,2,t), XYT(2,3,t),\
-+    XYT(3,0,t), XYT(3,1,t), XYT(3,2,t), XYT(3,3,t)\
-+}
-+
-+static const xy_off_t off_xys[3][4][16] =
-+{
-+    {OFF_DIAG(2), OFF_DIAG(3), OFF_DIAG(4), OFF_DIAG(5)},
-+    {OFF_HORIZ(2), OFF_HORIZ(3), OFF_HORIZ(4), OFF_HORIZ(5)},
-+    {OFF_VERT(2), OFF_VERT(3), OFF_VERT(4), OFF_VERT(5)}
-+};
-+
-+
-+// Helper fns
-+#ifndef hevc_mem_bits32
-+static av_always_inline uint32_t hevc_mem_bits32(const void * buf, const unsigned int offset)
-+{
-+    return AV_RB32((const uint8_t *)buf + (offset >> 3)) << (offset & 7);
-+}
-+#endif
-+
-+#if AV_GCC_VERSION_AT_LEAST(3,4) && !defined(hevc_clz32)
-+#define hevc_clz32 hevc_clz32_builtin
-+static av_always_inline unsigned int hevc_clz32_builtin(const uint32_t x)
-+{
-+    // __builtin_clz says it works on ints - so adjust if int is >32 bits long
-+    return __builtin_clz(x) - (sizeof(int) * 8 - 32);
-+}
-+#endif
-+
-+// It is unlikely that we will ever need this but include for completeness
-+#ifndef hevc_clz32
-+static inline unsigned int hevc_clz32(unsigned int x)
-+{
-+    unsigned int n = 1;
-+    if ((x & 0xffff0000) == 0) {
-+        n += 16;
-+        x <<= 16;
-+    }
-+    if ((x & 0xff000000) == 0) {
-+        n += 8;
-+        x <<= 8;
-+    }
-+    if ((x & 0xf0000000) == 0) {
-+        n += 4;
-+        x <<= 4;
-+    }
-+    if ((x & 0xc0000000) == 0) {
-+        n += 2;
-+        x <<= 2;
-+    }
-+    return n - ((x >> 31) & 1);
-+}
-+#endif
-+
-+static inline int cabac_overflow(const CABACContext * const cc)
-+{
-+    av_assert0(cc->bytestream >= cc->bytestream_start);
-+    return cc->bytestream >= cc->bytestream_end + 4;
-+}
-+
-+int ff_hevc_rpi_cabac_overflow(const HEVCRpiLocalContext * const lc)
-+{
-+    return cabac_overflow(&lc->cc);
-+}
-+
-+#if !USE_BY22
-+// If no by22 then _by22 functions will revert to normal and so _peek/_flush
-+// will no longer be called but the setup calls will still exist and we want
-+// to null them out
-+#define bypass_start(s)
-+#define bypass_finish(s)
-+#else
-+// Use BY22 for residual bypass block
-+
-+#define bypass_start(cc) get_cabac_by22_start(cc)
-+#define bypass_finish(cc) get_cabac_by22_finish(cc)
-+
-+// BY22 notes that bypass is simply a divide into the bitstream and so we
-+// can peek out large quantities of bits at once and treat the result as if
-+// it was VLC.  In many cases this will lead to O(1) processing rather than
-+// O(n) though the setup and teardown is sufficiently expensive that it is
-+// only worth using if we expect to be dealing with more than a few bits
-+// The definition of "a few bits" will vary from platform to platform but
-+// tests on ARM show that it probably isn't worth it for a single coded
-+// residual, but is for >1 - it also seems likely that if there are
-+// more residuals then they are likely to be bigger and this will make the
-+// O(1) nature of the code more worthwhile.
-+
-+
-+// Bypass block start
-+// Must be called before _by22_peek is used as it sets the CABAC environment
-+// into the correct state.  _by22_finish must be called to return to 'normal'
-+// (i.e. non-bypass) cabac decoding
-+#ifndef get_cabac_by22_start
-+static inline void get_cabac_by22_start(CABACContext * const c)
-+{
-+    const unsigned int bits = __builtin_ctz(c->low);
-+    const uint32_t m = hevc_mem_bits32(c->bytestream, 0);
-+    uint32_t x = (c->low << (22 - CABAC_BITS)) ^ ((m ^ 0x80000000U) >> (9 + CABAC_BITS - bits));
-+#if !USE_BY22_DIV
-+    const uint32_t inv = cabac_by22_inv_range[c->range & 0xff];
-+#endif
-+
-+    c->bytestream -= (CABAC_BITS / 8);
-+    c->by22.bits = bits;
-+#if !USE_BY22_DIV
-+    c->by22.range = c->range;
-+    c->range = inv;
-+#endif
-+    c->low = x;
-+}
-+#endif
-+
-+// Bypass block finish
-+// Must be called at the end of the bypass block to return to normal operation
-+static inline void get_cabac_by22_finish(CABACContext * const c)
-+{
-+    unsigned int used = c->by22.bits;
-+    unsigned int bytes_used = (used / CABAC_BITS) * (CABAC_BITS / 8);
-+    unsigned int bits_used = used & (CABAC_BITS == 16 ? 15 : 7);
-+
-+    c->bytestream += bytes_used + (CABAC_BITS / 8);
-+    c->low = (((uint32_t)c->low >> (22 - CABAC_BITS + bits_used)) | 1) << bits_used;
-+#if !USE_BY22_DIV
-+    c->range = c->by22.range;
-+#endif
-+}
-+
-+// Peek bypass bits
-+// _by22_start must be called before _by22_peek is called and _by22_flush
-+// must be called afterwards to flush any used bits
-+// The actual number of valid bits returned is
-+// min(<coded bypass block length>, CABAC_BY22_PEEK_BITS). CABAC_BY22_PEEK_BITS
-+// will be at least 22 which should be long enough for any prefix or suffix
-+// though probably not long enough for the worst case combination
-+#ifndef get_cabac_by22_peek
-+static inline uint32_t get_cabac_by22_peek(const CABACContext * const c)
-+{
-+#if USE_BY22_DIV
-+    return ((unsigned int)c->low / (unsigned int)c->range) << 9;
-+#else
-+    uint32_t x = c->low & ~1U;
-+    const uint32_t inv = c->range;
-+
-+    if (inv != 0)
-+        x = (uint32_t)(((uint64_t)x * (uint64_t)inv) >> 32);
-+
-+    return x << 1;
-+#endif
-+}
-+#endif
-+
-+// Flush bypass bits peeked by _by22_peek
-+// Flush n bypass bits. n must be >= 1 to guarantee correct operation
-+// val is an unmodified copy of whatever _by22_peek returned
-+#ifndef get_cabac_by22_flush
-+static inline void get_cabac_by22_flush(CABACContext * c, const unsigned int n, const uint32_t val)
-+{
-+    // Subtract the bits used & reshift up to the top of the word
-+#if USE_BY22_DIV
-+    const uint32_t low = (((unsigned int)c->low << n) - (((val >> (32 - n)) * (unsigned int)c->range) << 23));
-+#else
-+    const uint32_t low = (((uint32_t)c->low << n) - (((val >> (32 - n)) * c->by22.range) << 23));
-+#endif
-+
-+    // and refill lower bits
-+    // We will probably OR over some existing bits but that doesn't matter
-+    c->by22.bits += n;
-+    c->low = low | (hevc_mem_bits32(c->bytestream, c->by22.bits) >> 9);
-+}
-+#endif
-+
-+#endif  // USE_BY22
-+
-+
-+void ff_hevc_rpi_save_states(HEVCRpiContext *s, const HEVCRpiLocalContext * const lc)
-+{
-+    memcpy(s->cabac_save->rice, lc->stat_coeff, 4);
-+    memcpy(s->cabac_save->state, lc->cabac_state, HEVC_CONTEXTS);
-+}
-+
-+static void load_states(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
-+{
-+    memcpy(lc->stat_coeff, s->cabac_save->rice, 4);
-+    memcpy(lc->cabac_state, s->cabac_save->state, HEVC_CONTEXTS);
-+}
-+
-+int ff_hevc_rpi_cabac_init_decoder(HEVCRpiLocalContext * const lc)
-+{
-+    GetBitContext * const gb = &lc->gb;
-+    skip_bits(gb, 1);
-+    align_get_bits(gb);
-+    return ff_init_cabac_decoder(&lc->cc,
-+                          gb->buffer + get_bits_count(gb) / 8,
-+                          (get_bits_left(gb) + 7) / 8);
-+}
-+
-+static void cabac_init_state(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
-+{
-+    int init_type = 2 - s->sh.slice_type;
-+    int i;
-+
-+    if (s->sh.cabac_init_flag && s->sh.slice_type != HEVC_SLICE_I)
-+        init_type ^= 3;
-+
-+    for (i = 0; i < HEVC_CONTEXTS; i++) {
-+        int init_value = init_values[init_type][i];
-+        int m = (init_value >> 4) * 5 - 45;
-+        int n = ((init_value & 15) << 3) - 16;
-+        int pre = 2 * (((m * av_clip(s->sh.slice_qp, 0, 51)) >> 4) + n) - 127;
-+
-+        pre ^= pre >> 31;
-+        if (pre > 124)
-+            pre = 124 + (pre & 1);
-+        lc->cabac_state[i] = pre;
-+    }
-+
-+    for (i = 0; i < 4; i++)
-+        lc->stat_coeff[i] = 0;
-+}
-+
-+void ff_hevc_rpi_cabac_init(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, const unsigned int ctb_flags)
-+{
-+    if (lc->cabac_init_req == 1 || (ctb_flags & CTB_TS_FLAGS_CIREQ) != 0)
-+    {
-+        lc->qPy_pred = s->sh.slice_qp;
-+        cabac_init_state(s, lc);
-+    }
-+    else if ((ctb_flags & CTB_TS_FLAGS_CLOAD) != 0)
-+    {
-+        lc->qPy_pred = s->sh.slice_qp;
-+        load_states(s, lc);
-+    }
-+    lc->cabac_init_req = 0;
-+}
-+
-+#define GET_CABAC_LC(ctx) get_cabac(&lc->cc, lc->cabac_state + (ctx))
-+
-+int ff_hevc_rpi_get_cabac(CABACContext * const c, uint8_t * const state)
-+{
-+    return get_cabac_inline(c, state);
-+}
-+
-+int ff_hevc_rpi_get_cabac_terminate(CABACContext * const c)
-+{
-+    return get_cabac_terminate(c);
-+}
-+
-+int ff_hevc_rpi_sao_type_idx_decode(HEVCRpiLocalContext * const lc)
-+{
-+    if (!GET_CABAC_LC(elem_offset[SAO_TYPE_IDX]))
-+        return 0;
-+
-+    if (!get_cabac_bypass(&lc->cc))
-+        return SAO_BAND;
-+    return SAO_EDGE;
-+}
-+
-+int ff_hevc_rpi_sao_band_position_decode(HEVCRpiLocalContext * const lc)
-+{
-+    int i;
-+    int value = get_cabac_bypass(&lc->cc);
-+
-+    for (i = 0; i < 4; i++)
-+        value = (value << 1) | get_cabac_bypass(&lc->cc);
-+    return value;
-+}
-+
-+int ff_hevc_rpi_sao_offset_abs_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
-+{
-+    int i = 0;
-+    int length = (1 << (FFMIN(s->ps.sps->bit_depth, 10) - 5)) - 1;
-+
-+    while (i < length && get_cabac_bypass(&lc->cc))
-+        i++;
-+    return i;
-+}
-+
-+int ff_hevc_rpi_sao_offset_sign_decode(HEVCRpiLocalContext * const lc)
-+{
-+    return get_cabac_bypass(&lc->cc);
-+}
-+
-+int ff_hevc_rpi_sao_eo_class_decode(HEVCRpiLocalContext * const lc)
-+{
-+    int ret = get_cabac_bypass(&lc->cc) << 1;
-+    ret    |= get_cabac_bypass(&lc->cc);
-+    return ret;
-+}
-+
-+int ff_hevc_rpi_cu_qp_delta(HEVCRpiLocalContext * const lc)
-+{
-+    int val = 1;
-+
-+    if (get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CU_QP_DELTA) == 0)
-+        return 0;
-+
-+    while (val < 5 &&
-+           get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CU_QP_DELTA + 1) != 0)
-+        val++;
-+
-+    if (val >= 5) {
-+        unsigned int k = 0;
-+        while (k < CABAC_MAX_BIN && get_cabac_bypass(&lc->cc)) {
-+            val += 1 << k;
-+            k++;
-+        }
-+//        if (k == CABAC_MAX_BIN)
-+//            av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", k);
-+
-+        while (k--)
-+            val += get_cabac_bypass(&lc->cc) << k;
-+    }
-+    return get_cabac_bypass(&lc->cc) ? -val : val;
-+}
-+
-+int ff_hevc_rpi_cu_chroma_qp_offset_idx(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
-+{
-+    int c_max= FFMAX(5, s->ps.pps->chroma_qp_offset_list_len_minus1);
-+    int i = 0;
-+
-+    while (i < c_max && GET_CABAC_LC(elem_offset[CU_CHROMA_QP_OFFSET_IDX]))
-+        i++;
-+
-+    return i;
-+}
-+
-+int ff_hevc_rpi_part_mode_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int log2_cb_size)
-+{
-+    if (GET_CABAC_LC(elem_offset[PART_MODE])) // 1
-+        return PART_2Nx2N;
-+    if (log2_cb_size == s->ps.sps->log2_min_cb_size) {
-+        if (lc->cu.pred_mode == MODE_INTRA) // 0
-+            return PART_NxN;
-+        if (GET_CABAC_LC(elem_offset[PART_MODE] + 1)) // 01
-+            return PART_2NxN;
-+        if (log2_cb_size == 3) // 00
-+            return PART_Nx2N;
-+        if (GET_CABAC_LC(elem_offset[PART_MODE] + 2)) // 001
-+            return PART_Nx2N;
-+        return PART_NxN; // 000
-+    }
-+
-+    if (!s->ps.sps->amp_enabled_flag) {
-+        if (GET_CABAC_LC(elem_offset[PART_MODE] + 1)) // 01
-+            return PART_2NxN;
-+        return PART_Nx2N;
-+    }
-+
-+    if (GET_CABAC_LC(elem_offset[PART_MODE] + 1)) { // 01X, 01XX
-+        if (GET_CABAC_LC(elem_offset[PART_MODE] + 3)) // 011
-+            return PART_2NxN;
-+        if (get_cabac_bypass(&lc->cc)) // 0101
-+            return PART_2NxnD;
-+        return PART_2NxnU; // 0100
-+    }
-+
-+    if (GET_CABAC_LC(elem_offset[PART_MODE] + 3)) // 001
-+        return PART_Nx2N;
-+    if (get_cabac_bypass(&lc->cc)) // 0001
-+        return PART_nRx2N;
-+    return PART_nLx2N;  // 0000
-+}
-+
-+int ff_hevc_rpi_mpm_idx_decode(HEVCRpiLocalContext * const lc)
-+{
-+    int i = 0;
-+    while (i < 2 && get_cabac_bypass(&lc->cc))
-+        i++;
-+    return i;
-+}
-+
-+int ff_hevc_rpi_rem_intra_luma_pred_mode_decode(HEVCRpiLocalContext * const lc)
-+{
-+    int i;
-+    int value = get_cabac_bypass(&lc->cc);
-+
-+    for (i = 0; i < 4; i++)
-+        value = (value << 1) | get_cabac_bypass(&lc->cc);
-+    return value;
-+}
-+
-+int ff_hevc_rpi_intra_chroma_pred_mode_decode(HEVCRpiLocalContext * const lc)
-+{
-+    int ret;
-+    if (!GET_CABAC_LC(elem_offset[INTRA_CHROMA_PRED_MODE]))
-+        return 4;
-+
-+    ret  = get_cabac_bypass(&lc->cc) << 1;
-+    ret |= get_cabac_bypass(&lc->cc);
-+    return ret;
-+}
-+
-+int ff_hevc_rpi_merge_idx_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
-+{
-+    int i = GET_CABAC_LC(elem_offset[MERGE_IDX]);
-+
-+    if (i != 0) {
-+        while (i < s->sh.max_num_merge_cand-1 && get_cabac_bypass(&lc->cc))
-+            i++;
-+    }
-+    return i;
-+}
-+
-+int ff_hevc_rpi_inter_pred_idc_decode(HEVCRpiLocalContext * const lc, int nPbW, int nPbH)
-+{
-+    if (nPbW + nPbH == 12)
-+        return GET_CABAC_LC(elem_offset[INTER_PRED_IDC] + 4);
-+    if (GET_CABAC_LC(elem_offset[INTER_PRED_IDC] + lc->ct_depth))
-+        return PRED_BI;
-+
-+    return GET_CABAC_LC(elem_offset[INTER_PRED_IDC] + 4);
-+}
-+
-+int ff_hevc_rpi_ref_idx_lx_decode(HEVCRpiLocalContext * const lc, const int num_ref_idx_lx)
-+{
-+    int i = 0;
-+    int max = num_ref_idx_lx - 1;
-+    int max_ctx = FFMIN(max, 2);
-+
-+    while (i < max_ctx && GET_CABAC_LC(elem_offset[REF_IDX_L0] + i))
-+        i++;
-+    if (i == 2) {
-+        while (i < max && get_cabac_bypass(&lc->cc))
-+            i++;
-+    }
-+
-+    return i;
-+}
-+
-+static av_always_inline int abs_mvd_greater0_flag_decode(HEVCRpiLocalContext * const lc)
-+{
-+    return GET_CABAC_LC(elem_offset[ABS_MVD_GREATER0_FLAG]);
-+}
-+
-+static av_always_inline int abs_mvd_greater1_flag_decode(HEVCRpiLocalContext * const lc)
-+{
-+    return GET_CABAC_LC(elem_offset[ABS_MVD_GREATER1_FLAG] + 1);
-+}
-+
-+#if !USE_BY22
-+static av_always_inline int mvd_decode(HEVCRpiLocalContext * const lc)
-+{
-+    int ret = 2;
-+    int k = 1;
-+
-+    while (k < CABAC_MAX_BIN && get_cabac_bypass(&lc->cc)) {
-+        ret += 1U << k;
-+        k++;
-+    }
-+    if (k == CABAC_MAX_BIN) {
-+        av_log(NULL, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", k);
-+        return 0;
-+    }
-+
-+    while (k--)
-+        ret += get_cabac_bypass(&lc->cc) << k;
-+    return get_cabac_bypass_sign(&lc->cc, -ret);
-+}
-+#endif
-+
-+static av_always_inline int mvd_sign_flag_decode(HEVCRpiLocalContext * const lc)
-+{
-+    return get_cabac_bypass_sign(&lc->cc, -1);
-+}
-+
-+static int hevc_transform_skip_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz)
-+{
-+    return GET_CABAC_LC(elem_offset[TRANSFORM_SKIP_FLAG] + c_idx_nz);
-+}
-+
-+static int explicit_rdpcm_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz)
-+{
-+    return GET_CABAC_LC(elem_offset[EXPLICIT_RDPCM_FLAG] + c_idx_nz);
-+}
-+
-+static int explicit_rdpcm_dir_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz)
-+{
-+    return GET_CABAC_LC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + c_idx_nz);
-+}
-+
-+
-+int ff_hevc_rpi_log2_res_scale_abs(HEVCRpiLocalContext * const lc, const int idx) {
-+    int i =0;
-+
-+    while (i < 4 && GET_CABAC_LC(elem_offset[LOG2_RES_SCALE_ABS] + 4 * idx + i))
-+        i++;
-+
-+    return i;
-+}
-+
-+static av_always_inline void last_significant_coeff_xy_prefix_decode(HEVCRpiLocalContext * const lc, int c_idx_nz,
-+                                                   int log2_size, int *last_scx_prefix, int *last_scy_prefix)
-+{
-+    int i = 0;
-+    int max = (log2_size << 1) - 1;
-+    int ctx_offset, ctx_shift;
-+
-+    if (!c_idx_nz) {
-+        ctx_offset = 3 * (log2_size - 2)  + ((log2_size - 1) >> 2);
-+        ctx_shift = (log2_size + 1) >> 2;
-+    } else {
-+        ctx_offset = 15;
-+        ctx_shift = log2_size - 2;
-+    }
-+    while (i < max &&
-+           GET_CABAC_LC(elem_offset[LAST_SIGNIFICANT_COEFF_X_PREFIX] + (i >> ctx_shift) + ctx_offset))
-+        i++;
-+    *last_scx_prefix = i;
-+
-+    i = 0;
-+    while (i < max &&
-+           GET_CABAC_LC(elem_offset[LAST_SIGNIFICANT_COEFF_Y_PREFIX] + (i >> ctx_shift) + ctx_offset))
-+        i++;
-+    *last_scy_prefix = i;
-+}
-+
-+static av_always_inline int last_significant_coeff_suffix_decode(HEVCRpiLocalContext * const lc,
-+                                                 int last_significant_coeff_prefix)
-+{
-+    int i;
-+    int length = (last_significant_coeff_prefix >> 1) - 1;
-+    int value = get_cabac_bypass(&lc->cc);
-+
-+    for (i = 1; i < length; i++)
-+        value = (value << 1) | get_cabac_bypass(&lc->cc);
-+    return value;
-+}
-+
-+static av_always_inline int significant_coeff_group_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz, int ctx_cg)
-+{
-+    int inc;
-+
-+    inc = (ctx_cg != 0) + (c_idx_nz << 1);
-+
-+    return GET_CABAC_LC(elem_offset[SIGNIFICANT_COEFF_GROUP_FLAG] + inc);
-+}
-+
-+static av_always_inline int significant_coeff_flag_decode_0(HEVCRpiLocalContext * const lc, int offset)
-+{
-+    return GET_CABAC_LC(elem_offset[SIGNIFICANT_COEFF_FLAG] + offset);
-+}
-+
-+#if !USE_BY22
-+#define coeff_abs_level_remaining_decode_bypass(s,r) coeff_abs_level_remaining_decode(s, r)
-+#endif
-+
-+
-+#ifndef coeff_abs_level_remaining_decode_bypass
-+static int coeff_abs_level_remaining_decode_bypass(CABACContext * const c, const unsigned int rice_param)
-+{
-+    uint32_t y;
-+    unsigned int prefix;
-+    unsigned int last_coeff_abs_level_remaining;
-+    unsigned int n;
-+
-+    y = get_cabac_by22_peek(c);
-+    prefix = hevc_clz32(~y);
-+    // y << prefix will always have top bit 0
-+
-+    if (prefix < 3) {
-+        const unsigned int suffix = (y << prefix) >> (31 - rice_param);
-+        last_coeff_abs_level_remaining = (prefix << rice_param) + suffix;
-+        n = prefix + 1 + rice_param;
-+    }
-+    else if (prefix * 2 + rice_param <= CABAC_BY22_PEEK_BITS + 2)
-+    {
-+        const uint32_t suffix = ((y << prefix) | 0x80000000) >> (34 - (prefix + rice_param));
-+
-+        last_coeff_abs_level_remaining = (2 << rice_param) + suffix;
-+        n = prefix * 2 + rice_param - 2;
-+    }
-+    else {
-+        unsigned int suffix;
-+
-+        get_cabac_by22_flush(c, prefix, y);
-+        y = get_cabac_by22_peek(c);
-+
-+        suffix = (y | 0x80000000) >> (34 - (prefix + rice_param));
-+        last_coeff_abs_level_remaining = (2 << rice_param) + suffix;
-+        n = prefix + rice_param - 2;
-+    }
-+
-+    get_cabac_by22_flush(c, n, y);
-+
-+    return last_coeff_abs_level_remaining;
-+}
-+#endif
-+
-+static int coeff_abs_level_remaining_decode(CABACContext * const c, int rc_rice_param)
-+{
-+    int prefix = 0;
-+    int suffix = 0;
-+    int last_coeff_abs_level_remaining;
-+    int i;
-+
-+    while (prefix < CABAC_MAX_BIN && get_cabac_bypass(c))
-+        prefix++;
-+    if (prefix == CABAC_MAX_BIN) {
-+//        av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", prefix);
-+        return 0;
-+    }
-+
-+    if (prefix < 3) {
-+        for (i = 0; i < rc_rice_param; i++)
-+            suffix = (suffix << 1) | get_cabac_bypass(c);
-+        last_coeff_abs_level_remaining = (prefix << rc_rice_param) + suffix;
-+    } else {
-+        int prefix_minus3 = prefix - 3;
-+        for (i = 0; i < prefix_minus3 + rc_rice_param; i++)
-+            suffix = (suffix << 1) | get_cabac_bypass(c);
-+        last_coeff_abs_level_remaining = (((1 << prefix_minus3) + 3 - 1)
-+                                              << rc_rice_param) + suffix;
-+    }
-+
-+    return last_coeff_abs_level_remaining;
-+}
-+
-+#if !USE_BY22
-+#define coeff_sign_flag_decode_bypass coeff_sign_flag_decode
-+static inline uint32_t coeff_sign_flag_decode(CABACContext * const c, const unsigned int nb)
-+{
-+    unsigned int i;
-+    uint32_t ret = 0;
-+
-+    for (i = 0; i < nb; i++)
-+        ret = (ret << 1) | get_cabac_bypass(c);
-+
-+    return ret << (32 - nb);
-+}
-+#endif
-+
-+#ifndef coeff_sign_flag_decode_bypass
-+static inline uint32_t coeff_sign_flag_decode_bypass(CABACContext * const c, const unsigned int nb)
-+{
-+    uint32_t y;
-+    y = get_cabac_by22_peek(c);
-+    get_cabac_by22_flush(c, nb, y);
-+    return y & ~(0xffffffffU >> nb);
-+}
-+#endif
-+
-+
-+#ifndef get_cabac_greater1_bits
-+static inline unsigned int get_cabac_greater1_bits(CABACContext * const c, const unsigned int n,
-+    uint8_t * const state0)
-+{
-+    unsigned int i;
-+    unsigned int rv = 0;
-+    for (i = 0; i != n; ++i) {
-+        const unsigned int idx = rv != 0 ? 0 : i < 3 ? i + 1 : 3;
-+        const unsigned int b = get_cabac(c, state0 + idx);
-+        rv = (rv << 1) | b;
-+    }
-+    return rv;
-+}
-+#endif
-+
-+
-+// N.B. levels returned are the values assuming coeff_abs_level_remaining
-+// is uncoded, so 1 must be added if it is coded.  sum_abs also reflects
-+// this version of events.
-+static inline uint32_t get_greaterx_bits(HEVCRpiLocalContext * const lc, const unsigned int n_end, int * const levels,
-+    int * const pprev_subset_coded, int * const psum,
-+    const unsigned int idx0_gt1, const unsigned int idx_gt2)
-+{
-+    CABACContext * const c = &lc->cc;
-+    uint8_t * const state0 = lc->cabac_state + idx0_gt1;
-+    uint8_t * const state_gt2 = lc->cabac_state + idx_gt2;
-+    unsigned int rv;
-+    unsigned int i;
-+    const unsigned int n = FFMIN(n_end, 8);
-+
-+    // Really this is i != n but the simple unconditional loop is cheaper
-+    // and faster
-+    for (i = 0; i != 8; ++i)
-+        levels[i] = 1;
-+
-+    rv = get_cabac_greater1_bits(c, n, state0);
-+
-+    *pprev_subset_coded = 0;
-+    *psum = n;
-+
-+    rv <<= (32 - n);
-+    if (rv != 0)
-+    {
-+        *pprev_subset_coded = 1;
-+        *psum = n + 1;
-+        i = hevc_clz32(rv);
-+        levels[i] = 2;
-+        if (get_cabac(c, state_gt2) == 0)
-+        {
-+            // Unset first coded bit
-+            rv &= ~(0x80000000U >> i);
-+        }
-+    }
-+
-+    if (n_end > 8) {
-+        const unsigned int g8 = n_end - 8;
-+        rv |= ((1 << g8) - 1) << (24 - g8);
-+        for (i = 0; i != g8; ++i) {
-+            levels[i + 8] = 0;
-+        }
-+    }
-+
-+    return rv;
-+}
-+
-+// extended_precision_processing_flag must be false given we are
-+// putting the result into a 16-bit array
-+// So trans_coeff_level must fit in 16 bits too (7.4.9.1 definition of coeff_abs_level_remaining)
-+// scale_m is uint8_t
-+//
-+// scale is [40 - 72] << [0..12] based on qp- worst case is (45 << 12)
-+//   or it can be 2 (if we have transquant_bypass)
-+// shift is set to one less than we really want but would normally be
-+//   s->ps.sps->bit_depth (max 16, min 8) + log2_trafo_size (max 5, min 2?) - 5 = max 16 min 5?
-+// however the scale shift is substracted from shift to a min 0 so scale_m worst = 45 << 6
-+// This can still theoretically lead to overflow but the coding would have to be very odd (& inefficient)
-+// to achieve it
-+
-+#ifndef trans_scale_sat
-+static inline int trans_scale_sat(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift)
-+{
-+    return av_clip_int16((((level * (int)(scale * scale_m)) >> shift) + 1) >> 1);
-+}
-+#endif
-+
-+
-+#ifndef update_rice
-+static inline void update_rice(uint8_t * const stat_coeff,
-+    const unsigned int last_coeff_abs_level_remaining,
-+    const unsigned int c_rice_param)
-+{
-+    const unsigned int x = (last_coeff_abs_level_remaining << 1) >> c_rice_param;
-+    if (x >= 6)
-+        (*stat_coeff)++;
-+    else if (x == 0 && *stat_coeff > 0)
-+        (*stat_coeff)--;
-+}
-+#endif
-+
-+
-+// n must be > 0 on entry
-+#ifndef get_cabac_sig_coeff_flag_idxs
-+static inline uint8_t * get_cabac_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0,
-+    unsigned int n,
-+    const uint8_t const * ctx_map,
-+    uint8_t * p)
-+{
-+    do {
-+        if (get_cabac(c, state0 + ctx_map[n]))
-+            *p++ = n;
-+    } while (--n != 0);
-+    return p;
-+}
-+#endif
-+
-+
-+static int get_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0,
-+    unsigned int n,
-+    const uint8_t * ctx_map,  // const ptr here but not in asm
-+    uint8_t * const flag_idx)
-+{
-+    int rv;
-+
-+    rv = get_cabac_sig_coeff_flag_idxs(c, state0, n, ctx_map, flag_idx) - flag_idx;
-+
-+    return rv;
-+}
-+
-+#define H4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
-+     x0,  x1,  x2,  x3,\
-+     x4,  x5,  x6,  x7,\
-+     x8,  x9, x10, x11,\
-+    x12, x13, x14, x15}
-+
-+#define V4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
-+     x0,  x4,  x8, x12,\
-+     x1,  x5,  x9, x13,\
-+     x2,  x6, x10, x14,\
-+     x3,  x7, x11, x15}
-+
-+#define D4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
-+     x0,  x4,  x1,  x8,\
-+     x5,  x2, x12,  x9,\
-+     x6,  x3, x13, x10,\
-+     x7, x14, x11, x15}
-+
-+
-+static inline int next_subset(HEVCRpiLocalContext * const lc, int i, const int c_idx_nz,
-+    uint8_t * const significant_coeff_group_flag,
-+    const uint8_t * const scan_x_cg, const uint8_t * const scan_y_cg,
-+    int * const pPrev_sig)
-+{
-+    while (--i >= 0) {
-+        uint8_t * const gf_y = scan_y_cg[i] + significant_coeff_group_flag;
-+        const unsigned int x_cg = scan_x_cg[i];
-+
-+        // For the flag decode we only care about Z/NZ but
-+        // we use the full Right * 2 + Down when calculating
-+        // significant coeff flags so we obtain it here.
-+        //
-+        // The group flag array is one longer than it needs to
-+        // be so we don't need to check for y_cg limits
-+        const unsigned int prev_sig = ((gf_y[0] >> x_cg) & 2) | ((gf_y[1] >> x_cg) & 1);
-+
-+        if (i == 0 ||
-+            significant_coeff_group_flag_decode(lc, c_idx_nz, prev_sig))
-+        {
-+            gf_y[0] |= (1 << x_cg);
-+            *pPrev_sig = prev_sig;
-+            break;
-+        }
-+    }
-+
-+    return i;
-+}
-+
-+static void rpi_add_residual(const HEVCRpiContext *const s, HEVCRpiJob * const jb,
-+    const unsigned int log2_trafo_size, const unsigned int c_idx,
-+    const unsigned int x0, const unsigned int y0, const int16_t * const coeffs)
-+{
-+    const AVFrame * const frame = s->frame;
-+    const unsigned int stride = frame_stride1(s->frame, c_idx);
-+    const unsigned int x = x0 >> ctx_hshift(s, c_idx);
-+    const unsigned int y = y0 >> ctx_vshift(s, c_idx);
-+    const int is_sliced = 1;  // av_rpi_is_sand_frame(frame);
-+    uint8_t * const dst = !is_sliced ?
-+            s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) :
-+        c_idx == 0 ?
-+            av_rpi_sand_frame_pos_y(frame, x, y) :
-+            av_rpi_sand_frame_pos_c(frame, x, y);
-+
-+    const unsigned int i = jb->intra.n;
-+    HEVCPredCmd *const pc = jb->intra.cmds + i - 1;
-+
-+    if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_RESIDUAL_U &&
-+        pc->ta.dst == dst)
-+    {
-+        av_assert1(pc->size == log2_trafo_size &&
-+                   pc->c_idx == 1 &&
-+                   pc->ta.stride == stride);
-+
-+        pc->type = RPI_PRED_ADD_RESIDUAL_C;
-+    }
-+    else if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_DC_U &&
-+        pc->dc.dst == dst)
-+    {
-+        const int16_t dc = (int16_t)pc->dc.dc;  // Discard top bits
-+        av_assert1(pc->size == log2_trafo_size &&
-+                   pc->c_idx == 1 &&
-+                   pc->dc.stride == stride);
-+
-+        // Rewrite as add residual - must rewrite all fields as different union member
-+        pc->type = RPI_PRED_ADD_RESIDUAL_V;
-+        pc->ta.buf = coeffs;
-+        pc->ta.dst = dst;
-+        pc->ta.stride = stride;
-+        pc->ta.dc = dc;
-+    }
-+    else
-+    {
-+        HEVCPredCmd * const cmd = pc + 1;
-+        jb->intra.n = i + 1;
-+
-+        cmd->type = RPI_PRED_ADD_RESIDUAL + (is_sliced ? c_idx : 0);
-+        cmd->size = log2_trafo_size;
-+        cmd->ta.buf = coeffs;
-+        cmd->ta.dst = dst;
-+        cmd->ta.stride = stride;
-+        cmd->ta.dc = 0;
-+    }
-+}
-+
-+
-+static void rpi_add_dc(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
-+    const unsigned int log2_trafo_size, const unsigned int c_idx,
-+    const unsigned int x0, const unsigned int y0, const int16_t * const coeffs)
-+{
-+    const AVFrame * const frame = s->frame;
-+    const unsigned int stride = frame_stride1(s->frame, c_idx);
-+    const unsigned int x = x0 >> ctx_hshift(s, c_idx);
-+    const unsigned int y = y0 >> ctx_vshift(s, c_idx);
-+    const int is_sliced = 1;
-+    uint8_t * const dst = !is_sliced ?
-+            s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) :
-+        c_idx == 0 ?
-+            av_rpi_sand_frame_pos_y(frame, x, y) :
-+            av_rpi_sand_frame_pos_c(frame, x, y);
-+
-+    const unsigned int shift = FFMAX(14 - s->ps.sps->bit_depth, 0);
-+    const int coeff = (coeffs[0] + (1 | (1 << shift))) >> (shift + 1);
-+
-+    const unsigned int i = jb->intra.n;
-+    HEVCPredCmd *const pc = jb->intra.cmds + i - 1;
-+
-+    if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_RESIDUAL_U &&
-+        pc->ta.dst == dst)
-+    {
-+        av_assert1(pc->size == log2_trafo_size &&
-+                   pc->c_idx == 1 &&
-+                   pc->ta.stride == stride);
-+
-+        pc->ta.dc = (int16_t)coeff;
-+    }
-+    else if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_DC_U &&
-+        pc->dc.dst == dst)
-+    {
-+        av_assert1(pc->size == log2_trafo_size &&
-+                   pc->c_idx == 1 &&
-+                   pc->dc.stride == stride &&
-+                   (pc->dc.dc & ~0xffff) == 0);
-+
-+        pc->dc.dc |= (coeff << 16);
-+    }
-+    else
-+    {
-+        HEVCPredCmd * const cmd = pc + 1;
-+        jb->intra.n = i + 1;
-+
-+        cmd->type = RPI_PRED_ADD_DC + c_idx;
-+        cmd->size = log2_trafo_size;
-+        cmd->dc.dst = dst;
-+        cmd->dc.stride = stride;
-+        cmd->dc.dc = c_idx == 0 ? coeff : c_idx == 2 ? coeff << 16 : coeff & 0xffff;
-+    }
-+}
-+
-+
-+void ff_hevc_rpi_hls_residual_coding(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
-+                                const int x0, const int y0,
-+                                const int log2_trafo_size, const enum ScanType scan_idx,
-+                                const int c_idx)
-+{
-+    int trans_skip_or_bypass = lc->cu.cu_transquant_bypass_flag;
-+
-+    int last_significant_coeff_x, last_significant_coeff_y;
-+    int num_coeff = 0;
-+    int prev_subset_coded = 0;
-+
-+    int num_last_subset;
-+    int x_cg_last_sig, y_cg_last_sig;
-+
-+    const uint8_t *scan_x_cg, *scan_y_cg;
-+    const xy_off_t * const scan_xy_off = off_xys[scan_idx][log2_trafo_size - 2];
-+
-+    int use_vpu;
-+#if RPI_COMPRESS_COEFFS                                
-+    int num_nonzero = 0;
-+    int use_compress = 0;
-+    int *coeffs32;
-+#endif
-+    int use_dc = 0;
-+    int16_t *coeffs;
-+    uint8_t significant_coeff_group_flag[9] = {0};  // Allow 1 final byte that is always zero
-+    int explicit_rdpcm_flag = 0;
-+    int explicit_rdpcm_dir_flag;
-+
-+    int i;
-+    int shift,scale;
-+    const uint8_t *scale_matrix = NULL;
-+    uint8_t dc_scale;
-+    const int c_idx_nz = (c_idx != 0);
-+    const int pred_mode_intra = c_idx_nz ? lc->tu.intra_pred_mode_c : lc->tu.intra_pred_mode;
-+    int prev_sig = 0;
-+    int may_hide_sign;
-+
-+    int16_t dummy_coeffs[16];
-+
-+    // Derive QP for dequant
-+    if (!lc->cu.cu_transquant_bypass_flag) {
-+        may_hide_sign = s->ps.pps->sign_data_hiding_flag;
-+
-+        if (s->ps.pps->transform_skip_enabled_flag &&
-+            log2_trafo_size <= s->ps.pps->log2_max_transform_skip_block_size) {
-+            int transform_skip_flag = hevc_transform_skip_flag_decode(lc, c_idx_nz);
-+            if (transform_skip_flag) {
-+                trans_skip_or_bypass = 1;
-+                if (lc->cu.pred_mode ==  MODE_INTRA  &&
-+                    s->ps.sps->implicit_rdpcm_enabled_flag &&
-+                    (pred_mode_intra == 10 || pred_mode_intra == 26)) {
-+                    may_hide_sign = 0;
-+                }
-+            }
-+        }
-+
-+        {
-+            static const uint8_t level_scale[8] = {
-+                40, 45, 51, 57, 64, 72, 0, 0  // Pad to 8
-+            };
-+            const int qp6 = (int8_t)lc->tu.qp_divmod6[c_idx][lc->qp_y];
-+
-+            // Shift is set to one less than will actually occur as the scale
-+            // and saturate step adds 1 and then shifts right again
-+            scale = level_scale[qp6 & 7];
-+//            shift = s->ps.sps->bit_depth + log2_trafo_size - (int)(qp6 >> 3);
-+            shift = log2_trafo_size - (qp6 >> 3);
-+
-+            if (shift < 0) {
-+                scale <<= -shift;
-+                shift = 0;
-+            }
-+        }
-+
-+        if (s->ps.sps->scaling_list_enable_flag && !(trans_skip_or_bypass && log2_trafo_size > 2)) {
-+            const ScalingList * const sl = s->ps.pps->scaling_list_data_present_flag ?
-+                &s->ps.pps->scaling_list : &s->ps.sps->scaling_list;
-+            const unsigned int matrix_id =
-+                lc->cu.pred_mode != MODE_INTRA ? 3 + c_idx : c_idx;
-+
-+            scale_matrix = sl->sl[log2_trafo_size - 2][matrix_id];
-+            dc_scale = scale_matrix[0];
-+            if (log2_trafo_size >= 4)
-+                dc_scale = sl->sl_dc[log2_trafo_size - 4][matrix_id];
-+        }
-+        else
-+        {
-+            static const uint8_t sixteen_scale[64] = {
-+                16, 16, 16, 16, 16, 16, 16, 16,
-+                16, 16, 16, 16, 16, 16, 16, 16,
-+                16, 16, 16, 16, 16, 16, 16, 16,
-+                16, 16, 16, 16, 16, 16, 16, 16,
-+                16, 16, 16, 16, 16, 16, 16, 16,
-+                16, 16, 16, 16, 16, 16, 16, 16,
-+                16, 16, 16, 16, 16, 16, 16, 16,
-+                16, 16, 16, 16, 16, 16, 16, 16
-+            };
-+            scale_matrix = sixteen_scale;
-+            dc_scale = 16;
-+        }
-+    } else {
-+        static const uint8_t unit_scale[64] = {
-+            1, 1, 1, 1, 1, 1, 1, 1,
-+            1, 1, 1, 1, 1, 1, 1, 1,
-+            1, 1, 1, 1, 1, 1, 1, 1,
-+            1, 1, 1, 1, 1, 1, 1, 1,
-+            1, 1, 1, 1, 1, 1, 1, 1,
-+            1, 1, 1, 1, 1, 1, 1, 1,
-+            1, 1, 1, 1, 1, 1, 1, 1,
-+            1, 1, 1, 1, 1, 1, 1, 1,
-+        };
-+        scale_matrix = unit_scale;
-+        shift        = 0;
-+        scale        = 2;  // We will shift right to kill this
-+        dc_scale     = 1;
-+
-+        may_hide_sign = 0;
-+    }
-+
-+
-+
-+
-+    if (lc->cu.pred_mode == MODE_INTER && s->ps.sps->explicit_rdpcm_enabled_flag &&
-+        trans_skip_or_bypass) {
-+        explicit_rdpcm_flag = explicit_rdpcm_flag_decode(lc, c_idx_nz);
-+        if (explicit_rdpcm_flag) {
-+            may_hide_sign = 0;
-+            explicit_rdpcm_dir_flag = explicit_rdpcm_dir_flag_decode(lc, c_idx_nz);
-+        }
-+    }
-+
-+    last_significant_coeff_xy_prefix_decode(lc, c_idx_nz, log2_trafo_size,
-+                                           &last_significant_coeff_x, &last_significant_coeff_y);
-+
-+    if (last_significant_coeff_x > 3) {
-+        int suffix = last_significant_coeff_suffix_decode(lc, last_significant_coeff_x);
-+        last_significant_coeff_x = (1 << ((last_significant_coeff_x >> 1) - 1)) *
-+        (2 + (last_significant_coeff_x & 1)) +
-+        suffix;
-+    }
-+
-+    if (last_significant_coeff_y > 3) {
-+        int suffix = last_significant_coeff_suffix_decode(lc, last_significant_coeff_y);
-+        last_significant_coeff_y = (1 << ((last_significant_coeff_y >> 1) - 1)) *
-+        (2 + (last_significant_coeff_y & 1)) +
-+        suffix;
-+    }
-+
-+    if (scan_idx == SCAN_VERT)
-+        FFSWAP(int, last_significant_coeff_x, last_significant_coeff_y);
-+
-+    x_cg_last_sig = last_significant_coeff_x >> 2;
-+    y_cg_last_sig = last_significant_coeff_y >> 2;
-+
-+    switch (scan_idx) {
-+    case SCAN_DIAG: {
-+        int last_x_c = last_significant_coeff_x & 3;
-+        int last_y_c = last_significant_coeff_y & 3;
-+
-+        num_coeff = diag_scan4x4_inv[last_y_c][last_x_c];
-+
-+        switch (log2_trafo_size) {
-+        case 2:
-+            scan_x_cg = scan_1x1;
-+            scan_y_cg = scan_1x1;
-+            break;
-+        case 3:
-+            num_coeff += diag_scan2x2_inv[y_cg_last_sig][x_cg_last_sig] << 4;
-+            scan_x_cg = diag_scan2x2_x;
-+            scan_y_cg = diag_scan2x2_y;
-+            break;
-+        case 4:
-+            num_coeff += diag_scan4x4_inv[y_cg_last_sig][x_cg_last_sig] << 4;
-+            scan_x_cg = ff_hevc_rpi_diag_scan4x4_x;
-+            scan_y_cg = ff_hevc_rpi_diag_scan4x4_y;
-+            break;
-+        case 5:
-+        default:
-+            num_coeff += diag_scan8x8_inv[y_cg_last_sig][x_cg_last_sig] << 4;
-+            scan_x_cg = ff_hevc_rpi_diag_scan8x8_x;
-+            scan_y_cg = ff_hevc_rpi_diag_scan8x8_y;
-+            break;
-+        }
-+        break;
-+    }
-+    case SCAN_HORIZ:
-+        scan_x_cg = horiz_scan2x2_x;
-+        scan_y_cg = horiz_scan2x2_y;
-+        num_coeff = horiz_scan8x8_inv[last_significant_coeff_y][last_significant_coeff_x];
-+        break;
-+    default: //SCAN_VERT
-+        scan_x_cg = horiz_scan2x2_y;
-+        scan_y_cg = horiz_scan2x2_x;
-+        num_coeff = horiz_scan8x8_inv[last_significant_coeff_x][last_significant_coeff_y];
-+        break;
-+    }
-+    num_coeff++;
-+    num_last_subset = (num_coeff - 1) >> 4;
-+
-+    significant_coeff_group_flag[y_cg_last_sig] = 1 << x_cg_last_sig; // 1st subset always significant
-+
-+    {
-+        const unsigned int ccount = 1 << (log2_trafo_size * 2);
-+        const int special = trans_skip_or_bypass /* || lc->tu.cross_pf */;  // These need special processing
-+        use_vpu = 0;
-+        use_dc = (num_coeff == 1) && !special &&
-+            !(lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2);
-+
-+        if (use_dc) {
-+            // Just need a little empty space
-+            coeffs = dummy_coeffs;
-+            // No need to clear
-+        }
-+        else
-+        {
-+            use_vpu = !special && log2_trafo_size >= 4;
-+#if RPI_COMPRESS_COEFFS
-+            use_compress = use_vpu && lc->jb0->coeffs.s[log2_trafo_size - 2].packed;
-+#endif
-+            coeffs = rpi_alloc_coeff_buf(lc->jb0, !use_vpu ? 0 : log2_trafo_size - 2, ccount);
-+#if RPI_COMPRESS_COEFFS
-+            coeffs32 = (int*)coeffs;
-+            if (!use_compress)
-+#endif
-+#if HAVE_NEON
-+            rpi_zap_coeff_vals_neon(coeffs, log2_trafo_size - 2);
-+#else
-+            memset(coeffs, 0, ccount * sizeof(int16_t));
-+#endif
-+        }
-+    }
-+
-+    i = num_last_subset;
-+    do {
-+        int implicit_non_zero_coeff = 0;
-+        int n_end;
-+
-+        uint8_t significant_coeff_flag_idx[16];
-+        unsigned int nb_significant_coeff_flag = 0;
-+
-+        if (i == num_last_subset) {
-+            // First time through
-+            int last_scan_pos = num_coeff - (i << 4) - 1;
-+            n_end = last_scan_pos - 1;
-+            significant_coeff_flag_idx[0] = last_scan_pos;
-+            nb_significant_coeff_flag = 1;
-+        } else {
-+            n_end = 15;
-+            implicit_non_zero_coeff = (i != 0);
-+        }
-+
-+        if (n_end >= 0) {
-+            static const uint8_t ctx_idx_maps_ts2[3][16] = {
-+                D4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2
-+                H4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2
-+                V4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8)  // log2_trafo_size == 2
-+            };
-+            // N.B. prev_sig = Right * 2 + Down
-+            static const uint8_t ctx_idx_maps[3][4][16] = {
-+                {
-+                    D4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
-+                    D4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1
-+                    D4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2
-+                    D4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2)  // prev_sig == 3, default
-+                },
-+                {
-+                    H4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
-+                    H4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1
-+                    H4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2
-+                    H4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2)  // prev_sig == 3, default
-+                },
-+                {
-+                    V4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
-+                    V4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1
-+                    V4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2
-+                    V4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2)  // prev_sig == 3, default
-+                }
-+            };
-+            const uint8_t *ctx_idx_map_p;
-+            int scf_offset = 0;
-+
-+            if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) {
-+                ctx_idx_map_p = ctx_idx_maps[0][3];
-+                scf_offset = 40 + c_idx_nz;
-+            } else {
-+                if (c_idx_nz != 0)
-+                    scf_offset = 27;
-+
-+                if (log2_trafo_size == 2) {
-+                    ctx_idx_map_p = ctx_idx_maps_ts2[scan_idx];
-+                } else {
-+                    ctx_idx_map_p = ctx_idx_maps[scan_idx][prev_sig];
-+                    if (!c_idx_nz) {
-+                        if (i != 0)
-+                            scf_offset += 3;
-+
-+                        if (log2_trafo_size == 3) {
-+                            scf_offset += (scan_idx == SCAN_DIAG) ? 9 : 15;
-+                        } else {
-+                            scf_offset += 21;
-+                        }
-+                    } else {
-+                        if (log2_trafo_size == 3)
-+                            scf_offset += 9;
-+                        else
-+                            scf_offset += 12;
-+                    }
-+                }
-+            }
-+
-+            if (n_end > 0) {
-+                int cnt = get_sig_coeff_flag_idxs(&lc->cc,
-+                    lc->cabac_state + elem_offset[SIGNIFICANT_COEFF_FLAG] + scf_offset,
-+                    n_end, ctx_idx_map_p,
-+                    significant_coeff_flag_idx + nb_significant_coeff_flag);
-+
-+                nb_significant_coeff_flag += cnt;
-+                if (cnt != 0) {
-+                    implicit_non_zero_coeff = 0;
-+                }
-+            }
-+
-+            if (implicit_non_zero_coeff == 0) {
-+                if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) {
-+                    scf_offset = 42 + c_idx_nz;
-+                } else {
-+                    if (i == 0) {
-+                        scf_offset = c_idx_nz ? 27 : 0;
-+                    } else {
-+                        scf_offset = 2 + scf_offset;
-+                    }
-+                }
-+                if (significant_coeff_flag_decode_0(lc, scf_offset) == 1) {
-+                    significant_coeff_flag_idx[nb_significant_coeff_flag] = 0;
-+                    nb_significant_coeff_flag++;
-+                }
-+            } else {
-+                significant_coeff_flag_idx[nb_significant_coeff_flag] = 0;
-+                nb_significant_coeff_flag++;
-+            }
-+        }
-+#if RPI_COMPRESS_COEFFS
-+        if (use_compress && (nb_significant_coeff_flag + num_nonzero + 1 >= (1<<(2*log2_trafo_size-1)))) { // Overflow when half-full!
-+          int16_t temp[32*32];
-+          const unsigned int ccount = 1 << (log2_trafo_size * 2);
-+          lc->jb0->coeffs.s[log2_trafo_size - 2].packed = 0;
-+          lc->jb0->coeffs.s[log2_trafo_size - 2].packed_n = lc->jb0->coeffs.s[log2_trafo_size - 2].n - ccount; // Don't want to unpack the last buffer
-+          memcpy(temp, coeffs, sizeof(int)*num_nonzero);
-+          coeffs32 = (int *)temp;
-+          memset(coeffs, 0, ccount * sizeof(int16_t));
-+          num_nonzero--;
-+          while (num_nonzero >= 0) {
-+            const unsigned int res = coeffs32[num_nonzero];
-+            const unsigned int offset = res & 0xffff;
-+            coeffs[ offset ] = res >> 16;
-+            num_nonzero--;
-+          }
-+          use_compress = 0;
-+        }
-+#endif            
-+
-+        if (nb_significant_coeff_flag != 0) {
-+            const unsigned int gt1_idx_delta = (c_idx_nz << 2) |
-+                ((i != 0 && !c_idx_nz) ? 2 : 0) |
-+                prev_subset_coded;
-+            const unsigned int idx0_gt1 = elem_offset[COEFF_ABS_LEVEL_GREATER1_FLAG] +
-+                (gt1_idx_delta << 2);
-+            const unsigned int idx_gt2 = elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] +
-+                gt1_idx_delta;
-+
-+            const unsigned int x_cg = scan_x_cg[i];
-+            const unsigned int y_cg = scan_y_cg[i];
-+            int16_t * const blk_coeffs = coeffs +
-+                ((x_cg + (y_cg << log2_trafo_size)) << 2);
-+            // This calculation is 'wrong' for log2_traffo_size == 2
-+            // but that doesn't matter as in this case x_cg & y_cg
-+            // are always 0 so result is correct (0) anyway
-+            const uint8_t * const blk_scale = scale_matrix +
-+                (((x_cg + (y_cg << 3)) << (5 - log2_trafo_size)));
-+
-+            // * The following code block doesn't deal with these flags:
-+            //   (nor did the one it replaces)
-+            //
-+            // cabac_bypass_alignment_enabled_flag
-+            //    This should be easy but I can't find a test case
-+            // extended_precision_processing_flag
-+            //    This can extend the required precision past 16bits
-+            //    so is probably tricky - also no example found yet
-+
-+#if USE_N_END_1
-+            if (nb_significant_coeff_flag == 1) {
-+                // There is a small gain to be had from special casing the single
-+                // transform coefficient case.  The reduction in complexity
-+                // makes up for the code duplicatioon.
-+
-+                int trans_coeff_level = 1;
-+                int coeff_sign_flag;
-+                int coded_val = 0;
-+
-+                // initialize first elem of coeff_bas_level_greater1_flag
-+                prev_subset_coded = 0;
-+
-+                if (get_cabac(&lc->cc, lc->cabac_state + idx0_gt1 + 1)) {
-+                    trans_coeff_level = 2;
-+                    prev_subset_coded = 1;
-+                    coded_val = get_cabac(&lc->cc, lc->cabac_state + idx_gt2);
-+                }
-+
-+                // Probably not worth the overhead of starting by22 for just one value
-+                coeff_sign_flag = get_cabac_bypass(&lc->cc);
-+
-+                if (coded_val)
-+                {
-+                    if (!s->ps.sps->persistent_rice_adaptation_enabled_flag) {
-+                        trans_coeff_level = 3 + coeff_abs_level_remaining_decode(&lc->cc, 0);
-+                    } else {
-+                        uint8_t * const stat_coeff =
-+                            lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1);
-+                        const unsigned int c_rice_param = *stat_coeff >> 2;
-+                        const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(&lc->cc, c_rice_param);
-+
-+                        trans_coeff_level = 3 + last_coeff_abs_level_remaining;
-+                        update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param);
-+                    }
-+                }
-+
-+                {
-+                    const xy_off_t * const xy_off = scan_xy_off + significant_coeff_flag_idx[0];
-+                    const int k = (int32_t)(coeff_sign_flag << 31) >> 31;
-+                    const unsigned int scale_m = blk_scale[xy_off->scale];
-+                    const int res = trans_scale_sat(
-+                        (trans_coeff_level ^ k) - k,  // Apply sign
-+                        scale,
-+                        i == 0 && xy_off->coeff == 0 ? dc_scale : scale_m,
-+                        shift);
-+#if RPI_COMPRESS_COEFFS                                
-+                      if (use_compress)
-+                        coeffs32[num_nonzero++] = (res<<16) + (&blk_coeffs[xy_off->coeff] - coeffs);
-+                      else
-+#endif
-+                      blk_coeffs[xy_off->coeff] = res;
-+                }
-+            }
-+            else
-+#endif
-+            {
-+                int sign_hidden = may_hide_sign;
-+                int levels[16]; // Should be able to get away with int16_t but that fails some tests
-+                uint32_t coeff_sign_flags;
-+                uint32_t coded_vals = 0;
-+                // Sum(abs(level[]))
-+                // In fact we only need the bottom bit and in some future
-+                // version that may be all we calculate
-+                unsigned int sum_abs;
-+
-+                coded_vals = get_greaterx_bits(lc, nb_significant_coeff_flag, levels,
-+                    &prev_subset_coded, &sum_abs, idx0_gt1, idx_gt2);
-+
-+                if (significant_coeff_flag_idx[0] - significant_coeff_flag_idx[nb_significant_coeff_flag - 1] <= 3)
-+                    sign_hidden = 0;
-+
-+                // -- Start bypass block
-+
-+                bypass_start(&lc->cc);
-+
-+                coeff_sign_flags = coeff_sign_flag_decode_bypass(&lc->cc, nb_significant_coeff_flag - sign_hidden);
-+
-+                if (coded_vals != 0)
-+                {
-+                    const int rice_adaptation_enabled = s->ps.sps->persistent_rice_adaptation_enabled_flag;
-+                    uint8_t * stat_coeff = !rice_adaptation_enabled ? NULL :
-+                        lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1);
-+                    int c_rice_param = !rice_adaptation_enabled ? 0 : *stat_coeff >> 2;
-+                    int * level = levels - 1;
-+
-+                    do {
-+                        {
-+                            const unsigned int z = hevc_clz32(coded_vals) + 1;
-+                            level += z;
-+                            coded_vals <<= z;
-+                        }
-+
-+                        {
-+                            const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode_bypass(&lc->cc, c_rice_param);
-+                            const int trans_coeff_level = *level + last_coeff_abs_level_remaining + 1;
-+
-+                            sum_abs += last_coeff_abs_level_remaining + 1;
-+                            *level = trans_coeff_level;
-+
-+                            if (stat_coeff != NULL)
-+                                update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param);
-+                            stat_coeff = NULL;
-+
-+                            if (trans_coeff_level > (3 << c_rice_param) &&
-+                                (c_rice_param < 4 || rice_adaptation_enabled))
-+                                ++c_rice_param;
-+                        }
-+                    } while (coded_vals != 0);
-+                }
-+
-+                // sign_hidden = 0 or 1 so we can combine the tests
-+                if ((sign_hidden & sum_abs) != 0) {
-+                    levels[nb_significant_coeff_flag - 1] = -levels[nb_significant_coeff_flag - 1];
-+                }
-+
-+                bypass_finish(&lc->cc);
-+
-+                // -- Finish bypass block
-+
-+                // Scale loop
-+                {
-+                    int m = nb_significant_coeff_flag - 1;
-+
-+                    // Deal with DC component (if any) first
-+                    if (i == 0 && significant_coeff_flag_idx[m] == 0)
-+                    {
-+                        const int k = (int32_t)(coeff_sign_flags << m) >> 31;
-+                        const int res = trans_scale_sat(
-+                            (levels[m] ^ k) - k, scale, dc_scale, shift);
-+#if RPI_COMPRESS_COEFFS
-+                        if (use_compress)
-+                        {
-+                            coeffs32[num_nonzero++] = (res<<16) + (blk_coeffs - coeffs);
-+                        }
-+                        else
-+#endif
-+                        {
-+                            blk_coeffs[0] = res;
-+                        }
-+                        --m;
-+                    }
-+
-+#if !USE_N_END_1
-+                    // If N_END_1 set then m was at least 1 initially
-+                    if (m >= 0)
-+#endif
-+                    {
-+                        do {
-+                            const xy_off_t * const xy_off = scan_xy_off +
-+                                significant_coeff_flag_idx[m];
-+                            const int k = (int32_t)(coeff_sign_flags << m) >> 31;
-+                            const int res = trans_scale_sat(
-+                                (levels[m] ^ k) - k,
-+                                scale,
-+                                blk_scale[xy_off->scale],
-+                                shift);
-+#if RPI_COMPRESS_COEFFS
-+                            if (use_compress) {
-+                              coeffs32[num_nonzero++] = (res<<16) + (&blk_coeffs[xy_off->coeff] - coeffs);
-+                            } else
-+#endif
-+                              blk_coeffs[xy_off->coeff] = res;
-+                        } while (--m >= 0);
-+                    }
-+                }
-+
-+            }
-+        }
-+    } while ((i = next_subset(lc, i, c_idx_nz,
-+                              significant_coeff_group_flag, scan_x_cg, scan_y_cg, &prev_sig)) >= 0 &&
-+             !cabac_overflow(&lc->cc));
-+
-+    if (lc->cu.cu_transquant_bypass_flag) {
-+        if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
-+                                    (pred_mode_intra == 10 || pred_mode_intra == 26))) {
-+            int mode = s->ps.sps->implicit_rdpcm_enabled_flag ? (pred_mode_intra == 26) : explicit_rdpcm_dir_flag;
-+
-+            s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
-+        }
-+    } else {
-+        if (trans_skip_or_bypass) { // Must be trans_skip as we've already dealt with bypass
-+            int rot = s->ps.sps->transform_skip_rotation_enabled_flag &&
-+                      log2_trafo_size == 2 &&
-+                      lc->cu.pred_mode == MODE_INTRA;
-+            if (rot) {
-+                for (i = 0; i < 8; i++)
-+                    FFSWAP(int16_t, coeffs[i], coeffs[16 - i - 1]);
-+            }
-+
-+            s->hevcdsp.dequant(coeffs, log2_trafo_size);
-+
-+            if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
-+                                        lc->cu.pred_mode == MODE_INTRA &&
-+                                        (pred_mode_intra == 10 || pred_mode_intra == 26))) {
-+                int mode = explicit_rdpcm_flag ? explicit_rdpcm_dir_flag : (pred_mode_intra == 26);
-+
-+                s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
-+            }
-+        } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) {
-+            s->hevcdsp.transform_4x4_luma(coeffs);
-+        }
-+        else if (!use_vpu)
-+        {
-+            int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y);
-+            if (max_xy == 0)
-+            {
-+                if (use_dc)
-+                    rpi_add_dc(s, lc->jb0, log2_trafo_size, c_idx, x0, y0, coeffs);
-+                else
-+                    s->hevcdsp.idct_dc[log2_trafo_size - 2](coeffs);
-+            }
-+            else {
-+                int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4;
-+                if (max_xy < 4)
-+                    col_limit = FFMIN(4, col_limit);
-+                else if (max_xy < 8)
-+                    col_limit = FFMIN(8, col_limit);
-+                else if (max_xy < 12)
-+                    col_limit = FFMIN(24, col_limit);
-+                s->hevcdsp.idct[log2_trafo_size - 2](coeffs, col_limit);
-+            }
-+        }
-+    }
-+
-+#if 0
-+    // Mildly rotted - we support no mode where cross is valid
-+    if (lc->tu.cross_pf) {
-+        int16_t * const coeffs_y = (int16_t*)lc->edge_emu_buffer;
-+        const int ccount = 1 << (log2_trafo_size * 2);
-+
-+        for (i = 0; i < ccount; i++) {
-+            coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
-+        }
-+    }
-+#endif
-+
-+    if (!use_dc) {
-+#if RPI_COMPRESS_COEFFS                                
-+        if (use_compress) {
-+          coeffs32[num_nonzero] = 0;
-+        }
-+#endif      
-+        rpi_add_residual(s, lc->jb0, log2_trafo_size, c_idx, x0, y0, coeffs);
-+    }
-+}
-+
-+#if !USE_BY22
-+// Stores results to lc
-+MvXY ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc)
-+{
-+    int x = abs_mvd_greater0_flag_decode(lc);
-+    int y = abs_mvd_greater0_flag_decode(lc);
-+
-+    if (x)
-+        x += abs_mvd_greater1_flag_decode(lc);
-+    if (y)
-+        y += abs_mvd_greater1_flag_decode(lc);
-+
-+    switch (x) {
-+    case 2: x = mvd_decode(lc);           break;
-+    case 1: x = mvd_sign_flag_decode(lc); break;
-+    case 0: x = 0;                       break;
-+    }
-+
-+    switch (y) {
-+    case 2: y = mvd_decode(lc);           break;
-+    case 1: y = mvd_sign_flag_decode(lc); break;
-+    case 0: y = 0;                       break;
-+    }
-+    return MV_XY(x,y);
-+}
-+#else
-+MvXY ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc)
-+{
-+    int x = abs_mvd_greater0_flag_decode(lc);
-+    int y = abs_mvd_greater0_flag_decode(lc);
-+
-+    if ((x | y) == 0)
-+        return 0;
-+
-+    if (x != 0)
-+        x += abs_mvd_greater1_flag_decode(lc);
-+    if (y != 0)
-+        y += abs_mvd_greater1_flag_decode(lc);
-+
-+    if ((x | y) == 1)
-+    {
-+        // Not worth starting BY22
-+        if (x != 0)
-+            x = mvd_sign_flag_decode(lc);
-+        if (y != 0)
-+            y = mvd_sign_flag_decode(lc);
-+    }
-+    else
-+    {
-+        CABACContext * const cc = &lc->cc;
-+        uint32_t val;
-+        uint32_t b;
-+        unsigned int n = 0;
-+
-+        bypass_start(cc);
-+        b = val = get_cabac_by22_peek(cc);
-+
-+        if (x == 1) {
-+            x = ((int32_t)b >> 31) | 1;
-+            n = 1;
-+            b <<= 1;
-+        }
-+        else if (x == 2) {
-+            // EG1 so we have (leading one bits + 1) of suffix
-+            // This makes prefix & suffix lengths the same
-+            const unsigned int k = hevc_clz32(~b) + 1;
-+            int s;
-+
-+            av_assert2(k <= 15);
-+
-+            b <<= k;
-+            n = 2 * k + 1; // Includes suffix & sign
-+
-+            // We need to have k*2 + 2 (prefix, suffix, sign, y-sign) bits peeked
-+            // if we are going to do this without a flush
-+            if (k > CABAC_BY22_PEEK_BITS / 2 - 1)
-+            {
-+                // Need too many bits - flush
-+                // n = k
-+                get_cabac_by22_flush(cc, k, val);
-+                b = val = get_cabac_by22_peek(cc);
-+                n = k + 1;
-+            }
-+
-+            x = (b >> (32 - k)) + (1 << k);
-+            b <<= k;
-+            s = (int32_t)b >> 31;
-+            x = (x ^ s) - s;
-+            b <<= 1;
-+
-+            // Max abs value of an mv is 2^15 - 1 (i.e. a prefix len of 15 bits)
-+            if (y > 1 && n > CABAC_BY22_PEEK_BITS - 15)
-+            {
-+                get_cabac_by22_flush(cc, n, val);
-+                b = val = get_cabac_by22_peek(cc);
-+                n = 0;
-+            }
-+        }
-+
-+        if (y == 1) {
-+            y = ((int32_t)b >> 31) | 1;
-+            ++n;
-+            // don't care about b anymore
-+        }
-+        else if (y == 2) {
-+            const unsigned int k = hevc_clz32(~b) + 1;
-+            int s;
-+
-+            av_assert2(k <= 15);
-+
-+            // We need to have k*2 + 1 (prefix, suffix, sign) bits peeked
-+            // if we are going to do this without a flush
-+            b <<= k;
-+            n += 2 * k + 1;
-+
-+            if (n > CABAC_BY22_PEEK_BITS)
-+            {
-+                // Need too many bits - flush
-+                get_cabac_by22_flush(cc, n - (k + 1), val);
-+                b = val = get_cabac_by22_peek(cc);
-+                n = k + 1;
-+            }
-+
-+            y = (b >> (32 - k)) + (1 << k);
-+            s = (int32_t)(b << k) >> 31;
-+            y = (y ^ s) - s;
-+            // don't care about b anymore
-+        }
-+
-+        get_cabac_by22_flush(cc, n, val);
-+        bypass_finish(cc);
-+    }
-+
-+    return MV_XY(x, y);
-+}
-+#endif
---- /dev/null
-+++ b/libavcodec/rpi_hevc_cabac_fns.h
-@@ -0,0 +1,217 @@
-+/*
-+ * HEVC CABAC decoding
-+ *
-+ * Copyright (C) 2012 - 2013 Guillaume Martres
-+ * Copyright (C) 2012 - 2013 Gildas Cocherel
-+ * Copyright (C) 2012 - 2013 Gildas Cocherel
-+ * Copyright (C) 2018 John Cox
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+
-+#ifndef AVCODEC_RPI_HEVC_CABAC_FNS_H
-+#define AVCODEC_RPI_HEVC_CABAC_FNS_H
-+
-+#include "config.h"
-+#include "rpi_hevcdec.h"
-+
-+void ff_hevc_rpi_save_states(HEVCRpiContext *s, const HEVCRpiLocalContext * const lc);
-+int ff_hevc_rpi_cabac_init_decoder(HEVCRpiLocalContext * const lc);
-+void ff_hevc_rpi_cabac_init(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, const unsigned int ctb_flags);
-+int ff_hevc_rpi_sao_type_idx_decode(HEVCRpiLocalContext * const lc);
-+int ff_hevc_rpi_sao_band_position_decode(HEVCRpiLocalContext * const lc);
-+int ff_hevc_rpi_sao_offset_abs_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc);
-+int ff_hevc_rpi_sao_offset_sign_decode(HEVCRpiLocalContext * const lc);
-+int ff_hevc_rpi_sao_eo_class_decode(HEVCRpiLocalContext * const lc);
-+int ff_hevc_rpi_part_mode_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int log2_cb_size);
-+int ff_hevc_rpi_mpm_idx_decode(HEVCRpiLocalContext * const lc);
-+int ff_hevc_rpi_rem_intra_luma_pred_mode_decode(HEVCRpiLocalContext * const lc);
-+int ff_hevc_rpi_intra_chroma_pred_mode_decode(HEVCRpiLocalContext * const lc);
-+int ff_hevc_rpi_merge_idx_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc);
-+int ff_hevc_rpi_inter_pred_idc_decode(HEVCRpiLocalContext * const lc, int nPbW, int nPbH);
-+int ff_hevc_rpi_ref_idx_lx_decode(HEVCRpiLocalContext * const lc, const int num_ref_idx_lx);
-+int ff_hevc_rpi_log2_res_scale_abs(HEVCRpiLocalContext * const lc, const int idx);
-+
-+//int ff_hevc_rpi_cu_qp_delta_sign_flag(HEVCRpiLocalContext * const lc);
-+int ff_hevc_rpi_cu_qp_delta(HEVCRpiLocalContext * const lc);
-+int ff_hevc_rpi_cu_chroma_qp_offset_idx(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc);
-+void ff_hevc_rpi_hls_residual_coding(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
-+                                const int x0, const int y0,
-+                                const int log2_trafo_size, const enum ScanType scan_idx,
-+                                const int c_idx);
-+
-+MvXY ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc);
-+int ff_hevc_rpi_cabac_overflow(const HEVCRpiLocalContext * const lc);
-+
-+#define HEVC_BIN_SAO_MERGE_FLAG                         0
-+#define HEVC_BIN_SAO_TYPE_IDX                           1
-+#define HEVC_BIN_SAO_EO_CLASS                           2
-+#define HEVC_BIN_SAO_BAND_POSITION                      2
-+#define HEVC_BIN_SAO_OFFSET_ABS                         2
-+#define HEVC_BIN_SAO_OFFSET_SIGN                        2
-+#define HEVC_BIN_END_OF_SLICE_FLAG                      2
-+#define HEVC_BIN_SPLIT_CODING_UNIT_FLAG                 2
-+#define HEVC_BIN_CU_TRANSQUANT_BYPASS_FLAG              5
-+#define HEVC_BIN_SKIP_FLAG                              6
-+#define HEVC_BIN_CU_QP_DELTA                            9
-+#define HEVC_BIN_PRED_MODE                              12
-+#define HEVC_BIN_PART_MODE                              13
-+#define HEVC_BIN_PCM_FLAG                               17
-+#define HEVC_BIN_PREV_INTRA_LUMA_PRED_MODE              17
-+#define HEVC_BIN_MPM_IDX                                18
-+#define HEVC_BIN_REM_INTRA_LUMA_PRED_MODE               18
-+#define HEVC_BIN_INTRA_CHROMA_PRED_MODE                 18
-+#define HEVC_BIN_MERGE_FLAG                             20
-+#define HEVC_BIN_MERGE_IDX                              21
-+#define HEVC_BIN_INTER_PRED_IDC                         22
-+#define HEVC_BIN_REF_IDX_L0                             27
-+#define HEVC_BIN_REF_IDX_L1                             29
-+#define HEVC_BIN_ABS_MVD_GREATER0_FLAG                  31
-+#define HEVC_BIN_ABS_MVD_GREATER1_FLAG                  33
-+#define HEVC_BIN_ABS_MVD_MINUS2                         35
-+#define HEVC_BIN_MVD_SIGN_FLAG                          35
-+#define HEVC_BIN_MVP_LX_FLAG                            35
-+#define HEVC_BIN_NO_RESIDUAL_DATA_FLAG                  36
-+#define HEVC_BIN_SPLIT_TRANSFORM_FLAG                   37
-+#define HEVC_BIN_CBF_LUMA                               40
-+#define HEVC_BIN_CBF_CB_CR                              42
-+#define HEVC_BIN_TRANSFORM_SKIP_FLAG                    46
-+#define HEVC_BIN_EXPLICIT_RDPCM_FLAG                    48
-+#define HEVC_BIN_EXPLICIT_RDPCM_DIR_FLAG                50
-+#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_X_PREFIX        52
-+#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_Y_PREFIX        70
-+#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_X_SUFFIX        88
-+#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_Y_SUFFIX        88
-+#define HEVC_BIN_SIGNIFICANT_COEFF_GROUP_FLAG           88
-+#define HEVC_BIN_SIGNIFICANT_COEFF_FLAG                 92
-+#define HEVC_BIN_COEFF_ABS_LEVEL_GREATER1_FLAG          136
-+#define HEVC_BIN_COEFF_ABS_LEVEL_GREATER2_FLAG          160
-+#define HEVC_BIN_COEFF_ABS_LEVEL_REMAINING              166
-+#define HEVC_BIN_COEFF_SIGN_FLAG                        166
-+#define HEVC_BIN_LOG2_RES_SCALE_ABS                     166
-+#define HEVC_BIN_RES_SCALE_SIGN_FLAG                    174
-+#define HEVC_BIN_CU_CHROMA_QP_OFFSET_FLAG               176
-+#define HEVC_BIN_CU_CHROMA_QP_OFFSET_IDX                177
-+
-+
-+int ff_hevc_rpi_get_cabac(CABACContext * const c, uint8_t * const state);
-+int ff_hevc_rpi_get_cabac_terminate(CABACContext * const c);
-+
-+static inline const uint8_t* ff_hevc_rpi_cabac_skip_bytes(CABACContext * const c, int n) {
-+    const uint8_t *ptr = c->bytestream;
-+
-+    if (c->low & 0x1)
-+        ptr--;
-+#if CABAC_BITS == 16
-+    if (c->low & 0x1FF)
-+        ptr--;
-+#endif
-+    if ((int) (c->bytestream_end - ptr) < n)
-+        return NULL;
-+    if (ff_init_cabac_decoder(c, ptr + n, c->bytestream_end - ptr - n) < 0)
-+        return NULL;
-+
-+    return ptr;
-+}
-+
-+static inline int ff_hevc_rpi_sao_merge_flag_decode(HEVCRpiLocalContext * const lc)
-+{
-+    return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_SAO_MERGE_FLAG);
-+}
-+
-+static inline int ff_hevc_rpi_cu_transquant_bypass_flag_decode(HEVCRpiLocalContext * const lc)
-+{
-+    return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CU_TRANSQUANT_BYPASS_FLAG);
-+}
-+
-+static inline int ff_hevc_rpi_cu_chroma_qp_offset_flag(HEVCRpiLocalContext * const lc)
-+{
-+    return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CU_CHROMA_QP_OFFSET_FLAG);
-+}
-+
-+static inline int ff_hevc_rpi_split_coding_unit_flag_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
-+                                                            const unsigned int ct_depth,
-+                                                            const unsigned int x0, const unsigned int y0)
-+{
-+    return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_SPLIT_CODING_UNIT_FLAG +
-+                                 ((s->cabac_stash_left[y0 >> 3] >> 1) > ct_depth) +
-+                                 ((s->cabac_stash_up[x0 >> 3] >> 1) > ct_depth));
-+}
-+
-+static inline int ff_hevc_rpi_skip_flag_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
-+                             const int x0, const int y0, const int x_cb, const int y_cb)
-+{
-+    return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_SKIP_FLAG +
-+                                 (s->cabac_stash_left[y0 >> 3] & 1) +
-+                                 (s->cabac_stash_up[x0 >> 3] & 1));
-+}
-+
-+static inline int ff_hevc_rpi_pred_mode_decode(HEVCRpiLocalContext * const lc)
-+{
-+    return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_PRED_MODE);
-+}
-+
-+static inline int ff_hevc_rpi_pcm_flag_decode(HEVCRpiLocalContext * const lc)
-+{
-+    return ff_hevc_rpi_get_cabac_terminate(&lc->cc);
-+}
-+
-+static inline int ff_hevc_rpi_prev_intra_luma_pred_flag_decode(HEVCRpiLocalContext * const lc)
-+{
-+    return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_PREV_INTRA_LUMA_PRED_MODE);
-+}
-+
-+static inline int ff_hevc_rpi_merge_flag_decode(HEVCRpiLocalContext * const lc)
-+{
-+    return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_MERGE_FLAG);
-+}
-+
-+static inline int ff_hevc_rpi_mvp_lx_flag_decode(HEVCRpiLocalContext * const lc)
-+{
-+    return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_MVP_LX_FLAG);
-+}
-+
-+static inline int ff_hevc_rpi_no_residual_syntax_flag_decode(HEVCRpiLocalContext * const lc)
-+{
-+    return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_NO_RESIDUAL_DATA_FLAG);
-+}
-+
-+static inline int ff_hevc_rpi_cbf_cb_cr_decode(HEVCRpiLocalContext * const lc, const int trafo_depth)
-+{
-+    return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CBF_CB_CR + trafo_depth);
-+}
-+
-+static inline int ff_hevc_rpi_cbf_luma_decode(HEVCRpiLocalContext * const lc, const int trafo_depth)
-+{
-+    return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CBF_LUMA + !trafo_depth);
-+}
-+
-+static inline int ff_hevc_rpi_split_transform_flag_decode(HEVCRpiLocalContext * const lc, const int log2_trafo_size)
-+{
-+    return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_SPLIT_TRANSFORM_FLAG + 5 - log2_trafo_size);
-+}
-+
-+static inline int ff_hevc_rpi_res_scale_sign_flag(HEVCRpiLocalContext *const lc, const int idx)
-+{
-+    return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_RES_SCALE_SIGN_FLAG + idx);
-+}
-+
-+
-+
-+#endif
-+
---- /dev/null
-+++ b/libavcodec/rpi_hevc_data.c
-@@ -0,0 +1,75 @@
-+/*
-+ * HEVC shared tables
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#include <stdint.h>
-+
-+#include "rpi_hevc_data.h"
-+
-+const uint8_t ff_hevc_rpi_diag_scan4x4_x[16] = {
-+    0, 0, 1, 0,
-+    1, 2, 0, 1,
-+    2, 3, 1, 2,
-+    3, 2, 3, 3,
-+};
-+
-+const uint8_t ff_hevc_rpi_diag_scan4x4_y[16] = {
-+    0, 1, 0, 2,
-+    1, 0, 3, 2,
-+    1, 0, 3, 2,
-+    1, 3, 2, 3,
-+};
-+
-+const uint8_t ff_hevc_rpi_diag_scan8x8_x[64] = {
-+    0, 0, 1, 0,
-+    1, 2, 0, 1,
-+    2, 3, 0, 1,
-+    2, 3, 4, 0,
-+    1, 2, 3, 4,
-+    5, 0, 1, 2,
-+    3, 4, 5, 6,
-+    0, 1, 2, 3,
-+    4, 5, 6, 7,
-+    1, 2, 3, 4,
-+    5, 6, 7, 2,
-+    3, 4, 5, 6,
-+    7, 3, 4, 5,
-+    6, 7, 4, 5,
-+    6, 7, 5, 6,
-+    7, 6, 7, 7,
-+};
-+
-+const uint8_t ff_hevc_rpi_diag_scan8x8_y[64] = {
-+    0, 1, 0, 2,
-+    1, 0, 3, 2,
-+    1, 0, 4, 3,
-+    2, 1, 0, 5,
-+    4, 3, 2, 1,
-+    0, 6, 5, 4,
-+    3, 2, 1, 0,
-+    7, 6, 5, 4,
-+    3, 2, 1, 0,
-+    7, 6, 5, 4,
-+    3, 2, 1, 7,
-+    6, 5, 4, 3,
-+    2, 7, 6, 5,
-+    4, 3, 7, 6,
-+    5, 4, 7, 6,
-+    5, 7, 6, 7,
-+};
---- /dev/null
-+++ b/libavcodec/rpi_hevc_data.h
-@@ -0,0 +1,31 @@
-+/*
-+ * HEVC shared data tables
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#ifndef AVCODEC_RPI_HEVC_DATA_H
-+#define AVCODEC_RPI_HEVC_DATA_H
-+
-+#include <stdint.h>
-+
-+extern const uint8_t ff_hevc_rpi_diag_scan4x4_x[16];
-+extern const uint8_t ff_hevc_rpi_diag_scan4x4_y[16];
-+extern const uint8_t ff_hevc_rpi_diag_scan8x8_x[64];
-+extern const uint8_t ff_hevc_rpi_diag_scan8x8_y[64];
-+
-+#endif /* AVCODEC_RPI_HEVC_DATA_H */
---- /dev/null
-+++ b/libavcodec/rpi_hevc_filter.c
-@@ -0,0 +1,1210 @@
-+/*
-+ * HEVC video decoder
-+ *
-+ * Originally by:
-+ * Copyright (C) 2012 - 2013 Guillaume Martres
-+ * Copyright (C) 2013 Seppo Tomperi
-+ * Copyright (C) 2013 Wassim Hamidouche
-+ *
-+ * Substantially rewritten:
-+ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading)
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+//#define DISABLE_SAO
-+//#define DISABLE_DEBLOCK
-+//#define DISABLE_STRENGTHS
-+// define DISABLE_DEBLOCK_NONREF for a 6% speed boost (by skipping deblocking on unimportant frames)
-+//#define DISABLE_DEBLOCK_NONREF
-+
-+#include "libavutil/common.h"
-+#include "libavutil/internal.h"
-+
-+#include "rpi_hevcdec.h"
-+
-+#include "bit_depth_template.c"
-+
-+#include "rpi_qpu.h"
-+#include "rpi_zc.h"
-+#include "libavutil/rpi_sand_fns.h"
-+
-+#define LUMA 0
-+#define CB 1
-+#define CR 2
-+
-+// tcoffset: -12,12; qp: 0,51; (bs-1)*2: 0,2
-+// so -12,75 overall
-+static const uint8_t tctablex[] = {
-+    0, 0, 0, 0, 0, 0, 0,  0,  0,  0,  0,  0,  // -ve quant padding
-+    0, 0, 0, 0, 0, 0, 0,  0,  0,  0,  0,  0,
-+    0, 0, 0, 0, 0, 0, 0,  0,  0,  0,  0,  0,
-+    0, 0, 0, 0, 0, 0, 0,  0,  0,  0,  0,  0,
-+
-+    0, 0, 0, 0, 0, 0, 0,  0,  0,  0,  0,  0,                          // -12..-1
-+    0, 0, 0, 0, 0, 0, 0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 0, 0, 1, // QP  0...18
-+    1, 1, 1, 1, 1, 1, 1,  1,  2,  2,  2,  2,  3,  3,  3,  3, 4, 4, 4, // QP 19...37
-+    5, 5, 6, 6, 7, 8, 9, 10, 11, 13, 14, 16, 18, 20, 22, 24,          // QP 38...53
-+    24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24                    // 54..75
-+};
-+#define tctable (tctablex + 12 + 6*8)
-+
-+static const uint8_t betatablex[] = {
-+    0, 0, 0, 0, 0, 0, 0,  0,  0,  0,  0,  0,  // -ve quant padding
-+    0, 0, 0, 0, 0, 0, 0,  0,  0,  0,  0,  0,
-+    0, 0, 0, 0, 0, 0, 0,  0,  0,  0,  0,  0,
-+    0, 0, 0, 0, 0, 0, 0,  0,  0,  0,  0,  0,
-+
-+    0, 0, 0, 0, 0, 0, 0,  0,  0,  0,  0,  0,                          // -12..-1
-+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  6,  7,  8, // QP 0...18
-+     9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, // QP 19...37
-+    38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64,                      // QP 38...51
-+    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64                    // 52..73
-+};
-+#define betatable (betatablex + 12 + 6*8)
-+
-+static inline int chroma_tc(const HEVCRpiContext * const s, const int qp_y,
-+                            const int c_idx, const int tc_offset)
-+{
-+    return tctable[(int)s->ps.pps->qp_dblk_x[c_idx][qp_y] + tc_offset + 2];
-+}
-+
-+static inline int get_qPy_pred(const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc,
-+                               const unsigned int xBase, const unsigned int yBase)
-+{
-+    const unsigned int ctb_size_mask        = (1 << s->ps.sps->log2_ctb_size) - 1;
-+    const unsigned int MinCuQpDeltaSizeMask = ~0U << s->ps.pps->log2_min_cu_qp_delta_size;
-+    const unsigned int xQgBase              = xBase & MinCuQpDeltaSizeMask;
-+    const unsigned int yQgBase              = yBase & MinCuQpDeltaSizeMask;
-+    const unsigned int min_cb_width         = s->ps.sps->min_cb_width;
-+    const unsigned int x_cb                 = xQgBase >> s->ps.sps->log2_min_cb_size;
-+    const unsigned int y_cb                 = yQgBase >> s->ps.sps->log2_min_cb_size;
-+    const int qPy_pred = lc->qPy_pred;
-+
-+    return (((xQgBase & ctb_size_mask) == 0 ? qPy_pred :
-+             s->qp_y_tab[(x_cb - 1) + y_cb * min_cb_width]) +
-+            ((yQgBase & ctb_size_mask) == 0 ? qPy_pred :
-+             s->qp_y_tab[x_cb + (y_cb - 1) * min_cb_width]) + 1) >> 1;
-+}
-+
-+// * Only called from bitstream decode in foreground
-+//   so should be safe
-+void ff_hevc_rpi_set_qPy(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int xBase, int yBase)
-+{
-+    const int qp_y = get_qPy_pred(s, lc, xBase, yBase);
-+
-+    if (lc->tu.cu_qp_delta != 0) {
-+        // ?? I suspect that the -bd_offset here leads to us adding it elsewhere
-+        int off = s->ps.sps->qp_bd_offset;
-+        lc->qp_y = FFUMOD(qp_y + lc->tu.cu_qp_delta + 52 + 2 * off,
-+                                 52 + off) - off;
-+    } else
-+        lc->qp_y = qp_y;
-+}
-+
-+static inline unsigned int pixel_shift(const HEVCRpiContext * const s, const unsigned int c_idx)
-+{
-+    return c_idx != 0 ? 1 + s->ps.sps->pixel_shift : s->ps.sps->pixel_shift;
-+}
-+
-+// "DSP" these?
-+static void copy_pixel(uint8_t *dst, const uint8_t *src, int pixel_shift)
-+{
-+    switch (pixel_shift)
-+    {
-+        case 2:
-+            *(uint32_t *)dst = *(uint32_t *)src;
-+            break;
-+        case 1:
-+            *(uint16_t *)dst = *(uint16_t *)src;
-+            break;
-+        default:
-+            *dst = *src;
-+            break;
-+    }
-+}
-+
-+static void copy_CTB_to_hv(const HEVCRpiContext * const s, const uint8_t * const src,
-+                           ptrdiff_t stride_src, int x, int y, int width, int height,
-+                           int c_idx, int x_ctb, int y_ctb)
-+{
-+    const unsigned int sh = pixel_shift(s, c_idx);
-+    const unsigned int w = s->ps.sps->width >> ctx_hshift(s, c_idx);
-+    const unsigned int h = s->ps.sps->height >> ctx_vshift(s, c_idx);
-+
-+    /* copy horizontal edges */
-+    memcpy(s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb) * w + x) << sh),
-+        src, width << sh);
-+    memcpy(s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 1) * w + x) << sh),
-+        src + stride_src * (height - 1), width << sh);
-+
-+    /* copy vertical edges */
-+    ff_hevc_rpi_copy_vert(s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb) * h + y) << sh), src, sh, height, 1 << sh, stride_src);
-+
-+    ff_hevc_rpi_copy_vert(s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 1) * h + y) << sh), src + ((width - 1) << sh), sh, height, 1 << sh, stride_src);
-+}
-+
-+// N.B. Src & dst are swapped as this is a restore!
-+// x0 & y0 are in luma coords
-+// Width & height are in Y/C pels as appropriate
-+// * Clear scope for optimsation here but not used enough to be worth it
-+static void restore_tqb_pixels(const HEVCRpiContext * const s,
-+                               uint8_t *src1, const uint8_t *dst1,
-+                               const ptrdiff_t stride_src, const ptrdiff_t stride_dst,
-+                               const unsigned int x0, const unsigned int y0,
-+                               const unsigned int width, const int height,
-+                               const int c_idx)
-+{
-+    if (s->ps.pps->transquant_bypass_enable_flag ||
-+        s->ps.sps->pcm.loop_filter_disable_flag)
-+    {
-+        const uint8_t *pcm = s->is_pcm + (x0 >> 6) + (y0 >> 3) * s->ps.sps->pcm_width;
-+        int blks_y = height >> (c_idx == 0 ? 3 : 2);
-+        const unsigned int bwidth = 8 << s->ps.sps->pixel_shift;  // Y & C have the same width in sand
-+        const unsigned int bheight = (c_idx == 0) ? 8 : 4;
-+        const unsigned int sh = ((x0 >> 3) & 7);
-+        const unsigned int mask = (1 << (width >> (c_idx == 0 ? 3 : 2))) - 1;
-+
-+        do {
-+            unsigned int m = (*pcm >> sh) & mask;
-+            uint8_t * bd = src1;
-+            const uint8_t * bs = dst1;
-+            while (m != 0) {
-+                if ((m & 1) != 0) {
-+                    s->hevcdsp.cpy_blk(bd, stride_src, bs, stride_dst, bwidth, bheight);
-+                }
-+                m >>= 1;
-+                bs += bwidth;
-+                bd += bwidth;
-+            }
-+            src1 += stride_src * bheight;
-+            dst1 += stride_dst * bheight;
-+            pcm += s->ps.sps->pcm_width;
-+        } while (--blks_y > 0);
-+    }
-+}
-+
-+#define CTB(tab, x, y) ((tab)[(y) * s->ps.sps->ctb_width + (x)])
-+
-+static void sao_filter_CTB(const HEVCRpiContext * const s, const int x, const int y)
-+{
-+#if SAO_FILTER_N == 5
-+    static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 2 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */};
-+#elif SAO_FILTER_N == 6
-+    static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 5 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */};
-+#else
-+#error Confused by size of sao fn array
-+#endif
-+    int c_idx;
-+    int edges[4];  // 0 left 1 top 2 right 3 bottom
-+    int x_ctb                = x >> s->ps.sps->log2_ctb_size;
-+    int y_ctb                = y >> s->ps.sps->log2_ctb_size;
-+    int ctb_addr_rs          = y_ctb * s->ps.sps->ctb_width + x_ctb;
-+    int ctb_addr_ts          = s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs];
-+    RpiSAOParams *sao           = &CTB(s->sao, x_ctb, y_ctb);
-+    // flags indicating unfilterable edges
-+    uint8_t vert_edge[]      = { 0, 0 };
-+    uint8_t horiz_edge[]     = { 0, 0 };
-+    uint8_t diag_edge[]      = { 0, 0, 0, 0 };
-+    uint8_t lfase            = CTB(s->filter_slice_edges, x_ctb, y_ctb);
-+    uint8_t no_tile_filter   = s->ps.pps->tiles_enabled_flag &&
-+                               !s->ps.pps->loop_filter_across_tiles_enabled_flag;
-+    uint8_t restore          = no_tile_filter || !lfase;
-+    uint8_t left_tile_edge   = 0;
-+    uint8_t right_tile_edge  = 0;
-+    uint8_t up_tile_edge     = 0;
-+    uint8_t bottom_tile_edge = 0;
-+    const int sliced = 1;
-+    const int plane_count = sliced ? 2 : (ctx_cfmt(s) != 0 ? 3 : 1);
-+
-+    edges[0]   = x_ctb == 0;
-+    edges[1]   = y_ctb == 0;
-+    edges[2]   = x_ctb == s->ps.sps->ctb_width  - 1;
-+    edges[3]   = y_ctb == s->ps.sps->ctb_height - 1;
-+
-+#ifdef DISABLE_SAO
-+    return;
-+#endif
-+
-+    if (restore) {
-+        if (!edges[0]) {
-+            left_tile_edge  = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]];
-+            vert_edge[0]    = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb)) || left_tile_edge;
-+        }
-+        if (!edges[2]) {
-+            right_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs+1]];
-+            vert_edge[1]    = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb + 1, y_ctb)) || right_tile_edge;
-+        }
-+        if (!edges[1]) {
-+            up_tile_edge     = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs - s->ps.sps->ctb_width]];
-+            horiz_edge[0]    = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb, y_ctb - 1)) || up_tile_edge;
-+        }
-+        if (!edges[3]) {
-+            bottom_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs + s->ps.sps->ctb_width]];
-+            horiz_edge[1]    = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb, y_ctb + 1)) || bottom_tile_edge;
-+        }
-+        if (!edges[0] && !edges[1]) {
-+            diag_edge[0] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb - 1)) || left_tile_edge || up_tile_edge;
-+        }
-+        if (!edges[1] && !edges[2]) {
-+            diag_edge[1] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb + 1, y_ctb - 1)) || right_tile_edge || up_tile_edge;
-+        }
-+        if (!edges[2] && !edges[3]) {
-+            diag_edge[2] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb + 1, y_ctb + 1)) || right_tile_edge || bottom_tile_edge;
-+        }
-+        if (!edges[0] && !edges[3]) {
-+            diag_edge[3] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb + 1)) || left_tile_edge || bottom_tile_edge;
-+        }
-+    }
-+
-+    for (c_idx = 0; c_idx < plane_count; c_idx++) {
-+        const unsigned int vshift = ctx_vshift(s, c_idx);
-+        const unsigned int hshift = ctx_hshift(s, c_idx);
-+        const int x0 = x >> hshift;
-+        const int y0 = y >> vshift;
-+        const ptrdiff_t stride_src = frame_stride1(s->frame, c_idx);
-+        const int ctb_size_h = (1 << (s->ps.sps->log2_ctb_size)) >> hshift;
-+        const int ctb_size_v = (1 << (s->ps.sps->log2_ctb_size)) >> vshift;
-+        const int width    = FFMIN(ctb_size_h, (s->ps.sps->width  >> hshift) - x0);
-+        const int height = FFMIN(ctb_size_v, (s->ps.sps->height >> vshift) - y0);
-+        int tab      = sao_tab[(FFALIGN(width, 8) >> 3) - 1];
-+        ptrdiff_t stride_dst;
-+        uint8_t *dst;
-+
-+        const unsigned int sh = s->ps.sps->pixel_shift + (sliced && c_idx != 0);
-+        const int wants_lr = sao->type_idx[c_idx] == SAO_EDGE && sao->eo_class[c_idx] != 1 /* Vertical */;
-+        uint8_t * const src = !sliced ?
-+                &s->frame->data[c_idx][y0 * stride_src + (x0 << sh)] :
-+            c_idx == 0 ?
-+                av_rpi_sand_frame_pos_y(s->frame, x0, y0) :
-+                av_rpi_sand_frame_pos_c(s->frame, x0, y0);
-+        const uint8_t * const src_l = edges[0] || !wants_lr ? NULL :
-+            !sliced ? src - (1 << sh) :
-+            c_idx == 0 ?
-+                av_rpi_sand_frame_pos_y(s->frame, x0 - 1, y0) :
-+                av_rpi_sand_frame_pos_c(s->frame, x0 - 1, y0);
-+        const uint8_t * const src_r = edges[2] || !wants_lr ? NULL :
-+            !sliced ? src + (width << sh) :
-+            c_idx == 0 ?
-+                av_rpi_sand_frame_pos_y(s->frame, x0 + width, y0) :
-+                av_rpi_sand_frame_pos_c(s->frame, x0 + width, y0);
-+
-+        if (sliced && c_idx > 1) {
-+            break;
-+        }
-+
-+//        if (c_idx == 1)
-+//            printf("%d: %dx%d %d,%d: lr=%d\n", c_idx, width, height, x0, y0, wants_lr);
-+
-+        switch (sao->type_idx[c_idx]) {
-+        case SAO_BAND:
-+            copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx,
-+                           x_ctb, y_ctb);
-+            if (s->ps.pps->transquant_bypass_enable_flag ||
-+                s->ps.sps->pcm.loop_filter_disable_flag)
-+            {
-+                // Can't use the edge buffer here as it may be in use by the foreground
-+                DECLARE_ALIGNED(64, uint8_t, dstbuf)
-+                    [2*MAX_PB_SIZE*MAX_PB_SIZE];
-+                dst = dstbuf;
-+                stride_dst = 2*MAX_PB_SIZE;
-+                s->hevcdsp.cpy_blk(dst, stride_dst, src, stride_src, width << sh, height);
-+                if (sliced && c_idx != 0)
-+                {
-+                    s->hevcdsp.sao_band_filter_c[tab](src, dst, stride_src, stride_dst,
-+                                                    sao->offset_val[1], sao->band_position[1],
-+                                                    sao->offset_val[2], sao->band_position[2],
-+                                                    width, height);
-+                }
-+                else
-+                {
-+                    s->hevcdsp.sao_band_filter[tab](src, dst, stride_src, stride_dst,
-+                                                    sao->offset_val[c_idx], sao->band_position[c_idx],
-+                                                    width, height);
-+                }
-+                restore_tqb_pixels(s, src, dst, stride_src, stride_dst,
-+                                   x, y, width, height, c_idx);
-+            } else {
-+                if (sliced && c_idx != 0)
-+                {
-+                    s->hevcdsp.sao_band_filter_c[tab](src, src, stride_src, stride_src,
-+                                                    sao->offset_val[1], sao->band_position[1],
-+                                                    sao->offset_val[2], sao->band_position[2],
-+                                                    width, height);
-+                }
-+                else
-+                {
-+                    s->hevcdsp.sao_band_filter[tab](src, src, stride_src, stride_src,
-+                                                    sao->offset_val[c_idx], sao->band_position[c_idx],
-+                                                    width, height);
-+                }
-+            }
-+            sao->type_idx[c_idx] = SAO_APPLIED;
-+            break;
-+        case SAO_EDGE:
-+        {
-+            const int w = s->ps.sps->width >> hshift;
-+            const int h = s->ps.sps->height >> vshift;
-+            int top_edge = edges[1];
-+            int bottom_edge = edges[3];
-+            // Can't use the edge buffer here as it may be in use by the foreground
-+            DECLARE_ALIGNED(64, uint8_t, dstbuf)
-+                [RPI_HEVC_SAO_BUF_STRIDE * (MAX_PB_SIZE + 2) + 64];
-+
-+            stride_dst = RPI_HEVC_SAO_BUF_STRIDE;
-+            dst = dstbuf + stride_dst + 32;
-+
-+            if (!top_edge) {
-+                uint8_t *dst1;
-+                int src_idx;
-+                const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb - 1) * w + x0) << sh);
-+
-+                dst1 = dst - stride_dst;
-+
-+                if (src_l != NULL) {
-+                    src_idx = (CTB(s->sao, x_ctb-1, y_ctb-1).type_idx[c_idx] ==
-+                               SAO_APPLIED);
-+                    copy_pixel(dst1 - (1 << sh), src_idx ? src_spb - (1 << sh) : src_l - stride_src, sh);
-+                }
-+
-+                src_idx = (CTB(s->sao, x_ctb, y_ctb-1).type_idx[c_idx] ==
-+                           SAO_APPLIED);
-+                memcpy(dst1, src_idx ? src_spb : src - stride_src, width << sh);
-+
-+                if (src_r != NULL) {
-+                    src_idx = (CTB(s->sao, x_ctb+1, y_ctb-1).type_idx[c_idx] ==
-+                               SAO_APPLIED);
-+                    copy_pixel(dst1 + (width << sh), src_idx ? src_spb + (width << sh) : src_r - stride_src, sh);
-+                }
-+            }
-+            if (!bottom_edge) {
-+                uint8_t * const dst1 = dst + height * stride_dst;
-+                int src_idx;
-+                const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 2) * w + x0) << sh);
-+                const unsigned int hoff = height * stride_src;
-+
-+                if (src_l != NULL) {
-+                    src_idx = (CTB(s->sao, x_ctb-1, y_ctb+1).type_idx[c_idx] ==
-+                               SAO_APPLIED);
-+                    copy_pixel(dst1 - (1 << sh), src_idx ? src_spb - (1 << sh) : src_l + hoff, sh);
-+                }
-+
-+                src_idx = (CTB(s->sao, x_ctb, y_ctb+1).type_idx[c_idx] ==
-+                           SAO_APPLIED);
-+                memcpy(dst1, src_idx ? src_spb : src + hoff, width << sh);
-+
-+                if (src_r != NULL) {
-+                    src_idx = (CTB(s->sao, x_ctb+1, y_ctb+1).type_idx[c_idx] ==
-+                               SAO_APPLIED);
-+                    copy_pixel(dst1 + (width << sh), src_idx ? src_spb + (width << sh) : src_r + hoff, sh);
-+                }
-+            }
-+            if (src_l != NULL) {
-+                if (CTB(s->sao, x_ctb-1, y_ctb).type_idx[c_idx] == SAO_APPLIED) {
-+                    ff_hevc_rpi_copy_vert(dst - (1 << sh),
-+                              s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb - 1) * h + y0) << sh),
-+                              sh, height, stride_dst, 1 << sh);
-+                } else {
-+                    ff_hevc_rpi_copy_vert(dst - (1 << sh),
-+                              src_l,
-+                              sh, height, stride_dst, stride_src);
-+                }
-+            }
-+            if (src_r != NULL) {
-+                if (CTB(s->sao, x_ctb+1, y_ctb).type_idx[c_idx] == SAO_APPLIED) {
-+                    ff_hevc_rpi_copy_vert(dst + (width << sh),
-+                              s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 2) * h + y0) << sh),
-+                              sh, height, stride_dst, 1 << sh);
-+                } else {
-+                    ff_hevc_rpi_copy_vert(dst + (width << sh),
-+                              src_r,
-+                              sh, height, stride_dst, stride_src);
-+                }
-+            }
-+
-+            s->hevcdsp.cpy_blk(dst, stride_dst, src, stride_src, width << sh, height);
-+
-+            copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx,
-+                           x_ctb, y_ctb);
-+            if (sliced && c_idx != 0)
-+            {
-+                // Class always the same for both U & V (which is just as well :-))
-+                s->hevcdsp.sao_edge_filter_c[tab](src, dst, stride_src,
-+                                                sao->offset_val[1], sao->offset_val[2], sao->eo_class[1],
-+                                                width, height);
-+                s->hevcdsp.sao_edge_restore_c[restore](src, dst,
-+                                                    stride_src, stride_dst,
-+                                                    sao,
-+                                                    edges, width,
-+                                                    height, c_idx,
-+                                                    vert_edge,
-+                                                    horiz_edge,
-+                                                    diag_edge);
-+            }
-+            else
-+            {
-+                s->hevcdsp.sao_edge_filter[tab](src, dst, stride_src, sao->offset_val[c_idx],
-+                                                sao->eo_class[c_idx], width, height);
-+                s->hevcdsp.sao_edge_restore[restore](src, dst,
-+                                                    stride_src, stride_dst,
-+                                                    sao,
-+                                                    edges, width,
-+                                                    height, c_idx,
-+                                                    vert_edge,
-+                                                    horiz_edge,
-+                                                    diag_edge);
-+            }
-+            restore_tqb_pixels(s, src, dst, stride_src, stride_dst,
-+                               x, y, width, height, c_idx);
-+            sao->type_idx[c_idx] = SAO_APPLIED;
-+            break;
-+        }
-+        }
-+    }
-+
-+#if RPI_ZC_SAND_8_IN_10_BUF
-+    if (s->frame->format == AV_PIX_FMT_SAND64_10 && s->frame->buf[RPI_ZC_SAND_8_IN_10_BUF] != NULL &&
-+        (((x + (1 << (s->ps.sps->log2_ctb_size))) & 255) == 0 || edges[2]))
-+    {
-+        const unsigned int stride1 = frame_stride1(s->frame, 1);
-+        const unsigned int stride2 = av_rpi_sand_frame_stride2(s->frame);
-+        const unsigned int xoff = (x >> 8) * stride2 * stride1;
-+        const unsigned int ctb_size = (1 << s->ps.sps->log2_ctb_size);
-+        const uint8_t * const sy = s->frame->data[0] + xoff * 4 + y * stride1;
-+        uint8_t * const dy = s->frame->buf[4]->data + xoff * 2 + y * stride1;
-+        const uint8_t * const sc = s->frame->data[1] + xoff * 4 + (y >> 1) * stride1;
-+        uint8_t * const dc = s->frame->buf[4]->data + (s->frame->data[1] - s->frame->data[0]) + xoff * 2 + (y >> 1) * stride1;
-+        const unsigned int wy = !edges[2] ? 256 : s->ps.sps->width - (x & ~255);
-+        const unsigned int hy = !edges[3] ? ctb_size : s->ps.sps->height - y;
-+
-+//        printf("dy=%p/%p, stride1=%d, stride2=%d, sy=%p/%p, wy=%d, hy=%d, x=%d, y=%d, cs=%d\n", dy, dc, stride1, stride2, sy, sc, wy, hy, x, y, ctb_size);
-+        av_rpi_sand16_to_sand8(dy, stride1, stride2, sy, stride1, stride2, wy, hy, 3);
-+        av_rpi_sand16_to_sand8(dc, stride1, stride2, sc, stride1, stride2, wy, hy >> 1, 3);
-+    }
-+#endif
-+}
-+
-+// When bits are delivered to deblock we want them
-+//#define TL 1
-+//#define TR 2
-+//#define BL 4
-+//#define BR 8
-+
-+// pcm4 returns them as b0 = tl, b1 = tr, b16 = bl, b17 = br
-+// so we need to rearrange before passing on
-+
-+static inline uint32_t pcm4(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y)
-+{
-+    const uint8_t * const pcm = s->is_pcm + (x >> 6) + (y >> 3) * s->ps.sps->pcm_width;
-+    return (pcm[0] |
-+        (pcm[1] << 8) |
-+        (pcm[s->ps.sps->pcm_width] << 16) |
-+        (pcm[s->ps.sps->pcm_width + 1] << 24)) >> ((x >> 3) & 7);
-+}
-+
-+static inline uint32_t pcm2(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y)
-+{
-+    const uint8_t * const pcm = s->is_pcm + (x >> 6) + (y >> 3) * s->ps.sps->pcm_width;
-+    return (pcm[0] | (pcm[1] << 8)) >> ((x >> 3) & 7);
-+}
-+
-+// We cast away const here as we want this to work for both get and set
-+static inline uint32_t * bs_ptr32(const uint8_t * bs, const unsigned int stride2, const unsigned int x, const unsigned int y)
-+{
-+    return (uint32_t *)(bs +
-+#if (~3U & (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)) != 0
-+#warning Unexpected masks
-+        // As it happens we end up with stride1 = sizeof(uint32_t) so this expr vanishes
-+        ((x >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT) &
-+            (~3 & (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT))) +
-+#elif HEVC_RPI_BS_STRIDE1_BYTES < 4
-+#error Stride1 < return size
-+#endif
-+        ((y >> HEVC_RPI_BS_Y_SHR) << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) +
-+        (x >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT) * stride2);
-+}
-+
-+static inline uint8_t * bs_ptr8(const uint8_t * bs, const unsigned int stride2, const unsigned int x, const unsigned int y)
-+{
-+    return (uint8_t *)(bs +
-+        ((x >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT) &
-+            (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)) +
-+        ((y >> HEVC_RPI_BS_Y_SHR) << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) +
-+        (x >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT) * stride2);
-+}
-+
-+
-+// Get block strength
-+// Given how we call we will always get within the 32bit boundries
-+static inline uint32_t bs_get32(const uint8_t * bs, unsigned int stride2,
-+                                unsigned int xl, unsigned int xr, const unsigned int y)
-+{
-+    if (xr <= xl) {
-+        return 0;
-+    }
-+    else
-+    {
-+#if HAVE_ARMV6T2_INLINE
-+#if (~3U & (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)) != 0
-+#error This case not yet handled in bs_get32
-+#elif HEVC_RPI_BS_STRIDE1_BYTES < 4
-+#error Stride1 < return size
-+#endif
-+        uint32_t tmp;
-+        __asm__ (
-+            "lsr         %[tmp], %[xl], %[xl_shift]                  \n\t"
-+            "rsb         %[xr], %[xl], %[xr]                         \n\t"
-+            "mla         %[stride2], %[stride2], %[tmp], %[bs]       \n\t"
-+            "add         %[xr], %[xr], #7                            \n\t"
-+            "lsr         %[bs], %[y], %[y_shift1]                    \n\t"
-+            "bic         %[xr], %[xr], #7                            \n\t"
-+            "ubfx        %[xl], %[xl], #1, #5                        \n\t"
-+            "lsr         %[xr], %[xr], #1                            \n\t"
-+            "cmp         %[xr], #32                                  \n\t"
-+            "mvn         %[tmp], #0                                  \n\t"
-+            "ldr         %[bs], [%[stride2], %[bs], lsl %[y_shift2]] \n\t"
-+            "lsl         %[tmp], %[tmp], %[xr]                       \n\t"
-+            "lsr         %[xl], %[bs], %[xl]                         \n\t"
-+            "it ne                                                   \n\t"
-+            "bicne       %[bs], %[xl], %[tmp]                        \n\t"
-+            :  // Outputs
-+                      [bs]"+r"(bs),
-+                 [stride2]"+r"(stride2),
-+                      [xl]"+r"(xl),
-+                      [xr]"+r"(xr),
-+                     [tmp]"=&r"(tmp)
-+            :  // Inputs
-+                       [y]"r"(y),
-+                [xl_shift]"M"(HEVC_RPI_BS_STRIDE1_PEL_SHIFT),
-+                [y_shift1]"M"(HEVC_RPI_BS_Y_SHR),
-+                [y_shift2]"M"(HEVC_RPI_BS_STRIDE1_BYTE_SHIFT)
-+            :  // Clobbers
-+                "cc"
-+        );
-+        return (uint32_t) bs;
-+#else
-+        const uint32_t a = *bs_ptr32(bs, stride2, xl, y);
-+        const unsigned int n = ((xr - xl + 7) & ~7) >> 1;
-+
-+        return n == 32 ? a :
-+            (a >> ((xl >> 1) & 31)) & ~(~0U << n);
-+#endif
-+    }
-+}
-+
-+static inline uint32_t hbs_get32(const HEVCRpiContext * const s, const unsigned int xl, const unsigned int xr, const unsigned int y)
-+{
-+    av_assert2(((xl ^ (xr - 1)) >> s->ps.sps->log2_ctb_size) == 0);
-+    return bs_get32(s->bs_horizontal, s->bs_stride2, xl, xr, y);
-+}
-+
-+static inline uint32_t vbs_get32(const HEVCRpiContext * const s, const unsigned int xl, const unsigned int xr, const unsigned int y)
-+{
-+    av_assert2(((xl ^ (xr - 1)) >> s->ps.sps->log2_ctb_size) == 0);
-+    return bs_get32(s->bs_vertical, s->bs_stride2, xl, xr, y);
-+}
-+
-+
-+static void deblock_y_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int end_x, const int end_y)
-+{
-+    const unsigned int log2_ctb_size = s->ps.sps->log2_ctb_size;
-+    const unsigned int log2_min_cb_size  = s->ps.sps->log2_min_cb_size;
-+    const unsigned int ctb_size = (1 << log2_ctb_size);
-+    const unsigned int cb_r = bounds.x + bounds.w - (end_x ? 0 :  1);
-+    const unsigned int ctb_n = (bounds.x + bounds.y * s->ps.sps->ctb_width) >> log2_ctb_size;
-+    const DBParams * cb_dbp = s->deblock + ctb_n;
-+    const unsigned int b_b = bounds.y + bounds.h - (end_y ? 0 : 8);
-+
-+    unsigned int cb_x;
-+
-+    // Do in CTB-shaped blocks
-+    for (cb_x = bounds.x; cb_x < cb_r; cb_x += ctb_size, ++cb_dbp)
-+    {
-+        const unsigned int bv_r = FFMIN(cb_x + ctb_size, cb_r);
-+        const unsigned int bv_l = FFMAX(cb_x, 8);
-+        const unsigned int bh_r = cb_x + ctb_size >= cb_r ? cb_r - 8 : cb_x + ctb_size - 9;
-+        const unsigned int bh_l = bv_l - 8;
-+        unsigned int y;
-+
-+        // Main body
-+        for (y = (bounds.y == 0 ? 0 : bounds.y - 8); y < b_b; y += 8)
-+        {
-+            uint32_t vbs = vbs_get32(s, bv_l, bv_r, y);
-+
-+            const DBParams * const dbp = y < bounds.y ? cb_dbp - s->ps.sps->ctb_width : cb_dbp;
-+            const int8_t * const qta = s->qp_y_tab + ((y - 1) >> log2_min_cb_size) * s->ps.sps->min_cb_width;
-+            const int8_t * const qtb = s->qp_y_tab + (y >> log2_min_cb_size) * s->ps.sps->min_cb_width;
-+
-+            if (vbs != 0)
-+            {
-+                const uint8_t * const tcv = tctable + dbp->tc_offset;
-+                const uint8_t * const betav = betatable + dbp->beta_offset;
-+                unsigned int pcmfa = pcm2(s, bv_l - 1, y);
-+                unsigned int x;
-+
-+                for (x = bv_l; vbs != 0; x += 8, vbs >>= 4, pcmfa >>= 1)
-+                {
-+                    if ((vbs & 0xf) != 0 && (pcmfa & 3) != 3)
-+                    {
-+                        const int qp = (qtb[(x - 1) >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1;
-+                        s->hevcdsp.hevc_v_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y),
-+                                                         frame_stride1(s->frame, LUMA),
-+                                                         betav[qp],
-+                                                         ((vbs & 3) == 0 ? 0 : tcv[qp + (int)(vbs & 2)]) |
-+                                                          (((vbs & 0xc) == 0 ? 0 : tcv[qp + (int)((vbs >> 2) & 2)]) << 16),
-+                                                         pcmfa & 3,
-+                                                         av_rpi_sand_frame_pos_y(s->frame, x - 4, y));
-+                    }
-+                }
-+            }
-+
-+            if (y != 0)
-+            {
-+                uint32_t hbs;
-+
-+                // H left - mostly separated out so we only need a uint32_t hbs
-+                if ((hbs = hbs_get32(s, bh_l, cb_x, y)) != 0)
-+                {
-+                    const unsigned int x = bh_l;
-+                    const unsigned int pcmfa = pcm4(s, bh_l, y - 1);
-+                    const int qp = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1;
-+                    const DBParams * const dbph = dbp - 1;
-+                    const uint8_t * const tc = tctable + dbph->tc_offset + qp;
-+
-+                    av_assert2(cb_x - bh_l == 8);
-+
-+                    s->hevcdsp.hevc_h_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y),
-+                                                         frame_stride1(s->frame, LUMA),
-+                                                         betatable[qp + dbph->beta_offset],
-+                                                         ((hbs & 3) == 0 ? 0 : tc[hbs & 2]) |
-+                                                            (((hbs & 0xc) == 0 ? 0 : tc[(hbs >> 2) & 2]) << 16),
-+                                                         (pcmfa & 1) | ((pcmfa & 0x10000) >> 15));
-+                }
-+
-+                // H
-+                if ((hbs = hbs_get32(s, cb_x, bh_r + 1, y)) != 0)  // Will give (x <= bh_r) in for loop
-+                {
-+                    unsigned int x;
-+                    unsigned int pcmfa = pcm4(s, cb_x, y - 1);
-+
-+                    for (x = cb_x; hbs != 0; x += 8, hbs >>= 4, pcmfa >>= 1)
-+                    {
-+                        if ((hbs & 0xf) != 0 && (~pcmfa & 0x10001) != 0)
-+                        {
-+                            const int qp = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1;
-+                            const uint8_t * const tc = tctable + dbp->tc_offset + qp;
-+                            s->hevcdsp.hevc_h_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y),
-+                                                                frame_stride1(s->frame, LUMA),
-+                                                                betatable[qp + dbp->beta_offset],
-+                                                                ((hbs & 3) == 0 ? 0 : tc[hbs & 2]) |
-+                                                                   (((hbs & 0xc) == 0 ? 0 : tc[(hbs >> 2) & 2]) << 16),
-+                                                                (pcmfa & 1) | ((pcmfa & 0x10000) >> 15));
-+                        }
-+                    }
-+                }
-+            }
-+
-+        }
-+    }
-+}
-+
-+static av_always_inline int q2h(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y)
-+{
-+    const unsigned int log2_min_cb_size  = s->ps.sps->log2_min_cb_size;
-+    const int8_t * const qt = s->qp_y_tab + (y >> log2_min_cb_size) * s->ps.sps->min_cb_width;
-+    return (qt[(x - 1) >> log2_min_cb_size] + qt[x >> log2_min_cb_size] + 1) >> 1;
-+}
-+
-+static void deblock_uv_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int end_x, const int end_y)
-+{
-+    const unsigned int log2_ctb_size = s->ps.sps->log2_ctb_size;
-+    const unsigned int log2_min_cb_size  = s->ps.sps->log2_min_cb_size;
-+    const unsigned int ctb_size = (1 << log2_ctb_size);
-+    const unsigned int cb_r = bounds.x + bounds.w - (end_x ? 0 :  8);
-+    const unsigned int ctb_n = (bounds.x + bounds.y * s->ps.sps->ctb_width) >> log2_ctb_size;
-+    const DBParams * dbp = s->deblock + ctb_n;
-+    const unsigned int b_b = bounds.y + bounds.h - (end_y ? 0 : 8);
-+    const uint8_t * const tcq_u = s->ps.pps->qp_dblk_x[1];
-+    const uint8_t * const tcq_v = s->ps.pps->qp_dblk_x[2];
-+
-+    unsigned int cb_x;
-+
-+    av_assert1((bounds.x & (ctb_size - 1)) == 0);
-+    av_assert1((bounds.y & (ctb_size - 1)) == 0);
-+    av_assert1(bounds.h <= ctb_size);
-+
-+    // Do in CTB-shaped blocks
-+    for (cb_x = bounds.x; cb_x < cb_r; cb_x += ctb_size, ++dbp) {
-+        const unsigned int bv_r = FFMIN(cb_x + ctb_size, cb_r);
-+        const unsigned int bv_l = FFMAX(cb_x, 16);
-+        unsigned int y;
-+
-+        // V above
-+        if (bounds.y != 0) {
-+            // Deblock V up 8
-+            // CTB above current
-+            // Top-half only (tc4 & ~0xffff == 0) is special cased in asm
-+            const unsigned int y = bounds.y - 8;
-+            uint32_t vbs = vbs_get32(s, bv_l, bv_r, y) & 0x02020202U;
-+
-+            if (vbs != 0)
-+            {
-+                unsigned int pcmfa = pcm2(s, bv_l - 1, y);
-+                const uint8_t * const tc = tctable + 2 + (dbp - s->ps.sps->ctb_width)->tc_offset;
-+                unsigned int x;
-+
-+                for (x = bv_l; vbs != 0; x += 16, vbs >>= 8, pcmfa >>= 2)
-+                {
-+                    if ((vbs & 2) != 0 && (~pcmfa & 3) != 0)
-+                    {
-+                        const int qp0 = q2h(s, x, y);
-+                        s->hevcdsp.hevc_v_loop_filter_uv2(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1),
-+                                                       frame_stride1(s->frame, 1),
-+                                                       tc[tcq_u[qp0]] | (tc[tcq_v[qp0]] << 8),
-+                                                       av_rpi_sand_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1),
-+                                                       pcmfa & 3);
-+                    }
-+                }
-+            }
-+        }
-+
-+        for (y = bounds.y; y < b_b; y += 16)
-+        {
-+            uint32_t vbs = (vbs_get32(s, bv_l, bv_r, y) & 0x02020202U) |
-+                (y + 16 > b_b ? 0 : (vbs_get32(s, bv_l, bv_r, y + 8) & 0x02020202U) << 4);
-+
-+            // V
-+            if (vbs != 0)
-+            {
-+                unsigned int x;
-+                unsigned int pcmfa =
-+                    (y + 16 > b_b ?
-+                        pcm2(s, bv_l - 1, y) | 0xffff0000 :
-+                        pcm4(s, bv_l - 1, y));
-+                const uint8_t * const tc = tctable + 2 + dbp->tc_offset;
-+
-+                for (x = bv_l; vbs != 0; x += 16, vbs >>= 8, pcmfa >>= 2)
-+                {
-+                    if ((vbs & 0xff) != 0 && (~pcmfa & 0x30003) != 0)
-+                    {
-+                        const int qp0 = q2h(s, x, y);
-+                        const int qp1 = q2h(s, x, y + 8);
-+                        s->hevcdsp.hevc_v_loop_filter_uv2(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1),
-+                            frame_stride1(s->frame, 1),
-+                            ((vbs & 2) == 0 ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) |
-+                                ((vbs & 0x20) == 0 ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)),
-+                            av_rpi_sand_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1),
-+                            (pcmfa & 3) | ((pcmfa >> 14) & 0xc));
-+                    }
-+                }
-+            }
-+
-+            // H
-+            if (y != 0)
-+            {
-+                uint32_t hbs;
-+                const unsigned int bh_l = bv_l - 16;
-+                const unsigned int bh_r = cb_x + ctb_size >= cb_r ? cb_r : cb_x + ctb_size - 16;
-+                const int8_t * const qta = s->qp_y_tab + ((y - 1) >> log2_min_cb_size) * s->ps.sps->min_cb_width;
-+                const int8_t * const qtb = s->qp_y_tab + (y >> log2_min_cb_size) * s->ps.sps->min_cb_width;
-+
-+                // H left - mostly separated out so we only need a uint32_t hbs
-+                // Stub is width 8 to the left of bounds, but width 16 internally
-+                if ((hbs = hbs_get32(s, bh_l, cb_x, y) & 0x22U) != 0)
-+                {
-+                    unsigned int pcmfa = pcm4(s, bh_l, y - 1);
-+
-+                    // Chop off bits we don't want...
-+                    if (bh_l < bounds.x) {
-+                        pcmfa |= 0x10001; // TL|BL pre rearrangement
-+                        hbs &= ~3;  // Make BS 0
-+                    }
-+
-+                    // Double check we still want this
-+                    if (hbs != 0 && (~pcmfa & 0x30003) != 0)
-+                    {
-+                        const unsigned int x = bh_l;
-+                        const int qp0 = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1;
-+                        const int qp1 = (qta[(x + 8) >> log2_min_cb_size] + qtb[(x + 8) >> log2_min_cb_size] + 1) >> 1;
-+                        const uint8_t * const tc = tctable + 2 + (dbp - 1)->tc_offset;
-+
-+                        s->hevcdsp.hevc_h_loop_filter_uv(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1),
-+                            frame_stride1(s->frame, 1),
-+                            ((hbs & 2) == 0 ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) |
-+                                ((hbs & 0x20) == 0 ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)),
-+                            (pcmfa & 3) | ((pcmfa >> 14) & 0xc));
-+                    }
-+                }
-+
-+                // H main
-+                if ((hbs = (hbs_get32(s, cb_x, bh_r, y) & 0x22222222U)) != 0)
-+                {
-+                    unsigned int x;
-+                    unsigned int pcmfa = pcm4(s, cb_x, y - 1);  // Might like to mask out far right writes but probably not worth it
-+
-+                    for (x = cb_x; hbs != 0; x += 16, hbs >>= 8, pcmfa >>= 2)
-+                    {
-+                        if ((hbs & 0xff) != 0 && (~pcmfa & 0x30003) != 0)
-+                        {
-+                            const int qp0 = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1;
-+                            const int qp1 = (qta[(x + 8) >> log2_min_cb_size] + qtb[(x + 8) >> log2_min_cb_size] + 1) >> 1;
-+                            const uint8_t * const tc = tctable + 2 + dbp->tc_offset;
-+
-+                            s->hevcdsp.hevc_h_loop_filter_uv(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1),
-+                                frame_stride1(s->frame, 1),
-+                                ((hbs & 2) == 0 ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) |
-+                                    ((hbs & 0x20) == 0 ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)),
-+                                (pcmfa & 3) | ((pcmfa >> 14) & 0xc));
-+                        }
-+                    }
-+                }
-+            }
-+        }
-+    }
-+}
-+
-+static inline unsigned int off_boundary(const unsigned int x, const unsigned int log2_n)
-+{
-+    return x & ~(~0U << log2_n);
-+}
-+
-+static inline void hbs_set(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y, const uint32_t mask, uint32_t bsf)
-+{
-+    av_assert2((y & 7) == 0);
-+
-+    // This doesn't have the same simultainious update issues that bsf_stash
-+    // does (other threads will have a different y) so we can do it the easy way
-+    if ((bsf &= mask) != 0)
-+        *bs_ptr32(s->bs_horizontal, s->bs_stride2, x, y) |= bsf << ((x >> 1) & 31);
-+}
-+
-+
-+static void vbs_set(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y, const uint32_t mask, uint32_t bsf)
-+{
-+    // We arrange this in a slightly odd fashion but it lines up with
-+    // how we are going to use it in the actual deblock code & it is easier
-+    // to do the contortions here than there
-+    //
-+    // Arrange (LE) {x0y0, x0y4, x8y0, x8,y4}, {x16y0, x16y4, x24y0, x24y4},...
-+
-+    av_assert2((x & 7) == 0);
-+
-+    if ((bsf &= mask) != 0)
-+    {
-+        uint8_t *p = bs_ptr8(s->bs_vertical, s->bs_stride2, x, y);
-+        const unsigned int sh = ((x & 8) | (y & 4)) >> 1;
-+
-+        if (mask <= 0xf)
-+        {
-+            *p |= (bsf << sh);
-+        }
-+        else
-+        {
-+            do {
-+                *p |= (bsf & 0xf) << sh;
-+                p += HEVC_RPI_BS_STRIDE1_BYTES;
-+            } while ((bsf >>= 4) != 0);
-+        }
-+    }
-+}
-+
-+static inline uint32_t bsf_mv(const HEVCRpiContext * const s,
-+                              const unsigned int rep, const unsigned int dup,
-+                              const unsigned int mvf_stride0,
-+                              const unsigned int mvf_stride1,
-+                              const RefPicList * const rpl_p, const RefPicList * const rpl_q,
-+                              const HEVCRpiMvField * const mvf_p, const HEVCRpiMvField * const mvf_q)
-+{
-+    return s->hevcdsp.hevc_deblocking_boundary_strengths(rep, dup,
-+            mvf_p, mvf_q,
-+            rpl_p[0].list, rpl_p[1].list, rpl_q[0].list, rpl_q[1].list,
-+            sizeof(HEVCRpiMvField) * mvf_stride0, sizeof(HEVCRpiMvField) * mvf_stride1);
-+}
-+
-+
-+void ff_hevc_rpi_deblocking_boundary_strengths(const HEVCRpiContext * const s,
-+                                               const HEVCRpiLocalContext * const lc,
-+                                               const unsigned int x0, const unsigned int y0,
-+                                               const unsigned int log2_trafo_size,
-+                                               const int is_coded_block)
-+{
-+    const HEVCRpiMvField * const mvf_curr      = mvf_stash_ptr(s, lc, x0, y0);
-+    const unsigned int log2_min_pu_size = LOG2_MIN_PU_SIZE;
-+    const RefPicList * const rpl        = s->refPicList;
-+    // Rep count for bsf_mv when running with min_pu chuncks
-+    const unsigned int log2_rep_min_pu  = log2_trafo_size <= log2_min_pu_size ? 0 : log2_trafo_size - log2_min_pu_size;
-+    const unsigned int boundary_flags   = s->sh.no_dblk_boundary_flags & lc->boundary_flags;
-+    const unsigned int trafo_size       = (1U << log2_trafo_size);
-+    const uint32_t bsf_mask             = log2_trafo_size > 5 ? ~0U : (1U << (trafo_size >> 1)) - 1;
-+    const uint32_t bsf_cbf              = (bsf_mask & 0x55555555);
-+
-+    // Do we cover a pred split line?
-+    const int has_x_split = x0 < lc->cu.x_split && x0 + trafo_size > lc->cu.x_split;
-+    const int has_y_split = y0 < lc->cu.y_split && y0 + trafo_size > lc->cu.y_split;
-+
-+    uint32_t bsf_h;
-+    uint32_t bsf_v;
-+
-+#ifdef DISABLE_STRENGTHS
-+    return;
-+#endif
-+
-+    // We are always on a size boundary
-+    av_assert2((x0 & (trafo_size - 1)) == 0);
-+    av_assert2((y0 & (trafo_size - 1)) == 0);
-+    // log2_trafo_size not really a transform size; we can have to deal
-+    // with size 2^6 blocks
-+    av_assert2(log2_trafo_size >= 2 && log2_trafo_size <= 6);
-+
-+    // Retrieve and update coded (b0), intra (b1) bs flags
-+    //
-+    // Store on min width (rather than uint32_t) to avoid possible issues
-+    // with another thread on another core running wpp using the same
-+    // memory (min CTB = 16 pels = 4 bsf els = 8 bits)
-+    //
-+    // In bsf BS=2 is represented by 3 as it is much easier to test & set
-+    // and the actual deblock code tests for 0 and b1 set/not-set so 2 and
-+    // 3 will work the same
-+    {
-+        // Given where we are called from is_cbf_luma & is_intra will be constant over the block
-+        const uint32_t bsf0 =  (lc->cu.pred_mode == MODE_INTRA) ? bsf_mask : is_coded_block ? bsf_cbf : 0;
-+        uint8_t *const p = s->bsf_stash_up + (x0 >> 4);
-+        uint8_t *const q = s->bsf_stash_left + (y0 >> 4);
-+
-+        switch (log2_trafo_size)
-+        {
-+            case 2:
-+            case 3:
-+            {
-+                const unsigned int sh_h = (x0 >> 1) & 7;
-+                const unsigned int sh_v = (y0 >> 1) & 7;
-+                bsf_h = *p;
-+                bsf_v = *q;
-+                *p = (bsf_h & ~(bsf_mask << sh_h)) | (bsf0 << sh_h);
-+                *q = (bsf_v & ~(bsf_mask << sh_v)) | (bsf0 << sh_v);
-+                bsf_h >>= sh_h;
-+                bsf_v >>= sh_v;
-+                break;
-+            }
-+            case 4:
-+                bsf_h = *p;
-+                bsf_v = *q;
-+                *p = bsf0;
-+                *q = bsf0;
-+                break;
-+            case 5:
-+                bsf_h = *(uint16_t *)p;
-+                bsf_v = *(uint16_t *)q;
-+                *(uint16_t *)p = bsf0;
-+                *(uint16_t *)q = bsf0;
-+                break;
-+            case 6:
-+            default:
-+                bsf_h = *(uint32_t *)p;
-+                bsf_v = *(uint32_t *)q;
-+                *(uint32_t *)p = bsf0;
-+                *(uint32_t *)q = bsf0;
-+                break;
-+        }
-+
-+        bsf_h |= bsf0;
-+        bsf_v |= bsf0;
-+    }
-+
-+    // Do Horizontal
-+    if ((y0 & 7) == 0)
-+    {
-+        // Boundary upper
-+        if (y0 != 0 &&
-+            (off_boundary(y0, s->ps.sps->log2_ctb_size) ||
-+             (boundary_flags & (BOUNDARY_UPPER_SLICE | BOUNDARY_UPPER_TILE)) == 0))
-+        {
-+            // Look at MVs (BS=1) if we don't already has a full set of bs bits
-+            if ((~bsf_h & bsf_cbf) != 0 && (y0 == lc->cu.y || y0 == lc->cu.y_split))
-+            {
-+                // If we aren't on the top boundary we must be in the middle
-+                // and in that case we know where mvf can change
-+                const unsigned int log2_rep = (y0 == lc->cu.y) ? log2_rep_min_pu : has_x_split ? 1 : 0;
-+                const RefPicList *const rpl_top = !off_boundary(y0, s->ps.sps->log2_ctb_size) ?
-+                      s->rpl_up[x0 >> s->ps.sps->log2_ctb_size] :
-+                      rpl;
-+
-+                bsf_h |= bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep),
-+                    trafo_size >> (log2_min_pu_size + log2_rep),
-+                    trafo_size >> (log2_min_pu_size + log2_rep),
-+                    rpl, rpl_top,
-+                    mvf_curr, mvf_ptr(s, lc, x0, y0, x0, y0 - 1));
-+            }
-+
-+            // Finally put the results into bs
-+            hbs_set(s, x0, y0, bsf_mask, bsf_h);
-+        }
-+
-+        // Max of 1 pu internal split - ignore if not on 8pel boundary
-+        if (has_y_split && !off_boundary(lc->cu.y_split, 3))
-+        {
-+            const HEVCRpiMvField * const mvf = mvf_stash_ptr(s, lc, x0, lc->cu.y_split);
-+            // If we have the x split as well then it must be in the middle
-+            const unsigned int log2_rep = has_x_split ? 1 : 0;
-+
-+            hbs_set(s, x0, lc->cu.y_split, bsf_mask,
-+                bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep),
-+                   trafo_size >> (log2_min_pu_size + log2_rep),
-+                   trafo_size >> (log2_min_pu_size + log2_rep),
-+                   rpl, rpl,
-+                   mvf, mvf - MVF_STASH_WIDTH_PU));
-+        }
-+    }
-+
-+    // And again for vertical - same logic as horizontal just in the other direction
-+    if ((x0 & 7) == 0)
-+    {
-+        // Boundary left
-+        if (x0 != 0 &&
-+            (off_boundary(x0, s->ps.sps->log2_ctb_size) ||
-+             (boundary_flags & (BOUNDARY_LEFT_SLICE | BOUNDARY_LEFT_TILE)) == 0))
-+        {
-+            if ((~bsf_v & bsf_cbf) != 0 && (x0 == lc->cu.x || x0 == lc->cu.x_split))
-+            {
-+                const unsigned int log2_rep = (x0 == lc->cu.x) ? log2_rep_min_pu : has_y_split ? 1 : 0;
-+                const RefPicList *const rpl_left = !off_boundary(x0, s->ps.sps->log2_ctb_size) ?
-+                    s->rpl_left[y0 >> s->ps.sps->log2_ctb_size] :
-+                    rpl;
-+
-+                bsf_v |= bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep),
-+                    (MVF_STASH_WIDTH_PU << log2_trafo_size) >> (log2_min_pu_size + log2_rep),
-+                    (mvf_left_stride(s, x0, x0 - 1) << log2_trafo_size) >> (log2_min_pu_size + log2_rep),
-+                    rpl, rpl_left,
-+                    mvf_curr, mvf_ptr(s, lc, x0, y0, x0 - 1, y0));
-+            }
-+
-+            vbs_set(s, x0, y0, bsf_mask, bsf_v);
-+        }
-+
-+        if (has_x_split && !off_boundary(lc->cu.x_split, 3))
-+        {
-+            const HEVCRpiMvField *const mvf = mvf_stash_ptr(s, lc, lc->cu.x_split, y0);
-+            const unsigned int log2_rep = has_y_split ? 1 : 0;
-+
-+            vbs_set(s, lc->cu.x_split, y0, bsf_mask,
-+                bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep),
-+                   (MVF_STASH_WIDTH_PU << log2_trafo_size) >> (log2_min_pu_size + log2_rep),
-+                   (MVF_STASH_WIDTH_PU << log2_trafo_size) >> (log2_min_pu_size + log2_rep),
-+                   rpl, rpl,
-+                   mvf, mvf - 1));
-+        }
-+    }
-+}
-+
-+#undef LUMA
-+#undef CB
-+#undef CR
-+
-+static inline unsigned int ussub(const unsigned int a, const unsigned int b)
-+{
-+    return a < b ? 0 : a - b;
-+}
-+
-+static inline int cache_boundry(const AVFrame * const frame, const unsigned int x)
-+{
-+    return ((x >> av_rpi_sand_frame_xshl(frame)) & ~63) == 0;
-+}
-+
-+int ff_hevc_rpi_hls_filter_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int eot)
-+{
-+    const int ctb_size = (1 << s->ps.sps->log2_ctb_size);
-+    int x, y;
-+
-+    const unsigned int br = bounds.x + bounds.w;
-+    const unsigned int bb = bounds.y + bounds.h;
-+
-+    const int x_end = (br >= s->ps.sps->width);
-+    const int y_end = (bb >= s->ps.sps->height);
-+
-+    // Deblock may not touch the edges of the bound as they are still needed
-+    // for Intra pred
-+    //
-+    // Deblock is disabled with a per-slice flag
-+    // Given that bounds may cover multiple slices & we dblock outside bounds
-+    // anyway we can't avoid deblock using that flag - about the only thing we
-+    // could do is have a "no deblock seen yet" flag but it doesn't really
-+    // seem worth the effort
-+
-+    deblock_y_blk(s, bounds, x_end, y_end);
-+    deblock_uv_blk(s, bounds, x_end, y_end);
-+
-+    // SAO needs
-+    // (a) CTB alignment
-+    // (b) Valid pixels all the way around the CTB in particular it needs the DR pixel
-+    {
-+        const unsigned int xo = bounds.x - ((bounds.x - 16) & ~(ctb_size - 1));
-+        const unsigned int yo = bounds.y - ((bounds.y - 16) & ~(ctb_size - 1));
-+        const unsigned int yt = ussub(bounds.y, yo);
-+        const unsigned int yb = y_end ? bb : ussub(bb, yo);
-+        const unsigned int xl = ussub(bounds.x, xo);
-+        const unsigned int xr = x_end ? br : ussub(br, xo);
-+
-+        if (s->ps.sps->sao_enabled)
-+        {
-+            for (y = yt; y < yb; y += ctb_size) {
-+                for (x = xl; x < xr; x += ctb_size) {
-+                    sao_filter_CTB(s, x, y);
-+                }
-+            }
-+        }
-+
-+        // Cache invalidate
-+        y = 0;
-+        if (xr != 0 && yb != 0)
-+        {
-+            const unsigned int llen =
-+                (av_rpi_sand_frame_stride1(s->frame) >> av_rpi_sand_frame_xshl(s->frame));
-+            const unsigned int mask = ~(llen - 1);
-+            const unsigned int il = (xl == 0) ? 0 : (xl - 1) & mask;
-+            const unsigned int ir = x_end || !cache_boundry(s->frame, br) ? br : (xr - 1) & mask;
-+            const unsigned int it = ussub(yt, 1);
-+            const unsigned int ib = y_end ? bb : yb - 1;
-+
-+            if (il < ir) {
-+                rpi_cache_buf_t cbuf;
-+                rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(&cbuf);
-+                rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE,
-+                  il, it, ir - il, ib - it,
-+                  ctx_vshift(s, 1), 1, 1);
-+
-+                // If we have to commit the right hand tile boundry due to
-+                // cache boundry considerations then at EoTile we must commit
-+                // that boundry to bottom of tile (bounds)
-+                if (ib != bb && ir == br && eot) {
-+                    rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE,
-+                      br - 1, ib, 1, bb - ib,
-+                      ctx_vshift(s, 1), 1, 1);
-+                }
-+
-+                rpi_cache_flush_finish(rfe);
-+
-+                if (x_end)
-+                    y = y_end ? INT_MAX : ib;
-+
-+//                printf("Flush: %4d,%4d -> %4d,%4d: signal: %d\n", il, it, ir, ib, y - 1);
-+            }
-+        }
-+    }
-+
-+    return y;
-+}
-+
---- /dev/null
-+++ b/libavcodec/rpi_hevc_mv.h
-@@ -0,0 +1,71 @@
-+#ifndef AVCODEC_RPI_HEVC_MV_H
-+#define AVCODEC_RPI_HEVC_MV_H
-+
-+#include "config.h"
-+
-+typedef int32_t MvXY;
-+
-+typedef struct HEVCRpiMvField {
-+    MvXY xy[2];
-+    int8_t ref_idx[2];
-+    int8_t pred_flag;
-+    int8_t dummy; // To 12 bytes
-+} HEVCRpiMvField;
-+
-+
-+#define MV_X(xy) (((xy) << 16) >> 16)
-+#define MV_Y(xy) ((xy) >> 16)
-+#define MV_XY(x, y) ((x & 0xffff) | ((y) << 16))
-+
-+#if ARCH_ARM
-+#include "arm/rpi_hevc_mv_arm.h"
-+#endif
-+
-+#ifndef mvxy_add
-+static inline MvXY mvxy_add(const MvXY a, const MvXY b)
-+{
-+    return MV_XY(MV_X(a) + MV_X(b), MV_Y(a) + MV_Y(b));
-+}
-+#endif
-+
-+
-+#ifndef mv_scale_xy
-+static inline MvXY mv_scale_xy(const MvXY const src, int td, int tb)
-+{
-+    int tx, scale_factor;
-+
-+    td = td == 0 ? 1 : av_clip_int8(td);
-+    tb = av_clip_int8(tb);
-+    tx = (0x4000 + (abs(td) >> 1)) / td;
-+    scale_factor = av_clip_intp2((tb * tx + 32) >> 6, 12);
-+    return MV_XY(
-+        av_clip_int16((scale_factor * MV_X(src) + 127 +
-+                           (scale_factor * MV_X(src) < 0)) >> 8),
-+        av_clip_int16((scale_factor * MV_Y(src) + 127 +
-+                           (scale_factor * MV_Y(src) < 0)) >> 8));
-+}
-+#endif
-+
-+// 8.3.1 states that the bitstream may not contain poc diffs that do not
-+// fit in 16 bits, so given that we don't care about the high bits we only
-+// store the low 16 + LT & Inter flags
-+
-+#define COL_POC_INTRA   0
-+#define COL_POC_INTER   (1 << 16)
-+#define COL_POC_LT      (1 << 17)
-+#define COL_POC_DIFF(x,y) ((int16_t)((x) - (y)))
-+#define COL_POC_MAKE_INTER(lt,poc) (COL_POC_INTER | ((lt) ? COL_POC_LT : 0) | ((poc) & 0xffff))
-+#define COL_POC_IS_LT(x) (((x) & COL_POC_LT) != 0)
-+
-+typedef struct ColMv_s {
-+    int32_t poc;
-+    int32_t xy;
-+} ColMv;
-+
-+typedef struct ColMvField_s {
-+    ColMv L[2];
-+} ColMvField;
-+
-+
-+
-+#endif // AVCODEC_RPI_HEVC_MV_H
---- /dev/null
-+++ b/libavcodec/rpi_hevc_mvs.c
-@@ -0,0 +1,487 @@
-+/*
-+ * HEVC video decoder
-+ *
-+ * Copyright (C) 2012 - 2013 Guillaume Martres
-+ * Copyright (C) 2013 Anand Meher Kotra
-+ * Copyright (C) 2018 John Cox for Raspberry Pi (Trading)
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#include "hevc.h"
-+#include "rpi_hevcdec.h"
-+
-+static av_always_inline int
-+is_eq_mer(const unsigned int plevel,
-+    const unsigned int xN, const unsigned int yN,
-+    const unsigned int xP, const unsigned int yP)
-+{
-+    return (((xN ^ xP) | (yN ^ yP)) >> plevel) == 0;
-+}
-+
-+// check if the mv's and refidx are the same between A and B
-+static av_always_inline int compare_mv_ref_idx(const HEVCRpiMvField * const a, const HEVCRpiMvField * const b)
-+{
-+    return a->pred_flag == b->pred_flag &&
-+        ((a->pred_flag & PF_L0) == 0 || (a->ref_idx[0] == b->ref_idx[0] && a->xy[0] == b->xy[0])) &&
-+        ((a->pred_flag & PF_L1) == 0 || (a->ref_idx[1] == b->ref_idx[1] && a->xy[1] == b->xy[1]));
-+    return 0;
-+}
-+
-+/*
-+ * 8.5.3.1.7  temporal luma motion vector prediction
-+ */
-+static int temporal_luma_motion_vector(const HEVCRpiContext * const s,
-+                                       const HEVCRpiLocalContext * const lc, const int x0, const int y0,
-+                                       const int nPbW, const int nPbH, const int refIdxLx,
-+                                       MvXY * const mvLXCol, const int X)
-+{
-+    int x, y;
-+    const ColMv * cmv = NULL;
-+
-+    HEVCRpiFrame * const col_ref = s->ref->collocated_ref;
-+    const RefPicList * const refPicList = s->refPicList + X;
-+    const int cur_lt = refPicList->isLongTerm[refIdxLx];
-+
-+    *mvLXCol = 0;
-+    // Unlikely but we might have a col_ref IDR frame!
-+    if (col_ref->col_mvf == NULL)
-+        return 0;
-+
-+    ff_hevc_rpi_progress_wait_mv(s, lc->jb0, col_ref, y0 + nPbH);
-+
-+    //bottom right collocated motion vector
-+    x = x0 + nPbW;
-+    y = y0 + nPbH;
-+
-+    if ((y0 >> s->ps.sps->log2_ctb_size) == (y >> s->ps.sps->log2_ctb_size) &&
-+        y < s->ps.sps->height &&
-+        x < s->ps.sps->width)
-+    {
-+        const ColMvField * const col = col_ref->col_mvf + (x >> 4) +
-+            (y >> 4) * s->col_mvf_stride;
-+
-+        if (col->L[0].poc != COL_POC_INTRA &&
-+            (col->L[1].poc == COL_POC_INTRA ||
-+             (s->no_backward_pred_flag ? s->sh.collocated_list == L1 : X == 0)))
-+        {
-+            cmv = col->L + 0;
-+        }
-+        else if (col->L[1].poc != COL_POC_INTRA)
-+        {
-+            cmv = col->L + 1;
-+        }
-+    }
-+
-+    // derive center collocated motion vector
-+    if (cmv == NULL || COL_POC_IS_LT(cmv->poc) != cur_lt)
-+    {
-+        cmv = NULL;
-+        x                  = x0 + (nPbW >> 1);
-+        y                  = y0 + (nPbH >> 1);
-+
-+        {
-+            const ColMvField * const col = col_ref->col_mvf + (x >> 4) +
-+              (y >> 4) * s->col_mvf_stride;
-+
-+            if (col->L[0].poc != COL_POC_INTRA &&
-+              (col->L[1].poc == COL_POC_INTRA ||
-+               (s->no_backward_pred_flag ? s->sh.collocated_list == L1 : X == 0)))
-+            {
-+              cmv = col->L + 0;
-+            }
-+            else if (col->L[1].poc != COL_POC_INTRA)
-+            {
-+              cmv = col->L + 1;
-+            }
-+        }
-+    }
-+
-+    if (cmv == NULL || cur_lt != COL_POC_IS_LT(cmv->poc))
-+        return 0;
-+
-+    {
-+        const int col_poc  = col_ref->poc;
-+        const int ref_poc  = refPicList->list[refIdxLx];
-+
-+        *mvLXCol = (cur_lt ||
-+                        cmv->poc == col_poc ||
-+                        COL_POC_DIFF(col_poc, cmv->poc) == s->poc - ref_poc) ?
-+                    cmv->xy :
-+                    mv_scale_xy(cmv->xy, COL_POC_DIFF(col_poc, cmv->poc), s->poc - ref_poc);
-+    }
-+
-+    return cmv != NULL;
-+}
-+
-+static inline int mvf_eq(const HEVCRpiMvField * const a, const HEVCRpiMvField * const b)
-+{
-+    return b != NULL && compare_mv_ref_idx(a, b);
-+}
-+
-+
-+
-+/*
-+ * 8.5.3.1.2  Derivation process for spatial merging candidates
-+ */
-+static inline const HEVCRpiMvField *
-+derive_spatial_merge_candidates(
-+    const HEVCRpiContext * const s,
-+    const HEVCRpiLocalContext * const lc,
-+    const unsigned int x0, const unsigned int y0,
-+    const unsigned int nPbW, const unsigned int nPbH,
-+    const unsigned int avail,
-+    const unsigned int part_idx,
-+    const unsigned int merge_idx,
-+    HEVCRpiMvField * const mvf_t)
-+{
-+    const unsigned int parts_a1 = (1 << PART_Nx2N) | (1 << PART_nLx2N) | (1 << PART_nRx2N);
-+    const unsigned int parts_b1 = (1 << PART_2NxN) | (1<< PART_2NxnU) | (1 << PART_2NxnD);
-+
-+    const HEVCRpiMvField * mvf_a1 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 + nPbH - 1);
-+    const HEVCRpiMvField * mvf_a0 = mvf_a1 + mvf_left_stride(s, x0, x0 - 1);
-+    const HEVCRpiMvField * mvf_b1 = mvf_ptr(s, lc, x0, y0, x0 + nPbW - 1, y0 - 1);
-+    const HEVCRpiMvField * mvf_b0 = mvf_b1 + 1;
-+    const unsigned int plevel = s->ps.pps->log2_parallel_merge_level;
-+    const unsigned int part_mode = lc->cu.part_mode;
-+
-+    const HEVCRpiMvField * perm[4];
-+    unsigned int nb_merge_cand = 0;
-+
-+    // singleMCLFlag => part_idx == 0 so no need to test for it
-+    if ((avail & AVAIL_L) == 0 ||
-+        (part_idx == 1 &&
-+            ((parts_a1 >> part_mode) & 1) != 0 ||
-+                is_eq_mer(plevel, x0 - 1, y0 + nPbH - 1, x0, y0)) ||
-+        mvf_a1->pred_flag == PF_INTRA)
-+    {
-+        mvf_a1 = NULL;
-+    }
-+    else
-+    {
-+        if (merge_idx == nb_merge_cand)
-+            return mvf_a1;
-+        perm[nb_merge_cand++] = mvf_a1;
-+    }
-+
-+    if ((avail & AVAIL_U) == 0 ||
-+            (part_idx == 1 &&
-+               ((parts_b1 >> part_mode) & 1) != 0 ||
-+                   is_eq_mer(plevel, x0 + nPbW - 1, y0 - 1, x0, y0)) ||
-+            mvf_b1->pred_flag == PF_INTRA)
-+    {
-+        mvf_b1 = NULL;
-+    }
-+    else if (!mvf_eq(mvf_b1, mvf_a1))
-+    {
-+        if (merge_idx == nb_merge_cand)
-+            return mvf_b1;
-+        perm[nb_merge_cand++] = mvf_b1;
-+    }
-+
-+    // above right spatial merge candidate
-+    // Never need mvf_b0 again so don't bother zeroing if navail
-+    if ((avail & AVAIL_UR) != 0 &&
-+        !is_eq_mer(plevel, x0 + nPbW, y0 - 1, x0, y0) &&
-+        mvf_b0->pred_flag != PF_INTRA &&
-+        !mvf_eq(mvf_b0, mvf_b1))
-+    {
-+        if (merge_idx == nb_merge_cand)
-+            return mvf_b0;
-+        perm[nb_merge_cand++] = mvf_b0;
-+    }
-+
-+    // left bottom spatial merge candidate
-+    // Never need mvf_a0 again so don't bother zeroing if navail
-+    if ((avail & AVAIL_DL) != 0 &&
-+        !is_eq_mer(plevel, x0 - 1, y0 + nPbH, x0, y0) &&
-+        mvf_a0->pred_flag != PF_INTRA &&
-+        !mvf_eq(mvf_a0, mvf_a1))
-+    {
-+        if (merge_idx == nb_merge_cand)
-+            return mvf_a0;
-+        perm[nb_merge_cand++] = mvf_a0;
-+    }
-+
-+    // above left spatial merge candidate
-+    if (nb_merge_cand != 4 &&
-+        (avail & AVAIL_UL) != 0 &&
-+        !is_eq_mer(plevel, x0 - 1, y0 - 1, x0, y0))
-+    {
-+        const HEVCRpiMvField * mvf_b2 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 - 1);  // UL
-+
-+        if (mvf_b2->pred_flag != PF_INTRA &&
-+            !mvf_eq(mvf_b2, mvf_a1) &&
-+            !mvf_eq(mvf_b2, mvf_b1))
-+        {
-+            if (merge_idx == nb_merge_cand)
-+                return mvf_b2;
-+            perm[nb_merge_cand++] = mvf_b2;
-+        }
-+    }
-+
-+    // temporal motion vector candidate
-+    if (s->sh.slice_temporal_mvp_enabled_flag)
-+    {
-+        static const HEVCRpiMvField mvf_z = {{0}};
-+
-+        *mvf_t = mvf_z;
-+
-+        if (temporal_luma_motion_vector(s, lc, x0, y0, nPbW, nPbH,
-+                                        0, mvf_t->xy + 0, 0))
-+            mvf_t->pred_flag = PF_L0;
-+
-+        if (s->sh.slice_type == HEVC_SLICE_B &&
-+                temporal_luma_motion_vector(s, lc, x0, y0, nPbW, nPbH,
-+                                            0, mvf_t->xy + 1, 1))
-+            mvf_t->pred_flag |= PF_L1;
-+
-+        if (mvf_t->pred_flag != 0)
-+        {
-+            if (merge_idx == nb_merge_cand)
-+                return mvf_t;
-+            perm[nb_merge_cand++] = mvf_t;
-+        }
-+    }
-+
-+    // combined bi-predictive merge candidates  (applies for B slices)
-+    if (s->sh.slice_type == HEVC_SLICE_B && nb_merge_cand > 1)
-+    {
-+        unsigned int comb_idx = 0;
-+        const unsigned int cand_count = nb_merge_cand * (nb_merge_cand - 1);
-+        const RefPicList * const refPicList = s->refPicList;
-+
-+        for (comb_idx = 0; comb_idx < cand_count; comb_idx++)
-+        {
-+            static const uint8_t l0_l1_cand_idx[12][2] = {
-+                { 0, 1, },
-+                { 1, 0, },
-+                { 0, 2, },
-+                { 2, 0, },
-+                { 1, 2, },
-+                { 2, 1, },
-+                { 0, 3, },
-+                { 3, 0, },
-+                { 1, 3, },
-+                { 3, 1, },
-+                { 2, 3, },
-+                { 3, 2, },
-+            };
-+
-+            const unsigned int l0_cand_idx = l0_l1_cand_idx[comb_idx][0];
-+            const unsigned int l1_cand_idx = l0_l1_cand_idx[comb_idx][1];
-+            const HEVCRpiMvField * const mvf_c0 = perm[l0_cand_idx];
-+            const HEVCRpiMvField * const mvf_c1 = perm[l1_cand_idx];
-+
-+            if ((mvf_c0->pred_flag & PF_L0) != 0 &&
-+                (mvf_c1->pred_flag & PF_L1) != 0 &&
-+                (refPicList[0].list[mvf_c0->ref_idx[0]] != refPicList[1].list[mvf_c1->ref_idx[1]] ||
-+                 mvf_c0->xy[0] != mvf_c1->xy[1]))
-+            {
-+                if (merge_idx == nb_merge_cand++)
-+                {
-+                    // Need to be a bit careful as we will construct mvf_t and we
-+                    // may already be using that as one of our condidates
-+                    // so build & copy rather than build in place
-+                    const HEVCRpiMvField mvf_m = {
-+                        .xy = {
-+                            mvf_c0->xy[0],
-+                            mvf_c1->xy[1]},
-+                        .ref_idx = {
-+                            mvf_c0->ref_idx[0],
-+                            mvf_c1->ref_idx[1]},
-+                        .pred_flag = PF_BI
-+                    };
-+                    *mvf_t = mvf_m;
-+                    return mvf_t;
-+                }
-+            }
-+        }
-+    }
-+
-+    // "append" Zero motion vector candidates
-+    {
-+        const unsigned int nb_refs = (s->sh.slice_type == HEVC_SLICE_B) ?
-+                            FFMIN(s->sh.nb_refs[0], s->sh.nb_refs[1]) : s->sh.nb_refs[0];
-+        const unsigned int zero_idx = merge_idx - nb_merge_cand;
-+
-+        const HEVCRpiMvField mvf_m = {
-+            .xy = {0, 0},
-+            .ref_idx = {
-+                zero_idx < nb_refs ? zero_idx : 0,
-+                (s->sh.slice_type == HEVC_SLICE_B && zero_idx < nb_refs) ? zero_idx : 0},
-+            .pred_flag = (s->sh.slice_type == HEVC_SLICE_B) ? PF_BI : PF_L0
-+        };
-+
-+        *mvf_t = mvf_m;
-+        return mvf_t;
-+    }
-+}
-+
-+
-+// 8.5.3.1.1 Derivation process of luma Mvs for merge mode
-+void ff_hevc_rpi_luma_mv_merge_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, int nPbW,
-+                                int nPbH, int log2_cb_size, int part_idx,
-+                                int merge_idx, HEVCRpiMvField * const mv)
-+{
-+    const HEVCRpiMvField * mvf_m = (s->ps.pps->log2_parallel_merge_level > 2 && log2_cb_size == 3) ?
-+        derive_spatial_merge_candidates(s, lc, lc->cu.x, lc->cu.y, 8, 8,
-+                                        ff_hevc_rpi_tb_avail_flags(s, lc, lc->cu.x, lc->cu.y, 8, 8),
-+                                        0, merge_idx, mv) :
-+        derive_spatial_merge_candidates(s, lc, x0, y0, nPbW, nPbH,
-+                                        ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, nPbW, nPbH),
-+                                        part_idx, merge_idx, mv);
-+
-+    if (mvf_m != mv)
-+        *mv = *mvf_m;
-+
-+    if (mv->pred_flag == PF_BI && (nPbW + nPbH) == 12)
-+        mv->pred_flag = PF_L0;
-+}
-+
-+
-+static av_always_inline const MvXY *
-+mvf_same_poc(const RefPicList * const rpl, const unsigned int pfi0, const unsigned int pfi1, const int poc0, const HEVCRpiMvField * const mvf)
-+{
-+    if (mvf != NULL)
-+    {
-+        if (((mvf->pred_flag >> pfi0) & 1) != 0 && rpl[pfi0].list[mvf->ref_idx[pfi0]] == poc0)
-+            return mvf->xy + pfi0;
-+        if (((mvf->pred_flag >> pfi1) & 1) != 0 && rpl[pfi1].list[mvf->ref_idx[pfi1]] == poc0)
-+            return mvf->xy + pfi1;
-+    }
-+    return NULL;
-+}
-+
-+static av_always_inline const MvXY *
-+mvf_other_poc(const RefPicList * const rpl, const unsigned int pfi0, const unsigned int pfi1,
-+              const int islt0, const int poc0, const int poc_cur,
-+              MvXY * const mv_t, const HEVCRpiMvField * const mvf)
-+{
-+    if (mvf != NULL)
-+    {
-+        if (((mvf->pred_flag >> pfi0) & 1) != 0 && rpl[pfi0].isLongTerm[mvf->ref_idx[pfi0]] == islt0)
-+        {
-+            const int poc1 = rpl[pfi0].list[mvf->ref_idx[pfi0]];
-+            if (islt0 || poc1 == poc0) {
-+                return mvf->xy + pfi0;
-+            }
-+            *mv_t = mv_scale_xy(mvf->xy[pfi0], poc_cur - poc1, poc_cur - poc0);
-+            return mv_t;
-+        }
-+        if (((mvf->pred_flag >> pfi1) & 1) != 0 && rpl[pfi1].isLongTerm[mvf->ref_idx[pfi1]] == islt0)
-+        {
-+            const int poc1 = rpl[pfi1].list[mvf->ref_idx[pfi1]];
-+            if (islt0 || poc1 == poc0) {
-+                return mvf->xy + pfi1;
-+            }
-+            *mv_t = mv_scale_xy(mvf->xy[pfi1], poc_cur - poc1, poc_cur - poc0);
-+            return mv_t;
-+        }
-+    }
-+    return NULL;
-+}
-+
-+void ff_hevc_rpi_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
-+    const unsigned int x0, const unsigned int y0,
-+    const unsigned int nPbW, const unsigned int nPbH,
-+    const unsigned int avail,
-+    HEVCRpiMvField * const mv,
-+    const unsigned int mvp_lx_flag, const unsigned int LX)
-+{
-+    const unsigned int pfi0 = LX;
-+    const unsigned int pfi1 = LX == 0 ? 1 : 0;
-+    const RefPicList * const rpl = s->refPicList;
-+    const int poc0 = rpl[LX].list[mv->ref_idx[LX]];
-+    const int poc_cur = s->poc;
-+    const int islt0 = rpl[LX].isLongTerm[mv->ref_idx[LX]];
-+
-+    const HEVCRpiMvField * mvf_a1 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 + nPbH - 1);
-+    const HEVCRpiMvField * mvf_a0 = mvf_a1 + mvf_left_stride(s, x0, x0 - 1);
-+    const HEVCRpiMvField * mvf_b2 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 - 1);  // UL
-+    const HEVCRpiMvField * mvf_b1 = mvf_ptr(s, lc, x0, y0, x0 + nPbW - 1, y0 - 1);
-+    const HEVCRpiMvField * mvf_b0 = mvf_b1 + 1;
-+    const MvXY * mva = NULL;
-+    const MvXY * mvb;
-+    MvXY * const mv_rv = mv->xy + LX;
-+    MvXY mvt_a, mvt_b;
-+
-+    *mv_rv = 0;
-+
-+    if ((avail & AVAIL_DL) == 0 || mvf_a0->pred_flag == PF_INTRA)
-+        mvf_a0 = NULL;
-+    else if ((mva = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_a0)) != NULL && mvp_lx_flag == 0)
-+        goto use_mva;
-+
-+    if ((avail & AVAIL_L) == 0 || mvf_a1->pred_flag == PF_INTRA)
-+        mvf_a1 = NULL;
-+
-+    if (mva == NULL &&
-+        (mva = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_a1)) == NULL &&
-+        (mva = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_a, mvf_a0)) == NULL)
-+        mva = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_a, mvf_a1);
-+
-+    if (mvp_lx_flag == 0 && mva != NULL)
-+        goto use_mva;
-+
-+    if ((avail & AVAIL_UR) == 0 || mvf_b0->pred_flag == PF_INTRA)
-+        mvf_b0 = NULL;
-+    if ((avail & AVAIL_U) == 0 || mvf_b1->pred_flag == PF_INTRA)
-+        mvf_b1 = NULL;
-+    if ((avail & AVAIL_UL) == 0 || mvf_b2->pred_flag == PF_INTRA)
-+        mvf_b2 = NULL;
-+
-+    if ((mvb = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_b0)) == NULL &&
-+        (mvb = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_b1)) == NULL)
-+        mvb = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_b2);
-+
-+    if (mvf_a0 == NULL && mvf_a1 == NULL) {
-+        mva = mvb;
-+        if (mvp_lx_flag == 0 && mva != NULL)
-+            goto use_mva;
-+
-+        if ((mvb = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_b, mvf_b0)) == NULL &&
-+            (mvb = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_b, mvf_b1)) == NULL)
-+            mvb = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_b, mvf_b2);
-+    }
-+
-+    if (mva == NULL) {
-+        mva = mvb;
-+        mvb = NULL;
-+    }
-+
-+    if (mvb != NULL && *mva == *mvb)  // If A == B then ignore B
-+        mvb = NULL;
-+
-+    if (mvp_lx_flag == 0 && mva != NULL) {
-+        goto use_mva;
-+    }
-+    else if (mvp_lx_flag != 0 && mvb != NULL) {
-+        *mv_rv = *mvb;
-+    }
-+    else if (s->sh.slice_temporal_mvp_enabled_flag && ((mvp_lx_flag == 0 && mva == NULL) || (mvp_lx_flag != 0 && mva != NULL))) {
-+        temporal_luma_motion_vector(s, lc, x0, y0, nPbW,
-+                                    nPbH, mv->ref_idx[LX],
-+                                    mv_rv, LX);
-+    }
-+    return;
-+
-+use_mva:
-+    *mv_rv = *mva;
-+    return;
-+}
-+
---- /dev/null
-+++ b/libavcodec/rpi_hevc_parse.c
-@@ -0,0 +1,143 @@
-+/*
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#include "bytestream.h"
-+#include "h2645_parse.h"
-+#include "hevc.h"
-+#include "rpi_hevc_parse.h"
-+
-+static int hevc_decode_nal_units(const uint8_t *buf, int buf_size, HEVCRpiParamSets *ps,
-+                                 HEVCSEIContext *sei, int is_nalff, int nal_length_size,
-+                                 int err_recognition, int apply_defdispwin, void *logctx)
-+{
-+    int i;
-+    int ret = 0;
-+    H2645Packet pkt = { 0 };
-+
-+    ret = ff_h2645_packet_split(&pkt, buf, buf_size, logctx, is_nalff,
-+                                nal_length_size, AV_CODEC_ID_HEVC, 1, 0);
-+    if (ret < 0) {
-+        goto done;
-+    }
-+
-+    for (i = 0; i < pkt.nb_nals; i++) {
-+        H2645NAL *nal = &pkt.nals[i];
-+
-+        /* ignore everything except parameter sets and VCL NALUs */
-+        switch (nal->type) {
-+        case HEVC_NAL_VPS:
-+            ret = ff_hevc_rpi_decode_nal_vps(&nal->gb, logctx, ps);
-+            if (ret < 0)
-+                goto done;
-+            break;
-+        case HEVC_NAL_SPS:
-+            ret = ff_hevc_rpi_decode_nal_sps(&nal->gb, logctx, ps, apply_defdispwin);
-+            if (ret < 0)
-+                goto done;
-+            break;
-+        case HEVC_NAL_PPS:
-+            ret = ff_hevc_rpi_decode_nal_pps(&nal->gb, logctx, ps);
-+            if (ret < 0)
-+                goto done;
-+            break;
-+        case HEVC_NAL_SEI_PREFIX:
-+        case HEVC_NAL_SEI_SUFFIX:
-+            ret = ff_hevc_rpi_decode_nal_sei(&nal->gb, logctx, sei, ps, nal->type);
-+            if (ret < 0)
-+                goto done;
-+            break;
-+        default:
-+            av_log(logctx, AV_LOG_VERBOSE, "Ignoring NAL type %d in extradata\n", nal->type);
-+            break;
-+        }
-+    }
-+
-+done:
-+    ff_h2645_packet_uninit(&pkt);
-+    if (err_recognition & AV_EF_EXPLODE)
-+        return ret;
-+
-+    return 0;
-+}
-+
-+int ff_hevc_rpi_decode_extradata(const uint8_t *data, int size, HEVCRpiParamSets *ps,
-+                             HEVCSEIContext *sei, int *is_nalff, int *nal_length_size,
-+                             int err_recognition, int apply_defdispwin, void *logctx)
-+{
-+    int ret = 0;
-+    GetByteContext gb;
-+
-+    bytestream2_init(&gb, data, size);
-+
-+    if (size > 3 && (data[0] || data[1] || data[2] > 1)) {
-+        /* It seems the extradata is encoded as hvcC format.
-+         * Temporarily, we support configurationVersion==0 until 14496-15 3rd
-+         * is finalized. When finalized, configurationVersion will be 1 and we
-+         * can recognize hvcC by checking if avctx->extradata[0]==1 or not. */
-+        int i, j, num_arrays, nal_len_size;
-+
-+        *is_nalff = 1;
-+
-+        bytestream2_skip(&gb, 21);
-+        nal_len_size = (bytestream2_get_byte(&gb) & 3) + 1;
-+        num_arrays   = bytestream2_get_byte(&gb);
-+
-+        /* nal units in the hvcC always have length coded with 2 bytes,
-+         * so put a fake nal_length_size = 2 while parsing them */
-+        *nal_length_size = 2;
-+
-+        /* Decode nal units from hvcC. */
-+        for (i = 0; i < num_arrays; i++) {
-+            int type = bytestream2_get_byte(&gb) & 0x3f;
-+            int cnt  = bytestream2_get_be16(&gb);
-+
-+            for (j = 0; j < cnt; j++) {
-+                // +2 for the nal size field
-+                int nalsize = bytestream2_peek_be16(&gb) + 2;
-+                if (bytestream2_get_bytes_left(&gb) < nalsize) {
-+                    av_log(logctx, AV_LOG_ERROR,
-+                           "Invalid NAL unit size in extradata.\n");
-+                    return AVERROR_INVALIDDATA;
-+                }
-+
-+                ret = hevc_decode_nal_units(gb.buffer, nalsize, ps, sei, *is_nalff,
-+                                            *nal_length_size, err_recognition, apply_defdispwin,
-+                                            logctx);
-+                if (ret < 0) {
-+                    av_log(logctx, AV_LOG_ERROR,
-+                           "Decoding nal unit %d %d from hvcC failed\n",
-+                           type, i);
-+                    return ret;
-+                }
-+                bytestream2_skip(&gb, nalsize);
-+            }
-+        }
-+
-+        /* Now store right nal length size, that will be used to parse
-+         * all other nals */
-+        *nal_length_size = nal_len_size;
-+    } else {
-+        *is_nalff = 0;
-+        ret = hevc_decode_nal_units(data, size, ps, sei, *is_nalff, *nal_length_size,
-+                                    err_recognition, apply_defdispwin, logctx);
-+        if (ret < 0)
-+            return ret;
-+    }
-+
-+    return ret;
-+}
---- /dev/null
-+++ b/libavcodec/rpi_hevc_parse.h
-@@ -0,0 +1,36 @@
-+/*
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+/**
-+ * @file
-+ * H.265 parser code
-+ */
-+
-+#ifndef AVCODEC_RPI_HEVC_PARSE_H
-+#define AVCODEC_RPI_HEVC_PARSE_H
-+
-+#include <stdint.h>
-+
-+#include "rpi_hevc_ps.h"
-+#include "rpi_hevc_sei.h"
-+
-+int ff_hevc_rpi_decode_extradata(const uint8_t *data, int size, HEVCRpiParamSets *ps,
-+                             HEVCSEIContext *sei, int *is_nalff, int *nal_length_size,
-+                             int err_recognition, int apply_defdispwin, void *logctx);
-+
-+#endif /* AVCODEC_RPI_HEVC_PARSE_H */
---- /dev/null
-+++ b/libavcodec/rpi_hevc_ps.c
-@@ -0,0 +1,1938 @@
-+/*
-+ * HEVC Parameter Set decoding
-+ *
-+ * Copyright (C) 2012 - 2103 Guillaume Martres
-+ * Copyright (C) 2012 - 2103 Mickael Raulet
-+ * Copyright (C) 2012 - 2013 Gildas Cocherel
-+ * Copyright (C) 2013 Vittorio Giovara
-+ * Copyright (C) 2018 John Cox for Raspberry Pi (Trading)
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#include "libavutil/imgutils.h"
-+#include "golomb.h"
-+#include "rpi_hevc_data.h"
-+#include "rpi_hevc_ps.h"
-+#include "rpi_hevcdec.h"
-+
-+static const uint8_t default_scaling_list_intra[] = {
-+    16, 16, 16, 16, 17, 18, 21, 24,
-+    16, 16, 16, 16, 17, 19, 22, 25,
-+    16, 16, 17, 18, 20, 22, 25, 29,
-+    16, 16, 18, 21, 24, 27, 31, 36,
-+    17, 17, 20, 24, 30, 35, 41, 47,
-+    18, 19, 22, 27, 35, 44, 54, 65,
-+    21, 22, 25, 31, 41, 54, 70, 88,
-+    24, 25, 29, 36, 47, 65, 88, 115
-+};
-+
-+static const uint8_t default_scaling_list_inter[] = {
-+    16, 16, 16, 16, 17, 18, 20, 24,
-+    16, 16, 16, 17, 18, 20, 24, 25,
-+    16, 16, 17, 18, 20, 24, 25, 28,
-+    16, 17, 18, 20, 24, 25, 28, 33,
-+    17, 18, 20, 24, 25, 28, 33, 41,
-+    18, 20, 24, 25, 28, 33, 41, 54,
-+    20, 24, 25, 28, 33, 41, 54, 71,
-+    24, 25, 28, 33, 41, 54, 71, 91
-+};
-+
-+static const AVRational vui_sar[] = {
-+    {  0,   1 },
-+    {  1,   1 },
-+    { 12,  11 },
-+    { 10,  11 },
-+    { 16,  11 },
-+    { 40,  33 },
-+    { 24,  11 },
-+    { 20,  11 },
-+    { 32,  11 },
-+    { 80,  33 },
-+    { 18,  11 },
-+    { 15,  11 },
-+    { 64,  33 },
-+    { 160, 99 },
-+    {  4,   3 },
-+    {  3,   2 },
-+    {  2,   1 },
-+};
-+
-+
-+// pps_cb_qp_offset: -12,+12
-+// slice_cb_qp_offset: -12,+12 also
-+//   "The value of pps_cb_qp_offset + slice_cb_qp_offset shall be in the range of -12 to +12, inclusive."
-+// cr_qp_offset_list[n]: -12,+12
-+// So worst case total offset: -24,+24
-+
-+#define T(n) ((((48+(n))/6-10)<<3) | (48+(n))%6)
-+#define C(B,n) T(B*6+(n) < 0 ? -B*6 : (n) > 51 ? 51 : (n))
-+#define M(B,n) C(B,(-n))
-+
-+// Sizeof the QP_START_BLOCK
-+#define QP_OFFSET_0 (8*6 + 12*2)
-+#define QP_START(B) \
-+    M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\
-+    M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\
-+    M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\
-+    M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\
-+\
-+    M(B,48), M(B,47), M(B,46), M(B,45), M(B,44), M(B,43),\
-+    M(B,42), M(B,41), M(B,40), M(B,39), M(B,38), M(B,37),\
-+    M(B,36), M(B,35), M(B,34), M(B,33), M(B,32), M(B,31),\
-+    M(B,30), M(B,29), M(B,28), M(B,27), M(B,26), M(B,25),\
-+    M(B,24), M(B,23), M(B,22), M(B,21), M(B,20), M(B,19),\
-+    M(B,18), M(B,17), M(B,16), M(B,15), M(B,14), M(B,13),\
-+    M(B,12), M(B,11), M(B,10), M(B, 9), M(B, 8), M(B, 7),\
-+    M(B, 6), M(B, 5), M(B, 4), M(B, 3), M(B, 2), M(B, 1)
-+#define QP_END(B) \
-+    C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51),\
-+    C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51),\
-+    C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51)
-+
-+#define T1(B)\
-+{\
-+    QP_START(B),\
-+    C(B, 0), C(B, 1), C(B, 2), C(B, 3), C(B, 4), C(B, 5), C(B, 6), C(B, 7), C(B, 8), C(B, 9),\
-+    C(B,10), C(B,11), C(B,12), C(B,13), C(B,14), C(B,15), C(B,16), C(B,17), C(B,18), C(B,19),\
-+    C(B,20), C(B,21), C(B,22), C(B,23), C(B,24), C(B,25), C(B,26), C(B,27), C(B,28), C(B,29),\
-+    C(B,29), C(B,30), C(B,31), C(B,32), C(B,33), C(B,33), C(B,34), C(B,34), C(B,35), C(B,35),\
-+    C(B,36), C(B,36), C(B,37), C(B,37), C(B,38), C(B,39), C(B,40), C(B,41), C(B,42), C(B,43),\
-+    C(B,44), C(B,45),\
-+    C(B,46), C(B,47), C(B,48), C(B,49), C(B,50), C(B,51),\
-+    QP_END(B)\
-+}
-+#define T0(B)\
-+{\
-+    QP_START(B),\
-+    C(B, 0), C(B, 1), C(B, 2), C(B, 3), C(B, 4), C(B, 5), C(B, 6), C(B, 7), C(B, 8), C(B, 9),\
-+    C(B,10), C(B,11), C(B,12), C(B,13), C(B,14), C(B,15), C(B,16), C(B,17), C(B,18), C(B,19),\
-+    C(B,20), C(B,21), C(B,22), C(B,23), C(B,24), C(B,25), C(B,26), C(B,27), C(B,28), C(B,29),\
-+    C(B,30), C(B,31), C(B,32), C(B,33), C(B,34), C(B,35), C(B,36), C(B,37), C(B,38), C(B,39),\
-+    C(B,40), C(B,41), C(B,42), C(B,43), C(B,44), C(B,45), C(B,46), C(B,47), C(B,48), C(B,49),\
-+    C(B,50), C(B,51),\
-+    C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51),\
-+    QP_END(B)\
-+}
-+
-+#define QP_TABLE_SIZE (QP_OFFSET_0 + 52 + 12*2)
-+
-+static const int8_t qp_c_bd_0[8][QP_TABLE_SIZE] = {T0(0),T0(1),T0(2),T0(3),T0(4),T0(5),T0(6),T0(7)};
-+static const int8_t qp_c_bd_1[8][QP_TABLE_SIZE] = {T1(0),T1(1),T1(2),T1(3),T1(4),T1(5),T1(6),T1(7)};
-+
-+#undef T
-+#undef C
-+#undef QP_END
-+
-+#define C(B,n) ((n)<0?0:(n)>51?51:(n))
-+// We do need a lot of -ve padding to cope with high bit depths that give -ve qps
-+#define QP_DBLK_OFFSET_0 QP_OFFSET_0
-+#define QP_END(B)\
-+ 51, 51, 51, 51, 51, 51
-+
-+// These don't need all the padding we have here (12 top/bottom would be enough)
-+static const uint8_t qp_c_dblk_0[] = T0(0);
-+static const uint8_t qp_c_dblk_1[] = T1(0);
-+
-+#undef T
-+#undef M
-+#undef C
-+#undef QP_END
-+#undef QP_START
-+
-+
-+static void remove_pps(HEVCRpiParamSets * const s, const int id)
-+{
-+    if (s->pps_list[id] && s->pps == (const HEVCRpiPPS*)s->pps_list[id]->data)
-+        s->pps = NULL;
-+    av_buffer_unref(&s->pps_list[id]);
-+}
-+
-+static void remove_sps(HEVCRpiParamSets * const s, const int id)
-+{
-+    int i;
-+    if (s->sps_list[id]) {
-+        if (s->sps == (const HEVCRpiSPS*)s->sps_list[id]->data)
-+            s->sps = NULL;
-+
-+        /* drop all PPS that depend on this SPS */
-+        for (i = 0; i < FF_ARRAY_ELEMS(s->pps_list); i++)
-+            if (s->pps_list[i] && ((HEVCRpiPPS*)s->pps_list[i]->data)->sps_id == id)
-+                remove_pps(s, i);
-+
-+        av_assert0(!(s->sps_list[id] && s->sps == (HEVCRpiSPS*)s->sps_list[id]->data));
-+    }
-+    av_buffer_unref(&s->sps_list[id]);
-+}
-+
-+static void remove_vps(HEVCRpiParamSets * const s, const int id)
-+{
-+    int i;
-+    if (s->vps_list[id]) {
-+        if (s->vps == (const HEVCRpiVPS*)s->vps_list[id]->data)
-+            s->vps = NULL;
-+
-+        for (i = 0; i < FF_ARRAY_ELEMS(s->sps_list); i++)
-+            if (s->sps_list[i] && ((HEVCRpiSPS*)s->sps_list[i]->data)->vps_id == id)
-+                remove_sps(s, i);
-+    }
-+    av_buffer_unref(&s->vps_list[id]);
-+}
-+
-+int ff_hevc_rpi_decode_short_term_rps(GetBitContext * const gb, AVCodecContext * const avctx,
-+                                  ShortTermRPS * const rps, const HEVCRpiSPS * const sps, const int is_slice_header)
-+{
-+    uint8_t rps_predict = 0;
-+    int delta_poc;
-+    int k0 = 0;
-+    int k1 = 0;
-+    int k  = 0;
-+    int i;
-+
-+    if (rps != sps->st_rps && sps->nb_st_rps)
-+        rps_predict = get_bits1(gb);
-+
-+    if (rps_predict) {
-+        const ShortTermRPS *rps_ridx;
-+        int delta_rps;
-+        unsigned abs_delta_rps;
-+        uint8_t use_delta_flag = 0;
-+        uint8_t delta_rps_sign;
-+
-+        if (is_slice_header) {
-+            unsigned int delta_idx = get_ue_golomb_long(gb) + 1;
-+            if (delta_idx > sps->nb_st_rps) {
-+                av_log(avctx, AV_LOG_ERROR,
-+                       "Invalid value of delta_idx in slice header RPS: %d > %d.\n",
-+                       delta_idx, sps->nb_st_rps);
-+                return AVERROR_INVALIDDATA;
-+            }
-+            rps_ridx = &sps->st_rps[sps->nb_st_rps - delta_idx];
-+            rps->rps_idx_num_delta_pocs = rps_ridx->num_delta_pocs;
-+        } else
-+            rps_ridx = &sps->st_rps[rps - sps->st_rps - 1];
-+
-+        delta_rps_sign = get_bits1(gb);
-+        abs_delta_rps  = get_ue_golomb_long(gb) + 1;
-+        if (abs_delta_rps < 1 || abs_delta_rps > 32768) {
-+            av_log(avctx, AV_LOG_ERROR,
-+                   "Invalid value of abs_delta_rps: %d\n",
-+                   abs_delta_rps);
-+            return AVERROR_INVALIDDATA;
-+        }
-+        delta_rps      = (1 - (delta_rps_sign << 1)) * abs_delta_rps;
-+        for (i = 0; i <= rps_ridx->num_delta_pocs; i++) {
-+            int used = rps->used[k] = get_bits1(gb);
-+
-+            if (!used)
-+                use_delta_flag = get_bits1(gb);
-+
-+            if (used || use_delta_flag) {
-+                if (i < rps_ridx->num_delta_pocs)
-+                    delta_poc = delta_rps + rps_ridx->delta_poc[i];
-+                else
-+                    delta_poc = delta_rps;
-+                rps->delta_poc[k] = delta_poc;
-+                if (delta_poc < 0)
-+                    k0++;
-+                else
-+                    k1++;
-+                k++;
-+            }
-+        }
-+
-+        if (k >= FF_ARRAY_ELEMS(rps->used)) {
-+            av_log(avctx, AV_LOG_ERROR,
-+                   "Invalid num_delta_pocs: %d\n", k);
-+            return AVERROR_INVALIDDATA;
-+        }
-+
-+        rps->num_delta_pocs    = k;
-+        rps->num_negative_pics = k0;
-+        // sort in increasing order (smallest first)
-+        if (rps->num_delta_pocs != 0) {
-+            int used, tmp;
-+            for (i = 1; i < rps->num_delta_pocs; i++) {
-+                delta_poc = rps->delta_poc[i];
-+                used      = rps->used[i];
-+                for (k = i - 1; k >= 0; k--) {
-+                    tmp = rps->delta_poc[k];
-+                    if (delta_poc < tmp) {
-+                        rps->delta_poc[k + 1] = tmp;
-+                        rps->used[k + 1]      = rps->used[k];
-+                        rps->delta_poc[k]     = delta_poc;
-+                        rps->used[k]          = used;
-+                    }
-+                }
-+            }
-+        }
-+        if ((rps->num_negative_pics >> 1) != 0) {
-+            int used;
-+            k = rps->num_negative_pics - 1;
-+            // flip the negative values to largest first
-+            for (i = 0; i < rps->num_negative_pics >> 1; i++) {
-+                delta_poc         = rps->delta_poc[i];
-+                used              = rps->used[i];
-+                rps->delta_poc[i] = rps->delta_poc[k];
-+                rps->used[i]      = rps->used[k];
-+                rps->delta_poc[k] = delta_poc;
-+                rps->used[k]      = used;
-+                k--;
-+            }
-+        }
-+    } else {
-+        unsigned int prev, nb_positive_pics;
-+        rps->num_negative_pics = get_ue_golomb_long(gb);
-+        nb_positive_pics       = get_ue_golomb_long(gb);
-+
-+        if (rps->num_negative_pics >= HEVC_MAX_REFS ||
-+            nb_positive_pics >= HEVC_MAX_REFS) {
-+            av_log(avctx, AV_LOG_ERROR, "Too many refs in a short term RPS.\n");
-+            return AVERROR_INVALIDDATA;
-+        }
-+
-+        rps->num_delta_pocs = rps->num_negative_pics + nb_positive_pics;
-+        if (rps->num_delta_pocs) {
-+            prev = 0;
-+            for (i = 0; i < rps->num_negative_pics; i++) {
-+                delta_poc = get_ue_golomb_long(gb) + 1;
-+                if (delta_poc < 1 || delta_poc > 32768) {
-+                    av_log(avctx, AV_LOG_ERROR,
-+                        "Invalid value of delta_poc: %d\n",
-+                        delta_poc);
-+                    return AVERROR_INVALIDDATA;
-+                }
-+                prev -= delta_poc;
-+                rps->delta_poc[i] = prev;
-+                rps->used[i]      = get_bits1(gb);
-+            }
-+            prev = 0;
-+            for (i = 0; i < nb_positive_pics; i++) {
-+                delta_poc = get_ue_golomb_long(gb) + 1;
-+                if (delta_poc < 1 || delta_poc > 32768) {
-+                    av_log(avctx, AV_LOG_ERROR,
-+                        "Invalid value of delta_poc: %d\n",
-+                        delta_poc);
-+                    return AVERROR_INVALIDDATA;
-+                }
-+                prev += delta_poc;
-+                rps->delta_poc[rps->num_negative_pics + i] = prev;
-+                rps->used[rps->num_negative_pics + i]      = get_bits1(gb);
-+            }
-+        }
-+    }
-+    return 0;
-+}
-+
-+
-+static int decode_profile_tier_level(GetBitContext * const gb, AVCodecContext * const avctx,
-+                                      PTLCommon * const ptl)
-+{
-+    int i;
-+
-+    if (get_bits_left(gb) < 2+1+5 + 32 + 4 + 16 + 16 + 12)
-+        return -1;
-+
-+    ptl->profile_space = get_bits(gb, 2);
-+    ptl->tier_flag     = get_bits1(gb);
-+    ptl->profile_idc   = get_bits(gb, 5);
-+    if (ptl->profile_idc == FF_PROFILE_HEVC_MAIN)
-+        av_log(avctx, AV_LOG_DEBUG, "Main profile bitstream\n");
-+    else if (ptl->profile_idc == FF_PROFILE_HEVC_MAIN_10)
-+        av_log(avctx, AV_LOG_DEBUG, "Main 10 profile bitstream\n");
-+    else if (ptl->profile_idc == FF_PROFILE_HEVC_MAIN_STILL_PICTURE)
-+        av_log(avctx, AV_LOG_DEBUG, "Main Still Picture profile bitstream\n");
-+    else if (ptl->profile_idc == FF_PROFILE_HEVC_REXT)
-+        av_log(avctx, AV_LOG_DEBUG, "Range Extension profile bitstream\n");
-+    else
-+        av_log(avctx, AV_LOG_WARNING, "Unknown HEVC profile: %d\n", ptl->profile_idc);
-+
-+    for (i = 0; i < 32; i++) {
-+        ptl->profile_compatibility_flag[i] = get_bits1(gb);
-+
-+        if (ptl->profile_idc == 0 && i > 0 && ptl->profile_compatibility_flag[i])
-+            ptl->profile_idc = i;
-+    }
-+    ptl->progressive_source_flag    = get_bits1(gb);
-+    ptl->interlaced_source_flag     = get_bits1(gb);
-+    ptl->non_packed_constraint_flag = get_bits1(gb);
-+    ptl->frame_only_constraint_flag = get_bits1(gb);
-+
-+    skip_bits(gb, 16); // XXX_reserved_zero_44bits[0..15]
-+    skip_bits(gb, 16); // XXX_reserved_zero_44bits[16..31]
-+    skip_bits(gb, 12); // XXX_reserved_zero_44bits[32..43]
-+
-+    return 0;
-+}
-+
-+static int parse_ptl(GetBitContext * const gb, AVCodecContext * const avctx,
-+                      PTL * const ptl, const int max_num_sub_layers)
-+{
-+    int i;
-+    if (decode_profile_tier_level(gb, avctx, &ptl->general_ptl) < 0 ||
-+        get_bits_left(gb) < 8 + (8*2 * (max_num_sub_layers - 1 > 0))) {
-+        av_log(avctx, AV_LOG_ERROR, "PTL information too short\n");
-+        return -1;
-+    }
-+
-+    ptl->general_ptl.level_idc = get_bits(gb, 8);
-+
-+    for (i = 0; i < max_num_sub_layers - 1; i++) {
-+        ptl->sub_layer_profile_present_flag[i] = get_bits1(gb);
-+        ptl->sub_layer_level_present_flag[i]   = get_bits1(gb);
-+    }
-+
-+    if (max_num_sub_layers - 1> 0)
-+        for (i = max_num_sub_layers - 1; i < 8; i++)
-+            skip_bits(gb, 2); // reserved_zero_2bits[i]
-+    for (i = 0; i < max_num_sub_layers - 1; i++) {
-+        if (ptl->sub_layer_profile_present_flag[i] &&
-+            decode_profile_tier_level(gb, avctx, &ptl->sub_layer_ptl[i]) < 0) {
-+            av_log(avctx, AV_LOG_ERROR,
-+                   "PTL information for sublayer %i too short\n", i);
-+            return -1;
-+        }
-+        if (ptl->sub_layer_level_present_flag[i]) {
-+            if (get_bits_left(gb) < 8) {
-+                av_log(avctx, AV_LOG_ERROR,
-+                       "Not enough data for sublayer %i level_idc\n", i);
-+                return -1;
-+            } else
-+                ptl->sub_layer_ptl[i].level_idc = get_bits(gb, 8);
-+        }
-+    }
-+
-+    return 0;
-+}
-+
-+static void decode_sublayer_hrd(GetBitContext * const gb, const unsigned int nb_cpb,
-+                                const int subpic_params_present)
-+{
-+    int i;
-+
-+    for (i = 0; i < nb_cpb; i++) {
-+        get_ue_golomb_long(gb); // bit_rate_value_minus1
-+        get_ue_golomb_long(gb); // cpb_size_value_minus1
-+
-+        if (subpic_params_present) {
-+            get_ue_golomb_long(gb); // cpb_size_du_value_minus1
-+            get_ue_golomb_long(gb); // bit_rate_du_value_minus1
-+        }
-+        skip_bits1(gb); // cbr_flag
-+    }
-+}
-+
-+static int decode_hrd(GetBitContext * const gb, const int common_inf_present,
-+                      const int max_sublayers)
-+{
-+    int nal_params_present = 0, vcl_params_present = 0;
-+    int subpic_params_present = 0;
-+    int i;
-+
-+    if (common_inf_present) {
-+        nal_params_present = get_bits1(gb);
-+        vcl_params_present = get_bits1(gb);
-+
-+        if (nal_params_present || vcl_params_present) {
-+            subpic_params_present = get_bits1(gb);
-+
-+            if (subpic_params_present) {
-+                skip_bits(gb, 8); // tick_divisor_minus2
-+                skip_bits(gb, 5); // du_cpb_removal_delay_increment_length_minus1
-+                skip_bits(gb, 1); // sub_pic_cpb_params_in_pic_timing_sei_flag
-+                skip_bits(gb, 5); // dpb_output_delay_du_length_minus1
-+            }
-+
-+            skip_bits(gb, 4); // bit_rate_scale
-+            skip_bits(gb, 4); // cpb_size_scale
-+
-+            if (subpic_params_present)
-+                skip_bits(gb, 4);  // cpb_size_du_scale
-+
-+            skip_bits(gb, 5); // initial_cpb_removal_delay_length_minus1
-+            skip_bits(gb, 5); // au_cpb_removal_delay_length_minus1
-+            skip_bits(gb, 5); // dpb_output_delay_length_minus1
-+        }
-+    }
-+
-+    for (i = 0; i < max_sublayers; i++) {
-+        int low_delay = 0;
-+        unsigned int nb_cpb = 1;
-+        int fixed_rate = get_bits1(gb);
-+
-+        if (!fixed_rate)
-+            fixed_rate = get_bits1(gb);
-+
-+        if (fixed_rate)
-+            get_ue_golomb_long(gb);  // elemental_duration_in_tc_minus1
-+        else
-+            low_delay = get_bits1(gb);
-+
-+        if (!low_delay) {
-+            nb_cpb = get_ue_golomb_long(gb) + 1;
-+            if (nb_cpb < 1 || nb_cpb > 32) {
-+                av_log(NULL, AV_LOG_ERROR, "nb_cpb %d invalid\n", nb_cpb);
-+                return AVERROR_INVALIDDATA;
-+            }
-+        }
-+
-+        if (nal_params_present)
-+            decode_sublayer_hrd(gb, nb_cpb, subpic_params_present);
-+        if (vcl_params_present)
-+            decode_sublayer_hrd(gb, nb_cpb, subpic_params_present);
-+    }
-+    return 0;
-+}
-+
-+int ff_hevc_rpi_decode_nal_vps(GetBitContext * const gb, AVCodecContext * const avctx,
-+                           HEVCRpiParamSets * const ps)
-+{
-+    int i,j;
-+    int vps_id = 0;
-+    ptrdiff_t nal_size;
-+    HEVCRpiVPS *vps;
-+    AVBufferRef *vps_buf = av_buffer_allocz(sizeof(*vps));
-+
-+    if (!vps_buf)
-+        return AVERROR(ENOMEM);
-+    vps = (HEVCRpiVPS*)vps_buf->data;
-+
-+    av_log(avctx, AV_LOG_DEBUG, "Decoding VPS\n");
-+
-+    nal_size = gb->buffer_end - gb->buffer;
-+    if (nal_size > sizeof(vps->data)) {
-+        av_log(avctx, AV_LOG_WARNING, "Truncating likely oversized VPS "
-+               "(%"PTRDIFF_SPECIFIER" > %"SIZE_SPECIFIER")\n",
-+               nal_size, sizeof(vps->data));
-+        vps->data_size = sizeof(vps->data);
-+    } else {
-+        vps->data_size = nal_size;
-+    }
-+    memcpy(vps->data, gb->buffer, vps->data_size);
-+
-+    vps_id = get_bits(gb, 4);
-+    if (vps_id >= HEVC_MAX_VPS_COUNT) {
-+        av_log(avctx, AV_LOG_ERROR, "VPS id out of range: %d\n", vps_id);
-+        goto err;
-+    }
-+
-+    if (get_bits(gb, 2) != 3) { // vps_reserved_three_2bits
-+        av_log(avctx, AV_LOG_ERROR, "vps_reserved_three_2bits is not three\n");
-+        goto err;
-+    }
-+
-+    vps->vps_max_layers               = get_bits(gb, 6) + 1;
-+    vps->vps_max_sub_layers           = get_bits(gb, 3) + 1;
-+    vps->vps_temporal_id_nesting_flag = get_bits1(gb);
-+
-+    if (get_bits(gb, 16) != 0xffff) { // vps_reserved_ffff_16bits
-+        av_log(avctx, AV_LOG_ERROR, "vps_reserved_ffff_16bits is not 0xffff\n");
-+        goto err;
-+    }
-+
-+    if (vps->vps_max_sub_layers > HEVC_MAX_SUB_LAYERS) {
-+        av_log(avctx, AV_LOG_ERROR, "vps_max_sub_layers out of range: %d\n",
-+               vps->vps_max_sub_layers);
-+        goto err;
-+    }
-+
-+    if (parse_ptl(gb, avctx, &vps->ptl, vps->vps_max_sub_layers) < 0)
-+        goto err;
-+
-+    vps->vps_sub_layer_ordering_info_present_flag = get_bits1(gb);
-+
-+    i = vps->vps_sub_layer_ordering_info_present_flag ? 0 : vps->vps_max_sub_layers - 1;
-+    for (; i < vps->vps_max_sub_layers; i++) {
-+        vps->vps_max_dec_pic_buffering[i] = get_ue_golomb_long(gb) + 1;
-+        vps->vps_num_reorder_pics[i]      = get_ue_golomb_long(gb);
-+        vps->vps_max_latency_increase[i]  = get_ue_golomb_long(gb) - 1;
-+
-+        if (vps->vps_max_dec_pic_buffering[i] > HEVC_MAX_DPB_SIZE || !vps->vps_max_dec_pic_buffering[i]) {
-+            av_log(avctx, AV_LOG_ERROR, "vps_max_dec_pic_buffering_minus1 out of range: %d\n",
-+                   vps->vps_max_dec_pic_buffering[i] - 1);
-+            goto err;
-+        }
-+        if (vps->vps_num_reorder_pics[i] > vps->vps_max_dec_pic_buffering[i] - 1) {
-+            av_log(avctx, AV_LOG_WARNING, "vps_max_num_reorder_pics out of range: %d\n",
-+                   vps->vps_num_reorder_pics[i]);
-+            if (avctx->err_recognition & AV_EF_EXPLODE)
-+                goto err;
-+        }
-+    }
-+
-+    vps->vps_max_layer_id   = get_bits(gb, 6);
-+    vps->vps_num_layer_sets = get_ue_golomb_long(gb) + 1;
-+    if (vps->vps_num_layer_sets < 1 || vps->vps_num_layer_sets > 1024 ||
-+        (vps->vps_num_layer_sets - 1LL) * (vps->vps_max_layer_id + 1LL) > get_bits_left(gb)) {
-+        av_log(avctx, AV_LOG_ERROR, "too many layer_id_included_flags\n");
-+        goto err;
-+    }
-+
-+    for (i = 1; i < vps->vps_num_layer_sets; i++)
-+        for (j = 0; j <= vps->vps_max_layer_id; j++)
-+            skip_bits(gb, 1);  // layer_id_included_flag[i][j]
-+
-+    vps->vps_timing_info_present_flag = get_bits1(gb);
-+    if (vps->vps_timing_info_present_flag) {
-+        vps->vps_num_units_in_tick               = get_bits_long(gb, 32);
-+        vps->vps_time_scale                      = get_bits_long(gb, 32);
-+        vps->vps_poc_proportional_to_timing_flag = get_bits1(gb);
-+        if (vps->vps_poc_proportional_to_timing_flag)
-+            vps->vps_num_ticks_poc_diff_one = get_ue_golomb_long(gb) + 1;
-+        vps->vps_num_hrd_parameters = get_ue_golomb_long(gb);
-+        if (vps->vps_num_hrd_parameters > (unsigned)vps->vps_num_layer_sets) {
-+            av_log(avctx, AV_LOG_ERROR,
-+                   "vps_num_hrd_parameters %d is invalid\n", vps->vps_num_hrd_parameters);
-+            goto err;
-+        }
-+        for (i = 0; i < vps->vps_num_hrd_parameters; i++) {
-+            int common_inf_present = 1;
-+
-+            get_ue_golomb_long(gb); // hrd_layer_set_idx
-+            if (i)
-+                common_inf_present = get_bits1(gb);
-+            decode_hrd(gb, common_inf_present, vps->vps_max_sub_layers);
-+        }
-+    }
-+    get_bits1(gb); /* vps_extension_flag */
-+
-+    if (get_bits_left(gb) < 0) {
-+        av_log(avctx, AV_LOG_ERROR,
-+               "Overread VPS by %d bits\n", -get_bits_left(gb));
-+        if (ps->vps_list[vps_id])
-+            goto err;
-+    }
-+
-+    if (ps->vps_list[vps_id] &&
-+        !memcmp(ps->vps_list[vps_id]->data, vps_buf->data, vps_buf->size)) {
-+        av_buffer_unref(&vps_buf);
-+    } else {
-+        remove_vps(ps, vps_id);
-+        ps->vps_list[vps_id] = vps_buf;
-+    }
-+
-+    return 0;
-+
-+err:
-+    av_buffer_unref(&vps_buf);
-+    return AVERROR_INVALIDDATA;
-+}
-+
-+static void decode_vui(GetBitContext * const gb, AVCodecContext * const avctx,
-+                       const int apply_defdispwin, HEVCRpiSPS * const sps)
-+{
-+    VUI backup_vui, * const vui = &sps->vui;
-+    GetBitContext backup;
-+    int sar_present, alt = 0;
-+
-+    av_log(avctx, AV_LOG_DEBUG, "Decoding VUI\n");
-+
-+    sar_present = get_bits1(gb);
-+    if (sar_present) {
-+        uint8_t sar_idx = get_bits(gb, 8);
-+        if (sar_idx < FF_ARRAY_ELEMS(vui_sar))
-+            vui->sar = vui_sar[sar_idx];
-+        else if (sar_idx == 255) {
-+            vui->sar.num = get_bits(gb, 16);
-+            vui->sar.den = get_bits(gb, 16);
-+        } else
-+            av_log(avctx, AV_LOG_WARNING,
-+                   "Unknown SAR index: %u.\n", sar_idx);
-+    }
-+
-+    vui->overscan_info_present_flag = get_bits1(gb);
-+    if (vui->overscan_info_present_flag)
-+        vui->overscan_appropriate_flag = get_bits1(gb);
-+
-+    vui->video_signal_type_present_flag = get_bits1(gb);
-+    if (vui->video_signal_type_present_flag) {
-+        vui->video_format                    = get_bits(gb, 3);
-+        vui->video_full_range_flag           = get_bits1(gb);
-+        vui->colour_description_present_flag = get_bits1(gb);
-+        if (vui->video_full_range_flag && sps->pix_fmt == AV_PIX_FMT_YUV420P)
-+            sps->pix_fmt = AV_PIX_FMT_YUVJ420P;
-+        if (vui->colour_description_present_flag) {
-+            vui->colour_primaries        = get_bits(gb, 8);
-+            vui->transfer_characteristic = get_bits(gb, 8);
-+            vui->matrix_coeffs           = get_bits(gb, 8);
-+
-+            // Set invalid values to "unspecified"
-+            if (!av_color_primaries_name(vui->colour_primaries))
-+                vui->colour_primaries = AVCOL_PRI_UNSPECIFIED;
-+            if (!av_color_transfer_name(vui->transfer_characteristic))
-+                vui->transfer_characteristic = AVCOL_TRC_UNSPECIFIED;
-+            if (!av_color_space_name(vui->matrix_coeffs))
-+                vui->matrix_coeffs = AVCOL_SPC_UNSPECIFIED;
-+            if (vui->matrix_coeffs == AVCOL_SPC_RGB) {
-+                switch (sps->pix_fmt) {
-+                case AV_PIX_FMT_YUV444P:
-+                    sps->pix_fmt = AV_PIX_FMT_GBRP;
-+                    break;
-+                case AV_PIX_FMT_YUV444P10:
-+                    sps->pix_fmt = AV_PIX_FMT_GBRP10;
-+                    break;
-+                case AV_PIX_FMT_YUV444P12:
-+                    sps->pix_fmt = AV_PIX_FMT_GBRP12;
-+                    break;
-+                }
-+            }
-+        }
-+    }
-+
-+    vui->chroma_loc_info_present_flag = get_bits1(gb);
-+    if (vui->chroma_loc_info_present_flag) {
-+        vui->chroma_sample_loc_type_top_field    = get_ue_golomb_long(gb);
-+        vui->chroma_sample_loc_type_bottom_field = get_ue_golomb_long(gb);
-+    }
-+
-+    vui->neutra_chroma_indication_flag = get_bits1(gb);
-+    vui->field_seq_flag                = get_bits1(gb);
-+    vui->frame_field_info_present_flag = get_bits1(gb);
-+
-+    // Backup context in case an alternate header is detected
-+    memcpy(&backup, gb, sizeof(backup));
-+    memcpy(&backup_vui, vui, sizeof(backup_vui));
-+    if (get_bits_left(gb) >= 68 && show_bits_long(gb, 21) == 0x100000) {
-+        vui->default_display_window_flag = 0;
-+        av_log(avctx, AV_LOG_WARNING, "Invalid default display window\n");
-+    } else
-+        vui->default_display_window_flag = get_bits1(gb);
-+
-+    if (vui->default_display_window_flag) {
-+        int vert_mult  = 1 + (sps->chroma_format_idc < 2);
-+        int horiz_mult = 1 + (sps->chroma_format_idc < 3);
-+        vui->def_disp_win.left_offset   = get_ue_golomb_long(gb) * horiz_mult;
-+        vui->def_disp_win.right_offset  = get_ue_golomb_long(gb) * horiz_mult;
-+        vui->def_disp_win.top_offset    = get_ue_golomb_long(gb) *  vert_mult;
-+        vui->def_disp_win.bottom_offset = get_ue_golomb_long(gb) *  vert_mult;
-+
-+        if (apply_defdispwin &&
-+            avctx->flags2 & AV_CODEC_FLAG2_IGNORE_CROP) {
-+            av_log(avctx, AV_LOG_DEBUG,
-+                   "discarding vui default display window, "
-+                   "original values are l:%u r:%u t:%u b:%u\n",
-+                   vui->def_disp_win.left_offset,
-+                   vui->def_disp_win.right_offset,
-+                   vui->def_disp_win.top_offset,
-+                   vui->def_disp_win.bottom_offset);
-+
-+            vui->def_disp_win.left_offset   =
-+            vui->def_disp_win.right_offset  =
-+            vui->def_disp_win.top_offset    =
-+            vui->def_disp_win.bottom_offset = 0;
-+        }
-+    }
-+
-+timing_info:
-+    vui->vui_timing_info_present_flag = get_bits1(gb);
-+
-+    if (vui->vui_timing_info_present_flag) {
-+        if( get_bits_left(gb) < 66 && !alt) {
-+            // The alternate syntax seem to have timing info located
-+            // at where def_disp_win is normally located
-+            av_log(avctx, AV_LOG_WARNING,
-+                   "Strange VUI timing information, retrying...\n");
-+            memcpy(vui, &backup_vui, sizeof(backup_vui));
-+            memcpy(gb, &backup, sizeof(backup));
-+            alt = 1;
-+            goto timing_info;
-+        }
-+        vui->vui_num_units_in_tick               = get_bits_long(gb, 32);
-+        vui->vui_time_scale                      = get_bits_long(gb, 32);
-+        if (alt) {
-+            av_log(avctx, AV_LOG_INFO, "Retry got %"PRIu32"/%"PRIu32" fps\n",
-+                   vui->vui_time_scale, vui->vui_num_units_in_tick);
-+        }
-+        vui->vui_poc_proportional_to_timing_flag = get_bits1(gb);
-+        if (vui->vui_poc_proportional_to_timing_flag)
-+            vui->vui_num_ticks_poc_diff_one_minus1 = get_ue_golomb_long(gb);
-+        vui->vui_hrd_parameters_present_flag = get_bits1(gb);
-+        if (vui->vui_hrd_parameters_present_flag)
-+            decode_hrd(gb, 1, sps->max_sub_layers);
-+    }
-+
-+    vui->bitstream_restriction_flag = get_bits1(gb);
-+    if (vui->bitstream_restriction_flag) {
-+        if (get_bits_left(gb) < 8 && !alt) {
-+            av_log(avctx, AV_LOG_WARNING,
-+                   "Strange VUI bitstream restriction information, retrying"
-+                   " from timing information...\n");
-+            memcpy(vui, &backup_vui, sizeof(backup_vui));
-+            memcpy(gb, &backup, sizeof(backup));
-+            alt = 1;
-+            goto timing_info;
-+        }
-+        vui->tiles_fixed_structure_flag              = get_bits1(gb);
-+        vui->motion_vectors_over_pic_boundaries_flag = get_bits1(gb);
-+        vui->restricted_ref_pic_lists_flag           = get_bits1(gb);
-+        vui->min_spatial_segmentation_idc            = get_ue_golomb_long(gb);
-+        vui->max_bytes_per_pic_denom                 = get_ue_golomb_long(gb);
-+        vui->max_bits_per_min_cu_denom               = get_ue_golomb_long(gb);
-+        vui->log2_max_mv_length_horizontal           = get_ue_golomb_long(gb);
-+        vui->log2_max_mv_length_vertical             = get_ue_golomb_long(gb);
-+    }
-+
-+    if (get_bits_left(gb) < 1 && !alt) {
-+        // XXX: Alternate syntax when sps_range_extension_flag != 0?
-+        av_log(avctx, AV_LOG_WARNING,
-+               "Overread in VUI, retrying from timing information...\n");
-+        memcpy(vui, &backup_vui, sizeof(backup_vui));
-+        memcpy(gb, &backup, sizeof(backup));
-+        alt = 1;
-+        goto timing_info;
-+    }
-+}
-+
-+static void set_default_scaling_list_data(ScalingList * const sl)
-+{
-+    int matrixId;
-+
-+    for (matrixId = 0; matrixId < 6; matrixId++) {
-+        // 4x4 default is 16
-+        memset(sl->sl[0][matrixId], 16, 16);
-+        sl->sl_dc[0][matrixId] = 16; // default for 16x16
-+        sl->sl_dc[1][matrixId] = 16; // default for 32x32
-+    }
-+
-+    memcpy(sl->sl[1][0], default_scaling_list_intra, 64);
-+    memcpy(sl->sl[1][1], default_scaling_list_intra, 64);
-+    memcpy(sl->sl[1][2], default_scaling_list_intra, 64);
-+
-+    memcpy(sl->sl[1][3], default_scaling_list_inter, 64);
-+    memcpy(sl->sl[1][4], default_scaling_list_inter, 64);
-+    memcpy(sl->sl[1][5], default_scaling_list_inter, 64);
-+
-+    memcpy(sl->sl[2][0], default_scaling_list_intra, 64);
-+    memcpy(sl->sl[2][1], default_scaling_list_intra, 64);
-+    memcpy(sl->sl[2][2], default_scaling_list_intra, 64);
-+
-+    memcpy(sl->sl[2][3], default_scaling_list_inter, 64);
-+    memcpy(sl->sl[2][4], default_scaling_list_inter, 64);
-+    memcpy(sl->sl[2][5], default_scaling_list_inter, 64);
-+
-+    memcpy(sl->sl[3][0], default_scaling_list_intra, 64);
-+    memcpy(sl->sl[3][1], default_scaling_list_intra, 64);
-+    memcpy(sl->sl[3][2], default_scaling_list_intra, 64);
-+
-+    memcpy(sl->sl[3][3], default_scaling_list_inter, 64);
-+    memcpy(sl->sl[3][4], default_scaling_list_inter, 64);
-+    memcpy(sl->sl[3][5], default_scaling_list_inter, 64);
-+}
-+
-+static int scaling_list_data(GetBitContext * const gb, AVCodecContext * const avctx, ScalingList * const sl,
-+                             const HEVCRpiSPS * const sps)
-+{
-+    uint8_t scaling_list_pred_mode_flag;
-+    int32_t scaling_list_dc_coef[2][6];
-+    int size_id, matrix_id, pos;
-+    int i;
-+
-+    for (size_id = 0; size_id < 4; size_id++)
-+        for (matrix_id = 0; matrix_id < 6; matrix_id += ((size_id == 3) ? 3 : 1)) {
-+            scaling_list_pred_mode_flag = get_bits1(gb);
-+            if (!scaling_list_pred_mode_flag) {
-+                unsigned int delta = get_ue_golomb_long(gb);
-+                /* Only need to handle non-zero delta. Zero means default,
-+                 * which should already be in the arrays. */
-+                if (delta) {
-+                    // Copy from previous array.
-+                    delta *= (size_id == 3) ? 3 : 1;
-+                    if (matrix_id < delta) {
-+                        av_log(avctx, AV_LOG_ERROR,
-+                               "Invalid delta in scaling list data: %d.\n", delta);
-+                        return AVERROR_INVALIDDATA;
-+                    }
-+
-+                    memcpy(sl->sl[size_id][matrix_id],
-+                           sl->sl[size_id][matrix_id - delta],
-+                           size_id > 0 ? 64 : 16);
-+                    if (size_id > 1)
-+                        sl->sl_dc[size_id - 2][matrix_id] = sl->sl_dc[size_id - 2][matrix_id - delta];
-+                }
-+            } else {
-+                int next_coef, coef_num;
-+                int32_t scaling_list_delta_coef;
-+
-+                next_coef = 8;
-+                coef_num  = FFMIN(64, 1 << (4 + (size_id << 1)));
-+                if (size_id > 1) {
-+                    scaling_list_dc_coef[size_id - 2][matrix_id] = get_se_golomb(gb) + 8;
-+                    next_coef = scaling_list_dc_coef[size_id - 2][matrix_id];
-+                    sl->sl_dc[size_id - 2][matrix_id] = next_coef;
-+                }
-+                for (i = 0; i < coef_num; i++) {
-+                    if (size_id == 0)
-+                        pos = 4 * ff_hevc_rpi_diag_scan4x4_y[i] +
-+                                  ff_hevc_rpi_diag_scan4x4_x[i];
-+                    else
-+                        pos = 8 * ff_hevc_rpi_diag_scan8x8_y[i] +
-+                                  ff_hevc_rpi_diag_scan8x8_x[i];
-+
-+                    scaling_list_delta_coef = get_se_golomb(gb);
-+                    next_coef = (next_coef + 256U + scaling_list_delta_coef) % 256;
-+                    sl->sl[size_id][matrix_id][pos] = next_coef;
-+                }
-+            }
-+        }
-+
-+    if (sps->chroma_format_idc == 3) {
-+        for (i = 0; i < 64; i++) {
-+            sl->sl[3][1][i] = sl->sl[2][1][i];
-+            sl->sl[3][2][i] = sl->sl[2][2][i];
-+            sl->sl[3][4][i] = sl->sl[2][4][i];
-+            sl->sl[3][5][i] = sl->sl[2][5][i];
-+        }
-+        sl->sl_dc[1][1] = sl->sl_dc[0][1];
-+        sl->sl_dc[1][2] = sl->sl_dc[0][2];
-+        sl->sl_dc[1][4] = sl->sl_dc[0][4];
-+        sl->sl_dc[1][5] = sl->sl_dc[0][5];
-+    }
-+
-+
-+    return 0;
-+}
-+
-+static int map_pixel_format(HEVCRpiSPS * const sps)
-+{
-+    const int cfmt = sps->chroma_format_idc;
-+
-+    sps->pix_fmt = AV_PIX_FMT_NONE;
-+    switch (sps->bit_depth) {
-+    case 8:
-+        if (cfmt == 1)
-+            sps->pix_fmt = AV_PIX_FMT_SAND128;
-+        break;
-+    case 10:
-+        if (cfmt == 1)
-+            sps->pix_fmt = AV_PIX_FMT_SAND64_10;
-+        break;
-+    default:
-+        break;
-+    }
-+
-+    sps->hshift[0] = sps->vshift[0] = 0;
-+    sps->hshift[2] = sps->hshift[1] = cfmt > 2 ? 0 : 1; // 1 unless 4:4:4
-+    sps->vshift[2] = sps->vshift[1] = cfmt > 1 ? 0 : 1; // 1 unless 4:4:4 or 4:2:2
-+
-+    sps->pixel_shift = sps->bit_depth > 8 ? 1 : 0;
-+
-+    return 0;
-+}
-+
-+static int ff_hevc_rpi_parse_sps(HEVCRpiSPS * const sps, GetBitContext * const gb, unsigned int * const sps_id,
-+                      const int apply_defdispwin, AVBufferRef * const * const vps_list, AVCodecContext * const avctx)
-+{
-+    HEVCRpiWindow *ow;
-+    int ret = 0;
-+    int log2_diff_max_min_transform_block_size;
-+    int bit_depth_chroma, start, vui_present, sublayer_ordering_info;
-+    int i;
-+
-+    // Coded parameters
-+
-+    sps->vps_id = get_bits(gb, 4);
-+    if (sps->vps_id >= HEVC_MAX_VPS_COUNT) {
-+        av_log(avctx, AV_LOG_ERROR, "VPS id out of range: %d\n", sps->vps_id);
-+        return AVERROR_INVALIDDATA;
-+    }
-+
-+    if (vps_list && !vps_list[sps->vps_id]) {
-+        av_log(avctx, AV_LOG_ERROR, "VPS %d does not exist\n",
-+               sps->vps_id);
-+        return AVERROR_INVALIDDATA;
-+    }
-+
-+    sps->max_sub_layers = get_bits(gb, 3) + 1;
-+    if (sps->max_sub_layers > HEVC_MAX_SUB_LAYERS) {
-+        av_log(avctx, AV_LOG_ERROR, "sps_max_sub_layers out of range: %d\n",
-+               sps->max_sub_layers);
-+        return AVERROR_INVALIDDATA;
-+    }
-+
-+    sps->temporal_id_nesting_flag = get_bits(gb, 1);
-+
-+    if ((ret = parse_ptl(gb, avctx, &sps->ptl, sps->max_sub_layers)) < 0)
-+        return ret;
-+
-+    *sps_id = get_ue_golomb_long(gb);
-+    if (*sps_id >= HEVC_MAX_SPS_COUNT) {
-+        av_log(avctx, AV_LOG_ERROR, "SPS id out of range: %d\n", *sps_id);
-+        return AVERROR_INVALIDDATA;
-+    }
-+
-+    sps->chroma_format_idc = get_ue_golomb_long(gb);
-+    if (sps->chroma_format_idc > 3U) {
-+        av_log(avctx, AV_LOG_ERROR, "chroma_format_idc %d is invalid\n", sps->chroma_format_idc);
-+        return AVERROR_INVALIDDATA;
-+    }
-+
-+    if (sps->chroma_format_idc == 3)
-+        sps->separate_colour_plane_flag = get_bits1(gb);
-+
-+    if (sps->separate_colour_plane_flag)
-+        sps->chroma_format_idc = 0;
-+
-+    sps->width  = get_ue_golomb_long(gb);
-+    sps->height = get_ue_golomb_long(gb);
-+    if ((ret = av_image_check_size(sps->width,
-+                                   sps->height, 0, avctx)) < 0)
-+        return ret;
-+
-+    if (get_bits1(gb)) { // pic_conformance_flag
-+        int vert_mult  = 1 + (sps->chroma_format_idc < 2);
-+        int horiz_mult = 1 + (sps->chroma_format_idc < 3);
-+        sps->pic_conf_win.left_offset   = get_ue_golomb_long(gb) * horiz_mult;
-+        sps->pic_conf_win.right_offset  = get_ue_golomb_long(gb) * horiz_mult;
-+        sps->pic_conf_win.top_offset    = get_ue_golomb_long(gb) *  vert_mult;
-+        sps->pic_conf_win.bottom_offset = get_ue_golomb_long(gb) *  vert_mult;
-+
-+        if (avctx->flags2 & AV_CODEC_FLAG2_IGNORE_CROP) {
-+            av_log(avctx, AV_LOG_DEBUG,
-+                   "discarding sps conformance window, "
-+                   "original values are l:%u r:%u t:%u b:%u\n",
-+                   sps->pic_conf_win.left_offset,
-+                   sps->pic_conf_win.right_offset,
-+                   sps->pic_conf_win.top_offset,
-+                   sps->pic_conf_win.bottom_offset);
-+
-+            sps->pic_conf_win.left_offset   =
-+            sps->pic_conf_win.right_offset  =
-+            sps->pic_conf_win.top_offset    =
-+            sps->pic_conf_win.bottom_offset = 0;
-+        }
-+        sps->output_window = sps->pic_conf_win;
-+    }
-+
-+    sps->bit_depth   = get_ue_golomb_long(gb) + 8;
-+    bit_depth_chroma = get_ue_golomb_long(gb) + 8;
-+    if (sps->chroma_format_idc && bit_depth_chroma != sps->bit_depth) {
-+        av_log(avctx, AV_LOG_ERROR,
-+               "Luma bit depth (%d) is different from chroma bit depth (%d), "
-+               "this is unsupported.\n",
-+               sps->bit_depth, bit_depth_chroma);
-+        return AVERROR_INVALIDDATA;
-+    }
-+
-+    ret = map_pixel_format(sps);
-+    if (ret < 0)
-+        return ret;
-+
-+    sps->log2_max_poc_lsb = get_ue_golomb_long(gb) + 4;
-+    if (sps->log2_max_poc_lsb > 16) {
-+        av_log(avctx, AV_LOG_ERROR, "log2_max_pic_order_cnt_lsb_minus4 out range: %d\n",
-+               sps->log2_max_poc_lsb - 4);
-+        return AVERROR_INVALIDDATA;
-+    }
-+
-+    sublayer_ordering_info = get_bits1(gb);
-+    start = sublayer_ordering_info ? 0 : sps->max_sub_layers - 1;
-+    for (i = start; i < sps->max_sub_layers; i++) {
-+        sps->temporal_layer[i].max_dec_pic_buffering = get_ue_golomb_long(gb) + 1;
-+        sps->temporal_layer[i].num_reorder_pics      = get_ue_golomb_long(gb);
-+        sps->temporal_layer[i].max_latency_increase  = get_ue_golomb_long(gb) - 1;
-+        if (sps->temporal_layer[i].max_dec_pic_buffering > (unsigned)HEVC_MAX_DPB_SIZE) {
-+            av_log(avctx, AV_LOG_ERROR, "sps_max_dec_pic_buffering_minus1 out of range: %d\n",
-+                   sps->temporal_layer[i].max_dec_pic_buffering - 1U);
-+            return AVERROR_INVALIDDATA;
-+        }
-+        if (sps->temporal_layer[i].num_reorder_pics > sps->temporal_layer[i].max_dec_pic_buffering - 1) {
-+            av_log(avctx, AV_LOG_WARNING, "sps_max_num_reorder_pics out of range: %d\n",
-+                   sps->temporal_layer[i].num_reorder_pics);
-+            if (avctx->err_recognition & AV_EF_EXPLODE ||
-+                sps->temporal_layer[i].num_reorder_pics > HEVC_MAX_DPB_SIZE - 1) {
-+                return AVERROR_INVALIDDATA;
-+            }
-+            sps->temporal_layer[i].max_dec_pic_buffering = sps->temporal_layer[i].num_reorder_pics + 1;
-+        }
-+    }
-+
-+    if (!sublayer_ordering_info) {
-+        for (i = 0; i < start; i++) {
-+            sps->temporal_layer[i].max_dec_pic_buffering = sps->temporal_layer[start].max_dec_pic_buffering;
-+            sps->temporal_layer[i].num_reorder_pics      = sps->temporal_layer[start].num_reorder_pics;
-+            sps->temporal_layer[i].max_latency_increase  = sps->temporal_layer[start].max_latency_increase;
-+        }
-+    }
-+
-+    sps->log2_min_cb_size                    = get_ue_golomb_long(gb) + 3;
-+    sps->log2_diff_max_min_coding_block_size = get_ue_golomb_long(gb);
-+    sps->log2_min_tb_size                    = get_ue_golomb_long(gb) + 2;
-+    log2_diff_max_min_transform_block_size   = get_ue_golomb_long(gb);
-+    sps->log2_max_trafo_size                 = log2_diff_max_min_transform_block_size +
-+                                               sps->log2_min_tb_size;
-+
-+    if (sps->log2_min_cb_size < 3 || sps->log2_min_cb_size > 30) {
-+        av_log(avctx, AV_LOG_ERROR, "Invalid value %d for log2_min_cb_size", sps->log2_min_cb_size);
-+        return AVERROR_INVALIDDATA;
-+    }
-+
-+    if (sps->log2_diff_max_min_coding_block_size > 30) {
-+        av_log(avctx, AV_LOG_ERROR, "Invalid value %d for log2_diff_max_min_coding_block_size", sps->log2_diff_max_min_coding_block_size);
-+        return AVERROR_INVALIDDATA;
-+    }
-+
-+    if (sps->log2_min_tb_size >= sps->log2_min_cb_size || sps->log2_min_tb_size < 2) {
-+        av_log(avctx, AV_LOG_ERROR, "Invalid value for log2_min_tb_size");
-+        return AVERROR_INVALIDDATA;
-+    }
-+
-+    if (log2_diff_max_min_transform_block_size < 0 || log2_diff_max_min_transform_block_size > 30) {
-+        av_log(avctx, AV_LOG_ERROR, "Invalid value %d for log2_diff_max_min_transform_block_size", log2_diff_max_min_transform_block_size);
-+        return AVERROR_INVALIDDATA;
-+    }
-+
-+    {
-+        const unsigned int CtbLog2SizeY = sps->log2_min_cb_size + sps->log2_diff_max_min_coding_block_size;
-+        // Not a bitstream limitation, but all profiles
-+        if (CtbLog2SizeY < 4 || CtbLog2SizeY > HEVC_MAX_LOG2_CTB_SIZE) {
-+            av_log(avctx, AV_LOG_ERROR, "Invalid value %d for CtbLog2SizeY", CtbLog2SizeY);
-+            return AVERROR_INVALIDDATA;
-+        }
-+
-+        if (sps->log2_max_trafo_size > FFMIN(5, CtbLog2SizeY)) {
-+            av_log(avctx, AV_LOG_ERROR, "Invalid value %d for MaxTbLog2SizeY", sps->log2_max_trafo_size);
-+            return AVERROR_INVALIDDATA;
-+        }
-+
-+        // Inferred parameters
-+        sps->log2_ctb_size = CtbLog2SizeY;
-+//        sps->log2_min_pu_size = sps->log2_min_cb_size - 1;
-+    }
-+
-+    sps->max_transform_hierarchy_depth_inter = get_ue_golomb_long(gb);
-+    sps->max_transform_hierarchy_depth_intra = get_ue_golomb_long(gb);
-+
-+    sps->scaling_list_enable_flag = get_bits1(gb);
-+    if (sps->scaling_list_enable_flag) {
-+        set_default_scaling_list_data(&sps->scaling_list);
-+
-+        if (get_bits1(gb)) {
-+            ret = scaling_list_data(gb, avctx, &sps->scaling_list, sps);
-+            if (ret < 0)
-+                return ret;
-+        }
-+    }
-+
-+    sps->amp_enabled_flag = get_bits1(gb);
-+    sps->sao_enabled      = get_bits1(gb);
-+
-+    // Set pcm defaults (0) so we don't have to test _enabled when we
-+    // want to use them
-+    memset(&sps->pcm, 0, sizeof(sps->pcm));
-+
-+    if (get_bits1(gb))  // pcm_enabled_flag
-+    {
-+        const unsigned int limit_max_pcm = FFMIN(5,
-+            sps->log2_min_cb_size + sps->log2_diff_max_min_coding_block_size);
-+        sps->pcm.bit_depth   = get_bits(gb, 4) + 1;
-+        sps->pcm.bit_depth_chroma = get_bits(gb, 4) + 1;
-+        sps->pcm.log2_min_pcm_cb_size = get_ue_golomb_long(gb) + 3;
-+        sps->pcm.log2_max_pcm_cb_size = sps->pcm.log2_min_pcm_cb_size +
-+                                        get_ue_golomb_long(gb);
-+        if (FFMAX(sps->pcm.bit_depth, sps->pcm.bit_depth_chroma) > sps->bit_depth) {
-+            av_log(avctx, AV_LOG_ERROR,
-+                   "PCM bit depth (%d, %d) is greater than normal bit depth (%d)\n",
-+                   sps->pcm.bit_depth, sps->pcm.bit_depth_chroma, sps->bit_depth);
-+            return AVERROR_INVALIDDATA;
-+        }
-+        if (sps->pcm.log2_min_pcm_cb_size < sps->log2_min_cb_size ||
-+            sps->pcm.log2_max_pcm_cb_size > limit_max_pcm) {
-+            av_log(avctx, AV_LOG_ERROR, "Bad PCM CB min/max size (%d->%d)",
-+                   sps->pcm.log2_min_pcm_cb_size, sps->pcm.log2_max_pcm_cb_size);
-+            return AVERROR_INVALIDDATA;
-+        }
-+
-+        sps->pcm.loop_filter_disable_flag = get_bits1(gb);
-+    }
-+
-+    // Could be based on min_pcm_cb_size but much easier logic if we just stick
-+    // with 8 (and costs us little)
-+    sps->pcm_width = (sps->width + 63) >> 6;  // 8 for min size, 8 bits per byte - round up
-+    sps->pcm_height = (sps->height + 7) >> 3;
-+
-+    sps->nb_st_rps = get_ue_golomb_long(gb);
-+    if (sps->nb_st_rps > HEVC_MAX_SHORT_TERM_REF_PIC_SETS) {
-+        av_log(avctx, AV_LOG_ERROR, "Too many short term RPS: %d.\n",
-+               sps->nb_st_rps);
-+        return AVERROR_INVALIDDATA;
-+    }
-+    for (i = 0; i < sps->nb_st_rps; i++) {
-+        if ((ret = ff_hevc_rpi_decode_short_term_rps(gb, avctx, &sps->st_rps[i],
-+                                                 sps, 0)) < 0)
-+            return ret;
-+    }
-+
-+    sps->long_term_ref_pics_present_flag = get_bits1(gb);
-+    if (sps->long_term_ref_pics_present_flag) {
-+        sps->num_long_term_ref_pics_sps = get_ue_golomb_long(gb);
-+        if (sps->num_long_term_ref_pics_sps > HEVC_MAX_LONG_TERM_REF_PICS) {
-+            av_log(avctx, AV_LOG_ERROR, "num_long_term_ref_pics_sps %d is out of range.\n",
-+                   sps->num_long_term_ref_pics_sps);
-+            return AVERROR_INVALIDDATA;
-+        }
-+        for (i = 0; i < sps->num_long_term_ref_pics_sps; i++) {
-+            sps->lt_ref_pic_poc_lsb_sps[i]       = get_bits(gb, sps->log2_max_poc_lsb);
-+            sps->used_by_curr_pic_lt_sps_flag[i] = get_bits1(gb);
-+        }
-+    }
-+
-+    sps->sps_temporal_mvp_enabled_flag          = get_bits1(gb);
-+    sps->intra_filters_disable = get_bits1(gb) ? 0 : FILTER_STRONG; // sps->sps_strong_intra_smoothing_enable_flag
-+    sps->vui.sar = (AVRational){0, 1};
-+    vui_present = get_bits1(gb);
-+    if (vui_present)
-+        decode_vui(gb, avctx, apply_defdispwin, sps);
-+
-+    if (get_bits1(gb)) { // sps_extension_flag
-+        int sps_extension_flag[1];
-+        for (i = 0; i < 1; i++)
-+            sps_extension_flag[i] = get_bits1(gb);
-+        skip_bits(gb, 7); //sps_extension_7bits = get_bits(gb, 7);
-+        if (sps_extension_flag[0]) {
-+            int extended_precision_processing_flag;
-+            int cabac_bypass_alignment_enabled_flag;
-+
-+            sps->transform_skip_rotation_enabled_flag = get_bits1(gb);
-+            sps->transform_skip_context_enabled_flag  = get_bits1(gb);
-+            sps->implicit_rdpcm_enabled_flag = get_bits1(gb);
-+
-+            sps->explicit_rdpcm_enabled_flag = get_bits1(gb);
-+
-+            extended_precision_processing_flag = get_bits1(gb);
-+            if (extended_precision_processing_flag)
-+                av_log(avctx, AV_LOG_WARNING,
-+                   "extended_precision_processing_flag not yet implemented\n");
-+
-+            if (get_bits1(gb))          // sps->intra_smoothing_disabled_flag
-+                sps->intra_filters_disable |= FILTER_EITHER;
-+            sps->high_precision_offsets_enabled_flag = get_bits1(gb);
-+            sps->persistent_rice_adaptation_enabled_flag = get_bits1(gb);
-+
-+            cabac_bypass_alignment_enabled_flag  = get_bits1(gb);
-+            if (cabac_bypass_alignment_enabled_flag)
-+                av_log(avctx, AV_LOG_WARNING,
-+                   "cabac_bypass_alignment_enabled_flag not yet implemented\n");
-+        }
-+    }
-+    if (apply_defdispwin) {
-+        sps->output_window.left_offset   += sps->vui.def_disp_win.left_offset;
-+        sps->output_window.right_offset  += sps->vui.def_disp_win.right_offset;
-+        sps->output_window.top_offset    += sps->vui.def_disp_win.top_offset;
-+        sps->output_window.bottom_offset += sps->vui.def_disp_win.bottom_offset;
-+    }
-+
-+    ow = &sps->output_window;
-+    if (ow->left_offset >= INT_MAX - ow->right_offset     ||
-+        ow->top_offset  >= INT_MAX - ow->bottom_offset    ||
-+        ow->left_offset + ow->right_offset  >= sps->width ||
-+        ow->top_offset  + ow->bottom_offset >= sps->height) {
-+        av_log(avctx, AV_LOG_WARNING, "Invalid cropping offsets: %u/%u/%u/%u\n",
-+               ow->left_offset, ow->right_offset, ow->top_offset, ow->bottom_offset);
-+        if (avctx->err_recognition & AV_EF_EXPLODE) {
-+            return AVERROR_INVALIDDATA;
-+        }
-+        av_log(avctx, AV_LOG_WARNING,
-+               "Displaying the whole video surface.\n");
-+        memset(ow, 0, sizeof(*ow));
-+        memset(&sps->pic_conf_win, 0, sizeof(sps->pic_conf_win));
-+    }
-+
-+    // Inferred parameters
-+
-+    sps->ctb_width  = (sps->width  + (1 << sps->log2_ctb_size) - 1) >> sps->log2_ctb_size;
-+    sps->ctb_height = (sps->height + (1 << sps->log2_ctb_size) - 1) >> sps->log2_ctb_size;
-+    sps->ctb_size   = sps->ctb_width * sps->ctb_height;
-+
-+    sps->min_cb_width  = sps->width  >> sps->log2_min_cb_size;
-+    sps->min_cb_height = sps->height >> sps->log2_min_cb_size;
-+    sps->min_tb_width  = sps->width  >> sps->log2_min_tb_size;
-+    sps->min_tb_height = sps->height >> sps->log2_min_tb_size;
-+    sps->min_pu_width  = sps->width  >> LOG2_MIN_PU_SIZE;
-+    sps->min_pu_height = sps->height >> LOG2_MIN_PU_SIZE;
-+    sps->tb_mask       = (1 << (sps->log2_ctb_size - sps->log2_min_tb_size)) - 1;
-+
-+    sps->qp_bd_offset = 6 * (sps->bit_depth - 8);
-+    sps->wp_offset_half_range = (1U << (sps->high_precision_offsets_enabled_flag ? sps->bit_depth - 1 : 7));
-+
-+    if (av_mod_uintp2(sps->width, sps->log2_min_cb_size) ||
-+        av_mod_uintp2(sps->height, sps->log2_min_cb_size)) {
-+        av_log(avctx, AV_LOG_ERROR, "Invalid coded frame dimensions.\n");
-+        return AVERROR_INVALIDDATA;
-+    }
-+
-+    if (sps->max_transform_hierarchy_depth_inter > sps->log2_ctb_size - sps->log2_min_tb_size) {
-+        av_log(avctx, AV_LOG_ERROR, "max_transform_hierarchy_depth_inter out of range: %d\n",
-+               sps->max_transform_hierarchy_depth_inter);
-+        return AVERROR_INVALIDDATA;
-+    }
-+    if (sps->max_transform_hierarchy_depth_intra > sps->log2_ctb_size - sps->log2_min_tb_size) {
-+        av_log(avctx, AV_LOG_ERROR, "max_transform_hierarchy_depth_intra out of range: %d\n",
-+               sps->max_transform_hierarchy_depth_intra);
-+        return AVERROR_INVALIDDATA;
-+    }
-+    if (sps->log2_max_trafo_size > FFMIN(sps->log2_ctb_size, 5)) {
-+        av_log(avctx, AV_LOG_ERROR,
-+               "max transform block size out of range: %d\n",
-+               sps->log2_max_trafo_size);
-+        return AVERROR_INVALIDDATA;
-+    }
-+
-+    if (get_bits_left(gb) < 0) {
-+        av_log(avctx, AV_LOG_ERROR,
-+               "Overread SPS by %d bits\n", -get_bits_left(gb));
-+        return AVERROR_INVALIDDATA;
-+    }
-+
-+    return 0;
-+}
-+
-+int ff_hevc_rpi_decode_nal_sps(GetBitContext *gb, AVCodecContext *avctx,
-+                           HEVCRpiParamSets *ps, int apply_defdispwin)
-+{
-+    HEVCRpiSPS *sps;
-+    AVBufferRef *sps_buf = av_buffer_allocz(sizeof(*sps));
-+    unsigned int sps_id;
-+    int ret;
-+    ptrdiff_t nal_size;
-+
-+    if (!sps_buf)
-+        return AVERROR(ENOMEM);
-+    sps = (HEVCRpiSPS*)sps_buf->data;
-+
-+    av_log(avctx, AV_LOG_DEBUG, "Decoding SPS\n");
-+
-+    nal_size = gb->buffer_end - gb->buffer;
-+    if (nal_size > sizeof(sps->data)) {
-+        av_log(avctx, AV_LOG_WARNING, "Truncating likely oversized SPS "
-+               "(%"PTRDIFF_SPECIFIER" > %"SIZE_SPECIFIER")\n",
-+               nal_size, sizeof(sps->data));
-+        sps->data_size = sizeof(sps->data);
-+    } else {
-+        sps->data_size = nal_size;
-+    }
-+    memcpy(sps->data, gb->buffer, sps->data_size);
-+
-+    ret = ff_hevc_rpi_parse_sps(sps, gb, &sps_id,
-+                            apply_defdispwin,
-+                            ps->vps_list, avctx);
-+    if (ret < 0) {
-+        av_buffer_unref(&sps_buf);
-+        return ret;
-+    }
-+
-+    if (avctx->debug & FF_DEBUG_BITSTREAM) {
-+        av_log(avctx, AV_LOG_DEBUG,
-+               "Parsed SPS: id %d; coded wxh: %dx%d; "
-+               "cropped wxh: %dx%d; pix_fmt: %s.\n",
-+               sps_id, sps->width, sps->height,
-+               sps->width - (sps->output_window.left_offset + sps->output_window.right_offset),
-+               sps->height - (sps->output_window.top_offset + sps->output_window.bottom_offset),
-+               av_get_pix_fmt_name(sps->pix_fmt));
-+    }
-+
-+    /* check if this is a repeat of an already parsed SPS, then keep the
-+     * original one.
-+     * otherwise drop all PPSes that depend on it */
-+    if (ps->sps_list[sps_id] &&
-+        !memcmp(ps->sps_list[sps_id]->data, sps_buf->data, sps_buf->size)) {
-+        av_buffer_unref(&sps_buf);
-+    } else {
-+        remove_sps(ps, sps_id);
-+        ps->sps_list[sps_id] = sps_buf;
-+    }
-+
-+    return 0;
-+}
-+
-+static void hevc_pps_free(void *opaque, uint8_t *data)
-+{
-+    HEVCRpiPPS *pps = (HEVCRpiPPS*)data;
-+
-+    av_freep(&pps->column_width);
-+    av_freep(&pps->row_height);
-+    av_freep(&pps->col_bd);
-+    av_freep(&pps->row_bd);
-+    av_freep(&pps->col_idxX);
-+    av_freep(&pps->ctb_addr_rs_to_ts);
-+    av_freep(&pps->ctb_addr_ts_to_rs);
-+    av_freep(&pps->tile_pos_ts);
-+    av_freep(&pps->tile_size);
-+    av_freep(&pps->tile_id);
-+    av_freep(&pps->ctb_ts_flags);
-+
-+    av_freep(&pps);
-+}
-+
-+static int get_offset_list(GetBitContext * const gb, AVCodecContext * const avctx, unsigned int n_minus_1, int8_t * offsets)
-+{
-+    do
-+    {
-+        const int offset = get_se_golomb_long(gb);
-+        if (offset < -12 || offset > 12) {
-+            av_log(avctx, AV_LOG_ERROR, "qp_offset_list[]: %d out of range\n", offset);
-+            return AVERROR_INVALIDDATA;
-+        }
-+        *offsets++ = offset;
-+    } while (n_minus_1-- != 0);
-+    return 0;
-+}
-+
-+static int pps_range_extensions(GetBitContext * const gb, AVCodecContext * const avctx,
-+                                HEVCRpiPPS * const pps, const HEVCRpiSPS * const sps)
-+{
-+    if (pps->transform_skip_enabled_flag) {
-+        pps->log2_max_transform_skip_block_size = get_ue_golomb_long(gb) + 2;
-+    }
-+    pps->cross_component_prediction_enabled_flag = get_bits1(gb);
-+    if (pps->cross_component_prediction_enabled_flag &&
-+        (sps->chroma_format_idc != 3 || sps->separate_colour_plane_flag))
-+    {
-+        av_log(avctx, AV_LOG_ERROR, "cross_component_prediction_enabled but chroma_format_idc != 3\n");
-+        return AVERROR_INVALIDDATA;
-+    }
-+    pps->chroma_qp_offset_list_enabled_flag = get_bits1(gb);
-+    if (pps->chroma_qp_offset_list_enabled_flag) {
-+        int err;
-+
-+        pps->diff_cu_chroma_qp_offset_depth = get_ue_golomb_long(gb);
-+        pps->chroma_qp_offset_list_len_minus1 = get_ue_golomb_long(gb);
-+        if (pps->chroma_qp_offset_list_len_minus1 > 5) {
-+            av_log(avctx, AV_LOG_ERROR,
-+                   "chroma_qp_offset_list_len_minus1 shall be in the range [0, 5].\n");
-+            return AVERROR_INVALIDDATA;
-+        }
-+        av_log(avctx, AV_LOG_WARNING, "cb_qp_offset_list not tested yet.\n");
-+
-+        if ((err = get_offset_list(gb, avctx, pps->chroma_qp_offset_list_len_minus1, pps->cb_qp_offset_list)) != 0 ||
-+            (err = get_offset_list(gb, avctx, pps->chroma_qp_offset_list_len_minus1, pps->cr_qp_offset_list)) != 0)
-+            return err;
-+    }
-+
-+    {
-+        const unsigned int max_offset = sps->bit_depth > 10 ? sps->bit_depth - 10 : 0;
-+
-+        pps->log2_sao_offset_scale_luma = get_ue_golomb_long(gb);
-+        if (pps->log2_sao_offset_scale_luma > max_offset) {
-+            av_log(avctx, AV_LOG_ERROR, "log2_sao_offset_scale_luma invalid");
-+            return AVERROR_INVALIDDATA;
-+        }
-+        pps->log2_sao_offset_scale_chroma = get_ue_golomb_long(gb);
-+        if (pps->log2_sao_offset_scale_chroma > max_offset) {
-+            av_log(avctx, AV_LOG_ERROR, "log2_sao_offset_scale_chroma invalid");
-+            return AVERROR_INVALIDDATA;
-+        }
-+    }
-+
-+    return(0);
-+}
-+
-+static inline int setup_pps(AVCodecContext * const avctx,
-+                            HEVCRpiPPS * const pps, const HEVCRpiSPS * const sps)
-+{
-+    int pic_area_in_ctbs;
-+    int i, j, x, y, ctb_addr_rs, tile_id;
-+
-+    // Inferred parameters
-+
-+    // qp_y -> qp_u/qp_v tables
-+    // The tables have at least -24,+24 overrun after adding offset here
-+    // which should allow for clipless offseting
-+
-+    pps->qp_dblk_x[0] = qp_c_dblk_0 + QP_DBLK_OFFSET_0;  // No offset for luma, but may be useful for general code
-+    pps->qp_bd_x[0] = qp_c_bd_0[sps->bit_depth - 8] + QP_OFFSET_0;
-+
-+    if (sps->chroma_format_idc == 1) {
-+        pps->qp_dblk_x[1] = qp_c_dblk_1 + pps->cb_qp_offset + QP_DBLK_OFFSET_0;
-+        pps->qp_bd_x[1] = qp_c_bd_1[sps->bit_depth - 8] + pps->cb_qp_offset + QP_OFFSET_0;
-+        pps->qp_dblk_x[2] = qp_c_dblk_1 + pps->cr_qp_offset + QP_DBLK_OFFSET_0;
-+        pps->qp_bd_x[2] = qp_c_bd_1[sps->bit_depth - 8] + pps->cr_qp_offset + QP_OFFSET_0;
-+    }
-+    else
-+    {
-+        pps->qp_dblk_x[1] = qp_c_dblk_0 + pps->cb_qp_offset + QP_DBLK_OFFSET_0;
-+        pps->qp_bd_x[1] = qp_c_bd_0[sps->bit_depth - 8] + pps->cb_qp_offset + QP_OFFSET_0;
-+        pps->qp_dblk_x[2] = qp_c_dblk_0 + pps->cr_qp_offset + QP_DBLK_OFFSET_0;
-+        pps->qp_bd_x[2] = qp_c_bd_0[sps->bit_depth - 8] + pps->cr_qp_offset + QP_OFFSET_0;
-+    }
-+
-+    pps->col_bd   = av_malloc_array(pps->num_tile_columns + 1, sizeof(*pps->col_bd));
-+    pps->row_bd   = av_malloc_array(pps->num_tile_rows + 1,    sizeof(*pps->row_bd));
-+    pps->col_idxX = av_malloc_array(sps->ctb_width,    sizeof(*pps->col_idxX));
-+    if (!pps->col_bd || !pps->row_bd || !pps->col_idxX)
-+        return AVERROR(ENOMEM);
-+
-+    if (pps->uniform_spacing_flag) {
-+        if (!pps->column_width) {
-+            pps->column_width = av_malloc_array(pps->num_tile_columns, sizeof(*pps->column_width));
-+            pps->row_height   = av_malloc_array(pps->num_tile_rows,    sizeof(*pps->row_height));
-+        }
-+        if (!pps->column_width || !pps->row_height)
-+            return AVERROR(ENOMEM);
-+
-+        for (i = 0; i < pps->num_tile_columns; i++) {
-+            pps->column_width[i] = ((i + 1) * sps->ctb_width) / pps->num_tile_columns -
-+                                   (i * sps->ctb_width) / pps->num_tile_columns;
-+        }
-+
-+        for (i = 0; i < pps->num_tile_rows; i++) {
-+            pps->row_height[i] = ((i + 1) * sps->ctb_height) / pps->num_tile_rows -
-+                                 (i * sps->ctb_height) / pps->num_tile_rows;
-+        }
-+    }
-+
-+    {
-+        const unsigned int td_mask = 63 >> (sps->log2_ctb_size + sps->pixel_shift);
-+        pps->col_bd[0] = 0;
-+        pps->tile_wpp_inter_disable = 0;
-+        for (i = 0; i < pps->num_tile_columns; i++)
-+        {
-+            pps->col_bd[i + 1] = pps->col_bd[i] + pps->column_width[i];
-+
-+            // Avoid trying tile parallel if the columns don't fall on cache boundries
-+            // (this causes too much pain syncing flushes with the QPU)
-+            // Ignore the final (RHS of pic) tile boundry
-+            if ((pps->col_bd[i] & td_mask) != 0) {
-+                pps->tile_wpp_inter_disable = 1;
-+            }
-+        }
-+
-+        // If we can start the next row before finishing the first line of
-+        // this one then we must wait at the end of the tile
-+        // * if this happens a lot then there are better but more complicated
-+        //   conditions that we could apply
-+        if (pps->tile_wpp_inter_disable) {
-+            for (i = 0; i < pps->num_tile_rows; i++)
-+            {
-+                if (pps->row_height[i] <= RPI_MAX_JOBS) {
-+                    pps->tile_wpp_inter_disable = 2;
-+                    break;
-+                }
-+            }
-+        }
-+    }
-+
-+    pps->row_bd[0] = 0;
-+    for (i = 0; i < pps->num_tile_rows; i++)
-+        pps->row_bd[i + 1] = pps->row_bd[i] + pps->row_height[i];
-+
-+    for (i = 0, j = 0; i < sps->ctb_width; i++) {
-+        if (i >= pps->col_bd[j + 1])
-+            j++;
-+        pps->col_idxX[i] = j;
-+    }
-+
-+    /**
-+     * 6.5
-+     */
-+    pic_area_in_ctbs     = sps->ctb_size;
-+
-+    pps->ctb_addr_rs_to_ts = av_malloc_array(pic_area_in_ctbs,    sizeof(*pps->ctb_addr_rs_to_ts));
-+    pps->ctb_addr_ts_to_rs = av_malloc_array(pic_area_in_ctbs,    sizeof(*pps->ctb_addr_ts_to_rs));
-+    pps->tile_id           = av_malloc_array(pic_area_in_ctbs,    sizeof(*pps->tile_id));
-+    pps->tile_size         = av_malloc_array(pps->num_tile_columns * pps->num_tile_rows, sizeof(*pps->tile_size));
-+    pps->tile_pos_ts       = av_malloc_array(pps->num_tile_columns * pps->num_tile_rows, sizeof(*pps->tile_pos_ts));
-+    pps->ctb_ts_flags      = av_malloc_array(pic_area_in_ctbs,    sizeof(*pps->ctb_ts_flags));
-+    if (!pps->ctb_addr_rs_to_ts || !pps->ctb_addr_ts_to_rs ||
-+        !pps->tile_id || pps->tile_pos_ts == NULL || pps->tile_size == NULL) {
-+        return AVERROR(ENOMEM);
-+    }
-+
-+    memset(pps->ctb_ts_flags, 0, pic_area_in_ctbs * sizeof(*pps->ctb_ts_flags));
-+
-+    for (ctb_addr_rs = 0; ctb_addr_rs < pic_area_in_ctbs; ctb_addr_rs++) {
-+        int tb_x   = ctb_addr_rs % sps->ctb_width;
-+        int tb_y   = ctb_addr_rs / sps->ctb_width;
-+        int tile_x = 0;
-+        int tile_y = 0;
-+        int val    = 0;
-+
-+        for (i = 0; i < pps->num_tile_columns; i++) {
-+            if (tb_x < pps->col_bd[i + 1]) {
-+                tile_x = i;
-+                break;
-+            }
-+        }
-+
-+        for (i = 0; i < pps->num_tile_rows; i++) {
-+            if (tb_y < pps->row_bd[i + 1]) {
-+                tile_y = i;
-+                break;
-+            }
-+        }
-+
-+        for (i = 0; i < tile_x; i++)
-+            val += pps->row_height[tile_y] * pps->column_width[i];
-+        for (i = 0; i < tile_y; i++)
-+            val += sps->ctb_width * pps->row_height[i];
-+
-+        val += (tb_y - pps->row_bd[tile_y]) * pps->column_width[tile_x] +
-+               tb_x - pps->col_bd[tile_x];
-+
-+        pps->ctb_addr_rs_to_ts[ctb_addr_rs] = val;
-+        pps->ctb_addr_ts_to_rs[val]         = ctb_addr_rs;
-+    }
-+
-+    {
-+        uint8_t * pflags = pps->ctb_ts_flags;
-+        uint16_t * ptid = pps->tile_id;
-+
-+        for (j = 0, tile_id = 0; j < pps->num_tile_rows; j++)
-+        {
-+            for (i = 0; i < pps->num_tile_columns; i++, tile_id++)
-+            {
-+                const unsigned int tile_w = pps->column_width[i];
-+
-+                pflags[0] |= CTB_TS_FLAGS_CIREQ;
-+
-+                for (x = 0; x != tile_w; ++x) {
-+                    pflags[x] |= CTB_TS_FLAGS_TOT;
-+                }
-+
-+                for (y = pps->row_bd[j]; y < pps->row_bd[j + 1]; y++)
-+                {
-+                    pflags[0] |= CTB_TS_FLAGS_SOTL;
-+
-+                    if (pps->entropy_coding_sync_enabled_flag)
-+                    {
-+                        if (pps->column_width[i] != 1)
-+                            pflags[1] |= CTB_TS_FLAGS_CSAVE;
-+                        else
-+                            pflags[0] |= CTB_TS_FLAGS_CIREQ;
-+
-+                        if ((pflags[0] & CTB_TS_FLAGS_CIREQ) == 0)
-+                            pflags[0] |= CTB_TS_FLAGS_CLOAD;
-+                    }
-+
-+                    for (x = 0; x != tile_w; ++x)
-+                        *ptid++ = tile_id;
-+
-+                    pflags += tile_w;
-+                    pflags[-1] |= CTB_TS_FLAGS_EOTL;
-+                    if (i + 1 == pps->num_tile_columns)
-+                        pflags[-1] |= CTB_TS_FLAGS_EOL;
-+                }
-+
-+                pflags[-1] |= CTB_TS_FLAGS_EOT;
-+            }
-+        }
-+    }
-+
-+    {
-+        unsigned int ts = 0;
-+        for (j = 0; j < pps->num_tile_rows; j++)
-+            for (i = 0; i < pps->num_tile_columns; i++)
-+            {
-+                const unsigned int size = pps->column_width[i] * pps->row_height[j];
-+                pps->tile_size[j * pps->num_tile_columns + i] = size;
-+                pps->tile_pos_ts[j * pps->num_tile_columns + i] = ts;
-+                ts += size;
-+            }
-+    }
-+
-+    return 0;
-+}
-+
-+int ff_hevc_rpi_decode_nal_pps(GetBitContext * const gb, AVCodecContext * const avctx,
-+                           HEVCRpiParamSets * const ps)
-+{
-+    const HEVCRpiSPS *sps = NULL;
-+    int i, ret = 0;
-+    unsigned int pps_id = 0;
-+    ptrdiff_t nal_size;
-+    unsigned log2_parallel_merge_level_minus2;
-+
-+    AVBufferRef *pps_buf;
-+    HEVCRpiPPS *pps = av_mallocz(sizeof(*pps));
-+
-+    if (!pps)
-+        return AVERROR(ENOMEM);
-+
-+    pps_buf = av_buffer_create((uint8_t *)pps, sizeof(*pps),
-+                               hevc_pps_free, NULL, 0);
-+    if (!pps_buf) {
-+        av_freep(&pps);
-+        return AVERROR(ENOMEM);
-+    }
-+
-+    av_log(avctx, AV_LOG_DEBUG, "Decoding PPS\n");
-+
-+    nal_size = gb->buffer_end - gb->buffer;
-+    if (nal_size > sizeof(pps->data)) {
-+        av_log(avctx, AV_LOG_WARNING, "Truncating likely oversized PPS "
-+               "(%"PTRDIFF_SPECIFIER" > %"SIZE_SPECIFIER")\n",
-+               nal_size, sizeof(pps->data));
-+        pps->data_size = sizeof(pps->data);
-+    } else {
-+        pps->data_size = nal_size;
-+    }
-+    memcpy(pps->data, gb->buffer, pps->data_size);
-+
-+    // Default values
-+    pps->loop_filter_across_tiles_enabled_flag = 1;
-+    pps->num_tile_columns                      = 1;
-+    pps->num_tile_rows                         = 1;
-+    pps->uniform_spacing_flag                  = 1;
-+    pps->disable_dbf                           = 0;
-+    pps->beta_offset                           = 0;
-+    pps->tc_offset                             = 0;
-+    pps->log2_max_transform_skip_block_size    = 2;
-+
-+    // Coded parameters
-+    pps_id = get_ue_golomb_long(gb);
-+    if (pps_id >= HEVC_MAX_PPS_COUNT) {
-+        av_log(avctx, AV_LOG_ERROR, "PPS id out of range: %d\n", pps_id);
-+        ret = AVERROR_INVALIDDATA;
-+        goto err;
-+    }
-+    pps->sps_id = get_ue_golomb_long(gb);
-+    if (pps->sps_id >= HEVC_MAX_SPS_COUNT) {
-+        av_log(avctx, AV_LOG_ERROR, "SPS id out of range: %d\n", pps->sps_id);
-+        ret = AVERROR_INVALIDDATA;
-+        goto err;
-+    }
-+    if (!ps->sps_list[pps->sps_id]) {
-+        av_log(avctx, AV_LOG_ERROR, "SPS %u does not exist.\n", pps->sps_id);
-+        ret = AVERROR_INVALIDDATA;
-+        goto err;
-+    }
-+    sps = (HEVCRpiSPS *)ps->sps_list[pps->sps_id]->data;
-+
-+    pps->dependent_slice_segments_enabled_flag = get_bits1(gb);
-+    pps->output_flag_present_flag              = get_bits1(gb);
-+    pps->num_extra_slice_header_bits           = get_bits(gb, 3);
-+
-+    pps->sign_data_hiding_flag = get_bits1(gb);
-+
-+    pps->cabac_init_present_flag = get_bits1(gb);
-+
-+    pps->num_ref_idx_l0_default_active = get_ue_golomb_long(gb) + 1;
-+    if (pps->num_ref_idx_l0_default_active < 1 || pps->num_ref_idx_l0_default_active > 15) {
-+        av_log(avctx, AV_LOG_ERROR, "pps->num_ref_idx_l0_default_active invalid\n");
-+        ret = AVERROR_INVALIDDATA;
-+        goto err;
-+    }
-+    pps->num_ref_idx_l1_default_active = get_ue_golomb_long(gb) + 1;
-+    if (pps->num_ref_idx_l1_default_active < 1 || pps->num_ref_idx_l1_default_active > 15) {
-+        av_log(avctx, AV_LOG_ERROR, "pps->num_ref_idx_l1_default_active invalid\n");
-+        ret = AVERROR_INVALIDDATA;
-+        goto err;
-+    }
-+
-+    pps->pic_init_qp_minus26 = get_se_golomb(gb);
-+    if (pps->pic_init_qp_minus26 > 25 || pps->pic_init_qp_minus26 < -(26 + sps->qp_bd_offset)) {
-+        av_log(avctx, AV_LOG_ERROR,
-+               "init_qp_minus26 %d is outside the valid range "
-+               "[%d, %d].\n",
-+               pps->pic_init_qp_minus26,
-+               -(26 + sps->qp_bd_offset), 25);
-+        ret = AVERROR_INVALIDDATA;
-+        goto err;
-+    }
-+
-+    pps->constrained_intra_pred_flag = get_bits1(gb);
-+    pps->transform_skip_enabled_flag = get_bits1(gb);
-+
-+    pps->cu_qp_delta_enabled_flag = get_bits1(gb);
-+    pps->log2_min_cu_qp_delta_size = sps->log2_ctb_size;
-+    if (pps->cu_qp_delta_enabled_flag)
-+    {
-+        const unsigned int diff_cu_qp_delta_depth = get_ue_golomb_long(gb);
-+
-+        if (diff_cu_qp_delta_depth > sps->log2_diff_max_min_coding_block_size) {
-+            av_log(avctx, AV_LOG_ERROR, "diff_cu_qp_delta_depth %d is invalid\n",
-+                   diff_cu_qp_delta_depth);
-+            ret = AVERROR_INVALIDDATA;
-+            goto err;
-+        }
-+
-+        pps->log2_min_cu_qp_delta_size = sps->log2_ctb_size - diff_cu_qp_delta_depth;
-+    }
-+
-+    pps->cb_qp_offset = get_se_golomb(gb);
-+    if (pps->cb_qp_offset < -12 || pps->cb_qp_offset > 12) {
-+        av_log(avctx, AV_LOG_ERROR, "pps_cb_qp_offset out of range: %d\n",
-+               pps->cb_qp_offset);
-+        ret = AVERROR_INVALIDDATA;
-+        goto err;
-+    }
-+    pps->cr_qp_offset = get_se_golomb(gb);
-+    if (pps->cr_qp_offset < -12 || pps->cr_qp_offset > 12) {
-+        av_log(avctx, AV_LOG_ERROR, "pps_cr_qp_offset out of range: %d\n",
-+               pps->cr_qp_offset);
-+        ret = AVERROR_INVALIDDATA;
-+        goto err;
-+    }
-+    pps->pic_slice_level_chroma_qp_offsets_present_flag = get_bits1(gb);
-+
-+    pps->weighted_pred_flag   = get_bits1(gb);
-+    pps->weighted_bipred_flag = get_bits1(gb);
-+
-+    pps->transquant_bypass_enable_flag    = get_bits1(gb);
-+    pps->tiles_enabled_flag               = get_bits1(gb);
-+    pps->entropy_coding_sync_enabled_flag = get_bits1(gb);
-+
-+    if (pps->tiles_enabled_flag) {
-+        pps->num_tile_columns = get_ue_golomb_long(gb) + 1;
-+        pps->num_tile_rows    = get_ue_golomb_long(gb) + 1;
-+        if (pps->num_tile_columns <= 0 ||
-+            pps->num_tile_columns >= sps->width) {
-+            av_log(avctx, AV_LOG_ERROR, "num_tile_columns_minus1 out of range: %d\n",
-+                   pps->num_tile_columns - 1);
-+            ret = AVERROR_INVALIDDATA;
-+            goto err;
-+        }
-+        if (pps->num_tile_rows <= 0 ||
-+            pps->num_tile_rows >= sps->height) {
-+            av_log(avctx, AV_LOG_ERROR, "num_tile_rows_minus1 out of range: %d\n",
-+                   pps->num_tile_rows - 1);
-+            ret = AVERROR_INVALIDDATA;
-+            goto err;
-+        }
-+
-+        pps->column_width = av_malloc_array(pps->num_tile_columns, sizeof(*pps->column_width));
-+        pps->row_height   = av_malloc_array(pps->num_tile_rows,    sizeof(*pps->row_height));
-+        if (!pps->column_width || !pps->row_height) {
-+            ret = AVERROR(ENOMEM);
-+            goto err;
-+        }
-+
-+        pps->uniform_spacing_flag = get_bits1(gb);
-+        if (!pps->uniform_spacing_flag) {
-+            uint64_t sum = 0;
-+            for (i = 0; i < pps->num_tile_columns - 1; i++) {
-+                pps->column_width[i] = get_ue_golomb_long(gb) + 1;
-+                sum                 += pps->column_width[i];
-+            }
-+            if (sum >= sps->ctb_width) {
-+                av_log(avctx, AV_LOG_ERROR, "Invalid tile widths.\n");
-+                ret = AVERROR_INVALIDDATA;
-+                goto err;
-+            }
-+            pps->column_width[pps->num_tile_columns - 1] = sps->ctb_width - sum;
-+
-+            sum = 0;
-+            for (i = 0; i < pps->num_tile_rows - 1; i++) {
-+                pps->row_height[i] = get_ue_golomb_long(gb) + 1;
-+                sum               += pps->row_height[i];
-+            }
-+            if (sum >= sps->ctb_height) {
-+                av_log(avctx, AV_LOG_ERROR, "Invalid tile heights.\n");
-+                ret = AVERROR_INVALIDDATA;
-+                goto err;
-+            }
-+            pps->row_height[pps->num_tile_rows - 1] = sps->ctb_height - sum;
-+        }
-+        pps->loop_filter_across_tiles_enabled_flag = get_bits1(gb);
-+    }
-+
-+    pps->seq_loop_filter_across_slices_enabled_flag = get_bits1(gb);
-+
-+    pps->deblocking_filter_control_present_flag = get_bits1(gb);
-+    if (pps->deblocking_filter_control_present_flag) {
-+        pps->deblocking_filter_override_enabled_flag = get_bits1(gb);
-+        pps->disable_dbf                             = get_bits1(gb);
-+        if (!pps->disable_dbf) {
-+            int beta_offset_div2 = get_se_golomb(gb);
-+            int tc_offset_div2   = get_se_golomb(gb) ;
-+            if (beta_offset_div2 < -6 || beta_offset_div2 > 6) {
-+                av_log(avctx, AV_LOG_ERROR, "pps_beta_offset_div2 out of range: %d\n",
-+                       beta_offset_div2);
-+                ret = AVERROR_INVALIDDATA;
-+                goto err;
-+            }
-+            if (tc_offset_div2 < -6 || tc_offset_div2 > 6) {
-+                av_log(avctx, AV_LOG_ERROR, "pps_tc_offset_div2 out of range: %d\n",
-+                       tc_offset_div2);
-+                ret = AVERROR_INVALIDDATA;
-+                goto err;
-+            }
-+            pps->beta_offset = 2 * beta_offset_div2;
-+            pps->tc_offset   = 2 *   tc_offset_div2;
-+        }
-+    }
-+
-+    pps->scaling_list_data_present_flag = get_bits1(gb);
-+    if (pps->scaling_list_data_present_flag) {
-+        set_default_scaling_list_data(&pps->scaling_list);
-+        ret = scaling_list_data(gb, avctx, &pps->scaling_list, sps);
-+        if (ret < 0)
-+            goto err;
-+    }
-+    pps->lists_modification_present_flag = get_bits1(gb);
-+    log2_parallel_merge_level_minus2     = get_ue_golomb_long(gb);
-+    if (log2_parallel_merge_level_minus2 > sps->log2_ctb_size) {
-+        av_log(avctx, AV_LOG_ERROR, "log2_parallel_merge_level_minus2 out of range: %d\n",
-+               log2_parallel_merge_level_minus2);
-+        ret = AVERROR_INVALIDDATA;
-+        goto err;
-+    }
-+    pps->log2_parallel_merge_level       = log2_parallel_merge_level_minus2 + 2;
-+
-+    pps->slice_header_extension_present_flag = get_bits1(gb);
-+
-+    if (get_bits1(gb)) { // pps_extension_present_flag
-+        int pps_range_extensions_flag = get_bits1(gb);
-+        skip_bits(gb, 7); // pps_extension_7bits
-+        if (sps->ptl.general_ptl.profile_idc == FF_PROFILE_HEVC_REXT && pps_range_extensions_flag) {
-+            if ((ret = pps_range_extensions(gb, avctx, pps, sps)) < 0)
-+                goto err;
-+        }
-+    }
-+
-+    ret = setup_pps(avctx, pps, sps);
-+    if (ret < 0)
-+        goto err;
-+
-+    if (get_bits_left(gb) < 0) {
-+        av_log(avctx, AV_LOG_ERROR,
-+               "Overread PPS by %d bits\n", -get_bits_left(gb));
-+        ret = AVERROR_INVALIDDATA;
-+        goto err;
-+    }
-+
-+    remove_pps(ps, pps_id);
-+    ps->pps_list[pps_id] = pps_buf;
-+
-+    return 0;
-+
-+err:
-+    av_buffer_unref(&pps_buf);
-+    return ret;
-+}
-+
-+int ff_hevc_rpi_compute_poc(const HEVCRpiSPS *sps, int pocTid0, int poc_lsb, int nal_unit_type)
-+{
-+    int max_poc_lsb  = 1 << sps->log2_max_poc_lsb;
-+    int prev_poc_lsb = pocTid0 % max_poc_lsb;
-+    int prev_poc_msb = pocTid0 - prev_poc_lsb;
-+    int poc_msb;
-+
-+    if (poc_lsb < prev_poc_lsb && prev_poc_lsb - poc_lsb >= max_poc_lsb / 2)
-+        poc_msb = prev_poc_msb + max_poc_lsb;
-+    else if (poc_lsb > prev_poc_lsb && poc_lsb - prev_poc_lsb > max_poc_lsb / 2)
-+        poc_msb = prev_poc_msb - max_poc_lsb;
-+    else
-+        poc_msb = prev_poc_msb;
-+
-+    // For BLA picture types, POCmsb is set to 0.
-+    if (nal_unit_type == HEVC_NAL_BLA_W_LP   ||
-+        nal_unit_type == HEVC_NAL_BLA_W_RADL ||
-+        nal_unit_type == HEVC_NAL_BLA_N_LP)
-+        poc_msb = 0;
-+
-+    return poc_msb + poc_lsb;
-+}
---- /dev/null
-+++ b/libavcodec/rpi_hevc_ps.h
-@@ -0,0 +1,449 @@
-+/*
-+ * HEVC parameter set parsing
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#ifndef AVCODEC_RPI_HEVC_PS_H
-+#define AVCODEC_RPI_HEVC_PS_H
-+
-+#include <stdint.h>
-+
-+#include "libavutil/buffer.h"
-+#include "libavutil/pixfmt.h"
-+#include "libavutil/rational.h"
-+
-+#include "avcodec.h"
-+#include "get_bits.h"
-+#include "hevc.h"
-+
-+typedef struct ShortTermRPS {
-+    unsigned int num_negative_pics;
-+    int num_delta_pocs;
-+    int rps_idx_num_delta_pocs;
-+    int32_t delta_poc[32];
-+    uint8_t used[32];
-+} ShortTermRPS;
-+
-+typedef struct LongTermRPS {
-+    int     poc[32];
-+    uint8_t used[32];
-+    uint8_t nb_refs;
-+} LongTermRPS;
-+
-+typedef struct RpiSliceHeader {
-+    unsigned int pps_id;
-+
-+    ///< address (in raster order) of the first block in the current slice segment
-+    unsigned int   slice_segment_addr;
-+    ///< address (in raster order) of the first block in the current slice
-+    unsigned int   slice_addr;
-+
-+    enum HEVCSliceType slice_type;
-+
-+    int pic_order_cnt_lsb;
-+
-+    uint8_t first_slice_in_pic_flag;
-+    uint8_t dependent_slice_segment_flag;
-+    uint8_t pic_output_flag;
-+    uint8_t colour_plane_id;
-+
-+    ///< RPS coded in the slice header itself is stored here
-+    int short_term_ref_pic_set_sps_flag;
-+    int short_term_ref_pic_set_size;
-+    ShortTermRPS slice_rps;
-+    const ShortTermRPS *short_term_rps;
-+    int long_term_ref_pic_set_size;
-+    LongTermRPS long_term_rps;
-+    unsigned int list_entry_lx[2][32];
-+
-+    uint8_t rpl_modification_flag[2];
-+    uint8_t no_output_of_prior_pics_flag;
-+    uint8_t slice_temporal_mvp_enabled_flag;
-+
-+    unsigned int nb_refs[2];
-+
-+    uint8_t slice_sample_adaptive_offset_flag[3];
-+    uint8_t mvd_l1_zero_flag;
-+
-+    uint8_t cabac_init_flag;
-+    uint8_t disable_deblocking_filter_flag; ///< slice_header_disable_deblocking_filter_flag
-+    uint8_t slice_loop_filter_across_slices_enabled_flag;
-+    uint8_t collocated_list;
-+
-+    uint8_t no_dblk_boundary_flags;
-+
-+    unsigned int collocated_ref_idx;
-+
-+    int slice_qp_delta;
-+    int slice_cb_qp_offset;  // -12, +12
-+    int slice_cr_qp_offset;  // -12, +12
-+
-+    uint8_t cu_chroma_qp_offset_enabled_flag;
-+
-+    int beta_offset;    ///< beta_offset_div2 * 2
-+    int tc_offset;      ///< tc_offset_div2 * 2
-+
-+    unsigned int max_num_merge_cand; ///< 5 - 5_minus_max_num_merge_cand
-+
-+    unsigned *entry_point_offset;
-+    int * offset;
-+    int * size;
-+    int num_entry_point_offsets;
-+    int offsets_allocated;
-+
-+    uint8_t offload_wpp;
-+    uint8_t offload_tiles;
-+
-+    int8_t slice_qp;
-+
-+    uint8_t luma_log2_weight_denom;
-+    uint8_t chroma_log2_weight_denom;
-+
-+    int16_t luma_weight_l0[16];     // -128, +255
-+    int16_t luma_offset_l0[16];
-+    int16_t chroma_weight_l0[16][2];
-+    int16_t chroma_offset_l0[16][2];
-+
-+    int16_t luma_weight_l1[16];
-+    int16_t luma_offset_l1[16];
-+    int16_t chroma_weight_l1[16][2];
-+    int16_t chroma_offset_l1[16][2];
-+
-+} RpiSliceHeader;
-+
-+typedef struct HEVCRpiWindow {
-+    uint16_t left_offset;
-+    uint16_t right_offset;
-+    uint16_t top_offset;
-+    uint16_t bottom_offset;
-+} HEVCRpiWindow;
-+
-+typedef struct VUI {
-+    AVRational sar;
-+
-+    int overscan_info_present_flag;
-+    int overscan_appropriate_flag;
-+
-+    int video_signal_type_present_flag;
-+    int video_format;
-+    int video_full_range_flag;
-+    int colour_description_present_flag;
-+    uint8_t colour_primaries;
-+    uint8_t transfer_characteristic;
-+    uint8_t matrix_coeffs;
-+
-+    int chroma_loc_info_present_flag;
-+    int chroma_sample_loc_type_top_field;
-+    int chroma_sample_loc_type_bottom_field;
-+    int neutra_chroma_indication_flag;
-+
-+    int field_seq_flag;
-+    int frame_field_info_present_flag;
-+
-+    int default_display_window_flag;
-+    HEVCRpiWindow def_disp_win;
-+
-+    int vui_timing_info_present_flag;
-+    uint32_t vui_num_units_in_tick;
-+    uint32_t vui_time_scale;
-+    int vui_poc_proportional_to_timing_flag;
-+    int vui_num_ticks_poc_diff_one_minus1;
-+    int vui_hrd_parameters_present_flag;
-+
-+    int bitstream_restriction_flag;
-+    int tiles_fixed_structure_flag;
-+    int motion_vectors_over_pic_boundaries_flag;
-+    int restricted_ref_pic_lists_flag;
-+    int min_spatial_segmentation_idc;
-+    int max_bytes_per_pic_denom;
-+    int max_bits_per_min_cu_denom;
-+    int log2_max_mv_length_horizontal;
-+    int log2_max_mv_length_vertical;
-+} VUI;
-+
-+typedef struct PTLCommon {
-+    uint8_t profile_space;
-+    uint8_t tier_flag;
-+    uint8_t profile_idc;
-+    uint8_t profile_compatibility_flag[32];
-+    uint8_t level_idc;
-+    uint8_t progressive_source_flag;
-+    uint8_t interlaced_source_flag;
-+    uint8_t non_packed_constraint_flag;
-+    uint8_t frame_only_constraint_flag;
-+} PTLCommon;
-+
-+typedef struct PTL {
-+    PTLCommon general_ptl;
-+    PTLCommon sub_layer_ptl[HEVC_MAX_SUB_LAYERS];
-+
-+    uint8_t sub_layer_profile_present_flag[HEVC_MAX_SUB_LAYERS];
-+    uint8_t sub_layer_level_present_flag[HEVC_MAX_SUB_LAYERS];
-+} PTL;
-+
-+typedef struct HEVCRpiVPS {
-+    uint8_t vps_temporal_id_nesting_flag;
-+    int vps_max_layers;
-+    int vps_max_sub_layers; ///< vps_max_temporal_layers_minus1 + 1
-+
-+    PTL ptl;
-+    int vps_sub_layer_ordering_info_present_flag;
-+    unsigned int vps_max_dec_pic_buffering[HEVC_MAX_SUB_LAYERS];
-+    unsigned int vps_num_reorder_pics[HEVC_MAX_SUB_LAYERS];
-+    unsigned int vps_max_latency_increase[HEVC_MAX_SUB_LAYERS];
-+    int vps_max_layer_id;
-+    int vps_num_layer_sets; ///< vps_num_layer_sets_minus1 + 1
-+    uint8_t vps_timing_info_present_flag;
-+    uint32_t vps_num_units_in_tick;
-+    uint32_t vps_time_scale;
-+    uint8_t vps_poc_proportional_to_timing_flag;
-+    int vps_num_ticks_poc_diff_one; ///< vps_num_ticks_poc_diff_one_minus1 + 1
-+    int vps_num_hrd_parameters;
-+
-+    uint8_t data[4096];
-+    int data_size;
-+} HEVCRpiVPS;
-+
-+typedef struct ScalingList {
-+    /* This is a little wasteful, since sizeID 0 only needs 8 coeffs,
-+     * and size ID 3 only has 2 arrays, not 6. */
-+    uint8_t sl[4][6][64];
-+    uint8_t sl_dc[2][6];
-+} ScalingList;
-+
-+typedef struct HEVCRpiSPS {
-+    unsigned vps_id;
-+    uint8_t chroma_format_idc;
-+    uint8_t separate_colour_plane_flag;
-+
-+    HEVCRpiWindow output_window;
-+
-+    HEVCRpiWindow pic_conf_win;
-+
-+    uint16_t wp_offset_half_range;  // WpOffsetHalfRange
-+
-+    uint8_t bit_depth;
-+
-+//    int bit_depth_chroma;  // We only support lum_bit_depth = chroma_bit_depth
-+    uint8_t pixel_shift;
-+    enum AVPixelFormat pix_fmt;
-+
-+    unsigned int log2_max_poc_lsb;
-+
-+    int max_sub_layers;
-+    struct {
-+        int max_dec_pic_buffering;
-+        int num_reorder_pics;
-+        int max_latency_increase;
-+    } temporal_layer[HEVC_MAX_SUB_LAYERS];
-+    uint8_t temporal_id_nesting_flag;
-+
-+    uint8_t scaling_list_enable_flag;
-+    ScalingList scaling_list;
-+
-+    unsigned int nb_st_rps;
-+    ShortTermRPS st_rps[HEVC_MAX_SHORT_TERM_REF_PIC_SETS];
-+
-+    uint8_t amp_enabled_flag;
-+    uint8_t sao_enabled;
-+
-+    uint8_t long_term_ref_pics_present_flag;
-+    uint16_t lt_ref_pic_poc_lsb_sps[HEVC_MAX_LONG_TERM_REF_PICS];
-+    uint8_t used_by_curr_pic_lt_sps_flag[HEVC_MAX_LONG_TERM_REF_PICS];
-+    uint8_t num_long_term_ref_pics_sps;
-+
-+    struct {
-+        uint8_t bit_depth;
-+        uint8_t bit_depth_chroma;
-+        uint8_t log2_min_pcm_cb_size;
-+        uint8_t log2_max_pcm_cb_size;
-+        uint8_t loop_filter_disable_flag;
-+    } pcm;
-+    char sps_temporal_mvp_enabled_flag;
-+//    char sps_strong_intra_smoothing_enable_flag;  -> intra_filtes_disable
-+
-+    uint8_t log2_min_cb_size;  // 3..6
-+    uint8_t log2_diff_max_min_coding_block_size;
-+    uint8_t log2_min_tb_size;  // 2..5
-+    uint8_t log2_max_trafo_size;
-+    uint8_t log2_ctb_size;     // 4..6
-+//    unsigned int log2_min_pu_size;  // 2..5 (min_cb_size - 1)
-+#define LOG2_MIN_PU_SIZE 2
-+#define LOG2_MIN_CU_SIZE 3
-+
-+    uint8_t max_transform_hierarchy_depth_inter;
-+    uint8_t max_transform_hierarchy_depth_intra;
-+
-+    char transform_skip_rotation_enabled_flag;
-+    char transform_skip_context_enabled_flag;
-+    char implicit_rdpcm_enabled_flag;
-+    char explicit_rdpcm_enabled_flag;
-+//    char intra_smoothing_disabled_flag;  -> intra_filtes_disable
-+    char high_precision_offsets_enabled_flag;
-+    char persistent_rice_adaptation_enabled_flag;
-+
-+    uint8_t intra_filters_disable;
-+
-+    ///< coded frame dimension in various units
-+    int width;
-+    int height;
-+    int ctb_width;
-+    int ctb_height;
-+    int ctb_size;   // Pic size in CTBs not size of a CTB
-+    int min_cb_width;
-+    int min_cb_height;
-+    int min_tb_width;
-+    int min_tb_height;
-+    int min_pu_width;
-+    int min_pu_height;
-+    int pcm_width;
-+    int pcm_height;
-+    int tb_mask;
-+
-+    int hshift[3];
-+    int vshift[3];
-+
-+    int qp_bd_offset;
-+
-+    uint8_t data[4096];
-+    int data_size;
-+
-+    VUI vui;
-+    PTL ptl;
-+} HEVCRpiSPS;
-+
-+#define CTB_TS_FLAGS_SOTL       (1U << 0)       // X start of tile line
-+#define CTB_TS_FLAGS_EOTL       (1U << 1)       // Last CTB of a tile line
-+#define CTB_TS_FLAGS_EOL        (1U << 2)       // Last CTB of a complete line
-+#define CTB_TS_FLAGS_EOT        (1U << 3)       // Last CTB of a tile
-+#define CTB_TS_FLAGS_CSAVE      (1U << 4)
-+#define CTB_TS_FLAGS_CIREQ      (1U << 5)       // Cabac init request
-+#define CTB_TS_FLAGS_TOT        (1U << 6)       // CTB on top row of a tile
-+#define CTB_TS_FLAGS_CLOAD      (1U << 7)
-+
-+typedef struct HEVCRpiPPS {
-+    unsigned int sps_id; ///< seq_parameter_set_id
-+
-+    uint8_t sign_data_hiding_flag;
-+
-+    uint8_t cabac_init_present_flag;
-+
-+    int num_ref_idx_l0_default_active; ///< num_ref_idx_l0_default_active_minus1 + 1
-+    int num_ref_idx_l1_default_active; ///< num_ref_idx_l1_default_active_minus1 + 1
-+    int pic_init_qp_minus26;
-+
-+    uint8_t constrained_intra_pred_flag;
-+    uint8_t transform_skip_enabled_flag;
-+
-+    uint8_t cu_qp_delta_enabled_flag;
-+    uint8_t log2_min_cu_qp_delta_size;
-+    int cb_qp_offset;   // -12..12
-+    int cr_qp_offset;   // -12..12
-+    const uint8_t * qp_dblk_x[3];
-+    const int8_t * qp_bd_x[3];
-+
-+    uint8_t pic_slice_level_chroma_qp_offsets_present_flag;
-+    uint8_t weighted_pred_flag;
-+    uint8_t weighted_bipred_flag;
-+    uint8_t output_flag_present_flag;
-+    uint8_t transquant_bypass_enable_flag;
-+
-+    uint8_t dependent_slice_segments_enabled_flag;
-+    uint8_t tiles_enabled_flag;
-+    uint8_t entropy_coding_sync_enabled_flag;
-+
-+    uint8_t tile_wpp_inter_disable;
-+    int num_tile_columns;   ///< num_tile_columns_minus1 + 1
-+    int num_tile_rows;      ///< num_tile_rows_minus1 + 1
-+    uint8_t uniform_spacing_flag;
-+    uint8_t loop_filter_across_tiles_enabled_flag;
-+
-+    uint8_t seq_loop_filter_across_slices_enabled_flag;
-+
-+    uint8_t deblocking_filter_control_present_flag;
-+    uint8_t deblocking_filter_override_enabled_flag;
-+    uint8_t disable_dbf;
-+    int beta_offset;    ///< beta_offset_div2 * 2
-+    int tc_offset;      ///< tc_offset_div2 * 2
-+
-+    uint8_t scaling_list_data_present_flag;
-+    ScalingList scaling_list;
-+
-+    uint8_t lists_modification_present_flag;
-+    int log2_parallel_merge_level; ///< log2_parallel_merge_level_minus2 + 2
-+    int num_extra_slice_header_bits;
-+    uint8_t slice_header_extension_present_flag;
-+    uint8_t log2_max_transform_skip_block_size;
-+    uint8_t cross_component_prediction_enabled_flag;
-+    uint8_t chroma_qp_offset_list_enabled_flag;
-+    uint8_t diff_cu_chroma_qp_offset_depth;
-+    uint8_t chroma_qp_offset_list_len_minus1;
-+    int8_t  cb_qp_offset_list[6];
-+    int8_t  cr_qp_offset_list[6];
-+    uint8_t log2_sao_offset_scale_luma;
-+    uint8_t log2_sao_offset_scale_chroma;
-+
-+    // Inferred parameters
-+    uint16_t *column_width;  ///< ColumnWidth
-+    uint16_t *row_height;    ///< RowHeight
-+    uint16_t *col_bd;        ///< ColBd
-+    uint16_t *row_bd;        ///< RowBd
-+    uint16_t *col_idxX;
-+
-+    // We can limit these to uint16_t given our other size limits
-+    uint16_t *ctb_addr_rs_to_ts; ///< CtbAddrRSToTS
-+    uint16_t *ctb_addr_ts_to_rs; ///< CtbAddrTSToRS
-+    uint16_t *tile_id;           ///< TileId
-+    uint16_t *tile_pos_ts;       ///< TilePosRS
-+    uint16_t *tile_size;         ///< TileSize
-+    uint8_t * ctb_ts_flags;
-+
-+    uint8_t data[4096];
-+    int data_size;
-+} HEVCRpiPPS;
-+
-+typedef struct HEVCRpiParamSets {
-+    /* currently active parameter sets */
-+    const HEVCRpiVPS *vps;
-+    const HEVCRpiSPS *sps;
-+    const HEVCRpiPPS *pps;
-+
-+    AVBufferRef *vps_list[HEVC_MAX_VPS_COUNT];
-+    AVBufferRef *sps_list[HEVC_MAX_SPS_COUNT];
-+    AVBufferRef *pps_list[HEVC_MAX_PPS_COUNT];
-+} HEVCRpiParamSets;
-+
-+int ff_hevc_rpi_decode_nal_vps(GetBitContext *gb, AVCodecContext *avctx,
-+                           HEVCRpiParamSets *ps);
-+int ff_hevc_rpi_decode_nal_sps(GetBitContext *gb, AVCodecContext *avctx,
-+                           HEVCRpiParamSets *ps, int apply_defdispwin);
-+int ff_hevc_rpi_decode_nal_pps(GetBitContext *gb, AVCodecContext *avctx,
-+                           HEVCRpiParamSets *ps);
-+
-+int ff_hevc_rpi_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx,
-+                                  ShortTermRPS *rps, const HEVCRpiSPS *sps, int is_slice_header);
-+
-+int ff_hevc_rpi_encode_nal_vps(HEVCRpiVPS *vps, unsigned int id,
-+                           uint8_t *buf, int buf_size);
-+
-+/**
-+ * Compute POC of the current frame and return it.
-+ */
-+int ff_hevc_rpi_compute_poc(const HEVCRpiSPS *sps, int pocTid0, int poc_lsb, int nal_unit_type);
-+
-+#endif /* AVCODEC_RPI_HEVC_PS_H */
---- /dev/null
-+++ b/libavcodec/rpi_hevc_refs.c
-@@ -0,0 +1,485 @@
-+/*
-+ * HEVC video decoder
-+ *
-+ * Copyright (C) 2012 - 2013 Guillaume Martres
-+ * Copyright (C) 2012 - 2013 Gildas Cocherel
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#include "libavutil/avassert.h"
-+#include "libavutil/pixdesc.h"
-+#include "libavutil/rpi_sand_fns.h"
-+#include "internal.h"
-+#include "thread.h"
-+#include "hevc.h"
-+#include "rpi_hevcdec.h"
-+
-+void ff_hevc_rpi_unref_frame(HEVCRpiContext *s, HEVCRpiFrame *frame, int flags)
-+{
-+    /* frame->frame can be NULL if context init failed */
-+    if (!frame->frame || !frame->frame->buf[0])
-+        return;
-+
-+    frame->flags &= ~flags;
-+    if (!frame->flags) {
-+        ff_thread_release_buffer(s->avctx, &frame->tf);
-+
-+        av_buffer_unref(&frame->col_mvf_buf);  // OK if already NULL
-+        frame->col_mvf = NULL;
-+
-+        frame->collocated_ref = NULL;
-+    }
-+}
-+
-+void ff_hevc_rpi_clear_refs(HEVCRpiContext *s)
-+{
-+    int i;
-+    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++)
-+        ff_hevc_rpi_unref_frame(s, &s->DPB[i],
-+                            HEVC_FRAME_FLAG_SHORT_REF |
-+                            HEVC_FRAME_FLAG_LONG_REF);
-+}
-+
-+void ff_hevc_rpi_flush_dpb(HEVCRpiContext *s)
-+{
-+    int i;
-+    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++)
-+        ff_hevc_rpi_unref_frame(s, &s->DPB[i], ~0);
-+}
-+
-+static HEVCRpiFrame *alloc_frame(HEVCRpiContext * const s)
-+{
-+    int i, ret;
-+    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
-+        HEVCRpiFrame * const frame = &s->DPB[i];
-+        if (frame->frame->buf[0])
-+            continue;
-+
-+        ret = ff_thread_get_buffer(s->avctx, &frame->tf,
-+                                   AV_GET_BUFFER_FLAG_REF);
-+        if (ret < 0)
-+            return NULL;
-+
-+        frame->col_mvf = NULL;
-+        frame->col_mvf_buf = NULL;
-+        if (s->used_for_ref && !s->is_irap)
-+        {
-+            frame->col_mvf_buf = av_buffer_pool_get(s->col_mvf_pool);
-+            if (!frame->col_mvf_buf)
-+                goto fail;
-+            frame->col_mvf = (ColMvField *)frame->col_mvf_buf->data;
-+        }
-+
-+        frame->frame->top_field_first  = s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD;
-+        frame->frame->interlaced_frame = (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD) || (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_BOTTOM_FIELD);
-+
-+        return frame;
-+
-+fail:
-+        ff_hevc_rpi_unref_frame(s, frame, ~0);
-+        return NULL;
-+    }
-+    av_log(s->avctx, AV_LOG_ERROR, "Error allocating frame, DPB full.\n");
-+    return NULL;
-+}
-+
-+int ff_hevc_rpi_set_new_ref(HEVCRpiContext *s, AVFrame **frame, int poc)
-+{
-+    HEVCRpiFrame *ref;
-+    int i;
-+
-+    /* check that this POC doesn't already exist */
-+    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
-+        HEVCRpiFrame *frame = &s->DPB[i];
-+
-+        if (frame->frame->buf[0] && frame->sequence == s->seq_decode &&
-+            frame->poc == poc) {
-+            av_log(s->avctx, AV_LOG_ERROR, "Duplicate POC in a sequence: %d.\n",
-+                   poc);
-+            return AVERROR_INVALIDDATA;
-+        }
-+    }
-+
-+    ref = alloc_frame(s);
-+    if (!ref)
-+        return AVERROR(ENOMEM);
-+
-+    *frame = ref->frame;
-+    s->ref = ref;
-+
-+    if (s->sh.pic_output_flag)
-+        ref->flags = HEVC_FRAME_FLAG_OUTPUT | HEVC_FRAME_FLAG_SHORT_REF;
-+    else
-+        ref->flags = HEVC_FRAME_FLAG_SHORT_REF;
-+
-+    ref->poc      = poc;
-+    ref->sequence = s->seq_decode;
-+    ref->frame->crop_left   = s->ps.sps->output_window.left_offset;
-+    ref->frame->crop_right  = s->ps.sps->output_window.right_offset;
-+    ref->frame->crop_top    = s->ps.sps->output_window.top_offset;
-+    ref->frame->crop_bottom = s->ps.sps->output_window.bottom_offset;
-+
-+    return 0;
-+}
-+
-+int ff_hevc_rpi_output_frame(HEVCRpiContext *s, AVFrame *out, int flush)
-+{
-+    do {
-+        int nb_output = 0;
-+        int min_poc   = INT_MAX;
-+        int i, min_idx, ret;
-+
-+        if (s->sh.no_output_of_prior_pics_flag == 1 && s->no_rasl_output_flag == 1) {
-+            for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
-+                HEVCRpiFrame *frame = &s->DPB[i];
-+                if (!(frame->flags & HEVC_FRAME_FLAG_BUMPING) && frame->poc != s->poc &&
-+                        frame->sequence == s->seq_output) {
-+                    ff_hevc_rpi_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT);
-+                }
-+            }
-+        }
-+
-+        for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
-+            HEVCRpiFrame *frame = &s->DPB[i];
-+            if ((frame->flags & HEVC_FRAME_FLAG_OUTPUT) &&
-+                frame->sequence == s->seq_output) {
-+                nb_output++;
-+                if (frame->poc < min_poc || nb_output == 1) {
-+                    min_poc = frame->poc;
-+                    min_idx = i;
-+                }
-+            }
-+        }
-+
-+        /* wait for more frames before output */
-+        if (!flush && s->seq_output == s->seq_decode && s->ps.sps &&
-+            nb_output <= s->ps.sps->temporal_layer[s->ps.sps->max_sub_layers - 1].num_reorder_pics)
-+            return 0;
-+
-+        if (nb_output) {
-+            HEVCRpiFrame *frame = &s->DPB[min_idx];
-+            if (frame->frame->format == AV_PIX_FMT_VIDEOTOOLBOX && frame->frame->buf[0]->size == 1)
-+                return 0;
-+
-+            ret = av_frame_ref(out, frame->frame);
-+            if (frame->flags & HEVC_FRAME_FLAG_BUMPING)
-+                ff_hevc_rpi_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT | HEVC_FRAME_FLAG_BUMPING);
-+            else
-+                ff_hevc_rpi_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT);
-+            if (ret < 0)
-+                return ret;
-+            av_log(s->avctx, AV_LOG_DEBUG,
-+                   "Output frame with POC %d.\n", frame->poc);
-+            return 1;
-+        }
-+
-+        if (s->seq_output != s->seq_decode)
-+            s->seq_output = (s->seq_output + 1) & 0xff;
-+        else
-+            break;
-+    } while (1);
-+
-+    return 0;
-+}
-+
-+void ff_hevc_rpi_bump_frame(HEVCRpiContext *s)
-+{
-+    int dpb = 0;
-+    int min_poc = INT_MAX;
-+    int i;
-+
-+    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
-+        HEVCRpiFrame *frame = &s->DPB[i];
-+        if ((frame->flags) &&
-+            frame->sequence == s->seq_output &&
-+            frame->poc != s->poc) {
-+            dpb++;
-+        }
-+    }
-+
-+    if (s->ps.sps && dpb >= s->ps.sps->temporal_layer[s->ps.sps->max_sub_layers - 1].max_dec_pic_buffering) {
-+        for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
-+            HEVCRpiFrame *frame = &s->DPB[i];
-+            if ((frame->flags) &&
-+                frame->sequence == s->seq_output &&
-+                frame->poc != s->poc) {
-+                if (frame->flags == HEVC_FRAME_FLAG_OUTPUT && frame->poc < min_poc) {
-+                    min_poc = frame->poc;
-+                }
-+            }
-+        }
-+
-+        for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
-+            HEVCRpiFrame *frame = &s->DPB[i];
-+            if (frame->flags & HEVC_FRAME_FLAG_OUTPUT &&
-+                frame->sequence == s->seq_output &&
-+                frame->poc <= min_poc) {
-+                frame->flags |= HEVC_FRAME_FLAG_BUMPING;
-+            }
-+        }
-+
-+        dpb--;
-+    }
-+}
-+
-+static int init_slice_rpl(HEVCRpiContext *s)
-+{
-+    if (s->slice_idx >= s->rpl_tab_size)
-+        return AVERROR_INVALIDDATA;
-+
-+    s->refPicList = s->rpl_tab[s->slice_idx].refPicList + 0;
-+    return 0;
-+}
-+
-+int ff_hevc_rpi_slice_rpl(HEVCRpiContext *s)
-+{
-+    RpiSliceHeader *sh = &s->sh;
-+
-+    uint8_t nb_list = sh->slice_type == HEVC_SLICE_B ? 2 : 1;
-+    uint8_t list_idx;
-+    int i, j, ret;
-+
-+    ret = init_slice_rpl(s);
-+    if (ret < 0)
-+        return ret;
-+
-+    if (!(s->rps[ST_CURR_BEF].nb_refs + s->rps[ST_CURR_AFT].nb_refs +
-+          s->rps[LT_CURR].nb_refs)) {
-+        av_log(s->avctx, AV_LOG_ERROR, "Zero refs in the frame RPS.\n");
-+        return AVERROR_INVALIDDATA;
-+    }
-+
-+    for (list_idx = 0; list_idx < nb_list; list_idx++) {
-+        RefPicList  rpl_tmp = { { 0 } };
-+        RefPicList *rpl     = &s->refPicList[list_idx];
-+
-+        /* The order of the elements is
-+         * ST_CURR_BEF - ST_CURR_AFT - LT_CURR for the L0 and
-+         * ST_CURR_AFT - ST_CURR_BEF - LT_CURR for the L1 */
-+        int cand_lists[3] = { list_idx ? ST_CURR_AFT : ST_CURR_BEF,
-+                              list_idx ? ST_CURR_BEF : ST_CURR_AFT,
-+                              LT_CURR };
-+
-+        /* concatenate the candidate lists for the current frame */
-+        while (rpl_tmp.nb_refs < sh->nb_refs[list_idx]) {
-+            for (i = 0; i < FF_ARRAY_ELEMS(cand_lists); i++) {
-+                RefPicList *rps = &s->rps[cand_lists[i]];
-+                for (j = 0; j < rps->nb_refs && rpl_tmp.nb_refs < HEVC_MAX_REFS; j++) {
-+                    rpl_tmp.list[rpl_tmp.nb_refs]       = rps->list[j];
-+                    rpl_tmp.ref[rpl_tmp.nb_refs]        = rps->ref[j];
-+                    rpl_tmp.isLongTerm[rpl_tmp.nb_refs] = i == 2;
-+                    rpl_tmp.nb_refs++;
-+                }
-+            }
-+        }
-+
-+        /* reorder the references if necessary */
-+        if (sh->rpl_modification_flag[list_idx]) {
-+            for (i = 0; i < sh->nb_refs[list_idx]; i++) {
-+                int idx = sh->list_entry_lx[list_idx][i];
-+
-+                if (idx >= rpl_tmp.nb_refs) {
-+                    av_log(s->avctx, AV_LOG_ERROR, "Invalid reference index.\n");
-+                    return AVERROR_INVALIDDATA;
-+                }
-+
-+                rpl->list[i]       = rpl_tmp.list[idx];
-+                rpl->ref[i]        = rpl_tmp.ref[idx];
-+                rpl->isLongTerm[i] = rpl_tmp.isLongTerm[idx];
-+                rpl->nb_refs++;
-+            }
-+        } else {
-+            memcpy(rpl, &rpl_tmp, sizeof(*rpl));
-+            rpl->nb_refs = FFMIN(rpl->nb_refs, sh->nb_refs[list_idx]);
-+        }
-+
-+        if (sh->collocated_list == list_idx &&
-+            sh->collocated_ref_idx < rpl->nb_refs)
-+            s->ref->collocated_ref = rpl->ref[sh->collocated_ref_idx];
-+    }
-+
-+    return 0;
-+}
-+
-+static HEVCRpiFrame *find_ref_idx(HEVCRpiContext *s, int poc)
-+{
-+    int i;
-+    int LtMask = (1 << s->ps.sps->log2_max_poc_lsb) - 1;
-+
-+    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
-+        HEVCRpiFrame *ref = &s->DPB[i];
-+        if (ref->frame->buf[0] && (ref->sequence == s->seq_decode)) {
-+            if ((ref->poc & LtMask) == poc)
-+                return ref;
-+        }
-+    }
-+
-+    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
-+        HEVCRpiFrame *ref = &s->DPB[i];
-+        if (ref->frame->buf[0] && ref->sequence == s->seq_decode) {
-+            if (ref->poc == poc || (ref->poc & LtMask) == poc)
-+                return ref;
-+        }
-+    }
-+
-+    if (s->nal_unit_type != HEVC_NAL_CRA_NUT && !IS_BLA(s))
-+        av_log(s->avctx, AV_LOG_ERROR,
-+               "Could not find ref with POC %d\n", poc);
-+    return NULL;
-+}
-+
-+static void mark_ref(HEVCRpiFrame *frame, int flag)
-+{
-+    frame->flags &= ~(HEVC_FRAME_FLAG_LONG_REF | HEVC_FRAME_FLAG_SHORT_REF);
-+    frame->flags |= flag;
-+}
-+
-+static HEVCRpiFrame *generate_missing_ref(HEVCRpiContext *s, int poc)
-+{
-+    HEVCRpiFrame *frame;
-+    int i, x, y;
-+
-+    frame = alloc_frame(s);
-+    if (!frame)
-+        return NULL;
-+
-+    if (!s->ps.sps->pixel_shift) {
-+        for (i = 0; frame->frame->buf[i]; i++)
-+            memset(frame->frame->buf[i]->data, 1 << (s->ps.sps->bit_depth - 1),
-+                   frame->frame->buf[i]->size);
-+    } else {
-+        for (i = 0; frame->frame->data[i]; i++)
-+            for (y = 0; y < (s->ps.sps->height >> s->ps.sps->vshift[i]); y++)
-+                for (x = 0; x < (s->ps.sps->width >> s->ps.sps->hshift[i]); x++) {
-+                    AV_WN16(frame->frame->data[i] + y * frame_stride1(frame->frame, 1) + 2 * x,
-+                            1 << (s->ps.sps->bit_depth - 1));
-+                }
-+    }
-+
-+    frame->poc      = poc;
-+    frame->sequence = s->seq_decode;
-+    frame->flags    = 0;
-+
-+    ff_hevc_rpi_progress_set_all_done(frame);
-+
-+    return frame;
-+}
-+
-+/* add a reference with the given poc to the list and mark it as used in DPB */
-+static int add_candidate_ref(HEVCRpiContext *s, RefPicList *list,
-+                             int poc, int ref_flag)
-+{
-+    HEVCRpiFrame *ref = find_ref_idx(s, poc);
-+
-+    if (ref == s->ref || list->nb_refs >= HEVC_MAX_REFS)
-+        return AVERROR_INVALIDDATA;
-+
-+    if (!ref) {
-+        ref = generate_missing_ref(s, poc);
-+        if (!ref)
-+            return AVERROR(ENOMEM);
-+    }
-+
-+    list->list[list->nb_refs] = ref->poc;
-+    list->ref[list->nb_refs]  = ref;
-+    list->nb_refs++;
-+
-+    mark_ref(ref, ref_flag);
-+    return 0;
-+}
-+
-+int ff_hevc_rpi_frame_rps(HEVCRpiContext *s)
-+{
-+    const ShortTermRPS *short_rps = s->sh.short_term_rps;
-+    const LongTermRPS  *long_rps  = &s->sh.long_term_rps;
-+    RefPicList               *rps = s->rps;
-+    int i, ret = 0;
-+
-+    if (!short_rps) {
-+        rps[0].nb_refs = rps[1].nb_refs = 0;
-+        return 0;
-+    }
-+
-+    /* clear the reference flags on all frames except the current one */
-+    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
-+        HEVCRpiFrame *frame = &s->DPB[i];
-+
-+        if (frame == s->ref)
-+            continue;
-+
-+        mark_ref(frame, 0);
-+    }
-+
-+    for (i = 0; i < NB_RPS_TYPE; i++)
-+        rps[i].nb_refs = 0;
-+
-+    /* add the short refs */
-+    for (i = 0; i < short_rps->num_delta_pocs; i++) {
-+        int poc = s->poc + short_rps->delta_poc[i];
-+        int list;
-+
-+        if (!short_rps->used[i])
-+            list = ST_FOLL;
-+        else if (i < short_rps->num_negative_pics)
-+            list = ST_CURR_BEF;
-+        else
-+            list = ST_CURR_AFT;
-+
-+        ret = add_candidate_ref(s, &rps[list], poc, HEVC_FRAME_FLAG_SHORT_REF);
-+        if (ret < 0)
-+            goto fail;
-+    }
-+
-+    /* add the long refs */
-+    for (i = 0; i < long_rps->nb_refs; i++) {
-+        int poc  = long_rps->poc[i];
-+        int list = long_rps->used[i] ? LT_CURR : LT_FOLL;
-+
-+        ret = add_candidate_ref(s, &rps[list], poc, HEVC_FRAME_FLAG_LONG_REF);
-+        if (ret < 0)
-+            goto fail;
-+    }
-+
-+fail:
-+    /* release any frames that are now unused */
-+    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++)
-+        ff_hevc_rpi_unref_frame(s, &s->DPB[i], 0);
-+
-+    return ret;
-+}
-+
-+int ff_hevc_rpi_frame_nb_refs(HEVCRpiContext *s)
-+{
-+    int ret = 0;
-+    int i;
-+    const ShortTermRPS *rps = s->sh.short_term_rps;
-+    LongTermRPS *long_rps   = &s->sh.long_term_rps;
-+
-+    if (rps) {
-+        for (i = 0; i < rps->num_negative_pics; i++)
-+            ret += !!rps->used[i];
-+        for (; i < rps->num_delta_pocs; i++)
-+            ret += !!rps->used[i];
-+    }
-+
-+    if (long_rps) {
-+        for (i = 0; i < long_rps->nb_refs; i++)
-+            ret += !!long_rps->used[i];
-+    }
-+    return ret;
-+}
---- /dev/null
-+++ b/libavcodec/rpi_hevc_sei.c
-@@ -0,0 +1,368 @@
-+/*
-+ * HEVC Supplementary Enhancement Information messages
-+ *
-+ * Copyright (C) 2012 - 2013 Guillaume Martres
-+ * Copyright (C) 2012 - 2013 Gildas Cocherel
-+ * Copyright (C) 2013 Vittorio Giovara
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#include "golomb.h"
-+#include "rpi_hevc_ps.h"
-+#include "rpi_hevc_sei.h"
-+
-+static int decode_nal_sei_decoded_picture_hash(HEVCSEIPictureHash *s, GetBitContext *gb)
-+{
-+    int cIdx, i;
-+    uint8_t hash_type;
-+    //uint16_t picture_crc;
-+    //uint32_t picture_checksum;
-+    hash_type = get_bits(gb, 8);
-+
-+    for (cIdx = 0; cIdx < 3/*((s->sps->chroma_format_idc == 0) ? 1 : 3)*/; cIdx++) {
-+        if (hash_type == 0) {
-+            s->is_md5 = 1;
-+            for (i = 0; i < 16; i++)
-+                s->md5[cIdx][i] = get_bits(gb, 8);
-+        } else if (hash_type == 1) {
-+            // picture_crc = get_bits(gb, 16);
-+            skip_bits(gb, 16);
-+        } else if (hash_type == 2) {
-+            // picture_checksum = get_bits_long(gb, 32);
-+            skip_bits(gb, 32);
-+        }
-+    }
-+    return 0;
-+}
-+
-+static int decode_nal_sei_mastering_display_info(HEVCSEIMasteringDisplay *s, GetBitContext *gb)
-+{
-+    int i;
-+    // Mastering primaries
-+    for (i = 0; i < 3; i++) {
-+        s->display_primaries[i][0] = get_bits(gb, 16);
-+        s->display_primaries[i][1] = get_bits(gb, 16);
-+    }
-+    // White point (x, y)
-+    s->white_point[0] = get_bits(gb, 16);
-+    s->white_point[1] = get_bits(gb, 16);
-+
-+    // Max and min luminance of mastering display
-+    s->max_luminance = get_bits_long(gb, 32);
-+    s->min_luminance = get_bits_long(gb, 32);
-+
-+    // As this SEI message comes before the first frame that references it,
-+    // initialize the flag to 2 and decrement on IRAP access unit so it
-+    // persists for the coded video sequence (e.g., between two IRAPs)
-+    s->present = 2;
-+    return 0;
-+}
-+
-+static int decode_nal_sei_content_light_info(HEVCSEIContentLight *s, GetBitContext *gb)
-+{
-+    // Max and average light levels
-+    s->max_content_light_level     = get_bits_long(gb, 16);
-+    s->max_pic_average_light_level = get_bits_long(gb, 16);
-+    // As this SEI message comes before the first frame that references it,
-+    // initialize the flag to 2 and decrement on IRAP access unit so it
-+    // persists for the coded video sequence (e.g., between two IRAPs)
-+    s->present = 2;
-+    return  0;
-+}
-+
-+static int decode_nal_sei_frame_packing_arrangement(HEVCSEIFramePacking *s, GetBitContext *gb)
-+{
-+    get_ue_golomb_long(gb);             // frame_packing_arrangement_id
-+    s->present = !get_bits1(gb);
-+
-+    if (s->present) {
-+        s->arrangement_type               = get_bits(gb, 7);
-+        s->quincunx_subsampling           = get_bits1(gb);
-+        s->content_interpretation_type    = get_bits(gb, 6);
-+
-+        // spatial_flipping_flag, frame0_flipped_flag, field_views_flag
-+        skip_bits(gb, 3);
-+        s->current_frame_is_frame0_flag = get_bits1(gb);
-+        // frame0_self_contained_flag, frame1_self_contained_flag
-+        skip_bits(gb, 2);
-+
-+        if (!s->quincunx_subsampling && s->arrangement_type != 5)
-+            skip_bits(gb, 16);  // frame[01]_grid_position_[xy]
-+        skip_bits(gb, 8);       // frame_packing_arrangement_reserved_byte
-+        skip_bits1(gb);         // frame_packing_arrangement_persistence_flag
-+    }
-+    skip_bits1(gb);             // upsampled_aspect_ratio_flag
-+    return 0;
-+}
-+
-+static int decode_nal_sei_display_orientation(HEVCSEIDisplayOrientation *s, GetBitContext *gb)
-+{
-+    s->present = !get_bits1(gb);
-+
-+    if (s->present) {
-+        s->hflip = get_bits1(gb);     // hor_flip
-+        s->vflip = get_bits1(gb);     // ver_flip
-+
-+        s->anticlockwise_rotation = get_bits(gb, 16);
-+        skip_bits1(gb);     // display_orientation_persistence_flag
-+    }
-+
-+    return 0;
-+}
-+
-+static int decode_nal_sei_pic_timing(HEVCSEIContext *s, GetBitContext *gb, const HEVCRpiParamSets *ps,
-+                                     void *logctx, int size)
-+{
-+    HEVCSEIPictureTiming *h = &s->picture_timing;
-+    HEVCRpiSPS *sps;
-+
-+    if (!ps->sps_list[s->active_seq_parameter_set_id])
-+        return(AVERROR(ENOMEM));
-+    sps = (HEVCRpiSPS*)ps->sps_list[s->active_seq_parameter_set_id]->data;
-+
-+    if (sps->vui.frame_field_info_present_flag) {
-+        int pic_struct = get_bits(gb, 4);
-+        h->picture_struct = AV_PICTURE_STRUCTURE_UNKNOWN;
-+        if (pic_struct == 2 || pic_struct == 10 || pic_struct == 12) {
-+            av_log(logctx, AV_LOG_DEBUG, "BOTTOM Field\n");
-+            h->picture_struct = AV_PICTURE_STRUCTURE_BOTTOM_FIELD;
-+        } else if (pic_struct == 1 || pic_struct == 9 || pic_struct == 11) {
-+            av_log(logctx, AV_LOG_DEBUG, "TOP Field\n");
-+            h->picture_struct = AV_PICTURE_STRUCTURE_TOP_FIELD;
-+        }
-+        get_bits(gb, 2);                   // source_scan_type
-+        get_bits(gb, 1);                   // duplicate_flag
-+        skip_bits1(gb);
-+        size--;
-+    }
-+    skip_bits_long(gb, 8 * size);
-+
-+    return 0;
-+}
-+
-+static int decode_registered_user_data_closed_caption(HEVCSEIA53Caption *s, GetBitContext *gb,
-+                                                      int size)
-+{
-+    int flag;
-+    int user_data_type_code;
-+    int cc_count;
-+
-+    if (size < 3)
-+       return AVERROR(EINVAL);
-+
-+    user_data_type_code = get_bits(gb, 8);
-+    if (user_data_type_code == 0x3) {
-+        skip_bits(gb, 1); // reserved
-+
-+        flag = get_bits(gb, 1); // process_cc_data_flag
-+        if (flag) {
-+            skip_bits(gb, 1);
-+            cc_count = get_bits(gb, 5);
-+            skip_bits(gb, 8); // reserved
-+            size -= 2;
-+
-+            if (cc_count && size >= cc_count * 3) {
-+                const uint64_t new_size = (s->a53_caption_size + cc_count
-+                                           * UINT64_C(3));
-+                int i, ret;
-+
-+                if (new_size > INT_MAX)
-+                    return AVERROR(EINVAL);
-+
-+                /* Allow merging of the cc data from two fields. */
-+                ret = av_reallocp(&s->a53_caption, new_size);
-+                if (ret < 0)
-+                    return ret;
-+
-+                for (i = 0; i < cc_count; i++) {
-+                    s->a53_caption[s->a53_caption_size++] = get_bits(gb, 8);
-+                    s->a53_caption[s->a53_caption_size++] = get_bits(gb, 8);
-+                    s->a53_caption[s->a53_caption_size++] = get_bits(gb, 8);
-+                }
-+                skip_bits(gb, 8); // marker_bits
-+            }
-+        }
-+    } else {
-+        int i;
-+        for (i = 0; i < size - 1; i++)
-+            skip_bits(gb, 8);
-+    }
-+
-+    return 0;
-+}
-+
-+static int decode_nal_sei_user_data_registered_itu_t_t35(HEVCSEIContext *s, GetBitContext *gb,
-+                                                         int size)
-+{
-+    uint32_t country_code;
-+    uint32_t user_identifier;
-+
-+    if (size < 7)
-+        return AVERROR(EINVAL);
-+    size -= 7;
-+
-+    country_code = get_bits(gb, 8);
-+    if (country_code == 0xFF) {
-+        skip_bits(gb, 8);
-+        size--;
-+    }
-+
-+    skip_bits(gb, 8);
-+    skip_bits(gb, 8);
-+
-+    user_identifier = get_bits_long(gb, 32);
-+
-+    switch (user_identifier) {
-+        case MKBETAG('G', 'A', '9', '4'):
-+            return decode_registered_user_data_closed_caption(&s->a53_caption, gb, size);
-+        default:
-+            skip_bits_long(gb, size * 8);
-+            break;
-+    }
-+    return 0;
-+}
-+
-+static int decode_nal_sei_active_parameter_sets(HEVCSEIContext *s, GetBitContext *gb, void *logctx)
-+{
-+    int num_sps_ids_minus1;
-+    int i;
-+    unsigned active_seq_parameter_set_id;
-+
-+    get_bits(gb, 4); // active_video_parameter_set_id
-+    get_bits(gb, 1); // self_contained_cvs_flag
-+    get_bits(gb, 1); // num_sps_ids_minus1
-+    num_sps_ids_minus1 = get_ue_golomb_long(gb); // num_sps_ids_minus1
-+
-+    if (num_sps_ids_minus1 < 0 || num_sps_ids_minus1 > 15) {
-+        av_log(logctx, AV_LOG_ERROR, "num_sps_ids_minus1 %d invalid\n", num_sps_ids_minus1);
-+        return AVERROR_INVALIDDATA;
-+    }
-+
-+    active_seq_parameter_set_id = get_ue_golomb_long(gb);
-+    if (active_seq_parameter_set_id >= HEVC_MAX_SPS_COUNT) {
-+        av_log(logctx, AV_LOG_ERROR, "active_parameter_set_id %d invalid\n", active_seq_parameter_set_id);
-+        return AVERROR_INVALIDDATA;
-+    }
-+    s->active_seq_parameter_set_id = active_seq_parameter_set_id;
-+
-+    for (i = 1; i <= num_sps_ids_minus1; i++)
-+        get_ue_golomb_long(gb); // active_seq_parameter_set_id[i]
-+
-+    return 0;
-+}
-+
-+static int decode_nal_sei_alternative_transfer(HEVCSEIAlternativeTransfer *s, GetBitContext *gb)
-+{
-+    s->present = 1;
-+    s->preferred_transfer_characteristics = get_bits(gb, 8);
-+    return 0;
-+}
-+
-+static int decode_nal_sei_prefix(GetBitContext *gb, void *logctx, HEVCSEIContext *s, const HEVCRpiParamSets *ps,
-+                                 int type, int size)
-+{
-+    switch (type) {
-+    case 256:  // Mismatched value from HM 8.1
-+        return decode_nal_sei_decoded_picture_hash(&s->picture_hash, gb);
-+    case HEVC_SEI_TYPE_FRAME_PACKING:
-+        return decode_nal_sei_frame_packing_arrangement(&s->frame_packing, gb);
-+    case HEVC_SEI_TYPE_DISPLAY_ORIENTATION:
-+        return decode_nal_sei_display_orientation(&s->display_orientation, gb);
-+    case HEVC_SEI_TYPE_PICTURE_TIMING:
-+        return decode_nal_sei_pic_timing(s, gb, ps, logctx, size);
-+    case HEVC_SEI_TYPE_MASTERING_DISPLAY_INFO:
-+        return decode_nal_sei_mastering_display_info(&s->mastering_display, gb);
-+    case HEVC_SEI_TYPE_CONTENT_LIGHT_LEVEL_INFO:
-+        return decode_nal_sei_content_light_info(&s->content_light, gb);
-+    case HEVC_SEI_TYPE_ACTIVE_PARAMETER_SETS:
-+        return decode_nal_sei_active_parameter_sets(s, gb, logctx);
-+    case HEVC_SEI_TYPE_USER_DATA_REGISTERED_ITU_T_T35:
-+        return decode_nal_sei_user_data_registered_itu_t_t35(s, gb, size);
-+    case HEVC_SEI_TYPE_ALTERNATIVE_TRANSFER_CHARACTERISTICS:
-+        return decode_nal_sei_alternative_transfer(&s->alternative_transfer, gb);
-+    default:
-+        av_log(logctx, AV_LOG_DEBUG, "Skipped PREFIX SEI %d\n", type);
-+        skip_bits_long(gb, 8 * size);
-+        return 0;
-+    }
-+}
-+
-+static int decode_nal_sei_suffix(GetBitContext *gb, void *logctx, HEVCSEIContext *s,
-+                                 int type, int size)
-+{
-+    switch (type) {
-+    case HEVC_SEI_TYPE_DECODED_PICTURE_HASH:
-+        return decode_nal_sei_decoded_picture_hash(&s->picture_hash, gb);
-+    default:
-+        av_log(logctx, AV_LOG_DEBUG, "Skipped SUFFIX SEI %d\n", type);
-+        skip_bits_long(gb, 8 * size);
-+        return 0;
-+    }
-+}
-+
-+static int decode_nal_sei_message(GetBitContext * const gb, void * const logctx, HEVCSEIContext * const s,
-+                                  const HEVCRpiParamSets * const ps, const int nal_unit_type)
-+{
-+    int payload_type = 0;
-+    int payload_size = 0;
-+    int byte = 0xFF;
-+    av_log(logctx, AV_LOG_DEBUG, "Decoding SEI\n");
-+
-+    while (byte == 0xFF) {
-+       if (get_bits_left(gb) < 16 || payload_type > INT_MAX - 255)
-+           return AVERROR_INVALIDDATA;
-+        byte          = get_bits(gb, 8);
-+        payload_type += byte;
-+    }
-+    byte = 0xFF;
-+    while (byte == 0xFF) {
-+        if (get_bits_left(gb) < 8 + 8LL*payload_size)
-+            return AVERROR_INVALIDDATA;
-+         byte          = get_bits(gb, 8);
-+        payload_size += byte;
-+    }
-+    if (nal_unit_type == HEVC_NAL_SEI_PREFIX) {
-+        return decode_nal_sei_prefix(gb, logctx, s, ps, payload_type, payload_size);
-+    } else { /* nal_unit_type == NAL_SEI_SUFFIX */
-+        return decode_nal_sei_suffix(gb, logctx, s, payload_type, payload_size);
-+    }
-+}
-+
-+static int more_rbsp_data(GetBitContext *gb)
-+{
-+    return get_bits_left(gb) > 0 && show_bits(gb, 8) != 0x80;
-+}
-+
-+int ff_hevc_rpi_decode_nal_sei(GetBitContext *gb, void *logctx, HEVCSEIContext *s,
-+                           const HEVCRpiParamSets *ps, int type)
-+{
-+    int ret;
-+
-+    do {
-+        ret = decode_nal_sei_message(gb, logctx, s, ps, type);
-+        if (ret < 0)
-+            return ret;
-+    } while (more_rbsp_data(gb));
-+    return 1;
-+}
-+
-+void ff_hevc_rpi_reset_sei(HEVCSEIContext *s)
-+{
-+    s->a53_caption.a53_caption_size = 0;
-+    av_freep(&s->a53_caption.a53_caption);
-+}
---- /dev/null
-+++ b/libavcodec/rpi_hevc_sei.h
-@@ -0,0 +1,135 @@
-+/*
-+ * HEVC Supplementary Enhancement Information messages
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#ifndef AVCODEC_RPI_HEVC_SEI_H
-+#define AVCODEC_RPI_HEVC_SEI_H
-+
-+#include <stdint.h>
-+
-+#include "libavutil/md5.h"
-+
-+#include "get_bits.h"
-+
-+/**
-+ * SEI message types
-+ */
-+typedef enum {
-+    HEVC_SEI_TYPE_BUFFERING_PERIOD                     = 0,
-+    HEVC_SEI_TYPE_PICTURE_TIMING                       = 1,
-+    HEVC_SEI_TYPE_PAN_SCAN_RECT                        = 2,
-+    HEVC_SEI_TYPE_FILLER_PAYLOAD                       = 3,
-+    HEVC_SEI_TYPE_USER_DATA_REGISTERED_ITU_T_T35       = 4,
-+    HEVC_SEI_TYPE_USER_DATA_UNREGISTERED               = 5,
-+    HEVC_SEI_TYPE_RECOVERY_POINT                       = 6,
-+    HEVC_SEI_TYPE_SCENE_INFO                           = 9,
-+    HEVC_SEI_TYPE_FULL_FRAME_SNAPSHOT                  = 15,
-+    HEVC_SEI_TYPE_PROGRESSIVE_REFINEMENT_SEGMENT_START = 16,
-+    HEVC_SEI_TYPE_PROGRESSIVE_REFINEMENT_SEGMENT_END   = 17,
-+    HEVC_SEI_TYPE_FILM_GRAIN_CHARACTERISTICS           = 19,
-+    HEVC_SEI_TYPE_POST_FILTER_HINT                     = 22,
-+    HEVC_SEI_TYPE_TONE_MAPPING_INFO                    = 23,
-+    HEVC_SEI_TYPE_FRAME_PACKING                        = 45,
-+    HEVC_SEI_TYPE_DISPLAY_ORIENTATION                  = 47,
-+    HEVC_SEI_TYPE_SOP_DESCRIPTION                      = 128,
-+    HEVC_SEI_TYPE_ACTIVE_PARAMETER_SETS                = 129,
-+    HEVC_SEI_TYPE_DECODING_UNIT_INFO                   = 130,
-+    HEVC_SEI_TYPE_TEMPORAL_LEVEL0_INDEX                = 131,
-+    HEVC_SEI_TYPE_DECODED_PICTURE_HASH                 = 132,
-+    HEVC_SEI_TYPE_SCALABLE_NESTING                     = 133,
-+    HEVC_SEI_TYPE_REGION_REFRESH_INFO                  = 134,
-+    HEVC_SEI_TYPE_MASTERING_DISPLAY_INFO               = 137,
-+    HEVC_SEI_TYPE_CONTENT_LIGHT_LEVEL_INFO             = 144,
-+    HEVC_SEI_TYPE_ALTERNATIVE_TRANSFER_CHARACTERISTICS = 147,
-+} HEVC_SEI_Type;
-+
-+typedef struct HEVCSEIPictureHash {
-+    uint8_t       md5[3][16];
-+    uint8_t is_md5;
-+} HEVCSEIPictureHash;
-+
-+typedef struct HEVCSEIFramePacking {
-+    int present;
-+    int arrangement_type;
-+    int content_interpretation_type;
-+    int quincunx_subsampling;
-+    int current_frame_is_frame0_flag;
-+} HEVCSEIFramePacking;
-+
-+typedef struct HEVCSEIDisplayOrientation {
-+    int present;
-+    int anticlockwise_rotation;
-+    int hflip, vflip;
-+} HEVCSEIDisplayOrientation;
-+
-+typedef struct HEVCSEIPictureTiming {
-+    int picture_struct;
-+} HEVCSEIPictureTiming;
-+
-+typedef struct HEVCSEIA53Caption {
-+    int a53_caption_size;
-+    uint8_t *a53_caption;
-+} HEVCSEIA53Caption;
-+
-+typedef struct HEVCSEIMasteringDisplay {
-+    int present;
-+    uint16_t display_primaries[3][2];
-+    uint16_t white_point[2];
-+    uint32_t max_luminance;
-+    uint32_t min_luminance;
-+} HEVCSEIMasteringDisplay;
-+
-+typedef struct HEVCSEIContentLight {
-+    int present;
-+    uint16_t max_content_light_level;
-+    uint16_t max_pic_average_light_level;
-+} HEVCSEIContentLight;
-+
-+typedef struct HEVCSEIAlternativeTransfer {
-+    int present;
-+    int preferred_transfer_characteristics;
-+} HEVCSEIAlternativeTransfer;
-+
-+typedef struct HEVCSEIContext {
-+    HEVCSEIPictureHash picture_hash;
-+    HEVCSEIFramePacking frame_packing;
-+    HEVCSEIDisplayOrientation display_orientation;
-+    HEVCSEIPictureTiming picture_timing;
-+    HEVCSEIA53Caption a53_caption;
-+    HEVCSEIMasteringDisplay mastering_display;
-+    HEVCSEIContentLight content_light;
-+    int active_seq_parameter_set_id;
-+    HEVCSEIAlternativeTransfer alternative_transfer;
-+} HEVCSEIContext;
-+
-+struct HEVCRpiParamSets;
-+
-+int ff_hevc_rpi_decode_nal_sei(GetBitContext *gb, void *logctx, HEVCSEIContext *s,
-+                           const struct HEVCRpiParamSets *ps, int type);
-+
-+/**
-+ * Reset SEI values that are stored on the Context.
-+ * e.g. Caption data that was extracted during NAL
-+ * parsing.
-+ *
-+ * @param s HEVCRpiContext.
-+ */
-+void ff_hevc_rpi_reset_sei(HEVCSEIContext *s);
-+
-+#endif /* AVCODEC_RPI_HEVC_SEI_H */
---- /dev/null
-+++ b/libavcodec/rpi_hevc_shader.c
-@@ -0,0 +1,1537 @@
-+#include "rpi_hevc_shader.h"
-+
-+#ifdef _MSC_VER
-+   #include <stdint.h>
-+   /* cast through uintptr_t to avoid warnings */
-+   #define POINTER_TO_UINT(X) ((unsigned int)(uintptr_t)(X))
-+#else
-+   #define POINTER_TO_UINT(X) ((unsigned int)(X))
-+#endif
-+
-+#ifdef __cplusplus
-+extern "C" { /* the types are probably wrong... */
-+#endif
-+#ifdef __cplusplus
-+}
-+#endif
-+
-+#ifdef _MSC_VER
-+__declspec(align(8))
-+#elif defined(__GNUC__)
-+__attribute__((aligned(8)))
-+#endif
-+unsigned int ff_hevc_rpi_shader[] = {
-+// ::mc_setup_c_q0
-+// ::mc_start
-+/* [0x00000000] */ 0x0000000c, 0xe80009e7, // mov  dst, srel(i)
-+// ::mc_setup_c_qn
-+/* [0x00000008] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1                  ; mov ra0, unif
-+/* [0x00000010] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
-+/* [0x00000018] */ 0x9181e1f6, 0xd00250d8, // shl rb_ef, r0, i_shift30      ; mov ra_base, unif
-+/* [0x00000020] */ 0x0d801dc0, 0xd0020827, // sub r0, unif, 1
-+/* [0x00000028] */ 0x119c11c0, 0xd00216a7, // shl rb_max_x, r0, v_x_shift
-+/* [0x00000030] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1
-+/* [0x00000038] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100
-+/* [0x00000040] */ 0x000000ff, 0xe0021627, // mov rb_pmask, v_pmask
-+/* [0x00000048] */ 0x001000ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
-+/* [0x00000050] */ 0x00004000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8))
-+/* [0x00000058] */ 0x4000000e, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth)
-+/* [0x00000060] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef              ; mov rb_xpitch, unif
-+/* [0x00000068] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
-+/* [0x00000070] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-+/* [0x00000078] */ 0x0c9d03c0, 0x10021667, // add rb_dma1_base, r1, rb_pitch
-+/* [0x00000080] */ 0x14981f80, 0xd0020827, // and r0, 1, elem_num
-+/* [0x00000088] */ 0x409c5007, 0xd00049e0, // nop                           ; mul24 r0, r0, 5
-+/* [0x00000090] */ 0x0c9a7180, 0x100210a7, // add rb_elem_x, r0, elem_num
-+/* [0x00000098] */ 0x11001dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift
-+/* [0x000000a0] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x
-+/* [0x000000a8] */ 0x930001f6, 0xd2225811, // max r0, r0, 0                 ; mov ra_y, ra0.16a
-+/* [0x000000b0] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
-+/* [0x000000b8] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
-+/* [0x000000c0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
-+/* [0x000000c8] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch
-+/* [0x000000d0] */ 0x149e7040, 0x10020867, // and r1, r0, r1
-+/* [0x000000d8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
-+/* [0x000000e0] */ 0x8c827076, 0x10025800, // add r0, r0, r1                ; mov ra0, unif
-+/* [0x000000e8] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0
-+/* [0x000000f0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+/* [0x000000f8] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2
-+/* [0x00000100] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+/* [0x00000108] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3
-+/* [0x00000110] */ 0x159e7040, 0x10020827, // or  r0, r0, r1
-+/* [0x00000118] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-+/* [0x00000120] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
-+/* [0x00000128] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-+/* [0x00000130] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-+/* [0x00000138] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
-+/* [0x00000140] */ 0x11001dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift
-+/* [0x00000148] */ 0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x         ; mov ra_y2, ra0.16a
-+/* [0x00000150] */ 0x938001f6, 0xd002480f, // max r0, r0, 0                 ; mov rb_base2, unif
-+/* [0x00000158] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
-+/* [0x00000160] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
-+/* [0x00000168] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
-+/* [0x00000170] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch
-+/* [0x00000178] */ 0x949c307f, 0xd0024863, // and r1, r0, r1                ; mov r3, PREREAD
-+/* [0x00000180] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
-+/* [0x00000188] */ 0x8c467076, 0x12024822, // add r0, r0, r1                ; mov r2, ra_y2
-+/* [0x00000190] */ 0x8c44fe36, 0x140253e0, // add rb_base2, rb_base2, r0    ; mov r0, ra_y
-+// :1
-+/* [0x00000198] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
-+/* [0x000001a0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0
-+/* [0x000001a8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
-+/* [0x000001b0] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1             ; mul24 r1, r1, rb_pitch
-+/* [0x000001b8] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1          ; mov ra_y, r0
-+/* [0x000001c0] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0
-+/* [0x000001c8] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b
-+/* [0x000001d0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
-+/* [0x000001d8] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1             ; mul24 r1, r1, rb_pitch
-+/* [0x000001e0] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1         ; mov ra_y2, r2
-+/* [0x000001e8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x000001f0] */ 0x00000000, 0xe0024104, // mov ra4, 0                    ; mov rb4, 0
-+/* [0x000001f8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000200] */ 0x00000000, 0xe0024145, // mov ra5, 0                    ; mov rb5, 0
-+/* [0x00000208] */ 0x00000000, 0xe0024186, // mov ra6, 0                    ; mov rb6, 0
-+/* [0x00000210] */ 0x00000000, 0xe00241c7, // mov ra7, 0                    ; mov rb7, 0
-+// ::mc_filter_c_p
-+/* [0x00000218] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init     ; mov ra2, unif
-+/* [0x00000220] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef      ; mov r3, unif
-+/* [0x00000228] */ 0xf1081dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift    ; v8subs r5rep, r0, r0
-+/* [0x00000230] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x         ; mov ra_width_height, unif
-+/* [0x00000238] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch          ; mov ra0, unif
-+/* [0x00000240] */ 0x93567176, 0x14024800, // max r0, r0, r5                ; mov vrx_xshift, vrx_xshift_next
-+/* [0x00000248] */ 0x9209a1f6, 0x12225813, // min r0, r0, rb_max_x          ; mov vra_y_next, ra2.16a
-+/* [0x00000250] */ 0x119c31c0, 0xd0220567, // shl vrx_xshift_next, r0, 3
-+/* [0x00000258] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
-+/* [0x00000260] */ 0x54402077, 0xd4024862, // and r1, r0, r1                ; mul24 r2, ra_width, v_x_mul
-+/* [0x00000268] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
-+/* [0x00000270] */ 0x8c827076, 0x10025803, // add r0, r0, r1                ; mov ra3, unif
-+/* [0x00000278] */ 0x8c427636, 0x120246a1, // add vrx_base_next, r3, r0     ; mov r1, ra_height
-+/* [0x00000280] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
-+/* [0x00000288] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
-+/* [0x00000290] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4)      ; mov.ifc ra_wt_off_mul_l0, unif
-+/* [0x00000298] */ 0x918073f6, 0xd002581c, // shl r0, r1, v_dma_h_shift     ; mov ra_dest, unif
-+/* [0x000002a0] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2                ; mov r2, ra_fir_off_val
-+/* [0x000002a8] */ 0x910d01f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift    ; mov rb10, ra3.8c
-+/* [0x000002b0] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0
-+/* [0x000002b8] */ 0x5158c3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5       ; mul24 r0, r2, ra_wt_mul_l0
-+/* [0x000002c0] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0         ; mov r0, ra_kmul_add
-+/* [0x000002c8] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4
-+/* [0x000002d0] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d              ; mov ra_link, unif
-+// :1
-+/* [0x000002d8] */ 0x8d151bf6, 0xa00269c4, // sub.setf -, r5, rb_i_tmu      ; mov rb4, ra5                  ; ldtmu0
-+/* [0x000002e0] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, vrx_xshift        ; mov.ifz  r3, vra_y_next
-+/* [0x000002e8] */ 0x8e4485f6, 0xd402c863, // shr r1, r2, v_v_shift         ; mov.ifnz r3, vra_y
-+/* [0x000002f0] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef      ; mov.ifz  vra_base, vrx_base_next
-+/* [0x000002f8] */ 0x8c531789, 0xda224460, // add vra_y, r3, ra_k1          ; mov      r0, r1 << 15
-+/* [0x00000300] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0             ; mov.ifnc r1, r2 << 1
-+/* [0x00000308] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y          ; mov.ifnc r0, r2
-+/* [0x00000310] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax           ; mul24 r3, r3, rb_pitch
-+/* [0x00000318] */ 0x8c618cc7, 0x10024e20, // add vr_txs, vra_base, r3      ; v8min r0, r0, rb_pmask
-+/* [0x00000320] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1              ; mul24      r3, ra0.8a,       r0
-+/* [0x00000328] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3      ; mul24      r3, ra0.8d,       r1
-+/* [0x00000330] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3                ; mul24      r3, ra0.8b << 2,  r0 << 2  @ "mul_used", 0
-+/* [0x00000338] */ 0x40034031, 0xda0109e3, // nop                           ; mul24.ifn  r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
-+/* [0x00000340] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3                ; mul24      r3, ra0.8c << 4,  r0 << 4  @ "mul_used", 0
-+/* [0x00000348] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5            ; mul24.ifn  r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
-+/* [0x00000350] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x00000358] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3                ; mul24 r0, ra7, rb10
-+/* [0x00000360] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6                  ; mul24 r1, rb6, ra3.8b
-+/* [0x00000368] */ 0x8f1c05f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8  ; mov rb6, ra7
-+/* [0x00000370] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0                ; mul24 r0, rb4, ra3.8a
-+/* [0x00000378] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0                ; mul24 r0, ra7, rb11
-+/* [0x00000380] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
-+/* [0x00000388] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6                 ; mov r3, ra_blk_height
-+/* [0x00000390] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount     ; mul24 r0, r1, ra_wt_mul_l0
-+/* [0x00000398] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off         ; mul24 r1, r1, ra_kmul_add
-+/* [0x000003a0] */ 0xed427073, 0x12024860, // sub r1, r0, r1                ; v8subs r0, ra_height, r3
-+/* [0x000003a8] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x000003b0] */ 0x0f9cd3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6
-+/* [0x000003b8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax           ; mov -, vw_wait
-+/* [0x000003c0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0            ; mul24 r2, r3, rb_pitch
-+/* [0x000003c8] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0        ; mov vw_setup, ra_dma0
-+/* [0x000003d0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
-+/* [0x000003d8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3                ; mov vw_setup, rb_dma1
-+/* [0x000003e0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3                ; mov vw_addr, ra_dest
-+/* [0x000003e8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
-+/* [0x000003f0] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b
-+/* [0x000003f8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
-+/* [0x00000400] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
-+/* [0x00000408] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init
-+// ::mc_filter_c_p_l1
-+/* [0x00000410] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init     ; mov ra2, unif
-+/* [0x00000418] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef      ; mov r3, unif
-+/* [0x00000420] */ 0xf1081dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift    ; v8subs r5rep, r0, r0
-+/* [0x00000428] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x         ; mov ra_width_height, unif
-+/* [0x00000430] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch          ; mov ra0, unif
-+/* [0x00000438] */ 0x939c117f, 0x10125815, // max r0, r0, r5                ; mov vrx_xshift, vrx_xshift_next
-+/* [0x00000440] */ 0x9209a1f6, 0x12125813, // min r0, r0, rb_max_x          ; mov vra_y_next, ra2.16a
-+/* [0x00000448] */ 0x119c31c0, 0xd0021067, // shl vrx_xshift_next, r0, 3
-+/* [0x00000450] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
-+/* [0x00000458] */ 0x54402077, 0xd4024862, // and r1, r0, r1                ; mul24 r2, ra_width, v_x_mul
-+/* [0x00000460] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
-+/* [0x00000468] */ 0x8c827076, 0x10025803, // add r0, r0, r1                ; mov ra3, unif
-+/* [0x00000470] */ 0x8c427636, 0x120254e1, // add vrx_base_next, r3, r0     ; mov r1, ra_height
-+/* [0x00000478] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
-+/* [0x00000480] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
-+/* [0x00000488] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4)      ; mov.ifc ra_wt_off_mul_l0, unif
-+/* [0x00000490] */ 0x918073f6, 0xd002581c, // shl r0, r1, v_dma_h_shift     ; mov ra_dest, unif
-+/* [0x00000498] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2                ; mov r2, ra_fir_off_val
-+/* [0x000004a0] */ 0x910d01f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift    ; mov rb10, ra3.8c
-+/* [0x000004a8] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0
-+/* [0x000004b0] */ 0x5158c3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5       ; mul24 r0, r2, ra_wt_mul_l0
-+/* [0x000004b8] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0         ; mov r0, ra_kmul_add
-+/* [0x000004c0] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4
-+/* [0x000004c8] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d              ; mov ra_link, unif
-+// :1
-+/* [0x000004d0] */ 0x8d151bf6, 0xb00269c4, // sub.setf -, r5, rb_i_tmu      ; mov rb4, ra5                  ; ldtmu1
-+/* [0x000004d8] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, vrx_xshift        ; mov.ifz  vra_base, vrx_base_next
-+/* [0x000004e0] */ 0x8e4485f6, 0xd202c863, // shr r1, r2, v_v_shift         ; mov.ifnz r3, vra_y
-+/* [0x000004e8] */ 0x8c4c3ff6, 0x1202a9e3, // add.setf -, rb_ef, rb_ef      ; mov.ifz  r3, vra_y_next
-+/* [0x000004f0] */ 0x8c531789, 0xda124460, // add vra_y, r3, ra_k1          ; mov      r0, r1 << 15
-+/* [0x000004f8] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0             ; mov.ifnc r1, r2 << 1
-+/* [0x00000500] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y          ; mov.ifnc r0, r2
-+/* [0x00000508] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax           ; mul24 r3, r3, rb_pitch
-+/* [0x00000510] */ 0x8c5cfec6, 0x12024f20, // add vr_txs, vra_base, r3      ; v8min r0, r0, ra_pmax
-+/* [0x00000518] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1              ; mul24      r3, ra0.8a,       r0
-+/* [0x00000520] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3      ; mul24      r3, ra0.8d,       r1
-+/* [0x00000528] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3                ; mul24      r3, ra0.8b << 2,  r0 << 2  @ "mul_used", 0
-+/* [0x00000530] */ 0x40034031, 0xda0109e3, // nop                           ; mul24.ifn  r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
-+/* [0x00000538] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3                ; mul24      r3, ra0.8c << 4,  r0 << 4  @ "mul_used", 0
-+/* [0x00000540] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5            ; mul24.ifn  r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
-+/* [0x00000548] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x00000550] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3                ; mul24 r0, ra7, rb10
-+/* [0x00000558] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6                  ; mul24 r1, rb6, ra3.8b
-+/* [0x00000560] */ 0x8f1c05f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8  ; mov rb6, ra7
-+/* [0x00000568] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0                ; mul24 r0, rb4, ra3.8a
-+/* [0x00000570] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0                ; mul24 r0, ra7, rb11
-+/* [0x00000578] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
-+/* [0x00000580] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6                 ; mov r3, ra_blk_height
-+/* [0x00000588] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount     ; mul24 r0, r1, ra_wt_mul_l0
-+/* [0x00000590] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off         ; mul24 r1, r1, ra_kmul_add
-+/* [0x00000598] */ 0xed427073, 0x12024860, // sub r1, r0, r1                ; v8subs r0, ra_height, r3
-+/* [0x000005a0] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x000005a8] */ 0x0f9cd3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6
-+/* [0x000005b0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax           ; mov -, vw_wait
-+/* [0x000005b8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0            ; mul24 r2, r3, rb_pitch
-+/* [0x000005c0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0        ; mov vw_setup, ra_dma0
-+/* [0x000005c8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
-+/* [0x000005d0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3                ; mov vw_setup, rb_dma1
-+/* [0x000005d8] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3                ; mov vw_addr, ra_dest
-+/* [0x000005e0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
-+/* [0x000005e8] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b
-+/* [0x000005f0] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
-+/* [0x000005f8] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
-+/* [0x00000600] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init
-+// ::mc_filter_c_b
-+/* [0x00000608] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init     ; mov ra2, unif
-+/* [0x00000610] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef      ; mov r3, unif
-+/* [0x00000618] */ 0xf1081dc9, 0xd4024825, // shl r0, ra2.16b, v_x_shift    ; v8subs r5rep, r1, r1
-+/* [0x00000620] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x         ; mov ra_y_next, ra2.16a
-+/* [0x00000628] */ 0x8d810bf6, 0x10025850, // sub r1, r5, rb_pitch          ; mov ra_width_height, unif
-+/* [0x00000630] */ 0x93567176, 0x14125815, // max r0, r0, r5                ; mov ra_xshift, ra_xshift_next
-+/* [0x00000638] */ 0x9281a1f6, 0x10025800, // min r0, r0, rb_max_x          ; mov ra0, unif
-+/* [0x00000640] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
-+/* [0x00000648] */ 0x9481c1f6, 0xd0025802, // and r0, r0, -4                ; mov ra2, unif
-+/* [0x00000650] */ 0x54402077, 0xd4024862, // and r1, r0, r1                ; mul24 r2, ra_width, v_x_mul
-+/* [0x00000658] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
-+/* [0x00000660] */ 0x8c427076, 0x12024821, // add r0, r0, r1                ; mov r1, ra_height
-+/* [0x00000668] */ 0x8c9c163f, 0x10024680, // add ra_base_next, r3, r0      ; mov rb_xshift2, rb_xshift2_next
-+/* [0x00000670] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
-+/* [0x00000678] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
-+/* [0x00000680] */ 0x8c59f3f6, 0xd4139496, // add rb_lcount, r1, (3-4)      ; mov.ifc ra_wt_mul_l0, ra_wt_off_l0
-+/* [0x00000688] */ 0x918073f6, 0xd0025803, // shl r0, r1, v_dma_h_shift     ; mov ra3, unif
-+/* [0x00000690] */ 0x8c8270b6, 0x10024823, // add r0, r0, r2                ; mov r3, unif
-+/* [0x00000698] */ 0x910d01f6, 0xd2125813, // shl r0, r0, v_dma_wh_shift    ; mov ra_y2_next, ra3.16a
-+/* [0x000006a0] */ 0x8c0db1f6, 0x140246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, ra3.16b
-+/* [0x000006a8] */ 0x918011f6, 0xd0025801, // shl r0, r0, v_x_shift         ; mov ra1, unif
-+/* [0x000006b0] */ 0x8c8021f6, 0x10025803, // add r0, r0, rb_elem_x         ; mov ra3, unif
-+/* [0x000006b8] */ 0x8d810bf6, 0x10025852, // sub r1, r5, rb_pitch          ; mov ra_wt_off_mul_l1, unif
-+/* [0x000006c0] */ 0x939de17f, 0x10025809, // max r0, r0, r5                ; mov ra9, rb_max_y
-+/* [0x000006c8] */ 0x9265a1f6, 0x14024822, // min r0, r0, rb_max_x          ; mov r2, ra_kmul_add
-+/* [0x000006d0] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
-+/* [0x000006d8] */ 0x9481c1f6, 0xd0039812, // and r0, r0, -4                ; mov.ifc ra_wt_off_mul_l1, unif
-+/* [0x000006e0] */ 0x949dc07f, 0xd0024865, // and r1, r0, r1                ; mov r5rep, -4
-+/* [0x000006e8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
-+/* [0x000006f0] */ 0x8c827076, 0x1002581c, // add r0, r0, r1                ; mov ra_dest, unif
-+/* [0x000006f8] */ 0x8c667636, 0x140254e0, // add rb_base2_next, r3, r0     ; mov r0, ra_fir_off_val
-+/* [0x00000700] */ 0x4c5a7c86, 0x121245a1, // add ra_wt_mul_l0, ra_wt_mul_l0, r2 ; mul24 r1, r0, ra_wt_mul_l0
-+/* [0x00000708] */ 0x4c4a7c86, 0x121244a0, // add ra_wt_mul_l1, ra_wt_mul_l1, r2 ; mul24 r0, r0, ra_wt_mul_l1
-+/* [0x00000710] */ 0x8c4a7076, 0x14024821, // add r0, r0, r1                ; mov r1, ra_wt_off_l1
-+/* [0x00000718] */ 0x910cd3f6, 0xde02484b, // shl r1, r1, i_wt_den_p6       ; mov rb11, ra3.8d
-+/* [0x00000720] */ 0x8d827236, 0x1002531e, // sub rb_wt_off, r1, r0         ; mov ra_link, unif
-+/* [0x00000728] */ 0x95080ff6, 0x1e024287, // mov ra10, rb_xshift2          ; mov rb7,  ra2.8d
-+// :1
-+/* [0x00000730] */ 0x0d9d1bc0, 0xa00229e7, // sub.setf -, r5, rb_i_tmu      ; nop                           ; ldtmu0
-+/* [0x00000738] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, ra_xshift         ; mov.ifz rb_base2, rb_base2_next
-+/* [0x00000740] */ 0x8e4c85f6, 0xd0029851, // shr r1, r2, v_v_shift         ; mov.ifz ra_y_y2, ra_y_y2_next
-+/* [0x00000748] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef      ; mov.ifz ra_base, ra_base_next
-+/* [0x00000750] */ 0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y             ; mov r3, ra_y
-+/* [0x00000758] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0             ; mov      r0, r1 << 15
-+/* [0x00000760] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9               ; mov.ifnc r1, r2 << 1
-+/* [0x00000768] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2               ; mul24 r3, r3, rb_pitch
-+/* [0x00000770] */ 0x8c618cc7, 0x10024e20, // add t0s, ra_base, r3          ; v8min r0, r0, rb_pmask
-+/* [0x00000778] */ 0x540183f0, 0x18024862, // and r1, r1, rb_pmask          ; mul24      r2, ra0.8a,       r0
-+/* [0x00000780] */ 0x4d01feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2      ; mul24      r3, ra0.8d,       r1
-+/* [0x00000788] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3                ; mul24      r3, ra0.8b << 2,  r0 << 2  @ "mul_used", 0
-+/* [0x00000790] */ 0x40034031, 0xda0109e3, // nop                           ; mul24.ifn  r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
-+/* [0x00000798] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3                ; mul24      r3, ra0.8c << 4,  r0 << 4  @ "mul_used", 0
-+/* [0x000007a0] */ 0x40032031, 0xdc0109e3, // nop                           ; mul24.ifn  r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
-+/* [0x000007a8] */ 0x4c0854fe, 0xb8025804, // add r0, r2, r3                ; mul24 ra4, rb5, ra2.8a        ; ldtmu1
-+/* [0x000007b0] */ 0x8e2869bf, 0x10024885, // shr r2, r4, ra10              ; mov rb5, rb6
-+/* [0x000007b8] */ 0x8e4485f6, 0xd2024863, // shr r1, r2, v_v_shift         ; mov r3, ra_y2
-+/* [0x000007c0] */ 0x8e1c01f6, 0xd00241c6, // shr ra7, r0, v_bit_depth - 8  ; mov rb6, ra7
-+/* [0x000007c8] */ 0x8c531789, 0xda124460, // add ra_y2, r3, ra_k1          ; mov      r0, r1 << 15
-+/* [0x000007d0] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0             ; mov.ifnc r1, r2 << 1
-+/* [0x000007d8] */ 0x925de7ce, 0x120248e1, // min r3, r3, rb_max_y          ; v8min r1, r1, ra_pmax
-+/* [0x000007e0] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2               ; mul24 r3, r3, rb_pitch
-+/* [0x000007e8] */ 0x8c5cfec6, 0x12024f20, // add t1s, rb_base2, r3         ; v8min r0, r0, ra_pmax
-+/* [0x000007f0] */ 0x4c041bf0, 0xd8025962, // add r5rep, r5, 1              ; mul24      r2, ra1.8a,       r0
-+/* [0x000007f8] */ 0x4d05feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2      ; mul24      r3, ra1.8d,       r1
-+/* [0x00000800] */ 0x4d07e4f0, 0xda0248a3, // sub r2, r2, r3                ; mul24      r3, ra1.8b << 2,  r0 << 2  @ "mul_used", 0
-+/* [0x00000808] */ 0x40074031, 0xda0109e3, // nop                           ; mul24.ifn  r3, ra1.8b << 12, r1 << 12 @ "mul_used", 0
-+/* [0x00000810] */ 0x4c07c6b0, 0xdc0248a3, // add r2, r3, r2                ; mul24      r3, ra1.8c << 4,  r0 << 4  @ "mul_used", 0
-+/* [0x00000818] */ 0x4c072b71, 0xdc0329e3, // add.setf -, r5, r5            ; mul24.ifn  r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
-+/* [0x00000820] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x00000828] */ 0x4c0c94fe, 0x180248a0, // add r2, r2, r3                ; mul24 r0, rb9,  ra3.8a
-+/* [0x00000830] */ 0x550caffe, 0x1a025261, // mov rb9, rb10                 ; mul24 r1, rb10, ra3.8b
-+/* [0x00000838] */ 0x8e2c05f6, 0xd00242ca, // shr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
-+/* [0x00000840] */ 0x4d08523e, 0x1a0248a1, // sub r2, r1, r0                ; mul24 r1, rb5,  ra2.8b
-+/* [0x00000848] */ 0x8d112bf6, 0x100269e0, // sub.setf -, r5, rb_lcount     ; mov r0, ra4
-+/* [0x00000850] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0                ; mul24 r0, rb6,  ra2.8c
-+/* [0x00000858] */ 0x4c1c7237, 0x10024860, // add r1, r1, r0                ; mul24 r0, ra7,  rb7
-+/* [0x00000860] */ 0x4d0ca23e, 0x1c024860, // sub r1, r1, r0                ; mul24 r0, rb10, ra3.8c
-+/* [0x00000868] */ 0x4c2cb437, 0x100248a0, // add r2, r2, r0                ; mul24 r0, ra11, rb11
-+/* [0x00000870] */ 0x0d9e7400, 0x100208a7, // sub r2, r2, r0
-+/* [0x00000878] */ 0x0e9c63c0, 0xd0020867, // shr r1, r1, 6
-+/* [0x00000880] */ 0x4e5865ce, 0xd20248a0, // shr r2, r2, 6                 ; mul24 r0, r1, ra_wt_mul_l0
-+/* [0x00000888] */ 0x4c4a7456, 0x120248a1, // add r2, r2, r1                ; mul24 r1, r2, ra_wt_mul_l1
-+/* [0x00000890] */ 0x4c667216, 0x14024862, // add r1, r1, r0                ; mul24 r2, r2, ra_kmul_add
-+/* [0x00000898] */ 0x8d5e72b6, 0x1c024863, // sub r1, r1, r2                ; mov r3, ra_blk_height
-+/* [0x000008a0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off         ; v8subs r0, ra_height, r3
-+/* [0x000008a8] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x000008b0] */ 0x0f667380, 0x18020867, // asr r1, r1, ra_wt_den_p7
-+/* [0x000008b8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax           ; mov -, vw_wait
-+/* [0x000008c0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0            ; mul24 r2, r3, rb_pitch
-+/* [0x000008c8] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0        ; mov vw_setup, ra_dma0
-+/* [0x000008d0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
-+/* [0x000008d8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3                ; mov vw_setup, rb_dma1
-+/* [0x000008e0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3                ; mov vw_addr, ra_dest
-+/* [0x000008e8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
-+/* [0x000008f0] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b
-+/* [0x000008f8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
-+/* [0x00000900] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
-+/* [0x00000908] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init
-+// ::mc_sync_q0
-+/* [0x00000910] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00000918] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000920] */ 0x00000010, 0xe80009e7, // mov  dst, sacq(i)
-+/* [0x00000928] */ 0x00000010, 0xe80009e7, // mov  dst, sacq(i)
-+/* [0x00000930] */ 0x00000010, 0xe80009e7, // mov  dst, sacq(i)
-+/* [0x00000938] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000940] */ 0x0000001c, 0xe80009e7, // mov  dst, sacq(i)
-+/* [0x00000948] */ 0x00000001, 0xe80009e7, // mov  dst, srel(i)
-+/* [0x00000950] */ 0x0000000d, 0xe80009e7, // mov  dst, srel(i)
-+// ::mc_sync_q1
-+/* [0x00000958] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00000960] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000968] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000970] */ 0x00000000, 0xe80009e7, // mov  dst, srel(i)
-+/* [0x00000978] */ 0x00000011, 0xe80009e7, // mov  dst, sacq(i)
-+/* [0x00000980] */ 0x00000002, 0xe80009e7, // mov  dst, srel(i)
-+// ::mc_sync_q2
-+/* [0x00000988] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00000990] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000998] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x000009a0] */ 0x00000000, 0xe80009e7, // mov  dst, srel(i)
-+/* [0x000009a8] */ 0x00000012, 0xe80009e7, // mov  dst, sacq(i)
-+/* [0x000009b0] */ 0x00000003, 0xe80009e7, // mov  dst, srel(i)
-+// ::mc_sync_q3
-+/* [0x000009b8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x000009c0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x000009c8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x000009d0] */ 0x00000000, 0xe80009e7, // mov  dst, srel(i)
-+/* [0x000009d8] */ 0x00000013, 0xe80009e7, // mov  dst, sacq(i)
-+/* [0x000009e0] */ 0x009e7000, 0x100009e7, // nop
-+// ::mc_sync_q4
-+/* [0x000009e8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x000009f0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x000009f8] */ 0x00000014, 0xe80009e7, // mov  dst, sacq(i)
-+/* [0x00000a00] */ 0x00000014, 0xe80009e7, // mov  dst, sacq(i)
-+/* [0x00000a08] */ 0x00000014, 0xe80009e7, // mov  dst, sacq(i)
-+/* [0x00000a10] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000a18] */ 0x0000001d, 0xe80009e7, // mov  dst, sacq(i)
-+/* [0x00000a20] */ 0x00000005, 0xe80009e7, // mov  dst, srel(i)
-+/* [0x00000a28] */ 0x0000000e, 0xe80009e7, // mov  dst, srel(i)
-+// ::mc_sync_q5
-+/* [0x00000a30] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00000a38] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000a40] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000a48] */ 0x00000004, 0xe80009e7, // mov  dst, srel(i)
-+/* [0x00000a50] */ 0x00000015, 0xe80009e7, // mov  dst, sacq(i)
-+/* [0x00000a58] */ 0x00000006, 0xe80009e7, // mov  dst, srel(i)
-+// ::mc_sync_q6
-+/* [0x00000a60] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00000a68] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000a70] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000a78] */ 0x00000004, 0xe80009e7, // mov  dst, srel(i)
-+/* [0x00000a80] */ 0x00000016, 0xe80009e7, // mov  dst, sacq(i)
-+/* [0x00000a88] */ 0x00000007, 0xe80009e7, // mov  dst, srel(i)
-+// ::mc_sync_q7
-+/* [0x00000a90] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00000a98] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000aa0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000aa8] */ 0x00000004, 0xe80009e7, // mov  dst, srel(i)
-+/* [0x00000ab0] */ 0x00000017, 0xe80009e7, // mov  dst, sacq(i)
-+/* [0x00000ab8] */ 0x009e7000, 0x100009e7, // nop
-+// ::mc_sync_q8
-+/* [0x00000ac0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00000ac8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000ad0] */ 0x00000018, 0xe80009e7, // mov  dst, sacq(i)
-+/* [0x00000ad8] */ 0x00000018, 0xe80009e7, // mov  dst, sacq(i)
-+/* [0x00000ae0] */ 0x00000018, 0xe80009e7, // mov  dst, sacq(i)
-+/* [0x00000ae8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000af0] */ 0x0000001e, 0xe80009e7, // mov  dst, sacq(i)
-+/* [0x00000af8] */ 0x00000009, 0xe80009e7, // mov  dst, srel(i)
-+/* [0x00000b00] */ 0x0000000c, 0xe80009e7, // mov  dst, srel(i)
-+// ::mc_sync_q9
-+/* [0x00000b08] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00000b10] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000b18] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000b20] */ 0x00000008, 0xe80009e7, // mov  dst, srel(i)
-+/* [0x00000b28] */ 0x00000019, 0xe80009e7, // mov  dst, sacq(i)
-+/* [0x00000b30] */ 0x0000000a, 0xe80009e7, // mov  dst, srel(i)
-+// ::mc_sync_q10
-+/* [0x00000b38] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00000b40] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000b48] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000b50] */ 0x00000008, 0xe80009e7, // mov  dst, srel(i)
-+/* [0x00000b58] */ 0x0000001a, 0xe80009e7, // mov  dst, sacq(i)
-+/* [0x00000b60] */ 0x0000000b, 0xe80009e7, // mov  dst, srel(i)
-+// ::mc_sync_q11
-+/* [0x00000b68] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00000b70] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000b78] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000b80] */ 0x00000008, 0xe80009e7, // mov  dst, srel(i)
-+/* [0x00000b88] */ 0x0000001b, 0xe80009e7, // mov  dst, sacq(i)
-+/* [0x00000b90] */ 0x009e7000, 0x100009e7, // nop
-+// ::mc_exit_c_qn
-+// ::mc_exit_y_qn
-+/* [0x00000b98] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1
-+// :1
-+/* [0x00000ba0] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b
-+/* [0x00000ba8] */ 0x009e7000, 0xa00009e7, // nop                   ; nop           ; ldtmu0
-+/* [0x00000bb0] */ 0x009e7000, 0xb00009e7, // nop                   ; nop           ; ldtmu1
-+/* [0x00000bb8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
-+/* [0x00000bc0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x00000bc8] */ 0x009e7000, 0x300009e7, // nop                   ; nop           ; thrend
-+/* [0x00000bd0] */ 0x009e7000, 0x100009e7, // nop
-+/* [0x00000bd8] */ 0x009e7000, 0x100009e7, // nop
-+// ::mc_exit_c_q0
-+// ::mc_exit_y_q0
-+/* [0x00000be0] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1
-+// :1
-+/* [0x00000be8] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b
-+/* [0x00000bf0] */ 0x009e7000, 0xa00009e7, // nop                   ; nop           ; ldtmu0
-+/* [0x00000bf8] */ 0x009e7000, 0xb00009e7, // nop                   ; nop           ; ldtmu1
-+/* [0x00000c00] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
-+/* [0x00000c08] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x00000c10] */ 0x0000001c, 0xe80009e7, // mov  dst, sacq(i)
-+/* [0x00000c18] */ 0x009e7000, 0x300009e7, // nop                   ; nop           ; thrend
-+/* [0x00000c20] */ 0x00000001, 0xe00209a7, // mov interrupt, 1
-+/* [0x00000c28] */ 0x009e7000, 0x100009e7, // nop
-+// ::mc_setup_y_q0
-+/* [0x00000c30] */ 0x0000000c, 0xe80009e7, // mov  dst, srel(i)
-+// ::mc_setup_y_qn
-+/* [0x00000c38] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1                  ; mov ra0, unif
-+/* [0x00000c40] */ 0x15827d80, 0x10020267, // mov ra9, unif
-+/* [0x00000c48] */ 0x15827d80, 0x10020067, // mov ra1, unif
-+/* [0x00000c50] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
-+/* [0x00000c58] */ 0x9181e1f6, 0xd00250cb, // shl rb_ef, r0, i_shift30      ; mov ra11, unif
-+/* [0x00000c60] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100
-+/* [0x00000c68] */ 0x000000ff, 0xe0021627, // mov rb_pmask, v_pmask
-+/* [0x00000c70] */ 0x001000ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
-+/* [0x00000c78] */ 0x00004000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8))
-+/* [0x00000c80] */ 0x4000000e, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth)
-+/* [0x00000c88] */ 0x050b0a00, 0xe0021567, // mov rb_y_coeffs_2, 0x050b0a00
-+/* [0x00000c90] */ 0x11283a40, 0xe00215a7, // mov rb_y_coeffs_3, 0x11283a40
-+/* [0x00000c98] */ 0x0a0b0500, 0xe00215e7, // mov rb_y_coeffs_5, 0x0a0b0500
-+/* [0x00000ca0] */ 0x15827d80, 0x100200e7, // mov ra3, unif
-+/* [0x00000ca8] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef              ; mov rb_xpitch, unif
-+/* [0x00000cb0] */ 0x0d0c1dc0, 0xd40216a7, // sub rb_max_x, ra3.16b, 1
-+/* [0x00000cb8] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1
-+/* [0x00000cc0] */ 0x959a0dbf, 0x100248d0, // mov r3, elem_num              ; mov rb_pitch, unif
-+/* [0x00000cc8] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-+/* [0x00000cd0] */ 0x159d03c0, 0x10021667, // or  rb_dma1_base, r1, rb_pitch
-+/* [0x00000cd8] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3
-+/* [0x00000ce0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
-+/* [0x00000ce8] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
-+/* [0x00000cf0] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
-+/* [0x00000cf8] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4                ; v8subs r2, r2, r2
-+/* [0x00000d00] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch
-+/* [0x00000d08] */ 0x149e7080, 0x10020867, // and r1, r0, r2
-+/* [0x00000d10] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
-+/* [0x00000d18] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000d20] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0
-+/* [0x00000d28] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3
-+/* [0x00000d30] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
-+/* [0x00000d38] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
-+/* [0x00000d40] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
-+/* [0x00000d48] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
-+/* [0x00000d50] */ 0x149e7080, 0x10020867, // and r1, r0, r2
-+/* [0x00000d58] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
-+/* [0x00000d60] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000d68] */ 0x0c2e7c00, 0x100213e7, // add rb_base2, ra11, r0
-+/* [0x00000d70] */ 0x80027036, 0x120049e0, // nop                           ; mov r0, ra0.16a
-+/* [0x00000d78] */ 0x95043ff6, 0xd20248e2, // mov r3, PREREAD               ; mov r2, ra1.16a
-+// :1
-+/* [0x00000d80] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
-+/* [0x00000d88] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0
-+/* [0x00000d90] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
-+/* [0x00000d98] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1             ; mul24 r1, r1, rb_pitch
-+/* [0x00000da0] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1          ; mov ra_y, r0
-+/* [0x00000da8] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0
-+/* [0x00000db0] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b
-+/* [0x00000db8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
-+/* [0x00000dc0] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1             ; mul24 r1, r1, rb_pitch
-+/* [0x00000dc8] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1         ; mov ra_y2, r2
-+/* [0x00000dd0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+/* [0x00000dd8] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2
-+/* [0x00000de0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+/* [0x00000de8] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3
-+/* [0x00000df0] */ 0x159e7040, 0x10020827, // or  r0, r0, r1
-+/* [0x00000df8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-+/* [0x00000e00] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
-+/* [0x00000e08] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-+/* [0x00000e10] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-+/* [0x00000e18] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
-+/* [0x00000e20] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00000e28] */ 0x00000000, 0xe0024208, // mov ra8,  0                   ; mov rb8,  0
-+/* [0x00000e30] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000e38] */ 0x00000000, 0xe0024249, // mov ra9,  0                   ; mov rb9,  0
-+/* [0x00000e40] */ 0x00000000, 0xe002428a, // mov ra10, 0                   ; mov rb10, 0
-+/* [0x00000e48] */ 0x00000000, 0xe00242cb, // mov ra11, 0                   ; mov rb11, 0
-+// :per_block_setup_8
-+/* [0x00000e50] */ 0x93567176, 0x14125815, // max r0, r0, r5                ; mov ra_xshift, ra_xshift_next
-+/* [0x00000e58] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
-+/* [0x00000e60] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
-+/* [0x00000e68] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
-+/* [0x00000e70] */ 0x8d810bf6, 0x1002589a, // sub r2, r5, rb_pitch          ; mov ra_base_next, unif
-+/* [0x00000e78] */ 0x940270b6, 0x12225853, // and r1, r0, r2                ; mov ra_y_next, ra0.16a
-+/* [0x00000e80] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
-+/* [0x00000e88] */ 0x8c827076, 0x10025801, // add r0, r0, r1                ; mov ra1, unif
-+/* [0x00000e90] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0
-+/* [0x00000e98] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3
-+/* [0x00000ea0] */ 0x93067176, 0x12125813, // max r0, r0, r5                ; mov ra_y2_next, ra1.16a
-+/* [0x00000ea8] */ 0x9281a1f6, 0x10024813, // min r0, r0, rb_max_x          ; mov rb_base2_next, unif
-+/* [0x00000eb0] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
-+/* [0x00000eb8] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4                ; mov ra_width_height, unif
-+/* [0x00000ec0] */ 0x949dc0bf, 0x10024871, // and r1, r0, r2                ; mov vw_setup, rb_vpm_init
-+/* [0x00000ec8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
-+/* [0x00000ed0] */ 0x4c401077, 0xd4024821, // add r0, r0, r1                ; mul24 r1, ra_width, v_x_mul
-+/* [0x00000ed8] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0
-+/* [0x00000ee0] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
-+/* [0x00000ee8] */ 0x8c5dc1c6, 0xdc025460, // add rb_i_tmu, r0, (7-8) - PREREAD ; v8min r0, r0, ra_blk_height
-+/* [0x00000ef0] */ 0x0c9df1c0, 0xd00214a7, // add rb_lcount, r0, (7-8)
-+/* [0x00000ef8] */ 0x916471f6, 0xd4024823, // shl r0, r0, v_dma_h_shift     ; mov r3, ra_kmul_add
-+/* [0x00000f00] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000f08] */ 0x916501f6, 0xd4024822, // shl r0, r0, v_dma_wh_shift    ; mov r2, ra_fir_off_val
-+/* [0x00000f10] */ 0x8c81b1f6, 0x100246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, unif
-+/* [0x00000f18] */ 0x918101f6, 0xd00a5816, // shl.ifnn r0, r0, i_shift16    ; mov ra_wt_off_mul_l0, unif
-+/* [0x00000f20] */ 0x915031f6, 0xde024205, // shl ra8, r0, 3                ; mov rb5, ra_k255
-+/* [0x00000f28] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400
-+/* [0x00000f30] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d
-+/* [0x00000f38] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c
-+/* [0x00000f40] */ 0x10215f80, 0x1e6200a7, // ror ra2.8c, rb_y_coeffs_2, ra8.8d
-+/* [0x00000f48] */ 0x10215f80, 0x1c620027, // ror ra0.8c, rb_y_coeffs_2, ra8.8c
-+/* [0x00000f50] */ 0x00010100, 0xe0020867, // mov r1,0x00010100
-+/* [0x00000f58] */ 0x902203bf, 0x1e025812, // ror r0, r1, ra8.8d            ; mov ra_wt_off_mul_l1, unif
-+/* [0x00000f60] */ 0x90205387, 0x1c424004, // ror ra0.8a, r1, ra8.8c        ; v8min rb4, r0, rb5
-+/* [0x00000f68] */ 0x914883f6, 0xd0031856, // shl r1, r1, 8                 ; mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1
-+/* [0x00000f70] */ 0x902203bf, 0x1e02581c, // ror r0, r1, ra8.8d            ; mov ra_dest, unif
-+/* [0x00000f78] */ 0x90205387, 0x1c72404b, // ror ra1.8d, r1, ra8.8c        ; v8min rb11, r0, rb5
-+/* [0x00000f80] */ 0x10216f80, 0x1e7200a7, // ror ra2.8d, rb_y_coeffs_3, ra8.8d
-+/* [0x00000f88] */ 0x10216f80, 0x1c720027, // ror ra0.8d, rb_y_coeffs_3, ra8.8c
-+/* [0x00000f90] */ 0x10217f80, 0x1e5200e7, // ror ra3.8b, rb_y_coeffs_5, ra8.8d
-+/* [0x00000f98] */ 0x10217f80, 0x1c520067, // ror ra1.8b, rb_y_coeffs_5, ra8.8c
-+/* [0x00000fa0] */ 0x04040100, 0xe0020867, // mov r1,0x04040100
-+/* [0x00000fa8] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d
-+/* [0x00000fb0] */ 0x902183bf, 0xdc624065, // ror ra1.8c, r1, ra8.8c        ; mov r5rep, -8
-+/* [0x00000fb8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000fc0] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100
-+/* [0x00000fc8] */ 0x902203bf, 0x1e02581e, // ror r0, r1, ra8.8d            ; mov ra_link, unif
-+/* [0x00000fd0] */ 0x90205387, 0x1c424048, // ror ra1.8a, r1, ra8.8c        ; v8min rb8, r0, rb5
-+// ::mc_filter_y_pxx
-+/* [0x00000fd8] */ 0xfffffe58, 0xf0f807a7, // brr ra_link, r:per_block_setup_8
-+/* [0x00000fe0] */ 0x959a0ff6, 0x10024023, // mov ra0, unif                 ; mov r3, elem_num
-+/* [0x00000fe8] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef      ; v8subs r5rep, r2, r2
-+/* [0x00000ff0] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3           ; mov rb_xshift2, rb_xshift2_next
-+/* [0x00000ff8] */ 0x1158cdc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p5
-+/* [0x00001000] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0
-+/* [0x00001008] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0         ; mov ra_ef.8a, rb4
-+// :1
-+/* [0x00001010] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef      ; mul24 ra4, rb5, ra_ef
-+/* [0x00001018] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0               ; mov r1, 0
-+/* [0x00001020] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y          ; mov r3, ra_k1
-+/* [0x00001028] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3            ; mul24 r2, r2, rb_pitch        ; ldtmu0
-+/* [0x00001030] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2          ; mov rb5,  rb6
-+/* [0x00001038] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift         ; mov rb6,  rb7
-+/* [0x00001040] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1             ; v8min r0, r0, rb_pmask        ; ldtmu1
-+/* [0x00001048] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2        ; mov rb7, ra8
-+/* [0x00001050] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y          ; v8min r1, r1, ra_pmax
-+/* [0x00001058] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3          ; mul24 r2, r2, rb_pitch
-+/* [0x00001060] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2         ; mov ra8,  ra9
-+/* [0x00001068] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3     ; mul24      r2, ra0.8a << 8,  r1 << 8  @ "mul_used", 0
-+/* [0x00001070] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h  ; mul24.ifnn r2, ra0.8a,       r0
-+/* [0x00001078] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2        ; mul24      r3, ra0.8b << 1,  r0 << 1  @ "mul_used", 0
-+/* [0x00001080] */ 0x40037031, 0xda0109e3, // nop                   ; mul24.ifn  r3, ra0.8b << 9,  r1 << 9  @ "mul_used", 0
-+/* [0x00001088] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3        ; mul24      r3, ra0.8c << 2,  r0 << 2  @ "mul_used", 0
-+/* [0x00001090] */ 0x40036031, 0xdc0109e3, // nop                   ; mul24.ifn  r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
-+/* [0x00001098] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3        ; mul24      r3, ra0.8d << 3,  r0 << 3  @ "mul_used", 0
-+/* [0x000010a0] */ 0x40035031, 0xde0109e3, // nop                   ; mul24.ifn  r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
-+/* [0x000010a8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3        ; mul24      r3, ra1.8a << 4,  r0 << 4  @ "mul_used", 0
-+/* [0x000010b0] */ 0x40074031, 0xd80109e3, // nop                   ; mul24.ifn  r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
-+/* [0x000010b8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3        ; mul24      r3, ra1.8b << 5,  r0 << 5  @ "mul_used", 0
-+/* [0x000010c0] */ 0x40073031, 0xda0109e3, // nop                   ; mul24.ifn  r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
-+/* [0x000010c8] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3        ; mul24      r3, ra1.8c << 6,  r0 << 6  @ "mul_used", 0
-+/* [0x000010d0] */ 0x40072031, 0xdc0109e3, // nop                   ; mul24.ifn  r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
-+/* [0x000010d8] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3        ; mul24      r3, ra1.8d << 7,  r0 << 7  @ "mul_used", 0
-+/* [0x000010e0] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5    ; mul24.ifn  r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
-+/* [0x000010e8] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x000010f0] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3                ; mul24 r1, rb5,  ra2.8b
-+/* [0x000010f8] */ 0x550caffe, 0x1a024260, // mov ra9,  rb10                ; mul24 r0, rb10, ra3.8b
-+/* [0x00001100] */ 0x8f2c05f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
-+/* [0x00001108] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0                ; mul24 r0, rb6,  ra2.8c
-+/* [0x00001110] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0                ; mul24 r0, rb7,  ra2.8d
-+/* [0x00001118] */ 0x4c208237, 0x10024860, // add r1, r1, r0                ; mul24 r0, ra8,  rb8
-+/* [0x00001120] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0                ; mul24 r0, rb10, ra3.8c
-+/* [0x00001128] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0                ; mul24 r0, ra11, rb11
-+/* [0x00001130] */ 0x8d5d1bf6, 0x1c0269e3, // sub.setf -, r5, rb_i_tmu      ; mov r3, ra_blk_height
-+/* [0x00001138] */ 0x8d1133bf, 0x1002884f, // sub r1, r1, ra4               ; mov.ifz rb_base2, rb_base2_next
-+/* [0x00001140] */ 0x8d6a7236, 0x10029858, // sub r1, r1, r0                ; mov.ifz ra_base, ra_base_next
-+/* [0x00001148] */ 0x8f4c63f6, 0xd0029851, // asr r1, r1, 6                 ; mov.ifz ra_y_y2, ra_y_y2_next
-+/* [0x00001150] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount     ; mul24 r0, r1, ra_wt_mul_l0
-+/* [0x00001158] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off         ; mul24 r1, r1, ra_kmul_add
-+/* [0x00001160] */ 0xed427073, 0x12024860, // sub r1, r0, r1                ; v8subs r0, ra_height, r3
-+/* [0x00001168] */ 0xfffffe88, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x00001170] */ 0x0f9cd3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6
-+/* [0x00001178] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax           ; mov -, vw_wait
-+/* [0x00001180] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0            ; mul24 r2, r3, rb_pitch
-+/* [0x00001188] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0        ; mov vw_setup, ra_dma0
-+/* [0x00001190] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
-+/* [0x00001198] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3                ; mov vw_setup, rb_dma1
-+/* [0x000011a0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3                ; mov vw_addr, ra_dest
-+/* [0x000011a8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
-+/* [0x000011b0] */ 0xfffffe40, 0xf0f809e7, // brr -, r:1b
-+/* [0x000011b8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
-+/* [0x000011c0] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
-+/* [0x000011c8] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init
-+// ::mc_filter_y_bxx
-+/* [0x000011d0] */ 0xfffffc60, 0xf0f807a7, // brr ra_link, r:per_block_setup_8
-+/* [0x000011d8] */ 0x959a0ff6, 0x10024023, // mov ra0, unif                 ; mov r3, elem_num
-+/* [0x000011e0] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef      ; v8subs r5rep, r2, r2
-+/* [0x000011e8] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3           ; mov rb_xshift2, rb_xshift2_next
-+/* [0x000011f0] */ 0x1158ddc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p6
-+/* [0x000011f8] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0
-+/* [0x00001200] */ 0x4d4a7216, 0x12024860, // sub r1, r1, r0                ; mul24 r0, r2, ra_wt_mul_l1
-+/* [0x00001208] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0         ; mov ra_ef.8a, rb4
-+// :1
-+/* [0x00001210] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef      ; mul24 ra4, rb5, ra_ef
-+/* [0x00001218] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0               ; mov r1, 0
-+/* [0x00001220] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y          ; mov r3, ra_k1
-+/* [0x00001228] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3            ; mul24 r2, r2, rb_pitch        ; ldtmu0
-+/* [0x00001230] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2          ; mov rb5,  rb6
-+/* [0x00001238] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift         ; mov rb6,  rb7
-+/* [0x00001240] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1             ; v8min r0, r0, rb_pmask        ; ldtmu1
-+/* [0x00001248] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2        ; mov rb7, ra8
-+/* [0x00001250] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y          ; v8min r1, r1, ra_pmax
-+/* [0x00001258] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3          ; mul24 r2, r2, rb_pitch
-+/* [0x00001260] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2         ; mov ra8,  ra9
-+/* [0x00001268] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3     ; mul24      r2, ra0.8a << 8,  r1 << 8  @ "mul_used", 0
-+/* [0x00001270] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h  ; mul24.ifnn r2, ra0.8a,       r0
-+/* [0x00001278] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2        ; mul24      r3, ra0.8b << 1,  r0 << 1  @ "mul_used", 0
-+/* [0x00001280] */ 0x40037031, 0xda0109e3, // nop                   ; mul24.ifn  r3, ra0.8b << 9,  r1 << 9  @ "mul_used", 0
-+/* [0x00001288] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3        ; mul24      r3, ra0.8c << 2,  r0 << 2  @ "mul_used", 0
-+/* [0x00001290] */ 0x40036031, 0xdc0109e3, // nop                   ; mul24.ifn  r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
-+/* [0x00001298] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3        ; mul24      r3, ra0.8d << 3,  r0 << 3  @ "mul_used", 0
-+/* [0x000012a0] */ 0x40035031, 0xde0109e3, // nop                   ; mul24.ifn  r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
-+/* [0x000012a8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3        ; mul24      r3, ra1.8a << 4,  r0 << 4  @ "mul_used", 0
-+/* [0x000012b0] */ 0x40074031, 0xd80109e3, // nop                   ; mul24.ifn  r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
-+/* [0x000012b8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3        ; mul24      r3, ra1.8b << 5,  r0 << 5  @ "mul_used", 0
-+/* [0x000012c0] */ 0x40073031, 0xda0109e3, // nop                   ; mul24.ifn  r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
-+/* [0x000012c8] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3        ; mul24      r3, ra1.8c << 6,  r0 << 6  @ "mul_used", 0
-+/* [0x000012d0] */ 0x40072031, 0xdc0109e3, // nop                   ; mul24.ifn  r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
-+/* [0x000012d8] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3        ; mul24      r3, ra1.8d << 7,  r0 << 7  @ "mul_used", 0
-+/* [0x000012e0] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5    ; mul24.ifn  r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
-+/* [0x000012e8] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x000012f0] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3                ; mul24 r1, rb5,  ra2.8b
-+/* [0x000012f8] */ 0x550caffe, 0x1a024260, // mov ra9,  rb10                ; mul24 r0, rb10, ra3.8b
-+/* [0x00001300] */ 0x8f2c05f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
-+/* [0x00001308] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0                ; mul24 r0, rb6,  ra2.8c
-+/* [0x00001310] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0                ; mul24 r0, rb7,  ra2.8d
-+/* [0x00001318] */ 0x4c208237, 0x10024860, // add r1, r1, r0                ; mul24 r0, ra8,  rb8
-+/* [0x00001320] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0                ; mul24 r0, rb10, ra3.8c
-+/* [0x00001328] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0                ; mul24 r0, ra11, rb11
-+/* [0x00001330] */ 0x0d127380, 0x10020867, // sub r1, r1, ra4
-+/* [0x00001338] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0                ; mov r2, rb_wt_off
-+/* [0x00001340] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-+/* [0x00001348] */ 0x4d591bce, 0x120269e0, // sub.setf -, r5, rb_i_tmu      ; mul24 r0, r1, ra_wt_mul_l0
-+/* [0x00001350] */ 0x55653fce, 0x140453e1, // mov.ifz rb_base2, rb_base2_next ; mul24 r1, r1, ra_kmul_add
-+/* [0x00001358] */ 0x8d4e7076, 0x10029851, // sub r1, r0, r1                ; mov.ifz ra_y_y2, ra_y_y2_next
-+/* [0x00001360] */ 0x8d692bf6, 0x1002b9d8, // sub.setf -, r5, rb_lcount     ; mov.ifz ra_base, ra_base_next
-+/* [0x00001368] */ 0x8c9f8289, 0xd0024860, // add r1, r1, r2                ; mov r0, r1 << 8
-+/* [0x00001370] */ 0x8c5e7236, 0x1c024863, // add r1, r1, r0                ; mov r3, ra_blk_height
-+/* [0x00001378] */ 0xfffffe78, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x00001380] */ 0x4f65039f, 0x18024862, // asr r1, r1, ra_wt_den_p7      ; mul24 r2, r3, rb_pitch
-+/* [0x00001388] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax           ; mov -, vw_wait
-+/* [0x00001390] */ 0xf34003f3, 0xd2024c20, // max vpm, r1, 0                ; v8subs r0, ra_height, r3
-+/* [0x00001398] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0        ; mov vw_setup, ra_dma0
-+/* [0x000013a0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
-+/* [0x000013a8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3                ; mov vw_setup, rb_dma1
-+/* [0x000013b0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3                ; mov vw_addr, ra_dest
-+/* [0x000013b8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
-+/* [0x000013c0] */ 0xfffffe30, 0xf0f809e7, // brr -, r:1b
-+/* [0x000013c8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
-+/* [0x000013d0] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
-+/* [0x000013d8] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init
-+// ::mc_filter_y_p00
-+/* [0x000013e0] */ 0x959a0ff6, 0x10024020, // mov ra0, unif                 ; mov r0, elem_num
-+/* [0x000013e8] */ 0xf5567dad, 0x14124565, // mov ra_xshift, ra_xshift_next ; v8subs r5rep, r5, r5
-+/* [0x000013f0] */ 0x8c020c3f, 0x1402581a, // add r0, ra0.16b, r0           ; mov ra_base_next, unif
-+/* [0x000013f8] */ 0x93027176, 0x12225813, // max r0, r0, r5                ; mov ra_y_next, ra0.16a
-+/* [0x00001400] */ 0x9281a1f6, 0x10025810, // min r0, r0, rb_max_x          ; mov ra_width_height, unif
-+/* [0x00001408] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
-+/* [0x00001410] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
-+/* [0x00001418] */ 0x8d810bf6, 0x10025896, // sub r2, r5, rb_pitch          ; mov ra_wt_off_mul_l0, unif
-+/* [0x00001420] */ 0x149e7080, 0x10020867, // and r1, r0, r2
-+/* [0x00001428] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
-+/* [0x00001430] */ 0x8c827076, 0x1002581c, // add r0, r0, r1                ; mov ra_dest, unif
-+/* [0x00001438] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init
-+/* [0x00001440] */ 0x11400dc0, 0xd4020867, // shl r1, ra_width, v_x_shift
-+/* [0x00001448] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
-+/* [0x00001450] */ 0x8d5c31c6, 0xdc025460, // sub rb_i_tmu, r0, PREREAD     ; v8min r0, r0, ra_blk_height
-+/* [0x00001458] */ 0x919c71c0, 0xd0024812, // shl r0, r0, v_dma_h_shift     ; mov rb_lcount, r0
-+/* [0x00001460] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00001468] */ 0x1158edc0, 0xd4021327, // shl rb_wt_off, ra_wt_off_l0, DENOM + 7
-+/* [0x00001470] */ 0x918101f6, 0xd002581e, // shl r0, r0, v_dma_wh_shift    ; mov ra_link, unif
-+/* [0x00001478] */ 0x0c9db1c0, 0x100206e7, // add ra_dma0, r0, rb_dma0_base
-+// :1
-+/* [0x00001480] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu      ; v8adds r5rep, r5, ra_k1
-+/* [0x00001488] */ 0x804e7036, 0xa42099d1, // nop                           ; mov.ifz ra_y, ra_y_next       ; ldtmu0
-+/* [0x00001490] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift         ; mov r3, rb_pitch
-+/* [0x00001498] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
-+/* [0x000014a0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y          ; mov.ifz ra_base, ra_base_next
-+/* [0x000014a8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1             ; mul24 r2, r2, r3
-+/* [0x000014b0] */ 0x8c618c87, 0x10024e20, // add t0s, ra_base, r2          ; v8min r0, r0, rb_pmask
-+/* [0x000014b8] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount     ; mul24 r1, r0, ra_wt_mul_l0
-+/* [0x000014c0] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8                 ; mov r3, ra_blk_height
-+/* [0x000014c8] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off         ; v8subs r0, ra_height, r3
-+/* [0x000014d0] */ 0xffffff90, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x000014d8] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, DENOM + 8
-+/* [0x000014e0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax           ; mov -, vw_wait
-+/* [0x000014e8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0            ; mul24 r2, r3, rb_pitch
-+/* [0x000014f0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
-+/* [0x000014f8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
-+/* [0x00001500] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3        ; mov vw_setup, rb_dma1
-+/* [0x00001508] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3        ; mov vw_addr, ra_dest
-+/* [0x00001510] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
-+/* [0x00001518] */ 0xffffff48, 0xf0f809e7, // brr -, r:1b
-+/* [0x00001520] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
-+/* [0x00001528] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
-+/* [0x00001530] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init
-+// ::mc_filter_y_b00
-+/* [0x00001538] */ 0xfffff8f8, 0xf0f807a7, // brr ra_link, r:per_block_setup_8
-+/* [0x00001540] */ 0x959a0ff6, 0x10024023, // mov ra0, unif                 ; mov r3, elem_num
-+/* [0x00001548] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef      ; v8subs r5rep, r2, r2
-+/* [0x00001550] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3           ; mov rb_xshift2, rb_xshift2_next
-+/* [0x00001558] */ 0x00000001, 0xe00208a7, // mov r2, 1
-+/* [0x00001560] */ 0x8c591eb6, 0x10025461, // add rb_i_tmu, rb_i_tmu, r2    ; mov r1, ra_wt_off_mul_l0
-+/* [0x00001568] */ 0xf158fded, 0xd4025325, // shl rb_wt_off, ra_wt_off_l0, DENOM + 8 ; v8subs r5quad, r5, r5
-+/* [0x00001570] */ 0x809f8009, 0xd000d9d6, // nop                           ; mov.ifnz ra_wt_off_mul_l0, r1 << 8
-+// :1
-+/* [0x00001578] */ 0x0d9d1bc0, 0xb00229e7, // sub.setf -, r5, rb_i_tmu      ; nop                           ; ldtmu1
-+/* [0x00001580] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2        ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
-+/* [0x00001588] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift         ; mov r3, rb_pitch
-+/* [0x00001590] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
-+/* [0x00001598] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y          ; mov.ifz ra_base, ra_base_next
-+/* [0x000015a0] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1             ; mul24 r2, r2, r3
-+/* [0x000015a8] */ 0x8c613cbf, 0x10028e0f, // add t0s, ra_base, r2          ; mov.ifz rb_base2, rb_base2_next
-+/* [0x000015b0] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0
-+/* [0x000015b8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y
-+/* [0x000015c0] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1           ; mul24 r2, r2, r3
-+/* [0x000015c8] */ 0x8c5cfe86, 0x12024f20, // add t1s, rb_base2, r2         ; v8min r0, r0, ra_pmax
-+/* [0x000015d0] */ 0x545983c6, 0x12024860, // and r1, r1, rb_pmask          ; mul24 r0, r0, ra_wt_mul_l0
-+/* [0x000015d8] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount     ; mul24 r1, r1, ra_wt_mul_l1
-+/* [0x000015e0] */ 0xcc52706e, 0x1a024865, // add r1, r0, r1                ; v8adds r5rep, r5, ra_k1
-+/* [0x000015e8] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8                 ; mov r3, ra_blk_height
-+/* [0x000015f0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off         ; v8subs r0, ra_height, r3
-+/* [0x000015f8] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x00001600] */ 0x0f9d03c0, 0xd0020867, // asr r1, r1, (DENOM + 9) - 32
-+/* [0x00001608] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax           ; mov -, vw_wait
-+/* [0x00001610] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0            ; mul24 r2, r3, rb_pitch
-+/* [0x00001618] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0        ; mov vw_setup, ra_dma0
-+/* [0x00001620] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
-+/* [0x00001628] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3                ; mov vw_setup, rb_dma1
-+/* [0x00001630] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3                ; mov vw_addr, ra_dest
-+/* [0x00001638] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
-+/* [0x00001640] */ 0xffffff18, 0xf0f809e7, // brr -, r:1b
-+/* [0x00001648] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
-+/* [0x00001650] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
-+/* [0x00001658] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init
-+// ::mc_setup_c10_q0
-+/* [0x00001660] */ 0x0000000c, 0xe80009e7, // mov  dst, srel(i)
-+// ::mc_setup_c10_qn
-+/* [0x00001668] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1                  ; mov ra0, unif
-+/* [0x00001670] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
-+/* [0x00001678] */ 0x9181e1f6, 0xd00250d8, // shl rb_ef, r0, i_shift30      ; mov ra_base, unif
-+/* [0x00001680] */ 0x0d801dc0, 0xd0020827, // sub r0, unif, 1
-+/* [0x00001688] */ 0x119c21c0, 0xd00216a7, // shl rb_max_x, r0, v_x_shift
-+/* [0x00001690] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1
-+/* [0x00001698] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100
-+/* [0x000016a0] */ 0x0000ffff, 0xe0021627, // mov rb_pmask, v_pmask
-+/* [0x000016a8] */ 0x000803ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
-+/* [0x000016b0] */ 0x00010000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8))
-+/* [0x000016b8] */ 0x4000000c, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth)
-+/* [0x000016c0] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef              ; mov rb_xpitch, unif
-+/* [0x000016c8] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
-+/* [0x000016d0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-+/* [0x000016d8] */ 0x0c9d03c0, 0x10021667, // add rb_dma1_base, r1, rb_pitch
-+/* [0x000016e0] */ 0x14981f80, 0xd0020827, // and r0, 1, elem_num
-+/* [0x000016e8] */ 0x409c5007, 0xd00049e0, // nop                           ; mul24 r0, r0, 5
-+/* [0x000016f0] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
-+/* [0x000016f8] */ 0x0c9e7000, 0x100210a7, // add rb_elem_x, r0, r0
-+/* [0x00001700] */ 0x11002dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift
-+/* [0x00001708] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x
-+/* [0x00001710] */ 0x930001f6, 0xd2225811, // max r0, r0, 0                 ; mov ra_y, ra0.16a
-+/* [0x00001718] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
-+/* [0x00001720] */ 0x00000000, 0xe0224541, // mov ra_xshift_next, 0         ; mov rb_xshift2_next, 0
-+/* [0x00001728] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch
-+/* [0x00001730] */ 0x149e7040, 0x10020867, // and r1, r0, r1
-+/* [0x00001738] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
-+/* [0x00001740] */ 0x8c827076, 0x10025800, // add r0, r0, r1                ; mov ra0, unif
-+/* [0x00001748] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0
-+/* [0x00001750] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+/* [0x00001758] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1
-+/* [0x00001760] */ 0x119c43c0, 0xd0020867, // shl r1, r1, 4
-+/* [0x00001768] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1
-+/* [0x00001770] */ 0x159e7040, 0x10020827, // or  r0, r0, r1
-+/* [0x00001778] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
-+/* [0x00001780] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
-+/* [0x00001788] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0))
-+/* [0x00001790] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6
-+/* [0x00001798] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
-+/* [0x000017a0] */ 0x11002dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift
-+/* [0x000017a8] */ 0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x         ; mov ra_y2, ra0.16a
-+/* [0x000017b0] */ 0x938001f6, 0xd002480f, // max r0, r0, 0                 ; mov rb_base2, unif
-+/* [0x000017b8] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
-+/* [0x000017c0] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch
-+/* [0x000017c8] */ 0x949c307f, 0xd0024863, // and r1, r0, r1                ; mov r3, PREREAD
-+/* [0x000017d0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
-+/* [0x000017d8] */ 0x8c467076, 0x12024822, // add r0, r0, r1                ; mov r2, ra_y2
-+/* [0x000017e0] */ 0x8c44fe36, 0x140253e0, // add rb_base2, rb_base2, r0    ; mov r0, ra_y
-+// :1
-+/* [0x000017e8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
-+/* [0x000017f0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0
-+/* [0x000017f8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
-+/* [0x00001800] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1             ; mul24 r1, r1, rb_pitch
-+/* [0x00001808] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1          ; mov ra_y, r0
-+/* [0x00001810] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0
-+/* [0x00001818] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b
-+/* [0x00001820] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
-+/* [0x00001828] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1             ; mul24 r1, r1, rb_pitch
-+/* [0x00001830] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1         ; mov ra_y2, r2
-+/* [0x00001838] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00001840] */ 0x00000000, 0xe0024104, // mov ra4, 0                    ; mov rb4, 0
-+/* [0x00001848] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00001850] */ 0x00000000, 0xe0024145, // mov ra5, 0                    ; mov rb5, 0
-+/* [0x00001858] */ 0x00000000, 0xe0024186, // mov ra6, 0                    ; mov rb6, 0
-+/* [0x00001860] */ 0x00000000, 0xe00241c7, // mov ra7, 0                    ; mov rb7, 0
-+// ::mc_filter_c10_p
-+/* [0x00001868] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init     ; mov ra2, unif
-+/* [0x00001870] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef      ; mov r3, unif
-+/* [0x00001878] */ 0xf1082dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift    ; v8subs r5rep, r0, r0
-+/* [0x00001880] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x         ; mov ra_width_height, unif
-+/* [0x00001888] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch          ; mov ra0, unif
-+/* [0x00001890] */ 0x93567176, 0x14024800, // max r0, r0, r5                ; mov vrx_xshift, vrx_xshift_next
-+/* [0x00001898] */ 0x9209a1f6, 0x12225813, // min r0, r0, rb_max_x          ; mov vra_y_next, ra2.16a
-+/* [0x000018a0] */ 0x54404077, 0xd4024862, // and r1, r0, r1                ; mul24 r2, ra_width, v_x_mul
-+/* [0x000018a8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
-+/* [0x000018b0] */ 0x8c827076, 0x10025803, // add r0, r0, r1                ; mov ra3, unif
-+/* [0x000018b8] */ 0x8c427636, 0x120246a1, // add vrx_base_next, r3, r0     ; mov r1, ra_height
-+/* [0x000018c0] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
-+/* [0x000018c8] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
-+/* [0x000018d0] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4)      ; mov.ifc ra_wt_off_mul_l0, unif
-+/* [0x000018d8] */ 0x918083f6, 0xd002581c, // shl r0, r1, v_dma_h_shift     ; mov ra_dest, unif
-+/* [0x000018e0] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2                ; mov r2, ra_fir_off_val
-+/* [0x000018e8] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift    ; mov rb10, ra3.8c
-+/* [0x000018f0] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0
-+/* [0x000018f8] */ 0x5158a3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5       ; mul24 r0, r2, ra_wt_mul_l0
-+/* [0x00001900] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0         ; mov r0, ra_kmul_add
-+/* [0x00001908] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4
-+/* [0x00001910] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d              ; mov ra_link, unif
-+// :1
-+/* [0x00001918] */ 0x8d151bf6, 0xa00269c4, // sub.setf -, r5, rb_i_tmu      ; mov rb4, ra5                  ; ldtmu0
-+/* [0x00001920] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, vrx_xshift        ; mov.ifz  r3, vra_y_next
-+/* [0x00001928] */ 0x8e4505f6, 0xd402c863, // shr r1, r2, v_v_shift         ; mov.ifnz r3, vra_y
-+/* [0x00001930] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef      ; mov.ifz  vra_base, vrx_base_next
-+/* [0x00001938] */ 0x8c531789, 0xda224460, // add vra_y, r3, ra_k1          ; mov      r0, r1 << 15
-+/* [0x00001940] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0             ; mov.ifnc r1, r2 << 1
-+/* [0x00001948] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y          ; mov.ifnc r0, r2
-+/* [0x00001950] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax           ; mul24 r3, r3, rb_pitch
-+/* [0x00001958] */ 0x8c618cc7, 0x10024e20, // add vr_txs, vra_base, r3      ; v8min r0, r0, rb_pmask
-+/* [0x00001960] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1              ; mul24      r3, ra0.8a,       r0
-+/* [0x00001968] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3      ; mul24      r3, ra0.8d,       r1
-+/* [0x00001970] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3                ; mul24      r3, ra0.8b << 2,  r0 << 2  @ "mul_used", 0
-+/* [0x00001978] */ 0x40034031, 0xda0109e3, // nop                           ; mul24.ifn  r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
-+/* [0x00001980] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3                ; mul24      r3, ra0.8c << 4,  r0 << 4  @ "mul_used", 0
-+/* [0x00001988] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5            ; mul24.ifn  r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
-+/* [0x00001990] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x00001998] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3                ; mul24 r0, ra7, rb10
-+/* [0x000019a0] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6                  ; mul24 r1, rb6, ra3.8b
-+/* [0x000019a8] */ 0x8f1c25f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8  ; mov rb6, ra7
-+/* [0x000019b0] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0                ; mul24 r0, rb4, ra3.8a
-+/* [0x000019b8] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0                ; mul24 r0, ra7, rb11
-+/* [0x000019c0] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
-+/* [0x000019c8] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6                 ; mov r3, ra_blk_height
-+/* [0x000019d0] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount     ; mul24 r0, r1, ra_wt_mul_l0
-+/* [0x000019d8] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off         ; mul24 r1, r1, ra_kmul_add
-+/* [0x000019e0] */ 0xed427073, 0x12024860, // sub r1, r0, r1                ; v8subs r0, ra_height, r3
-+/* [0x000019e8] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x000019f0] */ 0x0f9cb3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6
-+/* [0x000019f8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax           ; mov -, vw_wait
-+/* [0x00001a00] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0            ; mul24 r2, r3, rb_pitch
-+/* [0x00001a08] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0        ; mov vw_setup, ra_dma0
-+/* [0x00001a10] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
-+/* [0x00001a18] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3                ; mov vw_setup, rb_dma1
-+/* [0x00001a20] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3                ; mov vw_addr, ra_dest
-+/* [0x00001a28] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
-+/* [0x00001a30] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b
-+/* [0x00001a38] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
-+/* [0x00001a40] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
-+/* [0x00001a48] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init
-+// ::mc_filter_c10_p_l1
-+/* [0x00001a50] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init     ; mov ra2, unif
-+/* [0x00001a58] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef      ; mov r3, unif
-+/* [0x00001a60] */ 0xf1082dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift    ; v8subs r5rep, r0, r0
-+/* [0x00001a68] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x         ; mov ra_width_height, unif
-+/* [0x00001a70] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch          ; mov ra0, unif
-+/* [0x00001a78] */ 0x939c117f, 0x10125815, // max r0, r0, r5                ; mov vrx_xshift, vrx_xshift_next
-+/* [0x00001a80] */ 0x9209a1f6, 0x12125813, // min r0, r0, rb_max_x          ; mov vra_y_next, ra2.16a
-+/* [0x00001a88] */ 0x54404077, 0xd4024862, // and r1, r0, r1                ; mul24 r2, ra_width, v_x_mul
-+/* [0x00001a90] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
-+/* [0x00001a98] */ 0x8c827076, 0x10025803, // add r0, r0, r1                ; mov ra3, unif
-+/* [0x00001aa0] */ 0x8c427636, 0x120254e1, // add vrx_base_next, r3, r0     ; mov r1, ra_height
-+/* [0x00001aa8] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
-+/* [0x00001ab0] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
-+/* [0x00001ab8] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4)      ; mov.ifc ra_wt_off_mul_l0, unif
-+/* [0x00001ac0] */ 0x918083f6, 0xd002581c, // shl r0, r1, v_dma_h_shift     ; mov ra_dest, unif
-+/* [0x00001ac8] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2                ; mov r2, ra_fir_off_val
-+/* [0x00001ad0] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift    ; mov rb10, ra3.8c
-+/* [0x00001ad8] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0
-+/* [0x00001ae0] */ 0x5158a3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5       ; mul24 r0, r2, ra_wt_mul_l0
-+/* [0x00001ae8] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0         ; mov r0, ra_kmul_add
-+/* [0x00001af0] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4
-+/* [0x00001af8] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d              ; mov ra_link, unif
-+// :1
-+/* [0x00001b00] */ 0x8d151bf6, 0xb00269c4, // sub.setf -, r5, rb_i_tmu      ; mov rb4, ra5                  ; ldtmu1
-+/* [0x00001b08] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, vrx_xshift        ; mov.ifz  vra_base, vrx_base_next
-+/* [0x00001b10] */ 0x8e4505f6, 0xd202c863, // shr r1, r2, v_v_shift         ; mov.ifnz r3, vra_y
-+/* [0x00001b18] */ 0x8c4c3ff6, 0x1202a9e3, // add.setf -, rb_ef, rb_ef      ; mov.ifz  r3, vra_y_next
-+/* [0x00001b20] */ 0x8c531789, 0xda124460, // add vra_y, r3, ra_k1          ; mov      r0, r1 << 15
-+/* [0x00001b28] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0             ; mov.ifnc r1, r2 << 1
-+/* [0x00001b30] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y          ; mov.ifnc r0, r2
-+/* [0x00001b38] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax           ; mul24 r3, r3, rb_pitch
-+/* [0x00001b40] */ 0x8c5cfec6, 0x12024f20, // add vr_txs, vra_base, r3      ; v8min r0, r0, ra_pmax
-+/* [0x00001b48] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1              ; mul24      r3, ra0.8a,       r0
-+/* [0x00001b50] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3      ; mul24      r3, ra0.8d,       r1
-+/* [0x00001b58] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3                ; mul24      r3, ra0.8b << 2,  r0 << 2  @ "mul_used", 0
-+/* [0x00001b60] */ 0x40034031, 0xda0109e3, // nop                           ; mul24.ifn  r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
-+/* [0x00001b68] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3                ; mul24      r3, ra0.8c << 4,  r0 << 4  @ "mul_used", 0
-+/* [0x00001b70] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5            ; mul24.ifn  r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
-+/* [0x00001b78] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x00001b80] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3                ; mul24 r0, ra7, rb10
-+/* [0x00001b88] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6                  ; mul24 r1, rb6, ra3.8b
-+/* [0x00001b90] */ 0x8f1c25f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8  ; mov rb6, ra7
-+/* [0x00001b98] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0                ; mul24 r0, rb4, ra3.8a
-+/* [0x00001ba0] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0                ; mul24 r0, ra7, rb11
-+/* [0x00001ba8] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
-+/* [0x00001bb0] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6                 ; mov r3, ra_blk_height
-+/* [0x00001bb8] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount     ; mul24 r0, r1, ra_wt_mul_l0
-+/* [0x00001bc0] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off         ; mul24 r1, r1, ra_kmul_add
-+/* [0x00001bc8] */ 0xed427073, 0x12024860, // sub r1, r0, r1                ; v8subs r0, ra_height, r3
-+/* [0x00001bd0] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x00001bd8] */ 0x0f9cb3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6
-+/* [0x00001be0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax           ; mov -, vw_wait
-+/* [0x00001be8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0            ; mul24 r2, r3, rb_pitch
-+/* [0x00001bf0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0        ; mov vw_setup, ra_dma0
-+/* [0x00001bf8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
-+/* [0x00001c00] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3                ; mov vw_setup, rb_dma1
-+/* [0x00001c08] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3                ; mov vw_addr, ra_dest
-+/* [0x00001c10] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
-+/* [0x00001c18] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b
-+/* [0x00001c20] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
-+/* [0x00001c28] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
-+/* [0x00001c30] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init
-+// ::mc_filter_c10_b
-+/* [0x00001c38] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init     ; mov ra2, unif
-+/* [0x00001c40] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef      ; mov r3, unif
-+/* [0x00001c48] */ 0xf1082dc9, 0xd4024825, // shl r0, ra2.16b, v_x_shift    ; v8subs r5rep, r1, r1
-+/* [0x00001c50] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x         ; mov ra_y_next, ra2.16a
-+/* [0x00001c58] */ 0x8d810bf6, 0x10025850, // sub r1, r5, rb_pitch          ; mov ra_width_height, unif
-+/* [0x00001c60] */ 0x93567176, 0x14125815, // max r0, r0, r5                ; mov ra_xshift, ra_xshift_next
-+/* [0x00001c68] */ 0x9281a1f6, 0x10025800, // min r0, r0, rb_max_x          ; mov ra0, unif
-+/* [0x00001c70] */ 0x9481c1f6, 0xd0025802, // and r0, r0, -4                ; mov ra2, unif
-+/* [0x00001c78] */ 0x54404077, 0xd4024862, // and r1, r0, r1                ; mul24 r2, ra_width, v_x_mul
-+/* [0x00001c80] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
-+/* [0x00001c88] */ 0x8c427076, 0x12024821, // add r0, r0, r1                ; mov r1, ra_height
-+/* [0x00001c90] */ 0x8c9c163f, 0x10024680, // add ra_base_next, r3, r0      ; mov rb_xshift2, rb_xshift2_next
-+/* [0x00001c98] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
-+/* [0x00001ca0] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
-+/* [0x00001ca8] */ 0x8c59f3f6, 0xd4139496, // add rb_lcount, r1, (3-4)      ; mov.ifc ra_wt_mul_l0, ra_wt_off_l0
-+/* [0x00001cb0] */ 0x918083f6, 0xd0025803, // shl r0, r1, v_dma_h_shift     ; mov ra3, unif
-+/* [0x00001cb8] */ 0x8c8270b6, 0x10024823, // add r0, r0, r2                ; mov r3, unif
-+/* [0x00001cc0] */ 0x910cf1f6, 0xd2125813, // shl r0, r0, v_dma_wh_shift    ; mov ra_y2_next, ra3.16a
-+/* [0x00001cc8] */ 0x8c0db1f6, 0x140246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, ra3.16b
-+/* [0x00001cd0] */ 0x918021f6, 0xd0025801, // shl r0, r0, v_x_shift         ; mov ra1, unif
-+/* [0x00001cd8] */ 0x8c8021f6, 0x10025803, // add r0, r0, rb_elem_x         ; mov ra3, unif
-+/* [0x00001ce0] */ 0x8d810bf6, 0x10025852, // sub r1, r5, rb_pitch          ; mov ra_wt_off_mul_l1, unif
-+/* [0x00001ce8] */ 0x939de17f, 0x10025809, // max r0, r0, r5                ; mov ra9, rb_max_y
-+/* [0x00001cf0] */ 0x9265a1f6, 0x14024822, // min r0, r0, rb_max_x          ; mov r2, ra_kmul_add
-+/* [0x00001cf8] */ 0x9481c1f6, 0xd0039812, // and r0, r0, -4                ; mov.ifc ra_wt_off_mul_l1, unif
-+/* [0x00001d00] */ 0x949dc07f, 0xd0024865, // and r1, r0, r1                ; mov r5rep, -4
-+/* [0x00001d08] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
-+/* [0x00001d10] */ 0x8c827076, 0x1002581c, // add r0, r0, r1                ; mov ra_dest, unif
-+/* [0x00001d18] */ 0x8c667636, 0x140254e0, // add rb_base2_next, r3, r0     ; mov r0, ra_fir_off_val
-+/* [0x00001d20] */ 0x4c5a7c86, 0x121245a1, // add ra_wt_mul_l0, ra_wt_mul_l0, r2 ; mul24 r1, r0, ra_wt_mul_l0
-+/* [0x00001d28] */ 0x4c4a7c86, 0x121244a0, // add ra_wt_mul_l1, ra_wt_mul_l1, r2 ; mul24 r0, r0, ra_wt_mul_l1
-+/* [0x00001d30] */ 0x8c4a7076, 0x14024821, // add r0, r0, r1                ; mov r1, ra_wt_off_l1
-+/* [0x00001d38] */ 0x910cb3f6, 0xde02484b, // shl r1, r1, i_wt_den_p6       ; mov rb11, ra3.8d
-+/* [0x00001d40] */ 0x8d827236, 0x1002531e, // sub rb_wt_off, r1, r0         ; mov ra_link, unif
-+/* [0x00001d48] */ 0x95080ff6, 0x1e024287, // mov ra10, rb_xshift2          ; mov rb7,  ra2.8d
-+// :1
-+/* [0x00001d50] */ 0x0d9d1bc0, 0xa00229e7, // sub.setf -, r5, rb_i_tmu      ; nop                           ; ldtmu0
-+/* [0x00001d58] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, ra_xshift         ; mov.ifz rb_base2, rb_base2_next
-+/* [0x00001d60] */ 0x8e4d05f6, 0xd0029851, // shr r1, r2, v_v_shift         ; mov.ifz ra_y_y2, ra_y_y2_next
-+/* [0x00001d68] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef      ; mov.ifz ra_base, ra_base_next
-+/* [0x00001d70] */ 0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y             ; mov r3, ra_y
-+/* [0x00001d78] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0             ; mov      r0, r1 << 15
-+/* [0x00001d80] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9               ; mov.ifnc r1, r2 << 1
-+/* [0x00001d88] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2               ; mul24 r3, r3, rb_pitch
-+/* [0x00001d90] */ 0x8c618cc7, 0x10024e20, // add t0s, ra_base, r3          ; v8min r0, r0, rb_pmask
-+/* [0x00001d98] */ 0x540183f0, 0x18024862, // and r1, r1, rb_pmask          ; mul24      r2, ra0.8a,       r0
-+/* [0x00001da0] */ 0x4d01feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2      ; mul24      r3, ra0.8d,       r1
-+/* [0x00001da8] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3                ; mul24      r3, ra0.8b << 2,  r0 << 2  @ "mul_used", 0
-+/* [0x00001db0] */ 0x40034031, 0xda0109e3, // nop                           ; mul24.ifn  r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
-+/* [0x00001db8] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3                ; mul24      r3, ra0.8c << 4,  r0 << 4  @ "mul_used", 0
-+/* [0x00001dc0] */ 0x40032031, 0xdc0109e3, // nop                           ; mul24.ifn  r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
-+/* [0x00001dc8] */ 0x4c0854fe, 0xb8025804, // add r0, r2, r3                ; mul24 ra4, rb5, ra2.8a        ; ldtmu1
-+/* [0x00001dd0] */ 0x8e2869bf, 0x10024885, // shr r2, r4, ra10              ; mov rb5, rb6
-+/* [0x00001dd8] */ 0x8e4505f6, 0xd2024863, // shr r1, r2, v_v_shift         ; mov r3, ra_y2
-+/* [0x00001de0] */ 0x8e1c21f6, 0xd00241c6, // shr ra7, r0, v_bit_depth - 8  ; mov rb6, ra7
-+/* [0x00001de8] */ 0x8c531789, 0xda124460, // add ra_y2, r3, ra_k1          ; mov      r0, r1 << 15
-+/* [0x00001df0] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0             ; mov.ifnc r1, r2 << 1
-+/* [0x00001df8] */ 0x925de7ce, 0x120248e1, // min r3, r3, rb_max_y          ; v8min r1, r1, ra_pmax
-+/* [0x00001e00] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2               ; mul24 r3, r3, rb_pitch
-+/* [0x00001e08] */ 0x8c5cfec6, 0x12024f20, // add t1s, rb_base2, r3         ; v8min r0, r0, ra_pmax
-+/* [0x00001e10] */ 0x4c041bf0, 0xd8025962, // add r5rep, r5, 1              ; mul24      r2, ra1.8a,       r0
-+/* [0x00001e18] */ 0x4d05feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2      ; mul24      r3, ra1.8d,       r1
-+/* [0x00001e20] */ 0x4d07e4f0, 0xda0248a3, // sub r2, r2, r3                ; mul24      r3, ra1.8b << 2,  r0 << 2  @ "mul_used", 0
-+/* [0x00001e28] */ 0x40074031, 0xda0109e3, // nop                           ; mul24.ifn  r3, ra1.8b << 12, r1 << 12 @ "mul_used", 0
-+/* [0x00001e30] */ 0x4c07c6b0, 0xdc0248a3, // add r2, r3, r2                ; mul24      r3, ra1.8c << 4,  r0 << 4  @ "mul_used", 0
-+/* [0x00001e38] */ 0x4c072b71, 0xdc0329e3, // add.setf -, r5, r5            ; mul24.ifn  r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
-+/* [0x00001e40] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x00001e48] */ 0x4c0c94fe, 0x180248a0, // add r2, r2, r3                ; mul24 r0, rb9,  ra3.8a
-+/* [0x00001e50] */ 0x550caffe, 0x1a025261, // mov rb9, rb10                 ; mul24 r1, rb10, ra3.8b
-+/* [0x00001e58] */ 0x8e2c25f6, 0xd00242ca, // shr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
-+/* [0x00001e60] */ 0x4d08523e, 0x1a0248a1, // sub r2, r1, r0                ; mul24 r1, rb5,  ra2.8b
-+/* [0x00001e68] */ 0x8d112bf6, 0x100269e0, // sub.setf -, r5, rb_lcount     ; mov r0, ra4
-+/* [0x00001e70] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0                ; mul24 r0, rb6,  ra2.8c
-+/* [0x00001e78] */ 0x4c1c7237, 0x10024860, // add r1, r1, r0                ; mul24 r0, ra7,  rb7
-+/* [0x00001e80] */ 0x4d0ca23e, 0x1c024860, // sub r1, r1, r0                ; mul24 r0, rb10, ra3.8c
-+/* [0x00001e88] */ 0x4c2cb437, 0x100248a0, // add r2, r2, r0                ; mul24 r0, ra11, rb11
-+/* [0x00001e90] */ 0x0d9e7400, 0x100208a7, // sub r2, r2, r0
-+/* [0x00001e98] */ 0x0e9c63c0, 0xd0020867, // shr r1, r1, 6
-+/* [0x00001ea0] */ 0x4e5865ce, 0xd20248a0, // shr r2, r2, 6                 ; mul24 r0, r1, ra_wt_mul_l0
-+/* [0x00001ea8] */ 0x4c4a7456, 0x120248a1, // add r2, r2, r1                ; mul24 r1, r2, ra_wt_mul_l1
-+/* [0x00001eb0] */ 0x4c667216, 0x14024862, // add r1, r1, r0                ; mul24 r2, r2, ra_kmul_add
-+/* [0x00001eb8] */ 0x8d5e72b6, 0x1c024863, // sub r1, r1, r2                ; mov r3, ra_blk_height
-+/* [0x00001ec0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off         ; v8subs r0, ra_height, r3
-+/* [0x00001ec8] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x00001ed0] */ 0x0f667380, 0x18020867, // asr r1, r1, ra_wt_den_p7
-+/* [0x00001ed8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax           ; mov -, vw_wait
-+/* [0x00001ee0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0            ; mul24 r2, r3, rb_pitch
-+/* [0x00001ee8] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0        ; mov vw_setup, ra_dma0
-+/* [0x00001ef0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
-+/* [0x00001ef8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3                ; mov vw_setup, rb_dma1
-+/* [0x00001f00] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3                ; mov vw_addr, ra_dest
-+/* [0x00001f08] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
-+/* [0x00001f10] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b
-+/* [0x00001f18] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
-+/* [0x00001f20] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
-+/* [0x00001f28] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init
-+// ::mc_sync10_q0
-+/* [0x00001f30] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00001f38] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00001f40] */ 0x00000010, 0xe80009e7, // mov  dst, sacq(i)
-+/* [0x00001f48] */ 0x00000010, 0xe80009e7, // mov  dst, sacq(i)
-+/* [0x00001f50] */ 0x00000010, 0xe80009e7, // mov  dst, sacq(i)
-+/* [0x00001f58] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00001f60] */ 0x0000001c, 0xe80009e7, // mov  dst, sacq(i)
-+/* [0x00001f68] */ 0x00000001, 0xe80009e7, // mov  dst, srel(i)
-+/* [0x00001f70] */ 0x0000000d, 0xe80009e7, // mov  dst, srel(i)
-+// ::mc_sync10_q1
-+/* [0x00001f78] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00001f80] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00001f88] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00001f90] */ 0x00000000, 0xe80009e7, // mov  dst, srel(i)
-+/* [0x00001f98] */ 0x00000011, 0xe80009e7, // mov  dst, sacq(i)
-+/* [0x00001fa0] */ 0x00000002, 0xe80009e7, // mov  dst, srel(i)
-+// ::mc_sync10_q2
-+/* [0x00001fa8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00001fb0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00001fb8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00001fc0] */ 0x00000000, 0xe80009e7, // mov  dst, srel(i)
-+/* [0x00001fc8] */ 0x00000012, 0xe80009e7, // mov  dst, sacq(i)
-+/* [0x00001fd0] */ 0x00000003, 0xe80009e7, // mov  dst, srel(i)
-+// ::mc_sync10_q3
-+/* [0x00001fd8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00001fe0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00001fe8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00001ff0] */ 0x00000000, 0xe80009e7, // mov  dst, srel(i)
-+/* [0x00001ff8] */ 0x00000013, 0xe80009e7, // mov  dst, sacq(i)
-+/* [0x00002000] */ 0x009e7000, 0x100009e7, // nop
-+// ::mc_sync10_q4
-+/* [0x00002008] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00002010] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00002018] */ 0x00000014, 0xe80009e7, // mov  dst, sacq(i)
-+/* [0x00002020] */ 0x00000014, 0xe80009e7, // mov  dst, sacq(i)
-+/* [0x00002028] */ 0x00000014, 0xe80009e7, // mov  dst, sacq(i)
-+/* [0x00002030] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00002038] */ 0x0000001d, 0xe80009e7, // mov  dst, sacq(i)
-+/* [0x00002040] */ 0x00000005, 0xe80009e7, // mov  dst, srel(i)
-+/* [0x00002048] */ 0x0000000e, 0xe80009e7, // mov  dst, srel(i)
-+// ::mc_sync10_q5
-+/* [0x00002050] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00002058] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00002060] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00002068] */ 0x00000004, 0xe80009e7, // mov  dst, srel(i)
-+/* [0x00002070] */ 0x00000015, 0xe80009e7, // mov  dst, sacq(i)
-+/* [0x00002078] */ 0x00000006, 0xe80009e7, // mov  dst, srel(i)
-+// ::mc_sync10_q6
-+/* [0x00002080] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00002088] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00002090] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00002098] */ 0x00000004, 0xe80009e7, // mov  dst, srel(i)
-+/* [0x000020a0] */ 0x00000016, 0xe80009e7, // mov  dst, sacq(i)
-+/* [0x000020a8] */ 0x00000007, 0xe80009e7, // mov  dst, srel(i)
-+// ::mc_sync10_q7
-+/* [0x000020b0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x000020b8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x000020c0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x000020c8] */ 0x00000004, 0xe80009e7, // mov  dst, srel(i)
-+/* [0x000020d0] */ 0x00000017, 0xe80009e7, // mov  dst, sacq(i)
-+/* [0x000020d8] */ 0x009e7000, 0x100009e7, // nop
-+// ::mc_sync10_q8
-+/* [0x000020e0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x000020e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x000020f0] */ 0x00000018, 0xe80009e7, // mov  dst, sacq(i)
-+/* [0x000020f8] */ 0x00000018, 0xe80009e7, // mov  dst, sacq(i)
-+/* [0x00002100] */ 0x00000018, 0xe80009e7, // mov  dst, sacq(i)
-+/* [0x00002108] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00002110] */ 0x0000001e, 0xe80009e7, // mov  dst, sacq(i)
-+/* [0x00002118] */ 0x00000009, 0xe80009e7, // mov  dst, srel(i)
-+/* [0x00002120] */ 0x0000000c, 0xe80009e7, // mov  dst, srel(i)
-+// ::mc_sync10_q9
-+/* [0x00002128] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00002130] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00002138] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00002140] */ 0x00000008, 0xe80009e7, // mov  dst, srel(i)
-+/* [0x00002148] */ 0x00000019, 0xe80009e7, // mov  dst, sacq(i)
-+/* [0x00002150] */ 0x0000000a, 0xe80009e7, // mov  dst, srel(i)
-+// ::mc_sync10_q10
-+/* [0x00002158] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00002160] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00002168] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00002170] */ 0x00000008, 0xe80009e7, // mov  dst, srel(i)
-+/* [0x00002178] */ 0x0000001a, 0xe80009e7, // mov  dst, sacq(i)
-+/* [0x00002180] */ 0x0000000b, 0xe80009e7, // mov  dst, srel(i)
-+// ::mc_sync10_q11
-+/* [0x00002188] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00002190] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00002198] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x000021a0] */ 0x00000008, 0xe80009e7, // mov  dst, srel(i)
-+/* [0x000021a8] */ 0x0000001b, 0xe80009e7, // mov  dst, sacq(i)
-+/* [0x000021b0] */ 0x009e7000, 0x100009e7, // nop
-+// ::mc_exit_c10_q0
-+// ::mc_exit_y10_q0
-+/* [0x000021b8] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1
-+// :1
-+/* [0x000021c0] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b
-+/* [0x000021c8] */ 0x009e7000, 0xa00009e7, // nop                   ; nop           ; ldtmu0
-+/* [0x000021d0] */ 0x009e7000, 0xb00009e7, // nop                   ; nop           ; ldtmu1
-+/* [0x000021d8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
-+/* [0x000021e0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x000021e8] */ 0x0000001c, 0xe80009e7, // mov  dst, sacq(i)
-+/* [0x000021f0] */ 0x009e7000, 0x300009e7, // nop                   ; nop           ; thrend
-+/* [0x000021f8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1
-+/* [0x00002200] */ 0x009e7000, 0x100009e7, // nop
-+// ::mc_exit_c10_qn
-+// ::mc_exit_y10_qn
-+/* [0x00002208] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1
-+// :1
-+/* [0x00002210] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b
-+/* [0x00002218] */ 0x009e7000, 0xa00009e7, // nop                   ; nop           ; ldtmu0
-+/* [0x00002220] */ 0x009e7000, 0xb00009e7, // nop                   ; nop           ; ldtmu1
-+/* [0x00002228] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
-+/* [0x00002230] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x00002238] */ 0x009e7000, 0x300009e7, // nop                   ; nop           ; thrend
-+/* [0x00002240] */ 0x009e7000, 0x100009e7, // nop
-+/* [0x00002248] */ 0x009e7000, 0x100009e7, // nop
-+// ::mc_setup_y10_q0
-+/* [0x00002250] */ 0x0000000c, 0xe80009e7, // mov  dst, srel(i)
-+// ::mc_setup_y10_qn
-+/* [0x00002258] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1                  ; mov ra0, unif
-+/* [0x00002260] */ 0x15827d80, 0x10020267, // mov ra9, unif
-+/* [0x00002268] */ 0x15827d80, 0x10020067, // mov ra1, unif
-+/* [0x00002270] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
-+/* [0x00002278] */ 0x9181e1f6, 0xd00250cb, // shl rb_ef, r0, i_shift30      ; mov ra11, unif
-+/* [0x00002280] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100
-+/* [0x00002288] */ 0x0000ffff, 0xe0021627, // mov rb_pmask, v_pmask
-+/* [0x00002290] */ 0x000803ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
-+/* [0x00002298] */ 0x00010000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8))
-+/* [0x000022a0] */ 0x4000000c, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth)
-+/* [0x000022a8] */ 0x050b0a00, 0xe0021567, // mov rb_y_coeffs_2, 0x050b0a00
-+/* [0x000022b0] */ 0x11283a40, 0xe00215a7, // mov rb_y_coeffs_3, 0x11283a40
-+/* [0x000022b8] */ 0x0a0b0500, 0xe00215e7, // mov rb_y_coeffs_5, 0x0a0b0500
-+/* [0x000022c0] */ 0x15827d80, 0x100200e7, // mov ra3, unif
-+/* [0x000022c8] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef              ; mov rb_xpitch, unif
-+/* [0x000022d0] */ 0x0d0c1dc0, 0xd4020827, // sub r0, ra3.16b, 1
-+/* [0x000022d8] */ 0x119c11c0, 0xd00216a7, // shl rb_max_x, r0, v_x_shift
-+/* [0x000022e0] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1
-+/* [0x000022e8] */ 0x959a0dbf, 0x100248d0, // mov r3, elem_num              ; mov rb_pitch, unif
-+/* [0x000022f0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-+/* [0x000022f8] */ 0x159d03c0, 0x10021667, // or  rb_dma1_base, r1, rb_pitch
-+/* [0x00002300] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3
-+/* [0x00002308] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift
-+/* [0x00002310] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
-+/* [0x00002318] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
-+/* [0x00002320] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
-+/* [0x00002328] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4                ; v8subs r2, r2, r2
-+/* [0x00002330] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch
-+/* [0x00002338] */ 0x149e7080, 0x10020867, // and r1, r0, r2
-+/* [0x00002340] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
-+/* [0x00002348] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00002350] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0
-+/* [0x00002358] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3
-+/* [0x00002360] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift
-+/* [0x00002368] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
-+/* [0x00002370] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
-+/* [0x00002378] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
-+/* [0x00002380] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
-+/* [0x00002388] */ 0x149e7080, 0x10020867, // and r1, r0, r2
-+/* [0x00002390] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
-+/* [0x00002398] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x000023a0] */ 0x0c2e7c00, 0x100213e7, // add rb_base2, ra11, r0
-+/* [0x000023a8] */ 0x80027036, 0x120049e0, // nop                           ; mov r0, ra0.16a
-+/* [0x000023b0] */ 0x95043ff6, 0xd20248e2, // mov r3, PREREAD               ; mov r2, ra1.16a
-+// :1
-+/* [0x000023b8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
-+/* [0x000023c0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0
-+/* [0x000023c8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
-+/* [0x000023d0] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1             ; mul24 r1, r1, rb_pitch
-+/* [0x000023d8] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1          ; mov ra_y, r0
-+/* [0x000023e0] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0
-+/* [0x000023e8] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b
-+/* [0x000023f0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
-+/* [0x000023f8] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1             ; mul24 r1, r1, rb_pitch
-+/* [0x00002400] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1         ; mov ra_y2, r2
-+/* [0x00002408] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+/* [0x00002410] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1
-+/* [0x00002418] */ 0x119c43c0, 0xd0020867, // shl r1, r1, 4
-+/* [0x00002420] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1
-+/* [0x00002428] */ 0x159e7040, 0x10020827, // or  r0, r0, r1
-+/* [0x00002430] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
-+/* [0x00002438] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
-+/* [0x00002440] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0))
-+/* [0x00002448] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6
-+/* [0x00002450] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
-+/* [0x00002458] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00002460] */ 0x00000000, 0xe0024208, // mov ra8,  0                   ; mov rb8,  0
-+/* [0x00002468] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00002470] */ 0x00000000, 0xe0024249, // mov ra9,  0                   ; mov rb9,  0
-+/* [0x00002478] */ 0x00000000, 0xe002428a, // mov ra10, 0                   ; mov rb10, 0
-+/* [0x00002480] */ 0x00000000, 0xe00242cb, // mov ra11, 0                   ; mov rb11, 0
-+// :per_block_setup_10
-+/* [0x00002488] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift
-+/* [0x00002490] */ 0x93567176, 0x14125815, // max r0, r0, r5                ; mov ra_xshift, ra_xshift_next
-+/* [0x00002498] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
-+/* [0x000024a0] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
-+/* [0x000024a8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
-+/* [0x000024b0] */ 0x8d810bf6, 0x1002589a, // sub r2, r5, rb_pitch          ; mov ra_base_next, unif
-+/* [0x000024b8] */ 0x940270b6, 0x12225853, // and r1, r0, r2                ; mov ra_y_next, ra0.16a
-+/* [0x000024c0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
-+/* [0x000024c8] */ 0x8c827076, 0x10025801, // add r0, r0, r1                ; mov ra1, unif
-+/* [0x000024d0] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0
-+/* [0x000024d8] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3
-+/* [0x000024e0] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift
-+/* [0x000024e8] */ 0x93067176, 0x12125813, // max r0, r0, r5                ; mov ra_y2_next, ra1.16a
-+/* [0x000024f0] */ 0x9281a1f6, 0x10024813, // min r0, r0, rb_max_x          ; mov rb_base2_next, unif
-+/* [0x000024f8] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
-+/* [0x00002500] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4                ; mov ra_width_height, unif
-+/* [0x00002508] */ 0x949dc0bf, 0x10024871, // and r1, r0, r2                ; mov vw_setup, rb_vpm_init
-+/* [0x00002510] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
-+/* [0x00002518] */ 0x4c402077, 0xd4024821, // add r0, r0, r1                ; mul24 r1, ra_width, v_x_mul
-+/* [0x00002520] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0
-+/* [0x00002528] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
-+/* [0x00002530] */ 0x8c5dc1c6, 0xdc025460, // add rb_i_tmu, r0, (7-8) - PREREAD ; v8min r0, r0, ra_blk_height
-+/* [0x00002538] */ 0x0c9df1c0, 0xd00214a7, // add rb_lcount, r0, (7-8)
-+/* [0x00002540] */ 0x916481f6, 0xd4024823, // shl r0, r0, v_dma_h_shift     ; mov r3, ra_kmul_add
-+/* [0x00002548] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00002550] */ 0x9164f1f6, 0xd4024822, // shl r0, r0, v_dma_wh_shift    ; mov r2, ra_fir_off_val
-+/* [0x00002558] */ 0x8c81b1f6, 0x100246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, unif
-+/* [0x00002560] */ 0x918101f6, 0xd00a5816, // shl.ifnn r0, r0, i_shift16    ; mov ra_wt_off_mul_l0, unif
-+/* [0x00002568] */ 0x915031f6, 0xde024205, // shl ra8, r0, 3                ; mov rb5, ra_k255
-+/* [0x00002570] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400
-+/* [0x00002578] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d
-+/* [0x00002580] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c
-+/* [0x00002588] */ 0x10215f80, 0x1e6200a7, // ror ra2.8c, rb_y_coeffs_2, ra8.8d
-+/* [0x00002590] */ 0x10215f80, 0x1c620027, // ror ra0.8c, rb_y_coeffs_2, ra8.8c
-+/* [0x00002598] */ 0x00010100, 0xe0020867, // mov r1,0x00010100
-+/* [0x000025a0] */ 0x902203bf, 0x1e025812, // ror r0, r1, ra8.8d            ; mov ra_wt_off_mul_l1, unif
-+/* [0x000025a8] */ 0x90205387, 0x1c424004, // ror ra0.8a, r1, ra8.8c        ; v8min rb4, r0, rb5
-+/* [0x000025b0] */ 0x914883f6, 0xd0031856, // shl r1, r1, 8                 ; mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1
-+/* [0x000025b8] */ 0x902203bf, 0x1e02581c, // ror r0, r1, ra8.8d            ; mov ra_dest, unif
-+/* [0x000025c0] */ 0x90205387, 0x1c72404b, // ror ra1.8d, r1, ra8.8c        ; v8min rb11, r0, rb5
-+/* [0x000025c8] */ 0x10216f80, 0x1e7200a7, // ror ra2.8d, rb_y_coeffs_3, ra8.8d
-+/* [0x000025d0] */ 0x10216f80, 0x1c720027, // ror ra0.8d, rb_y_coeffs_3, ra8.8c
-+/* [0x000025d8] */ 0x10217f80, 0x1e5200e7, // ror ra3.8b, rb_y_coeffs_5, ra8.8d
-+/* [0x000025e0] */ 0x10217f80, 0x1c520067, // ror ra1.8b, rb_y_coeffs_5, ra8.8c
-+/* [0x000025e8] */ 0x04040100, 0xe0020867, // mov r1,0x04040100
-+/* [0x000025f0] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d
-+/* [0x000025f8] */ 0x902183bf, 0xdc624065, // ror ra1.8c, r1, ra8.8c        ; mov r5rep, -8
-+/* [0x00002600] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00002608] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100
-+/* [0x00002610] */ 0x902203bf, 0x1e02581e, // ror r0, r1, ra8.8d            ; mov ra_link, unif
-+/* [0x00002618] */ 0x90205387, 0x1c424048, // ror ra1.8a, r1, ra8.8c        ; v8min rb8, r0, rb5
-+// ::mc_filter_y10_pxx
-+/* [0x00002620] */ 0xfffffe48, 0xf0f807a7, // brr ra_link, r:per_block_setup_10
-+/* [0x00002628] */ 0x959a0ff6, 0x10024023, // mov ra0, unif                 ; mov r3, elem_num
-+/* [0x00002630] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef      ; v8subs r5rep, r2, r2
-+/* [0x00002638] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3           ; mov rb_xshift2, rb_xshift2_next
-+/* [0x00002640] */ 0x1158adc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p5
-+/* [0x00002648] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0
-+/* [0x00002650] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0         ; mov ra_ef.8a, rb4
-+// :1
-+/* [0x00002658] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef      ; mul24 ra4, rb5, ra_ef
-+/* [0x00002660] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0               ; mov r1, 0
-+/* [0x00002668] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y          ; mov r3, ra_k1
-+/* [0x00002670] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3            ; mul24 r2, r2, rb_pitch        ; ldtmu0
-+/* [0x00002678] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2          ; mov rb5,  rb6
-+/* [0x00002680] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift         ; mov rb6,  rb7
-+/* [0x00002688] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1             ; v8min r0, r0, rb_pmask        ; ldtmu1
-+/* [0x00002690] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2        ; mov rb7, ra8
-+/* [0x00002698] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y          ; v8min r1, r1, ra_pmax
-+/* [0x000026a0] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3          ; mul24 r2, r2, rb_pitch
-+/* [0x000026a8] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2         ; mov ra8,  ra9
-+/* [0x000026b0] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3     ; mul24      r2, ra0.8a << 8,  r1 << 8  @ "mul_used", 0
-+/* [0x000026b8] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h  ; mul24.ifnn r2, ra0.8a,       r0
-+/* [0x000026c0] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2        ; mul24      r3, ra0.8b << 1,  r0 << 1  @ "mul_used", 0
-+/* [0x000026c8] */ 0x40037031, 0xda0109e3, // nop                   ; mul24.ifn  r3, ra0.8b << 9,  r1 << 9  @ "mul_used", 0
-+/* [0x000026d0] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3        ; mul24      r3, ra0.8c << 2,  r0 << 2  @ "mul_used", 0
-+/* [0x000026d8] */ 0x40036031, 0xdc0109e3, // nop                   ; mul24.ifn  r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
-+/* [0x000026e0] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3        ; mul24      r3, ra0.8d << 3,  r0 << 3  @ "mul_used", 0
-+/* [0x000026e8] */ 0x40035031, 0xde0109e3, // nop                   ; mul24.ifn  r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
-+/* [0x000026f0] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3        ; mul24      r3, ra1.8a << 4,  r0 << 4  @ "mul_used", 0
-+/* [0x000026f8] */ 0x40074031, 0xd80109e3, // nop                   ; mul24.ifn  r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
-+/* [0x00002700] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3        ; mul24      r3, ra1.8b << 5,  r0 << 5  @ "mul_used", 0
-+/* [0x00002708] */ 0x40073031, 0xda0109e3, // nop                   ; mul24.ifn  r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
-+/* [0x00002710] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3        ; mul24      r3, ra1.8c << 6,  r0 << 6  @ "mul_used", 0
-+/* [0x00002718] */ 0x40072031, 0xdc0109e3, // nop                   ; mul24.ifn  r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
-+/* [0x00002720] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3        ; mul24      r3, ra1.8d << 7,  r0 << 7  @ "mul_used", 0
-+/* [0x00002728] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5    ; mul24.ifn  r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
-+/* [0x00002730] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x00002738] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3                ; mul24 r1, rb5,  ra2.8b
-+/* [0x00002740] */ 0x550caffe, 0x1a024260, // mov ra9,  rb10                ; mul24 r0, rb10, ra3.8b
-+/* [0x00002748] */ 0x8f2c25f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
-+/* [0x00002750] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0                ; mul24 r0, rb6,  ra2.8c
-+/* [0x00002758] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0                ; mul24 r0, rb7,  ra2.8d
-+/* [0x00002760] */ 0x4c208237, 0x10024860, // add r1, r1, r0                ; mul24 r0, ra8,  rb8
-+/* [0x00002768] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0                ; mul24 r0, rb10, ra3.8c
-+/* [0x00002770] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0                ; mul24 r0, ra11, rb11
-+/* [0x00002778] */ 0x8d5d1bf6, 0x1c0269e3, // sub.setf -, r5, rb_i_tmu      ; mov r3, ra_blk_height
-+/* [0x00002780] */ 0x8d1133bf, 0x1002884f, // sub r1, r1, ra4               ; mov.ifz rb_base2, rb_base2_next
-+/* [0x00002788] */ 0x8d6a7236, 0x10029858, // sub r1, r1, r0                ; mov.ifz ra_base, ra_base_next
-+/* [0x00002790] */ 0x8f4c63f6, 0xd0029851, // asr r1, r1, 6                 ; mov.ifz ra_y_y2, ra_y_y2_next
-+/* [0x00002798] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount     ; mul24 r0, r1, ra_wt_mul_l0
-+/* [0x000027a0] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off         ; mul24 r1, r1, ra_kmul_add
-+/* [0x000027a8] */ 0xed427073, 0x12024860, // sub r1, r0, r1                ; v8subs r0, ra_height, r3
-+/* [0x000027b0] */ 0xfffffe88, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x000027b8] */ 0x0f9cb3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6
-+/* [0x000027c0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax           ; mov -, vw_wait
-+/* [0x000027c8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0            ; mul24 r2, r3, rb_pitch
-+/* [0x000027d0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0        ; mov vw_setup, ra_dma0
-+/* [0x000027d8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
-+/* [0x000027e0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3                ; mov vw_setup, rb_dma1
-+/* [0x000027e8] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3                ; mov vw_addr, ra_dest
-+/* [0x000027f0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
-+/* [0x000027f8] */ 0xfffffe40, 0xf0f809e7, // brr -, r:1b
-+/* [0x00002800] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
-+/* [0x00002808] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
-+/* [0x00002810] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init
-+// ::mc_filter_y10_p00
-+/* [0x00002818] */ 0x959a0ff6, 0x10024020, // mov ra0, unif                 ; mov r0, elem_num
-+/* [0x00002820] */ 0xf5567dad, 0x14124565, // mov ra_xshift, ra_xshift_next ; v8subs r5rep, r5, r5
-+/* [0x00002828] */ 0x8c020c3f, 0x1402581a, // add r0, ra0.16b, r0           ; mov ra_base_next, unif
-+/* [0x00002830] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift
-+/* [0x00002838] */ 0x93027176, 0x12225813, // max r0, r0, r5                ; mov ra_y_next, ra0.16a
-+/* [0x00002840] */ 0x9281a1f6, 0x10025810, // min r0, r0, rb_max_x          ; mov ra_width_height, unif
-+/* [0x00002848] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
-+/* [0x00002850] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
-+/* [0x00002858] */ 0x8d810bf6, 0x10025896, // sub r2, r5, rb_pitch          ; mov ra_wt_off_mul_l0, unif
-+/* [0x00002860] */ 0x149e7080, 0x10020867, // and r1, r0, r2
-+/* [0x00002868] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
-+/* [0x00002870] */ 0x8c827076, 0x1002581c, // add r0, r0, r1                ; mov ra_dest, unif
-+/* [0x00002878] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init
-+/* [0x00002880] */ 0x11401dc0, 0xd4020867, // shl r1, ra_width, v_x_shift
-+/* [0x00002888] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
-+/* [0x00002890] */ 0x8d5c31c6, 0xdc025460, // sub rb_i_tmu, r0, PREREAD     ; v8min r0, r0, ra_blk_height
-+/* [0x00002898] */ 0x919c81c0, 0xd0024812, // shl r0, r0, v_dma_h_shift     ; mov rb_lcount, r0
-+/* [0x000028a0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x000028a8] */ 0x1158edc0, 0xd4021327, // shl rb_wt_off, ra_wt_off_l0, DENOM + 7
-+/* [0x000028b0] */ 0x9180f1f6, 0xd002581e, // shl r0, r0, v_dma_wh_shift    ; mov ra_link, unif
-+/* [0x000028b8] */ 0x0c9db1c0, 0x100206e7, // add ra_dma0, r0, rb_dma0_base
-+// :1
-+/* [0x000028c0] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu      ; v8adds r5rep, r5, ra_k1
-+/* [0x000028c8] */ 0x804e7036, 0xa42099d1, // nop                           ; mov.ifz ra_y, ra_y_next       ; ldtmu0
-+/* [0x000028d0] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift         ; mov r3, rb_pitch
-+/* [0x000028d8] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
-+/* [0x000028e0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y          ; mov.ifz ra_base, ra_base_next
-+/* [0x000028e8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1             ; mul24 r2, r2, r3
-+/* [0x000028f0] */ 0x8c618c87, 0x10024e20, // add t0s, ra_base, r2          ; v8min r0, r0, rb_pmask
-+/* [0x000028f8] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount     ; mul24 r1, r0, ra_wt_mul_l0
-+/* [0x00002900] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8                 ; mov r3, ra_blk_height
-+/* [0x00002908] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off         ; v8subs r0, ra_height, r3
-+/* [0x00002910] */ 0xffffff90, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x00002918] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, DENOM + 8
-+/* [0x00002920] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax           ; mov -, vw_wait
-+/* [0x00002928] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0            ; mul24 r2, r3, rb_pitch
-+/* [0x00002930] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
-+/* [0x00002938] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
-+/* [0x00002940] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3        ; mov vw_setup, rb_dma1
-+/* [0x00002948] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3        ; mov vw_addr, ra_dest
-+/* [0x00002950] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
-+/* [0x00002958] */ 0xffffff48, 0xf0f809e7, // brr -, r:1b
-+/* [0x00002960] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
-+/* [0x00002968] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
-+/* [0x00002970] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init
-+// ::mc_filter_y10_bxx
-+/* [0x00002978] */ 0xfffffaf0, 0xf0f807a7, // brr ra_link, r:per_block_setup_10
-+/* [0x00002980] */ 0x959a0ff6, 0x10024023, // mov ra0, unif                 ; mov r3, elem_num
-+/* [0x00002988] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef      ; v8subs r5rep, r2, r2
-+/* [0x00002990] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3           ; mov rb_xshift2, rb_xshift2_next
-+/* [0x00002998] */ 0x1158bdc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p6
-+/* [0x000029a0] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0
-+/* [0x000029a8] */ 0x4d4a7216, 0x12024860, // sub r1, r1, r0                ; mul24 r0, r2, ra_wt_mul_l1
-+/* [0x000029b0] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0         ; mov ra_ef.8a, rb4
-+// :1
-+/* [0x000029b8] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef      ; mul24 ra4, rb5, ra_ef
-+/* [0x000029c0] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0               ; mov r1, 0
-+/* [0x000029c8] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y          ; mov r3, ra_k1
-+/* [0x000029d0] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3            ; mul24 r2, r2, rb_pitch        ; ldtmu0
-+/* [0x000029d8] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2          ; mov rb5,  rb6
-+/* [0x000029e0] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift         ; mov rb6,  rb7
-+/* [0x000029e8] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1             ; v8min r0, r0, rb_pmask        ; ldtmu1
-+/* [0x000029f0] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2        ; mov rb7, ra8
-+/* [0x000029f8] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y          ; v8min r1, r1, ra_pmax
-+/* [0x00002a00] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3          ; mul24 r2, r2, rb_pitch
-+/* [0x00002a08] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2         ; mov ra8,  ra9
-+/* [0x00002a10] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3     ; mul24      r2, ra0.8a << 8,  r1 << 8  @ "mul_used", 0
-+/* [0x00002a18] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h  ; mul24.ifnn r2, ra0.8a,       r0
-+/* [0x00002a20] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2        ; mul24      r3, ra0.8b << 1,  r0 << 1  @ "mul_used", 0
-+/* [0x00002a28] */ 0x40037031, 0xda0109e3, // nop                   ; mul24.ifn  r3, ra0.8b << 9,  r1 << 9  @ "mul_used", 0
-+/* [0x00002a30] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3        ; mul24      r3, ra0.8c << 2,  r0 << 2  @ "mul_used", 0
-+/* [0x00002a38] */ 0x40036031, 0xdc0109e3, // nop                   ; mul24.ifn  r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
-+/* [0x00002a40] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3        ; mul24      r3, ra0.8d << 3,  r0 << 3  @ "mul_used", 0
-+/* [0x00002a48] */ 0x40035031, 0xde0109e3, // nop                   ; mul24.ifn  r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
-+/* [0x00002a50] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3        ; mul24      r3, ra1.8a << 4,  r0 << 4  @ "mul_used", 0
-+/* [0x00002a58] */ 0x40074031, 0xd80109e3, // nop                   ; mul24.ifn  r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
-+/* [0x00002a60] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3        ; mul24      r3, ra1.8b << 5,  r0 << 5  @ "mul_used", 0
-+/* [0x00002a68] */ 0x40073031, 0xda0109e3, // nop                   ; mul24.ifn  r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
-+/* [0x00002a70] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3        ; mul24      r3, ra1.8c << 6,  r0 << 6  @ "mul_used", 0
-+/* [0x00002a78] */ 0x40072031, 0xdc0109e3, // nop                   ; mul24.ifn  r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
-+/* [0x00002a80] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3        ; mul24      r3, ra1.8d << 7,  r0 << 7  @ "mul_used", 0
-+/* [0x00002a88] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5    ; mul24.ifn  r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
-+/* [0x00002a90] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x00002a98] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3                ; mul24 r1, rb5,  ra2.8b
-+/* [0x00002aa0] */ 0x550caffe, 0x1a024260, // mov ra9,  rb10                ; mul24 r0, rb10, ra3.8b
-+/* [0x00002aa8] */ 0x8f2c25f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
-+/* [0x00002ab0] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0                ; mul24 r0, rb6,  ra2.8c
-+/* [0x00002ab8] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0                ; mul24 r0, rb7,  ra2.8d
-+/* [0x00002ac0] */ 0x4c208237, 0x10024860, // add r1, r1, r0                ; mul24 r0, ra8,  rb8
-+/* [0x00002ac8] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0                ; mul24 r0, rb10, ra3.8c
-+/* [0x00002ad0] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0                ; mul24 r0, ra11, rb11
-+/* [0x00002ad8] */ 0x0d127380, 0x10020867, // sub r1, r1, ra4
-+/* [0x00002ae0] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0                ; mov r2, rb_wt_off
-+/* [0x00002ae8] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-+/* [0x00002af0] */ 0x4d591bce, 0x120269e0, // sub.setf -, r5, rb_i_tmu      ; mul24 r0, r1, ra_wt_mul_l0
-+/* [0x00002af8] */ 0x55653fce, 0x140453e1, // mov.ifz rb_base2, rb_base2_next ; mul24 r1, r1, ra_kmul_add
-+/* [0x00002b00] */ 0x8d4e7076, 0x10029851, // sub r1, r0, r1                ; mov.ifz ra_y_y2, ra_y_y2_next
-+/* [0x00002b08] */ 0x8d692bf6, 0x1002b9d8, // sub.setf -, r5, rb_lcount     ; mov.ifz ra_base, ra_base_next
-+/* [0x00002b10] */ 0x8c9f8289, 0xd0024860, // add r1, r1, r2                ; mov r0, r1 << 8
-+/* [0x00002b18] */ 0x8c5e7236, 0x1c024863, // add r1, r1, r0                ; mov r3, ra_blk_height
-+/* [0x00002b20] */ 0xfffffe78, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x00002b28] */ 0x4f65039f, 0x18024862, // asr r1, r1, ra_wt_den_p7      ; mul24 r2, r3, rb_pitch
-+/* [0x00002b30] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax           ; mov -, vw_wait
-+/* [0x00002b38] */ 0xf34003f3, 0xd2024c20, // max vpm, r1, 0                ; v8subs r0, ra_height, r3
-+/* [0x00002b40] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0        ; mov vw_setup, ra_dma0
-+/* [0x00002b48] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
-+/* [0x00002b50] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3                ; mov vw_setup, rb_dma1
-+/* [0x00002b58] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3                ; mov vw_addr, ra_dest
-+/* [0x00002b60] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
-+/* [0x00002b68] */ 0xfffffe30, 0xf0f809e7, // brr -, r:1b
-+/* [0x00002b70] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
-+/* [0x00002b78] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
-+/* [0x00002b80] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init
-+// ::mc_filter_y10_b00
-+/* [0x00002b88] */ 0xfffff8e0, 0xf0f807a7, // brr ra_link, r:per_block_setup_10
-+/* [0x00002b90] */ 0x959a0ff6, 0x10024023, // mov ra0, unif                 ; mov r3, elem_num
-+/* [0x00002b98] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef      ; v8subs r5rep, r2, r2
-+/* [0x00002ba0] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3           ; mov rb_xshift2, rb_xshift2_next
-+/* [0x00002ba8] */ 0x00000001, 0xe00208a7, // mov r2, 1
-+/* [0x00002bb0] */ 0x8c591eb6, 0x10025461, // add rb_i_tmu, rb_i_tmu, r2    ; mov r1, ra_wt_off_mul_l0
-+/* [0x00002bb8] */ 0xf158fded, 0xd4025325, // shl rb_wt_off, ra_wt_off_l0, DENOM + 8 ; v8subs r5quad, r5, r5
-+/* [0x00002bc0] */ 0x809f8009, 0xd000d9d6, // nop                           ; mov.ifnz ra_wt_off_mul_l0, r1 << 8
-+// :1
-+/* [0x00002bc8] */ 0x0d9d1bc0, 0xb00229e7, // sub.setf -, r5, rb_i_tmu      ; nop                           ; ldtmu1
-+/* [0x00002bd0] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2        ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
-+/* [0x00002bd8] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift         ; mov r3, rb_pitch
-+/* [0x00002be0] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
-+/* [0x00002be8] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y          ; mov.ifz ra_base, ra_base_next
-+/* [0x00002bf0] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1             ; mul24 r2, r2, r3
-+/* [0x00002bf8] */ 0x8c613cbf, 0x10028e0f, // add t0s, ra_base, r2          ; mov.ifz rb_base2, rb_base2_next
-+/* [0x00002c00] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0
-+/* [0x00002c08] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y
-+/* [0x00002c10] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1           ; mul24 r2, r2, r3
-+/* [0x00002c18] */ 0x8c5cfe86, 0x12024f20, // add t1s, rb_base2, r2         ; v8min r0, r0, ra_pmax
-+/* [0x00002c20] */ 0x545983c6, 0x12024860, // and r1, r1, rb_pmask          ; mul24 r0, r0, ra_wt_mul_l0
-+/* [0x00002c28] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount     ; mul24 r1, r1, ra_wt_mul_l1
-+/* [0x00002c30] */ 0xcc52706e, 0x1a024865, // add r1, r0, r1                ; v8adds r5rep, r5, ra_k1
-+/* [0x00002c38] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8                 ; mov r3, ra_blk_height
-+/* [0x00002c40] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off         ; v8subs r0, ra_height, r3
-+/* [0x00002c48] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x00002c50] */ 0x0f9d03c0, 0xd0020867, // asr r1, r1, (DENOM + 9) - 32
-+/* [0x00002c58] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax           ; mov -, vw_wait
-+/* [0x00002c60] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0            ; mul24 r2, r3, rb_pitch
-+/* [0x00002c68] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0        ; mov vw_setup, ra_dma0
-+/* [0x00002c70] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
-+/* [0x00002c78] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3                ; mov vw_setup, rb_dma1
-+/* [0x00002c80] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3                ; mov vw_addr, ra_dest
-+/* [0x00002c88] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
-+/* [0x00002c90] */ 0xffffff18, 0xf0f809e7, // brr -, r:1b
-+/* [0x00002c98] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
-+/* [0x00002ca0] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
-+/* [0x00002ca8] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init
-+// ::mc_end
-+};
-+#ifdef __HIGHC__
-+#pragma Align_to(8, ff_hevc_rpi_shader)
-+#endif
---- /dev/null
-+++ b/libavcodec/rpi_hevc_shader.h
-@@ -0,0 +1,63 @@
-+#ifndef rpi_hevc_shader_H
-+#define rpi_hevc_shader_H
-+
-+extern unsigned int ff_hevc_rpi_shader[]; // entry points below are offsets in 32-bit words (each shader instruction is two words)
-+
-+#define mc_setup_c_q0 (ff_hevc_rpi_shader + 0)
-+#define mc_start (ff_hevc_rpi_shader + 0)
-+#define mc_setup_c_qn (ff_hevc_rpi_shader + 2)
-+#define mc_filter_c_p (ff_hevc_rpi_shader + 134)
-+#define mc_filter_c_p_l1 (ff_hevc_rpi_shader + 260)
-+#define mc_filter_c_b (ff_hevc_rpi_shader + 386)
-+#define mc_sync_q0 (ff_hevc_rpi_shader + 580)
-+#define mc_sync_q1 (ff_hevc_rpi_shader + 598)
-+#define mc_sync_q2 (ff_hevc_rpi_shader + 610)
-+#define mc_sync_q3 (ff_hevc_rpi_shader + 622)
-+#define mc_sync_q4 (ff_hevc_rpi_shader + 634)
-+#define mc_sync_q5 (ff_hevc_rpi_shader + 652)
-+#define mc_sync_q6 (ff_hevc_rpi_shader + 664)
-+#define mc_sync_q7 (ff_hevc_rpi_shader + 676)
-+#define mc_sync_q8 (ff_hevc_rpi_shader + 688)
-+#define mc_sync_q9 (ff_hevc_rpi_shader + 706)
-+#define mc_sync_q10 (ff_hevc_rpi_shader + 718)
-+#define mc_sync_q11 (ff_hevc_rpi_shader + 730)
-+#define mc_exit_c_qn (ff_hevc_rpi_shader + 742)
-+#define mc_exit_y_qn (ff_hevc_rpi_shader + 742)
-+#define mc_exit_c_q0 (ff_hevc_rpi_shader + 760)
-+#define mc_exit_y_q0 (ff_hevc_rpi_shader + 760)
-+#define mc_setup_y_q0 (ff_hevc_rpi_shader + 780)
-+#define mc_setup_y_qn (ff_hevc_rpi_shader + 782)
-+#define mc_filter_y_pxx (ff_hevc_rpi_shader + 1014)
-+#define mc_filter_y_bxx (ff_hevc_rpi_shader + 1140)
-+#define mc_filter_y_p00 (ff_hevc_rpi_shader + 1272)
-+#define mc_filter_y_b00 (ff_hevc_rpi_shader + 1358)
-+#define mc_setup_c10_q0 (ff_hevc_rpi_shader + 1432)
-+#define mc_setup_c10_qn (ff_hevc_rpi_shader + 1434)
-+#define mc_filter_c10_p (ff_hevc_rpi_shader + 1562)
-+#define mc_filter_c10_p_l1 (ff_hevc_rpi_shader + 1684)
-+#define mc_filter_c10_b (ff_hevc_rpi_shader + 1806)
-+#define mc_sync10_q0 (ff_hevc_rpi_shader + 1996)
-+#define mc_sync10_q1 (ff_hevc_rpi_shader + 2014)
-+#define mc_sync10_q2 (ff_hevc_rpi_shader + 2026)
-+#define mc_sync10_q3 (ff_hevc_rpi_shader + 2038)
-+#define mc_sync10_q4 (ff_hevc_rpi_shader + 2050)
-+#define mc_sync10_q5 (ff_hevc_rpi_shader + 2068)
-+#define mc_sync10_q6 (ff_hevc_rpi_shader + 2080)
-+#define mc_sync10_q7 (ff_hevc_rpi_shader + 2092)
-+#define mc_sync10_q8 (ff_hevc_rpi_shader + 2104)
-+#define mc_sync10_q9 (ff_hevc_rpi_shader + 2122)
-+#define mc_sync10_q10 (ff_hevc_rpi_shader + 2134)
-+#define mc_sync10_q11 (ff_hevc_rpi_shader + 2146)
-+#define mc_exit_c10_q0 (ff_hevc_rpi_shader + 2158)
-+#define mc_exit_y10_q0 (ff_hevc_rpi_shader + 2158)
-+#define mc_exit_c10_qn (ff_hevc_rpi_shader + 2178)
-+#define mc_exit_y10_qn (ff_hevc_rpi_shader + 2178)
-+#define mc_setup_y10_q0 (ff_hevc_rpi_shader + 2196)
-+#define mc_setup_y10_qn (ff_hevc_rpi_shader + 2198)
-+#define mc_filter_y10_pxx (ff_hevc_rpi_shader + 2440)
-+#define mc_filter_y10_p00 (ff_hevc_rpi_shader + 2566)
-+#define mc_filter_y10_bxx (ff_hevc_rpi_shader + 2654)
-+#define mc_filter_y10_b00 (ff_hevc_rpi_shader + 2786)
-+#define mc_end (ff_hevc_rpi_shader + 2860)
-+
-+#endif
---- /dev/null
-+++ b/libavcodec/rpi_hevc_shader.qasm
-@@ -0,0 +1,1850 @@
-+# Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
-+# All rights reserved.
-+#
-+# Redistribution and use in source and binary forms, with or without
-+# modification, are permitted provided that the following conditions are met:
-+#     * Redistributions of source code must retain the above copyright
-+#       notice, this list of conditions and the following disclaimer.
-+#     * Redistributions in binary form must reproduce the above copyright
-+#       notice, this list of conditions and the following disclaimer in the
-+#       documentation and/or other materials provided with the distribution.
-+#     * Neither the name of the copyright holder nor the
-+#       names of its contributors may be used to endorse or promote products
-+#       derived from this software without specific prior written permission.
-+#
-+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
-+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+#
-+# Written by Peter de Rivaz, John Cox
-+
-+
-+
-+# Inter pred asm
-+#
-+# Logic here should be good to 14 bits without modification
-+# but only 8 & 10 are currently instantiated & tested
-+# 15 & 16 bits have different shift1, shift2 calc & I also suspect overflow
-+# in _p00 & _b00
-+
-+# The @ "mul_used", 0 annotations that occur by various mul blocks suppress
-+# the warning that we are using rotation & ra/rb registers. r0..3 can be
-+# rotated through all 16 elems ra regs can only be rotated through their
-+# local 4.  As it happens this is what is wanted here as we do not want the
-+# constants from the other half of the calc.
-+
-+# Number limits in P/B calculation
-+#
-+# In order to avoid issues with mul24 being an unsigned 24->32 bit multiplier
-+# we offset our intermediates s.t. they always end up +ve before the next
-+# multiply (may be -ve whilst summing but that doesn't matter).
-+#
-+# Range calc for up to 14 bits (Y-B pred):
-+#
-+# denom: [0, 7]
-+# bmax = (1 << bits) - 1
-+# off: [-(1 << (bits-1)), (1 << (bits-1)) - 1]
-+#
-+# wt_mul: [-128, 255]
-+# wt_off = off * 2 + 1: [-bmax, bmax]
-+#
-+# pel: [0, bmax]
-+# H-filter: [(-22*pel + 88*pel) >> (bits-8) + 0x4000] = [0x2a00, 0x97ff]
-+# V-filter: [(-22*hf + 88*hf) >> 6] = [0x580, 0xc28e]
-+# mul_t = (V_L0 + V_l1) * (wt_mul + 128): [0, 0x24624e6]
-+# mul_t - (V_l0 + V_l1)* 128: [-0xc28e00, 0x18396e4]
-+# adj_wt_off = (wt_off << ((denom + 6) - (bits - 8))) - 0x4000 * (wt_mul * 2):
-+#  [wt_off << (21 - bits)] - [wt_mul << 15] = [-0x1fffff, 0x1fffff] - [-0x400000, 0x7f8000]
-+#
-+# This all looks good and is mostly bit depth independent - and as we manage
-+# to do unsigned multiplies everywhere (now) this should be good for any bit
-+# depth up to 14 (we could probably do 16 - but that requires a few tweaks
-+# to the shifts we don't currently have logic for)
-+
-+# PREREAD is the number of requests that we have sitting in the TMU request
-+# queue.
-+#
-+# There are 8 slots available in the TMU request Q for tm0s requests, but
-+# only 4 output FIFO entries and overflow is bad (corruption or crash)
-+# (If threaded then only 2 out FIFO entries, but we aren't.)
-+# In s/w we are effectively limited to the min vertical read which is >= 4
-+# so output FIFO is the limit.
-+#
-+# As the test for read-next is in the main part of the Luma loop (rather than
-+# the preload FIFO part) we are limited to min_luma_height - 1
-+# Min_luma_height is 4 so we can only have a preload of 3
-+# Beware that min_chroma_height (and width) is 2 so we can't do the same trick
-+# in chroma without abandoning preload pretty much entirely (which would be bad)
-+#
-+# Timing tests vs preload of 4 suggests this doesn't hurt us much
-+# Could have preread 4 for Chroma but when tested it didn't help
-+
-+.set PREREAD,                      3
-+
-+# Offset added (effectively) at the exit of the H FIR filter
-+# This is enough to force the result +ve
-+# Is good if it is a power of 2 as that allows for >> without loss
-+#
-+# Worst case for a single Y FIR is *-22 so we need an offset of 256*22
-+# But we need twice offset to survive both H & V = 256*22*2 = 0x2c00
-+# Round up to next power of 2
-+
-+.set FIR_OFFSET,                   0x4000
-+
-+# Block heights - 8 & 16 are the only numbers we currently support
-+
-+.set C_BLK_HEIGHT_8,               16
-+.set C_BLK_HEIGHT_16,              8
-+.set Y_BLK_HEIGHT_8,               16
-+.set Y_BLK_HEIGHT_16,              8
-+
-+# QPU counts - depend on block size
-+# If we have a 2-byte format & block_size > 8 then can only afford
-+# 8 QPUs
-+# These numbers must match the numbers in ff_hevc_rpi_shader_cmd.h
-+
-+.set N_QPU_8,                      12
-+.set N_QPU_16,                     12
-+
-+# Value to add to the weight multiplier to convert it into an unsigned value
-+# Should be power of two for convenience
-+
-+.set LOG2_MUL_ADD,                 14
-+.set MUL_ADD,                      (1 << LOG2_MUL_ADD)
-+
-+# Fixed denom (max that it can be set to)
-+.set DENOM,                        7
-+
-+# register allocation
-+#
-+
-+# ra0-3
-+# Used as temp and may be loop filter coeffs (split into .8s)
-+# or temp in loop. Check usage on an individual basis.
-+
-+# ra4-11
-+# V FIFO / temp / free
-+
-+# -- free --                       ra12
-+
-+# -- free --                       ra13
-+
-+# -- free --                       ra14
-+
-+# -- free --                       ra15
-+
-+# uniform: width:height
-+.set ra_width_height,              ra16
-+.set ra_width,                     ra16.16b
-+.set ra_height,                    ra16.16a
-+
-+# y:y2 same layout as y_y2_next so we can update both together
-+.set ra_y_y2,                      ra17
-+.set ra_y2,                        ra17.16a
-+.set ra_y,                         ra17.16b
-+
-+# uniform: L1 weight (U on left, V on right)
-+# Only used in Y B
-+.set ra_wt_off_mul_l1,             ra18
-+.set ra_wt_off_l1,                 ra18.16b
-+.set ra_wt_mul_l1,                 ra18.16a
-+
-+# y_next:y2_next same layout as y_y2 so we can update both together
-+.set ra_y_y2_next,                 ra19
-+.set ra_y_next,                    ra19.16b
-+.set ra_y2_next,                   ra19.16a
-+
-+# Setup: consts - subdivide a single register
-+.set ra_kff800100,                 ra20
-+.set ra_k256,                      ra20.16a
-+.set ra_k0,                        ra20.8a
-+.set ra_k1,                        ra20.8b
-+.set ra_k128,                      ra20.8c
-+.set ra_k255,                      ra20.8d
-+
-+# Loop: xshifts
-+.set ra_xshift,                    ra21.16a
-+.set ra_xshift_next,               ra21.16b
-+
-+# Loop var: L0 weight (U on left, V on right)
-+# _off_ is not used in loop as we want to modify it before use
-+.set ra_wt_off_mul_l0,             ra22
-+.set ra_wt_mul_l0,                 ra22.16a
-+.set ra_wt_off_l0,                 ra22.16b
-+
-+# Max pel value (for 8 bit we can get away with sat ops but not 9+)
-+# * Could merge with rb_pmask. For 10 bit Logically pmask needs 0xff in the
-+#   2nd byte   but as the source should never be > 3 there 0x3ff should do
-+.set ra_blk_height_pmax,           ra23
-+.set ra_pmax,                      ra23.16a
-+.set ra_blk_height,                ra23.8c
-+# --free --                        ra23.8d
-+
-+# Loop:  src frame base (L0)
-+.set ra_base,                      ra24
-+
-+# Misc  offsets
-+.set ra_fir_off_val_wt_den_p7,     ra25
-+.set ra_wt_den_p7,                 ra25.8a
-+# -- free --                       ra25.8b
-+.set ra_fir_off_val,               ra25.16b
-+
-+# As it happens these constants are the same
-+.if FIR_OFFSET == MUL_ADD
-+# Weight multiplier unsigned add
-+.set ra_kmul_add,                  ra_fir_off_val
-+.else
-+.error "FIR_OFFSET != MUL_ADD: Need new register & init"
-+.endif
-+
-+# Loop: next src frame base (L0)
-+.set ra_base_next,                 ra26
-+
-+# Loop: height<<23 + width<<16 + vdw_setup_0
-+.set ra_dma0,                      ra27
-+
-+# Loop: destination address
-+.set ra_dest,                      ra28
-+
-+# Setup: Dup of rb_ef
-+# Lo bits are used as Y coeff 0 as that lets us combine test & coeff mul
-+# (top bits are ignored by mul24)
-+.set ra_ef,                        ra29
-+
-+# Use an even numbered register as a link register to avoid corrupting flags
-+.set ra_link,                      ra30
-+
-+# -- free --                       ra31
-+
-+.set rb_xshift2,                   rb0
-+.set rb_xshift2_next,              rb1
-+
-+# C:  (elem & 1) == 0 ? elem * 2 : (elem + 4) * 2
-+.set rb_elem_x,                    rb2
-+
-+# El Flags
-+# After adding to self we have el even/odd on nc/c and lo/hi on nn/n
-+# Duped into ra_ef as sometimes that is easier to use
-+.set rb_ef,                        rb3
-+
-+# rb4-11
-+# Loop: V filter FIFO or V filter coeff
-+
-+# Loop var: offset to add before shift (round + weighting offsets)
-+# Exact value varies by loop
-+.set rb_wt_off,                    rb12
-+
-+# -- free --                       rb13
-+
-+# -- free --                       rb14
-+
-+# Loop: src frame base (L1)
-+.set rb_base2,                     rb15
-+
-+# Line pitch (128 for sand128)
-+.set rb_pitch,                     rb16
-+
-+# Loop count - 2 (set up TMU for next xfer)
-+.set rb_i_tmu,                     rb17
-+
-+# Loop count for min(height, 16)
-+# Y will reset & loop again if height > 16
-+.set rb_lcount,                    rb18
-+
-+# frame_base2_next
-+.set rb_base2_next,                rb19
-+
-+# Setup: Height of Y+C in sand, (x&mask)*xpitch will give
-+# offset to the slice
-+.set rb_xpitch,                    rb20
-+
-+# These 3 consts each save 1 instruction in Y loop setup
-+# so whilst they are worthwhile they should be the 1st to die if we need
-+# another b reg
-+.set rb_y_coeffs_2,                rb21                         # 0x050b0a00
-+.set rb_y_coeffs_3,                rb22                         # 0x11283a40
-+.set rb_y_coeffs_5,                rb23                         # 0x0a0b0500
-+
-+# Setup: 0xff (8-bit) / 0xffff (9+ bit)
-+.set rb_pmask,                     rb24
-+
-+# vdw_setup_1(dst_pitch)
-+.set rb_dma1_base,                 rb25
-+
-+# Setup: pic width - 1
-+# In bytes so 8 bit luma is (width - 1)*1, 16 bit chroma is (width -1)*4 etc.
-+.set rb_max_x,                     rb26
-+
-+# vdw_setup_0 (depends on QPU number)
-+.set rb_dma0_base,                 rb27
-+
-+# Setup: vw_setup value to reset VPM write pointer
-+.set rb_vpm_init,                  rb28
-+
-+# Loop: vdw_setup_1(dst_pitch-width) = stride
-+.set rb_dma1,                      rb29
-+
-+# Setup: pic_height - 1
-+.set rb_max_y,                     rb30
-+
-+# Setup: FIR H offset
-+.set rb_fir_off_h,                 rb31
-+
-+
-+# With shifts only the bottom 5 bits are considered so -16=16, -15=17 etc.
-+.set i_shift16,                    -16
-+.set i_shift21,                    -11
-+.set i_shift23,                     -9
-+.set i_shift30,                     -2
-+
-+# Much of the setup code is common between Y & C and is expressed as macros.
-+# Macro bodies can't be overlapped with other code, so they are probably
-+# unsuitable for loop code.  m_calc_dma_regs sets r_vpm / r_dma; uses r0-r2.
-+
-+.macro m_calc_dma_regs, v_bit_depth, v_blk_height, r_vpm, r_dma
-+  mov r2, qpu_num                      # both results depend on which QPU we are
-+.if v_bit_depth <= 8
-+  # 8 bit version
-+  asr r1, r2, 2                        # r1 = qpu_num >> 2
-+  shl r1, r1, 6                        # r1 = (qpu_num >> 2) << 6
-+  and r0, r2, 3                        # r0 = qpu_num & 3
-+  or  r0, r0, r1                       # r0 = per-QPU offset added to both setup words
-+
-+  mov r1, vpm_setup(0, 4, h8p(0, 0))   # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
-+  add r_vpm, r0, r1  # VPM 8bit storage
-+
-+  mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
-+  shl r0, r0, 5                        # reposition offset for vdw_setup_0 (cf. the 16 bit note below)
-+
-+.else
-+  # 16 bit version
-+  # Limited to 8 QPUs if blk height > 8
-+  asr r1, r2, 1                        # r1 = qpu_num >> 1
-+.if v_blk_height <= 8
-+  shl r1, r1, 4
-+.else
-+  shl r1, r1, 5
-+.endif
-+  and r0, r2, 1                        # r0 = qpu_num & 1
-+  or  r0, r0, r1                       # r0 = per-QPU offset added to both setup words
-+
-+  mov r1, vpm_setup(0, 2, h16p(0, 0))   # 2 is stride - stride acts on ADDR
-+  add r_vpm, r0, r1
-+
-+  # X = H * 8 so the YH from VPMVCD_WR_SETUP[ADDR] drops into
-+  # XY VPMVCD_WR_SETUP[VPMBASE] if shifted left 3 (+ 3 for pos of field in reg)
-+  mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0))    # height,width added later
-+  shl r0, r0, 6
-+.endif
-+  add r_dma, r0, r1  # DMA out
-+.endm
-+
-+
-+.macro m_setup_q0
-+  srel -, 12                           # single extra instruction for the _q0 setup entry; presumably releases semaphore 12 - TODO confirm
-+.endm
-+
-+# Code start label
-+::mc_start
-+
-+################################################################################
-+# mc_setup_c
-+#
-+# typedef struct qpu_mc_pred_c_s_s {
-+#     int16_t y;
-+#     int16_t x;
-+#     uint32_t base;
-+#     uint32_t pic_cw;            // C Width (== Y width / 2)
-+#     uint32_t pic_ch;            // C Height (== Y Height / 2)
-+#     uint32_t stride2;
-+#     uint32_t stride1;
-+#     uint32_t wdenom;
-+#     int16_t y2;
-+#     int16_t x2;
-+#     uint32_t base2;
-+#     uint32_t next_fn;
-+# } qpu_mc_pred_c_s_t;
-+
-+.macro m_setup_c, v_bit_depth
-+# Chroma setup: load constants, clip x, set VPM/DMA bases, queue PREREAD rows on t0s and t1s, branch to link
-+# Cannot use mul24 on x as x might be -ve, so must use shift
-+.if v_bit_depth <= 8
-+.set v_x_shift,         1
-+.set v_pmask,           0xff
-+.set v_blk_height,      C_BLK_HEIGHT_8
-+.else
-+.set v_x_shift,         2
-+.set v_pmask,           0xffff
-+.set v_blk_height,      C_BLK_HEIGHT_16
-+.endif
-+
-+  mov tmurs, 1                  ; mov ra0, unif                 # No TMU swap ; x_y
-+
-+  mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
-+  shl rb_ef, r0, i_shift30      ; mov ra_base, unif             # ; ref_c_base
-+
-+# Read image dimensions
-+  sub r0, unif, 1                                               # pic c width
-+  shl rb_max_x, r0, v_x_shift                                   # rb_max_x in bytes
-+  sub rb_max_y, unif, 1                                         # pic c height
-+
-+# load constants
-+  mov ra_kff800100, 0xff800100
-+  mov rb_pmask, v_pmask
-+  mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
-+  mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8))
-+  mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth)
-+
-+# get source pitch
-+  mov ra_ef, rb_ef              ; mov rb_xpitch, unif           # ; stride2
-+  mov rb_pitch, unif                                            # stride1
-+  mov r1, vdw_setup_1(0)                                        # [rb_pitch delay] Merged with dst_stride shortly
-+  add rb_dma1_base, r1, rb_pitch                                # vdw_setup_1
-+
-+  and r0, 1, elem_num                                           # r0 = elem_num & 1
-+  nop                           ; mul24 r0, r0, 5               # r0 = 5 * (elem_num & 1)
-+.if v_bit_depth <= 8
-+  add rb_elem_x, r0, elem_num                                   # rb_elem_x = elem_num + 5 * (elem_num & 1)
-+.else
-+  add r0, r0, elem_num
-+  add rb_elem_x, r0, r0                                         # 9+ bit: twice the 8 bit value
-+.endif
-+
-+# Compute base address for first and second access
-+# ra_base ends up with t0s base
-+# rb_base2 ends up with t1s base
-+
-+  shl r0, ra0.16b, v_x_shift                                    # [rb_elem_x delay]
-+  add r0, r0, rb_elem_x                                         # Add elem no to x to get X for this slice
-+  max r0, r0, 0                 ; mov ra_y, ra0.16a             # ; stash Y
-+  min r0, r0, rb_max_x
-+
-+# Get shift
-+# Shift will always calculate as 0 for 9+ bit
-+# Ideally we can optimize the shift out of the code in these cases but for now
-+# it is tidier to leave it in
-+.if v_bit_depth <= 8
-+  shl ra_xshift_next, r0, 3
-+.else
-+  mov ra_xshift_next, 0         ; mov rb_xshift2_next, 0
-+.endif
-+
-+# In a single 32 bit word we get 1 or 2 UV pairs so mask bottom bits of xs if we need to
-+
-+.if v_bit_depth <= 8
-+  and r0, r0, -4
-+.endif
-+  sub r1, ra_k0, rb_pitch                                       # r1 = -rb_pitch (pitch mask)
-+  and r1, r0, r1                                                # r1 = x & mask
-+  xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch       # r0 = x & ~mask ; r1 = (x & mask) * xpitch
-+  add r0, r0, r1                ; mov ra0, unif                 # ; next_x2_y2
-+  add ra_base, ra_base, r0
-+
-+# Compute part of VPM to use for DMA output
-+# * We only get 8 QPUs if 16 bit - maybe reduce height and auto-loop?
-+  m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base
-+
-+# And again for L1, but only worrying about frame2 stuff
-+
-+# Compute base address for first and second access
-+# ra_base ends up with t0s base
-+# rb_base2 ends up with t1s base
-+
-+  shl r0, ra0.16b, v_x_shift
-+  add r0, r0, rb_elem_x         ; mov ra_y2, ra0.16a            # Add QPU slice offset
-+  max r0, r0, 0                 ; mov rb_base2, unif            # ref_c_base2
-+  min r0, r0, rb_max_x
-+
-+# Get shift (already zero if 9+ bit so ignore)
-+.if v_bit_depth <= 8
-+  shl rb_xshift2_next, r0, 3
-+.endif
-+
-+# In a single 32 bit word we get 2 UV pairs so mask bottom 2 bits of xs
-+
-+.if v_bit_depth <= 8
-+  and r0, r0, -4
-+.endif
-+  sub r1, ra_k0, rb_pitch
-+  and r1, r0, r1                ; mov r3, PREREAD
-+  xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
-+  add r0, r0, r1                ; mov r2, ra_y2
-+  add rb_base2, rb_base2, r0    ; mov r0, ra_y
-+
-+# Do preloads
-+# r0 = ra_y, r2 = ra_y2, r3 = PREREAD
-+
-+:1
-+  sub.setf r3, r3, 1                                            # count down the PREREAD requests
-+  max r1, r0, 0
-+  min r1, r1, rb_max_y                                          # r1 = y clipped to [0, rb_max_y]
-+  add r0, r0, ra_k1             ; mul24 r1, r1, rb_pitch
-+  add t0s, ra_base, r1          ; mov ra_y, r0                  # request row on t0s ; ra_y = y + 1
-+
-+  max r1, r2, 0
-+  brr.anynz -, r:1b
-+  min r1, r1, rb_max_y
-+  add r2, r2, ra_k1             ; mul24 r1, r1, rb_pitch
-+  add t1s, rb_base2, r1         ; mov ra_y2, r2                 # request row on t1s ; ra_y2 = y2 + 1
-+# >>> .anynz 1b
-+
-+  mov ra_link, unif                                             # link
-+# touch registers to keep simulator happy (and fills in delay slots)
-+  mov ra4, 0                    ; mov rb4, 0
-+  bra -, ra_link
-+  mov ra5, 0                    ; mov rb5, 0
-+  mov ra6, 0                    ; mov rb6, 0
-+  mov ra7, 0                    ; mov rb7, 0
-+# >>> ra_link
-+.endm
-+
-+::mc_setup_c_q0
-+  m_setup_q0
-+::mc_setup_c_qn
-+  m_setup_c 8
-+
-+################################################################################
-+#
-+# mc_filter_c_p
-+#
-+# typedef struct qpu_mc_pred_c_p_s {
-+#     int16_t y;
-+#     int16_t x;
-+#     uint32_t base;
-+#     uint16_t h;
-+#     uint16_t w;
-+#     uint32_t coeffs_x;
-+#     uint32_t coeffs_y;
-+#     uint32_t wo_u;
-+#     uint32_t wo_v;
-+#     uint32_t dst_addr_c;
-+#     uint32_t next_fn;
-+# } qpu_mc_pred_c_p_t;
-+
-+.macro m_filter_c_p, v_tmu, v_bit_depth
-+# Chroma P pred (one reference, fetched via TMU v_tmu): H FIR, V FIR, weight, clip, VPM write, DMA out
-+.if v_bit_depth <= 8
-+.set v_x_shift,         1
-+.set v_x_mul,           2
-+.set v_v_shift,         8
-+# Shifts to get width & height in the right place in rb_dma0
-+.set v_dma_h_shift,     7
-+.set v_dma_wh_shift,    i_shift16
-+.else
-+.set v_x_shift,         2
-+.set v_x_mul,           4
-+.set v_v_shift,         i_shift16
-+# Shifts to get width & height in the right place in rb_dma0
-+.set v_dma_h_shift,     8
-+.set v_dma_wh_shift,    15
-+.endif
-+
-+.if v_tmu == 0
-+.set vrx_xshift,        rb_xshift2              # b side more convenient
-+.set vrx_xshift_next,   ra_xshift_next
-+.set vra_y_next,        ra_y_next
-+.set vrx_base_next,     ra_base_next
-+.set vra_y,             ra_y
-+.set vra_base,          ra_base
-+.set vr_txs,            t0s
-+.else
-+.set vrx_xshift,        ra_xshift               # a side more convenient
-+.set vrx_xshift_next,   rb_xshift2_next
-+.set vra_y_next,        ra_y2_next
-+.set vrx_base_next,     rb_base2_next
-+.set vra_y,             ra_y2
-+.set vra_base,          rb_base2
-+.set vr_txs,            t1s
-+.endif
-+
-+# denom shift values
-+.set i_wt_den_p5,                  (DENOM + 13 - v_bit_depth)
-+.set i_wt_den_p6,                  (DENOM + 14 - v_bit_depth)
-+
-+# per-channel shifts were calculated on the *previous* invocation
-+# get base addresses and per-channel shifts for *next* invocation
-+  mov vw_setup, rb_vpm_init     ; mov ra2, unif                 # ; x_y
-+
-+  add.setf -, rb_ef, rb_ef      ; mov r3, unif                  # [ra2 delay] ; base
-+
-+  shl r0, ra2.16b, v_x_shift    ; v8subs r5rep, r0, r0          # r5 = 0
-+  add r0, r0, rb_elem_x         ; mov ra_width_height, unif     # ; width_height
-+  sub r1, r5, rb_pitch          ; mov ra0, unif                 # r1=pitch2 mask ; H filter coeffs
-+  max r0, r0, r5                ; mov vrx_xshift, vrx_xshift_next
-+  min r0, r0, rb_max_x          ; mov vra_y_next, ra2.16a
-+
-+.if v_bit_depth <= 8
-+  shl vrx_xshift_next, r0, 3
-+  and r0, r0, -4
-+.endif
-+  and r1, r0, r1                ; mul24 r2, ra_width, v_x_mul   # r2=w*v_x_mul (we are working in pel pairs)  ** x*2 already calced!
-+  xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
-+  add r0, r0, r1                ; mov ra3, unif                 # ; V filter coeffs
-+  add vrx_base_next, r3, r0     ; mov r1, ra_height
-+
-+# set up VPM write
-+  sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif    # Compute vdw_setup1(dst_pitch-width) ; U offset/weight
-+  add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
-+  add rb_lcount, r1, (3-4)      ; mov.ifc ra_wt_off_mul_l0, unif # ; V offset/weight
-+
-+# Misc final setup...
-+
-+  shl r0, r1, v_dma_h_shift     ; mov ra_dest, unif             # ; dst_addr
-+  add r0, r0, r2                ; mov r2, ra_fir_off_val        # Combine width and height of destination area (r0=h<<v_dma_h_shift, r2=w*v_x_mul)
-+  shl r0, r0, v_dma_wh_shift    ; mov rb10, ra3.8c              # Shift into bits 16 upwards of the vdw_setup0 register
-+  add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0          # ; r1=weight
-+  shl r1, r1, i_wt_den_p5       ; mul24 r0, r2, ra_wt_mul_l0
-+  sub rb_wt_off, r1, r0         ; mov r0, ra_kmul_add
-+  add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4            # ; loop counter (V FIFO fill = 4)
-+  mov rb11, ra3.8d              ; mov ra_link, unif             # ; Link
-+
-+# r5           = -4                     (loop counter)
-+# ra_wt_mul_l0 = weight L0 + ra_kmul_add (MUL_ADD; now unsigned)
-+# rb_wt_off    = (ra_wt_off_l0 << i_wt_den_p5) - ra_fir_off_val * weight L0 (as read, before the add)
-+# rb_fir_off_h = FIR value offset (rb31)
-+
-+# FIFO: rb4, ra5, rb6, ra7
-+# Coeffs in ra3.8a, ra3.8b, rb10, rb11
-+
-+# We want (r0r1)
-+# U0U3 : V0V3 : U1U4 : V1V4 : U2U5 : V2V5 : ...
-+# We fetch (after shift)
-+#  C0  :  C3  :  C1  :  C4  :  C2  :  C5  : ...
-+
-+:1
-+# retrieve texture results and pick out bytes
-+# then submit two more texture requests
-+
-+.if v_tmu == 0
-+  sub.setf -, r5, rb_i_tmu      ; mov rb4, ra5                  ; ldtmu0
-+  shr r2, r4, vrx_xshift        ; mov.ifz  r3, vra_y_next
-+  shr r1, r2, v_v_shift         ; mov.ifnz r3, vra_y
-+  add.setf -, rb_ef, rb_ef      ; mov.ifz  vra_base, vrx_base_next
-+.else
-+  sub.setf -, r5, rb_i_tmu      ; mov rb4, ra5                  ; ldtmu1
-+  shr r2, r4, vrx_xshift        ; mov.ifz  vra_base, vrx_base_next
-+  shr r1, r2, v_v_shift         ; mov.ifnz r3, vra_y
-+  add.setf -, rb_ef, rb_ef      ; mov.ifz  r3, vra_y_next       # [r1 << delay]
-+.endif
-+
-+  add vra_y, r3, ra_k1          ; mov      r0, r1 << 15
-+  max r3, r3, ra_k0             ; mov.ifnc r1, r2 << 1
-+  min r3, r3, rb_max_y          ; mov.ifnc r0, r2
-+
-+  and r1, r1, ra_pmax           ; mul24 r3, r3, rb_pitch
-+.if v_tmu == 0
-+  add vr_txs, vra_base, r3      ; v8min r0, r0, rb_pmask        # ; mask bytes
-+.else
-+  add vr_txs, vra_base, r3      ; v8min r0, r0, ra_pmax         # ; mask bytes
-+.endif
-+
-+# apply horizontal filter
-+# The filter coeffs for the two halves of this are the same (unlike in the
-+# Y case) so it doesn't matter which ra0 we get them from
-+# Also as the two halves are locked together we don't need to separate the 1st
-+# r0 mul or the last r1 mul as they are valid for all QPUs
-+
-+  add r5rep, r5, 1              ; mul24      r3, ra0.8a,       r0
-+  sub r2, rb_fir_off_h, r3      ; mul24      r3, ra0.8d,       r1
-+  sub r2, r2, r3                ; mul24      r3, ra0.8b << 2,  r0 << 2  @ "mul_used", 0
-+  nop                           ; mul24.ifn  r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
-+  add r2, r2, r3                ; mul24      r3, ra0.8c << 4,  r0 << 4  @ "mul_used", 0
-+  add.setf -, r5, r5            ; mul24.ifn  r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
-+
-+# V filter = - r4 * a + r5 * b + r6 * c - r7 * d (post FIFO shift)
-+# We would like to save the r5->r4 shift but we need a delay slot
-+# for both r7 & r6 which we can't find anything to put in if we have
-+# already multiplied r4 & r5!
-+  brr.anyn -, r:1b
-+  add r2, r2, r3                ; mul24 r0, ra7, rb10           # r6 post
-+  mov ra5, rb6                  ; mul24 r1, rb6, ra3.8b         # r5 post
-+  asr ra7, r2, v_bit_depth - 8  ; mov rb6, ra7
-+# >>> .anyn 1b
-+
-+  add r1, r1, r0                ; mul24 r0, rb4, ra3.8a         # [ra7 delay]
-+  sub r1, r1, r0                ; mul24 r0, ra7, rb11
-+  sub r1, r1, r0
-+
-+  asr r1, r1, 6                 ; mov r3, ra_blk_height         # ; NxtLoop
-+  sub.setf -, r5, rb_lcount     ; mul24 r0, r1, ra_wt_mul_l0
-+  add r0, r0, rb_wt_off         ; mul24 r1, r1, ra_kmul_add
-+  sub r1, r0, r1                ; v8subs r0, ra_height, r3      # ; NxtLoop
-+  brr.anyn -, r:1b
-+  asr r1, r1, i_wt_den_p6
-+  min r1, r1, ra_pmax           ; mov -, vw_wait
-+  max vpm, r1, ra_k0            ; mul24 r2, r3, rb_pitch        # ; NxtLoop
-+# >>> .anyn 1b
-+
-+# r0 = remaining height (min 0)
-+# r2 = r3 * rb_pitch
-+# r3 = block_height
-+
-+# If looping again then we consumed ra_blk_height rows last loop
-+# rb_dma1 (stride) remains constant
-+# rb_i_tmu remains const (based on total height)
-+# recalc ra_dma0, rb_lcount based on new segment height
-+
-+  mov.setf ra_height, r0        ; mov vw_setup, ra_dma0         # VDW setup 0
-+
-+# DMA out
-+  bra.anyz -, ra_link
-+  min r0, r0, r3                ; mov vw_setup, rb_dma1         # Stride
-+  sub r1, r0, r3                ; mov vw_addr, ra_dest          # start the VDW
-+  shl r1, r1, i_shift23
-+# >>> .anyz ra_link
-+
-+# Here r1 = cur_blk_height - ra_blk_height (r3) so it will be 0 or -ve
-+# We add to dma0 to reduce the number of output lines in the final block
-+  brr -, r:1b
-+  add rb_lcount, rb_lcount, r0
-+  add ra_dma0, ra_dma0, r1
-+  add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init     # ; Reset our VPM write pointer
-+# >>> 1b
-+.endm
-+
-+::mc_filter_c_p
-+  m_filter_c_p 0, 8
-+
-+::mc_filter_c_p_l1
-+  m_filter_c_p 1, 8
-+
-+################################################################################
-+#
-+# mc_filter_c_b
-+#
-+# typedef struct qpu_mc_pred_c_b_s {
-+#     int16_t y;
-+#     int16_t x;
-+#     uint32_t base;
-+#     uint16_t h;
-+#     uint16_t w;
-+#     uint32_t coeffs_x1;
-+#     uint32_t coeffs_y1;
-+#     int16_t weight_u1;
-+#     int16_t weight_v1;
-+#     int16_t y2;
-+#     int16_t x2;
-+#     uint32_t base2;
-+#     uint32_t coeffs_x2;
-+#     uint32_t coeffs_y2;
-+#     uint32_t wo_u2;
-+#     uint32_t wo_v2;
-+#     uint32_t dst_addr_c;
-+#     uint32_t next_fn;
-+# } qpu_mc_pred_c_b_t;
-+
-+.macro m_filter_c_b, v_bit_depth
-+
-+.if v_bit_depth <= 8
-+.set v_x_shift,         1
-+.set v_v_shift,         8
-+# Shifts to get width & height in the right place in ra_dma0
-+.set v_dma_h_shift,     7
-+.set v_dma_wh_shift,    i_shift16
-+.else
-+.set v_x_shift,         2
-+.set v_v_shift,         i_shift16
-+# Shifts to get width & height in the right place in ra_dma0
-+.set v_dma_h_shift,     8
-+.set v_dma_wh_shift,    15
-+.endif
-+.set v_x_mul,           (1 << v_x_shift)
-+
-+# denom shift values
-+.set i_wt_den_p5,                  (DENOM + 13 - v_bit_depth)
-+.set i_wt_den_p6,                  (DENOM + 14 - v_bit_depth)
-+
-+# per-channel shifts were calculated on the *previous* invocation
-+
-+# get base addresses and per-channel shifts for *next* invocation
-+  mov vw_setup, rb_vpm_init     ; mov ra2, unif                 # ; x_y
-+
-+  add.setf -, rb_ef, rb_ef      ; mov r3, unif                  # [ra2 delay] ; r3=base
-+
-+  shl r0, ra2.16b, v_x_shift    ; v8subs r5rep, r1, r1          # x ; r5=0
-+  add r0, r0, rb_elem_x         ; mov ra_y_next, ra2.16a
-+  sub r1, r5, rb_pitch          ; mov ra_width_height, unif     # r1=pitch2 mask ; width_height
-+  max r0, r0, r5                ; mov ra_xshift, ra_xshift_next
-+  min r0, r0, rb_max_x          ; mov ra0, unif                 # ; L0 H filter coeffs
-+
-+.if v_bit_depth <= 8
-+  shl ra_xshift_next, r0, 3
-+.endif
-+
-+  and r0, r0, -4                ; mov ra2, unif                 # ; L0 V filter coeffs
-+  and r1, r0, r1                ; mul24 r2, ra_width, v_x_mul   # r2=x*2 (we are working in pel pairs)
-+  xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
-+  add r0, r0, r1                ; mov r1, ra_height             # Add stripe offsets ; r1=height
-+  add ra_base_next, r3, r0      ; mov rb_xshift2, rb_xshift2_next # ; xshift2 used because B
-+
-+# set up VPM write
-+
-+  sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif    # Compute vdw_setup1(dst_pitch-width) ; U weight
-+  add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
-+  add rb_lcount, r1, (3-4)      ; mov.ifc ra_wt_mul_l0, ra_wt_off_l0 # ; V weight
-+
-+  shl r0, r1, v_dma_h_shift     ; mov ra3, unif                 # ; x2_y2
-+  add r0, r0, r2                ; mov r3, unif                  # [ra3 delay] ; base
-+  shl r0, r0, v_dma_wh_shift    ; mov ra_y2_next, ra3.16a       # Shift into bits 16 upwards of the vdw_setup0 register
-+  add ra_dma0, r0, rb_dma0_base ; mov r0, ra3.16b               # r0=x
-+
-+# L1 - uniform layout could possibly be optimized
-+
-+  shl r0, r0, v_x_shift         ; mov ra1, unif                 # r0=x<<shift ; L1 H filter coeffs
-+  add r0, r0, rb_elem_x         ; mov ra3, unif                 # ; L1 V filter coeffs
-+  sub r1, r5, rb_pitch          ; mov ra_wt_off_mul_l1, unif    # [ra3 delay] r1=pitch2 mask ; U offset/weight
-+  max r0, r0, r5                ; mov ra9, rb_max_y
-+  min r0, r0, rb_max_x          ; mov r2, ra_kmul_add
-+
-+.if v_bit_depth <= 8
-+  shl rb_xshift2_next, r0, 3
-+.endif
-+
-+  and r0, r0, -4                ; mov.ifc ra_wt_off_mul_l1, unif # ; V offset/weight
-+  and r1, r0, r1                ; mov r5rep, -4
-+  xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
-+  add r0, r0, r1                ; mov ra_dest, unif             #  Add stripe offsets ; dst_addr
-+  add rb_base2_next, r3, r0     ; mov r0, ra_fir_off_val
-+
-+  add ra_wt_mul_l0, ra_wt_mul_l0, r2 ; mul24 r1, r0, ra_wt_mul_l0
-+  add ra_wt_mul_l1, ra_wt_mul_l1, r2 ; mul24 r0, r0, ra_wt_mul_l1
-+  add r0, r0, r1                ; mov r1, ra_wt_off_l1          # ; L0 off unset
-+  shl r1, r1, i_wt_den_p6       ; mov rb11, ra3.8d
-+  sub rb_wt_off, r1, r0         ; mov ra_link, unif             # ; link
-+
-+  mov ra10, rb_xshift2          ; mov rb7,  ra2.8d
-+
-+# r5        loop counter (-4)
-+# ra0       H coeffs L0
-+# ra1       H coeffs L1
-+# ra2       V coeffs L0
-+# ra3       V coeffs L1
-+# ra9       rb_max_y alias
-+# ra10      rb_xshift2 alias
-+
-+:1
-+# retrieve texture results and pick out bytes
-+# then submit two more texture requests
-+  sub.setf -, r5, rb_i_tmu      ; nop                           ; ldtmu0
-+  shr r2, r4, ra_xshift         ; mov.ifz rb_base2, rb_base2_next
-+  shr r1, r2, v_v_shift         ; mov.ifz ra_y_y2, ra_y_y2_next
-+  add.setf -, rb_ef, rb_ef      ; mov.ifz ra_base, ra_base_next # [ra_y delay]
-+  add ra_y, 1, ra_y             ; mov r3, ra_y
-+
-+  max r3, r3, ra_k0             ; mov      r0, r1 << 15
-+  min r3, r3, ra9               ; mov.ifnc r1, r2 << 1
-+
-+  mov.ifnc r0, r2               ; mul24 r3, r3, rb_pitch
-+  add t0s, ra_base, r3          ; v8min r0, r0, rb_pmask        # ; masks bytes
-+
-+# L0 H-filter (-ra4*, +rb5, +rb6, -ra7)
-+
-+  and r1, r1, rb_pmask          ; mul24      r2, ra0.8a,       r0
-+  sub r2, rb_fir_off_h, r2      ; mul24      r3, ra0.8d,       r1
-+  sub r2, r2, r3                ; mul24      r3, ra0.8b << 2,  r0 << 2  @ "mul_used", 0
-+  nop                           ; mul24.ifn  r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
-+  add r2, r2, r3                ; mul24      r3, ra0.8c << 4,  r0 << 4  @ "mul_used", 0
-+  nop                           ; mul24.ifn  r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
-+
-+  add r0, r2, r3                ; mul24 ra4, rb5, ra2.8a        ; ldtmu1
-+
-+  shr r2, r4, ra10              ; mov rb5, rb6
-+  shr r1, r2, v_v_shift         ; mov r3, ra_y2
-+  shr ra7, r0, v_bit_depth - 8  ; mov rb6, ra7                  # [r1 << delay]
-+
-+  add ra_y2, r3, ra_k1          ; mov      r0, r1 << 15
-+  max r3, r3, ra_k0             ; mov.ifnc r1, r2 << 1
-+  min r3, r3, rb_max_y          ; v8min r1, r1, ra_pmax
-+
-+  mov.ifnc r0, r2               ; mul24 r3, r3, rb_pitch
-+  add t1s, rb_base2, r3         ; v8min r0, r0, ra_pmax         # ; masks bytes
-+
-+# L1 H-filter (-r0*, +rb9, +rb10, -ra11)
-+
-+  add r5rep, r5, 1              ; mul24      r2, ra1.8a,       r0
-+  sub r2, rb_fir_off_h, r2      ; mul24      r3, ra1.8d,       r1
-+  sub r2, r2, r3                ; mul24      r3, ra1.8b << 2,  r0 << 2  @ "mul_used", 0
-+  nop                           ; mul24.ifn  r3, ra1.8b << 12, r1 << 12 @ "mul_used", 0
-+  add r2, r3, r2                ; mul24      r3, ra1.8c << 4,  r0 << 4  @ "mul_used", 0
-+  add.setf -, r5, r5            ; mul24.ifn  r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
-+
-+  brr.anyn -, r:1b
-+  add r2, r2, r3                ; mul24 r0, rb9,  ra3.8a
-+  mov rb9, rb10                 ; mul24 r1, rb10, ra3.8b
-+  shr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
-+# >>> .anyn 1b
-+
-+  sub r2, r1, r0                ; mul24 r1, rb5,  ra2.8b        # L1 ; L0
-+  sub.setf -, r5, rb_lcount     ; mov r0, ra4
-+  sub r1, r1, r0                ; mul24 r0, rb6,  ra2.8c
-+  add r1, r1, r0                ; mul24 r0, ra7,  rb7
-+
-+  sub r1, r1, r0                ; mul24 r0, rb10, ra3.8c        # L1
-+  add r2, r2, r0                ; mul24 r0, ra11, rb11          # L1
-+  sub r2, r2, r0
-+
-+  shr r1, r1, 6
-+  shr r2, r2, 6                 ; mul24 r0, r1, ra_wt_mul_l0
-+  add r2, r2, r1                ; mul24 r1, r2, ra_wt_mul_l1
-+  add r1, r1, r0                ; mul24 r2, r2, ra_kmul_add
-+  sub r1, r1, r2                ; mov r3, ra_blk_height         # ; NxtLoop
-+  add r1, r1, rb_wt_off         ; v8subs r0, ra_height, r3      # ; NxtLoop
-+
-+  brr.anyn -, r:1b
-+  asr r1, r1, ra_wt_den_p7
-+  min r1, r1, ra_pmax           ; mov -, vw_wait
-+  max vpm, r1, ra_k0            ; mul24 r2, r3, rb_pitch        # ; NxtLoop
-+# >>> .anyn 1b
-+
-+# r0 = remaining height (min 0)
-+# r2 = r3 * rb_pitch
-+# r3 = block_height
-+
-+# If looping again then we consumed 16 height last loop
-+# rb_dma1 (stride) remains constant
-+# rb_i_tmu remains const (based on total height)
-+# recalc ra_dma0, rb_lcount based on new segment height
-+
-+  mov.setf ra_height, r0        ; mov vw_setup, ra_dma0         # ; VDW setup 0
-+
-+# DMA out
-+  bra.anyz -, ra_link
-+  min r0, r0, r3                ; mov vw_setup, rb_dma1         # ; Stride
-+  sub r1, r0, r3                ; mov vw_addr, ra_dest          # ; start the VDW
-+  shl r1, r1, i_shift23
-+# >>> .anyz ra_link
-+
-+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
-+# We add to dma0 to reduce the number of output lines in the final block
-+  brr -, r:1b
-+  add rb_lcount, rb_lcount, r0
-+  add ra_dma0, ra_dma0, r1
-+  add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init     # ; Reset our VDM write pointer
-+# >>> 1b
-+.endm
-+
-+::mc_filter_c_b
-+  m_filter_c_b 8
-+
-+################################################################################
-+# Exit code used by both Luma & Chroma so place between them to avoid I-cache
-+# conflicts
-+
-+.macro m_exit_drain
-+.if PREREAD == 2
-+# Special case PREREAD == 2: unrolled (two ldtmu0/ldtmu1 pairs) as the loop below is wasteful
-+  nop                   ; nop           ; ldtmu0                # pop a pending TMU0 result (r4 is not read here)
-+  nop                   ; nop           ; ldtmu1                # pop a pending TMU1 result
-+  nop                   ; nop           ; ldtmu0
-+  mov -, vw_wait        ; nop           ; ldtmu1                # read vw_wait (wait for VPM DMA write) ; last TMU1 pop
-+.else
-+  mov.setf r3, PREREAD - 1                                      # r3 = countdown; sets the flags tested by the first brr
-+:1
-+  brr.anynz -, r:1b                                             # tests flags from the previous .setf; next 3 instructions are delay slots
-+  nop                   ; nop           ; ldtmu0
-+  nop                   ; nop           ; ldtmu1
-+  sub.setf r3, r3, 1                                            # flags for the next pass of the branch
-+ # >>> .anynz 1b
-+  mov  -, vw_wait
-+.endif
-+.endm
-+
-+# This sync layout groups QPUs 0-3, 4-7, 8-11 (i.e. 1 group per TMU pair)
-+# All qpus start at the beginning and after that (group - 1) must have finished
-+# before (group) can start
-+#
-+# Requires setup code for QPU 0 to srel sem 12 (m_setup_q0) to start the chain
-+# Exit code will sacq sem 12 so everything is @ 0 on exit (this is important -
-+# lockup otherwise)
-+#
-+# There is some, currently ill-defined, potential lockup if we have the VDM active
-+# whilst doing sem stuff so we wait first. ?? QPU stall from sem stalls VDM pipe too ??
-+#
-+# The code stalled when I had many waiters on a single sem so we have a
-+# "ripple" of srels to restart.  Unsure why, may have been a bug, but this works
-+# and we currently have both the memory & sems to support it.
-+.macro m_sync_q, n_qpu, n_quads
-+# Do not generate code for qpu >= quads * 4 - fns should never be called
-+.if n_qpu < n_quads * 4
-+  mov ra_link, unif     # link ; can only branch to an a reg (not r0)
-+  mov -, vw_wait        # [ra_link delay] ; wait on vw_wait before doing any sem ops
-+
-+.set n_sem_sync, n_qpu - (n_qpu % 4)
-+.set n_sem_in, n_qpu
-+.set n_sem_out, n_qpu + 1
-+
-+.if n_qpu % 4 == 0
-+
-+.set n_sem_quad_in,  12 + n_qpu / 4
-+.set n_sem_quad_out, 12 + (((n_qpu / 4) + 1) % n_quads)
-+
-+  sacq -, n_sem_sync    # First QPU of the quad: collect one srel from each of
-+  sacq -, n_sem_sync    # the other three QPUs in this quad (they srel
-+  sacq -, n_sem_sync    # n_sem_sync in the .else branch below)
-+  bra -, ra_link
-+  sacq -, n_sem_quad_in   # [delay slot] wait for this quad's sem (12 + quad index)
-+  srel -, n_sem_out       # [delay slot] start the ripple: release QPU n_qpu + 1
-+  srel -, n_sem_quad_out  # [delay slot] release the next quad's sem (wraps mod n_quads)
-+
-+.else
-+  bra -, ra_link
-+  srel -, n_sem_sync    # [delay slot] tell the first QPU of our quad that we have arrived
-+  sacq -, n_sem_in      # [delay slot] wait for QPU n_qpu - 1 to release us
-+.if n_sem_out % 4 != 0
-+  srel -, n_sem_out     # [delay slot] pass the release on to QPU n_qpu + 1
-+.else
-+  nop                   # [delay slot] last QPU of the quad: nothing further to release
-+.endif
-+.endif
-+.endif
-+.endm
-+
-+.set v_quads8, N_QPU_8 / 4
-+
-+::mc_sync_q0
-+  m_sync_q 0, v_quads8
-+::mc_sync_q1
-+  m_sync_q 1, v_quads8
-+::mc_sync_q2
-+  m_sync_q 2, v_quads8
-+::mc_sync_q3
-+  m_sync_q 3, v_quads8
-+::mc_sync_q4
-+  m_sync_q 4, v_quads8
-+::mc_sync_q5
-+  m_sync_q 5, v_quads8
-+::mc_sync_q6
-+  m_sync_q 6, v_quads8
-+::mc_sync_q7
-+  m_sync_q 7, v_quads8
-+::mc_sync_q8
-+  m_sync_q 8, v_quads8
-+::mc_sync_q9
-+  m_sync_q 9, v_quads8
-+::mc_sync_q10
-+  m_sync_q 10, v_quads8
-+::mc_sync_q11
-+  m_sync_q 11, v_quads8
-+
-+# mc_exit()
-+# Chroma & Luma the same now
-+
-+.macro m_exit_qn
-+  m_exit_drain
-+  nop                   ; nop           ; thrend                # thread-end signal; two more instructions follow it
-+  nop                                                           # 1st instruction after thrend
-+  nop                                                           # 2nd instruction after thrend
-+# >>> thrend <<<
-+.endm
-+
-+::mc_exit_c_qn
-+::mc_exit_y_qn
-+  m_exit_qn
-+
-+
-+
-+# mc_interrupt_exit12()
-+
-+.macro m_exit_q0
-+  m_exit_drain
-+  sacq -, 12                                                    # take sem 12 back so every sem is 0 on exit
-+  nop                   ; nop           ; thrend                # thread-end signal; two more instructions follow it
-+  mov interrupt, 1                                              # write 1 to the interrupt register (after thrend)
-+  nop
-+# >>> thrend <<<
-+.endm
-+
-+::mc_exit_c_q0
-+::mc_exit_y_q0
-+  m_exit_q0
-+
-+# LUMA CODE
-+
-+# The idea is to form B predictions by doing 8 pixels from ref0 in parallel with 8 pixels from ref1.
-+# For P frames we make the second x,y coordinates offset by +8
-+
-+
-+################################################################################
-+# mc_setup
-+#
-+# typedef struct qpu_mc_pred_y_s_s {
-+#    qpu_mc_src_t next_src1;
-+#    qpu_mc_src_t next_src2;
-+#    uint16_t pic_h;
-+#    uint16_t pic_w;
-+#    uint32_t stride2;
-+#    uint32_t stride1;
-+#    uint32_t wdenom;
-+#    uint32_t next_fn;
-+# } qpu_mc_pred_y_s_t;
-+
-+.macro m_setup_y, v_bit_depth
-+
-+# Cannot use mul24 on x as x might be -ve, so must use shift
-+.if v_bit_depth <= 8
-+.set v_x_shift,         0
-+.set v_pmask,           0xff
-+.set v_blk_height,      Y_BLK_HEIGHT_8
-+.else
-+.set v_x_shift,         1
-+.set v_pmask,           0xffff
-+.set v_blk_height,      Y_BLK_HEIGHT_16
-+.endif
-+
-+
-+  # Need to save these because we need to know the frame dimensions before computing texture coordinates
-+  mov tmurs, 1                  ; mov ra0, unif                 # No TMU swap ; x_y
-+  mov ra9, unif                                                 # ref_y_base
-+  mov ra1, unif                                                 # x2_y2
-+
-+
-+# load constants
-+  mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]                     # per-element flag seed: odd els get bit 1, els 8-15 get bit 0
-+  shl rb_ef, r0, i_shift30      ; mov ra11, unif                # element flags (assuming i_shift30 == 30: seed in top 2 bits) ; ref_y2_base
-+
-+  mov ra_kff800100, 0xff800100
-+  mov rb_pmask, v_pmask
-+  mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
-+  mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8))
-+  mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth)
-+  mov rb_y_coeffs_2, 0x050b0a00
-+  mov rb_y_coeffs_3, 0x11283a40
-+  mov rb_y_coeffs_5, 0x0a0b0500
-+
-+# Compute part of VPM to use
-+
-+# Read image dimensions
-+  mov ra3, unif                                                 # width_height
-+  mov ra_ef, rb_ef              ; mov rb_xpitch, unif           # [ra3 delay] ; stride2
-+.if v_x_shift == 0
-+  sub rb_max_x, ra3.16b, 1
-+.else
-+  sub r0, ra3.16b, 1
-+  shl rb_max_x, r0, v_x_shift
-+.endif
-+  sub rb_max_y, ra3.16a, 1
-+  mov r3, elem_num              ; mov rb_pitch, unif            # stride1
-+
-+# get destination pitch
-+  mov r1, vdw_setup_1(0)                                        # [rb_pitch delay]
-+  or  rb_dma1_base, r1, rb_pitch
-+
-+# Compute base address for first and second access
-+  add r0, ra0.16b, r3                                           # Load x + elem_num
-+.if v_x_shift != 0
-+  shl r0, r0, v_x_shift
-+.endif
-+  max r0, r0, 0
-+  min r0, r0, rb_max_x                                          # x clamped to [0, rb_max_x]
-+  shl ra_xshift_next, r0, 3                                     # Compute shifts
-+
-+# X is byte offset - we can only load words - mask
-+
-+  and r0, r0, -4                ; v8subs r2, r2, r2             # word-align x ; r2 = 0
-+  sub r2, r2, rb_pitch                                          # r2 = -rb_pitch, used as a mask on x below
-+  and r1, r0, r2
-+  xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
-+  add r0, r0, r1                                                # Add stripe offsets
-+  add ra_base, ra9, r0
-+
-+  # r3 still contains elem_num
-+  add r0, ra1.16b, r3                                           # Load x2 + elem_num
-+.if v_x_shift != 0
-+  shl r0, r0, v_x_shift
-+.endif
-+  max r0, r0, 0
-+  min r0, r0, rb_max_x                                          # x2 clamped to [0, rb_max_x]
-+  shl rb_xshift2_next, r0, 3                                    # Compute shifts
-+
-+  # r2 still contains mask
-+  and r0, r0, -4
-+  and r1, r0, r2
-+  xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
-+  add r0, r0, r1                                                # Add stripe offsets
-+  add rb_base2, ra11, r0
-+
-+# Do preloads
-+  nop                           ; mov r0, ra0.16a               # ; r0 = y
-+  mov r3, PREREAD               ; mov r2, ra1.16a               # ; r2 = y2
-+
-+:1
-+  sub.setf r3, r3, 1                                            # PREREAD countdown; flags tested by brr.anynz below
-+  max r1, r0, 0
-+  min r1, r1, rb_max_y                                          # r1 = y clamped to [0, rb_max_y]
-+  add r0, r0, ra_k1             ; mul24 r1, r1, rb_pitch
-+  add t0s, ra_base, r1          ; mov ra_y, r0                  # TMU0 request for this row ; ra_y = y + ra_k1
-+
-+  max r1, r2, 0
-+  brr.anynz -, r:1b
-+  min r1, r1, rb_max_y                                          # r1 = y2 clamped to [0, rb_max_y]
-+  add r2, r2, ra_k1             ; mul24 r1, r1, rb_pitch
-+  add t1s, rb_base2, r1         ; mov ra_y2, r2                 # TMU1 request for this row ; ra_y2 = y2 + ra_k1
-+# >>> .anynz 1b
-+
-+  m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base
-+
-+  mov ra_link, unif                                             # Next fn
-+
-+# touch vertical context to keep simulator happy
-+  mov ra8,  0                   ; mov rb8,  0                   # [ra_link delay]
-+  bra -, ra_link
-+  mov ra9,  0                   ; mov rb9,  0
-+  mov ra10, 0                   ; mov rb10, 0
-+  mov ra11, 0                   ; mov rb11, 0
-+# >>> ra_link
-+.endm
-+
-+::mc_setup_y_q0
-+  m_setup_q0
-+::mc_setup_y_qn
-+  m_setup_y 8
-+
-+################################################################################
-+#
-+# Start of per-block setup code
-+# P and B blocks share the same setup code to save on Icache space
-+
-+# get base addresses and per-channel shifts for *next* invocation
-+# per-channel shifts were calculated on the *previous* invocation
-+
-+# 1st 3 instructions of per_block-setup in branch delay
-+#
-+# typedef struct qpu_mc_pred_y_p_s {
-+#    qpu_mc_src_t next_src1;
-+#    qpu_mc_src_t next_src2;
-+#    uint16_t h;
-+#    uint16_t w;
-+#    uint32_t mymx21;
-+#    uint32_t wo1;
-+#    uint32_t wo2;
-+#    uint32_t dst_addr;
-+#    uint32_t next_fn;
-+# } qpu_mc_pred_y_p_t;
-+#
-+
-+.macro m_luma_setup, v_bit_depth
-+# Hack - QASM may well have label pasting but I have no idea how...
-+.if v_bit_depth == 8
-+  brr ra_link, r:per_block_setup_8                              # branch to the shared setup; it returns via bra -, ra_link
-+.elif v_bit_depth == 10
-+  brr ra_link, r:per_block_setup_10                             # as above, for the 10-bit variant of the shared setup
-+.endif
-+  mov ra0, unif                 ; mov r3, elem_num              # y_x ; elem_num has implicit unpack??
-+  add.setf -, rb_ef, rb_ef      ; v8subs r5rep, r2, r2          # [ra0 delay] ; r5 = 0
-+  add r0, ra0.16b, r3           ; mov rb_xshift2, rb_xshift2_next
-+.endm
-+
-+.macro m_per_block_setup, v_bit_depth
-+
-+.if v_bit_depth <= 8
-+.set v_x_shift,         0
-+.set v_x_mul,           1
-+# Shifts to get width & height in the right place in ra_dma0
-+.set v_dma_h_shift,     7
-+.set v_dma_wh_shift,    i_shift16
-+.else
-+.set v_x_shift,         1
-+.set v_x_mul,           2
-+# Shifts to get width & height in the right place in ra_dma0
-+.set v_dma_h_shift,     8
-+.set v_dma_wh_shift,    15
-+.endif
-+
-+.if v_x_shift != 0
-+  shl r0, r0, v_x_shift
-+.endif
-+  max r0, r0, r5                ; mov ra_xshift, ra_xshift_next
-+  min r0, r0, rb_max_x                                          # x clamped to [r5, rb_max_x] (caller sets r5 = 0)
-+
-+  shl ra_xshift_next, r0, 3                                     # Compute shifts
-+  and r0, r0, -4                                                # word-align x
-+  sub r2, r5, rb_pitch          ; mov ra_base_next, unif        # r2 = mask (r5 - rb_pitch) ; src1.base
-+  and r1, r0, r2                ; mov ra_y_next, ra0.16a
-+  xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
-+  add r0, r0, r1                ; mov ra1, unif                 # Add stripe offsets ; src2.x_y
-+  add ra_base_next, ra_base_next, r0                            # [ra1 delay]
-+
-+  add r0, ra1.16b, r3                                           # Load x2
-+.if v_x_shift != 0
-+  shl r0, r0, v_x_shift
-+.endif
-+  max r0, r0, r5                ; mov ra_y2_next, ra1.16a
-+  min r0, r0, rb_max_x          ; mov rb_base2_next, unif       # ; src2.base
-+  shl rb_xshift2_next, r0, 3                                    # Compute shifts
-+  and r0, r0, -4                ; mov ra_width_height, unif     # ; width_height
-+  and r1, r0, r2                ; mov vw_setup, rb_vpm_init     # ; set up VPM write
-+  xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
-+  add r0, r0, r1                ; mul24 r1, ra_width, v_x_mul   # Add stripe offsets ; r1 = width in bytes
-+  add rb_base2_next, rb_base2_next, r0
-+
-+# get width,height of block (unif load above), r1 = width * pel_size
-+  sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height             # Compute vdw_setup1(dst_pitch-width)
-+  add rb_i_tmu, r0, (7-8) - PREREAD ; v8min r0, r0, ra_blk_height
-+  add rb_lcount, r0, (7-8)
-+  shl r0, r0, v_dma_h_shift     ; mov r3, ra_kmul_add           # ; r3 return val
-+  add r0, r0, r1                                                # Combine width and height of destination area
-+  shl r0, r0, v_dma_wh_shift    ; mov r2, ra_fir_off_val        # Shift into bits 16 upwards of the vdw_setup0 register ; r2 return val
-+  add ra_dma0, r0, rb_dma0_base ; mov r0, unif                  # ; Packed filter offsets
-+
-+# get filter coefficients and discard unused B frame values
-+  shl.ifnn r0, r0, i_shift16    ; mov ra_wt_off_mul_l0, unif    #  Pick half to use ; L0 offset/weight
-+  shl ra8, r0, 3                ; mov rb5, ra_k255
-+
-+# Coeffs are all abs values here as that means mul24 works (no sign extend from .8)
-+
-+# 2nd half coeffs same as first if we can swap 8<->24 in the rotate val
-+# but I can't see a way of doing that that is cheap enough to be worth it
-+
-+# Picked out in a slightly random order to space out uniform loads
-+
-+  # 1
-+  mov r1, 0x01040400            # [ra8 delay]
-+  ror ra2.8b, r1, ra8.8d
-+  ror ra0.8b, r1, ra8.8c
-+  # 2
-+  ror ra2.8c, rb_y_coeffs_2, ra8.8d
-+  ror ra0.8c, rb_y_coeffs_2, ra8.8c
-+  # 0
-+  mov r1,0x00010100             # -ve  [ra8 delay]
-+  ror r0, r1, ra8.8d            ; mov ra_wt_off_mul_l1, unif    # ; L1 Wt/Offset
-+  ror ra0.8a, r1, ra8.8c        ; v8min rb4, r0, rb5
-+  # 7
-+  shl r1, r1, 8                 ; mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 # r1 = 0x01010000
-+  ror r0, r1, ra8.8d            ; mov ra_dest, unif             # ; Destination address
-+  ror ra1.8d, r1, ra8.8c        ; v8min rb11, r0, rb5
-+  # 3
-+  ror ra2.8d, rb_y_coeffs_3, ra8.8d
-+  ror ra0.8d, rb_y_coeffs_3, ra8.8c
-+  # 5
-+  ror ra3.8b, rb_y_coeffs_5, ra8.8d
-+  ror ra1.8b, rb_y_coeffs_5, ra8.8c
-+  # 6
-+  mov r1,0x04040100
-+  ror ra3.8c, r1, ra8.8d
-+  ror ra1.8c, r1, ra8.8c        ; mov r5rep, -8                 # ; r5 return val
-+
-+  bra -, ra_link
-+  # 4
-+  mov r1,0x3a281100
-+  ror r0, r1, ra8.8d            ; mov ra_link, unif             # ; link - load after we've used its previous val
-+  ror ra1.8a, r1, ra8.8c        ; v8min rb8, r0, rb5
-+# >>> branch ra_link
-+
-+# Values returned to the caller:
-+# r5 = -8
-+# r2 = fir_off_val
-+# r3 = 128 (loaded from ra_kmul_add above)
-+.endm
-+
-+:per_block_setup_8
-+  m_per_block_setup 8
-+
-+
-+
-+################################################################################
-+#
-+# mc_filter_y_pxx
-+#
-+# Setup (& therefore uniform struct) shared with _bxx
-+# Struct in m_luma_setup
-+#
-+# We can have 2 separate P reqs here as long as they mate to generate a
-+# rectangular output block (i.e. h0 = h1, w0 = 8)
-+#
-+# At this point we have already issued PREREAD pairs of texture requests for the current block
-+
-+.macro m_filter_y_pxx, v_bit_depth
-+
-+# denom shift values
-+.set i_wt_den_p5,                  (DENOM + 13 - v_bit_depth)
-+.set i_wt_den_p6,                  (DENOM + 14 - v_bit_depth)
-+
-+  m_luma_setup v_bit_depth
-+
-+  shl r1, ra_wt_off_l0, i_wt_den_p5
-+  add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0 # r2 = 0x4000 so mul24 safe even with -ve wt_mul
-+  sub rb_wt_off, r1, r0         ; mov ra_ef.8a, rb4
-+
-+# retrieve texture results and pick out bytes
-+# then submit two more texture requests
-+
-+# This loop is identical to the B loop from here --->
-+:1
-+  add.setf -, ra_ef, ra_ef      ; mul24 ra4, rb5, ra_ef
-+
-+  max r2, ra_y, 0               ; mov r1, 0
-+  min r2, r2, rb_max_y          ; mov r3, ra_k1
-+  add ra_y, ra_y, r3            ; mul24 r2, r2, rb_pitch        ; ldtmu0
-+  add t0s, ra_base, r2          ; mov rb5,  rb6
-+  shr r0, r4, ra_xshift         ; mov rb6,  rb7
-+
-+  max r2, ra_y2, r1             ; v8min r0, r0, rb_pmask        ; ldtmu1 # ; masks out all but wanted bytes
-+  shr r1, r4, rb_xshift2        ; mov rb7, ra8
-+  min r2, r2, rb_max_y          ; v8min r1, r1, ra_pmax
-+  add ra_y2, ra_y2, r3          ; mul24 r2, r2, rb_pitch
-+  add t1s, rb_base2, r2         ; mov ra8,  ra9
-+
-+# apply horizontal filter
-+  add r5rep, r5, r3     ; mul24      r2, ra0.8a << 8,  r1 << 8  @ "mul_used", 0
-+  mov r3, rb_fir_off_h  ; mul24.ifnn r2, ra0.8a,       r0
-+  sub r2, r3, r2        ; mul24      r3, ra0.8b << 1,  r0 << 1  @ "mul_used", 0
-+  nop                   ; mul24.ifn  r3, ra0.8b << 9,  r1 << 9  @ "mul_used", 0
-+  add r2, r2, r3        ; mul24      r3, ra0.8c << 2,  r0 << 2  @ "mul_used", 0
-+  nop                   ; mul24.ifn  r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
-+  sub r2, r2, r3        ; mul24      r3, ra0.8d << 3,  r0 << 3  @ "mul_used", 0
-+  nop                   ; mul24.ifn  r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
-+  add r2, r2, r3        ; mul24      r3, ra1.8a << 4,  r0 << 4  @ "mul_used", 0
-+  nop                   ; mul24.ifn  r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
-+  add r2, r2, r3        ; mul24      r3, ra1.8b << 5,  r0 << 5  @ "mul_used", 0
-+  nop                   ; mul24.ifn  r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
-+  sub r2, r2, r3        ; mul24      r3, ra1.8c << 6,  r0 << 6  @ "mul_used", 0
-+  nop                   ; mul24.ifn  r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
-+  add r2, r2, r3        ; mul24      r3, ra1.8d << 7,  r0 << 7  @ "mul_used", 0
-+  add.setf -, r5, r5    ; mul24.ifn  r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
-+
-+  brr.anyn -, r:1b
-+  sub r2, r2, r3                ; mul24 r1, rb5,  ra2.8b
-+  mov ra9,  rb10                ; mul24 r0, rb10, ra3.8b
-+  asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
-+  # >>> .anyn 1b (r5 + r5)
-+
-+  # apply vertical filter and write to VPM
-+  # - r4* + r5 - r6 + r7 + r8 - r9 + r10 - r11
-+
-+  sub r1, r1, r0                ; mul24 r0, rb6,  ra2.8c
-+  sub r1, r1, r0                ; mul24 r0, rb7,  ra2.8d
-+  add r1, r1, r0                ; mul24 r0, ra8,  rb8
-+  add r1, r1, r0                ; mul24 r0, rb10, ra3.8c
-+  add r1, r1, r0                ; mul24 r0, ra11, rb11
-+# <--- to here
-+  sub.setf -, r5, rb_i_tmu      ; mov r3, ra_blk_height                 # ; NxtLoop: r3 = block height
-+  sub r1, r1, ra4               ; mov.ifz rb_base2, rb_base2_next
-+  sub r1, r1, r0                ; mov.ifz ra_base, ra_base_next
-+
-+  asr r1, r1, 6                 ; mov.ifz ra_y_y2, ra_y_y2_next
-+  sub.setf -, r5, rb_lcount     ; mul24 r0, r1, ra_wt_mul_l0
-+  add r0, r0, rb_wt_off         ; mul24 r1, r1, ra_kmul_add
-+  sub r1, r0, r1                ; v8subs r0, ra_height, r3              # ; NxtLoop: r0 = remaining height (0 saturate)
-+
-+  brr.anyn -, r:1b
-+  asr r1, r1, i_wt_den_p6
-+  min r1, r1, ra_pmax           ; mov -, vw_wait                        # clamp to pel max ; read vw_wait before the vpm write
-+  max vpm, r1, ra_k0            ; mul24 r2, r3, rb_pitch                # ; NxtLoop
-+# >>> branch.anyn 1b (r5 - rb_lcount)
-+
-+# r0 = remaining height (min 0)
-+# r2 = r3 * rb_pitch
-+# r3 = block_height
-+
-+# If looping again then we consumed block_height last loop
-+# rb_dma1 (stride) remains constant
-+# rb_i_tmu remains const (based on total height)
-+# recalc ra_dma0, rb_lcount based on new segment height
-+
-+  mov.setf ra_height, r0        ; mov vw_setup, ra_dma0 # VDW setup 0
-+
-+# DMA out
-+  bra.anyz -, ra_link
-+  min r0, r0, r3                ; mov vw_setup, rb_dma1 # Stride
-+  sub r1, r0, r3                ; mov vw_addr, ra_dest  # start the VDW
-+  shl r1, r1, i_shift23
-+# >>> .anyz ra_link
-+
-+# Here r1 = cur_blk_height - blk_height so it will be 0 or -ve
-+# We add to dma0 to reduce the number of output lines in the final block
-+  brr -, r:1b
-+  add rb_lcount, rb_lcount, r0
-+  add ra_dma0, ra_dma0, r1
-+  add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init     # ; Reset our VPM write pointer
-+# >>> 1b
-+.endm
-+
-+::mc_filter_y_pxx
-+  m_filter_y_pxx 8
-+
-+
-+################################################################################
-+
-+# mc_filter_b(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
-+#
-+# Setup (& therefore uniform struct) shared with _pxx
-+# Struct in m_luma_setup
-+#
-+# l0 calc in els 0-7, L1 in 8-15
-+# Only els 0-7 write data that is stored back to ram (els 8-15 may write tosh)
-+#
-+# At this point we have already issued PREREAD pairs of texture requests for the current block
-+
-+.macro m_filter_y_bxx, v_bit_depth
-+
-+# denom shift values
-+.set i_wt_den_p5,                  (DENOM + 13 - v_bit_depth)
-+.set i_wt_den_p6,                  (DENOM + 14 - v_bit_depth)
-+
-+  m_luma_setup v_bit_depth
-+
-+  shl r1, ra_wt_off_l0, i_wt_den_p6
-+  add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0
-+  sub r1, r1, r0                ; mul24 r0, r2, ra_wt_mul_l1
-+  sub rb_wt_off, r1, r0         ; mov ra_ef.8a, rb4
-+
-+# This loop is identical to the P loop from here --->
-+:1
-+  add.setf -, ra_ef, ra_ef      ; mul24 ra4, rb5, ra_ef
-+
-+  max r2, ra_y, 0               ; mov r1, 0
-+  min r2, r2, rb_max_y          ; mov r3, ra_k1
-+  add ra_y, ra_y, r3            ; mul24 r2, r2, rb_pitch        ; ldtmu0
-+  add t0s, ra_base, r2          ; mov rb5,  rb6
-+  shr r0, r4, ra_xshift         ; mov rb6,  rb7
-+
-+  max r2, ra_y2, r1             ; v8min r0, r0, rb_pmask        ; ldtmu1 # ; masks out all but wanted bytes
-+  shr r1, r4, rb_xshift2        ; mov rb7, ra8
-+  min r2, r2, rb_max_y          ; v8min r1, r1, ra_pmax
-+  add ra_y2, ra_y2, r3          ; mul24 r2, r2, rb_pitch
-+  add t1s, rb_base2, r2         ; mov ra8,  ra9
-+
-+# apply horizontal filter
-+  add r5rep, r5, r3     ; mul24      r2, ra0.8a << 8,  r1 << 8  @ "mul_used", 0
-+  mov r3, rb_fir_off_h  ; mul24.ifnn r2, ra0.8a,       r0
-+  sub r2, r3, r2        ; mul24      r3, ra0.8b << 1,  r0 << 1  @ "mul_used", 0
-+  nop                   ; mul24.ifn  r3, ra0.8b << 9,  r1 << 9  @ "mul_used", 0
-+  add r2, r2, r3        ; mul24      r3, ra0.8c << 2,  r0 << 2  @ "mul_used", 0
-+  nop                   ; mul24.ifn  r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
-+  sub r2, r2, r3        ; mul24      r3, ra0.8d << 3,  r0 << 3  @ "mul_used", 0
-+  nop                   ; mul24.ifn  r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
-+  add r2, r2, r3        ; mul24      r3, ra1.8a << 4,  r0 << 4  @ "mul_used", 0
-+  nop                   ; mul24.ifn  r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
-+  add r2, r2, r3        ; mul24      r3, ra1.8b << 5,  r0 << 5  @ "mul_used", 0
-+  nop                   ; mul24.ifn  r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
-+  sub r2, r2, r3        ; mul24      r3, ra1.8c << 6,  r0 << 6  @ "mul_used", 0
-+  nop                   ; mul24.ifn  r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
-+  add r2, r2, r3        ; mul24      r3, ra1.8d << 7,  r0 << 7  @ "mul_used", 0
-+  add.setf -, r5, r5    ; mul24.ifn  r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
-+
-+  brr.anyn -, r:1b
-+  sub r2, r2, r3                ; mul24 r1, rb5,  ra2.8b
-+  mov ra9,  rb10                ; mul24 r0, rb10, ra3.8b
-+  asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
-+  # >>> .anyn 1b (r5 + r5)
-+
-+  # apply vertical filter and write to VPM
-+  # - r4* + r5 - r6 + r7 + r8 - r9 + r10 - r11
-+
-+  sub r1, r1, r0                ; mul24 r0, rb6,  ra2.8c
-+  sub r1, r1, r0                ; mul24 r0, rb7,  ra2.8d
-+  add r1, r1, r0                ; mul24 r0, ra8,  rb8
-+  add r1, r1, r0                ; mul24 r0, rb10, ra3.8c
-+  add r1, r1, r0                ; mul24 r0, ra11, rb11
-+# <--- to here
-+  sub r1, r1, ra4
-+  sub r1, r1, r0                ; mov r2, rb_wt_off
-+
-+  asr r1, r1, 6
-+  sub.setf -, r5, rb_i_tmu      ; mul24 r0, r1, ra_wt_mul_l0
-+  mov.ifz rb_base2, rb_base2_next ; mul24 r1, r1, ra_kmul_add
-+  sub r1, r0, r1                ; mov.ifz ra_y_y2, ra_y_y2_next
-+  sub.setf -, r5, rb_lcount     ; mov.ifz ra_base, ra_base_next
-+  add r1, r1, r2                ; mov r0, r1 << 8               # ; NOTE(review): presumably brings the L1 half (els 8-15) alongside L0 - confirm
-+  add r1, r1, r0                ; mov r3, ra_blk_height         # ; NxtLoop: r3 = block height
-+
-+  brr.anyn -, r:1b
-+  asr r1, r1, ra_wt_den_p7      ; mul24 r2, r3, rb_pitch        # ; NxtLoop
-+  min r1, r1, ra_pmax           ; mov -, vw_wait                # clamp to pel max ; read vw_wait before the vpm write
-+  max vpm, r1, 0                ; v8subs r0, ra_height, r3      # ; NxtLoop: r0 = remaining height (0 saturate)
-+# >>> branch.anyn 1b (r5 - rb_lcount)
-+
-+# r0 = remaining height (min 0)
-+# r2 = r3 * rb_pitch
-+# r3 = block_height
-+
-+# If looping again then we consumed block_height last loop
-+# rb_dma1 (stride) remains constant
-+# rb_i_tmu remains const (based on total height)
-+# recalc ra_dma0, rb_lcount based on new segment height
-+
-+  mov.setf ra_height, r0        ; mov vw_setup, ra_dma0         # VDW setup 0
-+
-+# DMA out
-+  bra.anyz -, ra_link
-+  min r0, r0, r3                ; mov vw_setup, rb_dma1         # Stride
-+  sub r1, r0, r3                ; mov vw_addr, ra_dest          # start the VDW
-+  shl r1, r1, i_shift23
-+# >>> .anyz ra_link (ra_height - remaining height)
-+
-+# Here r1 = cur_blk_height - blk_height so it will be 0 or -ve
-+# We add to dma0 to reduce the number of output lines in the final block
-+  brr -, r:1b
-+  add rb_lcount, rb_lcount, r0
-+  add ra_dma0, ra_dma0, r1
-+  add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init     # ; Reset our VPM write pointer
-+# >>> 1b
-+.endm
-+
-+::mc_filter_y_bxx
-+  m_filter_y_bxx 8
-+
-+################################################################################
-+#
-+# typedef struct qpu_mc_pred_y_p00_s {
-+#    qpu_mc_src_t next_src1;
-+#    uint16_t h;
-+#    uint16_t w;
-+#    uint32_t wo1;
-+#    uint32_t dst_addr;
-+#    uint32_t next_fn;
-+# } qpu_mc_pred_y_p00_t;
-+
-+.macro m_filter_y_p00, v_bit_depth
-+
-+.if v_bit_depth <= 8
-+.set v_x_shift,         0
-+.set v_x_mul,           1
-+# Shifts to get width & height in the right place in ra_dma0
-+.set v_dma_h_shift,     7
-+.set v_dma_wh_shift,    i_shift16
-+.else
-+.set v_x_shift,         1
-+.set v_x_mul,           2
-+# Shifts to get width & height in the right place in ra_dma0
-+.set v_dma_h_shift,     8
-+.set v_dma_wh_shift,    15
-+.endif
-+
-+  mov ra0, unif                 ; mov r0, elem_num              # y_x
-+  mov ra_xshift, ra_xshift_next ; v8subs r5rep, r5, r5          # [ra0 delay] ; r5 = 0
-+  add r0, ra0.16b, r0           ; mov ra_base_next, unif        # ; src1.base
-+.if v_x_shift != 0
-+  shl r0, r0, v_x_shift
-+.endif
-+
-+  max r0, r0, r5                ; mov ra_y_next, ra0.16a        # ; width_height
-+  min r0, r0, rb_max_x          ; mov ra_width_height, unif
-+
-+  shl ra_xshift_next, r0, 3                                     # Compute shifts
-+  and r0, r0, -4
-+  sub r2, r5, rb_pitch          ; mov ra_wt_off_mul_l0, unif    # ; weight_offset
-+  and r1, r0, r2
-+  xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
-+  add r0, r0, r1                ; mov ra_dest, unif             # Add stripe offsets ; dest addr
-+  add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init  # [ra_width delay] ; set up VPM write
-+
-+# get width,height of block (unif load above)
-+# Compute vdw_setup1(dst_pitch-width)
-+  shl r1, ra_width, v_x_shift
-+  sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
-+  sub rb_i_tmu, r0, PREREAD     ; v8min r0, r0, ra_blk_height
-+  shl r0, r0, v_dma_h_shift     ; mov rb_lcount, r0
-+  add r0, r0, r1                                                # Combine width and height of destination area
-+  shl rb_wt_off, ra_wt_off_l0, DENOM + 7
-+  shl r0, r0, v_dma_wh_shift    ; mov ra_link, unif             # Shift into bits 16 upwards of the vdw_setup0 register ; link
-+  add ra_dma0, r0, rb_dma0_base
-+
-+:1
-+  sub.setf -, r5, rb_i_tmu      ; v8adds r5rep, r5, ra_k1
-+  nop                           ; mov.ifz ra_y, ra_y_next       ; ldtmu0
-+  shr r0, r4, ra_xshift         ; mov r3, rb_pitch
-+
-+  max r2, ra_y, 0  # y
-+  min r2, r2, rb_max_y          ; mov.ifz ra_base, ra_base_next
-+  add ra_y, ra_y, 1             ; mul24 r2, r2, r3
-+  add t0s, ra_base, r2          ; v8min r0, r0, rb_pmask
-+
-+  sub.setf -, r5, rb_lcount     ; mul24 r1, r0, ra_wt_mul_l0
-+  shl r1, r1, 8                 ; mov r3, ra_blk_height
-+  add r1, r1, rb_wt_off         ; v8subs r0, ra_height, r3
-+
-+  brr.anyn -, r:1b
-+  asr r1, r1, DENOM + 8
-+  min r1, r1, ra_pmax           ; mov -, vw_wait
-+  max vpm, r1, ra_k0            ; mul24 r2, r3, rb_pitch
-+# >>> branch.anyn 1b
-+
-+# r0 = remaining height (min 0)
-+# r2 = r3 * rb_pitch
-+# r3 = block_height
-+
-+# If looping again then we consumed 16 height last loop
-+# rb_dma1 (stride) remains constant
-+# rb_i_tmu remains const (based on total height)
-+# recalc ra_dma0, rb_lcount based on new segment height
-+
-+  mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # VDW setup 0
-+
-+# DMA out
-+  bra.anyz -, ra_link
-+  min r0, r0, r3        ; mov vw_setup, rb_dma1 # Stride
-+  sub r1, r0, r3        ; mov vw_addr, ra_dest  # start the VDW
-+  shl r1, r1, i_shift23
-+# >>> .anyz ra_link
-+
-+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
-+# We add to dma0 to reduce the number of output lines in the final block
-+  brr -, r:1b
-+  add rb_lcount, rb_lcount, r0
-+  add ra_dma0, ra_dma0, r1
-+  add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init     # ; Reset our VDM write pointer
-+# >>> 1b
-+.endm
-+
-+::mc_filter_y_p00
-+  m_filter_y_p00 8
-+
-+################################################################################
-+
-+.macro m_filter_y_b00, v_bit_depth
-+# luma setup does a fair bit more than we need calculating filter coeffs
-+# that we will never use but it saves I-cache to use it (also simple!)
-+  m_luma_setup v_bit_depth
-+
-+# Fix up vals that were expecting a filter (somewhat icky)
-+  mov r2, 1
-+  add rb_i_tmu, rb_i_tmu, r2    ; mov r1, ra_wt_off_mul_l0      # Need in rX rather than raX for <<8 to do what we want
-+  shl rb_wt_off, ra_wt_off_l0, DENOM + 8 ; v8subs r5quad, r5, r5 # [r1 << delay] ; r5quad OK for zero
-+  nop                           ; mov.ifnz ra_wt_off_mul_l0, r1 << 8
-+
-+:1
-+  sub.setf -, r5, rb_i_tmu      ; nop                           ; ldtmu1
-+  shr r1, r4, rb_xshift2        ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
-+  shr r0, r4, ra_xshift         ; mov r3, rb_pitch
-+
-+  max r2, ra_y, 0  # y
-+  min r2, r2, rb_max_y          ; mov.ifz ra_base, ra_base_next
-+  add ra_y, ra_y, 1             ; mul24 r2, r2, r3
-+  add t0s, ra_base, r2          ; mov.ifz rb_base2, rb_base2_next
-+
-+  max r2, ra_y2, 0
-+  min r2, r2, rb_max_y
-+  add ra_y2, ra_y2, 1           ; mul24 r2, r2, r3
-+  add t1s, rb_base2, r2         ; v8min r0, r0, ra_pmax         # v8subs masks out all but bottom byte
-+  and r1, r1, rb_pmask          ; mul24 r0, r0, ra_wt_mul_l0
-+
-+  sub.setf -, r5, rb_lcount     ; mul24 r1, r1, ra_wt_mul_l1
-+  add r1, r0, r1                ; v8adds r5rep, r5, ra_k1
-+
-+  shl r1, r1, 8                 ; mov r3, ra_blk_height
-+  add r1, r1, rb_wt_off         ; v8subs r0, ra_height, r3
-+
-+  brr.anyn -, r:1b
-+  asr r1, r1, (DENOM + 9) - 32                                  # -32 to get valid shift immediate
-+  min r1, r1, ra_pmax           ; mov -, vw_wait
-+  max vpm, r1, ra_k0            ; mul24 r2, r3, rb_pitch
-+# >>> branch.anyn 1b
-+
-+# r0 = remaining height (min 0)
-+# r2 = r3 * rb_pitch
-+# r3 = block_height
-+
-+# If looping again then we consumed 16 height last loop
-+# rb_dma1 (stride) remains constant
-+# rb_i_tmu remains const (based on total height)
-+# recalc ra_dma0, rb_lcount based on new segment height
-+
-+  mov.setf ra_height, r0        ; mov vw_setup, ra_dma0         # ; VDW setup 0
-+
-+# DMA out
-+  bra.anyz -, ra_link
-+  min r0, r0, r3                ; mov vw_setup, rb_dma1         # ; Stride
-+  sub r1, r0, r3                ; mov vw_addr, ra_dest          # ; start the VDW
-+  shl r1, r1, i_shift23
-+# >>> .anyz ra_link
-+
-+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
-+# We add to dma0 to reduce the number of output lines in the final block
-+  brr -, r:1b
-+  add rb_lcount, rb_lcount, r0
-+  add ra_dma0, ra_dma0, r1
-+  add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init     # ; Reset our VDM write pointer
-+# >>> 1b
-+.endm
-+
-+::mc_filter_y_b00
-+  m_filter_y_b00 8
-+
-+################################################################################
-+################################################################################
-+# 10 BIT
-+
-+::mc_setup_c10_q0
-+  m_setup_q0
-+::mc_setup_c10_qn
-+  m_setup_c 10
-+
-+::mc_filter_c10_p
-+  m_filter_c_p 0, 10
-+
-+::mc_filter_c10_p_l1
-+  m_filter_c_p 1, 10
-+
-+
-+::mc_filter_c10_b
-+  m_filter_c_b 10
-+
-+# Even if these fns are the same as for other bit depths we want our own copy
-+# to keep the code we are using in a single lump to avoid (direct map) cache
-+# thrashing
-+.set v_quads10, N_QPU_16 / 4
-+
-+::mc_sync10_q0
-+  m_sync_q 0, v_quads10
-+::mc_sync10_q1
-+  m_sync_q 1, v_quads10
-+::mc_sync10_q2
-+  m_sync_q 2, v_quads10
-+::mc_sync10_q3
-+  m_sync_q 3, v_quads10
-+::mc_sync10_q4
-+  m_sync_q 4, v_quads10
-+::mc_sync10_q5
-+  m_sync_q 5, v_quads10
-+::mc_sync10_q6
-+  m_sync_q 6, v_quads10
-+::mc_sync10_q7
-+  m_sync_q 7, v_quads10
-+::mc_sync10_q8
-+  m_sync_q 8, v_quads10
-+::mc_sync10_q9
-+  m_sync_q 9, v_quads10
-+::mc_sync10_q10
-+  m_sync_q 10, v_quads10
-+::mc_sync10_q11
-+  m_sync_q 11, v_quads10
-+
-+::mc_exit_y10_q0
-+::mc_exit_c10_q0
-+  m_exit_q0
-+
-+::mc_exit_y10_qn
-+::mc_exit_c10_qn
-+  m_exit_qn
-+
-+::mc_setup_y10_q0
-+  m_setup_q0
-+::mc_setup_y10_qn
-+  m_setup_y 10
-+
-+:per_block_setup_10
-+  m_per_block_setup 10
-+
-+::mc_filter_y10_pxx
-+  m_filter_y_pxx 10
-+
-+::mc_filter_y10_p00
-+  m_filter_y_p00 10
-+
-+::mc_filter_y10_bxx
-+  m_filter_y_bxx 10
-+
-+::mc_filter_y10_b00
-+  m_filter_y_b00 10
-+
-+
-+
-+::mc_end
-+# Do not add code here because mc_end must appear after all other code.
---- /dev/null
-+++ b/libavcodec/rpi_hevc_shader_cmd.h
-@@ -0,0 +1,165 @@
-+/*
-+Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
-+All rights reserved.
-+
-+Redistribution and use in source and binary forms, with or without
-+modification, are permitted provided that the following conditions are met:
-+    * Redistributions of source code must retain the above copyright
-+      notice, this list of conditions and the following disclaimer.
-+    * Redistributions in binary form must reproduce the above copyright
-+      notice, this list of conditions and the following disclaimer in the
-+      documentation and/or other materials provided with the distribution.
-+    * Neither the name of the copyright holder nor the
-+      names of its contributors may be used to endorse or promote products
-+      derived from this software without specific prior written permission.
-+
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
-+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+*/
-+
-+#ifndef RPI_SHADER_CMD_H
-+#define RPI_SHADER_CMD_H
-+
-+#pragma pack(push, 4)
-+
-+#if RPI_QPU_EMU_C && RPI_QPU_EMU_Y
-+// If mixed then we are just confused and get a lot of warnings....
-+typedef const uint8_t * qpu_mc_src_addr_t;
-+typedef uint8_t * qpu_mc_dst_addr_t;
-+#else
-+typedef uint32_t qpu_mc_src_addr_t;
-+typedef uint32_t qpu_mc_dst_addr_t;
-+#endif
-+
-+typedef struct qpu_mc_src_s
-+{
-+    int16_t y;
-+    int16_t x;
-+    qpu_mc_src_addr_t base;
-+} qpu_mc_src_t;
-+
-+
-+typedef struct qpu_mc_pred_c_p_s {
-+    qpu_mc_src_t next_src;
-+    uint16_t h;
-+    uint16_t w;
-+    uint32_t coeffs_x;
-+    uint32_t coeffs_y;
-+    uint32_t wo_u;
-+    uint32_t wo_v;
-+    qpu_mc_dst_addr_t dst_addr_c;
-+    uint32_t next_fn;
-+} qpu_mc_pred_c_p_t;
-+
-+typedef struct qpu_mc_pred_c_b_s {
-+    qpu_mc_src_t next_src1;
-+    uint16_t h;
-+    uint16_t w;
-+    uint32_t coeffs_x1;
-+    uint32_t coeffs_y1;
-+    int16_t weight_u1;
-+    int16_t weight_v1;
-+    qpu_mc_src_t next_src2;
-+    uint32_t coeffs_x2;
-+    uint32_t coeffs_y2;
-+    uint32_t wo_u2;
-+    uint32_t wo_v2;
-+    qpu_mc_dst_addr_t dst_addr_c;
-+    uint32_t next_fn;
-+} qpu_mc_pred_c_b_t;
-+
-+typedef struct qpu_mc_pred_c_s_s {
-+    qpu_mc_src_t next_src1;
-+    uint32_t pic_cw;            // C Width (== Y width / 2)
-+    uint32_t pic_ch;            // C Height (== Y Height / 2)
-+    uint32_t stride2;
-+    uint32_t stride1;
-+    qpu_mc_src_t next_src2;
-+    uint32_t next_fn;
-+} qpu_mc_pred_c_s_t;
-+
-+typedef struct qpu_mc_pred_c_s {
-+    union {
-+        qpu_mc_pred_c_p_t p;
-+        qpu_mc_pred_c_b_t b;
-+        qpu_mc_pred_c_s_t s;
-+    };
-+} qpu_mc_pred_c_t;
-+
-+
-+typedef struct qpu_mc_pred_y_p_s {
-+    qpu_mc_src_t next_src1;
-+    qpu_mc_src_t next_src2;
-+    uint16_t h;
-+    uint16_t w;
-+    uint32_t mymx21;
-+    uint32_t wo1;
-+    uint32_t wo2;
-+    qpu_mc_dst_addr_t dst_addr;
-+    uint32_t next_fn;
-+} qpu_mc_pred_y_p_t;
-+
-+typedef struct qpu_mc_pred_y_p00_s {
-+    qpu_mc_src_t next_src1;
-+    uint16_t h;
-+    uint16_t w;
-+    uint32_t wo1;
-+    qpu_mc_dst_addr_t dst_addr;
-+    uint32_t next_fn;
-+} qpu_mc_pred_y_p00_t;
-+
-+typedef struct qpu_mc_pred_y_s_s {
-+    qpu_mc_src_t next_src1;
-+    qpu_mc_src_t next_src2;
-+    uint16_t pic_h;
-+    uint16_t pic_w;
-+    uint32_t stride2;
-+    uint32_t stride1;
-+    uint32_t next_fn;
-+} qpu_mc_pred_y_s_t;
-+
-+typedef struct qpu_mc_pred_sync_s {
-+    uint32_t next_fn;
-+} qpu_mc_pred_sync_t;
-+
-+// Only a useful structure in that it allows us to return something other than a void *
-+typedef struct qpu_mc_pred_y_s {
-+    union {
-+        qpu_mc_pred_y_p_t p;
-+        qpu_mc_pred_y_p00_t p00;
-+        qpu_mc_pred_y_s_t s;
-+    };
-+} qpu_mc_pred_y_t;
-+
-+typedef union qpu_mc_pred_cmd_u {
-+    qpu_mc_pred_y_t y;
-+    qpu_mc_pred_c_t c;
-+    qpu_mc_pred_sync_t sync;
-+} qpu_mc_pred_cmd_t;
-+
-+static void inline qpu_mc_link_set(qpu_mc_pred_cmd_t * const cmd, const uint32_t fn)
-+{
-+    // Link is last el of previous cmd
-+    ((uint32_t *)cmd)[-1] = fn;
-+}
-+
-+#define QPU_MC_PRED_N_Y8        12
-+#define QPU_MC_PRED_N_C8        12
-+
-+#define QPU_MC_PRED_N_Y10       12
-+#define QPU_MC_PRED_N_C10       12
-+
-+#define QPU_MC_DENOM            7
-+
-+#pragma pack(pop)
-+
-+#endif
-+
---- /dev/null
-+++ b/libavcodec/rpi_hevc_shader_template.c
-@@ -0,0 +1,88 @@
-+/*
-+Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
-+All rights reserved.
-+
-+Redistribution and use in source and binary forms, with or without
-+modification, are permitted provided that the following conditions are met:
-+    * Redistributions of source code must retain the above copyright
-+      notice, this list of conditions and the following disclaimer.
-+    * Redistributions in binary form must reproduce the above copyright
-+      notice, this list of conditions and the following disclaimer in the
-+      documentation and/or other materials provided with the distribution.
-+    * Neither the name of the copyright holder nor the
-+      names of its contributors may be used to endorse or promote products
-+      derived from this software without specific prior written permission.
-+
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
-+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+*/
-+
-+#include "hevc.h"
-+#include "rpi_hevcdec.h"
-+#include "libavutil/rpi_sand_fns.h"
-+#include "rpi_hevc_shader_cmd.h"
-+#include "rpi_hevc_shader_template.h"
-+
-+typedef struct shader_track_s
-+{
-+    const union qpu_mc_pred_cmd_u *qpu_mc_curr;
-+    const struct qpu_mc_src_s *last_l0;
-+    const struct qpu_mc_src_s *last_l1;
-+    uint32_t width;  // pic_width * PW
-+    uint32_t height;
-+    uint32_t stride2;
-+    uint32_t stride1;
-+} shader_track_t;
-+
-+static int wtoidx(const unsigned int w)
-+{
-+    static const uint8_t pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
-+    return pel_weight[w];
-+}
-+
-+static const int fctom(uint32_t x)
-+{
-+    int rv;
-+    // As it happens we can take the 2nd filter term & divide it by 8
-+    // (dropping fractions) to get the fractional move
-+    rv = 8 - ((x >> 11) & 0xf);
-+    av_assert2(rv >= 0 && rv <= 7);
-+    return rv;
-+}
-+
-+static inline int32_t ext(int32_t x, unsigned int shl, unsigned int shr)
-+{
-+    return (x << shl) >> shr;
-+}
-+
-+static inline int woff_p(HEVCRpiContext *const s, int32_t x)
-+{
-+    return ext(x, 0, 17 + s->ps.sps->bit_depth - 8);
-+}
-+
-+static inline int woff_b(HEVCRpiContext *const s, int32_t x)
-+{
-+    return ext(x - 0x10000, 0, 16 + s->ps.sps->bit_depth - 8);
-+}
-+
-+static inline int wweight(int32_t x)
-+{
-+    return ext(x, 16, 16);
-+}
-+
-+
-+#define PW 1
-+#include "rpi_hevc_shader_template_fn.h"
-+
-+#undef PW
-+#define PW 2
-+#include "rpi_hevc_shader_template_fn.h"
-+
---- /dev/null
-+++ b/libavcodec/rpi_hevc_shader_template.h
-@@ -0,0 +1,49 @@
-+/*
-+Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
-+All rights reserved.
-+
-+Redistribution and use in source and binary forms, with or without
-+modification, are permitted provided that the following conditions are met:
-+    * Redistributions of source code must retain the above copyright
-+      notice, this list of conditions and the following disclaimer.
-+    * Redistributions in binary form must reproduce the above copyright
-+      notice, this list of conditions and the following disclaimer in the
-+      documentation and/or other materials provided with the distribution.
-+    * Neither the name of the copyright holder nor the
-+      names of its contributors may be used to endorse or promote products
-+      derived from this software without specific prior written permission.
-+
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
-+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+*/
-+
-+#ifndef LIBAVCODEC_RPI_SHADER_TEMPLATE_H
-+#define LIBAVCODEC_RPI_SHADER_TEMPLATE_H
-+
-+struct HEVCRpiContext;
-+struct HEVCRpiInterPredEnv;
-+
-+void ff_hevc_rpi_shader_c8(struct HEVCRpiContext *const s,
-+                  const struct HEVCRpiInterPredEnv *const ipe_y,
-+                  const struct HEVCRpiInterPredEnv *const ipe_c);
-+
-+void ff_hevc_rpi_shader_c16(struct HEVCRpiContext *const s,
-+                  const struct HEVCRpiInterPredEnv *const ipe_y,
-+                  const struct HEVCRpiInterPredEnv *const ipe_c);
-+
-+void rpi_sand_dump8(const char * const name,
-+                    const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c);
-+
-+void rpi_sand_dump16(const char * const name,
-+                     const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c);
-+
-+#endif
-+
---- /dev/null
-+++ b/libavcodec/rpi_hevc_shader_template_fn.h
-@@ -0,0 +1,502 @@
-+/*
-+Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
-+All rights reserved.
-+
-+Redistribution and use in source and binary forms, with or without
-+modification, are permitted provided that the following conditions are met:
-+    * Redistributions of source code must retain the above copyright
-+      notice, this list of conditions and the following disclaimer.
-+    * Redistributions in binary form must reproduce the above copyright
-+      notice, this list of conditions and the following disclaimer in the
-+      documentation and/or other materials provided with the distribution.
-+    * Neither the name of the copyright holder nor the
-+      names of its contributors may be used to endorse or promote products
-+      derived from this software without specific prior written permission.
-+
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
-+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+*/
-+
-+#define STRCAT(x,y) x##y
-+
-+#if PW == 1
-+#define pixel uint8_t
-+#define FUNC(f) STRCAT(f, 8)
-+#elif PW == 2
-+#define pixel uint16_t
-+#define FUNC(f) STRCAT(f, 16)
-+#else
-+#error Unexpected PW
-+#endif
-+
-+#define PATCH_STRIDE (16 * PW)
-+
-+static void FUNC(dup_lr)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride)
-+{
-+    for (unsigned int i = 0; i != h; ++i, dst += stride, src += stride) {
-+        const pixel s = *(const pixel *)src;
-+        pixel * d = (pixel *)dst;
-+        for (unsigned int j = 0; j < w; j += PW) {
-+            *d++ = s;
-+        }
-+    }
-+}
-+
-+static void FUNC(dup_tb)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride)
-+{
-+    for (unsigned int i = 0; i != h; ++i, dst += stride) {
-+        memcpy(dst, src, w);
-+    }
-+}
-+
-+static void FUNC(get_patch_y)(const shader_track_t * const st,
-+                         uint8_t * dst, const unsigned int dst_stride,
-+                         const qpu_mc_src_t *src,
-+                         unsigned int _w, unsigned int _h)
-+{
-+    int x = src->x * PW;
-+    int y = src->y;
-+    int w = _w * PW;
-+    int h = _h;
-+    int dl = 0;
-+    int dr = 0;
-+    int dt = 0;
-+    int db = 0;
-+
-+    if (x < 0) {
-+        if (-x >= w)
-+            x = PW - w;
-+        dl = -x;
-+        w += x;
-+        x = 0;
-+    }
-+    if (x + w > st->width) {
-+        if (x >= st->width)
-+            x = st->width - PW;
-+        dr = (x + w) - st->width;
-+        w = st->width - x;
-+    }
-+
-+    // Y
-+    if (y < 0) {
-+        if (-y >= h)
-+            y = 1 - h;
-+        dt = -y;
-+        h += y;
-+        y = 0;
-+    }
-+    if (y + h > st->height) {
-+        if (y >= st->height)
-+            y = st->height - 1;
-+        db = (y + h) - st->height;
-+        h = st->height - y;
-+    }
-+
-+    dst += dl + dt * dst_stride;
-+    FUNC(av_rpi_sand_to_planar_y)(dst, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h);
-+
-+    // Edge dup
-+    if (dl != 0)
-+        FUNC(dup_lr)(dst - dl, dst, dl, h, dst_stride);
-+    if (dr != 0)
-+        FUNC(dup_lr)(dst + w, dst + w - PW, dr, h, dst_stride);
-+    w += dl + dr;
-+    dst -= dl;
-+
-+    if (dt != 0)
-+        FUNC(dup_tb)(dst - dt * dst_stride, dst, w, dt, dst_stride);
-+    if (db != 0)
-+        FUNC(dup_tb)(dst + h * dst_stride, dst + (h - 1) * dst_stride, w, db, dst_stride);
-+}
-+
-+
-+
-+static void FUNC(get_patch_c)(const shader_track_t * const st,
-+                         uint8_t * dst_u, uint8_t * dst_v, const unsigned int dst_stride,
-+                         const qpu_mc_src_t *src,
-+                         unsigned int _w, unsigned int _h)
-+{
-+    int x = src->x * PW;
-+    int y = src->y;
-+    int w = _w * PW;
-+    int h = _h;
-+    int dl = 0;
-+    int dr = 0;
-+    int dt = 0;
-+    int db = 0;
-+    const int width = st->width;
-+    const int height = st->height;
-+
-+    if (x < 0) {
-+        if (-x >= w)
-+            x = PW - w;
-+        dl = -x;
-+        w += x;
-+        x = 0;
-+    }
-+    if (x + w > width) {
-+        if (x >= width)
-+            x = width - PW;
-+        dr = (x + w) - width;
-+        w = width - x;
-+    }
-+
-+    // Y
-+    if (y < 0) {
-+        if (-y >= h)
-+            y = 1 - h;
-+        dt = -y;
-+        h += y;
-+        y = 0;
-+    }
-+    if (y + h > height) {
-+        if (y >= height)
-+            y = height - 1;
-+        db = (y + h) - height;
-+        h = height - y;
-+    }
-+
-+    dst_u += dl + dt * dst_stride;
-+    dst_v += dl + dt * dst_stride;
-+    FUNC(av_rpi_sand_to_planar_c)(dst_u, dst_stride, dst_v, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h);
-+
-+    // Edge dup
-+    if (dl != 0)
-+    {
-+        FUNC(dup_lr)(dst_u - dl, dst_u, dl, h, dst_stride);
-+        FUNC(dup_lr)(dst_v - dl, dst_v, dl, h, dst_stride);
-+    }
-+    if (dr != 0)
-+    {
-+        FUNC(dup_lr)(dst_u + w, dst_u + w - PW, dr, h, dst_stride);
-+        FUNC(dup_lr)(dst_v + w, dst_v + w - PW, dr, h, dst_stride);
-+    }
-+    w += dl + dr;
-+    dst_u -= dl;
-+    dst_v -= dl;
-+
-+    if (dt != 0)
-+    {
-+        FUNC(dup_tb)(dst_u - dt * dst_stride, dst_u, w, dt, dst_stride);
-+        FUNC(dup_tb)(dst_v - dt * dst_stride, dst_v, w, dt, dst_stride);
-+    }
-+    if (db != 0)
-+    {
-+        FUNC(dup_tb)(dst_u + h * dst_stride, dst_u + (h - 1) * dst_stride, w, db, dst_stride);
-+        FUNC(dup_tb)(dst_v + h * dst_stride, dst_v + (h - 1) * dst_stride, w, db, dst_stride);
-+    }
-+}
-+
-+// w, y, w, h in pixels
-+// stride1, stride2 in bytes
-+void FUNC(rpi_sand_dump)(const char * const name,
-+                         const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c)
-+{
-+    const int mask = stride2 == 0 ? ~0 : stride1 - 1;
-+
-+    printf("%s (%d,%d) %dx%d\n", name, x, y, w, h);
-+
-+    if (is_c) {
-+        x *= 2;
-+        w *= 2;
-+    }
-+
-+    for (int i = y; i != y + h; ++i) {
-+        for (int j = x; j != x + w; ++j) {
-+            const uint8_t * p = base + ((j*PW) & mask) + i * stride1 + ((j*PW) & ~mask) * stride2;
-+            char sep = is_c && (j & 1) == 0 ? ':' : ' ';
-+#if PW == 1
-+            if (j < 0 || i < 0)
-+                printf("..%c", sep);
-+            else
-+                printf("%02x%c", *(const pixel*)p, sep);
-+#else
-+            if (j < 0 || i < 0)
-+                printf("...%c", sep);
-+            else
-+                printf("%03x%c", *(const pixel*)p, sep);
-+#endif
-+        }
-+        printf("\n");
-+    }
-+}
-+
-+
-+void FUNC(ff_hevc_rpi_shader_c)(HEVCRpiContext *const s,
-+                  const HEVCRpiInterPredEnv *const ipe_y,
-+                  const HEVCRpiInterPredEnv *const ipe_c)
-+{
-+    for (int c_idx = 0; c_idx < 2; ++c_idx)
-+    {
-+        const HEVCRpiInterPredEnv *const ipe = c_idx == 0 ? ipe_y : ipe_c;
-+        shader_track_t tracka[QPU_N_MAX] = {{NULL}};
-+        unsigned int exit_n = 0;
-+
-+        if (ipe == NULL || !ipe->used) {
-+            continue;
-+        }
-+
-+        do {
-+            for (unsigned int i = 0; i != ipe->n; ++i) {
-+                const HEVCRpiInterPredQ * const q = ipe->q + i;
-+                shader_track_t * const st = tracka + i;
-+                const qpu_mc_pred_cmd_t * cmd = st->qpu_mc_curr == NULL ? q->qpu_mc_base : st->qpu_mc_curr;
-+
-+                for (;;) {
-+                    const uint32_t link = (cmd == q->qpu_mc_base) ? q->code_setup : ((uint32_t *)cmd)[-1];
-+
-+                    if (link == q->code_setup) {
-+                        if (c_idx == 0) {
-+                            // Luma
-+                            const qpu_mc_pred_y_s_t *const c = &cmd->y.s;
-+
-+                            st->height = c->pic_h;
-+                            st->width = c->pic_w * PW;
-+                            st->stride1 = c->stride1;
-+                            st->stride2 = c->stride2;
-+                            st->last_l0 = &c->next_src1;
-+                            st->last_l1 = &c->next_src2;
-+                            cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
-+                        }
-+                        else {
-+                            // Chroma
-+                            const qpu_mc_pred_c_s_t *const c = &cmd->c.s;
-+
-+                            st->height = c->pic_ch;
-+                            st->width = c->pic_cw * PW;
-+                            st->stride1 = c->stride1;
-+                            st->stride2 = c->stride2;
-+                            st->last_l0 = &c->next_src1;
-+                            st->last_l1 = &c->next_src2;
-+                            cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
-+                        }
-+                    }
-+                    else if (link == s->qpu.y_pxx) {
-+                        const qpu_mc_pred_y_p_t *const c = &cmd->y.p;
-+                        const int w1 = FFMIN(c->w, 8);
-+                        const int w2 = c->w - w1;
-+
-+                        uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
-+                        uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
-+
-+                        FUNC(get_patch_y)(st,
-+                                    patch_y1, PATCH_STRIDE,
-+                                    st->last_l0,
-+                                    16, c->h + 7);
-+                        if (w2 > 0) {
-+                            FUNC(get_patch_y)(st,
-+                                        patch_y2, PATCH_STRIDE,
-+                                        st->last_l1,
-+                                        16, c->h + 7);
-+                        }
-+
-+                        // wo[offset] = offset*2+1
-+                        s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w1)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0](
-+                            (uint8_t *)c->dst_addr, st->stride1, patch_y1 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE,
-+                            c->h, QPU_MC_DENOM, wweight(c->wo1), woff_p(s, c->wo1), (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), w1);
-+                        if (w2 > 0) {
-+                            s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w2)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0](
-+                                (uint8_t *)c->dst_addr + 8 * PW, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE,
-+                                c->h, QPU_MC_DENOM, wweight(c->wo2), woff_p(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), w2);
-+                        }
-+                        st->last_l0 = &c->next_src1;
-+                        st->last_l1 = &c->next_src2;
-+                        cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
-+                    }
-+                    else if (link == s->qpu.y_bxx) {
-+                        const qpu_mc_pred_y_p_t *const c = &cmd->y.p;
-+
-+                        uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
-+                        uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
-+                        int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE];
-+
-+                        FUNC(get_patch_y)(st,
-+                                    patch_y1, PATCH_STRIDE,
-+                                    st->last_l0,
-+                                    16, c->h + 7);
-+                        FUNC(get_patch_y)(st,
-+                                    patch_y2, PATCH_STRIDE,
-+                                    st->last_l1,
-+                                    16, c->h + 7);
-+
-+                        s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0](
-+                           patch_y3, patch_y1+ 3 * (PATCH_STRIDE + PW), PATCH_STRIDE,
-+                           c->h, (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), c->w);
-+
-+                        s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0](
-+                            (uint8_t *)c->dst_addr, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, patch_y3,
-+                            c->h, QPU_MC_DENOM, wweight(c->wo1), wweight(c->wo2),
-+                            0, woff_b(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), c->w);
-+                        st->last_l0 = &c->next_src1;
-+                        st->last_l1 = &c->next_src2;
-+                        cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
-+                    }
-+                    else if (link == s->qpu.y_p00) {
-+                        const qpu_mc_pred_y_p00_t *const c = &cmd->y.p00;
-+
-+                        uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
-+
-+                        FUNC(get_patch_y)(st,
-+                                    patch_y1, PATCH_STRIDE,
-+                                    st->last_l0,
-+                                    16, c->h + 7);
-+
-+                        // wo[offset] = offset*2+1
-+                        s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(c->w)][0][0](
-+                            (uint8_t *)c->dst_addr, st->stride1, patch_y1, PATCH_STRIDE,
-+                            c->h, QPU_MC_DENOM, wweight(c->wo1), woff_p(s, c->wo1), 0, 0, c->w);
-+
-+                        st->last_l0 = &c->next_src1;
-+                        cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
-+                    }
-+                    else if (link == s->qpu.y_b00) {
-+                        const qpu_mc_pred_y_p_t *const c = &cmd->y.p;
-+
-+                        uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
-+                        uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
-+                        int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE];
-+
-+                        av_assert0(c->w <= 16 && c->h <= 64);
-+
-+                        FUNC(get_patch_y)(st,
-+                                    patch_y1, PATCH_STRIDE,
-+                                    st->last_l0,
-+                                    16, c->h);
-+                        FUNC(get_patch_y)(st,
-+                                    patch_y2, PATCH_STRIDE,
-+                                    st->last_l1,
-+                                    16, c->h);
-+
-+                        s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][0][0](
-+                           patch_y3, patch_y1, PATCH_STRIDE,
-+                           c->h, 0, 0, c->w);
-+
-+                        s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][0][0](
-+                            (uint8_t *)c->dst_addr, st->stride1, patch_y2, PATCH_STRIDE, patch_y3,
-+                            c->h, QPU_MC_DENOM, wweight(c->wo1), wweight(c->wo2),
-+                            0, woff_b(s, c->wo2), 0, 0, c->w);
-+                        st->last_l0 = &c->next_src1;
-+                        st->last_l1 = &c->next_src2;
-+                        cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
-+                    }
-+                    else if (link == s->qpu.c_pxx) {
-+                        const qpu_mc_pred_c_p_t *const c = &cmd->c.p;
-+                        const int mx = fctom(c->coeffs_x);
-+                        const int my = fctom(c->coeffs_y);
-+
-+                        uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
-+                        uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
-+                        uint8_t patch_u3[8 * 16 * PW];
-+                        uint8_t patch_v3[8 * 16 * PW];
-+
-+                        FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3);
-+
-+                        s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
-+                            patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE,
-+                            c->h, QPU_MC_DENOM, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w);
-+                        s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
-+                            patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE,
-+                            c->h, QPU_MC_DENOM, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w);
-+
-+                        FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h);
-+
-+                        st->last_l0 = &c->next_src;
-+                        cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
-+                    }
-+                    else if (link == s->qpu.c_pxx_l1) {
-+                        const qpu_mc_pred_c_p_t *const c = &cmd->c.p;
-+                        const int mx = fctom(c->coeffs_x);
-+                        const int my = fctom(c->coeffs_y);
-+
-+                        uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
-+                        uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
-+                        uint8_t patch_u3[8 * 16 * PW];
-+                        uint8_t patch_v3[8 * 16 * PW];
-+
-+                        FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3);
-+
-+                        s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
-+                            patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE,
-+                            c->h, QPU_MC_DENOM, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w);
-+                        s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
-+                            patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE,
-+                            c->h, QPU_MC_DENOM, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w);
-+
-+                        FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h);
-+
-+                        st->last_l1 = &c->next_src;
-+                        cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
-+                    }
-+                    else if (link == s->qpu.c_bxx) {
-+                        const qpu_mc_pred_c_b_t *const c = &cmd->c.b;
-+                        const int mx1 = fctom(c->coeffs_x1);
-+                        const int my1 = fctom(c->coeffs_y1);
-+                        const int mx2 = fctom(c->coeffs_x2);
-+                        const int my2 = fctom(c->coeffs_y2);
-+
-+                        uint8_t patch_u1[PATCH_STRIDE * 72];
-+                        uint8_t patch_v1[PATCH_STRIDE * 72];
-+                        uint8_t patch_u2[PATCH_STRIDE * 72];
-+                        uint8_t patch_v2[PATCH_STRIDE * 72];
-+                        uint8_t patch_u3[8 * 16 * PW];
-+                        uint8_t patch_v3[8 * 16 * PW];
-+                        uint16_t patch_u4[MAX_PB_SIZE * MAX_PB_SIZE];
-+                        uint16_t patch_v4[MAX_PB_SIZE * MAX_PB_SIZE];
-+
-+                        FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3);
-+                        FUNC(get_patch_c)(st, patch_u2, patch_v2, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3);
-+
-+                        s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0](
-+                           patch_u4, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE,
-+                           c->h, mx1, my1, c->w);
-+                        s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0](
-+                           patch_v4, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE,
-+                           c->h, mx1, my1, c->w);
-+
-+                        s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0](
-+                            patch_u3, 8 * PW, patch_u2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_u4,
-+                            c->h, QPU_MC_DENOM, c->weight_u1, wweight(c->wo_u2),
-+                            0, woff_b(s, c->wo_u2), mx2, my2, c->w);
-+                        s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0](
-+                            patch_v3, 8 * PW, patch_v2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_v4,
-+                            c->h, QPU_MC_DENOM, c->weight_v1, wweight(c->wo_v2),
-+                            0, woff_b(s, c->wo_v2), mx2, my2, c->w);
-+
-+                        FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h);
-+
-+                        st->last_l0 = &c->next_src1;
-+                        st->last_l1 = &c->next_src2;
-+                        cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
-+                    }
-+                    else if (link == q->code_sync) {
-+                        cmd = (const qpu_mc_pred_cmd_t *)((uint32_t *)cmd + 1);
-+                        break;
-+                    }
-+                    else if (link == q->code_exit) {
-+                        // We expect exit to occur without other sync
-+                        av_assert0(i == exit_n);
-+                        ++exit_n;
-+                        break;
-+                    }
-+                    else {
-+                        av_assert0(0);
-+                    }
-+                }
-+
-+                st->qpu_mc_curr = cmd;
-+            }
-+        } while (exit_n == 0);
-+    }
-+}
-+
-+#undef FUNC
-+#undef pixel
-+
---- /dev/null
-+++ b/libavcodec/rpi_hevc_transform.s
-@@ -0,0 +1,444 @@
-+# ******************************************************************************
-+# Argon Design Ltd.
-+# (c) Copyright 2015 Argon Design Ltd. All rights reserved.
-+#
-+# Module : HEVC
-+# Author : Peter de Rivaz
-+# ******************************************************************************
-+
-+# USE_STACK = 1 means temporary data stored on the stack (requires build with larger stack)
-+# USE_STACK = 0 means temporary data stored in fixed per-VPU data buffers (requires modifications to vasm to handle instruction encoding for PC relative instructions)
-+.set USE_STACK, 0
-+
-+# Lines that fail to assemble start with #:
-+# The script insert_magic_opcodes.sh inserts the machine code directly for these.
-+# HEVC VPU Transform
-+#
-+# Transform matrix can be thought of as
-+#   output row vector = input row vector * transMatrix2
-+#
-+# The even rows of the matrix are symmetric
-+# The odd rows of the matrix are antisymmetric
-+#
-+# So only need to compute the first half of the results, then can compute the remainder with a butterfly
-+#
-+# EXAMPLE
-+#   (a b c d) (1 2  2  1)
-+#             (3 4 -4 -3)
-+#             (5 6  6  5)
-+#             (7 8 -8 -7)
-+#
-+#  x=(a c)(1 2) = 1a+5c 2a+6c
-+#         (5 6)
-+#
-+#  y=(b d)(3 4) = 3b+7d 4b+8d
-+#         (7 8)
-+#
-+#  u=x+y = 1a+5c+3b+7d 2a+4b+6c+8d
-+#  v=x-y = 1a+5c-3b-7d 2a+6c-4b-8d
-+#
-+#  Final results are (u , v[::-1])
-+#
-+#
-+#  For 32x1 input, load even rows into HX(0++,0), odd rows into HX(16++,0)
-+#  Apply the even matrix first and stop before rounding
-+#  Then apply the odd matrix in a full manner:
-+#
-+#   First step is to compute partial products with the first input (16 cycles)
-+#   1a 3b 5c 7d   16x1 input coefficients produce 16x16 output
-+#   2a 4b 6c 8d
-+#   2a -4b 6c -8d
-+#   1a -3b 5c -7d
-+#
-+#   Second step is to sum partial products into final position (8 cycles)
-+#   1a+3b+5c+7d
-+#   2a+4b+6c+8d
-+#   2a-4b+6c-8d
-+#   1a-3b+5c-7d
-+#
-+#   Then can apply butterfly to combine even results and odd results + rounding to produce 16 rows of output at a time (need to save in transposed format)
-+#
-+#   For 16x16 no butterfly is required and can store final results in original location  (Could do 2 16x16s in parallel to make use of the trick - saves on the adds)
-+#
-+#   For 8x8 we could compute two in parallel.
-+#
-+#
-+
-+# Columns are transformed first
-+#
-+# Store top left half of transMatrix2 in
-+# Store bottom left half of transMatrix2 in HX(32,32)
-+#
-+# For 16x16
-+# HX(0:15,0) contains input data before transform
-+# HY(0:15,0) contains 32bit output data after transform
-+# HX(32,0) contains even rows of left half of transMatrix2
-+# HX(32,32) contains odd rows of left half of transMatrix2
-+# HY(48,0) contains partial products ready for summing
-+#
-+
-+
-+# hevc_trans_16x16(short *transMatrix2, short *coeffs, int num) # TODO add size so we can branch to correct implementation (or perhaps have coeffs32 and num32 as secondary inputs!)
-+# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory)
-+# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
-+# num: number of 16x16 transforms to be done
-+# coeffs32
-+# num32: number of 32x32 transforms
-+# command 0 for transform, 1 for memclear16(int16_t *dst,num16)
-+#
-+
-+.equ TRANS_SHIFT, 20 - BIT_DEPTH
-+.equ TRANS_RND2, 1 << (TRANS_SHIFT - 1)
-+.equ TRANS_ASL2, 16 - TRANS_SHIFT
-+
-+
-+hevc_trans_16x16:
-+  push r6-r15, lr # TODO cut down number of used registers
-+  mov r14,r3 # coeffs32
-+  mov r15,r4 # num32
-+  mov r3, 16*2 # Stride of transMatrix2 in bytes
-+  vldh HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix
-+
-+  add r0, 16*16*2 # For 32x32 transforms we also need this matrix
-+  vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
-+
-+  # Now use r0 to describe which matrix we are working on.
-+  # Allows us to prefetch the next block of coefficients for efficiency.
-+  mov r0,0 # This describes the location where we read our coefficients from
-+  mov r3,16*2 # Stride of coefficients in bytes (TODO remove)
-+  mov r7,16*16*2 # Total block size
-+  mov r8,64*16 # Value used to swap from current to next VRF location
-+  mov r4,64 # Constant used for rounding first pass
-+  mov r5,TRANS_RND2 # Constant used for rounding second pass
-+
-+  sub sp,sp,64+16*16*2 # Move on stack pointer in case interrupt occurs and uses stack
-+
-+  add r11,sp,64 # Space for 32 bytes before, and rounding
-+  lsr r11,5
-+  lsl r11,5 # Make sure r11 is rounded to multiple of 2**5==32
-+
-+  lsr r10, r2, 16 # Number of compressed blocks stored in top short
-+  extu r2,16
-+  # At start of block r0,r1 point to the current block (that has already been loaded)
-+  # r0 VRF location of current block
-+  # r1 address of current block
-+  # r2 number of 16*16 transforms to do
-+  # r3 Stride of coefficients (==32)
-+  # r4 TRANS_RND1 (64)
-+  # r5 TRANS_RND2
-+  # r6 temporary used inside col_trans16
-+  # r7 16*16*2 total bytes in block
-+  # r8 64*16 VRF switch locations
-+  # r9 temporary in unpack_coeff for index
-+  # r10 number of 16x16 transforms using compression
-+  # r11 unpacked data buffer (16*16 shorts) (preceded by 16 shorts of packed data buffer)
-+  # r12 temporary counter in unpack_coeff
-+  # r13
-+  # r14 Save information for 32 bit transform (coeffs location)
-+  # r15 Save information for 32 bit transform (number of transforms)
-+  cmp r2,0
-+  beq done16x16s
-+block_loop:
-+  # With compressed coefficients, we don't use prefetch as we don't want to issue unnecessary memory requests
-+  cmp r10,0
-+  mov r6, r1
-+  beq not_compressed
-+  sub r10, 1
-+  bl unpack16x16
-+not_compressed:
-+  #mov r6,r1 # DEBUG without compress
-+  vldh HX(0++,0)+r0,(r6 += r3) REP 16
-+  #eor r0,r8
-+  #add r1,r7
-+  # Prefetch the next block
-+  #bl unpack16x16
-+  #vldh HX(0++,0)+r0,(r6 += r3) REP 16
-+  #vmov HX(0++,0)+r0,0 REP 16  # DEBUG
-+  #eor r0,r8
-+  #sub r1,r7
-+
-+  # Transform the current block
-+  bl col_trans_16
-+  vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16   # Now add on rounding, shift down by 7, and saturate
-+  #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word.
-+  vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16    # This should be saturating, but the instruction above does not assemble?
-+  vmov VX(0,0++)+r0, HX(0++,32)+r0 REP 16    # For simplicity transpose this back to the original position
-+
-+  bl col_trans_16
-+  vadd HY(0++,0)+r0,HY(0++,0)+r0,r5 REP 16   # Now add on rounding, shift down by 7, and saturate
-+  #vsasls HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16 # 4+12=16 so this ends up with the output saturated and in the top half of the word.
-+  vasl HY(0++,0)+r0,HY(0++,0)+r0,TRANS_ASL2 REP 16    # This should be saturating, but the instruction above does not assemble?  (Probably because it ends with ls which is interpreted as a condition flag)
-+
-+  # Save results - note there has been a transposition during the processing so we save columns
-+  vsth VX(0,32++)+r0, (r1 += r3) REP 16
-+
-+  # Move onto next block
-+  eor r0,r8
-+  add r1,r7
-+
-+  addcmpbgt r2,-1,0,block_loop
-+done16x16s:
-+
-+  add sp,sp,64+16*16*2 # Move on stack pointer in case interrupt occurs and uses stack
-+  # Now go and do any 32x32 transforms
-+  b hevc_trans_32x32
-+
-+  pop r6-r15, pc
-+# This returns a value in r6 that says where to load the data from.
-+# We load data 16 shorts at a time from memory (uncached), and store to stack space to allow us to process it.
-+unpack16x16:
-+# Clear out destination
-+  vmov HX(0,0)+r0,0
-+  mov r6, r11
-+  vsth HX(0,0)+r0,(r6 += r3) REP 16
-+  mov r5, r1 # Moving pointer to input coefficients
-+unpack_outer_loop:
-+  # Loop until we find the end
-+  vldh HX(0,0)+r0,(r5)  # TODO would prefetch help here while unpacking previous?
-+  sub r6,r11,32
-+  #add r6,pc,packed_data-$ # Packed data
-+  vsth HX(0,0)+r0,(r6)  # Store into packed data
-+  mov r12,0
-+unpack_loop:
-+  ld r4,(r6)
-+  add r6,r6,4
-+  lsr r9,r4,16 # r9 is destination value
-+  cmp r4,0 # {value,index}
-+  extu r4,8
-+  beq done_unpack
-+  sth r9,(r11, r4)
-+  addcmpblt r12,1,8,unpack_loop
-+#  # Read next 16
-+  add r5,32
-+  b unpack_outer_loop
-+done_unpack:
-+#  # Set new load location
-+  mov r6, r11
-+  #add r6,pc,unpacked_data-$
-+#  # Restore constants
-+  mov r4,64
-+  mov r5,TRANS_RND2
-+#  pop r6-r15, pc
-+  b lr
-+
-+# r1,r2,r3 r7,r8 should be preserved
-+# HX(0++,0)+r0 is the block to be transformed
-+# HX(32++,0)+r6 is the 16x16 matrix of transform coefficients
-+# Use HY(48,0) for intermediate results
-+# r0 can be used, but should be returned to its original value at the end
-+col_trans_16:
-+  add r6,r0,16 # Final value for this loop
-+col_trans_16_loop:
-+  # First compute partial products for a single column
-+  vmul32s HY(48++,0), VX(0,0)+r0, VX(32,0++) REP 16
-+  # Then sum up the results and place back
-+  vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
-+  addcmpblt r0,1,r6,col_trans_16_loop
-+  sub r0,16  # put r0 back to its original value
-+  b lr
-+
-+col_trans_odd_16:
-+  add r6,r0,16 # Final value for this loop
-+col_trans_odd_16_loop:
-+  # First compute partial products for a single column
-+  vmul32s HY(48++,0), VX(0,0)+r0, VX(32,32++) REP 16
-+  # Then sum up the results and place back
-+  vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
-+  addcmpblt r0,1,r6,col_trans_odd_16_loop
-+  sub r0,16  # put r0 back to its original value
-+  b lr
-+
-+# r1/r10 input pointer
-+# r0,r4,r5,r6 free
-+# r8/r9 output storage
-+#
-+# Store packed coefficients at r9-32
-+# Store unpacked at r9+32*32 (because transform works on even/odd rows on input, but writes all rows)
-+unpack32x32:
-+# Clear out destination
-+  vmov HX(0,0),0
-+  add r0, r9, 32*32*2 # Unpacked buffer
-+  mov r4, 32
-+  vsth HX(0,0),(r0 += r4) REP 64
-+unpack_outer_loop32:
-+  # Loop until we find the end
-+  vldh HX(0,0),(r1)  # TODO would prefetch help here while unpacking previous?
-+  sub r6,r9,32
-+  #add r6,pc,packed_data-$ # Packed data
-+  vsth HX(0,0),(r6)  # Store into packed data
-+  mov r8,0
-+unpack_loop32:
-+  ld r4,(r6)
-+  add r6,r6,4
-+  lsr r5,r4,16 # r5 is destination value
-+  cmp r4,0 # {value,index}
-+  extu r4,10
-+  beq done_unpack
-+  sth r5,(r0, r4)
-+  addcmpblt r8,1,8,unpack_loop32
-+#  # Read next 16
-+  add r1,32
-+  b unpack_outer_loop32
-+done_unpack32:
-+  b lr
-+# hevc_trans_32x32(short *transMatrix2, short *coeffs, int num)
-+# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) Even followed by odd
-+# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
-+# num: number of 16x16 transforms to be done in low 16, number of packed in high 16
-+#
-+# Note that the 32x32 transforms are stored in reverse order, this means that the unpacked ones appear first!
-+hevc_trans_32x32:
-+  mov r1,r14 # coeffs
-+  mov r2,r15 # num
-+  lsr r15,r15,16 # Number that are packed
-+  extu r2,16 # Total number
-+
-+  # Fetch odd transform matrix
-+  #mov r3, 16*2 # Stride of transMatrix2 in bytes (and of coefficients)
-+  #vldh HX(32++,0),(r0 += r3) REP 16 # This is the even 16x16 matrix
-+  #add r0, 16*16*2
-+  #vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
-+
-+  mov r3, 32*2*2 # Stride used to fetch alternate rows of our input coefficient buffer
-+  mov r7, 16*16*2 # Total block size
-+
-+.if USE_STACK
-+  # Stack base allocation
-+  sub sp,sp,32*32*4+64 # Allocate some space on the stack for us to store 32*32 shorts as temporary results (needs to be aligned) and another 32*32 for unpacking
-+  # set r8 to 32byte aligned stack pointer with 32 bytes of space before it
-+  add r8,sp,63
-+  lsr r8,5
-+  lsl r8,5
-+.else
-+#:version r8
-+  .half 0x00e8 #AUTOINSERTED
-+  btst r8,16
-+#:add r8,pc,intermediate_results-$
-+  .half 0xbfe8
-+  .half intermediate_results-($-2)
-+  beq on_vpu1
-+  add r8,r8,32*32*2*2+16*2 # Move to secondary storage
-+on_vpu1:
-+.endif
-+  mov r9,r8  # Backup of the temporary storage
-+  mov r10,r1 # Backup of the coefficient buffer
-+
-+  cmp r2,0
-+  beq done32x32s
-+block_loop32:
-+
-+  # Transform the first 16 columns
-+  mov r1,r10  # Input Coefficient buffer
-+  mov r8,r9   # Output temporary storage
-+  # Unpacked are first, so need to only do unpacking when r2(=num left) <= r15 (=num packed)
-+  cmp r2,r15
-+  bgt not_compressed_32
-+  bl unpack32x32
-+  add r1,r9,32*32*2   # Uncompressed into temporary storage
-+  mov r8,r9           # Transform into here
-+not_compressed_32:
-+  # COLUMN TRANSFORM
-+  mov r4, 64 # Constant used for rounding first pass
-+  mov r5, 9 # left shift used for rounding first pass
-+
-+  bl trans32
-+  # Transform the second 16 columns
-+  add r8,32*16*2
-+  add r1,32
-+  bl trans32
-+
-+  # ROW TRANSFORM
-+  mov r4, TRANS_RND2 # Constant used for rounding second pass
-+  mov r5, TRANS_ASL2 # left shift used for rounding second pass
-+
-+  mov r1,r9  # Input temporary storage
-+  mov r8,r10   # Output Coefficient buffer
-+  bl trans32
-+  # Transform the second 16 columns
-+  add r8,32*16*2
-+  add r1,32
-+  bl trans32
-+
-+  add r10, 32*32*2 # move onto next block of coefficients
-+  addcmpbgt r2,-1,0,block_loop32
-+done32x32s:
-+
-+.if USE_STACK
-+  add sp,sp,32*32*4+64# Restore stack
-+.endif
-+
-+  pop r6-r15, pc
-+
-+trans32:
-+  push lr
-+  # We can no longer afford the VRF space to do prefetching when doing 32x32
-+  # Fetch the even rows
-+  vldh HX(0++,0),(r1 += r3) REP 16
-+  # Fetch the odd rows
-+  vldh HX(16++,0),64(r1 += r3) REP 16 # First odd row is 32 shorts ahead of r1
-+
-+  # Transform the even rows using even matrix
-+  mov r0, 0 # Even rows
-+  bl col_trans_16
-+
-+  # Now transform the odd rows using odd matrix
-+  mov r0, 64*16 # Odd rows
-+  bl col_trans_odd_16
-+
-+  # Now apply butterfly to compute the first 16 results
-+  vadd HY(48++,0),HY(0++,0),HY(16++,0) REP 16
-+  vadd HY(48++,0),HY(48++,0),r4 REP 16   # add on rounding,
-+  vasl HY(48++,0),HY(48++,0),r5 REP 16    # shift down by 7, and saturate
-+  # 16bit results now in HX(48,32)
-+  mov r0,r8
-+  mov r6,32*2
-+  vsth VX(48,32++),(r0+=r6) REP 16
-+
-+  # Now apply butterfly to compute the second 16 results (in reverse order)
-+  vsub HY(63,0),HY(0 ,0),HY(16,0)
-+  vsub HY(62,0),HY(1 ,0),HY(17,0)
-+  vsub HY(61,0),HY(2 ,0),HY(18,0)
-+  vsub HY(60,0),HY(3 ,0),HY(19,0)
-+  vsub HY(59,0),HY(4 ,0),HY(20,0)
-+  vsub HY(58,0),HY(5 ,0),HY(21,0)
-+  vsub HY(57,0),HY(6 ,0),HY(22,0)
-+  vsub HY(56,0),HY(7 ,0),HY(23,0)
-+  vsub HY(55,0),HY(8 ,0),HY(24,0)
-+  vsub HY(54,0),HY(9 ,0),HY(25,0)
-+  vsub HY(53,0),HY(10,0),HY(26,0)
-+  vsub HY(52,0),HY(11,0),HY(27,0)
-+  vsub HY(51,0),HY(12,0),HY(28,0)
-+  vsub HY(50,0),HY(13,0),HY(29,0)
-+  vsub HY(49,0),HY(14,0),HY(30,0)
-+  vsub HY(48,0),HY(15,0),HY(31,0)
-+  vadd HY(48++,0),HY(48++,0),r4 REP 16   # add on rounding,
-+  vasl HY(48++,0),HY(48++,0),r5 REP 16    # shift down by 7, and saturate
-+  add r0,r8,32
-+  vsth VX(48,32++),(r0+=r6) REP 16
-+  pop pc
-+
-+.if USE_STACK == 0
-+  .balign 32
-+
-+# .space directives generate 0's in the bin so avoid unnecessary padding by
-+# just setting to appropriate value
-+.equ intermediate_results, $+16*2
-+
-+# Layout goes:
-+#
-+#packed_buffer:
-+#  .space 16*2
-+#intermediate_results:
-+#  .space 32*32*2
-+#unpacked_buffer:
-+#  .space 32*32*2
-+#
-+#packed_buffer2:
-+#  .space 16*2
-+#intermediate_results2:
-+#  .space 32*32*2
-+#unpacked_buffer2:
-+#  .space 32*32*2
-+.endif
-+
-+
---- /dev/null
-+++ b/libavcodec/rpi_hevc_transform10.h
-@@ -0,0 +1,94 @@
-+static const unsigned char rpi_hevc_transform10 [] = {
-+0xa9,  0x03,  0x3e,  0x40,  0x4f,  0x40,  0x03,  0xb0,   // 0000
-+0x20,  0x00,  0x0c,  0xf8,  0x38,  0x88,  0x80,  0x03,   // 0008
-+0xc0,  0xf8,  0x00,  0x00,  0x40,  0xb0,  0x00,  0x02,   // 0010
-+0x0c,  0xf8,  0x38,  0xa8,  0x80,  0x03,  0xc0,  0xf8,   // 0018
-+0x00,  0x00,  0x00,  0x60,  0x03,  0xb0,  0x20,  0x00,   // 0020
-+0x07,  0xb0,  0x00,  0x02,  0x08,  0xb0,  0x00,  0x04,   // 0028
-+0x04,  0xb0,  0x40,  0x00,  0x05,  0xb0,  0x00,  0x02,   // 0030
-+0x59,  0xb0,  0xc0,  0xfd,  0x0b,  0x12,  0x5b,  0x7a,   // 0038
-+0x5b,  0x7c,  0x4a,  0xc3,  0x50,  0x17,  0x02,  0x6f,   // 0040
-+0x02,  0x6a,  0x32,  0x18,  0x0a,  0x6a,  0x16,  0x40,   // 0048
-+0x04,  0x18,  0x1a,  0x66,  0x80,  0x90,  0x32,  0x00,   // 0050
-+0x0c,  0xf8,  0x38,  0x80,  0x80,  0x03,  0xc0,  0x08,   // 0058
-+0x18,  0x00,  0x80,  0x90,  0x51,  0x00,  0x04,  0xff,   // 0060
-+0x30,  0xc0,  0x80,  0x03,  0x20,  0x08,  0x10,  0x00,   // 0068
-+0x4c,  0xfe,  0x30,  0xc0,  0x09,  0x04,  0x20,  0x08,   // 0070
-+0x00,  0x00,  0x04,  0xfc,  0x38,  0x90,  0x80,  0x02,   // 0078
-+0xc0,  0x0b,  0x02,  0x00,  0x80,  0x90,  0x40,  0x00,   // 0080
-+0x04,  0xff,  0x30,  0xc0,  0x80,  0x03,  0x20,  0x08,   // 0088
-+0x14,  0x00,  0x4c,  0xfe,  0x30,  0xc0,  0x06,  0x04,   // 0090
-+0x20,  0x08,  0x00,  0x00,  0x8c,  0xf8,  0x2c,  0xe0,   // 0098
-+0x80,  0x03,  0x20,  0x30,  0x04,  0x00,  0x80,  0x45,   // 00a0
-+0x71,  0x42,  0xf2,  0x8c,  0xd1,  0xc0,  0x59,  0xb0,   // 00a8
-+0x40,  0x02,  0x00,  0x9e,  0x6d,  0x00,  0x29,  0x03,   // 00b0
-+0x00,  0xf4,  0x38,  0x80,  0x00,  0x0c,  0xb6,  0x40,   // 00b8
-+0x8c,  0xf8,  0x20,  0xe0,  0x80,  0x03,  0x00,  0x30,   // 00c0
-+0x18,  0x00,  0x15,  0x40,  0x08,  0xf0,  0x38,  0x80,   // 00c8
-+0x85,  0x0b,  0x66,  0xb5,  0xe0,  0xff,  0x88,  0xf0,   // 00d0
-+0x24,  0xe0,  0x86,  0x03,  0x0c,  0x60,  0x64,  0x08,   // 00d8
-+0x46,  0x62,  0x49,  0xc3,  0x50,  0x27,  0x04,  0x6a,   // 00e0
-+0x84,  0x6e,  0x07,  0x18,  0x69,  0xa0,  0x04,  0x5f,   // 00e8
-+0x1c,  0x8b,  0xf7,  0xc8,  0x45,  0x76,  0x6b,  0x1f,   // 00f0
-+0xb6,  0x40,  0x04,  0xb0,  0x40,  0x00,  0x05,  0xb0,   // 00f8
-+0x00,  0x02,  0x5a,  0x00,  0x06,  0xb4,  0x10,  0x00,   // 0100
-+0xa4,  0xff,  0x24,  0xcc,  0x60,  0x02,  0x00,  0xf8,   // 0108
-+0x3e,  0x00,  0x03,  0xff,  0x37,  0xd0,  0x78,  0x03,   // 0110
-+0xe0,  0x03,  0xbe,  0x0b,  0x10,  0x8b,  0xf6,  0x5b,   // 0118
-+0x00,  0x67,  0x5a,  0x00,  0x06,  0xb4,  0x10,  0x00,   // 0120
-+0xa4,  0xff,  0x24,  0xcc,  0xe0,  0x02,  0x00,  0xf8,   // 0128
-+0x3e,  0x00,  0x03,  0xff,  0x37,  0xd0,  0x78,  0x03,   // 0130
-+0xe0,  0x03,  0xbe,  0x0b,  0x10,  0x8b,  0xf6,  0x5b,   // 0138
-+0x00,  0x67,  0x5a,  0x00,  0x00,  0xf4,  0x38,  0x80,   // 0140
-+0x00,  0x04,  0x20,  0xb5,  0x00,  0x08,  0x04,  0xb0,   // 0148
-+0x20,  0x00,  0x8e,  0xf8,  0x20,  0xe0,  0x80,  0x03,   // 0150
-+0xc0,  0x43,  0x00,  0x00,  0x08,  0xf0,  0x38,  0x80,   // 0158
-+0x81,  0x03,  0x26,  0xb5,  0xe0,  0xff,  0x88,  0xf0,   // 0160
-+0x20,  0xe0,  0x86,  0x03,  0x08,  0x60,  0x64,  0x08,   // 0168
-+0x46,  0x62,  0x45,  0xc3,  0x50,  0x27,  0x04,  0x6a,   // 0170
-+0xa4,  0x6e,  0x7f,  0x90,  0xbf,  0xff,  0x65,  0xa0,   // 0178
-+0x04,  0x07,  0x18,  0x8b,  0xf6,  0xc8,  0x41,  0x76,   // 0180
-+0x6a,  0x1f,  0x5a,  0x00,  0xe1,  0x40,  0xf2,  0x40,   // 0188
-+0x0f,  0x7b,  0x02,  0x6f,  0x03,  0xb0,  0x80,  0x00,   // 0190
-+0x07,  0xb0,  0x00,  0x02,  0xe8,  0x00,  0x08,  0x6d,   // 0198
-+0xe8,  0xbf,  0x60,  0x01,  0x03,  0x18,  0x48,  0xb0,   // 01a0
-+0x20,  0x10,  0x89,  0x40,  0x1a,  0x40,  0x02,  0x6a,   // 01a8
-+0x24,  0x18,  0xa1,  0x40,  0x98,  0x40,  0xf2,  0x4a,   // 01b0
-+0x06,  0x1e,  0xff,  0x9f,  0xc5,  0xff,  0x21,  0xb5,   // 01b8
-+0x00,  0x08,  0x98,  0x40,  0x04,  0xb0,  0x40,  0x00,   // 01c0
-+0x95,  0x60,  0x80,  0x90,  0x18,  0x00,  0x48,  0xb0,   // 01c8
-+0x00,  0x04,  0x41,  0x76,  0x80,  0x90,  0x13,  0x00,   // 01d0
-+0x04,  0xb0,  0x00,  0x02,  0x65,  0x60,  0x91,  0x40,   // 01d8
-+0xa8,  0x40,  0x80,  0x90,  0x0c,  0x00,  0x48,  0xb0,   // 01e0
-+0x00,  0x04,  0x41,  0x76,  0x80,  0x90,  0x07,  0x00,   // 01e8
-+0x4a,  0xb0,  0x00,  0x08,  0xf2,  0x8c,  0xdf,  0xc0,   // 01f0
-+0x29,  0x03,  0xef,  0x03,  0x0c,  0xf8,  0x38,  0x80,   // 01f8
-+0x80,  0x03,  0xc0,  0xf8,  0x04,  0x00,  0x0c,  0xf8,   // 0200
-+0x38,  0x84,  0xc0,  0x03,  0xc0,  0xf8,  0x04,  0x00,   // 0208
-+0x00,  0x60,  0xff,  0x9f,  0x79,  0xff,  0x00,  0xb0,   // 0210
-+0x00,  0x04,  0xff,  0x9f,  0x85,  0xff,  0x04,  0xff,   // 0218
-+0x30,  0xcc,  0x10,  0x03,  0xe0,  0xfb,  0x3e,  0x00,   // 0220
-+0x04,  0xff,  0x33,  0xcc,  0x80,  0x03,  0xe0,  0xfb,   // 0228
-+0x10,  0x00,  0x4c,  0xfe,  0x33,  0xcc,  0x80,  0x03,   // 0230
-+0xe0,  0xfb,  0x14,  0x00,  0x80,  0x40,  0x06,  0xb0,   // 0238
-+0x40,  0x00,  0x8c,  0xf8,  0x2f,  0xe0,  0x80,  0x03,   // 0240
-+0xe0,  0x63,  0x00,  0x00,  0x20,  0xf7,  0xf0,  0xcf,   // 0248
-+0x10,  0x03,  0x20,  0xf7,  0xb0,  0xcf,  0x11,  0x13,   // 0250
-+0x20,  0xf7,  0x70,  0xcf,  0x12,  0x23,  0x20,  0xf7,   // 0258
-+0x30,  0xcf,  0x13,  0x33,  0x20,  0xf7,  0xf0,  0xce,   // 0260
-+0x14,  0x43,  0x20,  0xf7,  0xb0,  0xce,  0x15,  0x53,   // 0268
-+0x20,  0xf7,  0x70,  0xce,  0x16,  0x63,  0x20,  0xf7,   // 0270
-+0x30,  0xce,  0x17,  0x73,  0x20,  0xf7,  0xf0,  0xcd,   // 0278
-+0x18,  0x83,  0x20,  0xf7,  0xb0,  0xcd,  0x19,  0x93,   // 0280
-+0x20,  0xf7,  0x70,  0xcd,  0x1a,  0xa3,  0x20,  0xf7,   // 0288
-+0x30,  0xcd,  0x1b,  0xb3,  0x20,  0xf7,  0xf0,  0xcc,   // 0290
-+0x1c,  0xc3,  0x20,  0xf7,  0xb0,  0xcc,  0x1d,  0xd3,   // 0298
-+0x20,  0xf7,  0x70,  0xcc,  0x1e,  0xe3,  0x20,  0xf7,   // 02a0
-+0x30,  0xcc,  0x1f,  0xf3,  0x04,  0xff,  0x33,  0xcc,   // 02a8
-+0x80,  0x03,  0xe0,  0xfb,  0x10,  0x00,  0x4c,  0xfe,   // 02b0
-+0x33,  0xcc,  0x80,  0x03,  0xe0,  0xfb,  0x14,  0x00,   // 02b8
-+0x00,  0xb5,  0x20,  0x00,  0x8c,  0xf8,  0x2f,  0xe0,   // 02c0
-+0x80,  0x03,  0xe0,  0x63,  0x00,  0x00,  0x6f,  0x03,   // 02c8
-+0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,   // 02d0
-+0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,   // 02d8
-+};
---- /dev/null
-+++ b/libavcodec/rpi_hevc_transform8.h
-@@ -0,0 +1,94 @@
-+static const unsigned char rpi_hevc_transform8 [] = {
-+0xa9,  0x03,  0x3e,  0x40,  0x4f,  0x40,  0x03,  0xb0,   // 0000
-+0x20,  0x00,  0x0c,  0xf8,  0x38,  0x88,  0x80,  0x03,   // 0008
-+0xc0,  0xf8,  0x00,  0x00,  0x40,  0xb0,  0x00,  0x02,   // 0010
-+0x0c,  0xf8,  0x38,  0xa8,  0x80,  0x03,  0xc0,  0xf8,   // 0018
-+0x00,  0x00,  0x00,  0x60,  0x03,  0xb0,  0x20,  0x00,   // 0020
-+0x07,  0xb0,  0x00,  0x02,  0x08,  0xb0,  0x00,  0x04,   // 0028
-+0x04,  0xb0,  0x40,  0x00,  0x05,  0xb0,  0x00,  0x08,   // 0030
-+0x59,  0xb0,  0xc0,  0xfd,  0x0b,  0x12,  0x5b,  0x7a,   // 0038
-+0x5b,  0x7c,  0x4a,  0xc3,  0x50,  0x17,  0x02,  0x6f,   // 0040
-+0x02,  0x6a,  0x32,  0x18,  0x0a,  0x6a,  0x16,  0x40,   // 0048
-+0x04,  0x18,  0x1a,  0x66,  0x80,  0x90,  0x32,  0x00,   // 0050
-+0x0c,  0xf8,  0x38,  0x80,  0x80,  0x03,  0xc0,  0x08,   // 0058
-+0x18,  0x00,  0x80,  0x90,  0x51,  0x00,  0x04,  0xff,   // 0060
-+0x30,  0xc0,  0x80,  0x03,  0x20,  0x08,  0x10,  0x00,   // 0068
-+0x4c,  0xfe,  0x30,  0xc0,  0x09,  0x04,  0x20,  0x08,   // 0070
-+0x00,  0x00,  0x04,  0xfc,  0x38,  0x90,  0x80,  0x02,   // 0078
-+0xc0,  0x0b,  0x02,  0x00,  0x80,  0x90,  0x40,  0x00,   // 0080
-+0x04,  0xff,  0x30,  0xc0,  0x80,  0x03,  0x20,  0x08,   // 0088
-+0x14,  0x00,  0x4c,  0xfe,  0x30,  0xc0,  0x04,  0x04,   // 0090
-+0x20,  0x08,  0x00,  0x00,  0x8c,  0xf8,  0x2c,  0xe0,   // 0098
-+0x80,  0x03,  0x20,  0x30,  0x04,  0x00,  0x80,  0x45,   // 00a0
-+0x71,  0x42,  0xf2,  0x8c,  0xd1,  0xc0,  0x59,  0xb0,   // 00a8
-+0x40,  0x02,  0x00,  0x9e,  0x6d,  0x00,  0x29,  0x03,   // 00b0
-+0x00,  0xf4,  0x38,  0x80,  0x00,  0x0c,  0xb6,  0x40,   // 00b8
-+0x8c,  0xf8,  0x20,  0xe0,  0x80,  0x03,  0x00,  0x30,   // 00c0
-+0x18,  0x00,  0x15,  0x40,  0x08,  0xf0,  0x38,  0x80,   // 00c8
-+0x85,  0x0b,  0x66,  0xb5,  0xe0,  0xff,  0x88,  0xf0,   // 00d0
-+0x24,  0xe0,  0x86,  0x03,  0x0c,  0x60,  0x64,  0x08,   // 00d8
-+0x46,  0x62,  0x49,  0xc3,  0x50,  0x27,  0x04,  0x6a,   // 00e0
-+0x84,  0x6e,  0x07,  0x18,  0x69,  0xa0,  0x04,  0x5f,   // 00e8
-+0x1c,  0x8b,  0xf7,  0xc8,  0x45,  0x76,  0x6b,  0x1f,   // 00f0
-+0xb6,  0x40,  0x04,  0xb0,  0x40,  0x00,  0x05,  0xb0,   // 00f8
-+0x00,  0x08,  0x5a,  0x00,  0x06,  0xb4,  0x10,  0x00,   // 0100
-+0xa4,  0xff,  0x24,  0xcc,  0x60,  0x02,  0x00,  0xf8,   // 0108
-+0x3e,  0x00,  0x03,  0xff,  0x37,  0xd0,  0x78,  0x03,   // 0110
-+0xe0,  0x03,  0xbe,  0x0b,  0x10,  0x8b,  0xf6,  0x5b,   // 0118
-+0x00,  0x67,  0x5a,  0x00,  0x06,  0xb4,  0x10,  0x00,   // 0120
-+0xa4,  0xff,  0x24,  0xcc,  0xe0,  0x02,  0x00,  0xf8,   // 0128
-+0x3e,  0x00,  0x03,  0xff,  0x37,  0xd0,  0x78,  0x03,   // 0130
-+0xe0,  0x03,  0xbe,  0x0b,  0x10,  0x8b,  0xf6,  0x5b,   // 0138
-+0x00,  0x67,  0x5a,  0x00,  0x00,  0xf4,  0x38,  0x80,   // 0140
-+0x00,  0x04,  0x20,  0xb5,  0x00,  0x08,  0x04,  0xb0,   // 0148
-+0x20,  0x00,  0x8e,  0xf8,  0x20,  0xe0,  0x80,  0x03,   // 0150
-+0xc0,  0x43,  0x00,  0x00,  0x08,  0xf0,  0x38,  0x80,   // 0158
-+0x81,  0x03,  0x26,  0xb5,  0xe0,  0xff,  0x88,  0xf0,   // 0160
-+0x20,  0xe0,  0x86,  0x03,  0x08,  0x60,  0x64,  0x08,   // 0168
-+0x46,  0x62,  0x45,  0xc3,  0x50,  0x27,  0x04,  0x6a,   // 0170
-+0xa4,  0x6e,  0x7f,  0x90,  0xbf,  0xff,  0x65,  0xa0,   // 0178
-+0x04,  0x07,  0x18,  0x8b,  0xf6,  0xc8,  0x41,  0x76,   // 0180
-+0x6a,  0x1f,  0x5a,  0x00,  0xe1,  0x40,  0xf2,  0x40,   // 0188
-+0x0f,  0x7b,  0x02,  0x6f,  0x03,  0xb0,  0x80,  0x00,   // 0190
-+0x07,  0xb0,  0x00,  0x02,  0xe8,  0x00,  0x08,  0x6d,   // 0198
-+0xe8,  0xbf,  0x60,  0x01,  0x03,  0x18,  0x48,  0xb0,   // 01a0
-+0x20,  0x10,  0x89,  0x40,  0x1a,  0x40,  0x02,  0x6a,   // 01a8
-+0x24,  0x18,  0xa1,  0x40,  0x98,  0x40,  0xf2,  0x4a,   // 01b0
-+0x06,  0x1e,  0xff,  0x9f,  0xc5,  0xff,  0x21,  0xb5,   // 01b8
-+0x00,  0x08,  0x98,  0x40,  0x04,  0xb0,  0x40,  0x00,   // 01c0
-+0x95,  0x60,  0x80,  0x90,  0x18,  0x00,  0x48,  0xb0,   // 01c8
-+0x00,  0x04,  0x41,  0x76,  0x80,  0x90,  0x13,  0x00,   // 01d0
-+0x04,  0xb0,  0x00,  0x08,  0x45,  0x60,  0x91,  0x40,   // 01d8
-+0xa8,  0x40,  0x80,  0x90,  0x0c,  0x00,  0x48,  0xb0,   // 01e0
-+0x00,  0x04,  0x41,  0x76,  0x80,  0x90,  0x07,  0x00,   // 01e8
-+0x4a,  0xb0,  0x00,  0x08,  0xf2,  0x8c,  0xdf,  0xc0,   // 01f0
-+0x29,  0x03,  0xef,  0x03,  0x0c,  0xf8,  0x38,  0x80,   // 01f8
-+0x80,  0x03,  0xc0,  0xf8,  0x04,  0x00,  0x0c,  0xf8,   // 0200
-+0x38,  0x84,  0xc0,  0x03,  0xc0,  0xf8,  0x04,  0x00,   // 0208
-+0x00,  0x60,  0xff,  0x9f,  0x79,  0xff,  0x00,  0xb0,   // 0210
-+0x00,  0x04,  0xff,  0x9f,  0x85,  0xff,  0x04,  0xff,   // 0218
-+0x30,  0xcc,  0x10,  0x03,  0xe0,  0xfb,  0x3e,  0x00,   // 0220
-+0x04,  0xff,  0x33,  0xcc,  0x80,  0x03,  0xe0,  0xfb,   // 0228
-+0x10,  0x00,  0x4c,  0xfe,  0x33,  0xcc,  0x80,  0x03,   // 0230
-+0xe0,  0xfb,  0x14,  0x00,  0x80,  0x40,  0x06,  0xb0,   // 0238
-+0x40,  0x00,  0x8c,  0xf8,  0x2f,  0xe0,  0x80,  0x03,   // 0240
-+0xe0,  0x63,  0x00,  0x00,  0x20,  0xf7,  0xf0,  0xcf,   // 0248
-+0x10,  0x03,  0x20,  0xf7,  0xb0,  0xcf,  0x11,  0x13,   // 0250
-+0x20,  0xf7,  0x70,  0xcf,  0x12,  0x23,  0x20,  0xf7,   // 0258
-+0x30,  0xcf,  0x13,  0x33,  0x20,  0xf7,  0xf0,  0xce,   // 0260
-+0x14,  0x43,  0x20,  0xf7,  0xb0,  0xce,  0x15,  0x53,   // 0268
-+0x20,  0xf7,  0x70,  0xce,  0x16,  0x63,  0x20,  0xf7,   // 0270
-+0x30,  0xce,  0x17,  0x73,  0x20,  0xf7,  0xf0,  0xcd,   // 0278
-+0x18,  0x83,  0x20,  0xf7,  0xb0,  0xcd,  0x19,  0x93,   // 0280
-+0x20,  0xf7,  0x70,  0xcd,  0x1a,  0xa3,  0x20,  0xf7,   // 0288
-+0x30,  0xcd,  0x1b,  0xb3,  0x20,  0xf7,  0xf0,  0xcc,   // 0290
-+0x1c,  0xc3,  0x20,  0xf7,  0xb0,  0xcc,  0x1d,  0xd3,   // 0298
-+0x20,  0xf7,  0x70,  0xcc,  0x1e,  0xe3,  0x20,  0xf7,   // 02a0
-+0x30,  0xcc,  0x1f,  0xf3,  0x04,  0xff,  0x33,  0xcc,   // 02a8
-+0x80,  0x03,  0xe0,  0xfb,  0x10,  0x00,  0x4c,  0xfe,   // 02b0
-+0x33,  0xcc,  0x80,  0x03,  0xe0,  0xfb,  0x14,  0x00,   // 02b8
-+0x00,  0xb5,  0x20,  0x00,  0x8c,  0xf8,  0x2f,  0xe0,   // 02c0
-+0x80,  0x03,  0xe0,  0x63,  0x00,  0x00,  0x6f,  0x03,   // 02c8
-+0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,   // 02d0
-+0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,   // 02d8
-+};
---- /dev/null
-+++ b/libavcodec/rpi_hevcdec.c
-@@ -0,0 +1,6134 @@
-+/*
-+ * HEVC video Decoder
-+ *
-+ * Copyright (C) 2012 - 2013 Guillaume Martres
-+ * Copyright (C) 2012 - 2013 Mickael Raulet
-+ * Copyright (C) 2012 - 2013 Gildas Cocherel
-+ * Copyright (C) 2012 - 2013 Wassim Hamidouche
-+ * Copyright (C) 2018 John Cox, Ben Avison, Peter de Rivaz for Raspberry Pi (Trading)
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#include "libavutil/attributes.h"
-+#include "libavutil/common.h"
-+#include "libavutil/display.h"
-+#include "libavutil/internal.h"
-+#include "libavutil/mastering_display_metadata.h"
-+#include "libavutil/md5.h"
-+#include "libavutil/opt.h"
-+#include "libavutil/pixdesc.h"
-+#include "libavutil/stereo3d.h"
-+
-+#include "decode.h"
-+#include "bswapdsp.h"
-+#include "bytestream.h"
-+#include "golomb.h"
-+#include "hevc.h"
-+#include "rpi_hevc_data.h"
-+#include "rpi_hevc_parse.h"
-+#include "rpi_hevcdec.h"
-+#include "rpi_hevc_cabac_fns.h"
-+#include "profiles.h"
-+#include "hwconfig.h"
-+
-+#include "rpi_zc_frames.h"
-+#include "rpi_qpu.h"
-+#include "rpi_hevc_shader.h"
-+#include "rpi_hevc_shader_cmd.h"
-+#include "rpi_hevc_shader_template.h"
-+#include "rpi_zc.h"
-+#include "libavutil/rpi_sand_fns.h"
-+
-+#include "pthread.h"
-+#include <stdatomic.h>
-+
-+#define DEBUG_DECODE_N 0   // 0 = do all, n = frames idr onwards
-+
-+#define PACK2(hi,lo) (((hi) << 16) | ((lo) & 0xffff))
-+
-+#ifndef av_mod_uintp2
-+static av_always_inline av_const unsigned av_mod_uintp2_c(unsigned a, unsigned p)
-+{   // Fallback when libavutil provides no av_mod_uintp2: returns the low p bits of a (p must be below the width of unsigned)
-+    return a & ((1U << p) - 1);  // 1U, not 1: shifting a signed 1 into the sign bit (p == 31) is undefined
-+}
-+#   define av_mod_uintp2   av_mod_uintp2_c
-+#endif
-+
-+const uint8_t ff_hevc_rpi_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
-+static void rpi_begin(const HEVCRpiContext * const s, HEVCRpiJob * const jb, const unsigned int ctu_ts_first);
-+
-+#define MC_DUMMY_X (-32)
-+#define MC_DUMMY_Y (-32)
-+
-+// UV & Y both have min 4x4 pred (no 2x2 chroma)
-+// Allow for even spread +1 for setup, +1 for rounding
-+// As we have load sharing this can (in theory) be exceeded so we have to
-+// check after each CTU, but it is a good base size
-+
-+// Worst case (all 4x4) commands per CTU
-+#define QPU_Y_CMD_PER_CTU_MAX (16 * 16)
-+#define QPU_C_CMD_PER_CTU_MAX (8 * 8)
-+
-+#define QPU_MAX_CTU_PER_LINE ((HEVC_RPI_MAX_WIDTH + 63) / 64)
-+
-+#define QPU_GRPS (QPU_N_MAX / QPU_N_GRP)
-+#define QPU_CTU_PER_GRP ((QPU_MAX_CTU_PER_LINE + QPU_GRPS - 1) / QPU_GRPS)
-+
-+#define QPU_Y_CMD_SLACK_PER_Q (QPU_Y_CMD_PER_CTU_MAX / 2)
-+#define QPU_C_CMD_SLACK_PER_Q (QPU_C_CMD_PER_CTU_MAX / 2)
-+
-+// Total cmds to allocate - allow for slack & setup
-+#define QPU_Y_COMMANDS (QPU_CTU_PER_GRP * QPU_GRPS * QPU_Y_CMD_PER_CTU_MAX + (1 + QPU_Y_CMD_SLACK_PER_Q) * QPU_N_MAX)
-+#define QPU_C_COMMANDS (QPU_CTU_PER_GRP * QPU_GRPS * QPU_C_CMD_PER_CTU_MAX + (1 + QPU_C_CMD_SLACK_PER_Q) * QPU_N_MAX)
-+
-+#define QPU_Y_SYNCS (QPU_N_MAX * (16 + 2))
-+#define QPU_C_SYNCS (QPU_N_MAX * (8 + 2))
-+
-+// The QPU code for UV blocks only works up to a block width of 8
-+#define RPI_CHROMA_BLOCK_WIDTH 8
-+
-+#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)
-+
-+
-+// Actual filter goes -ve, +ve, +ve, -ve using these values
-+static const uint32_t rpi_filter_coefs[8] = {
-+        ENCODE_COEFFS(  0,  64,   0,  0),
-+        ENCODE_COEFFS(  2,  58,  10,  2),
-+        ENCODE_COEFFS(  4,  54,  16,  2),
-+        ENCODE_COEFFS(  6,  46,  28,  4),
-+        ENCODE_COEFFS(  4,  36,  36,  4),
-+        ENCODE_COEFFS(  4,  28,  46,  6),
-+        ENCODE_COEFFS(  2,  16,  54,  4),
-+        ENCODE_COEFFS(  2,  10,  58,  2)
-+};
-+
-+// Function arrays by QPU
-+
-+static const int * const inter_pred_setup_c_qpu[12] = {
-+    mc_setup_c_q0, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn,
-+    mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn,
-+    mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn
-+};
-+
-+static const int * const inter_pred_setup_c10_qpu[12] = {
-+    mc_setup_c10_q0, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn,
-+    mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn,
-+    mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn
-+};
-+
-+static const int * const inter_pred_setup_y_qpu[12] = {
-+    mc_setup_y_q0, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn,
-+    mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn,
-+    mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn
-+};
-+
-+static const int * const inter_pred_setup_y10_qpu[12] = {
-+    mc_setup_y10_q0, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn,
-+    mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn,
-+    mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn
-+};
-+
-+static const int * const inter_pred_sync_qpu[12] = {
-+    mc_sync_q0, mc_sync_q1, mc_sync_q2, mc_sync_q3,
-+    mc_sync_q4, mc_sync_q5, mc_sync_q6, mc_sync_q7,
-+    mc_sync_q8, mc_sync_q9, mc_sync_q10, mc_sync_q11
-+};
-+
-+static const int * const inter_pred_sync10_qpu[12] = {
-+    mc_sync10_q0, mc_sync10_q1, mc_sync10_q2, mc_sync10_q3,
-+    mc_sync10_q4, mc_sync10_q5, mc_sync10_q6, mc_sync10_q7,
-+    mc_sync10_q8, mc_sync10_q9, mc_sync10_q10, mc_sync10_q11
-+};
-+
-+static const int * const inter_pred_exit_c_qpu[12] = {
-+    mc_exit_c_q0, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn,
-+    mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn,
-+    mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn
-+};
-+
-+static const int * const inter_pred_exit_c10_qpu[12] = {
-+    mc_exit_c10_q0, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn,
-+    mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn,
-+    mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn
-+};
-+
-+static const int * const inter_pred_exit_y_qpu[12] = {
-+    mc_exit_y_q0, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn,
-+    mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn,
-+    mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn
-+};
-+
-+static const int * const inter_pred_exit_y10_qpu[12] = {
-+    mc_exit_y10_q0, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn,
-+    mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn,
-+    mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn
-+};
-+
-+typedef struct ipe_chan_info_s
-+{   // Inter-prediction setup for one channel (used for both luma and chroma below)
-+    const uint8_t bit_depth;
-+    const uint8_t n;  // number of queues; set_ipe_from_ici reads this many entries from each table
-+    const int * const * setup_fns;  // per-queue table, passed through qpu_fn() to give code_setup
-+    const int * const * sync_fns;  // per-queue table, passed through qpu_fn() to give code_sync
-+    const int * const * exit_fns;  // per-queue table, passed through qpu_fn() to give code_exit
-+} ipe_chan_info_t;
-+
-+typedef struct ipe_init_info_s
-+{   // One luma + one chroma channel description
-+    ipe_chan_info_t luma;
-+    ipe_chan_info_t chroma;
-+} ipe_init_info_t;
-+
-+static void set_bytes(uint8_t * b, const unsigned int stride, const int ln, unsigned int a)
-+{   // Fill a (1 << ln) x (1 << ln) square of bytes at b (row pitch = stride) with the byte value a
-+    switch (ln)
-+    {
-+        default:  // normally 0 - a single byte
-+            *b = a;
-+            break;
-+        case 1:  // 2 bytes x 2 rows
-+            a |= a << 8;  // replicate the byte into 16 bits (relies on a < 0x100)
-+            *(uint16_t *)b = a;
-+            b += stride;
-+            *(uint16_t *)b = a;
-+            break;
-+        case 2:  // 4 bytes x 4 rows
-+            a |= a << 8;
-+            a |= a << 16;  // a now holds the byte in all four byte lanes
-+            *(uint32_t *)b = a;
-+            b += stride;
-+            *(uint32_t *)b = a;
-+            b += stride;
-+            *(uint32_t *)b = a;
-+            b += stride;
-+            *(uint32_t *)b = a;
-+            break;
-+        case 3:  // 8 bytes x 8 rows
-+        {
-+            unsigned int i;
-+            uint64_t d;
-+            a |= a << 8;
-+            a |= a << 16;
-+            d = ((uint64_t)a << 32) | a;  // widen the 4-byte pattern to 8 bytes
-+            for (i = 0; i != 8; ++i, b += stride)
-+                *(uint64_t *)b = d;  // NOTE(review): wide stores assume b and stride keep rows suitably aligned - confirm at callers
-+            break;
-+        }
-+        case 4:  // 16 bytes x 16 rows
-+        {
-+            unsigned int i;
-+            uint64_t d;
-+            a |= a << 8;
-+            a |= a << 16;
-+            d = ((uint64_t)a << 32) | a;
-+            for (i = 0; i != 16; ++i, b += stride)
-+            {
-+                *(uint64_t *)b = d;  // left 8 bytes of the row
-+                *(uint64_t *)(b + 8) = d;  // right 8 bytes of the row
-+            }
-+            break;
-+        }
-+    }
-+}
-+
-+// We expect this to be called with ln = (log2_cb_size - 3) so range =  -1..3
-+// (4 not required).  Writes (1 << ln) copies of the byte a to both b_u and b_l.
-+static void set_stash2(uint8_t * b_u, uint8_t * b_l, const int ln, unsigned int a)
-+{
-+    switch (ln)
-+    {
-+        default:  // 0 or -1 - one byte each
-+            *b_u = a;
-+            *b_l = a;
-+            break;
-+        case 1:  // 2 bytes each
-+            a |= a << 8;  // replicate the byte (relies on a < 0x100)
-+            *(uint16_t *)b_u = a;
-+            *(uint16_t *)b_l = a;
-+            break;
-+        case 2:  // 4 bytes each
-+            a |= a << 8;
-+            a |= a << 16;
-+            *(uint32_t *)b_u = a;
-+            *(uint32_t *)b_l = a;
-+            break;
-+        case 3:  // 8 bytes each, written as two 32-bit words
-+            a |= a << 8;
-+            a |= a << 16;
-+            *(uint32_t *)b_u = a;
-+            *(uint32_t *)(b_u + 4) = a;
-+            *(uint32_t *)b_l = a;
-+            *(uint32_t *)(b_l + 4) = a;
-+            break;
-+        case 4:  // 16 bytes each, written as four 32-bit words
-+            a |= a << 8;
-+            a |= a << 16;
-+            *(uint32_t *)b_u = a;
-+            *(uint32_t *)(b_u + 4) = a;
-+            *(uint32_t *)(b_u + 8) = a;
-+            *(uint32_t *)(b_u + 12) = a;
-+            *(uint32_t *)b_l = a;
-+            *(uint32_t *)(b_l + 4) = a;
-+            *(uint32_t *)(b_l + 8) = a;
-+            *(uint32_t *)(b_l + 12) = a;
-+            break;
-+    }
-+}
-+
-+static void zap_cabac_stash(uint8_t * b, const int ln)
-+{   // Zero (1 << ln) bytes at b for ln = 1..3; any other ln zeroes a single byte
-+    switch (ln)
-+    {
-+        default:  // 0
-+            *b = 0;
-+            break;
-+        case 1:  // 2 bytes
-+            *(uint16_t *)b = 0;
-+            break;
-+        case 2:  // 4 bytes
-+            *(uint32_t *)b = 0;
-+            break;
-+        case 3:  // 8 bytes, as two 32-bit words
-+            *(uint32_t *)b = 0;
-+            *(uint32_t *)(b + 4) = 0;
-+            break;
-+    }
-+}
-+
-+
-+
-+// Set a small square block of bits ((1 << ln) wide by (1 << ln) rows) in a bitmap
-+// Bits must be aligned on their size boundary (which will be true of all split CBs)
-+static void set_bits(uint8_t * f, const unsigned int x, const unsigned int stride, const unsigned int ln)
-+{
-+    unsigned int n;
-+    const unsigned int sh = (x & 7);  // bit position within the byte
-+
-+    f += (x >> 3);  // byte that holds bit x
-+
-+    av_assert2(ln <= 3);
-+    av_assert2((x & ((1 << ln) - 1)) == 0);  // alignment keeps the run of bits inside one byte
-+
-+    switch (ln)
-+    {
-+        default:  // 1
-+            f[0] |= 1 << sh;
-+            break;
-+        case 1:  // 3 * 2
-+            n = 3 << sh;
-+            f[0] |= n;
-+            f[stride] |= n;
-+            break;
-+        case 2:  // 0xf * 4
-+            n = 0xf << sh;
-+            f[0] |= n;
-+            f[stride] |= n;
-+            f[stride * 2] |= n;
-+            f[stride * 3] |= n;
-+            break;
-+        case 3:  // 0xff * 8
-+            for (n = 0; n != 8; ++n, f += stride)
-+                *f = 0xff;  // whole byte is set, so plain assignment is enough
-+            break;
-+    }
-+}
-+
-+static const ipe_init_info_t ipe_init_infos[9] = {  // Alloc for bit depths of 8-16 (presumably indexed by bit_depth - 8; confirm at the user)
-+   {  // 8
-+      .luma =   {8, QPU_MC_PRED_N_Y8, inter_pred_setup_y_qpu, inter_pred_sync_qpu, inter_pred_exit_y_qpu},
-+      .chroma = {8, QPU_MC_PRED_N_C8, inter_pred_setup_c_qpu, inter_pred_sync_qpu, inter_pred_exit_c_qpu}
-+   },
-+   {  // 9 - explicitly zeroed (n == 0, NULL tables)
-+      .luma =   {0},
-+      .chroma = {0}
-+   },
-+   {  // 10
-+      .luma =   {10, QPU_MC_PRED_N_Y10, inter_pred_setup_y10_qpu, inter_pred_sync10_qpu, inter_pred_exit_y10_qpu},
-+      .chroma = {10, QPU_MC_PRED_N_C10, inter_pred_setup_c10_qpu, inter_pred_sync10_qpu, inter_pred_exit_c10_qpu}
-+   }
-+   // the remaining six elements have no initialiser and so are zero-filled as well
-+};
-+
-+static void set_ipe_from_ici(HEVCRpiInterPredEnv * const ipe, const ipe_chan_info_t * const ici)
-+{   // Divide ipe->gptr into ici->n equal word-aligned slices, one per queue, and load each queue's code pointers
-+    const unsigned int q_count = ici->n;
-+    const unsigned int q_bytes = (ipe->gptr.numbytes / q_count) & ~3;  // Whole words only
-+
-+    ipe->max_fill = q_bytes - ipe->min_gap;
-+    ipe->n = q_count;
-+    for (unsigned int qno = 0; qno != q_count; ++qno) {
-+        HEVCRpiInterPredQ * const q = &ipe->q[qno];
-+        q->qpu_mc_base = (qpu_mc_pred_cmd_t *)(ipe->gptr.arm + qno * q_bytes);
-+        q->qpu_mc_curr = q->qpu_mc_base;  // queue starts out empty
-+        q->code_setup = qpu_fn(ici->setup_fns[qno]);
-+        q->code_sync = qpu_fn(ici->sync_fns[qno]);
-+        q->code_exit = qpu_fn(ici->exit_fns[qno]);
-+    }
-+}
-+
-+static void rpi_hevc_qpu_set_fns(HEVCRpiContext * const s, const unsigned int bit_depth)
-+{   // Initialise s->qpu for the given bit depth via rpi_hevc_qpu_init_fn
-+    av_assert0(bit_depth >= 8 && bit_depth <= 16);  // only the range is checked here
-+
-+    rpi_hevc_qpu_init_fn(&s->qpu, bit_depth);
-+}
-+
-+// Unsigned Trivial MOD: one conditional subtract, so only a true modulo while x < 2 * n
-+static inline unsigned int utmod(const unsigned int x, const unsigned int n)
-+{
-+    return (x < n) ? x : x - n;
-+}
-+
-+// Post-increment of pq->job_n, wrapping at RPI_MAX_JOBS; returns the value before the step
-+static inline unsigned int pass_queue_inc_job_n(HEVCRpiPassQueue * const pq)
-+{
-+    const unsigned int prev_n = pq->job_n;
-+    pq->job_n = utmod(prev_n + 1, RPI_MAX_JOBS);
-+    return prev_n;
-+}
-+
-+static void pass_queue_init(HEVCRpiPassQueue * const pq, HEVCRpiContext * const s, HEVCRpiWorkerFn * const worker, sem_t * const psem_out, const int n)
-+{   // Put pq into its idle state: bound to context s and pass n, no thread started, not terminating
-+    pq->context = s;
-+    pq->worker = worker;
-+    pq->pass_n = n;
-+    pq->psem_out = psem_out;
-+    pq->job_n = 0;
-+    pq->started = 0;
-+    pq->terminate = 0;
-+    sem_init(&pq->sem_in, 0, 0);  // count 0: nothing has been submitted yet
-+}
-+
-+static void pass_queue_kill(HEVCRpiPassQueue * const pq)
-+{   // Counterpart of pass_queue_init: destroys the input semaphore
-+    sem_destroy(&pq->sem_in);
-+}
-+
-+static inline void rpi_sem_wait(sem_t * const sem)
-+{   // sem_wait() that simply retries when the wait is interrupted
-+    while (sem_wait(sem) != 0) {
-+        av_assert0(errno == EINTR);  // any failure other than EINTR trips the assertion
-+    }
-+}
-+
-+static void pass_queue_submit_job(HEVCRpiPassQueue * const pq)
-+{   // Signal the queue's input semaphore; pass_worker blocks on it before taking its next job
-+    sem_post(&pq->sem_in);
-+}
-+
-+static inline void pass_queue_do_all(HEVCRpiContext * const s, HEVCRpiJob * const jb)
-+{   // Run every pass on jb synchronously, in pass order, on the calling thread
-+    // Do the various passes - common with the worker code
-+    for (unsigned int i = 0; i != RPI_PASSES; ++i) {
-+        s->passq[i].worker(s, jb);
-+    }
-+}
-+
-+
-+#if 0  // Debug helper, compiled out
-+static void dump_jbc(const HEVCRpiJobCtl *const jbc, const char * const func)
-+{   // Print the offload queue indices and the current value of sem_out
-+    int x;
-+    sem_getvalue((sem_t *)&jbc->sem_out, &x);  // cast drops const: sem_getvalue takes a non-const sem_t *
-+    printf("%s: jbc: in=%d, out=%d, sum=%d\n", func, jbc->offload_in, jbc->offload_out, x);
-+}
-+#endif
-+
-+
-+static HEVCRpiJob * job_alloc(HEVCRpiJobCtl * const jbc, HEVCRpiLocalContext * const lc)
-+{   // Get a job: local spare first, then the global free chain, else queue lc and block until job_free hands one over
-+    HEVCRpiJob * jb;
-+    HEVCRpiJobGlobal * const jbg = jbc->jbg;
-+
-+    pthread_mutex_lock(&jbg->lock);  // the global lock covers both the local and the global lists
-+    // Check local 1st
-+    if ((jb = jbc->jb1) != NULL)
-+    {
-+        // Only 1 - very easy :-)
-+        jbc->jb1 = NULL;
-+    }
-+    else
-+    {
-+        // Now look for global free chain
-+        if ((jb = jbg->free1) != NULL)
-+        {
-+            // Found one - unlink it
-+            jbg->free1 = jb->next;
-+            jb->next = NULL;
-+        }
-+        else
-+        {
-+            // Out of places to look - wait for one to become free - add to Qs
-+
-+            // Global
-+            // If "good" lc then add after the last "good" el in the chain
-+            // otherwise add to the tail
-+            if (jbg->wait_tail == NULL || jbg->wait_tail->last_progress_good || !lc->last_progress_good)
-+            {
-+                // Add to end as we had to wait last time or wait Q empty
-+                if ((lc->jw_prev = jbg->wait_tail) == NULL)
-+                    jbg->wait_head = lc;
-+                else
-+                    lc->jw_prev->jw_next = lc;
-+                lc->jw_next = NULL;
-+                jbg->wait_tail = lc;
-+            }
-+            else
-+            {
-+                // This is a "good" lc that we need to poke into the middle
-+                // of the Q
-+                // We know that the Q isn't empty and there is at least one
-+                // !last_progress_good el in it from the previous test
-+
-+                HEVCRpiLocalContext * const p = jbg->wait_good; // Insert after
-+
-+                if (p == NULL)
-+                {
-+                    // No current good els - add to head
-+                    lc->jw_next = jbg->wait_head;
-+                    jbg->wait_head = lc;
-+                }
-+                else
-+                {
-+                    lc->jw_next = p->jw_next;
-+                    p->jw_next = lc;
-+                }
-+
-+                lc->jw_next->jw_prev = lc;  // jw_next is non-NULL here: a non-good el follows (see above)
-+                lc->jw_prev = p;
-+            }
-+
-+            // If "good" then we are now the last good waiting el
-+            if (lc->last_progress_good)
-+                jbg->wait_good = lc;
-+
-+            // Local - always appended to the tail of this jbc's own wait list
-+            if ((lc->ljw_prev = jbc->lcw_tail) == NULL)
-+                jbc->lcw_head = lc;
-+            else
-+                lc->ljw_prev->ljw_next = lc;
-+            lc->ljw_next = NULL;
-+            jbc->lcw_tail = lc;
-+        }
-+    }
-+
-+    pthread_mutex_unlock(&jbg->lock);
-+
-+    if (jb == NULL)  // Need to wait
-+    {
-+        rpi_sem_wait(&lc->jw_sem);  // posted by job_free once it has stored a job in lc->jw_job
-+        jb = lc->jw_job;  // Set by free code
-+    }
-+
-+    return jb;
-+}
-+
-+
-+static void job_free(HEVCRpiJobCtl * const jbc0, HEVCRpiJob * const jb)
-+{   // Return jb: hand it straight to a waiting lc if there is one, otherwise park it on its local slot or the global free chain
-+    HEVCRpiJobGlobal * const jbg = jbc0->jbg;  // This jbc only used to find jbg so we can get the lock
-+    HEVCRpiJobCtl * jbc = jb->jbc_local;  // non-NULL marks a job that belongs to one particular jbc
-+    HEVCRpiLocalContext * lc = NULL;  // the waiter that will receive jb, if any
-+
-+    pthread_mutex_lock(&jbg->lock);
-+
-+    if (jbc != NULL)
-+    {
-+        av_assert1(jbc->jb1 == NULL);
-+
-+        // Release to Local if nothing waiting there
-+        if ((lc = jbc->lcw_head) == NULL)
-+            jbc->jb1 = jb;
-+    }
-+    else
-+    {
-+        // Release to global if nothing waiting there
-+        if ((lc = jbg->wait_head) == NULL)
-+        {
-+            jb->next = jbg->free1;
-+            jbg->free1 = jb;
-+        }
-+        else
-+        {
-+            // ? seems somehow mildly ugly...
-+            jbc = lc->context->jbc;  // need the waiter's own jbc to unlink it from the local list below
-+        }
-+    }
-+
-+    if (lc != NULL)
-+    {
-+        // Something was waiting
-+
-+        // Unlink
-+        // Global
-+        if (lc->jw_next == NULL)
-+            jbg->wait_tail = lc->jw_prev;
-+        else
-+            lc->jw_next->jw_prev = lc->jw_prev;
-+
-+        if (lc->jw_prev == NULL)
-+            jbg->wait_head = lc->jw_next;
-+        else
-+            lc->jw_prev->jw_next = lc->jw_next;
-+
-+        // Local
-+        if (lc->ljw_next == NULL)
-+            jbc->lcw_tail = lc->ljw_prev;
-+        else
-+            lc->ljw_next->ljw_prev = lc->ljw_prev;
-+
-+        if (lc->ljw_prev == NULL)
-+            jbc->lcw_head = lc->ljw_next;
-+        else
-+            lc->ljw_prev->ljw_next = lc->ljw_next;
-+
-+        // Update good if required
-+        if (jbg->wait_good == lc)
-+            jbg->wait_good = lc->jw_prev;
-+
-+        // Prod
-+        lc->jw_job = jb;  // stored before the post so the woken waiter in job_alloc reads it
-+        sem_post(&lc->jw_sem);
-+    }
-+
-+    pthread_mutex_unlock(&jbg->lock);
-+}
-+
-+static void job_lc_kill(HEVCRpiLocalContext * const lc)
-+{   // Counterpart of job_lc_init: destroys the job-wait semaphore
-+    sem_destroy(&lc->jw_sem);
-+}
-+
-+static void job_lc_init(HEVCRpiLocalContext * const lc)
-+{   // Clear lc's job-wait state: not on any wait list, no job handed over
-+    lc->jw_next = NULL;  // global wait list links
-+    lc->jw_prev = NULL;
-+    lc->ljw_next = NULL;  // local (per jbc) wait list links
-+    lc->ljw_prev = NULL;
-+    lc->jw_job = NULL;
-+    sem_init(&lc->jw_sem,  0, 0);  // count 0: job_alloc blocks on it until job_free posts
-+}
-+
-+// Returns:
-+//  0 if we have waited for MV or expect to wait for recon
-+//  1 if we haven't waited for MV & do not need to wait for recon
-+static int progress_good(const HEVCRpiContext *const s, const HEVCRpiJob * const jb)
-+{
-+    if (jb->waited) // reset by rpi_begin
-+        return 0;
-+    for (unsigned int i = 0; i != FF_ARRAY_ELEMS(jb->progress_req); ++i)
-+    {   // progress_req[i] < 0 means nothing is required from DPB[i]
-+        if (jb->progress_req[i] >= 0 && s->DPB[i].tf.progress != NULL &&
-+                ((volatile int *)(s->DPB[i].tf.progress->data))[0] < jb->progress_req[i])  // field 0 progress, read without a lock
-+            return 0;
-+    }
-+    return 1;
-+}
-+
-+// Submit job if it is full (indicated by having ctu_ts_last set >= 0)
-+static inline void worker_submit_job(HEVCRpiContext *const s, HEVCRpiLocalContext * const lc)
-+{
-+    HEVCRpiJobCtl *const jbc = s->jbc;
-+    HEVCRpiJob * const jb = lc->jb0;
-+
-+    av_assert1(jb != NULL);
-+
-+    if (jb->ctu_ts_last < 0) {  // not full yet: keep filling the same job
-+        return;
-+    }
-+
-+    lc->last_progress_good = progress_good(s, jb);  // remembered for queue placement in job_alloc
-+    jb->waited = !lc->last_progress_good;
-+    lc->jb0 = NULL;  // lc no longer owns the job
-+
-+    if (s->offload_recon)
-+    {
-+        pthread_mutex_lock(&jbc->in_lock);  // in_lock guards the offloadq input index
-+        jbc->offloadq[jbc->offload_in] = jb;
-+        jbc->offload_in = utmod(jbc->offload_in + 1, RPI_MAX_JOBS);
-+        pthread_mutex_unlock(&jbc->in_lock);
-+
-+        pass_queue_submit_job(s->passq + 0);  // Consumes job eventually
-+    }
-+    else
-+    {
-+        pass_queue_do_all(s, jb);  // Consumes job before return
-+    }
-+}
-+
-+
-+// Call worker_pass0_ready to wait until the s->pass0_job slot becomes
-+// available to receive the next job.
-+//
-+// Now safe against multiple callers - needed for tiles
-+// "normal" and WPP will only call here one at a time
-+static inline void worker_pass0_ready(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
-+{
-+    HEVCRpiJobCtl * const jbc = s->jbc;
-+
-+    // It is legit for us to already have a job allocated - do nothing in this case
-+    if (lc->jb0 != NULL)
-+        return;
-+
-+    if (s->offload_recon)
-+        rpi_sem_wait(&jbc->sem_out);  // This sem will stop this frame grabbing too much
-+
-+    lc->jb0 = job_alloc(jbc, lc);  // may block until another job is freed
-+
-+    rpi_begin(s, lc->jb0, lc->ts);  // prepare the fresh job, starting at lc->ts
-+}
-+
-+// Free up a job without submission (no-op if lc holds no job)
-+static void worker_free(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
-+{
-+    HEVCRpiJobCtl * const jbc = s->jbc;
-+    HEVCRpiJob * const jb = lc->jb0;
-+
-+    if (jb == NULL) {
-+        return;
-+    }
-+
-+    lc->jb0 = NULL;
-+
-+    job_free(jbc, jb);
-+
-+    // If offload then poke sem_out too (balances the wait in worker_pass0_ready)
-+    if (s->offload_recon) {
-+        sem_post(&jbc->sem_out);
-+    }
-+}
-+
-+
-+// Call this to wait for all jobs to have completed at the end of a frame
-+// Slightly icky as there is no clean way to wait for a sem to count up
-+// Not reentrant - call on main thread only
-+static void worker_wait(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
-+{
-+    HEVCRpiJobCtl * const jbc = s->jbc;
-+    int i = 0;
-+
-+    // We shouldn't reach here with an unsubmitted job
-+    av_assert1(lc->jb0 == NULL);
-+
-+    // If no offload then there can't be anything to wait for
-+    if (!s->offload_recon) {
-+        return;
-+    }
-+
-+    if (sem_getvalue(&jbc->sem_out, &i) == 0 && i < RPI_MAX_JOBS)  // already at RPI_MAX_JOBS: nothing to wait for
-+    {
-+        for (i = 0; i != RPI_MAX_JOBS; ++i) {  // take RPI_MAX_JOBS counts from sem_out, blocking as needed...
-+            rpi_sem_wait(&jbc->sem_out);
-+        }
-+        for (i = 0; i != RPI_MAX_JOBS; ++i) {  // ...then give them all back
-+            sem_post(&jbc->sem_out);
-+        }
-+    }
-+}
-+
-+static void * pass_worker(void *arg)
-+{   // Thread body for one pass queue (arg is its HEVCRpiPassQueue); loops until terminate is set
-+    HEVCRpiPassQueue *const pq = (HEVCRpiPassQueue *)arg;
-+    HEVCRpiContext *const s = pq->context;
-+
-+    for (;;)
-+    {
-+        rpi_sem_wait(&pq->sem_in);  // one post per job, plus one from pass_queues_term_all
-+
-+        if (pq->terminate)
-+            break;
-+
-+        pq->worker(s, s->jbc->offloadq[pass_queue_inc_job_n(pq)]);  // jobs are taken from offloadq in submission order
-+        // * should really set jb->passes_done here
-+
-+        sem_post(pq->psem_out);  // signal whatever waits on this pass's output semaphore
-+    }
-+    return NULL;
-+}
-+
-+static void pass_queues_start_all(HEVCRpiContext *const s)
-+{   // Spawn one pass_worker thread for each of the RPI_PASSES queues
-+    unsigned int i;
-+    HEVCRpiPassQueue * const pqs = s->passq;
-+
-+    for (i = 0; i != RPI_PASSES; ++i)
-+    {
-+        av_assert0(pthread_create(&pqs[i].thread, NULL, pass_worker, pqs + i) == 0);  // a failed create trips the assertion
-+        pqs[i].started = 1;  // tells pass_queues_term_all there is a thread to wake and join
-+    }
-+}
-+
-+static void pass_queues_term_all(HEVCRpiContext *const s)
-+{   // Stop and join every started pass_worker thread
-+    unsigned int i;
-+    HEVCRpiPassQueue * const pqs = s->passq;
-+
-+    for (i = 0; i != RPI_PASSES; ++i)  // 1: raise every terminate flag first
-+        pqs[i].terminate = 1;
-+    for (i = 0; i != RPI_PASSES; ++i)  // 2: wake each worker so that it sees the flag
-+    {
-+        if (pqs[i].started)
-+            sem_post(&pqs[i].sem_in);
-+    }
-+    for (i = 0; i != RPI_PASSES; ++i)  // 3: wait for the threads to exit
-+    {
-+        if (pqs[i].started) {
-+            pthread_join(pqs[i].thread, NULL);
-+            pqs[i].started = 0;
-+        }
-+    }
-+}
-+
-+static void pass_queues_kill_all(HEVCRpiContext *const s)
-+{   // Run pass_queue_kill on each of the RPI_PASSES queues
-+    HEVCRpiPassQueue * const queues = s->passq;
-+    unsigned int qno = 0;
-+
-+    while (qno < RPI_PASSES)
-+        pass_queue_kill(&queues[qno++]);
-+}
-+
-+
-+static void worker_pic_free_one(HEVCRpiJob * const jb)
-+{   // Release both coefficient allocations of jb (if present) and zero the bookkeeping
-+    // Free coeff stuff - allocation not the same for all buffers
-+    HEVCRpiCoeffsEnv * const cf = &jb->coeffs;
-+
-+    if (cf->s[0].buf != NULL)  // s[0] lives in av_malloc'd memory (mptr)
-+        av_freep(&cf->mptr);
-+    if (cf->s[2].buf != NULL)  // s[2] (and s[3]) live in GPU memory (gptr)
-+        gpu_free(&cf->gptr);
-+    memset(cf, 0, sizeof(*cf));  // leaves every buf NULL, so a repeat call frees nothing
-+}
-+
-+static int worker_pic_alloc_one(HEVCRpiJob * const jb, const unsigned int coeff_count)
-+{   // Allocate jb's coefficient buffers; returns 0 on success, -1 (with everything freed) on failure
-+    HEVCRpiCoeffsEnv * const cf = &jb->coeffs;
-+
-+    if (gpu_malloc_cached((coeff_count + 32*32) * sizeof(cf->s[2].buf[0]), &cf->gptr) != 0)
-+        goto fail;
-+    cf->s[2].buf = (int16_t *)cf->gptr.arm;  // s[2] starts at the bottom of the GPU block
-+    cf->s[3].buf = cf->s[2].buf + coeff_count;  // s[3] starts coeff_count elements in (rpi_alloc_coeff_buf fills it downwards)
-+
-+    // Must be 64 byte aligned for our zero zapping code so over-allocate &
-+    // round
-+    if ((cf->mptr = av_malloc(coeff_count * sizeof(cf->s[0].buf[0]) + 63)) == NULL)
-+        goto fail;
-+    cf->s[0].buf = (void *)(((intptr_t)cf->mptr + 63) & ~63);  // round up to the next 64-byte boundary
-+    return 0;
-+
-+fail:
-+    av_log(NULL, AV_LOG_ERROR, "%s: Allocation failed\n", __func__);
-+    worker_pic_free_one(jb);  // releases whichever half was already allocated
-+    return -1;
-+}
-+
-+static void worker_pic_reset(HEVCRpiCoeffsEnv * const cf)
-+{
-+    // Zero the fill counts of coefficient buffers s[0]..s[3]
-+    for (unsigned int buf_no = 0; buf_no < 4; buf_no++) {
-+        cf->s[buf_no].n = 0;
-+#if RPI_COMPRESS_COEFFS
-+        cf->s[buf_no].packed_n = 0;
-+        cf->s[buf_no].packed = 1;
-+#endif
-+    }
-+}
-+
-+int16_t * rpi_alloc_coeff_buf(HEVCRpiJob * const jb, const int buf_no, const int n)
-+{   // Reserve n coefficients in buffer buf_no and return their start address (no capacity check is made here)
-+    HEVCRpiCoeffEnv *const cfe = jb->coeffs.s + buf_no;
-+    int16_t * const coeffs = (buf_no != 3) ? cfe->buf + cfe->n : cfe->buf - (cfe->n + n);  // buffer 3 is filled downwards from buf, the others upwards
-+    cfe->n += n;
-+    return coeffs;
-+}
-+
-+void ff_hevc_rpi_progress_wait_field(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
-+                                     const HEVCRpiFrame * const ref, const int val, const int field)  // Block until ref's progress for field reaches val
-+{
-+    if (ref->tf.progress != NULL && ((int *)ref->tf.progress->data)[field] < val) {  // unlocked fast-path check
-+        HEVCRpiContext *const fs = ref->tf.owner[field]->priv_data;  // context of the thread decoding ref
-+        HEVCRpiFrameProgressState * const pstate = fs->progress_states + field;
-+        sem_t * sem = NULL;
-+
-+        av_assert0(pthread_mutex_lock(&pstate->lock) == 0);
-+        if (((volatile int *)ref->tf.progress->data)[field] < val) {  // re-check now that we hold the lock
-+            HEVCRpiFrameProgressWait * const pwait = &jb->progress_wait;
-+
-+            av_assert1(pwait->req == -1 && pwait->next == NULL);  // must not already be queued
-+            jb->waited = 1;  // Remember that we had to wait for later scheduling
-+
-+            pwait->req = val;
-+            pwait->next = NULL;
-+            if (pstate->first == NULL)  // append to the owner's list of waiters
-+                pstate->first = pwait;
-+            else
-+                pstate->last->next = pwait;
-+            pstate->last = pwait;
-+            sem = &pwait->sem;
-+        }
-+        pthread_mutex_unlock(&pstate->lock);
-+
-+        if (sem != NULL) {
-+            rpi_sem_wait(sem);  // posted by ff_hevc_rpi_progress_signal_field
-+        }
-+    }
-+}
-+
-+// Publish a new progress value for the current frame (s->ref) and field,
-+// then wake every queued waiter whose requested value has been reached.
-+//
-+// Waiters that still need more progress (req > val) stay on the list;
-+// all others are unlinked, returned to the idle state (req == -1,
-+// next == NULL) and have their semaphore posted.
-+void ff_hevc_rpi_progress_signal_field(HEVCRpiContext * const s, const int val, const int field)
-+{
-+    HEVCRpiFrameProgressState *const pstate = s->progress_states + field;
-+
-+    // The progress value is stored before the lock is taken
-+    ((int *)s->ref->tf.progress->data)[field] = val;
-+
-+    av_assert0(pthread_mutex_lock(&pstate->lock) == 0);
-+    {
-+        // ppwait always points at the link that refers to the current
-+        // entry, so an entry can be unlinked without tracking a "previous"
-+        HEVCRpiFrameProgressWait ** ppwait = &pstate->first;
-+        HEVCRpiFrameProgressWait * pwait;
-+
-+        while ((pwait = *ppwait) != NULL) {
-+            if (pwait->req > val)
-+            {
-+                // Not satisfied yet: keep it and remember it as the tail
-+                ppwait = &pwait->next;
-+                pstate->last = pwait;
-+            }
-+            else
-+            {
-+                // Satisfied: unlink, reset to idle and wake the waiter
-+                *ppwait = pwait->next;
-+                pwait->req = -1;
-+                pwait->next = NULL;
-+                sem_post(&pwait->sem);
-+            }
-+        }
-+        // If nothing was kept pstate->last is left unchanged here; the
-+        // wait function only reads last when first is non-NULL
-+    }
-+    pthread_mutex_unlock(&pstate->lock);
-+}
-+
-+// Prepare a progress state for use: create its lock (default attributes)
-+// and start with an empty waiter list.
-+static void ff_hevc_rpi_progress_init_state(HEVCRpiFrameProgressState * const pstate)
-+{
-+    pthread_mutex_init(&pstate->lock, NULL);
-+    pstate->last = NULL;
-+    pstate->first = NULL;
-+}
-+
-+// Prepare a wait record: semaphore count 0 (not shared between processes),
-+// unlinked, and with no outstanding request (req == -1).
-+static void ff_hevc_rpi_progress_init_wait(HEVCRpiFrameProgressWait * const pwait)
-+{
-+    sem_init(&pwait->sem, 0, 0);
-+    pwait->next = NULL;
-+    pwait->req = -1;
-+}
-+
-+// Tear down a progress state.  The waiter list is expected to be empty
-+// by now (checked by assertion) and the lock is destroyed.
-+static void ff_hevc_rpi_progress_kill_state(HEVCRpiFrameProgressState * const pstate)
-+{
-+    // Nobody may still be queued on a state that is going away
-+    av_assert1(pstate->first == NULL);
-+
-+    pthread_mutex_destroy(&pstate->lock);
-+}
-+
-+// Tear down a wait record by destroying its semaphore.
-+static void ff_hevc_rpi_progress_kill_wait(HEVCRpiFrameProgressWait * const pwait)
-+{
-+    sem_t * const sem = &pwait->sem;
-+
-+    sem_destroy(sem);
-+}
-+
-+
-+/**
-+ * NOTE: Each function hls_foo corresponds to the function foo in the
-+ * specification (HLS stands for High Level Syntax).
-+ */
-+
-+/**
-+ * Section 5.7
-+ */
-+
-+// (Re)allocate the three entry point arrays of a slice header so that they
-+// can hold at least n entries.  Previous contents are not preserved.
-+// Calling with n == 0 frees the arrays.
-+// Returns 0 on success or AVERROR(ENOMEM) if an allocation failed.
-+static int alloc_entry_points(RpiSliceHeader * const sh, const int n)
-+{
-+    int n_alloc;
-+
-+    // The existing arrays are big enough - nothing to do
-+    if (sh->entry_point_offset != NULL && n <= sh->offsets_allocated && n != 0)
-+        return 0;
-+
-+    // Old contents are of no interest so discard rather than realloc
-+    av_freep(&sh->entry_point_offset);
-+    av_freep(&sh->offset);
-+    av_freep(&sh->size);
-+
-+    // Allocate in units of 32 entries
-+    n_alloc = (n + 31) & ~31;
-+
-+    if (n_alloc != 0)
-+    {
-+        sh->entry_point_offset = av_malloc_array(n_alloc, sizeof(unsigned));
-+        sh->offset = av_malloc_array(n_alloc, sizeof(int));
-+        sh->size = av_malloc_array(n_alloc, sizeof(int));
-+
-+        if (sh->entry_point_offset == NULL || sh->offset == NULL || sh->size == NULL) {
-+            sh->num_entry_point_offsets = 0;
-+            sh->offsets_allocated = 0;
-+            return AVERROR(ENOMEM);
-+        }
-+    }
-+
-+    sh->offsets_allocated = n_alloc;
-+    return 0;
-+}
-+
-+/* Release every array set up by pic_arrays_init() and clear the pointers.
-+ * Pointers that alias into another allocation are simply set to NULL. */
-+static void pic_arrays_free(HEVCRpiContext *s)
-+{
-+    // SAO / deblock parameter arrays
-+    av_freep(&s->sao);
-+    av_freep(&s->deblock);
-+
-+    // CABAC stash: "left" lives inside the "up" allocation
-+    s->cabac_stash_left = NULL;
-+    av_freep(&s->cabac_stash_up);
-+
-+    // Motion vector field edges
-+    av_freep(&s->mvf_left);
-+    av_freep(&s->mvf_up);
-+
-+    // PCM / intra maps: is_intra is not freed separately
-+    av_freep(&s->is_pcm);
-+    s->is_intra = NULL;
-+    av_freep(&s->is_intra_store);
-+
-+    // Reference picture list tables
-+    s->rpl_tab_size = 0;
-+    av_freep(&s->rpl_tab);
-+    av_freep(&s->rpl_left);
-+    av_freep(&s->rpl_up);
-+
-+    // Per-CTB / per-CB tables
-+    av_freep(&s->filter_slice_edges);
-+    av_freep(&s->tab_slice_address);
-+    av_freep(&s->qp_y_tab);
-+
-+    // Boundary strengths: "vertical" lives inside the "horizontal" allocation
-+    s->bs_vertical = NULL;
-+    av_freep(&s->bs_horizontal);
-+    av_freep(&s->bsf_stash_up);
-+    av_freep(&s->bsf_stash_left);
-+
-+    // n == 0 makes alloc_entry_points() free the entry point arrays
-+    alloc_entry_points(&s->sh, 0);
-+
-+    av_buffer_pool_uninit(&s->col_mvf_pool);
-+}
-+
-+/* Allocate the arrays whose size depends on the frame dimensions in sps.
-+ *
-+ * Also computes s->bs_stride2, s->bs_size and s->col_mvf_stride.
-+ *
-+ * Returns 0 on success.  If any allocation fails everything allocated so
-+ * far is released with pic_arrays_free() and AVERROR(ENOMEM) is returned.
-+ *
-+ * Every allocation is checked before a pointer is derived from it or the
-+ * function continues, including the mvf_up / mvf_left edge arrays.
-+ */
-+static int pic_arrays_init(HEVCRpiContext * const s, const HEVCRpiSPS * const sps)
-+{
-+    const unsigned int log2_min_cb_size = sps->log2_min_cb_size;
-+    const unsigned int width            = sps->width;
-+    const unsigned int height           = sps->height;
-+    const unsigned int pic_size_in_cb   = ((width  >> log2_min_cb_size) + 1) *
-+                           ((height >> log2_min_cb_size) + 1);
-+    const unsigned int ctb_count        = sps->ctb_size;
-+    // Frame dimensions rounded up to a multiple of 64 (max ctb size)
-+    const unsigned int width64          = (width + 63) & ~63;
-+    const unsigned int height64         = (height + 63) & ~63;
-+
-+    {
-+        unsigned int w = ((width + HEVC_RPI_BS_STRIDE1_PEL_MASK) & ~HEVC_RPI_BS_STRIDE1_PEL_MASK);
-+        unsigned int h = ((height + 15) & ~15);
-+
-+        s->bs_stride2 = h >> HEVC_RPI_BS_COL_BYTES_SHR; // Column size
-+        s->bs_size = s->bs_stride2 * (w >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT); // col size * cols
-+    }
-+
-+    s->sao           = av_mallocz(ctb_count * sizeof(*s->sao) + 8); // Our sao code overreads this array slightly
-+    s->deblock       = av_mallocz_array(ctb_count, sizeof(*s->deblock));
-+    if (!s->sao || !s->deblock)
-+        goto fail;
-+
-+    // One allocation holds both stashes: "up" first, "left" straight after.
-+    // Only derive the "left" pointer once the allocation is known good.
-+    s->cabac_stash_up  = av_malloc((width64 >> 3) + (height64 >> 3));
-+    if (s->cabac_stash_up == NULL)
-+        goto fail;
-+    s->cabac_stash_left = s->cabac_stash_up + (width64 >> 3);
-+
-+    // Round width up to max ctb size
-+    s->mvf_up = av_malloc((width64 >> LOG2_MIN_PU_SIZE) * sizeof(*s->mvf_up));
-+    // * Only needed if we have H tiles
-+    s->mvf_left = av_malloc((height64 >> LOG2_MIN_PU_SIZE) * sizeof(*s->mvf_up));
-+    if (s->mvf_up == NULL || s->mvf_left == NULL)
-+        goto fail;
-+
-+    // We can overread by 1 line & one byte in deblock so alloc & zero
-+    // We don't need to zero the extra @ start of frame as it will never be
-+    // written
-+    s->is_pcm   = av_mallocz(sps->pcm_width * (sps->pcm_height + 1) + 1);
-+    s->is_intra_store = av_mallocz(sps->pcm_width * (sps->pcm_height + 1) + 1);
-+    if (s->is_pcm == NULL || s->is_intra_store == NULL)
-+        goto fail;
-+
-+    s->filter_slice_edges = av_mallocz(ctb_count);
-+    s->tab_slice_address  = av_malloc_array(ctb_count,
-+                                      sizeof(*s->tab_slice_address));
-+    s->qp_y_tab           = av_malloc_array(pic_size_in_cb,
-+                                      sizeof(*s->qp_y_tab));
-+    if (!s->qp_y_tab || !s->filter_slice_edges || !s->tab_slice_address)
-+        goto fail;
-+
-+    // Horizontal and vertical boundary strengths share one allocation
-+    s->bs_horizontal = av_mallocz(s->bs_size * 2);
-+    if (s->bs_horizontal == NULL)
-+        goto fail;
-+    s->bs_vertical   = s->bs_horizontal + s->bs_size;
-+
-+    s->rpl_up = av_mallocz(sps->ctb_width * sizeof(*s->rpl_up));
-+    s->rpl_left = av_mallocz(sps->ctb_height * sizeof(*s->rpl_left));
-+    if (s->rpl_left == NULL || s->rpl_up == NULL)
-+        goto fail;
-+
-+    if ((s->bsf_stash_left = av_mallocz(height64 >> 4)) == NULL ||
-+        (s->bsf_stash_up   = av_mallocz(width64 >> 4)) == NULL)
-+        goto fail;
-+
-+    s->col_mvf_stride = (width + 15) >> 4;
-+    s->col_mvf_pool = av_buffer_pool_init(((height + 15) >> 4) * s->col_mvf_stride * sizeof(ColMvField),
-+                                          av_buffer_allocz);
-+    if (s->col_mvf_pool == NULL)
-+        goto fail;
-+
-+    return 0;
-+
-+fail:
-+    // Frees whatever was allocated and NULLs the aliasing pointers
-+    pic_arrays_free(s);
-+    return AVERROR(ENOMEM);
-+}
-+
-+// Fill the slice header prediction weight tables with defaults: both
-+// log2 weight denominators 0, every weight 1 << QPU_MC_DENOM and every
-+// offset 0, for each active reference in L0 and in L1.
-+static void default_pred_weight_table(HEVCRpiContext * const s)
-+{
-+    const unsigned int unit_weight = 1 << QPU_MC_DENOM;
-+    unsigned int ref;
-+    unsigned int c;
-+
-+    s->sh.luma_log2_weight_denom = 0;
-+    s->sh.chroma_log2_weight_denom = 0;
-+
-+    for (ref = 0; ref < s->sh.nb_refs[L0]; ref++) {
-+        s->sh.luma_weight_l0[ref] = unit_weight;
-+        s->sh.luma_offset_l0[ref] = 0;
-+        for (c = 0; c != 2; ++c)
-+            s->sh.chroma_weight_l0[ref][c] = unit_weight;
-+        for (c = 0; c != 2; ++c)
-+            s->sh.chroma_offset_l0[ref][c] = 0;
-+    }
-+
-+    for (ref = 0; ref < s->sh.nb_refs[L1]; ref++) {
-+        s->sh.luma_weight_l1[ref] = unit_weight;
-+        s->sh.luma_offset_l1[ref] = 0;
-+        for (c = 0; c != 2; ++c)
-+            s->sh.chroma_weight_l1[ref][c] = unit_weight;
-+        for (c = 0; c != 2; ++c)
-+            s->sh.chroma_offset_l1[ref][c] = 0;
-+    }
-+}
-+
-+// Parse the explicit prediction weights for one reference list.
-+//
-+// gb            bitstream reader positioned at the weight flags
-+// refs          number of active references in the list (0 => nothing read)
-+// luma_weight   out: one weight per reference
-+// luma_offset   out: one offset per reference
-+// chroma_weight out: two weights (one per chroma component) per reference
-+// chroma_offset out: two offsets (one per chroma component) per reference
-+//
-+// Weights are stored rescaled to a common denominator of QPU_MC_DENOM bits
-+// using the log2 denominators already stored in s->sh.  References whose
-+// flag is clear get the unit weight and a zero offset.
-+//
-+// Returns 0 on success or AVERROR_INVALIDDATA if a coded delta weight or
-+// offset is out of range.
-+static int get_weights(HEVCRpiContext * const s, GetBitContext * const gb,
-+                       const unsigned int refs,
-+                       int16_t * luma_weight,   int16_t * luma_offset,
-+                       int16_t * chroma_weight, int16_t * chroma_offset)
-+{
-+    unsigned int luma_flags;
-+    unsigned int chroma_flags;
-+    unsigned int i;
-+    // Offsets are scaled up by (bit_depth - 8) unless high precision is on
-+    const unsigned int wp_offset_bd_shift = s->ps.sps->high_precision_offsets_enabled_flag ? 0 : (s->ps.sps->bit_depth - 8);
-+    const int wp_offset_half_range = s->ps.sps->wp_offset_half_range;
-+    const unsigned int luma_weight_base    = 1 << QPU_MC_DENOM;
-+    const unsigned int chroma_weight_base  = 1 << QPU_MC_DENOM;
-+    // Shift that converts a delta at the coded denominator to QPU_MC_DENOM
-+    const unsigned int luma_weight_shift   = (QPU_MC_DENOM - s->sh.luma_log2_weight_denom);
-+    const unsigned int chroma_weight_shift = (QPU_MC_DENOM - s->sh.chroma_log2_weight_denom);
-+
-+    if (refs == 0)
-+        return 0;
-+
-+    // All flags for the list are read up front, one bit per reference;
-+    // chroma flags are only present when ctx_cfmt(s) is non-zero
-+    luma_flags = get_bits(gb, refs);
-+    chroma_flags = ctx_cfmt(s) == 0 ? 0 : get_bits(gb, refs);
-+    // Mask starts at the top flag bit, which belongs to the first reference
-+    i = 1 << (refs - 1);
-+
-+    do
-+    {
-+        if ((luma_flags & i) != 0)
-+        {
-+            const int delta_weight = get_se_golomb(gb);
-+            const int offset = get_se_golomb(gb);
-+            if (delta_weight < -128 || delta_weight > 127 ||
-+                offset < -wp_offset_half_range || offset >= wp_offset_half_range)
-+            {
-+                return AVERROR_INVALIDDATA;
-+            }
-+            *luma_weight++ = luma_weight_base + (delta_weight << luma_weight_shift);
-+            *luma_offset++ = offset << wp_offset_bd_shift;
-+        }
-+        else
-+        {
-+            *luma_weight++ = luma_weight_base;
-+            *luma_offset++ = 0;
-+        }
-+
-+        if ((chroma_flags & i) != 0)
-+        {
-+            unsigned int j;
-+            // One (weight, offset) pair per chroma component
-+            for (j = 0; j != 2; ++j)
-+            {
-+                const int delta_weight = get_se_golomb(gb);
-+                const int delta_offset = get_se_golomb(gb);
-+
-+                if (delta_weight < -128 || delta_weight > 127 ||
-+                    delta_offset < -4 * wp_offset_half_range || delta_offset >= 4 * wp_offset_half_range)
-+                {
-+                    return AVERROR_INVALIDDATA;
-+                }
-+
-+                *chroma_weight++ = chroma_weight_base + (delta_weight << chroma_weight_shift);
-+                // The chroma offset is derived from the coded delta and the
-+                // (unscaled) chroma weight, then clipped to the half range
-+                *chroma_offset++ = av_clip(
-+                    wp_offset_half_range + delta_offset -
-+                        ((wp_offset_half_range * ((1 << s->sh.chroma_log2_weight_denom) + delta_weight)) >> s->sh.chroma_log2_weight_denom),
-+                    -wp_offset_half_range, wp_offset_half_range - 1) << wp_offset_bd_shift;
-+            }
-+        }
-+        else
-+        {
-+            *chroma_weight++ = chroma_weight_base;
-+            *chroma_weight++ = chroma_weight_base;
-+            *chroma_offset++ = 0;
-+            *chroma_offset++ = 0;
-+        }
-+    } while ((i >>= 1) != 0);
-+
-+    return 0;
-+}
-+
-+// Parse the prediction weight table of a slice header into s->sh.
-+//
-+// Reads the luma log2 weight denominator and, when ctx_cfmt(s) is non-zero,
-+// the signed delta that gives the chroma denominator; both must be <= 7.
-+// The per-reference weights for L0 and then L1 are read by get_weights().
-+//
-+// Returns 0 on success, AVERROR_INVALIDDATA for a bad denominator, or the
-+// error returned by get_weights().
-+static int pred_weight_table(HEVCRpiContext *s, GetBitContext *gb)
-+{
-+    const unsigned int luma_denom = get_ue_golomb_long(gb);
-+    unsigned int chroma_denom = 0;
-+    int rv;
-+
-+    if (ctx_cfmt(s) != 0)
-+        chroma_denom = luma_denom + get_se_golomb(gb);
-+
-+    // A negative chroma result wraps to a large unsigned value and is
-+    // rejected by the same test
-+    if (luma_denom > 7 || chroma_denom > 7)
-+    {
-+        av_log(s->avctx, AV_LOG_ERROR, "Invalid prediction weight denom: luma=%d, chroma=%d\n",
-+               luma_denom, chroma_denom);
-+        return AVERROR_INVALIDDATA;
-+    }
-+
-+    s->sh.luma_log2_weight_denom = luma_denom;
-+    s->sh.chroma_log2_weight_denom = chroma_denom;
-+
-+    rv = get_weights(s, gb, s->sh.nb_refs[L0],
-+                     s->sh.luma_weight_l0,      s->sh.luma_offset_l0,
-+                     s->sh.chroma_weight_l0[0], s->sh.chroma_offset_l0[0]);
-+
-+    // L1 is only parsed if L0 was good
-+    if (rv == 0)
-+        rv = get_weights(s, gb, s->sh.nb_refs[L1],
-+                         s->sh.luma_weight_l1,      s->sh.luma_offset_l1,
-+                         s->sh.chroma_weight_l1[0], s->sh.chroma_offset_l1[0]);
-+
-+    if (rv != 0)
-+    {
-+        av_log(s->avctx, AV_LOG_ERROR, "Invalid prediction weight or offset\n");
-+        return rv;
-+    }
-+
-+    return 0;
-+}
-+
-+// Parse the long-term reference picture set of a slice header into rps.
-+//
-+// Entries taken from the SPS candidate list come first (nb_sps of them),
-+// followed by entries coded directly in the slice header (nb_sh of them).
-+// If the SPS does not enable long-term reference pictures rps->nb_refs is
-+// set to 0 and nothing is read.
-+//
-+// Returns 0 on success or AVERROR_INVALIDDATA.
-+// NOTE(review): when an error is detected inside the loop rps->nb_refs has
-+// already been set to the full count - confirm callers cope with that.
-+static int decode_lt_rps(HEVCRpiContext *s, LongTermRPS *rps, GetBitContext *gb)
-+{
-+    const HEVCRpiSPS *sps = s->ps.sps;
-+    int max_poc_lsb    = 1 << sps->log2_max_poc_lsb;
-+    int prev_delta_msb = 0;
-+    unsigned int nb_sps = 0, nb_sh;
-+    int i;
-+
-+    rps->nb_refs = 0;
-+    if (!sps->long_term_ref_pics_present_flag)
-+        return 0;
-+
-+    // The SPS-derived count is only coded if the SPS has candidates
-+    if (sps->num_long_term_ref_pics_sps > 0)
-+        nb_sps = get_ue_golomb_long(gb);
-+    nb_sh = get_ue_golomb_long(gb);
-+
-+    if (nb_sps > sps->num_long_term_ref_pics_sps)
-+        return AVERROR_INVALIDDATA;
-+    // Sum in 64 bits so that two large 32-bit counts cannot wrap
-+    if (nb_sh + (uint64_t)nb_sps > FF_ARRAY_ELEMS(rps->poc))
-+        return AVERROR_INVALIDDATA;
-+
-+    rps->nb_refs = nb_sh + nb_sps;
-+
-+    for (i = 0; i < rps->nb_refs; i++) {
-+        uint8_t delta_poc_msb_present;
-+
-+        if (i < nb_sps) {
-+            // Entry selected by index from the SPS candidate list
-+            uint8_t lt_idx_sps = 0;
-+
-+            // With a single candidate the index is not coded
-+            if (sps->num_long_term_ref_pics_sps > 1)
-+                lt_idx_sps = get_bits(gb, av_ceil_log2(sps->num_long_term_ref_pics_sps));
-+
-+            rps->poc[i]  = sps->lt_ref_pic_poc_lsb_sps[lt_idx_sps];
-+            rps->used[i] = sps->used_by_curr_pic_lt_sps_flag[lt_idx_sps];
-+        } else {
-+            // Entry coded directly in the slice header
-+            rps->poc[i]  = get_bits(gb, sps->log2_max_poc_lsb);
-+            rps->used[i] = get_bits1(gb);
-+        }
-+
-+        delta_poc_msb_present = get_bits1(gb);
-+        if (delta_poc_msb_present) {
-+            int64_t delta = get_ue_golomb_long(gb);
-+            int64_t poc;
-+
-+            // The MSB delta is cumulative within each group; it restarts at
-+            // the first SPS entry (i == 0) and the first slice entry
-+            if (i && i != nb_sps)
-+                delta += prev_delta_msb;
-+
-+            // Computed in 64 bits and rejected if it does not fit in 32
-+            poc = rps->poc[i] + s->poc - delta * max_poc_lsb - s->sh.pic_order_cnt_lsb;
-+            if (poc != (int32_t)poc)
-+                return AVERROR_INVALIDDATA;
-+            rps->poc[i] = poc;
-+            prev_delta_msb = delta;
-+        }
-+    }
-+
-+    return 0;
-+}
-+
-+// Copy the stream level parameters described by sps (and the VPS it refers
-+// to) into the codec context: pixel format, coded and cropped dimensions,
-+// reorder depth, profile/level, sample aspect ratio, colour description
-+// and, when timing info is present, the frame rate.
-+static void export_stream_params(AVCodecContext *avctx, const HEVCRpiParamSets *ps,
-+                                 const HEVCRpiSPS *sps)
-+{
-+    const HEVCRpiVPS * const vps = (const HEVCRpiVPS*)ps->vps_list[sps->vps_id]->data;
-+    const HEVCRpiWindow * const win = &sps->output_window;
-+    unsigned int units_in_tick = 0;
-+    unsigned int time_scale = 0;
-+
-+    avctx->pix_fmt      = sps->pix_fmt;
-+
-+    // Coded size is the full SPS size; display size has the window removed
-+    avctx->coded_width  = sps->width;
-+    avctx->coded_height = sps->height;
-+    avctx->width        = sps->width  - win->left_offset - win->right_offset;
-+    avctx->height       = sps->height - win->top_offset  - win->bottom_offset;
-+
-+    avctx->has_b_frames = sps->temporal_layer[sps->max_sub_layers - 1].num_reorder_pics;
-+    avctx->profile      = sps->ptl.general_ptl.profile_idc;
-+    avctx->level        = sps->ptl.general_ptl.level_idc;
-+
-+    ff_set_sar(avctx, sps->vui.sar);
-+
-+    // Full range only if the VUI both carries the signal type and says so
-+    if (sps->vui.video_signal_type_present_flag && sps->vui.video_full_range_flag)
-+        avctx->color_range = AVCOL_RANGE_JPEG;
-+    else
-+        avctx->color_range = AVCOL_RANGE_MPEG;
-+
-+    if (!sps->vui.colour_description_present_flag) {
-+        avctx->color_primaries = AVCOL_PRI_UNSPECIFIED;
-+        avctx->color_trc       = AVCOL_TRC_UNSPECIFIED;
-+        avctx->colorspace      = AVCOL_SPC_UNSPECIFIED;
-+    } else {
-+        avctx->color_primaries = sps->vui.colour_primaries;
-+        avctx->color_trc       = sps->vui.transfer_characteristic;
-+        avctx->colorspace      = sps->vui.matrix_coeffs;
-+    }
-+
-+    // VPS timing info takes precedence over VUI timing info
-+    if (!vps->vps_timing_info_present_flag) {
-+        if (sps->vui.vui_timing_info_present_flag) {
-+            units_in_tick = sps->vui.vui_num_units_in_tick;
-+            time_scale    = sps->vui.vui_time_scale;
-+        }
-+    } else {
-+        units_in_tick = vps->vps_num_units_in_tick;
-+        time_scale    = vps->vps_time_scale;
-+    }
-+
-+    // No usable timing info - leave the frame rate untouched
-+    if (units_in_tick == 0 || time_scale == 0)
-+        return;
-+
-+    // framerate = time_scale / units_in_tick, hence den receives the
-+    // reduced units_in_tick and num the reduced time_scale
-+    av_reduce(&avctx->framerate.den, &avctx->framerate.num,
-+              units_in_tick, time_scale, 1 << 30);
-+}
-+
-+// Negotiate the output pixel format.  The only candidate offered is the
-+// format from the SPS (no h/w formats).  Returns AV_PIX_FMT_NONE if the
-+// SPS has no format, otherwise the result of ff_thread_get_format().
-+static enum AVPixelFormat get_format(HEVCRpiContext *s, const HEVCRpiSPS *sps)
-+{
-+    enum AVPixelFormat candidates[4];
-+
-+    // AV_PIX_FMT_NONE terminated candidate list
-+    candidates[0] = sps->pix_fmt;
-+    candidates[1] = AV_PIX_FMT_NONE;
-+
-+    if (candidates[0] == AV_PIX_FMT_NONE)
-+        return AV_PIX_FMT_NONE;
-+
-+    return ff_thread_get_format(s->avctx, candidates);
-+}
-+
-+// Returns 1 if this decoder can handle the stream described by sps: the
-+// pixel format must be a sand format and the dimensions must not exceed
-+// HEVC_RPI_MAX_WIDTH x HEVC_RPI_MAX_HEIGHT.  Returns 0 otherwise.
-+static int is_sps_supported(const HEVCRpiSPS * const sps)
-+{
-+    if (!av_rpi_is_sand_format(sps->pix_fmt))
-+        return 0;
-+
-+    if (sps->width > HEVC_RPI_MAX_WIDTH)
-+        return 0;
-+
-+    return sps->height <= HEVC_RPI_MAX_HEIGHT;
-+}
-+
-+// Make sps the active SPS of the decoder (or deactivate when sps == NULL).
-+//
-+// The arrays belonging to the previous SPS are always released first.
-+// For a new SPS the picture arrays are allocated, the stream parameters
-+// are exported to the codec context, the prediction / DSP / QPU function
-+// tables are set up for the bit depth and, if SAO is enabled, the SAO
-+// edge pixel buffers are allocated.
-+//
-+// pix_fmt is the pixel format stored in the codec context (it overrides
-+// the value written by export_stream_params()).
-+//
-+// Returns 0 on success, AVERROR_DECODER_NOT_FOUND if the SPS is not
-+// supported by this decoder, or a negative error code if an allocation
-+// failed.  On failure s->ps.sps is left NULL.
-+static int set_sps(HEVCRpiContext * const s, const HEVCRpiSPS * const sps,
-+                   const enum AVPixelFormat pix_fmt)
-+{
-+    int ret;
-+
-+    pic_arrays_free(s);
-+    s->ps.sps = NULL;
-+    s->ps.vps = NULL;
-+
-+    if (sps == NULL)
-+        return 0;
-+
-+    if (!is_sps_supported(sps))
-+        return AVERROR_DECODER_NOT_FOUND;
-+
-+    ret = pic_arrays_init(s, sps);
-+    if (ret < 0)
-+        goto fail;
-+
-+    export_stream_params(s->avctx, &s->ps, sps);
-+
-+    s->avctx->pix_fmt = pix_fmt;
-+
-+    ff_hevc_rpi_pred_init(&s->hpc,     sps->bit_depth);
-+    ff_hevc_rpi_dsp_init (&s->hevcdsp, sps->bit_depth);
-+
-+    // * We don't support cross_component_prediction_enabled_flag but as that
-+    //   must be 0 unless we have 4:4:4 there is no point testing for it as we
-+    //   only deal with sand which is never 4:4:4
-+    //   [support wouldn't be hard]
-+
-+    rpi_hevc_qpu_set_fns(s, sps->bit_depth);
-+
-+    av_freep(&s->sao_pixel_buffer_h[0]);
-+    av_freep(&s->sao_pixel_buffer_v[0]);
-+
-+    if (sps->sao_enabled)
-+    {
-+        const unsigned int c_count = (ctx_cfmt(s) != 0) ? 3 : 1;
-+        unsigned int c_idx;
-+        size_t vsize[3] = {0};
-+        size_t hsize[3] = {0};
-+
-+        for(c_idx = 0; c_idx < c_count; c_idx++) {
-+            int w = sps->width >> ctx_hshift(s, c_idx);
-+            int h = sps->height >> ctx_vshift(s, c_idx);
-+            // ctb height & width are a min of 8 so this must a multiple of 16
-+            // so no point rounding up!
-+            hsize[c_idx] = (w * 2 * sps->ctb_height) << sps->pixel_shift;
-+            vsize[c_idx] = (h * 2 * sps->ctb_width) << sps->pixel_shift;
-+        }
-+
-+        // Allocate as a single lump so we can extend h[1] & v[1] into h[2] & v[2]
-+        // when we have plaited chroma
-+        s->sao_pixel_buffer_h[0] = av_malloc(hsize[0] + hsize[1] + hsize[2]);
-+        s->sao_pixel_buffer_v[0] = av_malloc(vsize[0] + vsize[1] + vsize[2]);
-+
-+        // Both buffers are required: fail cleanly rather than deriving the
-+        // plane pointers from NULL and reporting success
-+        if (s->sao_pixel_buffer_h[0] == NULL || s->sao_pixel_buffer_v[0] == NULL) {
-+            av_freep(&s->sao_pixel_buffer_h[0]);
-+            av_freep(&s->sao_pixel_buffer_v[0]);
-+            ret = AVERROR(ENOMEM);
-+            goto fail;
-+        }
-+
-+        s->sao_pixel_buffer_h[1] = s->sao_pixel_buffer_h[0] + hsize[0];
-+        s->sao_pixel_buffer_h[2] = s->sao_pixel_buffer_h[1] + hsize[1];
-+        s->sao_pixel_buffer_v[1] = s->sao_pixel_buffer_v[0] + vsize[0];
-+        s->sao_pixel_buffer_v[2] = s->sao_pixel_buffer_v[1] + vsize[1];
-+    }
-+
-+    s->ps.sps = sps;
-+    s->ps.vps = (HEVCRpiVPS*) s->ps.vps_list[s->ps.sps->vps_id]->data;
-+
-+    return 0;
-+
-+fail:
-+    pic_arrays_free(s);
-+    s->ps.sps = NULL;
-+    return ret;
-+}
-+
-+// Returns 1 if qp_offset lies in the inclusive range [-12, 12], else 0.
-+static inline int qp_offset_valid(const int qp_offset)
-+{
-+    return !(qp_offset < -12 || qp_offset > 12);
-+}
-+
-+static int hls_slice_header(HEVCRpiContext * const s)
-+{
-+    GetBitContext * const gb = &s->HEVClc->gb;
-+    RpiSliceHeader * const sh   = &s->sh;
-+    int i, ret;
-+
-+    // Coded parameters
-+    sh->first_slice_in_pic_flag = get_bits1(gb);
-+    if ((IS_IDR(s) || IS_BLA(s)) && sh->first_slice_in_pic_flag) {
-+        s->seq_decode = (s->seq_decode + 1) & 0xff;
-+        s->max_ra     = INT_MAX;
-+        if (IS_IDR(s))
-+            ff_hevc_rpi_clear_refs(s);
-+    }
-+    sh->no_output_of_prior_pics_flag = 0;
-+    if (IS_IRAP(s))
-+        sh->no_output_of_prior_pics_flag = get_bits1(gb);
-+
-+    sh->pps_id = get_ue_golomb_long(gb);
-+    if (sh->pps_id >= HEVC_MAX_PPS_COUNT || !s->ps.pps_list[sh->pps_id]) {
-+        av_log(s->avctx, AV_LOG_ERROR, "PPS id out of range: %d\n", sh->pps_id);
-+        return AVERROR_INVALIDDATA;
-+    }
-+    if (!sh->first_slice_in_pic_flag &&
-+        s->ps.pps != (HEVCRpiPPS*)s->ps.pps_list[sh->pps_id]->data) {
-+        av_log(s->avctx, AV_LOG_ERROR, "PPS changed between slices.\n");
-+        return AVERROR_INVALIDDATA;
-+    }
-+    s->ps.pps = (HEVCRpiPPS*)s->ps.pps_list[sh->pps_id]->data;
-+    if (s->nal_unit_type == HEVC_NAL_CRA_NUT && s->last_eos == 1)
-+        sh->no_output_of_prior_pics_flag = 1;
-+
-+    if (s->ps.sps != (HEVCRpiSPS*)s->ps.sps_list[s->ps.pps->sps_id]->data) {
-+        const HEVCRpiSPS *sps = (HEVCRpiSPS*)s->ps.sps_list[s->ps.pps->sps_id]->data;
-+        const HEVCRpiSPS *last_sps = s->ps.sps;
-+        enum AVPixelFormat pix_fmt;
-+
-+        if (last_sps && IS_IRAP(s) && s->nal_unit_type != HEVC_NAL_CRA_NUT) {
-+            if (sps->width != last_sps->width || sps->height != last_sps->height ||
-+                sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering !=
-+                last_sps->temporal_layer[last_sps->max_sub_layers - 1].max_dec_pic_buffering)
-+                sh->no_output_of_prior_pics_flag = 0;
-+        }
-+        ff_hevc_rpi_clear_refs(s);
-+
-+        ret = set_sps(s, sps, sps->pix_fmt);
-+        if (ret < 0)
-+            return ret;
-+
-+        pix_fmt = get_format(s, sps);
-+        if (pix_fmt < 0)
-+            return pix_fmt;
-+
-+//        ret = set_sps(s, sps, pix_fmt);
-+//        if (ret < 0)
-+//            return ret;
-+
-+        s->avctx->pix_fmt = pix_fmt;
-+
-+        s->seq_decode = (s->seq_decode + 1) & 0xff;
-+        s->max_ra     = INT_MAX;
-+    }
-+
-+    sh->dependent_slice_segment_flag = 0;
-+    if (!sh->first_slice_in_pic_flag) {
-+        int slice_address_length;
-+
-+        if (s->ps.pps->dependent_slice_segments_enabled_flag)
-+            sh->dependent_slice_segment_flag = get_bits1(gb);
-+
-+        slice_address_length = av_ceil_log2(s->ps.sps->ctb_size);
-+        sh->slice_segment_addr = get_bitsz(gb, slice_address_length);
-+        if (sh->slice_segment_addr >= s->ps.sps->ctb_size) {
-+            av_log(s->avctx, AV_LOG_ERROR,
-+                   "Invalid slice segment address: %u.\n",
-+                   sh->slice_segment_addr);
-+            return AVERROR_INVALIDDATA;
-+        }
-+
-+        if (!sh->dependent_slice_segment_flag) {
-+            sh->slice_addr = sh->slice_segment_addr;
-+            s->slice_idx++;
-+        }
-+    } else {
-+        sh->slice_segment_addr = sh->slice_addr = 0;
-+        s->slice_idx           = 0;
-+        s->slice_initialized   = 0;
-+    }
-+
-+    if (!sh->dependent_slice_segment_flag) {
-+        s->slice_initialized = 0;
-+
-+        for (i = 0; i < s->ps.pps->num_extra_slice_header_bits; i++)
-+            skip_bits(gb, 1);  // slice_reserved_undetermined_flag[]
-+
-+        sh->slice_type = get_ue_golomb_long(gb);
-+        if (!(sh->slice_type == HEVC_SLICE_I ||
-+              sh->slice_type == HEVC_SLICE_P ||
-+              sh->slice_type == HEVC_SLICE_B)) {
-+            av_log(s->avctx, AV_LOG_ERROR, "Unknown slice type: %d.\n",
-+                   sh->slice_type);
-+            return AVERROR_INVALIDDATA;
-+        }
-+        if (IS_IRAP(s) && sh->slice_type != HEVC_SLICE_I) {
-+            av_log(s->avctx, AV_LOG_ERROR, "Inter slices in an IRAP frame.\n");
-+            return AVERROR_INVALIDDATA;
-+        }
-+
-+        // when flag is not present, picture is inferred to be output
-+        sh->pic_output_flag = 1;
-+        if (s->ps.pps->output_flag_present_flag)
-+            sh->pic_output_flag = get_bits1(gb);
-+
-+        if (s->ps.sps->separate_colour_plane_flag)
-+            sh->colour_plane_id = get_bits(gb, 2);
-+
-+        if (!IS_IDR(s)) {
-+            int poc, pos;
-+
-+            sh->pic_order_cnt_lsb = get_bits(gb, s->ps.sps->log2_max_poc_lsb);
-+            poc = ff_hevc_rpi_compute_poc(s->ps.sps, s->pocTid0, sh->pic_order_cnt_lsb, s->nal_unit_type);
-+            if (!sh->first_slice_in_pic_flag && poc != s->poc) {
-+                av_log(s->avctx, AV_LOG_WARNING,
-+                       "Ignoring POC change between slices: %d -> %d\n", s->poc, poc);
-+                if (s->avctx->err_recognition & AV_EF_EXPLODE)
-+                    return AVERROR_INVALIDDATA;
-+                poc = s->poc;
-+            }
-+            s->poc = poc;
-+
-+            sh->short_term_ref_pic_set_sps_flag = get_bits1(gb);
-+            pos = get_bits_left(gb);
-+            if (!sh->short_term_ref_pic_set_sps_flag) {
-+                ret = ff_hevc_rpi_decode_short_term_rps(gb, s->avctx, &sh->slice_rps, s->ps.sps, 1);
-+                if (ret < 0)
-+                    return ret;
-+
-+                sh->short_term_rps = &sh->slice_rps;
-+            } else {
-+                int numbits, rps_idx;
-+
-+                if (!s->ps.sps->nb_st_rps) {
-+                    av_log(s->avctx, AV_LOG_ERROR, "No ref lists in the SPS.\n");
-+                    return AVERROR_INVALIDDATA;
-+                }
-+
-+                numbits = av_ceil_log2(s->ps.sps->nb_st_rps);
-+                rps_idx = numbits > 0 ? get_bits(gb, numbits) : 0;
-+                sh->short_term_rps = &s->ps.sps->st_rps[rps_idx];
-+            }
-+            sh->short_term_ref_pic_set_size = pos - get_bits_left(gb);
-+
-+            pos = get_bits_left(gb);
-+            ret = decode_lt_rps(s, &sh->long_term_rps, gb);
-+            if (ret < 0) {
-+                av_log(s->avctx, AV_LOG_WARNING, "Invalid long term RPS.\n");
-+                if (s->avctx->err_recognition & AV_EF_EXPLODE)
-+                    return AVERROR_INVALIDDATA;
-+            }
-+            sh->long_term_ref_pic_set_size = pos - get_bits_left(gb);
-+
-+            if (s->ps.sps->sps_temporal_mvp_enabled_flag)
-+                sh->slice_temporal_mvp_enabled_flag = get_bits1(gb);
-+            else
-+                sh->slice_temporal_mvp_enabled_flag = 0;
-+        } else {
-+            s->sh.short_term_rps = NULL;
-+            s->poc               = 0;
-+        }
-+
-+        /* 8.3.1 */
-+        if (sh->first_slice_in_pic_flag && s->temporal_id == 0 &&
-+            s->nal_unit_type != HEVC_NAL_TRAIL_N &&
-+            s->nal_unit_type != HEVC_NAL_TSA_N   &&
-+            s->nal_unit_type != HEVC_NAL_STSA_N  &&
-+            s->nal_unit_type != HEVC_NAL_RADL_N  &&
-+            s->nal_unit_type != HEVC_NAL_RADL_R  &&
-+            s->nal_unit_type != HEVC_NAL_RASL_N  &&
-+            s->nal_unit_type != HEVC_NAL_RASL_R)
-+            s->pocTid0 = s->poc;
-+
-+        if (s->ps.sps->sao_enabled) {
-+            sh->slice_sample_adaptive_offset_flag[0] = get_bits1(gb);
-+            if (ctx_cfmt(s) != 0) {
-+                sh->slice_sample_adaptive_offset_flag[1] =
-+                sh->slice_sample_adaptive_offset_flag[2] = get_bits1(gb);
-+            }
-+        } else {
-+            sh->slice_sample_adaptive_offset_flag[0] = 0;
-+            sh->slice_sample_adaptive_offset_flag[1] = 0;
-+            sh->slice_sample_adaptive_offset_flag[2] = 0;
-+        }
-+
-+        sh->nb_refs[L0] = sh->nb_refs[L1] = 0;
-+        if (sh->slice_type == HEVC_SLICE_P || sh->slice_type == HEVC_SLICE_B) {
-+            int nb_refs;
-+
-+            sh->nb_refs[L0] = s->ps.pps->num_ref_idx_l0_default_active;
-+            if (sh->slice_type == HEVC_SLICE_B)
-+                sh->nb_refs[L1] = s->ps.pps->num_ref_idx_l1_default_active;
-+
-+            if (get_bits1(gb)) { // num_ref_idx_active_override_flag
-+                sh->nb_refs[L0] = get_ue_golomb_long(gb) + 1;
-+                if (sh->slice_type == HEVC_SLICE_B)
-+                    sh->nb_refs[L1] = get_ue_golomb_long(gb) + 1;
-+            }
-+            if (sh->nb_refs[L0] > HEVC_MAX_REFS || sh->nb_refs[L1] > HEVC_MAX_REFS) {
-+                av_log(s->avctx, AV_LOG_ERROR, "Too many refs: %d/%d.\n",
-+                       sh->nb_refs[L0], sh->nb_refs[L1]);
-+                return AVERROR_INVALIDDATA;
-+            }
-+
-+            sh->rpl_modification_flag[0] = 0;
-+            sh->rpl_modification_flag[1] = 0;
-+            nb_refs = ff_hevc_rpi_frame_nb_refs(s);
-+            if (!nb_refs) {
-+                av_log(s->avctx, AV_LOG_ERROR, "Zero refs for a frame with P or B slices.\n");
-+                return AVERROR_INVALIDDATA;
-+            }
-+
-+            if (s->ps.pps->lists_modification_present_flag && nb_refs > 1) {
-+                sh->rpl_modification_flag[0] = get_bits1(gb);
-+                if (sh->rpl_modification_flag[0]) {
-+                    for (i = 0; i < sh->nb_refs[L0]; i++)
-+                        sh->list_entry_lx[0][i] = get_bits(gb, av_ceil_log2(nb_refs));
-+                }
-+
-+                if (sh->slice_type == HEVC_SLICE_B) {
-+                    sh->rpl_modification_flag[1] = get_bits1(gb);
-+                    if (sh->rpl_modification_flag[1] == 1)
-+                        for (i = 0; i < sh->nb_refs[L1]; i++)
-+                            sh->list_entry_lx[1][i] = get_bits(gb, av_ceil_log2(nb_refs));
-+                }
-+            }
-+
-+            if (sh->slice_type == HEVC_SLICE_B)
-+                sh->mvd_l1_zero_flag = get_bits1(gb);
-+
-+            if (s->ps.pps->cabac_init_present_flag)
-+                sh->cabac_init_flag = get_bits1(gb);
-+            else
-+                sh->cabac_init_flag = 0;
-+
-+            sh->collocated_ref_idx = 0;
-+            if (sh->slice_temporal_mvp_enabled_flag) {
-+                sh->collocated_list = L0;
-+                if (sh->slice_type == HEVC_SLICE_B)
-+                    sh->collocated_list = !get_bits1(gb);
-+
-+                if (sh->nb_refs[sh->collocated_list] > 1) {
-+                    sh->collocated_ref_idx = get_ue_golomb_long(gb);
-+                    if (sh->collocated_ref_idx >= sh->nb_refs[sh->collocated_list]) {
-+                        av_log(s->avctx, AV_LOG_ERROR,
-+                               "Invalid collocated_ref_idx: %d.\n",
-+                               sh->collocated_ref_idx);
-+                        return AVERROR_INVALIDDATA;
-+                    }
-+                }
-+            }
-+
-+            if ((s->ps.pps->weighted_pred_flag   && sh->slice_type == HEVC_SLICE_P) ||
-+                (s->ps.pps->weighted_bipred_flag && sh->slice_type == HEVC_SLICE_B))
-+            {
-+                if ((ret = pred_weight_table(s, gb)) != 0)
-+                    return ret;
-+            }
-+            else
-+            {
-+                // Give us unit weights
-+                default_pred_weight_table(s);
-+            }
-+
-+            sh->max_num_merge_cand = 5 - get_ue_golomb_long(gb);
-+            if (sh->max_num_merge_cand < 1 || sh->max_num_merge_cand > 5) {
-+                av_log(s->avctx, AV_LOG_ERROR,
-+                       "Invalid number of merging MVP candidates: %d.\n",
-+                       sh->max_num_merge_cand);
-+                return AVERROR_INVALIDDATA;
-+            }
-+        }
-+
-+        sh->slice_qp_delta = get_se_golomb(gb);
-+
-+        if (s->ps.pps->pic_slice_level_chroma_qp_offsets_present_flag) {
-+            sh->slice_cb_qp_offset = get_se_golomb(gb);
-+            sh->slice_cr_qp_offset = get_se_golomb(gb);
-+            if (!qp_offset_valid(sh->slice_cb_qp_offset) ||
-+                !qp_offset_valid(s->ps.pps->cb_qp_offset + sh->slice_cb_qp_offset) ||
-+                !qp_offset_valid(sh->slice_cr_qp_offset) ||
-+                !qp_offset_valid(s->ps.pps->cr_qp_offset + sh->slice_cr_qp_offset))
-+            {
-+                av_log(s->avctx, AV_LOG_ERROR, "Bad chroma offset (pps:%d/%d; slice=%d/%d\n",
-+                       sh->slice_cr_qp_offset, sh->slice_cr_qp_offset,
-+                       s->ps.pps->cb_qp_offset, s->ps.pps->cr_qp_offset);
-+                return AVERROR_INVALIDDATA;
-+            }
-+        } else
-+        {
-+            sh->slice_cb_qp_offset = 0;
-+            sh->slice_cr_qp_offset = 0;
-+        }
-+
-+        if (s->ps.pps->chroma_qp_offset_list_enabled_flag)
-+            sh->cu_chroma_qp_offset_enabled_flag = get_bits1(gb);
-+        else
-+            sh->cu_chroma_qp_offset_enabled_flag = 0;
-+
-+        if (s->ps.pps->deblocking_filter_control_present_flag) {
-+            int deblocking_filter_override_flag = 0;
-+
-+            if (s->ps.pps->deblocking_filter_override_enabled_flag)
-+                deblocking_filter_override_flag = get_bits1(gb);
-+
-+            if (deblocking_filter_override_flag) {
-+                sh->disable_deblocking_filter_flag = get_bits1(gb);
-+                if (!sh->disable_deblocking_filter_flag) {
-+                    int beta_offset_div2 = get_se_golomb(gb);
-+                    int tc_offset_div2   = get_se_golomb(gb) ;
-+                    if (beta_offset_div2 < -6 || beta_offset_div2 > 6 ||
-+                        tc_offset_div2   < -6 || tc_offset_div2   > 6) {
-+                        av_log(s->avctx, AV_LOG_ERROR,
-+                            "Invalid deblock filter offsets: %d, %d\n",
-+                            beta_offset_div2, tc_offset_div2);
-+                        return AVERROR_INVALIDDATA;
-+                    }
-+                    sh->beta_offset = beta_offset_div2 * 2;
-+                    sh->tc_offset   =   tc_offset_div2 * 2;
-+                }
-+            } else {
-+                sh->disable_deblocking_filter_flag = s->ps.pps->disable_dbf;
-+                sh->beta_offset                    = s->ps.pps->beta_offset;
-+                sh->tc_offset                      = s->ps.pps->tc_offset;
-+            }
-+        } else {
-+            sh->disable_deblocking_filter_flag = 0;
-+            sh->beta_offset                    = 0;
-+            sh->tc_offset                      = 0;
-+        }
-+
-+        if (s->ps.pps->seq_loop_filter_across_slices_enabled_flag &&
-+            (sh->slice_sample_adaptive_offset_flag[0] ||
-+             sh->slice_sample_adaptive_offset_flag[1] ||
-+             !sh->disable_deblocking_filter_flag)) {
-+            sh->slice_loop_filter_across_slices_enabled_flag = get_bits1(gb);
-+        } else {
-+            sh->slice_loop_filter_across_slices_enabled_flag = s->ps.pps->seq_loop_filter_across_slices_enabled_flag;
-+        }
-+        sh->no_dblk_boundary_flags =
-+            (sh->slice_loop_filter_across_slices_enabled_flag ? 0 :
-+                BOUNDARY_UPPER_SLICE | BOUNDARY_LEFT_SLICE) |
-+            (s->ps.pps->loop_filter_across_tiles_enabled_flag ? 0 :
-+                BOUNDARY_UPPER_TILE | BOUNDARY_LEFT_TILE);
-+
-+
-+    } else if (!s->slice_initialized) {
-+        av_log(s->avctx, AV_LOG_ERROR, "Independent slice segment missing.\n");
-+        return AVERROR_INVALIDDATA;
-+    }
-+
-+    sh->num_entry_point_offsets = 0;
-+    sh->offload_wpp = 0;
-+    sh->offload_tiles = 0;
-+
-+    if (s->ps.pps->tiles_enabled_flag || s->ps.pps->entropy_coding_sync_enabled_flag) {
-+        unsigned num_entry_point_offsets = get_ue_golomb_long(gb);
-+        // It would be possible to bound this tighter but this here is simpler
-+        if (num_entry_point_offsets > get_bits_left(gb)) {
-+            av_log(s->avctx, AV_LOG_ERROR, "num_entry_point_offsets %d is invalid\n", num_entry_point_offsets);
-+            return AVERROR_INVALIDDATA;
-+        }
-+
-+        sh->num_entry_point_offsets = num_entry_point_offsets;
-+        if (sh->num_entry_point_offsets > 0) {
-+            int offset_len = get_ue_golomb_long(gb) + 1;
-+
-+            if (offset_len < 1 || offset_len > 32) {
-+                sh->num_entry_point_offsets = 0;
-+                av_log(s->avctx, AV_LOG_ERROR, "offset_len %d is invalid\n", offset_len);
-+                return AVERROR_INVALIDDATA;
-+            }
-+
-+            if ((ret = alloc_entry_points(sh, sh->num_entry_point_offsets)) < 0)
-+            {
-+                av_log(s->avctx, AV_LOG_ERROR, "Failed to allocate memory\n");
-+                return ret;
-+            }
-+
-+            for (i = 0; i < sh->num_entry_point_offsets; i++) {
-+                uint32_t val_minus1 = get_bits_long(gb, offset_len);
-+                if (val_minus1 > (1 << 28))
-+                {
-+                    // We can declare offsets of > 2^28 bad without loss of generality
-+                    // Will check actual bounds wrt NAL later, but this keeps
-+                    // the values within bounds we can deal with easily
-+                    av_log(s->avctx, AV_LOG_ERROR, "entry_point_offset_minus1 %d invalid\n", val_minus1);
-+                    return AVERROR_INVALIDDATA;
-+                }
-+                sh->entry_point_offset[i] = val_minus1 + 1; // +1 to get the size
-+            }
-+
-+            // Do we want to offload this
-+            if (s->threads_type != 0)
-+            {
-+                sh->offload_tiles = (!s->ps.pps->tile_wpp_inter_disable || sh->slice_type == HEVC_SLICE_I) &&
-+                    s->ps.pps->num_tile_columns > 1;
-+                // * We only cope with WPP in a single column
-+                //   Probably want to deal with that case as tiles rather than WPP anyway
-+                // ?? Not actually sure that the main code deals with WPP + multi-col correctly
-+                sh->offload_wpp = s->ps.pps->entropy_coding_sync_enabled_flag &&
-+                    s->ps.pps->num_tile_columns == 1;
-+            }
-+        }
-+    }
-+
-+    if (s->ps.pps->slice_header_extension_present_flag) {
-+        unsigned int length = get_ue_golomb_long(gb);
-+        if (length*8LL > get_bits_left(gb)) {
-+            av_log(s->avctx, AV_LOG_ERROR, "too many slice_header_extension_data_bytes\n");
-+            return AVERROR_INVALIDDATA;
-+        }
-+        for (i = 0; i < length; i++)
-+            skip_bits(gb, 8);  // slice_header_extension_data_byte
-+    }
-+
-+    // Inferred parameters
-+    sh->slice_qp = 26U + s->ps.pps->pic_init_qp_minus26 + sh->slice_qp_delta;
-+    if (sh->slice_qp > 51 ||
-+        sh->slice_qp < -s->ps.sps->qp_bd_offset) {
-+        av_log(s->avctx, AV_LOG_ERROR,
-+               "The slice_qp %d is outside the valid range "
-+               "[%d, 51].\n",
-+               sh->slice_qp,
-+               -s->ps.sps->qp_bd_offset);
-+        return AVERROR_INVALIDDATA;
-+    }
-+
-+    if (get_bits_left(gb) < 0) {
-+        av_log(s->avctx, AV_LOG_ERROR,
-+               "Overread slice header by %d bits\n", -get_bits_left(gb));
-+        return AVERROR_INVALIDDATA;
-+    }
-+
-+    s->slice_initialized = 1;
-+    return 0;
-+}
-+
-+static void hls_sao_param(const HEVCRpiContext *s, HEVCRpiLocalContext * const lc, const int rx, const int ry)
-+{
-+    RpiSAOParams * const sao = s->sao + rx + ry * s->ps.sps->ctb_width;
-+    int c_idx, i;
-+
-+    if (s->sh.slice_sample_adaptive_offset_flag[0] ||
-+        s->sh.slice_sample_adaptive_offset_flag[1]) {
-+        if ((lc->ctb_avail & AVAIL_L) != 0)
-+        {
-+            const int sao_merge_left_flag = ff_hevc_rpi_sao_merge_flag_decode(lc);
-+            if (sao_merge_left_flag) {
-+                *sao = sao[-1];
-+                return;
-+            }
-+        }
-+        if ((lc->ctb_avail & AVAIL_U) != 0)
-+        {
-+            const int sao_merge_up_flag = ff_hevc_rpi_sao_merge_flag_decode(lc);
-+            if (sao_merge_up_flag) {
-+                *sao = sao[-(int)s->ps.sps->ctb_width];
-+                return;
-+            }
-+        }
-+    }
-+
-+    for (c_idx = 0; c_idx < (ctx_cfmt(s) != 0 ? 3 : 1); c_idx++) {
-+        const unsigned int log2_sao_offset_scale = c_idx == 0 ? s->ps.pps->log2_sao_offset_scale_luma :
-+                                                 s->ps.pps->log2_sao_offset_scale_chroma;
-+        int offset_abs[4];
-+        char offset_sign[4] = {0};
-+
-+        if (!s->sh.slice_sample_adaptive_offset_flag[c_idx]) {
-+            sao->type_idx[c_idx] = SAO_NOT_APPLIED;
-+            continue;
-+        }
-+
-+        if (c_idx == 2) {
-+            sao->type_idx[2] = sao->type_idx[1];
-+            sao->eo_class[2] = sao->eo_class[1];
-+        } else {
-+            sao->type_idx[c_idx] = ff_hevc_rpi_sao_type_idx_decode(lc);
-+        }
-+
-+        // ** Could use BY22 here quite plausibly - this is all bypass stuff
-+        //    though only per CTB so not very timing critical
-+
-+        if (sao->type_idx[c_idx] == SAO_NOT_APPLIED)
-+            continue;
-+
-+        for (i = 0; i < 4; i++)
-+            offset_abs[i] = ff_hevc_rpi_sao_offset_abs_decode(s, lc);
-+
-+        if (sao->type_idx[c_idx] == SAO_BAND) {
-+            for (i = 0; i < 4; i++) {
-+                if (offset_abs[i] != 0)
-+                    offset_sign[i] = ff_hevc_rpi_sao_offset_sign_decode(lc);
-+            }
-+            sao->band_position[c_idx] = ff_hevc_rpi_sao_band_position_decode(lc);
-+        } else if (c_idx != 2) {
-+            sao->eo_class[c_idx] = ff_hevc_rpi_sao_eo_class_decode(lc);
-+        }
-+
-+        // Inferred parameters
-+        sao->offset_val[c_idx][0] = 0;
-+        for (i = 0; i < 4; i++) {
-+            sao->offset_val[c_idx][i + 1] = offset_abs[i] << log2_sao_offset_scale;
-+            if (sao->type_idx[c_idx] == SAO_EDGE) {
-+                if (i > 1)
-+                    sao->offset_val[c_idx][i + 1] = -sao->offset_val[c_idx][i + 1];
-+            } else if (offset_sign[i]) {
-+                sao->offset_val[c_idx][i + 1] = -sao->offset_val[c_idx][i + 1];
-+            }
-+        }
-+    }
-+}
-+
-+#if 0
-+static int hls_cross_component_pred(HEVCRpiLocalContext * const lc, const int idx) {
-+    int log2_res_scale_abs_plus1 = ff_hevc_rpi_log2_res_scale_abs(lc, idx);  // 0..4
-+
-+    if (log2_res_scale_abs_plus1 !=  0) {
-+        int res_scale_sign_flag = ff_hevc_rpi_res_scale_sign_flag(lc, idx);
-+        lc->tu.res_scale_val = (1 << (log2_res_scale_abs_plus1 - 1)) *
-+                               (1 - 2 * res_scale_sign_flag);
-+    } else {
-+        lc->tu.res_scale_val = 0;
-+    }
-+
-+
-+    return 0;
-+}
-+#endif
-+
-+static inline HEVCPredCmd * rpi_new_intra_cmd(HEVCRpiJob * const jb)
-+{
-+    return jb->intra.cmds + jb->intra.n++;
-+}
-+
-+#define A0(x, y, U, L, UL, UR, DL) \
-+    [(x)+(y)*16] = (((U) ? AVAIL_U : 0) | ((L) ? AVAIL_L : 0) | ((UL) ? AVAIL_UL : 0) | ((UR) ? AVAIL_UR : 0) | ((DL) ? AVAIL_DL : 0))
-+
-+#define A1(x, y, U, L, UL, UR, DL) \
-+    A0((x) + 0, (y) + 0, (U),  (L),  (UL), (U),  (L) ),  A0((x) + 1, (y) + 0, (U),   1,   (U),  (UR),  0  ),\
-+    A0((x) + 0, (y) + 1,  1,   (L),  (L),   1,   (DL)),  A0((x) + 1, (y) + 1,  1,    1,    1,    0,    0  )
-+
-+#define A2(x, y, U, L, UL, UR, DL) \
-+    A1((x) + 0, (y) + 0, (U),  (L),  (UL), (U),  (L) ),  A1((x) + 2, (y) + 0, (U),   1,   (U),  (UR),  0  ),\
-+    A1((x) + 0, (y) + 2,  1,   (L),  (L),   1,   (DL)),  A1((x) + 2, (y) + 2,  1,    1,    1,    0,    0  )
-+
-+#define A3(x, y, U, L, UL, UR, DL) \
-+    A2((x) + 0, (y) + 0, (U),  (L),  (UL), (U),  (L) ),  A2((x) + 4, (y) + 0, (U),   1,   (U),  (UR),  0  ),\
-+    A2((x) + 0, (y) + 4,  1,   (L),  (L),   1,   (DL)),  A2((x) + 4, (y) + 4,  1,    1,    1,    0,    0  )
-+
-+#define A4(x, y, U, L, UL, UR, DL) \
-+    A3((x) + 0, (y) + 0, (U),  (L),  (UL), (U),  (L) ),  A3((x) + 8, (y) + 0, (U),   1,   (U),  (UR),  0  ),\
-+    A3((x) + 0, (y) + 8,  1,   (L),  (L),   1,   (DL)),  A3((x) + 8, (y) + 8,  1,    1,    1,    0,    0  )
-+
-+static const uint8_t tb_flags[16 * 16] = {A4(0, 0, 0, 0, 0, 0, 0)};
-+
-+unsigned int ff_hevc_rpi_tb_avail_flags(
-+    const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc,
-+    const unsigned int x, const unsigned int y, const unsigned int w, const unsigned int h)
-+{
-+    const unsigned int ctb_mask = ~0U << s->ps.sps->log2_ctb_size;
-+    const unsigned int tb_x = x & ~ctb_mask;
-+    const unsigned int tb_y = y & ~ctb_mask;
-+    const unsigned int ctb_avail = lc->ctb_avail;
-+
-+    const uint8_t * const tb_f = tb_flags + (tb_x >> 2) + (tb_y >> 2) * 16;
-+
-+    unsigned int f = (ctb_avail | tb_f[0]) & (AVAIL_L | AVAIL_U | AVAIL_UL);
-+
-+    // This deals with both the U & L edges
-+    if ((tb_x | tb_y) != 0 && (~f & (AVAIL_L | AVAIL_U)) == 0)
-+        f |= AVAIL_UL;
-+
-+    if (x + w < lc->end_of_ctb_x)
-+        f |= (tb_y == 0 ? ctb_avail >> (AVAIL_S_U - AVAIL_S_UR) : tb_f[(w - 1) >> 2]) & AVAIL_UR;
-+    else if (tb_y == 0)
-+        f |= (ctb_avail & AVAIL_UR);
-+#if AVAIL_S_U - AVAIL_S_UR < 0
-+#error Shift problem
-+#endif
-+
-+    // Never any D if Y beyond eoctb
-+    if (y + h < lc->end_of_ctb_y)
-+        f |= (tb_x == 0 ? ctb_avail << (AVAIL_S_DL - AVAIL_S_L) : tb_f[((h - 1) >> 2) * 16]) & AVAIL_DL;
-+#if AVAIL_S_DL - AVAIL_S_L < 0
-+#error Shift problem
-+#endif
-+
-+//    printf("(%#x, %#x): %dx%d ca=%02x, ful=%02x, ftr=%02x, fdl=%02x, eox=%#x, eoy=%#x\n", x, y, w, h,
-+//           lc->ctb_avail, tb_f[0], tb_f[(w - 1) >> 2], tb_f[((h - 1) >> 2) * 16],
-+//           lc->end_of_ctb_x, lc->end_of_ctb_y);
-+
-+    return f;
-+}
-+
-+#undef A0
-+#undef A1
-+#undef A2
-+#undef A3
-+#undef A4
-+
-+static void do_intra_pred(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int log2_trafo_size, int x0, int y0, int c_idx,
-+                          unsigned int avail)
-+{
-+    // If rpi_enabled then sand - U & V done on U call
-+    if (c_idx <= 1)
-+    {
-+        HEVCPredCmd *const cmd = rpi_new_intra_cmd(lc->jb0);
-+        cmd->type = RPI_PRED_INTRA + c_idx;
-+        cmd->size = log2_trafo_size;
-+        cmd->avail = avail;
-+        cmd->i_pred.x = x0;
-+        cmd->i_pred.y = y0;
-+        cmd->i_pred.mode = c_idx ? lc->tu.intra_pred_mode_c :  lc->tu.intra_pred_mode;
-+
-+//        printf("(%#x, %#x) c_idx=%d, s=%d, a=%#x\n", x0, y0, c_idx, 1 << log2_trafo_size, avail);
-+    }
-+}
-+
-+#define CBF_CB0_S 0
-+#define CBF_CB1_S 1 // CB1 must be CB0 + 1
-+#define CBF_CR0_S 2
-+#define CBF_CR1_S 3
-+
-+#define CBF_CB0 (1 << CBF_CB0_S)
-+#define CBF_CR0 (1 << CBF_CR0_S)
-+#define CBF_CB1 (1 << CBF_CB1_S)
-+#define CBF_CR1 (1 << CBF_CR1_S)
-+
-+// * Only good for chroma_idx == 1
-+static int hls_transform_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
-+                              const unsigned int x0, const unsigned int y0,
-+                              const unsigned int log2_cb_size, const unsigned int log2_trafo_size,
-+                              const unsigned int blk_idx, const int cbf_luma,
-+                              const unsigned int cbf_chroma)
-+{
-+    const unsigned int log2_trafo_size_c = FFMAX(2, log2_trafo_size - 1);
-+    const unsigned int x0_c = x0 & ~7;
-+    const unsigned int y0_c = y0 & ~7;
-+
-+    enum ScanType scan_idx   = SCAN_DIAG;
-+    enum ScanType scan_idx_c = SCAN_DIAG;
-+
-+    if (lc->cu.pred_mode == MODE_INTRA)
-+    {
-+        const unsigned int trafo_size = 1 << log2_trafo_size;
-+        const unsigned int avail = ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, trafo_size, trafo_size);
-+
-+        do_intra_pred(s, lc, log2_trafo_size, x0, y0, 0, avail);
-+
-+        if (log2_trafo_size > 2)
-+            do_intra_pred(s, lc, log2_trafo_size_c, x0_c, y0_c, 1, avail);
-+        else if (blk_idx == 3)
-+            do_intra_pred(s, lc, log2_trafo_size_c, x0_c, y0_c, 1,
-+                          ff_hevc_rpi_tb_avail_flags(s, lc, x0_c, y0_c, 8, 8));
-+
-+        if (log2_trafo_size < 4) {
-+            if (lc->tu.intra_pred_mode >= 6 &&
-+                lc->tu.intra_pred_mode <= 14) {
-+                scan_idx = SCAN_VERT;
-+            } else if (lc->tu.intra_pred_mode >= 22 &&
-+                       lc->tu.intra_pred_mode <= 30) {
-+                scan_idx = SCAN_HORIZ;
-+            }
-+
-+            if (lc->tu.intra_pred_mode_c >=  6 &&
-+                lc->tu.intra_pred_mode_c <= 14) {
-+                scan_idx_c = SCAN_VERT;
-+            } else if (lc->tu.intra_pred_mode_c >= 22 &&
-+                       lc->tu.intra_pred_mode_c <= 30) {
-+                scan_idx_c = SCAN_HORIZ;
-+            }
-+        }
-+    }
-+
-+    if (!cbf_luma && cbf_chroma == 0)
-+        return 0;
-+
-+    if (lc->tu.is_cu_qp_delta_wanted)
-+    {
-+        const int qp_delta = ff_hevc_rpi_cu_qp_delta(lc);
-+        const unsigned int cb_mask = ~0U << log2_cb_size;
-+
-+        if (qp_delta < -(26 + (s->ps.sps->qp_bd_offset >> 1)) ||
-+            qp_delta >  (25 + (s->ps.sps->qp_bd_offset >> 1)))
-+        {
-+            av_log(s->avctx, AV_LOG_ERROR,
-+                   "The cu_qp_delta %d is outside the valid range "
-+                   "[%d, %d].\n",
-+                   qp_delta,
-+                   -(26 + (s->ps.sps->qp_bd_offset >> 1)),
-+                    (25 + (s->ps.sps->qp_bd_offset >> 1)));
-+            return AVERROR_INVALIDDATA;
-+        }
-+
-+        lc->tu.is_cu_qp_delta_wanted = 0;
-+        lc->tu.cu_qp_delta = qp_delta;
-+        ff_hevc_rpi_set_qPy(s, lc, x0 & cb_mask, y0 & cb_mask);
-+    }
-+
-+    // * Not main profile & untested due to no conform streams
-+    if (lc->tu.cu_chroma_qp_offset_wanted && cbf_chroma &&
-+        !lc->cu.cu_transquant_bypass_flag) {
-+        int cu_chroma_qp_offset_flag = ff_hevc_rpi_cu_chroma_qp_offset_flag(lc);
-+        if (cu_chroma_qp_offset_flag) {
-+            int cu_chroma_qp_offset_idx  = 0;
-+            if (s->ps.pps->chroma_qp_offset_list_len_minus1 > 0) {
-+                cu_chroma_qp_offset_idx = ff_hevc_rpi_cu_chroma_qp_offset_idx(s, lc);
-+            }
-+            lc->tu.qp_divmod6[1] += s->ps.pps->cb_qp_offset_list[cu_chroma_qp_offset_idx];
-+            lc->tu.qp_divmod6[2] += s->ps.pps->cr_qp_offset_list[cu_chroma_qp_offset_idx];
-+        }
-+        lc->tu.cu_chroma_qp_offset_wanted = 0;
-+    }
-+
-+    if (cbf_luma)
-+        ff_hevc_rpi_hls_residual_coding(s, lc, x0, y0, log2_trafo_size, scan_idx, 0);
-+
-+    if (log2_trafo_size > 2 || blk_idx == 3)
-+    {
-+        if ((cbf_chroma & CBF_CB0) != 0)
-+            ff_hevc_rpi_hls_residual_coding(s, lc, x0_c, y0_c,
-+                                        log2_trafo_size_c, scan_idx_c, 1);
-+        if ((cbf_chroma & CBF_CR0) != 0)
-+            ff_hevc_rpi_hls_residual_coding(s, lc, x0_c, y0_c,
-+                                        log2_trafo_size_c, scan_idx_c, 2);
-+    }
-+
-+    return 0;
-+}
-+
-+static inline void set_deblocking_bypass(const HEVCRpiContext * const s, const int x0, const int y0, const int log2_cb_size)
-+{
-+    set_bits(s->is_pcm + (y0 >> 3) * s->ps.sps->pcm_width, x0 >> 3, s->ps.sps->pcm_width, log2_cb_size - 3);
-+}
-+
-+
-+static int hls_transform_tree(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
-+                              const unsigned int x0, const unsigned int y0,
-+                              const unsigned int log2_trafo_size,
-+                              const unsigned int trafo_depth, const unsigned int blk_idx,
-+                              const unsigned int cbf_c0)
-+{
-+    // When trafo_size == 2 hls_transform_unit uses c0 so put in c1
-+    unsigned int cbf_c1 = cbf_c0;
-+    int split_transform_flag;
-+    int ret;
-+
-+    if (lc->cu.intra_split_flag) {
-+        if (trafo_depth == 1) {
-+            lc->tu.intra_pred_mode   = lc->pu.intra_pred_mode[blk_idx];
-+            if (ctx_cfmt(s) == 3) {
-+                lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[blk_idx];
-+                lc->tu.chroma_mode_c     = lc->pu.chroma_mode_c[blk_idx];
-+            } else {
-+                lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[0];
-+                lc->tu.chroma_mode_c     = lc->pu.chroma_mode_c[0];
-+            }
-+        }
-+    } else {
-+        lc->tu.intra_pred_mode   = lc->pu.intra_pred_mode[0];
-+        lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[0];
-+        lc->tu.chroma_mode_c     = lc->pu.chroma_mode_c[0];
-+    }
-+
-+    if (log2_trafo_size <= s->ps.sps->log2_max_trafo_size &&
-+        log2_trafo_size >  s->ps.sps->log2_min_tb_size    &&
-+        trafo_depth     < lc->cu.max_trafo_depth       &&
-+        !(lc->cu.intra_split_flag && trafo_depth == 0))
-+    {
-+        split_transform_flag = ff_hevc_rpi_split_transform_flag_decode(lc, log2_trafo_size);
-+    } else {
-+        int inter_split = s->ps.sps->max_transform_hierarchy_depth_inter == 0 &&
-+                          lc->cu.pred_mode == MODE_INTER &&
-+                          lc->cu.part_mode != PART_2Nx2N &&
-+                          trafo_depth == 0;
-+
-+        split_transform_flag = log2_trafo_size > s->ps.sps->log2_max_trafo_size ||
-+                               (lc->cu.intra_split_flag && trafo_depth == 0) ||
-+                               inter_split;
-+    }
-+
-+    if (log2_trafo_size > 2 || ctx_cfmt(s) == 3)
-+    {
-+        const int wants_c1 = ctx_cfmt(s) == 2 && (!split_transform_flag || log2_trafo_size == 3);
-+        cbf_c1 = 0;
-+
-+        if ((cbf_c0 & CBF_CB0) != 0)
-+        {
-+            cbf_c1 = ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CB0_S;
-+            if (wants_c1)
-+                cbf_c1 |= ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CB1_S;
-+        }
-+
-+        if ((cbf_c0 & CBF_CR0) != 0)
-+        {
-+            cbf_c1 |= ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CR0_S;
-+            if (wants_c1)
-+                cbf_c1 |= ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CR1_S;
-+        }
-+    }
-+
-+    if (split_transform_flag) {
-+        const int trafo_size_split = 1 << (log2_trafo_size - 1);
-+        const int x1 = x0 + trafo_size_split;
-+        const int y1 = y0 + trafo_size_split;
-+
-+#define SUBDIVIDE(x, y, idx)                                                    \
-+do {                                                                            \
-+    ret = hls_transform_tree(s, lc, x, y,                                       \
-+                             log2_trafo_size - 1, trafo_depth + 1, idx,         \
-+                             cbf_c1);                                           \
-+    if (ret < 0)                                                                \
-+        return ret;                                                             \
-+} while (0)
-+
-+        SUBDIVIDE(x0, y0, 0);
-+        SUBDIVIDE(x1, y0, 1);
-+        SUBDIVIDE(x0, y1, 2);
-+        SUBDIVIDE(x1, y1, 3);
-+
-+#undef SUBDIVIDE
-+    } else {
-+        // If trafo_size == 2 then we should have cbf_c == 0 here but as we can't have
-+        // trafo_size == 2 with depth == 0 the issue is moot
-+        const int cbf_luma = ((lc->cu.pred_mode != MODE_INTRA && trafo_depth == 0 && cbf_c1 == 0) ||
-+            ff_hevc_rpi_cbf_luma_decode(lc, trafo_depth));
-+
-+        ret = hls_transform_unit(s, lc, x0, y0,
-+                                 log2_trafo_size + trafo_depth, log2_trafo_size,
-+                                 blk_idx, cbf_luma, cbf_c1);
-+        if (ret < 0)
-+            return ret;
-+
-+        if (!s->sh.disable_deblocking_filter_flag) {
-+            ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_trafo_size, cbf_luma);
-+        }
-+    }
-+    return 0;
-+}
-+
-+
-+static int pcm_extract(const HEVCRpiContext * const s, const uint8_t * pcm, const int length, const int x0, const int y0, const int cb_size)
-+{
-+    GetBitContext gb;
-+    int ret;
-+
-+    ret = init_get_bits(&gb, pcm, length);
-+    if (ret < 0)
-+        return ret;
-+
-+    s->hevcdsp.put_pcm(av_rpi_sand_frame_pos_y(s->frame, x0, y0),
-+                       frame_stride1(s->frame, 0),
-+                       cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth);
-+
-+    s->hevcdsp.put_pcm_c(av_rpi_sand_frame_pos_c(s->frame, x0 >> ctx_hshift(s, 1), y0 >> ctx_vshift(s, 1)),
-+                       s->frame->linesize[1],
-+                       cb_size >> ctx_hshift(s, 1),
-+                       cb_size >> ctx_vshift(s, 1),
-+                       &gb, s->ps.sps->pcm.bit_depth_chroma);
-+
-+    return 0;
-+}
-+
-+
-+// x * 2^(y*2)
-+static inline unsigned int xyexp2(const unsigned int x, const unsigned int y)
-+{
-+    return x << (y * 2);
-+}
-+
-+static int hls_pcm_sample(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int x0, const int y0, unsigned int log2_cb_size)
-+{
-+    // Length in bits
-+    const unsigned int length = xyexp2(s->ps.sps->pcm.bit_depth, log2_cb_size) +
-+        xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - ctx_vshift(s, 1)) +
-+        xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - ctx_vshift(s, 2));
-+
-+    const uint8_t * const pcm = ff_hevc_rpi_cabac_skip_bytes(&lc->cc, (length + 7) >> 3);
-+
-+    if (!s->sh.disable_deblocking_filter_flag)
-+        ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size, 0);
-+
-+    // Copy coeffs
-+    {
-+        const int blen = (length + 7) >> 3;
-+        // Round allocated bytes up to nearest 32 to avoid alignment confusion
-+        // Allocation is in int16_t s
-+        // As we are only using 1 byte per sample and the coeff buffer allows 2 per
-+        // sample this rounding doesn't affect the total size we need to allocate for
-+        // the coeff buffer
-+        int16_t * const coeffs = rpi_alloc_coeff_buf(lc->jb0, 0, ((blen + 31) & ~31) >> 1);
-+        memcpy(coeffs, pcm, blen);
-+
-+        // Our coeff stash assumes that any partially allocated 64byte lump
-+        // is zeroed so make that true.
-+        {
-+            uint8_t * const eopcm = (uint8_t *)coeffs + blen;
-+            if ((-(intptr_t)eopcm & 63) != 0)
-+                memset(eopcm, 0, -(intptr_t)eopcm & 63);
-+        }
-+
-+        // Add command
-+        {
-+            HEVCPredCmd *const cmd = rpi_new_intra_cmd(lc->jb0);
-+            cmd->type = RPI_PRED_I_PCM;
-+            cmd->size = log2_cb_size;
-+            cmd->i_pcm.src = coeffs;
-+            cmd->i_pcm.x = x0;
-+            cmd->i_pcm.y = y0;
-+            cmd->i_pcm.src_len = length;
-+        }
-+        return 0;
-+    }
-+}
-+
-+
-+static void hevc_await_progress(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const HEVCRpiFrame * const ref,
-+                                const MvXY xy, const int y0, const int height)
-+{
-+    if (s->threads_type != 0) {
-+        const int y = FFMAX(0, (MV_Y(xy) >> 2) + y0 + height + 9);
-+
-+        // Progress has to be attached to current job as the actual wait
-+        // is in worker_core which can't use lc
-+        int16_t *const pr = lc->jb0->progress_req + ref->dpb_no;
-+        if (*pr < y) {
-+            *pr = y;
-+        }
-+    }
-+}
-+
-+static void hevc_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
-+                                  const int x0, const int y0, const int nPbW,
-+                                  const int nPbH,
-+                                  HEVCRpiMvField * const mv)
-+{
-+    enum InterPredIdc inter_pred_idc = PRED_L0;
-+    int mvp_flag;
-+    const unsigned int avail = ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, nPbW, nPbH);
-+
-+    mv->pred_flag = 0;
-+    if (s->sh.slice_type == HEVC_SLICE_B)
-+        inter_pred_idc = ff_hevc_rpi_inter_pred_idc_decode(lc, nPbW, nPbH);
-+
-+    if (inter_pred_idc != PRED_L1) {
-+        MvXY mvd;
-+
-+        if (s->sh.nb_refs[L0])
-+            mv->ref_idx[0]= ff_hevc_rpi_ref_idx_lx_decode(lc, s->sh.nb_refs[L0]);
-+
-+        mv->pred_flag = PF_L0;
-+        mvd = ff_hevc_rpi_hls_mvd_coding(lc);
-+        mvp_flag = ff_hevc_rpi_mvp_lx_flag_decode(lc);
-+        ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, avail,
-+                                 mv, mvp_flag, 0);
-+        mv->xy[0] = mvxy_add(mv->xy[0], mvd);
-+    }
-+
-+    if (inter_pred_idc != PRED_L0) {
-+        MvXY mvd = 0;
-+
-+        if (s->sh.nb_refs[L1])
-+            mv->ref_idx[1] = ff_hevc_rpi_ref_idx_lx_decode(lc, s->sh.nb_refs[L1]);
-+
-+        if (s->sh.mvd_l1_zero_flag != 1 || inter_pred_idc != PRED_BI)
-+            mvd = ff_hevc_rpi_hls_mvd_coding(lc);
-+
-+        mv->pred_flag += PF_L1;
-+        mvp_flag = ff_hevc_rpi_mvp_lx_flag_decode(lc);
-+        ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, avail,
-+                                 mv, mvp_flag, 1);
-+        mv->xy[1] = mvxy_add(mv->xy[1], mvd);
-+    }
-+}
-+
-+
-+static HEVCRpiInterPredQ *
-+rpi_nxt_pred(HEVCRpiInterPredEnv * const ipe, const unsigned int load_val, const uint32_t fn)
-+{
-+    HEVCRpiInterPredQ * yp = NULL;
-+    HEVCRpiInterPredQ * ypt = ipe->q + ipe->curr;
-+    const unsigned int max_fill = ipe->max_fill;
-+    unsigned int load = UINT_MAX;
-+
-+    for (unsigned int i = 0; i != ipe->n_grp; ++i, ++ypt) {
-+        // We will always have enough room between the Qs but if we are
-+        // running critically low due to poor scheduling then use fill size
-+        // rather than load to determine QPU.  This has obvious dire
-+        // performance implications but (a) it is better than crashing
-+        // and (b) it should (almost) never happen
-+        const unsigned int tfill = (char *)ypt->qpu_mc_curr - (char *)ypt->qpu_mc_base;
-+        const unsigned int tload = tfill > max_fill ? tfill + 0x1000000 : ypt->load;
-+
-+        if (tload < load)
-+        {
-+            yp = ypt;
-+            load = tload;
-+        }
-+    }
-+
-+    yp->load += load_val;
-+    ipe->used_grp = 1;
-+    qpu_mc_link_set(yp->qpu_mc_curr, fn);
-+
-+    return yp;
-+}
-+
-+
-+static void rpi_inter_pred_sync(HEVCRpiInterPredEnv * const ipe)
-+{
-+    for (unsigned int i = 0; i != ipe->n; ++i) {
-+        HEVCRpiInterPredQ * const q = ipe->q + i;
-+        const unsigned int qfill = (char *)q->qpu_mc_curr - (char *)q->qpu_mc_base;
-+
-+        qpu_mc_link_set(q->qpu_mc_curr, q->code_sync);
-+        q->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(&q->qpu_mc_curr->sync + 1);
-+        q->load = (qfill >> 7); // Have a mild preference for emptier Qs to balance memory usage
-+    }
-+}
-+
-+// Returns 0 on success
-+// We no longer check for Q fullness as wew have emergncy code in ctu alloc
-+// * However it might be an idea to have some means of spotting that we've used it
-+static int rpi_inter_pred_next_ctu(HEVCRpiInterPredEnv * const ipe)
-+{
-+    if (!ipe->used_grp)
-+        return 0;
-+
-+    if ((ipe->curr += ipe->n_grp) >= ipe->n)
-+    {
-+        ipe->curr = 0;
-+        rpi_inter_pred_sync(ipe);
-+    }
-+    ipe->used = 1;
-+    ipe->used_grp = 0;
-+
-+    return 0;
-+}
-+
-+static void rpi_inter_pred_reset(HEVCRpiInterPredEnv * const ipe)
-+{
-+    unsigned int i;
-+
-+    ipe->curr = 0;
-+    ipe->used = 0;
-+    ipe->used_grp = 0;
-+    for (i = 0; i != ipe->n; ++i) {
-+        HEVCRpiInterPredQ * const q = ipe->q + i;
-+        q->qpu_mc_curr = q->qpu_mc_base;
-+        q->load = 0;
-+        q->last_l0 = NULL;
-+        q->last_l1 = NULL;
-+    }
-+}
-+
-+static int rpi_inter_pred_alloc(HEVCRpiInterPredEnv * const ipe, // Zero *ipe, then allocate its Q array and its GPU buffer
-+                                 const unsigned int n_max, const unsigned int n_grp,
-+                                 const unsigned int total_size, const unsigned int min_gap)
-+{
-+    int rv; // 0 on success, otherwise whatever gpu_malloc_cached returned
-+
-+    memset(ipe, 0, sizeof(*ipe));
-+    if ((ipe->q = av_mallocz(n_max * sizeof(*ipe->q))) == NULL) // Room for n_max Qs; note ipe->n itself is not set here
-+        return AVERROR(ENOMEM);
-+
-+    ipe->n_grp = n_grp;
-+    ipe->min_gap = min_gap;
-+
-+    if ((rv = gpu_malloc_cached(total_size, &ipe->gptr)) != 0) // On failure give back the Q array so nothing leaks
-+        av_freep(&ipe->q);
-+    return rv;
-+}
-+
-+
-+#if RPI_QPU_EMU_Y // Luma: with RPI_QPU_EMU_Y set the "address" is the frame's data[0] pointer
-+#define get_mc_address_y(f) ((f)->data[0])
-+#else // Otherwise defer to get_vc_address_y()
-+#define get_mc_address_y(f) get_vc_address_y(f)
-+#endif
-+#if RPI_QPU_EMU_C // Chroma: with RPI_QPU_EMU_C set the "address" is the frame's data[1] pointer
-+#define get_mc_address_u(f) ((f)->data[1])
-+#else // Otherwise defer to get_vc_address_u()
-+#define get_mc_address_u(f) get_vc_address_u(f)
-+#endif
-+
-+static inline uint32_t pack_wo_p(const int off, const int mul) // Weight/offset word for the single-list (P) pred cmds
-+{
-+    return PACK2(off * 2 + 1, mul); // Offset is doubled and has 1 added before being packed with the weight
-+}
-+
-+static inline uint32_t pack_wo_b(const int off0, const int off1, const int mul) // Weight/offset word for the bi-pred (B) cmds
-+{
-+    return PACK2(off0 + off1 + 1, mul); // Sum of both list offsets plus 1, packed with one list's weight
-+}
-+
-+
-+static void
-+rpi_pred_y(const HEVCRpiContext *const s, HEVCRpiJob * const jb, // Queue luma single-list pred cmds covering one nPbW x nPbH block
-+           const int x0, const int y0,
-+           const int nPbW, const int nPbH,
-+           const MvXY mv_xy,
-+           const int weight_mul,
-+           const int weight_offset,
-+           AVFrame *const src_frame)
-+{
-+    const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0);
-+    const unsigned int mx          = MV_X(mv_xy) & 3; // Low 2 bits of the MV = fractional position
-+    const unsigned int my          = MV_Y(mv_xy) & 3;
-+    const unsigned int my_mx       = (my << 8) | mx;
-+    const uint32_t     my2_mx2_my_mx = (my_mx << 16) | my_mx; // Same fraction in both halves as both srcs share the MV
-+    const qpu_mc_src_addr_t src_vc_address_y = get_mc_address_y(src_frame);
-+    qpu_mc_dst_addr_t dst_addr = get_mc_address_y(s->frame) + y_off;
-+    const uint32_t wo = pack_wo_p(weight_offset, weight_mul);
-+    HEVCRpiInterPredEnv * const ipe = &jb->luma_ip;
-+    const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame); // Shift that converts an x offset into a dest address offset
-+
-+    if (my_mx == 0) // No fractional part: use the y_p00 code on blocks up to 16 wide
-+    {
-+        const int x1 = x0 + (MV_X(mv_xy) >> 2);
-+        const int y1 = y0 + (MV_Y(mv_xy) >> 2);
-+        const int bh = nPbH;
-+
-+        for (int start_x = 0; start_x < nPbW; start_x += 16)
-+        {
-+            const int bw = FFMIN(nPbW - start_x, 16);
-+            HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu.y_p00); // Pick a Q for this cmd (bh passed as its load)
-+            qpu_mc_src_t *const src1 = yp->last_l0; // Source slot left pending by the previous cmd in this Q
-+            qpu_mc_pred_y_p00_t *const cmd_y = &yp->qpu_mc_curr->y.p00;
-+
-+#if RPI_TSTATS
-+            {
-+                HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats; // Cast drops const so the counters can be bumped
-+                ++ts->y_pred1_x0y0;
-+
-+                if (nPbW > 8)
-+                    ++ts->y_pred1_wgt8;
-+                else
-+                    ++ts->y_pred1_wle8;
-+
-+                if (nPbH > 16)
-+                    ++ts->y_pred1_hgt16;
-+                else
-+                    ++ts->y_pred1_hle16;
-+            }
-+#endif
-+
-+            src1->x = x1 + start_x;
-+            src1->y = y1;
-+            src1->base = src_vc_address_y;
-+            cmd_y->w = bw;
-+            cmd_y->h = bh;
-+            cmd_y->wo1 = wo;
-+            cmd_y->dst_addr =  dst_addr + (start_x << xshl);
-+            yp->last_l0 = &cmd_y->next_src1; // This cmd's source slot will be filled in by the next cmd
-+            yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); // Advance the Q write pointer past this cmd
-+        }
-+    }
-+    else
-+    {
-+        const int x1_m3 = x0 + (MV_X(mv_xy) >> 2) - 3; // Integer MV position less 3
-+        const int y1_m3 = y0 + (MV_Y(mv_xy) >> 2) - 3;
-+        const unsigned int bh = nPbH;
-+        int start_x = 0;
-+
-+#if 1
-+        // As Y-pred operates on two independent 8-wide src blocks we can merge
-+        // this pred with the previous one if the previous one is 8 pel wide,
-+        // the same height as the current block, immediately to the left of our
-+        // current dest block and mono-pred.
-+
-+        qpu_mc_pred_y_p_t *const last_y8_p = jb->last_y8_p;
-+        if (last_y8_p != NULL && last_y8_p->h == bh && last_y8_p->dst_addr + (8 << xshl) == dst_addr)
-+        {
-+            const int bw = FFMIN(nPbW, 8);
-+            qpu_mc_src_t *const last_y8_src2 = jb->last_y8_l1; // 2nd source slot of the earlier 8-wide cmd
-+
-+            last_y8_src2->x = x1_m3;
-+            last_y8_src2->y = y1_m3;
-+            last_y8_src2->base = src_vc_address_y;
-+            last_y8_p->w += bw;
-+            last_y8_p->mymx21 = PACK2(my2_mx2_my_mx, last_y8_p->mymx21); // Combine our fraction with the earlier cmd's
-+            last_y8_p->wo2 = wo;
-+
-+            jb->last_y8_p = NULL; // Merge slot is now consumed
-+            jb->last_y8_l1 = NULL;
-+            start_x = bw; // The first bw columns are done; the loop below handles any remainder
-+#if RPI_TSTATS
-+            ++((HEVCRpiStats *)&s->tstats)->y_pred1_y8_merge;
-+#endif
-+        }
-+#endif
-+
-+        for (; start_x < nPbW; start_x += 16)
-+        {
-+            const int bw = FFMIN(nPbW - start_x, 16);
-+            HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu.y_pxx); // Load is bh + 7 for the filtered case
-+            qpu_mc_src_t *const src1 = yp->last_l0;
-+            qpu_mc_src_t *const src2 = yp->last_l1;
-+            qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p;
-+#if RPI_TSTATS
-+            {
-+                HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats;
-+                if (mx == 0 && my == 0)
-+                    ++ts->y_pred1_x0y0;
-+                else if (mx == 0)
-+                    ++ts->y_pred1_x0;
-+                else if (my == 0)
-+                    ++ts->y_pred1_y0;
-+                else
-+                    ++ts->y_pred1_xy;
-+
-+                if (nPbW > 8)
-+                    ++ts->y_pred1_wgt8;
-+                else
-+                    ++ts->y_pred1_wle8;
-+
-+                if (nPbH > 16)
-+                    ++ts->y_pred1_hgt16;
-+                else
-+                    ++ts->y_pred1_hle16;
-+            }
-+#endif
-+            src1->x = x1_m3 + start_x;
-+            src1->y = y1_m3;
-+            src1->base = src_vc_address_y;
-+            if (bw <= 8) // Only the first 8-wide source is needed so point the second at the dummy frame
-+            {
-+                src2->x = MC_DUMMY_X;
-+                src2->y = MC_DUMMY_Y;
-+#if RPI_QPU_EMU_Y
-+                src2->base = s->qpu_dummy_frame_emu;
-+#else
-+                src2->base = s->qpu_dummy_frame_qpu;
-+#endif
-+            }
-+            else
-+            {
-+                src2->x = x1_m3 + start_x + 8; // Second source is the right-hand 8 columns
-+                src2->y = y1_m3;
-+                src2->base = src_vc_address_y;
-+            }
-+            cmd_y->w = bw;
-+            cmd_y->h = bh;
-+            cmd_y->mymx21 = my2_mx2_my_mx;
-+            cmd_y->wo1 = wo;
-+            cmd_y->wo2 = wo;
-+            cmd_y->dst_addr =  dst_addr + (start_x << xshl);
-+            yp->last_l0 = &cmd_y->next_src1;
-+            yp->last_l1 = &cmd_y->next_src2;
-+            yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1);
-+
-+            if (bw == 8) { // Exactly 8 wide: remember it so a following block may merge into its spare half
-+                jb->last_y8_l1 = src2;
-+                jb->last_y8_p = cmd_y;
-+            }
-+        }
-+    }
-+}
-+
-+static void
-+rpi_pred_y_b(const HEVCRpiContext * const s, HEVCRpiJob * const jb, // Queue luma bi-pred cmds covering one nPbW x nPbH block
-+           const int x0, const int y0,
-+           const int nPbW, const int nPbH,
-+           const struct HEVCRpiMvField *const mv_field,
-+           const AVFrame *const src_frame,
-+           const AVFrame *const src_frame2)
-+{
-+    const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0);
-+    const MvXY mv  = mv_field->xy[0]; // List 0 MV, used with src_frame
-+    const MvXY mv2 = mv_field->xy[1]; // List 1 MV, used with src_frame2
-+
-+    const unsigned int mx          = MV_X(mv) & 3; // Low 2 bits of each MV component = fractional position
-+    const unsigned int my          = MV_Y(mv) & 3;
-+    const unsigned int my_mx = (my<<8) | mx;
-+    const unsigned int mx2          = MV_X(mv2) & 3;
-+    const unsigned int my2          = MV_Y(mv2) & 3;
-+    const unsigned int my2_mx2 = (my2<<8) | mx2;
-+    const uint32_t     my2_mx2_my_mx = (my2_mx2 << 16) | my_mx; // List 1 fraction in the top half, list 0 in the bottom
-+    const unsigned int ref_idx0 = mv_field->ref_idx[0];
-+    const unsigned int ref_idx1 = mv_field->ref_idx[1];
-+    const uint32_t wo1 = pack_wo_b(s->sh.luma_offset_l0[ref_idx0], s->sh.luma_offset_l1[ref_idx1], s->sh.luma_weight_l0[ref_idx0]);
-+    const uint32_t wo2 = pack_wo_b(s->sh.luma_offset_l0[ref_idx0], s->sh.luma_offset_l1[ref_idx1], s->sh.luma_weight_l1[ref_idx1]);
-+
-+    const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame);
-+    qpu_mc_dst_addr_t dst = get_mc_address_y(s->frame) + y_off;
-+    const qpu_mc_src_addr_t src1_base = get_mc_address_y(src_frame);
-+    const qpu_mc_src_addr_t src2_base = get_mc_address_y(src_frame2);
-+    HEVCRpiInterPredEnv * const ipe = &jb->luma_ip;
-+
-+    if (my2_mx2_my_mx == 0) // Neither MV has a fractional part: use the y_b00 code
-+    {
-+        const int x1 = x0 + (MV_X(mv) >> 2);
-+        const int y1 = y0 + (MV_Y(mv) >> 2);
-+        const int x2 = x0 + (MV_X(mv2) >> 2);
-+        const int y2 = y0 + (MV_Y(mv2) >> 2);
-+        const int bh = nPbH;
-+
-+        // Can do chunks a full 16 wide if we don't want the H filter
-+        for (int start_x=0; start_x < nPbW; start_x += 16)
-+        {
-+            HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu.y_b00);
-+            qpu_mc_src_t *const src1 = yp->last_l0; // Source slots left pending by the previous cmd in this Q
-+            qpu_mc_src_t *const src2 = yp->last_l1;
-+            qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p;
-+#if RPI_TSTATS
-+            {
-+                HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats;
-+                ++ts->y_pred2_x0y0;
-+
-+                if (nPbH > 16)
-+                    ++ts->y_pred2_hgt16;
-+                else
-+                    ++ts->y_pred2_hle16;
-+            }
-+#endif
-+            src1->x = x1 + start_x;
-+            src1->y = y1;
-+            src1->base = src1_base;
-+            src2->x = x2 + start_x;
-+            src2->y = y2;
-+            src2->base = src2_base;
-+            cmd_y->w = FFMIN(nPbW - start_x, 16);
-+            cmd_y->h = bh;
-+            cmd_y->mymx21 = 0;
-+            cmd_y->wo1 = wo1;
-+            cmd_y->wo2 = wo2;
-+            cmd_y->dst_addr =  dst + (start_x << xshl);
-+            yp->last_l0 = &cmd_y->next_src1; // This cmd's source slots will be filled in by the next cmd
-+            yp->last_l1 = &cmd_y->next_src2;
-+            yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); // Advance the Q write pointer past this cmd
-+        }
-+    }
-+    else
-+    {
-+        // Filter requires a run-up of 3
-+        const int x1 = x0 + (MV_X(mv) >> 2) - 3;
-+        const int y1 = y0 + (MV_Y(mv) >> 2) - 3;
-+        const int x2 = x0 + (MV_X(mv2) >> 2) - 3;
-+        const int y2 = y0 + (MV_Y(mv2) >> 2) - 3;
-+        const int bh = nPbH;
-+
-+        for (int start_x=0; start_x < nPbW; start_x += 8)
-+        { // B blocks work 8 at a time
-+            // B weights aren't doubled as the QPU code does the same
-+            // amount of work as it does for P
-+            HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu.y_bxx);
-+            qpu_mc_src_t *const src1 = yp->last_l0;
-+            qpu_mc_src_t *const src2 = yp->last_l1;
-+            qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p;
-+#if RPI_TSTATS
-+            {
-+                HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats;
-+                const unsigned int mmx = mx | mx2; // Zero only if both lists have no x fraction
-+                const unsigned int mmy = my | my2; // Zero only if both lists have no y fraction
-+                if (mmx == 0 && mmy == 0)
-+                    ++ts->y_pred2_x0y0;
-+                else if (mmx == 0)
-+                    ++ts->y_pred2_x0;
-+                else if (mmy == 0)
-+                    ++ts->y_pred2_y0;
-+                else
-+                    ++ts->y_pred2_xy;
-+
-+                if (nPbH > 16)
-+                    ++ts->y_pred2_hgt16;
-+                else
-+                    ++ts->y_pred2_hle16;
-+            }
-+#endif
-+            src1->x = x1 + start_x;
-+            src1->y = y1;
-+            src1->base = src1_base;
-+            src2->x = x2 + start_x;
-+            src2->y = y2;
-+            src2->base = src2_base;
-+            cmd_y->w = FFMIN(nPbW - start_x, 8);
-+            cmd_y->h = bh;
-+            cmd_y->mymx21 = my2_mx2_my_mx;
-+            cmd_y->wo1 = wo1;
-+            cmd_y->wo2 = wo2;
-+            cmd_y->dst_addr =  dst + (start_x << xshl);
-+            yp->last_l0 = &cmd_y->next_src1;
-+            yp->last_l1 = &cmd_y->next_src2;
-+            yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1);
-+        }
-+    }
-+}
-+
-+// Queue chroma single-list pred cmds for one block. h/v shifts fixed at one as that is all the qasm copes with
-+static void
-+rpi_pred_c(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
-+  const unsigned int lx, const int x0_c, const int y0_c, // lx: 0 = list 0, otherwise list 1; x0_c/y0_c in chroma samples
-+  const int nPbW_c, const int nPbH_c,
-+  const MvXY mv,
-+  const int16_t * const c_weights, // [0] = U weight, [1] = V weight
-+  const int16_t * const c_offsets, // [0] = U offset, [1] = V offset
-+  AVFrame * const src_frame)
-+{
-+    const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c);
-+    const int hshift = 1; // = s->ps.sps->hshift[1];
-+    const int vshift = 1; // = s->ps.sps->vshift[1];
-+
-+    const int x1_c = x0_c + (MV_X(mv) >> (2 + hshift)) - 1; // Integer part of the chroma MV, less 1
-+    const int y1_c = y0_c + (MV_Y(mv) >> (2 + vshift)) - 1; // Vertical component uses vshift, matching y_coeffs below
-+    const qpu_mc_src_addr_t src_base_u = get_mc_address_u(src_frame);
-+    const uint32_t x_coeffs = rpi_filter_coefs[av_mod_uintp2(MV_X(mv), 2 + hshift) << (1 - hshift)]; // Indexed by MV fraction
-+    const uint32_t y_coeffs = rpi_filter_coefs[av_mod_uintp2(MV_Y(mv), 2 + vshift) << (1 - vshift)];
-+    const uint32_t wo_u = pack_wo_p(c_offsets[0], c_weights[0]);
-+    const uint32_t wo_v = pack_wo_p(c_offsets[1], c_weights[1]);
-+    qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off;
-+    HEVCRpiInterPredEnv * const ipe = &jb->chroma_ip;
-+    const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1; // One more than the luma shift
-+    const unsigned int bh = nPbH_c;
-+    const uint32_t qfn = lx == 0 ? s->qpu.c_pxx : s->qpu.c_pxx_l1; // QPU code differs by list
-+
-+    for(int start_x=0; start_x < nPbW_c; start_x+=RPI_CHROMA_BLOCK_WIDTH)
-+    {
-+        HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh + 3, qfn); // Pick a Q for this cmd (bh + 3 passed as its load)
-+        qpu_mc_pred_c_p_t * const cmd_c = &cp->qpu_mc_curr->c.p;
-+        qpu_mc_src_t ** const plast_lx = (lx == 0) ? &cp->last_l0 : &cp->last_l1; // Pending source slot for our list
-+        qpu_mc_src_t * const last_lx = *plast_lx;
-+        const int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH);
-+
-+        last_lx->x = x1_c + start_x;
-+        last_lx->y = y1_c;
-+        last_lx->base = src_base_u;
-+        cmd_c->h = bh;
-+        cmd_c->w = bw;
-+        cmd_c->coeffs_x = x_coeffs;
-+        cmd_c->coeffs_y = y_coeffs;
-+        cmd_c->wo_u = wo_u;
-+        cmd_c->wo_v = wo_v;
-+        cmd_c->dst_addr_c = dst_base_u + (start_x << xshl);
-+        *plast_lx = &cmd_c->next_src; // This cmd's source slot will be filled in by the next cmd on this list
-+        cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_c + 1); // Advance the Q write pointer past this cmd
-+    }
-+    return;
-+}
-+
-+// Queue chroma bi-pred cmds for one block. h/v shifts fixed at one as that is all the qasm copes with
-+static void
-+rpi_pred_c_b(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
-+  const int x0_c, const int y0_c, // Block position and size are in chroma samples
-+  const int nPbW_c, const int nPbH_c,
-+  const struct HEVCRpiMvField * const mv_field,
-+  const int16_t * const c_weights, // List 0: [0] = U, [1] = V
-+  const int16_t * const c_offsets,
-+  const int16_t * const c_weights2, // List 1: [0] = U, [1] = V
-+  const int16_t * const c_offsets2,
-+  AVFrame * const src_frame,
-+  AVFrame * const src_frame2)
-+{
-+    const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c);
-+    const int hshift = 1; // s->ps.sps->hshift[1];
-+    const int vshift = 1; // s->ps.sps->vshift[1];
-+    const MvXY mv = mv_field->xy[0];
-+    const MvXY mv2 = mv_field->xy[1];
-+
-+    const unsigned int mx = av_mod_uintp2(MV_X(mv), 2 + hshift);
-+    const unsigned int my = av_mod_uintp2(MV_Y(mv), 2 + vshift);
-+    const uint32_t coefs0_x = rpi_filter_coefs[mx << (1 - hshift)];
-+    const uint32_t coefs0_y = rpi_filter_coefs[my << (1 - vshift)]; // Fractional part of motion vector
-+    const int x1_c = x0_c + (MV_X(mv) >> (2 + hshift)) - 1; // Integer part of the list 0 chroma MV, less 1
-+    const int y1_c = y0_c + (MV_Y(mv) >> (2 + vshift)) - 1; // Vertical component uses vshift, matching my above
-+
-+    const unsigned int mx2 = av_mod_uintp2(MV_X(mv2), 2 + hshift);
-+    const unsigned int my2 = av_mod_uintp2(MV_Y(mv2), 2 + vshift);
-+    const uint32_t coefs1_x = rpi_filter_coefs[mx2 << (1 - hshift)];
-+    const uint32_t coefs1_y = rpi_filter_coefs[my2 << (1 - vshift)]; // Fractional part of motion vector
-+
-+    const int x2_c = x0_c + (MV_X(mv2) >> (2 + hshift)) - 1; // Integer part of the list 1 chroma MV, less 1
-+    const int y2_c = y0_c + (MV_Y(mv2) >> (2 + vshift)) - 1; // Vertical component uses vshift, matching my2 above
-+
-+    const uint32_t wo_u2 = pack_wo_b(c_offsets[0], c_offsets2[0], c_weights2[0]); // Both lists' offsets with the list 1 weight
-+    const uint32_t wo_v2 = pack_wo_b(c_offsets[1], c_offsets2[1], c_weights2[1]);
-+
-+    const qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off;
-+    const qpu_mc_src_addr_t src1_base = get_mc_address_u(src_frame);
-+    const qpu_mc_src_addr_t src2_base = get_mc_address_u(src_frame2);
-+    HEVCRpiInterPredEnv * const ipe = &jb->chroma_ip;
-+    const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1; // One more than the luma shift
-+    const unsigned int bh = nPbH_c;
-+
-+    for (int start_x=0; start_x < nPbW_c; start_x += RPI_CHROMA_BLOCK_WIDTH)
-+    {
-+        const unsigned int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH);
-+
-+        HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh * 2 + 3, s->qpu.c_bxx); // Load passed is bh * 2 + 3 for bi-pred
-+        qpu_mc_pred_c_b_t * const u = &cp->qpu_mc_curr->c.b;
-+        qpu_mc_src_t * const src_l0 = cp->last_l0; // Source slots left pending by the previous cmd in this Q
-+        qpu_mc_src_t * const src_l1 = cp->last_l1;
-+
-+        src_l0->x = x1_c + start_x;
-+        src_l0->y = y1_c;
-+        src_l0->base = src1_base;
-+        src_l1->x = x2_c + start_x;
-+        src_l1->y = y2_c;
-+        src_l1->base = src2_base;
-+
-+        u[0].h = bh;
-+        u[0].w = bw;
-+        u[0].coeffs_x1 = coefs0_x;
-+        u[0].coeffs_y1 = coefs0_y;
-+        u[0].weight_u1 = c_weights[0]; // Weight L0 U
-+        u[0].weight_v1 = c_weights[1]; // Weight L0 V
-+        u[0].coeffs_x2 = coefs1_x;
-+        u[0].coeffs_y2 = coefs1_y;
-+        u[0].wo_u2 = wo_u2;
-+        u[0].wo_v2 = wo_v2;
-+        u[0].dst_addr_c = dst_base_u + (start_x << xshl);
-+
-+        cp->last_l0 = &u[0].next_src1; // This cmd's source slots will be filled in by the next cmd
-+        cp->last_l1 = &u[0].next_src2;
-+        cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(u + 1); // Advance the Q write pointer past this cmd
-+    }
-+}
-+
-+
-+static inline void
-+col_stash(const HEVCRpiContext * const s, // Record mvf in the current picture's col_mvf array (one entry per 16x16 cell)
-+          const unsigned int x0, const unsigned int y0, const unsigned int w0, const unsigned int h0,
-+          const HEVCRpiMvField * const mvf)
-+{
-+    ColMvField * const col_mvf = s->ref->col_mvf;
-+    const unsigned int x = (x0 + 15) >> 4; // First 16-pel grid column at or after x0
-+    const unsigned int y = (y0 + 15) >> 4; // First 16-pel grid row at or after y0
-+    const unsigned int w = ((x0 + 15 + w0) >> 4) - x; // Number of grid columns whose left edge lies inside the block
-+    const unsigned int h = ((y0 + 15 + h0) >> 4) - y; // Number of grid rows whose top edge lies inside the block
-+
-+    if (col_mvf != NULL && w != 0 && h != 0) // Skip if there is no array or the block covers no grid corner
-+    {
-+        // Only record MV from the top left of the 16x16 block
-+
-+        const RefPicList * const rpl = s->refPicList;
-+        const ColMvField cmv = {
-+            .L = {
-+                {
-+                    .poc = (mvf->pred_flag & PF_L0) == 0 ? // List 0 unused is stored as COL_POC_INTRA
-+                            COL_POC_INTRA :
-+                            COL_POC_MAKE_INTER(rpl[0].isLongTerm[mvf->ref_idx[0]], rpl[0].list[mvf->ref_idx[0]]),
-+                    .xy = mvf->xy[0]
-+                },
-+                {
-+                    .poc = (mvf->pred_flag & PF_L1) == 0 ? // List 1 unused is stored as COL_POC_INTRA
-+                            COL_POC_INTRA :
-+                            COL_POC_MAKE_INTER(rpl[1].isLongTerm[mvf->ref_idx[1]], rpl[1].list[mvf->ref_idx[1]]),
-+                    .xy = mvf->xy[1]
-+                }
-+            }
-+        };
-+
-+        ColMvField * p = col_mvf + y * s->col_mvf_stride + x;
-+        const unsigned int stride = s->col_mvf_stride - w; // Step from the end of one row to the start of the next
-+        unsigned int j = h;
-+
-+        do // w and h are both non-zero here so do-while is safe
-+        {
-+            unsigned int k = w;
-+            do
-+            {
-+                *p++ = cmv;
-+            } while (--k != 0);
-+            p += stride;
-+        } while (--j != 0);
-+    }
-+}
-+
-+static void hls_prediction_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, // Decode one PU's motion and queue its inter pred
-+                                const unsigned int x0, const unsigned int y0,
-+                                const unsigned int nPbW, const unsigned int nPbH,
-+                                const unsigned int log2_cb_size, const unsigned int partIdx, const unsigned int idx) // idx is not used in this body
-+{
-+    HEVCRpiJob * const jb = lc->jb0;
-+
-+    struct HEVCRpiMvField current_mv = {{0}};
-+    const RefPicList  *const refPicList = s->refPicList;
-+    const HEVCRpiFrame *ref0 = NULL, *ref1 = NULL;
-+
-+    if (lc->cu.pred_mode != MODE_SKIP) // For skipped CUs no merge flag is decoded; merge mode is taken below regardless
-+        lc->pu.merge_flag = ff_hevc_rpi_merge_flag_decode(lc);
-+
-+    if (lc->cu.pred_mode == MODE_SKIP || lc->pu.merge_flag) {
-+        const unsigned int merge_idx = s->sh.max_num_merge_cand <= 1 ? 0 : // Index is only decoded when there is a choice
-+            ff_hevc_rpi_merge_idx_decode(s, lc);
-+
-+        ff_hevc_rpi_luma_mv_merge_mode(s, lc, x0, y0, nPbW, nPbH, log2_cb_size,
-+                                   partIdx, merge_idx, &current_mv);
-+    } else {
-+        hevc_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, &current_mv);
-+    }
-+
-+    {
-+        HEVCRpiMvField * p = mvf_stash_ptr(s, lc, x0, y0); // Copy the MV field into every min-PU cell the block covers
-+        unsigned int i, j;
-+
-+        for (j = 0; j < nPbH >> LOG2_MIN_PU_SIZE; j++)
-+        {
-+            for (i = 0; i < nPbW >> LOG2_MIN_PU_SIZE; i++)
-+                p[i] = current_mv;
-+            p += MVF_STASH_WIDTH_PU;
-+        }
-+    }
-+
-+    col_stash(s, x0, y0, nPbW, nPbH, &current_mv);
-+
-+    if (current_mv.pred_flag & PF_L0) {
-+        ref0 = refPicList[0].ref[current_mv.ref_idx[0]];
-+        if (!ref0) // Missing reference: give up on this PU without queuing any prediction
-+            return;
-+        hevc_await_progress(s, lc, ref0, current_mv.xy[0], y0, nPbH);
-+    }
-+    if (current_mv.pred_flag & PF_L1) {
-+        ref1 = refPicList[1].ref[current_mv.ref_idx[1]];
-+        if (!ref1) // Missing reference: give up on this PU without queuing any prediction
-+            return;
-+        hevc_await_progress(s, lc, ref1, current_mv.xy[1], y0, nPbH);
-+    }
-+
-+    if (current_mv.pred_flag == PF_L0) { // List 0 only
-+        const int x0_c = x0 >> ctx_hshift(s, 1); // Chroma position and size
-+        const int y0_c = y0 >> ctx_vshift(s, 1);
-+        const int nPbW_c = nPbW >> ctx_hshift(s, 1);
-+        const int nPbH_c = nPbH >> ctx_vshift(s, 1);
-+
-+        rpi_pred_y(s, jb, x0, y0, nPbW, nPbH, current_mv.xy[0],
-+          s->sh.luma_weight_l0[current_mv.ref_idx[0]], s->sh.luma_offset_l0[current_mv.ref_idx[0]],
-+          ref0->frame);
-+
-+        if (ctx_cfmt(s) != 0) { // Chroma is skipped when ctx_cfmt(s) is 0
-+            rpi_pred_c(s, jb, 0, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.xy[0],
-+              s->sh.chroma_weight_l0[current_mv.ref_idx[0]], s->sh.chroma_offset_l0[current_mv.ref_idx[0]],
-+              ref0->frame);
-+            return;
-+        }
-+    } else if (current_mv.pred_flag == PF_L1) { // List 1 only
-+        const int x0_c = x0 >> ctx_hshift(s, 1);
-+        const int y0_c = y0 >> ctx_vshift(s, 1);
-+        const int nPbW_c = nPbW >> ctx_hshift(s, 1);
-+        const int nPbH_c = nPbH >> ctx_vshift(s, 1);
-+
-+        rpi_pred_y(s, jb, x0, y0, nPbW, nPbH, current_mv.xy[1],
-+          s->sh.luma_weight_l1[current_mv.ref_idx[1]], s->sh.luma_offset_l1[current_mv.ref_idx[1]],
-+          ref1->frame);
-+
-+        if (ctx_cfmt(s) != 0) {
-+            rpi_pred_c(s, jb, 1, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.xy[1],
-+              s->sh.chroma_weight_l1[current_mv.ref_idx[1]], s->sh.chroma_offset_l1[current_mv.ref_idx[1]],
-+              ref1->frame);
-+            return;
-+        }
-+    } else if (current_mv.pred_flag == PF_BI) { // Both lists
-+        const int x0_c = x0 >> ctx_hshift(s, 1);
-+        const int y0_c = y0 >> ctx_vshift(s, 1);
-+        const int nPbW_c = nPbW >> ctx_hshift(s, 1);
-+        const int nPbH_c = nPbH >> ctx_vshift(s, 1);
-+
-+        rpi_pred_y_b(s, jb, x0, y0, nPbW, nPbH, &current_mv, ref0->frame, ref1->frame);
-+
-+        if (ctx_cfmt(s) != 0) {
-+          rpi_pred_c_b(s, jb, x0_c, y0_c, nPbW_c, nPbH_c,
-+                       &current_mv,
-+                       s->sh.chroma_weight_l0[current_mv.ref_idx[0]],
-+                       s->sh.chroma_offset_l0[current_mv.ref_idx[0]],
-+                       s->sh.chroma_weight_l1[current_mv.ref_idx[1]],
-+                       s->sh.chroma_offset_l1[current_mv.ref_idx[1]],
-+                       ref0->frame,
-+                       ref1->frame);
-+            return;
-+        }
-+    }
-+}
-+
-+static void set_ipm(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, // Record intra pred mode ipm for a block in the stashes
-+                    const unsigned int x0, const unsigned int y0,
-+                    const unsigned int log2_cb_size,
-+                    const unsigned int ipm)
-+{
-+    const unsigned int x_pu = x0 >> LOG2_MIN_PU_SIZE;
-+    const unsigned int y_pu = y0 >> LOG2_MIN_PU_SIZE;
-+
-+    {
-+        const unsigned int ctb_mask = ~(~0U << (s->ps.sps->log2_ctb_size - LOG2_MIN_PU_SIZE)); // Mask to a PU index within the CTB
-+        set_stash2(lc->ipm_left + (y_pu & ctb_mask), lc->ipm_up + (x_pu & ctb_mask), log2_cb_size - LOG2_MIN_PU_SIZE, ipm);
-+    }
-+
-+    // If IRAP then everything is Intra & we avoid ever looking at these
-+    // stashes so don't bother setting them
-+    if (!s->is_irap && lc->cu.pred_mode == MODE_INTRA)
-+    {
-+        if (s->is_intra != NULL)
-+        {
-+            set_bits(s->is_intra + (y0 >> LOG2_MIN_CU_SIZE) * s->ps.sps->pcm_width, x0 >> LOG2_MIN_CU_SIZE, s->ps.sps->pcm_width, log2_cb_size - LOG2_MIN_CU_SIZE);
-+        }
-+
-+        {
-+            HEVCRpiMvField * p = mvf_stash_ptr(s, lc, x0, y0); // Zero the MV stash over the whole block
-+            const unsigned int size_in_pus = (1 << log2_cb_size) >> LOG2_MIN_PU_SIZE; // min_pu <= log2_cb so >= 1
-+            unsigned int n = size_in_pus;
-+
-+            do
-+            {
-+                memset(p, 0, size_in_pus * sizeof(*p));
-+                p += MVF_STASH_WIDTH_PU;
-+            } while (--n != 0);
-+        }
-+
-+
-+        if (s->ref->col_mvf != NULL && ((x0 | y0) & 0xf) == 0) // Only blocks whose origin is on the 16-pel grid
-+        {
-+            // Only record top left stuff
-+            // Blocks should always be aligned on size boundaries
-+            // so cannot have overflow from a small block
-+
-+            ColMvField * p = s->ref->col_mvf + (y0 >> 4) * s->col_mvf_stride + (x0 >> 4);
-+            const unsigned int size_in_col = log2_cb_size < 4 ? 1 : (1 << (log2_cb_size - 4)); // Block size in 16-pel cells, min 1
-+            const unsigned int stride = s->col_mvf_stride - size_in_col; // Step from the end of one row to the start of the next
-+            unsigned int j = size_in_col;
-+
-+            do
-+            {
-+                unsigned int k = size_in_col;
-+                do
-+                {
-+                    p->L[0].poc = COL_POC_INTRA; // Mark both lists of the cell as intra with a zero MV
-+                    p->L[0].xy = 0;
-+                    p->L[1].poc = COL_POC_INTRA;
-+                    p->L[1].xy = 0;
-+                    ++p;
-+                } while (--k != 0);
-+                p += stride;
-+            } while (--j != 0);
-+        }
-+    }
-+}
-+
-+static inline void intra_prediction_unit_default_value(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, // Stash INTRA_DC for the block
-+                                                const unsigned int x0, const unsigned int y0,
-+                                                const unsigned int log2_cb_size)
-+{
-+    set_ipm(s, lc, x0, y0, log2_cb_size, INTRA_DC); // Same stash update as a real intra PU, with the mode fixed to DC
-+}
-+
-+
-+/**
-+ * 8.4.1 - derive the luma intra pred mode of one PU from the left/up candidates, stash it and return it
-+ */
-+static int luma_intra_pred_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
-+                                int x0, int y0, int log2_pu_size,
-+                                int prev_intra_luma_pred_flag, // Non-zero: idx selects one of the 3 candidates; zero: idx is a remaining-mode index
-+                                const unsigned int idx)
-+{
-+    const unsigned int ctb_mask = ~(~0U << s->ps.sps->log2_ctb_size);
-+    const unsigned int xb_pu = (x0 & ctb_mask) >> LOG2_MIN_PU_SIZE; // Position within the CTB in min-PU units
-+    const unsigned int yb_pu = (y0 & ctb_mask) >> LOG2_MIN_PU_SIZE;
-+
-+    // Up does not cross boundaries so as we always scan 1 slice-tile-line in an
-+    // lc we can just keep 1 CTB lR stashes
-+    // Left is reset to DC @ Start of Line/Tile/Slice in fill_job
-+    const unsigned int cand_up   = yb_pu == 0 ? INTRA_DC : lc->ipm_up[xb_pu];
-+    const unsigned int cand_left = lc->ipm_left[yb_pu];
-+
-+    unsigned int intra_pred_mode;
-+    unsigned int a, b, c; // The three candidate modes
-+
-+    if (cand_left == cand_up) {
-+        if (cand_left < 2) { // Neighbours are both planar or both DC
-+            a = INTRA_PLANAR;
-+            b = INTRA_DC;
-+            c = INTRA_ANGULAR_26;
-+        } else { // Same angular mode: use it plus its two neighbours, wrapping within 2..33
-+            a = cand_left;
-+            b = 2 + ((cand_left - 2 - 1 + 32) & 31);
-+            c = 2 + ((cand_left - 2 + 1) & 31);
-+        }
-+    } else {
-+        a = cand_left;
-+        b = cand_up;
-+        c = (cand_left != INTRA_PLANAR && cand_up != INTRA_PLANAR) ? // First of planar, DC, angular 26 not already a candidate
-+                INTRA_PLANAR :
-+            (cand_left != INTRA_DC && cand_up != INTRA_DC) ?
-+                INTRA_DC :
-+                INTRA_ANGULAR_26;
-+    }
-+
-+    if (prev_intra_luma_pred_flag) {
-+        intra_pred_mode = idx == 0 ? a : idx == 1 ? b : c;
-+    } else {
-+        // Sort lowest 1st
-+        if (a > b) // NOTE(review): FFSWAP is given int although a, b, c are unsigned int - confirm this is intended
-+            FFSWAP(int, a, b);
-+        if (a > c)
-+            FFSWAP(int, a, c);
-+        if (b > c)
-+            FFSWAP(int, b, c);
-+
-+        intra_pred_mode = idx; // Bump idx past each candidate (ascending) that is <= the running value
-+        if (intra_pred_mode >= a)
-+            intra_pred_mode++;
-+        if (intra_pred_mode >= b)
-+            intra_pred_mode++;
-+        if (intra_pred_mode >= c)
-+            intra_pred_mode++;
-+    }
-+
-+    /* write the intra prediction units into the mv array */
-+    set_ipm(s, lc, x0, y0, log2_pu_size, intra_pred_mode);
-+    return intra_pred_mode;
-+}
-+
-+static const uint8_t tab_mode_idx[] = { // 35 entries: remaps chroma mode 0..34 when ctx_cfmt(s) == 2 (see intra_prediction_unit)
-+     0,  1,  2,  2,  2,  2,  3,  5,  7,  8, 10, 12, 13, 15, 17, 18, 19, 20,
-+    21, 22, 23, 23, 24, 24, 25, 25, 26, 27, 27, 28, 28, 29, 29, 30, 31};
-+
-+static void intra_prediction_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, // Decode luma & chroma intra modes for one CU
-+                                  const unsigned int x0, const unsigned int y0,
-+                                  const unsigned int log2_cb_size)
-+{
-+    static const uint8_t intra_chroma_table[4] = { 0, 26, 10, 1 }; // Chroma mode for intra_chroma_pred_mode 0..3
-+    uint8_t prev_intra_luma_pred_flag[4];
-+    int split   = lc->cu.part_mode == PART_NxN;
-+    const unsigned int split_size = (1 << (log2_cb_size - 1)); // Half the CU size: offset of the 2nd column/row of PUs
-+    int chroma_mode;
-+    const unsigned int n = split ? 4 : 1; // Number of luma PUs
-+    unsigned int i;
-+
-+    for (i = 0; i != n; i++) // All flags are decoded first, then the per-PU indices
-+        prev_intra_luma_pred_flag[i] = ff_hevc_rpi_prev_intra_luma_pred_flag_decode(lc);
-+
-+    for (i = 0; i < n; i++) {
-+        // depending on mode idx is mpm or luma_pred_mode
-+        const unsigned int idx = prev_intra_luma_pred_flag[i] ?
-+            ff_hevc_rpi_mpm_idx_decode(lc) :
-+            ff_hevc_rpi_rem_intra_luma_pred_mode_decode(lc);
-+
-+        lc->pu.intra_pred_mode[i] =
-+            luma_intra_pred_mode(s, lc,
-+                                 x0 + ((i & 1) == 0 ? 0 : split_size), // Bit 0 of i selects the right-hand column
-+                                 y0 + ((i & 2) == 0 ? 0 : split_size), // Bit 1 of i selects the lower row
-+                                 log2_cb_size - split,
-+                                 prev_intra_luma_pred_flag[i], idx);
-+    }
-+
-+    if (ctx_cfmt(s) == 3) { // One chroma mode per luma PU
-+        for (i = 0; i < n; i++) {
-+            lc->pu.chroma_mode_c[i] = chroma_mode = ff_hevc_rpi_intra_chroma_pred_mode_decode(lc);
-+            if (chroma_mode != 4) {
-+                if (lc->pu.intra_pred_mode[i] == intra_chroma_table[chroma_mode]) // Table mode clashes with luma: use 34 instead
-+                    lc->pu.intra_pred_mode_c[i] = 34;
-+                else
-+                    lc->pu.intra_pred_mode_c[i] = intra_chroma_table[chroma_mode];
-+            } else { // Mode 4: chroma copies the luma mode
-+                lc->pu.intra_pred_mode_c[i] = lc->pu.intra_pred_mode[i];
-+            }
-+        }
-+    } else if (ctx_cfmt(s) == 2) { // Single chroma mode, remapped through tab_mode_idx
-+        int mode_idx;
-+        lc->pu.chroma_mode_c[0] = chroma_mode = ff_hevc_rpi_intra_chroma_pred_mode_decode(lc);
-+        if (chroma_mode != 4) {
-+            if (lc->pu.intra_pred_mode[0] == intra_chroma_table[chroma_mode])
-+                mode_idx = 34;
-+            else
-+                mode_idx = intra_chroma_table[chroma_mode];
-+        } else {
-+            mode_idx = lc->pu.intra_pred_mode[0];
-+        }
-+        lc->pu.intra_pred_mode_c[0] = tab_mode_idx[mode_idx];
-+    } else if (ctx_cfmt(s) != 0) { // Remaining chroma format: single chroma mode; chroma_mode_c is not written here
-+        chroma_mode = ff_hevc_rpi_intra_chroma_pred_mode_decode(lc);
-+        if (chroma_mode != 4) {
-+            if (lc->pu.intra_pred_mode[0] == intra_chroma_table[chroma_mode])
-+                lc->pu.intra_pred_mode_c[0] = 34;
-+            else
-+                lc->pu.intra_pred_mode_c[0] = intra_chroma_table[chroma_mode];
-+        } else {
-+            lc->pu.intra_pred_mode_c[0] = lc->pu.intra_pred_mode[0];
-+        }
-+    }
-+}
-+
-+static int hls_coding_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
-+                           const unsigned int x0, const unsigned int y0, const unsigned int log2_cb_size)
-+{
-+    const unsigned int cb_size          = 1 << log2_cb_size;
-+    const unsigned int log2_min_cb_size = s->ps.sps->log2_min_cb_size;
-+    const unsigned int min_cb_width     = s->ps.sps->min_cb_width;
-+    const unsigned int x_cb             = x0 >> log2_min_cb_size;
-+    const unsigned int y_cb             = y0 >> log2_min_cb_size;
-+    const unsigned int idx              = log2_cb_size - 2;
-+    const unsigned int qp_block_mask    = (1 << s->ps.pps->log2_min_cu_qp_delta_size) - 1;
-+    int skip_flag = 0;
-+
-+    lc->cu.x                = x0;
-+    lc->cu.y                = y0;
-+    lc->cu.x_split          = x0;
-+    lc->cu.y_split          = y0;
-+
-+    lc->cu.pred_mode        = MODE_INTRA;
-+    lc->cu.part_mode        = PART_2Nx2N;
-+    lc->cu.intra_split_flag = 0;
-+    lc->cu.cu_transquant_bypass_flag = 0;
-+    lc->pu.intra_pred_mode[0] = 1;
-+    lc->pu.intra_pred_mode[1] = 1;
-+    lc->pu.intra_pred_mode[2] = 1;
-+    lc->pu.intra_pred_mode[3] = 1;
-+
-+    if (s->ps.pps->transquant_bypass_enable_flag) {
-+        lc->cu.cu_transquant_bypass_flag = ff_hevc_rpi_cu_transquant_bypass_flag_decode(lc);
-+        if (lc->cu.cu_transquant_bypass_flag)
-+            set_deblocking_bypass(s, x0, y0, log2_cb_size);
-+    }
-+
-+    if (s->sh.slice_type != HEVC_SLICE_I) {
-+        lc->cu.pred_mode = MODE_INTER;
-+        skip_flag = ff_hevc_rpi_skip_flag_decode(s, lc, x0, y0, x_cb, y_cb);
-+    }
-+
-+    if (skip_flag) {
-+        lc->cu.pred_mode = MODE_SKIP;
-+
-+        hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size, log2_cb_size, 0, idx);
-+        intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size);
-+
-+        if (!s->sh.disable_deblocking_filter_flag)
-+            ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size, 0);
-+    } else {
-+        int pcm_flag = 0;
-+
-+        if (s->sh.slice_type != HEVC_SLICE_I)
-+            lc->cu.pred_mode = ff_hevc_rpi_pred_mode_decode(lc);
-+        if (lc->cu.pred_mode != MODE_INTRA ||
-+            log2_cb_size == s->ps.sps->log2_min_cb_size) {
-+            lc->cu.part_mode        = ff_hevc_rpi_part_mode_decode(s, lc, log2_cb_size);
-+            lc->cu.intra_split_flag = lc->cu.part_mode == PART_NxN &&
-+                                      lc->cu.pred_mode == MODE_INTRA;
-+        }
-+
-+        if (lc->cu.pred_mode == MODE_INTRA) {
-+            if (lc->cu.part_mode == PART_2Nx2N &&
-+                log2_cb_size <= s->ps.sps->pcm.log2_max_pcm_cb_size &&  // 0 if not enabled
-+                log2_cb_size >= s->ps.sps->pcm.log2_min_pcm_cb_size &&
-+                ff_hevc_rpi_pcm_flag_decode(lc) != 0)
-+            {
-+                int ret;
-+                pcm_flag = 1;
-+                intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size);
-+                if ((ret = hls_pcm_sample(s, lc, x0, y0, log2_cb_size)) < 0)
-+                    return ret;
-+
-+                if (s->ps.sps->pcm.loop_filter_disable_flag)
-+                    set_deblocking_bypass(s, x0, y0, log2_cb_size);
-+            } else {
-+                intra_prediction_unit(s, lc, x0, y0, log2_cb_size);
-+            }
-+        } else {
-+            intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size);
-+            switch (lc->cu.part_mode) {
-+            case PART_2Nx2N:
-+                hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size, log2_cb_size, 0, idx);
-+                break;
-+            case PART_2NxN:
-+                hls_prediction_unit(s, lc, x0, y0,               cb_size, cb_size / 2, log2_cb_size, 0, idx);
-+                lc->cu.y_split = y0 + cb_size / 2;
-+                hls_prediction_unit(s, lc, x0, y0 + cb_size / 2, cb_size, cb_size / 2, log2_cb_size, 1, idx);
-+                break;
-+            case PART_Nx2N:
-+                hls_prediction_unit(s, lc, x0,               y0, cb_size / 2, cb_size, log2_cb_size, 0, idx - 1);
-+                lc->cu.x_split = x0 + cb_size / 2;
-+                hls_prediction_unit(s, lc, x0 + cb_size / 2, y0, cb_size / 2, cb_size, log2_cb_size, 1, idx - 1);
-+                break;
-+            case PART_2NxnU:
-+                hls_prediction_unit(s, lc, x0, y0,               cb_size, cb_size     / 4, log2_cb_size, 0, idx);
-+                lc->cu.y_split = y0 + cb_size / 4;
-+                hls_prediction_unit(s, lc, x0, y0 + cb_size / 4, cb_size, cb_size / 4 * 3, log2_cb_size, 1, idx);
-+                break;
-+            case PART_2NxnD:
-+                hls_prediction_unit(s, lc, x0, y0,                   cb_size, cb_size / 4 * 3, log2_cb_size, 0, idx);
-+                lc->cu.y_split = y0 + cb_size / 4 * 3;
-+                hls_prediction_unit(s, lc, x0, y0 + cb_size / 4 * 3, cb_size, cb_size     / 4, log2_cb_size, 1, idx);
-+                break;
-+            case PART_nLx2N:
-+                hls_prediction_unit(s, lc, x0,               y0, cb_size     / 4, cb_size, log2_cb_size, 0, idx - 2);
-+                lc->cu.x_split = x0 + cb_size / 4;
-+                hls_prediction_unit(s, lc, x0 + cb_size / 4, y0, cb_size * 3 / 4, cb_size, log2_cb_size, 1, idx - 2);
-+                break;
-+            case PART_nRx2N:
-+                hls_prediction_unit(s, lc, x0,                   y0, cb_size / 4 * 3, cb_size, log2_cb_size, 0, idx - 2);
-+                lc->cu.x_split = x0 + cb_size / 4 * 3;
-+                hls_prediction_unit(s, lc, x0 + cb_size / 4 * 3, y0, cb_size     / 4, cb_size, log2_cb_size, 1, idx - 2);
-+                break;
-+            case PART_NxN:
-+                hls_prediction_unit(s, lc, x0,               y0,               cb_size / 2, cb_size / 2, log2_cb_size, 0, idx - 1);
-+                lc->cu.x_split = x0 + cb_size / 2;
-+                hls_prediction_unit(s, lc, x0 + cb_size / 2, y0,               cb_size / 2, cb_size / 2, log2_cb_size, 1, idx - 1);
-+                lc->cu.y_split = y0 + cb_size / 2;
-+                hls_prediction_unit(s, lc, x0,               y0 + cb_size / 2, cb_size / 2, cb_size / 2, log2_cb_size, 2, idx - 1);
-+                hls_prediction_unit(s, lc, x0 + cb_size / 2, y0 + cb_size / 2, cb_size / 2, cb_size / 2, log2_cb_size, 3, idx - 1);
-+                break;
-+            }
-+        }
-+
-+        if (!pcm_flag) {
-+            int rqt_root_cbf = 1;
-+
-+            if (lc->cu.pred_mode != MODE_INTRA &&
-+                !(lc->cu.part_mode == PART_2Nx2N && lc->pu.merge_flag)) {
-+                rqt_root_cbf = ff_hevc_rpi_no_residual_syntax_flag_decode(lc);
-+            }
-+            if (rqt_root_cbf) {
-+                const unsigned int cbf_c = ctx_cfmt(s) == 0 ? 0 : (CBF_CR0 | CBF_CB0);
-+                int ret;
-+
-+                lc->cu.max_trafo_depth = lc->cu.pred_mode == MODE_INTRA ?
-+                                         s->ps.sps->max_transform_hierarchy_depth_intra + lc->cu.intra_split_flag :
-+                                         s->ps.sps->max_transform_hierarchy_depth_inter;
-+                // transform_tree does deblock_boundary_strengths
-+                ret = hls_transform_tree(s, lc, x0, y0,
-+                                         log2_cb_size, 0, 0, cbf_c);
-+                if (ret < 0)
-+                    return ret;
-+            } else {
-+                if (!s->sh.disable_deblocking_filter_flag)
-+                    ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size, 0);
-+            }
-+        }
-+    }
-+
-+    // If the delta is still wanted then we haven't read the delta & therefore need to set qp here
-+    if (lc->tu.is_cu_qp_delta_wanted)
-+        ff_hevc_rpi_set_qPy(s, lc, x0, y0);
-+
-+    if(((x0 + (1<<log2_cb_size)) & qp_block_mask) == 0 &&
-+       ((y0 + (1<<log2_cb_size)) & qp_block_mask) == 0) {
-+        lc->qPy_pred = lc->qp_y;
-+    }
-+
-+    set_bytes(s->qp_y_tab + y_cb * min_cb_width + x_cb, min_cb_width, log2_cb_size - log2_min_cb_size, lc->qp_y & 0xff);
-+
-+    set_stash2(s->cabac_stash_up + (x0 >> 3), s->cabac_stash_left + (y0 >> 3), log2_cb_size - 3, (lc->ct_depth << 1) | skip_flag);
-+
-+    return 0;
-+}
-+
-+// Returns:
-+//  < 0  Error
-+//  0    More data wanted
-+//  1    EoSlice / EoPicture
-+static int hls_coding_quadtree(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int x0, const int y0,
-+                               const int log2_cb_size, const unsigned int cb_depth)
-+{
-+    const int cb_size    = 1 << log2_cb_size;
-+    int ret;
-+    int split_cu;
-+
-+    lc->ct_depth = cb_depth;
-+    split_cu = (log2_cb_size > s->ps.sps->log2_min_cb_size);
-+    if (x0 + cb_size <= s->ps.sps->width  &&
-+        y0 + cb_size <= s->ps.sps->height &&
-+        split_cu)
-+    {
-+        split_cu = ff_hevc_rpi_split_coding_unit_flag_decode(s, lc, cb_depth, x0, y0);
-+    }
-+
-+    // Qp delta (and offset) need to remain wanted if cb_size < min until
-+    // a coded block is found so we still initial state at depth 0 (outside
-+    // this fn) and only reset here
-+    if (s->ps.pps->cu_qp_delta_enabled_flag &&
-+        log2_cb_size >= s->ps.pps->log2_min_cu_qp_delta_size)
-+    {
-+        lc->tu.is_cu_qp_delta_wanted = 1;
-+        lc->tu.cu_qp_delta          = 0;
-+    }
-+    if (s->sh.cu_chroma_qp_offset_enabled_flag &&
-+        log2_cb_size >= s->ps.pps->log2_min_cu_qp_delta_size)
-+    {
-+        lc->tu.cu_chroma_qp_offset_wanted = 1;
-+    }
-+
-+    lc->tu.qp_divmod6[0] = s->ps.pps->qp_bd_x[0];
-+    lc->tu.qp_divmod6[1] = s->ps.pps->qp_bd_x[1] + s->sh.slice_cb_qp_offset;
-+    lc->tu.qp_divmod6[2] = s->ps.pps->qp_bd_x[2] + s->sh.slice_cr_qp_offset;
-+
-+    if (split_cu) {
-+        int qp_block_mask = (1 << s->ps.pps->log2_min_cu_qp_delta_size) - 1;
-+        const int cb_size_split = cb_size >> 1;
-+        const int x1 = x0 + cb_size_split;
-+        const int y1 = y0 + cb_size_split;
-+
-+        int more_data = 0;
-+
-+        more_data = hls_coding_quadtree(s, lc, x0, y0, log2_cb_size - 1, cb_depth + 1);
-+        if (more_data < 0)
-+            return more_data;
-+
-+        if (more_data && x1 < s->ps.sps->width) {
-+            more_data = hls_coding_quadtree(s, lc, x1, y0, log2_cb_size - 1, cb_depth + 1);
-+            if (more_data < 0)
-+                return more_data;
-+        }
-+        if (more_data && y1 < s->ps.sps->height) {
-+            more_data = hls_coding_quadtree(s, lc, x0, y1, log2_cb_size - 1, cb_depth + 1);
-+            if (more_data < 0)
-+                return more_data;
-+        }
-+        if (more_data && x1 < s->ps.sps->width &&
-+            y1 < s->ps.sps->height) {
-+            more_data = hls_coding_quadtree(s, lc, x1, y1, log2_cb_size - 1, cb_depth + 1);
-+            if (more_data < 0)
-+                return more_data;
-+        }
-+
-+        if(((x0 + (1<<log2_cb_size)) & qp_block_mask) == 0 &&
-+            ((y0 + (1<<log2_cb_size)) & qp_block_mask) == 0)
-+            lc->qPy_pred = lc->qp_y;
-+
-+        if (more_data)
-+            return ((x1 + cb_size_split) < s->ps.sps->width ||
-+                    (y1 + cb_size_split) < s->ps.sps->height);
-+        else
-+            return 0;
-+    } else {
-+        ret = hls_coding_unit(s, lc, x0, y0, log2_cb_size);
-+        if (ret < 0)
-+            return ret;
-+        if ((!((x0 + cb_size) %
-+               (1 << (s->ps.sps->log2_ctb_size))) ||
-+             (x0 + cb_size >= s->ps.sps->width)) &&
-+            (!((y0 + cb_size) %
-+               (1 << (s->ps.sps->log2_ctb_size))) ||
-+             (y0 + cb_size >= s->ps.sps->height))) {
-+            int end_of_slice_flag = ff_hevc_rpi_get_cabac_terminate(&lc->cc);
-+            return !end_of_slice_flag;
-+        } else {
-+            return 1;
-+        }
-+    }
-+
-+    return 0;  // NEVER
-+}
-+
-+static void hls_decode_neighbour(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
-+                                 const int x_ctb, const int y_ctb, const int ctb_addr_ts)
-+{
-+    const unsigned int ctb_size          = 1 << s->ps.sps->log2_ctb_size;
-+    const unsigned int ctb_addr_rs       = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts];
-+    const unsigned int ctb_addr_rs_in_slice = ctb_addr_rs - s->sh.slice_addr;  // slice_addr = RS addr of start of slice
-+    const unsigned int ctb_flags = s->ps.pps->ctb_ts_flags[ctb_addr_ts];
-+    const unsigned int line_w = s->ps.sps->ctb_width;
-+
-+    s->tab_slice_address[ctb_addr_rs] = s->sh.slice_addr;
-+
-+    lc->end_of_ctb_x = FFMIN(x_ctb + ctb_size, s->ps.sps->width);
-+    lc->end_of_ctb_y = FFMIN(y_ctb + ctb_size, s->ps.sps->height);
-+
-+    lc->boundary_flags = 0;
-+
-+    if ((ctb_flags & CTB_TS_FLAGS_SOTL) != 0)
-+        lc->boundary_flags |= BOUNDARY_LEFT_TILE;
-+    if (x_ctb > 0 && s->tab_slice_address[ctb_addr_rs] != s->tab_slice_address[ctb_addr_rs - 1])
-+        lc->boundary_flags |= BOUNDARY_LEFT_SLICE;
-+    if ((ctb_flags & CTB_TS_FLAGS_TOT) != 0)
-+        lc->boundary_flags |= BOUNDARY_UPPER_TILE;
-+    if (y_ctb > 0 && s->tab_slice_address[ctb_addr_rs] != s->tab_slice_address[ctb_addr_rs - line_w])
-+        lc->boundary_flags |= BOUNDARY_UPPER_SLICE;
-+
-+    // Use line width rather than tile width for addr_in_slice test as
-+    // addr_in_slice is in raster units
-+
-+    lc->ctb_avail =
-+        ((lc->boundary_flags & (BOUNDARY_LEFT_SLICE | BOUNDARY_LEFT_TILE)) == 0 ? AVAIL_L : 0) |
-+        ((lc->boundary_flags & (BOUNDARY_UPPER_SLICE | BOUNDARY_UPPER_TILE)) == 0 ? AVAIL_U : 0) |
-+        ((lc->boundary_flags & (BOUNDARY_LEFT_TILE | BOUNDARY_UPPER_TILE)) == 0 &&
-+            (ctb_addr_rs_in_slice > line_w) ? AVAIL_UL : 0) |
-+        ((ctb_flags & (CTB_TS_FLAGS_EOTL | CTB_TS_FLAGS_TOT)) == 0 &&
-+            (ctb_addr_rs_in_slice + 1 >= line_w) ? AVAIL_UR : 0);
-+    // Down-left never avail at CTB level
-+}
-+
-+
-+static void rpi_execute_dblk_cmds(const HEVCRpiContext * const s, HEVCRpiJob * const jb)
-+{
-+    int y = ff_hevc_rpi_hls_filter_blk(s, jb->bounds,
-+        (s->ps.pps->ctb_ts_flags[jb->ctu_ts_last] & CTB_TS_FLAGS_EOT) != 0);
-+
-+    // Signal
-+    if (y > 0) {
-+        // Cast away const as progress is held in s, but this really shouldn't confuse anything
-+        ff_hevc_rpi_progress_signal_recon((HEVCRpiContext *)s, y - 1);
-+    }
-+
-+    // Job done now
-+    // ? Move outside this fn
-+    job_free(s->jbc, jb);
-+}
-+
-+// I-pred, transform_and_add for all blocks types done here
-+// All ARM
-+static void rpi_execute_pred_cmds(const HEVCRpiContext * const s, HEVCRpiJob * const jb)
-+{
-+    unsigned int i;
-+    HEVCRpiIntraPredEnv * const iap = &jb->intra;
-+    const HEVCPredCmd *cmd = iap->cmds;
-+
-+#if !RPI_WORKER_WAIT_PASS_0
-+    rpi_sem_wait(&jb->sem);
-+    rpi_cache_flush_execute(jb->rfe);  // Invalidate data set up in pass1
-+#endif
-+
-+    for (i = iap->n; i > 0; i--, cmd++)
-+    {
-+        switch (cmd->type)
-+        {
-+            case RPI_PRED_INTRA:
-+                s->hpc.intra_pred(s, cmd->i_pred.mode, cmd->i_pred.x, cmd->i_pred.y, cmd->avail, cmd->size);
-+                break;
-+            case RPI_PRED_INTRA_C:
-+                s->hpc.intra_pred_c(s, cmd->i_pred.mode, cmd->i_pred.x, cmd->i_pred.y, cmd->avail, cmd->size);
-+                break;
-+            case RPI_PRED_ADD_RESIDUAL:
-+                s->hevcdsp.add_residual[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride);
-+                break;
-+            case RPI_PRED_ADD_DC:
-+                s->hevcdsp.add_residual_dc[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc);
-+                break;
-+            case RPI_PRED_ADD_RESIDUAL_U:
-+                s->hevcdsp.add_residual_u[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc);
-+                break;
-+            case RPI_PRED_ADD_RESIDUAL_V:
-+                s->hevcdsp.add_residual_v[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc);
-+                break;
-+            case RPI_PRED_ADD_RESIDUAL_C:
-+                s->hevcdsp.add_residual_c[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride);
-+                break;
-+            case RPI_PRED_ADD_DC_U:
-+            case RPI_PRED_ADD_DC_V:
-+                s->hevcdsp.add_residual_dc_c[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc);
-+                break;
-+
-+            case RPI_PRED_I_PCM:
-+                pcm_extract(s, cmd->i_pcm.src, cmd->i_pcm.src_len, cmd->i_pcm.x, cmd->i_pcm.y, 1 << cmd->size);
-+                break;
-+
-+            default:
-+                av_log(s->avctx, AV_LOG_PANIC, "Bad command %d in worker pred Q\n", cmd->type);
-+                abort();
-+        }
-+    }
-+
-+    // Mark done
-+    iap->n = 0;
-+}
-+
-+
-+// Set initial uniform job values & zero ctu_count
-+static void rpi_begin(const HEVCRpiContext * const s, HEVCRpiJob * const jb, const unsigned int ctu_ts_first)
-+{
-+    unsigned int i;
-+    HEVCRpiInterPredEnv *const cipe = &jb->chroma_ip;
-+    HEVCRpiInterPredEnv *const yipe = &jb->luma_ip;
-+    const HEVCRpiSPS * const sps = s->ps.sps;
-+
-+    const uint16_t pic_width_y   = sps->width;
-+    const uint16_t pic_height_y  = sps->height;
-+
-+    const uint16_t pic_width_c   = sps->width >> ctx_hshift(s, 1);
-+    const uint16_t pic_height_c  = sps->height >> ctx_vshift(s, 1);
-+
-+    // We expect the pointer to change if we use another sps
-+    if (sps != jb->sps)
-+    {
-+        worker_pic_free_one(jb);
-+
-+        set_ipe_from_ici(cipe, &ipe_init_infos[s->ps.sps->bit_depth - 8].chroma);
-+        set_ipe_from_ici(yipe, &ipe_init_infos[s->ps.sps->bit_depth - 8].luma);
-+
-+        {
-+            const int coefs_per_luma = HEVC_MAX_CTB_SIZE * HEVC_RPI_MAX_WIDTH;
-+            const int coefs_per_chroma = (coefs_per_luma * 2) >> (ctx_vshift(s, 1) + ctx_hshift(s, 1));
-+            worker_pic_alloc_one(jb, coefs_per_luma + coefs_per_chroma);
-+        }
-+
-+        jb->sps = sps;
-+    }
-+
-+    jb->waited = 0;
-+    jb->ctu_ts_first = ctu_ts_first;
-+    jb->ctu_ts_last = -1;
-+
-+    rpi_inter_pred_reset(cipe);
-+    for (i = 0; i < cipe->n; i++) {
-+        HEVCRpiInterPredQ * const cp = cipe->q + i;
-+        qpu_mc_pred_c_s_t * const u = &cp->qpu_mc_base->c.s;
-+
-+        u->next_src1.x = 0;
-+        u->next_src1.y = 0;
-+        u->next_src1.base = 0;
-+        u->pic_cw = pic_width_c;
-+        u->pic_ch = pic_height_c;
-+        u->stride2 = av_rpi_sand_frame_stride2(s->frame);
-+        u->stride1 = av_rpi_sand_frame_stride1(s->frame);
-+        cp->last_l0 = &u->next_src1;
-+
-+        u->next_fn = 0;
-+        u->next_src2.x = 0;
-+        u->next_src2.y = 0;
-+        u->next_src2.base = 0;
-+        cp->last_l1 = &u->next_src2;
-+
-+        cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(u + 1);
-+    }
-+
-+    rpi_inter_pred_reset(yipe);
-+    for (i = 0; i < yipe->n; i++) {
-+        HEVCRpiInterPredQ * const yp = yipe->q + i;
-+        qpu_mc_pred_y_s_t * const y = &yp->qpu_mc_base->y.s;
-+
-+        y->next_src1.x = 0;
-+        y->next_src1.y = 0;
-+        y->next_src1.base = 0;
-+        y->next_src2.x = 0;
-+        y->next_src2.y = 0;
-+        y->next_src2.base = 0;
-+        y->pic_h = pic_height_y;
-+        y->pic_w = pic_width_y;
-+        y->stride2 = av_rpi_sand_frame_stride2(s->frame);
-+        y->stride1 = av_rpi_sand_frame_stride1(s->frame);
-+        y->next_fn = 0;
-+        yp->last_l0 = &y->next_src1;
-+        yp->last_l1 = &y->next_src2;
-+
-+        yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(y + 1);
-+    }
-+
-+    jb->last_y8_p = NULL;
-+    jb->last_y8_l1 = NULL;
-+
-+    for (i = 0; i != FF_ARRAY_ELEMS(jb->progress_req); ++i) {
-+        jb->progress_req[i] = -1;
-+    }
-+
-+    worker_pic_reset(&jb->coeffs);
-+}
-+
-+
-+#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C
-+static unsigned int mc_terminate_add_qpu(const HEVCRpiContext * const s,
-+                                     const vpu_qpu_job_h vqj,
-+                                     rpi_cache_flush_env_t * const rfe,
-+                                     HEVCRpiInterPredEnv * const ipe)
-+{
-+    unsigned int i;
-+    uint32_t mail[QPU_N_MAX][QPU_MAIL_EL_VALS];
-+    unsigned int max_block = 0;
-+
-+    if (!ipe->used) {
-+        return 0;
-+    }
-+
-+    if (ipe->curr != 0) {
-+        rpi_inter_pred_sync(ipe);
-+    }
-+
-+    // Add final commands to Q
-+    for(i = 0; i != ipe->n; ++i) {
-+        HEVCRpiInterPredQ * const yp = ipe->q + i;
-+        qpu_mc_src_t *const p0 = yp->last_l0;
-+        qpu_mc_src_t *const p1 = yp->last_l1;
-+        const unsigned int block_size = (char *)yp->qpu_mc_curr - (char *)yp->qpu_mc_base;
-+
-+        if (block_size > max_block)
-+            max_block = block_size;
-+
-+        qpu_mc_link_set(yp->qpu_mc_curr, yp->code_exit);
-+
-+        // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched
-+        p0->x = MC_DUMMY_X;
-+        p0->y = MC_DUMMY_Y;
-+        p0->base = s->qpu_dummy_frame_qpu;
-+        p1->x = MC_DUMMY_X;
-+        p1->y = MC_DUMMY_Y;
-+        p1->base = s->qpu_dummy_frame_qpu;
-+
-+        yp->last_l0 = NULL;
-+        yp->last_l1 = NULL;
-+
-+        // Add to mailbox list
-+        mail[i][0] = ipe->gptr.vc + ((uint8_t *)yp->qpu_mc_base - ipe->gptr.arm);
-+        mail[i][1] = yp->code_setup;
-+    }
-+
-+    // We don't need invalidate here as the uniforms aren't changed by the QPU
-+    // and leaving them in ARM cache avoids (pointless) pre-reads when writing
-+    // new values which seems to give us a small performance advantage
-+    //
-+    // In most cases we will not have a completely packed set of uniforms and as
-+    // we have a 2d invalidate we writeback all uniform Qs to the depth of the
-+    // fullest
-+    rpi_cache_flush_add_gm_blocks(rfe, &ipe->gptr, RPI_CACHE_FLUSH_MODE_WRITEBACK,
-+                                  (uint8_t *)ipe->q[0].qpu_mc_base - ipe->gptr.arm, max_block,
-+                                  ipe->n, ipe->max_fill + ipe->min_gap);
-+    vpu_qpu_job_add_qpu(vqj, ipe->n, (uint32_t *)mail);
-+
-+    return 1;
-+}
-+#endif
-+
-+#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C
-+static unsigned int mc_terminate_add_emu(const HEVCRpiContext * const s,
-+                                     const vpu_qpu_job_h vqj,
-+                                     rpi_cache_flush_env_t * const rfe,
-+                                     HEVCRpiInterPredEnv * const ipe)
-+{
-+    unsigned int i;
-+    if (!ipe->used) {
-+        return 0;
-+    }
-+
-+    if (ipe->curr != 0) {
-+        rpi_inter_pred_sync(ipe);
-+    }
-+
-+    // Add final commands to Q
-+    for(i = 0; i != ipe->n; ++i) {
-+        HEVCRpiInterPredQ * const yp = ipe->q + i;
-+        qpu_mc_src_t *const p0 = yp->last_l0;
-+        qpu_mc_src_t *const p1 = yp->last_l1;
-+
-+        yp->qpu_mc_curr->data[-1] = yp->code_exit;
-+
-+        // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched
-+        p0->x = MC_DUMMY_X;
-+        p0->y = MC_DUMMY_Y;
-+        p0->base = s->qpu_dummy_frame_emu;
-+        p1->x = MC_DUMMY_X;
-+        p1->y = MC_DUMMY_Y;
-+        p1->base = s->qpu_dummy_frame_emu;
-+
-+        yp->last_l0 = NULL;
-+        yp->last_l1 = NULL;
-+    }
-+
-+    return 1;
-+}
-+#endif
-+
-+
-+#if RPI_QPU_EMU_Y
-+#define mc_terminate_add_y mc_terminate_add_emu
-+#else
-+#define mc_terminate_add_y mc_terminate_add_qpu
-+#endif
-+#if RPI_QPU_EMU_C
-+#define mc_terminate_add_c mc_terminate_add_emu
-+#else
-+#define mc_terminate_add_c mc_terminate_add_qpu
-+#endif
-+
-+
-+static void flush_frame(HEVCRpiContext *s,AVFrame *frame)
-+{
-+    rpi_cache_buf_t cbuf;
-+    rpi_cache_flush_env_t * rfe = rpi_cache_flush_init(&cbuf);
-+    rpi_cache_flush_add_frame(rfe, frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE);
-+    rpi_cache_flush_finish(rfe);
-+}
-+
-+static void job_gen_bounds(const HEVCRpiContext * const s, HEVCRpiJob * const jb)
-+{
-+    const unsigned int rs0 = s->ps.pps->ctb_addr_ts_to_rs[jb->ctu_ts_first];
-+    const unsigned int rs1 = s->ps.pps->ctb_addr_ts_to_rs[jb->ctu_ts_last];
-+    const unsigned int ctb_width = s->ps.sps->ctb_width;
-+    RpiBlk *const bounds = &jb->bounds;
-+    av_assert1(jb->ctu_ts_first <= jb->ctu_ts_last);
-+    bounds->x = (rs0 % ctb_width) << s->ps.sps->log2_ctb_size;
-+    bounds->y = (rs0 / ctb_width) << s->ps.sps->log2_ctb_size;
-+    bounds->w = ((rs1 - rs0) % ctb_width + 1) << s->ps.sps->log2_ctb_size;
-+    bounds->h = ((rs1 - rs0) / ctb_width + 1) << s->ps.sps->log2_ctb_size;
-+
-+    bounds->w = FFMIN(bounds->w, s->ps.sps->width - bounds->x);
-+    bounds->h = FFMIN(bounds->h, s->ps.sps->height - bounds->y);
-+}
-+
-+#if RPI_PASSES == 2
-+static void worker_core2(HEVCRpiContext * const s, HEVCRpiJob * const jb)
-+{
-+    // Perform intra prediction and residual reconstruction
-+    rpi_execute_pred_cmds(s, jb);
-+
-+    // Perform deblocking for CTBs in this row
-+    rpi_execute_dblk_cmds(s, jb);
-+}
-+#endif
-+
-+// Core execution tasks
-+static void worker_core(const HEVCRpiContext * const s, HEVCRpiJob * const jb)
-+{
-+    int pred_y, pred_c;
-+    vpu_qpu_job_env_t qvbuf;
-+    const vpu_qpu_job_h vqj = vpu_qpu_job_init(&qvbuf);
-+#if RPI_WORKER_WAIT_PASS_0
-+    int do_wait;
-+#endif
-+
-+    {
-+        const HEVCRpiCoeffsEnv * const cf = &jb->coeffs;
-+        if (cf->s[3].n + cf->s[2].n != 0)
-+        {
-+            const unsigned int csize = sizeof(cf->s[3].buf[0]);
-+            const unsigned int offset32 = ((cf->s[3].buf - cf->s[2].buf) - cf->s[3].n) * csize;
-+            unsigned int n16 = (cf->s[2].n >> 8);
-+            unsigned int n32 = (cf->s[3].n >> 10);
-+#if RPI_COMPRESS_COEFFS
-+            if (cf->s[2].packed) {
-+                n16 = n16 | (n16<<16);
-+            } else {
-+                const unsigned int npack16 = (cf->s[2].packed_n>>8);
-+                n16 = n16 | (npack16<<16);
-+            }
-+            if (cf->s[3].packed) {
-+                n32 = n32 | (n32<<16);
-+            } else {
-+                const unsigned int npack32 = (cf->s[3].packed_n>>10);
-+                n32 = n32 | (npack32<<16);
-+            }
-+#endif
-+            vpu_qpu_job_add_vpu(vqj,
-+                vpu_get_fn(s->ps.sps->bit_depth),
-+                vpu_get_constants(),
-+                cf->gptr.vc,
-+                n16,
-+                cf->gptr.vc + offset32,
-+                n32,
-+                0);
-+
-+            rpi_cache_flush_add_gm_range(jb->rfe, &cf->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, 0, cf->s[2].n * csize);
-+            rpi_cache_flush_add_gm_range(jb->rfe, &cf->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, offset32, cf->s[3].n * csize);
-+        }
-+    }
-+
-+    pred_c = mc_terminate_add_c(s, vqj, jb->rfe, &jb->chroma_ip);
-+
-+// We could take a sync here and try to locally overlap QPU processing with ARM
-+// but testing showed a slightly negative benefit with noticable extra complexity
-+
-+    pred_y = mc_terminate_add_y(s, vqj, jb->rfe, &jb->luma_ip);
-+
-+    // Returns 0 if nothing to do, 1 if sync added
-+#if RPI_WORKER_WAIT_PASS_0
-+    do_wait = vpu_qpu_job_add_sync_sem(vqj, &jb->sem);
-+#else
-+    if (vpu_qpu_job_add_sync_sem(vqj, &jb->sem) == 0)
-+        sem_post(&jb->sem);
-+#endif
-+
-+    rpi_cache_flush_execute(jb->rfe);
-+
-+    // Await progress as required
-+    // jb->waited will only be clear if we have already tested the progress values
-+    // (in worker_submit_job) and found we don't have to wait
-+    if (jb->waited)
-+    {
-+        unsigned int i;
-+        for (i = 0; i != FF_ARRAY_ELEMS(jb->progress_req); ++i) {
-+            if (jb->progress_req[i] >= 0) {
-+                ff_hevc_rpi_progress_wait_recon(s, jb, s->DPB + i, jb->progress_req[i]);
-+            }
-+        }
-+    }
-+
-+    vpu_qpu_job_finish(vqj);
-+
-+    // We always work on a rectangular block
-+    if (pred_y || pred_c)
-+    {
-+        rpi_cache_flush_add_frame_block(jb->rfe, s->frame, RPI_CACHE_FLUSH_MODE_INVALIDATE,
-+                                        jb->bounds.x, jb->bounds.y, jb->bounds.w, jb->bounds.h,
-+                                        ctx_vshift(s, 1), pred_y, pred_c);
-+    }
-+
-+    // If we have emulated VPU ops - do it here
-+#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C
-+    if (av_rpi_is_sand8_frame(s->frame))
-+    {
-+#if RPI_QPU_EMU_Y && RPI_QPU_EMU_C
-+        ff_hevc_rpi_shader_c8(s, &jb->luma_ip, &jb->chroma_ip);
-+#elif RPI_QPU_EMU_Y
-+        ff_hevc_rpi_shader_c8(s, &jb->luma_ip, NULL);
-+#else
-+        ff_hevc_rpi_shader_c8(s, NULL, &jb->chroma_ip);
-+#endif
-+    }
-+    else
-+    {
-+#if RPI_QPU_EMU_Y && RPI_QPU_EMU_C
-+        ff_hevc_rpi_shader_c16(s, &jb->luma_ip, &jb->chroma_ip);
-+#elif RPI_QPU_EMU_Y
-+        ff_hevc_rpi_shader_c16(s, &jb->luma_ip, NULL);
-+#else
-+        ff_hevc_rpi_shader_c16(s, NULL, &jb->chroma_ip);
-+#endif
-+    }
-+#endif
-+
-+#if RPI_WORKER_WAIT_PASS_0
-+    if (do_wait)
-+        rpi_sem_wait(&jb->sem);
-+    rpi_cache_flush_execute(jb->rfe);
-+#endif
-+}
-+
-+
-+static void rpi_free_inter_pred(HEVCRpiInterPredEnv * const ipe)
-+{
-+    av_freep(&ipe->q);
-+    gpu_free(&ipe->gptr);
-+}
-+
-+static HEVCRpiJob * job_new(void)
-+{
-+    HEVCRpiJob * const jb = av_mallocz(sizeof(HEVCRpiJob));
-+
-+    if (jb == NULL)
-+        return NULL;
-+
-+    sem_init(&jb->sem, 0, 0);
-+    jb->rfe = rpi_cache_flush_init(&jb->flush_buf);
-+    ff_hevc_rpi_progress_init_wait(&jb->progress_wait);
-+
-+    jb->intra.n = 0;
-+    if ((jb->intra.cmds = av_mallocz(sizeof(HEVCPredCmd) * RPI_MAX_PRED_CMDS)) == NULL)
-+        goto fail1;
-+
-+    // * Sizeof the union structure might be overkill but at the moment it
-+    //   is correct (it certainly isn't going to be too small)
-+    // Set max fill to slack/2 from the end of the Q
-+    // If we exceed this in any Q then we will schedule by size (which should
-+    // mean that we never use that Q again part from syncs)
-+    // * Given how agressive the overflow resonse is we could maybe put the
-+    //   threshold even nearer the end, but I don't expect us to ever hit
-+    //   it on any real stream anyway.
-+
-+    if (rpi_inter_pred_alloc(&jb->chroma_ip,
-+                         QPU_N_MAX, QPU_N_GRP,
-+                         QPU_C_COMMANDS * sizeof(qpu_mc_pred_c_t) + QPU_C_SYNCS * sizeof(uint32_t),
-+                         QPU_C_CMD_SLACK_PER_Q * sizeof(qpu_mc_pred_c_t) / 2) != 0)
-+        goto fail2;
-+    if (rpi_inter_pred_alloc(&jb->luma_ip,
-+                         QPU_N_MAX,  QPU_N_GRP,
-+                         QPU_Y_COMMANDS * sizeof(qpu_mc_pred_y_t) + QPU_Y_SYNCS * sizeof(uint32_t),
-+                         QPU_Y_CMD_SLACK_PER_Q * sizeof(qpu_mc_pred_y_t) / 2) != 0)
-+        goto fail3;
-+
-+    return jb;
-+
-+fail3:
-+    rpi_free_inter_pred(&jb->luma_ip);
-+fail2:
-+    av_freep(&jb->intra.cmds);
-+fail1:
-+    ff_hevc_rpi_progress_kill_wait(&jb->progress_wait);
-+    rpi_cache_flush_finish(jb->rfe);
-+    sem_destroy(&jb->sem);
-+    return NULL;
-+}
-+
-+static void job_delete(HEVCRpiJob * const jb)
-+{
-+    worker_pic_free_one(jb);
-+    ff_hevc_rpi_progress_kill_wait(&jb->progress_wait);
-+    rpi_free_inter_pred(&jb->chroma_ip);
-+    rpi_free_inter_pred(&jb->luma_ip);
-+    av_freep(&jb->intra.cmds);
-+    rpi_cache_flush_finish(jb->rfe);  // Not really needed - should do nothing
-+    sem_destroy(&jb->sem);
-+    av_free(jb);
-+}
-+
-+static void jbg_delete(HEVCRpiJobGlobal * const jbg)
-+{
-+    HEVCRpiJob * jb;
-+
-+    if (jbg == NULL)
-+        return;
-+
-+    jb = jbg->free1;
-+    while (jb != NULL)
-+    {
-+        HEVCRpiJob * const jb2 = jb;
-+        jb = jb2->next;
-+        job_delete(jb2);
-+    }
-+
-+    pthread_mutex_destroy(&jbg->lock);
-+    av_free(jbg);
-+}
-+
-+static HEVCRpiJobGlobal * jbg_new(unsigned int job_count)
-+{
-+    HEVCRpiJobGlobal * const jbg = av_mallocz(sizeof(HEVCRpiJobGlobal));
-+    if (jbg == NULL)
-+        return NULL;
-+
-+    pthread_mutex_init(&jbg->lock, NULL);
-+
-+    while (job_count-- != 0)
-+    {
-+        HEVCRpiJob * const jb = job_new();
-+        if (jb == NULL)
-+            goto fail;
-+
-+        jb->next = jbg->free1;
-+        jbg->free1 = jb;
-+    }
-+
-+    return jbg;
-+
-+fail:
-+    jbg_delete(jbg);
-+    return NULL;
-+}
-+
-+static void rpi_job_ctl_delete(HEVCRpiJobCtl * const jbc)
-+{
-+    HEVCRpiJobGlobal * jbg;
-+
-+    if (jbc == NULL)
-+        return;
-+
-+    jbg = jbc->jbg;
-+
-+    if (jbc->jb1 != NULL)
-+        job_delete(jbc->jb1);
-+
-+    pthread_mutex_destroy(&jbc->in_lock);
-+    sem_destroy(&jbc->sem_out);
-+    av_free(jbc);
-+
-+    // Deref the global job context
-+    if (jbg != NULL && atomic_fetch_add(&jbg->ref_count, -1) == 1)
-+        jbg_delete(jbg);
-+}
-+
-+static HEVCRpiJobCtl * rpi_job_ctl_new(HEVCRpiJobGlobal *const jbg)
-+{
-+    HEVCRpiJobCtl * const jbc = av_mallocz(sizeof(HEVCRpiJobCtl));
-+
-+    if (jbc == NULL)
-+        return NULL;
-+
-+    jbc->jbg = jbg;
-+    atomic_fetch_add(&jbg->ref_count, 1);
-+
-+    sem_init(&jbc->sem_out, 0, RPI_MAX_JOBS);
-+    pthread_mutex_init(&jbc->in_lock, NULL);
-+
-+    if ((jbc->jb1 = job_new()) == NULL)
-+        goto fail;
-+    jbc->jb1->jbc_local = jbc;
-+
-+    return jbc;
-+
-+fail:
-+    rpi_job_ctl_delete(jbc);
-+    return NULL;
-+}
-+
-+
-+
-+static av_cold void hevc_init_worker(HEVCRpiContext * const s)
-+{
-+#if RPI_PASSES == 2
-+    pass_queue_init(s->passq + 1, s, worker_core2, &s->jbc->sem_out, 1);
-+#elif RPI_PASSES == 3
-+    pass_queue_init(s->passq + 2, s, rpi_execute_dblk_cmds, &s->jbc->sem_out, 2);
-+    pass_queue_init(s->passq + 1, s, rpi_execute_pred_cmds, &s->passq[2].sem_in, 1);
-+#else
-+#error Passes confused
-+#endif
-+    pass_queue_init(s->passq + 0, s, worker_core, &s->passq[1].sem_in, 0);
-+
-+    pass_queues_start_all(s);
-+}
-+
-+static av_cold void hevc_exit_worker(HEVCRpiContext *s)
-+{
-+    pass_queues_term_all(s);
-+
-+    pass_queues_kill_all(s);
-+
-+    rpi_job_ctl_delete(s->jbc);
-+    s->jbc = NULL;
-+}
-+
-+
-+static int slice_start(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc)
-+{
-+    const int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr];
-+    const int tiles = s->ps.pps->num_tile_rows * s->ps.pps->num_tile_columns;
-+    const unsigned int tile_id = s->ps.pps->tile_id[ctb_addr_ts];
-+
-+    // Check for obvious disasters
-+    if (ctb_addr_ts == 0 && s->sh.dependent_slice_segment_flag) {
-+        av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n");
-+        return AVERROR_INVALIDDATA;
-+    }
-+
-+    // If dependant then ctb_addr_ts != 0 from previous check
-+    if (s->sh.dependent_slice_segment_flag) {
-+        int prev_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts - 1];
-+        if (s->tab_slice_address[prev_rs] != s->sh.slice_addr) {
-+            av_log(s->avctx, AV_LOG_ERROR, "Previous slice segment missing\n");
-+            return AVERROR_INVALIDDATA;
-+        }
-+    }
-+
-+    if (!s->ps.pps->entropy_coding_sync_enabled_flag &&
-+        tile_id + s->sh.num_entry_point_offsets >= tiles)
-+    {
-+        av_log(s->avctx, AV_LOG_ERROR, "Entry points exceed tiles\n");
-+        return AVERROR_INVALIDDATA;
-+    }
-+
-+    // Tiled stuff must start at start of tile if it has multiple entry points
-+    if (!s->ps.pps->entropy_coding_sync_enabled_flag &&
-+        s->sh.num_entry_point_offsets != 0 &&
-+        ctb_addr_ts != s->ps.pps->tile_pos_ts[tile_id])
-+    {
-+        av_log(s->avctx, AV_LOG_ERROR, "Multiple tiles in slice; slice start != tile start\n");
-+        return AVERROR_INVALIDDATA;
-+    }
-+
-+    ff_hevc_rpi_cabac_init_decoder(lc);
-+
-+    // Setup any required decode vars
-+    lc->cabac_init_req = !s->sh.dependent_slice_segment_flag;
-+
-+//    printf("SS: req=%d, sol=%d, sot=%d\n", lc->cabac_init_req, sol, sot);
-+    lc->qp_y = s->sh.slice_qp;
-+
-+    // General setup
-+    lc->bt_line_no = 0;
-+    lc->ts = ctb_addr_ts;
-+    return 0;
-+}
-+
-+static int gen_entry_points(HEVCRpiContext * const s, const H2645NAL * const nal)
-+{
-+    const GetBitContext * const gb = &s->HEVClc->gb;
-+    RpiSliceHeader * const sh = &s->sh;
-+    int i, j;
-+
-+    const unsigned int length = nal->size;
-+    unsigned int offset = ((gb->index) >> 3) + 1;  // We have a bit & align still to come = +1 byte
-+    unsigned int cmpt;
-+    unsigned int startheader;
-+
-+    if (sh->num_entry_point_offsets == 0) {
-+        s->data = NULL;
-+        return 0;
-+    }
-+
-+    // offset in slice header includes emulation prevention bytes.
-+    // Unfortunately those have been removed by the time we get here so we
-+    // have to compensate.  The nal layer keeps a track of where they were.
-+    for (j = 0, cmpt = 0, startheader = offset + sh->entry_point_offset[0]; j < nal->skipped_bytes; j++) {
-+        if (nal->skipped_bytes_pos[j] >= offset && nal->skipped_bytes_pos[j] < startheader) {
-+            startheader--;
-+            cmpt++;
-+        }
-+    }
-+
-+    for (i = 1; i < sh->num_entry_point_offsets; i++) {
-+        offset += (sh->entry_point_offset[i - 1] - cmpt);
-+        for (j = 0, cmpt = 0, startheader = offset + sh->entry_point_offset[i]; j < nal->skipped_bytes; j++) {
-+            if (nal->skipped_bytes_pos[j] >= offset && nal->skipped_bytes_pos[j] < startheader) {
-+                startheader--;
-+                cmpt++;
-+            }
-+        }
-+        if (sh->entry_point_offset[i] <= cmpt) {
-+            av_log(s->avctx, AV_LOG_ERROR, "entry point offset <= skipped bytes\n");
-+            return AVERROR_INVALIDDATA;
-+        }
-+        sh->size[i - 1] = sh->entry_point_offset[i] - cmpt;
-+        sh->offset[i - 1] = offset;
-+    }
-+
-+    offset += sh->entry_point_offset[sh->num_entry_point_offsets - 1] - cmpt;
-+    if (length < offset) {
-+        av_log(s->avctx, AV_LOG_ERROR, "entry_point_offset table is corrupted\n");
-+        return AVERROR_INVALIDDATA;
-+    }
-+    sh->size[sh->num_entry_point_offsets - 1] = length - offset;
-+    sh->offset[sh->num_entry_point_offsets - 1] = offset;
-+
-+    // Remember data start pointer as we won't have nal later
-+    s->data = nal->data;
-+    return 0;
-+}
-+
-+
-+// Return
-+// < 0   Error
-+// 0     OK
-+//
-+// jb->ctu_ts_last < 0       Job still filling
-+// jb->ctu_ts_last >= 0      Job ready
-+
-+static int fill_job(HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, unsigned int max_blocks)
-+{
-+    const unsigned int log2_ctb_size = s->ps.sps->log2_ctb_size;
-+    const unsigned int ctb_size = (1 << log2_ctb_size);
-+    HEVCRpiJob * const jb = lc->jb0;
-+    int more_data = 1;
-+    unsigned int ctb_addr_ts = lc->ts;
-+    unsigned int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts];
-+    unsigned int x_ctb = (ctb_addr_rs % s->ps.sps->ctb_width) << log2_ctb_size;
-+    const unsigned int y_ctb = (ctb_addr_rs / s->ps.sps->ctb_width) << log2_ctb_size;
-+
-+    lc->unit_done = 0;
-+
-+    while (more_data && ctb_addr_ts < s->ps.sps->ctb_size)
-+    {
-+        int q_full;
-+        const unsigned int ctb_flags = s->ps.pps->ctb_ts_flags[ctb_addr_ts];
-+
-+        hls_decode_neighbour(s, lc, x_ctb, y_ctb, ctb_addr_ts);
-+
-+        ff_hevc_rpi_cabac_init(s, lc, ctb_flags);
-+
-+        hls_sao_param(s, lc, x_ctb >> log2_ctb_size, y_ctb >> log2_ctb_size);
-+
-+        s->deblock[ctb_addr_rs].beta_offset = s->sh.beta_offset;
-+        s->deblock[ctb_addr_rs].tc_offset   = s->sh.tc_offset;
-+        s->filter_slice_edges[ctb_addr_rs]  = s->sh.slice_loop_filter_across_slices_enabled_flag;
-+
-+        // Zap stashes if navail
-+        if ((lc->ctb_avail & AVAIL_U) == 0)
-+            zap_cabac_stash(s->cabac_stash_up + (x_ctb >> 3), log2_ctb_size - 3);
-+        if ((lc->ctb_avail & AVAIL_L) == 0)
-+        {
-+            memset(lc->ipm_left, INTRA_DC, IPM_TAB_SIZE);
-+            zap_cabac_stash(s->cabac_stash_left + (y_ctb >> 3), log2_ctb_size - 3);
-+        }
-+#if MVF_STASH_WIDTH > 64
-+        // Restore left mvf stash at start of tile if not at start of line
-+        if ((ctb_flags & CTB_TS_FLAGS_SOTL) != 0 && x_ctb != 0 && !s->is_irap)
-+        {
-+            unsigned int i;
-+            HEVCRpiMvField * dst = mvf_stash_ptr(s, lc, x_ctb - 1, 0);
-+            const HEVCRpiMvField * src = s->mvf_left + (y_ctb >> LOG2_MIN_PU_SIZE);
-+            for (i = 0; i != ctb_size >> LOG2_MIN_PU_SIZE; ++i)
-+            {
-+                *dst = *src++;
-+                dst += MVF_STASH_WIDTH_PU;
-+            }
-+        }
-+#endif
-+
-+        // Set initial tu states
-+        lc->tu.cu_qp_delta = 0;
-+        lc->tu.is_cu_qp_delta_wanted = 0;
-+        lc->tu.cu_chroma_qp_offset_wanted = 0;
-+
-+        // Decode
-+        more_data = hls_coding_quadtree(s, lc, x_ctb, y_ctb, log2_ctb_size, 0);
-+
-+        if (ff_hevc_rpi_cabac_overflow(lc))
-+        {
-+            av_log(s->avctx, AV_LOG_ERROR, "Quadtree bitstream overread\n ");
-+            more_data = AVERROR_INVALIDDATA;
-+        }
-+
-+        if (more_data < 0) {
-+            s->tab_slice_address[ctb_addr_rs] = TAB_SLICE_ADDR_BROKEN;  // Mark slice as broken
-+            return more_data;
-+        }
-+
-+        if (more_data && ((ctb_flags & CTB_TS_FLAGS_EOT) != 0 ||
-+             (s->ps.pps->entropy_coding_sync_enabled_flag && (ctb_flags & CTB_TS_FLAGS_EOTL) != 0)))
-+        {
-+            if (ff_hevc_rpi_get_cabac_terminate(&lc->cc) < 0 ||
-+                ff_hevc_rpi_cabac_skip_bytes(&lc->cc, 0) == NULL)
-+            {
-+                av_log(s->avctx, AV_LOG_ERROR, "Error reading terminate el\n ");
-+                return -1;
-+            }
-+        }
-+
-+        // --- Post CTB processing
-+
-+        // Stash rpl top/left for deblock that needs to remember such things cross-slice
-+        s->rpl_up[x_ctb >> log2_ctb_size] = s->refPicList;
-+        s->rpl_left[y_ctb >> log2_ctb_size] = s->refPicList;
-+
-+        if (!s->is_irap)
-+        {
-+            // Copy MVF up to up-left & stash to up
-+            {
-+                const HEVCRpiMvField * src = mvf_stash_ptr(s, lc, x_ctb, ctb_size - 1);
-+                HEVCRpiMvField * dst = s->mvf_up + (x_ctb >> LOG2_MIN_PU_SIZE);
-+
-+    //            printf("Stash: %d,%d, ctb_size=%d, %p->%p\n", x_ctb, y_ctb, ctb_size, src, dst);
-+
-+                lc->mvf_ul[0] = dst[(ctb_size - 1) >> LOG2_MIN_PU_SIZE];
-+                memcpy(dst, src, (sizeof(*src)*ctb_size) >> LOG2_MIN_PU_SIZE);
-+            }
-+            // Stash sideways if end of tile line but not end of line (no point)
-+            // ** Could/should do this @ end of fn
-+#if MVF_STASH_WIDTH > 64
-+            if ((ctb_flags & (CTB_TS_FLAGS_EOTL | CTB_TS_FLAGS_EOL)) == CTB_TS_FLAGS_EOTL)
-+#endif
-+            {
-+                unsigned int i;
-+                const HEVCRpiMvField * src = mvf_stash_ptr(s, lc, x_ctb + ctb_size - 1, 0);
-+                HEVCRpiMvField * dst = s->mvf_left + (y_ctb >> LOG2_MIN_PU_SIZE);
-+                for (i = 0; i != ctb_size >> LOG2_MIN_PU_SIZE; ++i)
-+                {
-+                    *dst++ = *src;
-+                    src += MVF_STASH_WIDTH_PU;
-+                }
-+            }
-+        }
-+
-+        if ((ctb_flags & CTB_TS_FLAGS_CSAVE) != 0)
-+            ff_hevc_rpi_save_states(s, lc);
-+
-+        // Report progress so we can use our MVs in other frames
-+        if ((ctb_flags & CTB_TS_FLAGS_EOL) != 0)
-+            ff_hevc_rpi_progress_signal_mv(s, y_ctb + ctb_size - 1);
-+
-+        // End of line || End of tile line || End of tile
-+        // (EoL covers end of frame for our purposes here)
-+        q_full = ((ctb_flags & CTB_TS_FLAGS_EOTL) != 0);
-+
-+        // Allocate QPU chunks on fixed size 64 pel boundries rather than
-+        // whatever ctb_size is today.
-+        // * We might quite like to continue to 64 pel vertical too but that
-+        //   currently confuses WPP
-+        if (((x_ctb + ctb_size) & 63) == 0 || q_full)
-+        {
-+            int overflow = 0;
-+            if (rpi_inter_pred_next_ctu(&jb->luma_ip) != 0)
-+                overflow = 1;
-+            if (rpi_inter_pred_next_ctu(&jb->chroma_ip) != 0)
-+                overflow = 1;
-+            if (overflow)
-+            {
-+                // * This is very annoying (and slow) to cope with in WPP so
-+                //   we treat it as an error there (no known stream triggers this
-+                //   with the current buffer sizes).  Non-wpp should cope fine.
-+                av_log(s->avctx, AV_LOG_WARNING,  "%s: Q full before EoL\n", __func__);
-+                q_full = 1;
-+            }
-+        }
-+
-+        // Inc TS to next.
-+        ctb_addr_ts++;
-+        ctb_addr_rs++;
-+        x_ctb += ctb_size;
-+
-+        if (q_full)
-+        {
-+            // Do job
-+            // Prep for submission
-+            jb->ctu_ts_last = ctb_addr_ts - 1;  // Was pre-inced
-+            job_gen_bounds(s, jb);
-+            break;
-+        }
-+
-+        // If max_blocks started as 0 then this will never be true
-+        if (--max_blocks == 0)
-+            break;
-+    }
-+
-+    lc->unit_done = (more_data <= 0);
-+    lc->ts = ctb_addr_ts;
-+    return 0;
-+}
-+
-+static void bt_lc_init(HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const unsigned int n)
-+{
-+    lc->context = s;
-+    lc->jb0 = NULL;
-+    lc->lc_n = n;
-+    lc->bt_terminate = 0;
-+    lc->bt_psem_out = NULL;
-+    sem_init(&lc->bt_sem_in, 0, 0);
-+}
-+
-+#define TRACE_WPP 0
-+#if RPI_EXTRA_BIT_THREADS > 0
-+static inline unsigned int line_ts_width(const HEVCRpiContext * const s, unsigned int ts)
-+{
-+    unsigned int rs = s->ps.pps->ctb_addr_ts_to_rs[ts];
-+    return s->ps.pps->column_width[s->ps.pps->col_idxX[rs % s->ps.sps->ctb_width]];
-+}
-+
-+// Move local context parameters from an aux bit thread back to the main
-+// thread at the end of a slice as processing is going to continue there.
-+static void movlc(HEVCRpiLocalContext *const dst_lc, HEVCRpiLocalContext *const src_lc, const int is_dep)
-+{
-+    if (src_lc == dst_lc) {
-+        return;
-+    }
-+
-+    // Move the job
-+    // We will still have an active job if the final line terminates early
-+    // Dest should always be null by now
-+    av_assert1(dst_lc->jb0 == NULL);
-+    dst_lc->jb0 = src_lc->jb0;
-+    src_lc->jb0 = NULL;
-+
-+    // Always need to store where we are in the bitstream
-+    dst_lc->ts = src_lc->ts;
-+    dst_lc->gb = src_lc->gb;
-+    // Cabac init request will be built at start of next slice
-+
-+    // Need to store context if we might have a dependent seg
-+    if (is_dep)
-+    {
-+        dst_lc->qPy_pred = src_lc->qPy_pred;
-+        memcpy(dst_lc->ipm_left, src_lc->ipm_left, sizeof(src_lc->ipm_left));
-+        memcpy(dst_lc->cabac_state, src_lc->cabac_state, sizeof(src_lc->cabac_state));
-+        memcpy(dst_lc->stat_coeff, src_lc->stat_coeff, sizeof(src_lc->stat_coeff));
-+    }
-+}
-+
-+static inline int wait_bt_sem_in(HEVCRpiLocalContext * const lc)
-+{
-+    rpi_sem_wait(&lc->bt_sem_in);
-+    return lc->bt_terminate;
-+}
-+
-+// Do one WPP line
-+// Will not work correctly over horizontal tile boundries - vertical should be OK
-+static int rpi_run_one_line(HEVCRpiContext *const s, HEVCRpiLocalContext * const lc, const int is_first)
-+{
-+    const int is_tile = lc->bt_is_tile;
-+    const unsigned int tile_id = s->ps.pps->tile_id[lc->ts];
-+    const unsigned int line = lc->bt_line_no;
-+    const unsigned int line_inc = lc->bt_line_inc;
-+    const int is_last = (line >= lc->bt_last_line);
-+
-+    const unsigned int ts_eol = lc->ts + (is_tile ? s->ps.pps->tile_size[tile_id] : lc->bt_line_width);
-+    const unsigned int ts_next =
-+        line + line_inc > (unsigned int)s->sh.num_entry_point_offsets ?
-+            INT_MAX :
-+        is_tile ?
-+            s->ps.pps->tile_pos_ts[tile_id + line_inc] :
-+            lc->ts + lc->bt_line_width * line_inc;
-+    // Tile wants line, WPP a few CTUs (must be >= 2 for cabac context to work)
-+    const unsigned int partial_size = is_tile ? line_ts_width(s, lc->ts) : 2;
-+    unsigned int ts_prev;
-+    int loop_n = 0;
-+    int err = 0;
-+
-+    av_assert1(line <= s->sh.num_entry_point_offsets);
-+
-+#if TRACE_WPP
-+    printf("%s[%d]: Start %s: tile=%d, line=%d/%d/%d, ts=%d/%d/%d, width=%d, jb=%p\n", __func__,
-+           lc->lc_n,  is_tile ? "Tile" : "WPP", tile_id,
-+           line, lc->bt_last_line, s->sh.num_entry_point_offsets,
-+           lc->ts, ts_eol, ts_next, partial_size, lc->jb0);
-+#endif
-+    if (line != 0)
-+    {
-+        const uint8_t * const data = s->data + s->sh.offset[line - 1];
-+        const unsigned int len = s->sh.size[line - 1];
-+        if ((err = init_get_bits8(&lc->gb, data, len)) < 0)
-+            return err;
-+
-+        ff_init_cabac_decoder(&lc->cc, data, len);
-+    }
-+
-+    // We should never be processing a dependent slice here so reset is good
-+    // ?? These probably shouldn't be needed (as they should be set by later
-+    //    logic) but do seem to be required
-+    lc->qp_y = s->sh.slice_qp;
-+
-+    do
-+    {
-+        if (!is_last && loop_n > 1) {
-+#if TRACE_WPP
-+            printf("%s[%d]: %sPoke %p\n", __func__, lc->lc_n, err == 0 ? "" : "ERR: ", lc->bt_psem_out);
-+#endif
-+            sem_post(lc->bt_psem_out);
-+        }
-+        // The wait for loop_n == 0 has been done in bit_thread
-+        if (!is_first && loop_n != 0)
-+        {
-+#if TRACE_WPP
-+            printf("%s[%d]: %sWait %p\n", __func__, lc->lc_n, err == 0 ? "" : "ERR: ", &lc->bt_sem_in);
-+#endif
-+            if (wait_bt_sem_in(lc) != 0)
-+                return AVERROR_EXIT;
-+        }
-+
-+#if TRACE_WPP
-+        {
-+            int n;
-+            sem_getvalue(&lc->bt_sem_in, &n);
-+            printf("%s[%d]: ts=%d, sem=%d %p\n", __func__, lc->lc_n, lc->ts, n, &lc->bt_sem_in);
-+        }
-+#endif
-+
-+        ts_prev = lc->ts;
-+
-+        // If we have had an error - do no further decode but do continue
-+        // moving signals around so the other threads continue to operate
-+        // correctly (or at least as correctly as they can with this line missing)
-+        //
-+        // Errors in WPP/Tile are less fatal than normal as we have a good idea
-+        // of how to restart on the next line so there is no need to give up totally
-+        if (err != 0)
-+        {
-+            lc->unit_done = 0;
-+            lc->ts += partial_size;
-+        }
-+        else
-+        {
-+            worker_pass0_ready(s, lc);
-+
-+            if ((err = fill_job(s, lc, partial_size)) < 0 ||
-+                (lc->ts < ts_eol && !is_last && (lc->ts != ts_prev + partial_size || lc->unit_done)))
-+            {
-+                if (err == 0) {
-+                    av_log(s->avctx, AV_LOG_ERROR, "Unexpected end of tile/wpp section\n");
-+                    err = AVERROR_INVALIDDATA;
-+                }
-+                worker_free(s, lc);
-+                lc->ts = ts_prev + partial_size;  // Pretend we did all that
-+                lc->unit_done = 0;
-+            }
-+            else if (is_tile)
-+            {
-+                worker_submit_job(s, lc);
-+            }
-+        }
-+
-+        ++loop_n;
-+    } while (lc->ts < ts_eol && !lc->unit_done);
-+
-+    // If we are on the last line & we didn't get a whole line we must wait for
-+    // and sink the sem_posts from the line above / tile to the left.
-+    while ((ts_prev += partial_size) < ts_eol)
-+    {
-+#if TRACE_WPP
-+        printf("%s[%d]: EOL Wait: ts=%d %p\n", __func__, lc->lc_n, ts_prev, &lc->bt_sem_in);
-+#endif
-+        if (wait_bt_sem_in(lc) != 0)
-+            return AVERROR_EXIT;
-+    }
-+
-+    lc->bt_line_no += line_inc;
-+
-+    if (!is_tile && err == 0)
-+        worker_submit_job(s, lc);
-+
-+    if (!is_last) {
-+        lc->ts = ts_next;
-+
-+#if TRACE_WPP
-+        printf("%s[%d]: Poke post submit %p\n", __func__, lc->lc_n, lc->bt_psem_out);
-+#endif
-+        sem_post(lc->bt_psem_out);
-+        if (loop_n > 1) {
-+#if TRACE_WPP
-+            printf("%s[%d]: Poke post submit2 %p\n", __func__, lc->lc_n, lc->bt_psem_out);
-+#endif
-+            sem_post(lc->bt_psem_out);
-+        }
-+    }
-+    else
-+    {
-+        movlc(s->HEVClcList[0], lc, s->ps.pps->dependent_slice_segments_enabled_flag);  // * & not EoT
-+#if MVF_STASH_WIDTH > 64
-+        // Horrid calculations to work out what we want but luckily this should almost never execute
-+        // **** Move to movlc
-+        if (!s->is_irap)
-+        {
-+            const unsigned int ctb_flags = s->ps.pps->ctb_ts_flags[lc->ts];
-+            if ((ctb_flags & CTB_TS_FLAGS_EOTL) == 0) // If EOTL then we have already stashed mvf
-+            {
-+                const unsigned int x_ctb = ((s->ps.pps->ctb_addr_ts_to_rs[lc->ts] % s->ps.sps->ctb_width) << s->ps.sps->log2_ctb_size) - 1;
-+                unsigned int i;
-+                const HEVCRpiMvField *s_mvf = lc->mvf_stash + ((x_ctb >> LOG2_MIN_PU_SIZE) & (MVF_STASH_WIDTH_PU - 1));
-+                HEVCRpiMvField *d_mvf = s->HEVClcList[0]->mvf_stash + ((x_ctb >> LOG2_MIN_PU_SIZE) & (MVF_STASH_WIDTH_PU - 1));
-+
-+                for (i = 0; i != MVF_STASH_HEIGHT_PU; ++i)
-+                {
-+                    *d_mvf = *s_mvf;
-+                    d_mvf += MVF_STASH_WIDTH_PU;
-+                    s_mvf += MVF_STASH_WIDTH_PU;
-+                }
-+
-+            }
-+        }
-+#endif
-+        // When all done poke the thread 0 sem_in one final time
-+#if TRACE_WPP
-+        printf("%s[%d]: Poke final %p\n", __func__, lc->lc_n, &s->HEVClcList[0]->bt_sem_in);
-+#endif
-+        sem_post(&s->HEVClcList[0]->bt_sem_in);
-+    }
-+
-+#if TRACE_WPP
-+    printf("%s[%d]: End. dep=%d\n", __func__, lc->lc_n, s->ps.pps->dependent_slice_segments_enabled_flag);
-+#endif
-+    return err;
-+}
-+
-+static void wpp_setup_lcs(HEVCRpiContext * const s)
-+{
-+    unsigned int ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr];
-+    const unsigned int line_width = line_ts_width(s, ts);
-+
-+    for (int i = 0; i <= s->sh.num_entry_point_offsets && i < RPI_BIT_THREADS; ++i)
-+    {
-+        HEVCRpiLocalContext * const lc = s->HEVClcList[i];
-+        lc->ts = ts;
-+        lc->bt_is_tile = 0;
-+        lc->bt_line_no = i;
-+        lc->bt_line_width = line_width;
-+        lc->bt_last_line = s->sh.num_entry_point_offsets;
-+        lc->bt_line_inc = RPI_BIT_THREADS;
-+        ts += line_width;
-+    }
-+}
-+
-+
-+// Can only process tile single row at once
-+static void tile_one_row_setup_lcs(HEVCRpiContext * const s, unsigned int slice_row)
-+{
-+    const HEVCRpiPPS * const pps = s->ps.pps;
-+    const unsigned int ts0 = pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr];
-+    const unsigned int tile0 = pps->tile_id[ts0];
-+    const unsigned int col0 = tile0 % pps->num_tile_columns;
-+
-+    const unsigned int col = (slice_row == 0) ? col0 : 0;
-+    unsigned int line = slice_row * pps->num_tile_columns - col0 + col;
-+    const unsigned int last_line = FFMIN(
-+        line + pps->num_tile_columns - 1 - col, s->sh.num_entry_point_offsets);
-+
-+    const unsigned int par =
-+        FFMIN(RPI_BIT_THREADS, last_line + 1 - line);
-+#if TRACE_WPP
-+    printf("ts0=%d, ents=%d, row=%d, tiles=%dx%d, col=%d, par=%d, line=%d/%d\n", ts0, s->sh.num_entry_point_offsets, slice_row,
-+           pps->num_tile_columns, pps->num_tile_rows, col, par, line, last_line);
-+#endif
-+    for (unsigned int i = 0; i != par; ++i, ++line)
-+    {
-+        HEVCRpiLocalContext * const lc = s->HEVClcList[i];
-+        const unsigned int tile = tile0 + line;
-+
-+        lc->ts = pps->tile_pos_ts[tile];
-+        lc->bt_line_no = line;
-+        lc->bt_is_tile = 1;
-+        lc->bt_line_width = line_ts_width(s, lc->ts);
-+        lc->bt_last_line = last_line;
-+        lc->bt_line_inc = par;
-+    }
-+}
-+
-+
-+static void * bit_thread(void * v)
-+{
-+    HEVCRpiLocalContext * const lc = v;
-+    HEVCRpiContext *const s = lc->context;
-+
-+    while (wait_bt_sem_in(lc) == 0)
-+    {
-+        int err;
-+
-+        if ((err = rpi_run_one_line(s, lc, 0)) < 0) {  // Never first tile/wpp
-+            if (lc->bt_terminate) {
-+                av_log(s->avctx, AV_LOG_ERROR, "%s: Unexpected termination\n", __func__);
-+                break;
-+            }
-+            av_log(s->avctx, AV_LOG_WARNING, "%s: Decode failure: %d\n", __func__, err);
-+        }
-+    }
-+
-+    return NULL;
-+}
-+
-+static int bit_threads_start(HEVCRpiContext * const s)
-+{
-+    if (s->bt_started)
-+        return 0;
-+
-+    for (int i = 1; i < RPI_BIT_THREADS; ++i)
-+    {
-+        // lc[0] belongs to the main thread - this sets up lc[1..RPI_BIT_THREADS]
-+        if (s->HEVClcList[i] == NULL) {
-+            if ((s->HEVClcList[i] = av_mallocz(sizeof(*s->HEVClcList[0]))) == NULL)
-+                return -1;
-+        }
-+
-+        bt_lc_init(s, s->HEVClcList[i], i);
-+        job_lc_init(s->HEVClcList[i]);
-+    }
-+
-+    // Link the sems in a circle
-+    for (int i = 0; i < RPI_BIT_THREADS - 1; ++i)
-+        s->HEVClcList[i]->bt_psem_out = &s->HEVClcList[i + 1]->bt_sem_in;
-+    s->HEVClcList[RPI_BIT_THREADS - 1]->bt_psem_out = &s->HEVClcList[0]->bt_sem_in;
-+
-+    // Init all lc before starting any threads
-+    for (int i = 0; i < RPI_EXTRA_BIT_THREADS; ++i)
-+    {
-+        if (pthread_create(s->bit_threads + i, NULL, bit_thread, s->HEVClcList[i + 1]) < 0)
-+            return -1;
-+    }
-+
-+    s->bt_started = 1;
-+    return 0;
-+}
-+
-+static int bit_threads_kill(HEVCRpiContext * const s)
-+{
-+    if (!s->bt_started)
-+        return 0;
-+    s->bt_started = 0;
-+
-+    for (int i = 0; i < RPI_EXTRA_BIT_THREADS; ++i)
-+    {
-+        HEVCRpiLocalContext *const lc = s->HEVClcList[i + 1];
-+        if (lc == NULL)
-+            break;
-+
-+        lc->bt_terminate = 1;
-+        sem_post(&lc->bt_sem_in);
-+        pthread_join(s->bit_threads[i], NULL);
-+
-+        sem_destroy(&lc->bt_sem_in);
-+        job_lc_kill(lc);
-+    }
-+    return 0;
-+}
-+#endif
-+
-+
-+// If we are at EoT and the row is shorter than the number of jobs
-+// we can Q we have to wait for it finish otherwise we risk cache/QPU
-+// disasters
-+static inline int tile_needs_wait(const HEVCRpiContext * const s, const int n)
-+{
-+    return
-+        s->ps.pps->tile_wpp_inter_disable >= 2 &&
-+        s->sh.slice_type != HEVC_SLICE_I &&
-+        n >= 0 &&
-+        (s->ps.pps->ctb_ts_flags[n] & (CTB_TS_FLAGS_EOT | CTB_TS_FLAGS_EOL)) == CTB_TS_FLAGS_EOT;
-+}
-+
-+static int rpi_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+{
-+    HEVCRpiContext * const s  = avctxt->priv_data;
-+    HEVCRpiLocalContext * const lc = s->HEVClc;
-+    int err;
-+
-+    // Start of slice
-+    if ((err = slice_start(s, lc)) != 0)
-+        return err;
-+
-+#if RPI_EXTRA_BIT_THREADS > 0
-+
-+    if (s->sh.offload_tiles)
-+    {
-+        unsigned int slice_row = 0;
-+
-+#if TRACE_WPP
-+        printf("%s: Do Tiles\n", __func__);
-+#endif
-+        // Generate & start extra bit threads if they aren't already running
-+        bit_threads_start(s);
-+
-+        do
-+        {
-+            // Reset lc lines etc.
-+            tile_one_row_setup_lcs(s, slice_row);
-+
-+#if TRACE_WPP
-+            printf("%s: Row %d: Do 1st: line=%d/%d/%d\n",
-+                   __func__, slice_row, lc->bt_line_no, lc->bt_last_line, s->sh.num_entry_point_offsets);
-+#endif
-+
-+            rpi_run_one_line(s, lc, 1);  // Kicks off the other threads
-+#if TRACE_WPP
-+            printf("%s: Row %d: Done 1st: line=%d/%d/%d\n",
-+                   __func__, slice_row, lc->bt_line_no, lc->bt_last_line, s->sh.num_entry_point_offsets);
-+#endif
-+
-+            while (lc->bt_line_no <= lc->bt_last_line) {
-+                rpi_sem_wait(&lc->bt_sem_in);
-+                rpi_run_one_line(s, lc, 0);
-+            }
-+#if TRACE_WPP
-+            printf("%s: Done body\n", __func__);
-+#endif
-+
-+            // Wait for everything else to finish
-+            rpi_sem_wait(&lc->bt_sem_in);
-+
-+            ++slice_row;
-+        } while (lc->bt_last_line < s->sh.num_entry_point_offsets);
-+
-+
-+#if TRACE_WPP
-+        printf("%s: Done wait: ts=%d\n", __func__, lc->ts);
-+#endif
-+    }
-+    else if (s->sh.offload_wpp)
-+    {
-+#if TRACE_WPP
-+        printf("%s: Do WPP\n", __func__);
-+#endif
-+        // Generate & start extra bit threads if they aren't already running
-+        bit_threads_start(s);
-+
-+        // Reset lc lines etc.
-+        wpp_setup_lcs(s);
-+
-+        rpi_run_one_line(s, lc, 1);  // Kicks off the other threads
-+#if TRACE_WPP
-+        printf("%s: Done 1st\n", __func__);
-+#endif
-+
-+        while (lc->bt_line_no <= s->sh.num_entry_point_offsets) {
-+            rpi_sem_wait(&lc->bt_sem_in);
-+            rpi_run_one_line(s, lc, 0);
-+        }
-+#if TRACE_WPP
-+        printf("%s: Done body\n", __func__);
-+#endif
-+
-+        // Wait for everything else to finish
-+        rpi_sem_wait(&lc->bt_sem_in);
-+
-+#if TRACE_WPP
-+        printf("%s: Done wait: ts=%d\n", __func__, lc->ts);
-+#endif
-+    }
-+    else
-+#endif
-+    {
-+#if TRACE_WPP
-+        printf("%s: Single start: ts=%d\n", __func__, lc->ts);
-+#endif
-+        // Single bit thread
-+        do {
-+            // Make sure we have space to prepare the next job
-+            worker_pass0_ready(s, lc);
-+
-+            if ((err = fill_job(s, lc, 0)) < 0)
-+                goto fail;
-+
-+            worker_submit_job(s, lc);
-+
-+            if (tile_needs_wait(s, lc->ts - 1))
-+                worker_wait(s, lc);
-+
-+        } while (!lc->unit_done);
-+
-+#if TRACE_WPP
-+        printf("%s: Single end: ts=%d\n", __func__, lc->ts);
-+#endif
-+    }
-+
-+    // If we have reached the end of the frame or
-+    // then wait for the worker to finish all its jobs
-+    if (lc->ts >= s->ps.sps->ctb_size)
-+        worker_wait(s, lc);
-+
-+#if RPI_TSTATS
-+    {
-+        HEVCRpiStats *const ts = &s->tstats;
-+
-+        printf("=== P: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d w8gl:%5d/%5d y8m:%d\n    B: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d\n",
-+               ts->y_pred1_xy, ts->y_pred1_x0, ts->y_pred1_y0, ts->y_pred1_x0y0,
-+               ts->y_pred1_hgt16, ts->y_pred1_hle16, ts->y_pred1_wgt8, ts->y_pred1_wle8, ts->y_pred1_y8_merge,
-+               ts->y_pred2_xy, ts->y_pred2_x0, ts->y_pred2_y0, ts->y_pred2_x0y0,
-+               ts->y_pred2_hgt16, ts->y_pred2_hle16);
-+        memset(ts, 0, sizeof(*ts));
-+    }
-+#endif
-+
-+    return lc->ts;
-+
-+fail:
-+    // Cleanup
-+    av_log(s->avctx, AV_LOG_ERROR, "%s failed: err=%d\n", __func__, err);
-+    // Free our job & wait for temination
-+    worker_free(s, lc);
-+    worker_wait(s, lc);
-+    return err;
-+}
-+
-+
-+static void set_no_backward_pred(HEVCRpiContext * const s)
-+{
-+    int i, j;
-+    const RefPicList *const refPicList = s->refPicList;
-+
-+    s->no_backward_pred_flag = 0;
-+    if (s->sh.slice_type != HEVC_SLICE_B || !s->sh.slice_temporal_mvp_enabled_flag)
-+        return;
-+
-+    for (j = 0; j < 2; j++) {
-+        for (i = 0; i < refPicList[j].nb_refs; i++) {
-+            if (refPicList[j].list[i] > s->poc) {
-+                s->no_backward_pred_flag = 1;
-+                return;
-+            }
-+        }
-+    }
-+}
-+
-+static int hls_slice_data(HEVCRpiContext * const s, const H2645NAL * const nal)
-+{
-+    int err;
-+    if ((err = gen_entry_points(s, nal)) < 0)
-+        return err;
-+
-+    set_no_backward_pred(s);
-+
-+    return rpi_decode_entry(s->avctx, NULL);
-+}
-+
-+static int set_side_data(HEVCRpiContext *s)
-+{
-+    AVFrame *out = s->ref->frame;
-+
-+    if (s->sei.frame_packing.present &&
-+        s->sei.frame_packing.arrangement_type >= 3 &&
-+        s->sei.frame_packing.arrangement_type <= 5 &&
-+        s->sei.frame_packing.content_interpretation_type > 0 &&
-+        s->sei.frame_packing.content_interpretation_type < 3) {
-+        AVStereo3D *stereo = av_stereo3d_create_side_data(out);
-+        if (!stereo)
-+            return AVERROR(ENOMEM);
-+
-+        switch (s->sei.frame_packing.arrangement_type) {
-+        case 3:
-+            if (s->sei.frame_packing.quincunx_subsampling)
-+                stereo->type = AV_STEREO3D_SIDEBYSIDE_QUINCUNX;
-+            else
-+                stereo->type = AV_STEREO3D_SIDEBYSIDE;
-+            break;
-+        case 4:
-+            stereo->type = AV_STEREO3D_TOPBOTTOM;
-+            break;
-+        case 5:
-+            stereo->type = AV_STEREO3D_FRAMESEQUENCE;
-+            break;
-+        }
-+
-+        if (s->sei.frame_packing.content_interpretation_type == 2)
-+            stereo->flags = AV_STEREO3D_FLAG_INVERT;
-+
-+        if (s->sei.frame_packing.arrangement_type == 5) {
-+            if (s->sei.frame_packing.current_frame_is_frame0_flag)
-+                stereo->view = AV_STEREO3D_VIEW_LEFT;
-+            else
-+                stereo->view = AV_STEREO3D_VIEW_RIGHT;
-+        }
-+    }
-+
-+    if (s->sei.display_orientation.present &&
-+        (s->sei.display_orientation.anticlockwise_rotation ||
-+         s->sei.display_orientation.hflip || s->sei.display_orientation.vflip)) {
-+        double angle = s->sei.display_orientation.anticlockwise_rotation * 360 / (double) (1 << 16);
-+        AVFrameSideData *rotation = av_frame_new_side_data(out,
-+                                                           AV_FRAME_DATA_DISPLAYMATRIX,
-+                                                           sizeof(int32_t) * 9);
-+        if (!rotation)
-+            return AVERROR(ENOMEM);
-+
-+        av_display_rotation_set((int32_t *)rotation->data, angle);
-+        av_display_matrix_flip((int32_t *)rotation->data,
-+                               s->sei.display_orientation.hflip,
-+                               s->sei.display_orientation.vflip);
-+    }
-+
-+    // Decrement the mastering display flag when IRAP frame has no_rasl_output_flag=1
-+    // so the side data persists for the entire coded video sequence.
-+    if (s->sei.mastering_display.present > 0 &&
-+        IS_IRAP(s) && s->no_rasl_output_flag) {
-+        s->sei.mastering_display.present--;
-+    }
-+    if (s->sei.mastering_display.present) {
-+        // HEVC uses a g,b,r ordering, which we convert to a more natural r,g,b
-+        const int mapping[3] = {2, 0, 1};
-+        const int chroma_den = 50000;
-+        const int luma_den = 10000;
-+        int i;
-+        AVMasteringDisplayMetadata *metadata =
-+            av_mastering_display_metadata_create_side_data(out);
-+        if (!metadata)
-+            return AVERROR(ENOMEM);
-+
-+        for (i = 0; i < 3; i++) {
-+            const int j = mapping[i];
-+            metadata->display_primaries[i][0].num = s->sei.mastering_display.display_primaries[j][0];
-+            metadata->display_primaries[i][0].den = chroma_den;
-+            metadata->display_primaries[i][1].num = s->sei.mastering_display.display_primaries[j][1];
-+            metadata->display_primaries[i][1].den = chroma_den;
-+        }
-+        metadata->white_point[0].num = s->sei.mastering_display.white_point[0];
-+        metadata->white_point[0].den = chroma_den;
-+        metadata->white_point[1].num = s->sei.mastering_display.white_point[1];
-+        metadata->white_point[1].den = chroma_den;
-+
-+        metadata->max_luminance.num = s->sei.mastering_display.max_luminance;
-+        metadata->max_luminance.den = luma_den;
-+        metadata->min_luminance.num = s->sei.mastering_display.min_luminance;
-+        metadata->min_luminance.den = luma_den;
-+        metadata->has_luminance = 1;
-+        metadata->has_primaries = 1;
-+
-+        av_log(s->avctx, AV_LOG_DEBUG, "Mastering Display Metadata:\n");
-+        av_log(s->avctx, AV_LOG_DEBUG,
-+               "r(%5.4f,%5.4f) g(%5.4f,%5.4f) b(%5.4f %5.4f) wp(%5.4f, %5.4f)\n",
-+               av_q2d(metadata->display_primaries[0][0]),
-+               av_q2d(metadata->display_primaries[0][1]),
-+               av_q2d(metadata->display_primaries[1][0]),
-+               av_q2d(metadata->display_primaries[1][1]),
-+               av_q2d(metadata->display_primaries[2][0]),
-+               av_q2d(metadata->display_primaries[2][1]),
-+               av_q2d(metadata->white_point[0]), av_q2d(metadata->white_point[1]));
-+        av_log(s->avctx, AV_LOG_DEBUG,
-+               "min_luminance=%f, max_luminance=%f\n",
-+               av_q2d(metadata->min_luminance), av_q2d(metadata->max_luminance));
-+    }
-+    // Decrement the mastering display flag when IRAP frame has no_rasl_output_flag=1
-+    // so the side data persists for the entire coded video sequence.
-+    if (s->sei.content_light.present > 0 &&
-+        IS_IRAP(s) && s->no_rasl_output_flag) {
-+        s->sei.content_light.present--;
-+    }
-+    if (s->sei.content_light.present) {
-+        AVContentLightMetadata *metadata =
-+            av_content_light_metadata_create_side_data(out);
-+        if (!metadata)
-+            return AVERROR(ENOMEM);
-+        metadata->MaxCLL  = s->sei.content_light.max_content_light_level;
-+        metadata->MaxFALL = s->sei.content_light.max_pic_average_light_level;
-+
-+        av_log(s->avctx, AV_LOG_DEBUG, "Content Light Level Metadata:\n");
-+        av_log(s->avctx, AV_LOG_DEBUG, "MaxCLL=%d, MaxFALL=%d\n",
-+               metadata->MaxCLL, metadata->MaxFALL);
-+    }
-+
-+    if (s->sei.a53_caption.a53_caption) {
-+        AVFrameSideData* sd = av_frame_new_side_data(out,
-+                                                     AV_FRAME_DATA_A53_CC,
-+                                                     s->sei.a53_caption.a53_caption_size);
-+        if (sd)
-+            memcpy(sd->data, s->sei.a53_caption.a53_caption, s->sei.a53_caption.a53_caption_size);
-+        av_freep(&s->sei.a53_caption.a53_caption);
-+        s->sei.a53_caption.a53_caption_size = 0;
-+        s->avctx->properties |= FF_CODEC_PROPERTY_CLOSED_CAPTIONS;
-+    }
-+
-+    if (s->sei.alternative_transfer.present &&
-+        av_color_transfer_name(s->sei.alternative_transfer.preferred_transfer_characteristics) &&
-+        s->sei.alternative_transfer.preferred_transfer_characteristics != AVCOL_TRC_UNSPECIFIED) {
-+        s->avctx->color_trc = out->color_trc = s->sei.alternative_transfer.preferred_transfer_characteristics;
-+    }
-+
-+    return 0;
-+}
-+
-+static int hevc_frame_start(HEVCRpiContext * const s)
-+{
-+    int ret;
-+
-+    memset(s->bs_horizontal, 0, s->bs_size * 2);  // Does V too
-+    memset(s->is_pcm,        0, s->ps.sps->pcm_width * s->ps.sps->pcm_height);
-+    memset(s->tab_slice_address, -1, s->ps.sps->ctb_size * sizeof(*s->tab_slice_address));
-+
-+    // Only need to remember intra for CIP
-+    if (!s->ps.pps->constrained_intra_pred_flag || s->is_irap)
-+        s->is_intra = NULL;
-+    else
-+    {
-+        s->is_intra = s->is_intra_store;
-+        memset(s->is_intra, 0, s->ps.sps->pcm_width * s->ps.sps->pcm_height);
-+    }
-+
-+    s->is_decoded        = 0;
-+    s->first_nal_type    = s->nal_unit_type;
-+
-+    s->no_rasl_output_flag = IS_IDR(s) || IS_BLA(s) || (s->nal_unit_type == HEVC_NAL_CRA_NUT && s->last_eos);
-+
-+    if (s->pkt.nb_nals > s->rpl_tab_size)
-+    {
-+        // In most cases it will be faster to free & realloc as that doesn't
-+        // require (an unwanted) copy
-+        av_freep(&s->rpl_tab);
-+        s->rpl_tab_size = 0;
-+        if ((s->rpl_tab = av_malloc(s->pkt.nb_nals * sizeof(*s->rpl_tab))) == NULL)
-+            goto fail;
-+        s->rpl_tab_size = s->pkt.nb_nals;
-+    }
-+    memset(s->rpl_tab, 0, s->pkt.nb_nals * sizeof(*s->rpl_tab));
-+
-+    ret = ff_hevc_rpi_set_new_ref(s, &s->frame, s->poc);
-+    if (ret < 0)
-+        goto fail;
-+
-+    // Resize rpl_tab to max that we might want
-+    ret = ff_hevc_rpi_frame_rps(s);
-+    if (ret < 0) {
-+        av_log(s->avctx, AV_LOG_ERROR, "Error constructing the frame RPS.\n");
-+        goto fail;
-+    }
-+
-+    s->ref->frame->key_frame = IS_IRAP(s);
-+
-+    ret = set_side_data(s);
-+    if (ret < 0)
-+        goto fail;
-+
-+    s->frame->pict_type = 3 - s->sh.slice_type;
-+
-+    if (!IS_IRAP(s))
-+        ff_hevc_rpi_bump_frame(s);
-+
-+    av_frame_unref(s->output_frame);
-+    ret = ff_hevc_rpi_output_frame(s, s->output_frame, 0);
-+    if (ret < 0)
-+        goto fail;
-+
-+    ff_thread_finish_setup(s->avctx);
-+
-+    return 0;
-+
-+fail:
-+    if (s->ref)
-+        ff_hevc_rpi_unref_frame(s, s->ref, ~0);
-+    s->ref = NULL;
-+    return ret;
-+}
-+
-+static inline int is_non_ref_unit_type(const unsigned int nal_unit_type)
-+{
-+    // From Table 7-1
-+    return (nal_unit_type & ~0xe) == 0;  // True for 0, 2, 4, 6, 8, 10, 12, 14
-+}
-+
-+static int decode_nal_unit(HEVCRpiContext *s, const H2645NAL *nal)
-+{
-+    GetBitContext * const gb    = &s->HEVClc->gb;
-+    int ctb_addr_ts, ret;
-+
-+    *gb              = nal->gb;
-+    s->nal_unit_type = nal->type;
-+    s->temporal_id   = nal->temporal_id;
-+
-+    switch (s->nal_unit_type) {
-+    case HEVC_NAL_VPS:
-+        ret = ff_hevc_rpi_decode_nal_vps(gb, s->avctx, &s->ps);
-+        if (ret < 0)
-+            goto fail;
-+        break;
-+    case HEVC_NAL_SPS:
-+        ret = ff_hevc_rpi_decode_nal_sps(gb, s->avctx, &s->ps,
-+                                     s->apply_defdispwin);
-+        if (ret < 0)
-+            goto fail;
-+        break;
-+    case HEVC_NAL_PPS:
-+        ret = ff_hevc_rpi_decode_nal_pps(gb, s->avctx, &s->ps);
-+        if (ret < 0)
-+            goto fail;
-+        break;
-+    case HEVC_NAL_SEI_PREFIX:
-+    case HEVC_NAL_SEI_SUFFIX:
-+        ret = ff_hevc_rpi_decode_nal_sei(gb, s->avctx, &s->sei, &s->ps, s->nal_unit_type);
-+        if (ret < 0)
-+            goto fail;
-+        break;
-+    case HEVC_NAL_TRAIL_R:
-+    case HEVC_NAL_TRAIL_N:
-+    case HEVC_NAL_TSA_N:
-+    case HEVC_NAL_TSA_R:
-+    case HEVC_NAL_STSA_N:
-+    case HEVC_NAL_STSA_R:
-+    case HEVC_NAL_BLA_W_LP:
-+    case HEVC_NAL_BLA_W_RADL:
-+    case HEVC_NAL_BLA_N_LP:
-+    case HEVC_NAL_IDR_W_RADL:
-+    case HEVC_NAL_IDR_N_LP:
-+    case HEVC_NAL_CRA_NUT:
-+    case HEVC_NAL_RADL_N:
-+    case HEVC_NAL_RADL_R:
-+    case HEVC_NAL_RASL_N:
-+    case HEVC_NAL_RASL_R:
-+        ret = hls_slice_header(s);
-+        if (ret < 0)
-+            return ret;
-+
-+        // The definition of _N unit types is "non-reference for other frames
-+        // with the same temporal_id" so they may/will be ref frames for pics
-+        // with a higher temporal_id.
-+        s->used_for_ref = s->ps.sps->max_sub_layers > s->temporal_id + 1 ||
-+            !is_non_ref_unit_type(s->nal_unit_type);
-+        s->offload_recon = s->threads_type != 0 && s->used_for_ref;
-+        s->is_irap = IS_IRAP(s);
-+
-+#if DEBUG_DECODE_N
-+        {
-+            static int z = 0;
-+            if (IS_IDR(s)) {
-+                z = 1;
-+            }
-+            if (z != 0 && z++ > DEBUG_DECODE_N) {
-+                s->is_decoded = 0;
-+                break;
-+            }
-+        }
-+#endif
-+        if (
-+            (s->avctx->skip_frame >= AVDISCARD_NONREF && !s->used_for_ref) ||
-+            (s->avctx->skip_frame >= AVDISCARD_BIDIR && s->sh.slice_type == HEVC_SLICE_B) ||
-+            (s->avctx->skip_frame >= AVDISCARD_NONINTRA && s->sh.slice_type != HEVC_SLICE_I) ||
-+            (s->avctx->skip_frame >= AVDISCARD_NONKEY && !IS_IRAP(s)))
-+        {
-+            s->is_decoded = 0;
-+            break;
-+        }
-+
-+        if (s->sh.first_slice_in_pic_flag) {
-+            if (s->max_ra == INT_MAX) {
-+                if (s->nal_unit_type == HEVC_NAL_CRA_NUT || IS_BLA(s)) {
-+                    s->max_ra = s->poc;
-+                } else {
-+                    if (IS_IDR(s))
-+                        s->max_ra = INT_MIN;
-+                }
-+            }
-+
-+            if ((s->nal_unit_type == HEVC_NAL_RASL_R || s->nal_unit_type == HEVC_NAL_RASL_N) &&
-+                s->poc <= s->max_ra) {
-+                s->is_decoded = 0;
-+                break;
-+            } else {
-+                if (s->nal_unit_type == HEVC_NAL_RASL_R && s->poc > s->max_ra)
-+                    s->max_ra = INT_MIN;
-+            }
-+
-+            ret = hevc_frame_start(s);
-+            if (ret < 0)
-+                return ret;
-+        } else if (!s->ref) {
-+            av_log(s->avctx, AV_LOG_ERROR, "First slice in a frame missing.\n");
-+            goto fail;
-+        }
-+
-+        if (s->nal_unit_type != s->first_nal_type) {
-+            av_log(s->avctx, AV_LOG_ERROR,
-+                   "Non-matching NAL types of the VCL NALUs: %d %d\n",
-+                   s->first_nal_type, s->nal_unit_type);
-+            return AVERROR_INVALIDDATA;
-+        }
-+
-+        if (!s->sh.dependent_slice_segment_flag &&
-+            s->sh.slice_type != HEVC_SLICE_I) {
-+            ret = ff_hevc_rpi_slice_rpl(s);
-+            if (ret < 0) {
-+                av_log(s->avctx, AV_LOG_WARNING,
-+                       "Error constructing the reference lists for the current slice.\n");
-+                goto fail;
-+            }
-+        }
-+
-+        ctb_addr_ts = hls_slice_data(s, nal);
-+        if (ctb_addr_ts >= s->ps.sps->ctb_size) {
-+            s->is_decoded = 1;
-+        }
-+
-+        if (ctb_addr_ts < 0) {
-+            ret = ctb_addr_ts;
-+            goto fail;
-+        }
-+        break;
-+    case HEVC_NAL_EOS_NUT:
-+    case HEVC_NAL_EOB_NUT:
-+        s->seq_decode = (s->seq_decode + 1) & 0xff;
-+        s->max_ra     = INT_MAX;
-+        break;
-+    case HEVC_NAL_AUD:
-+    case HEVC_NAL_FD_NUT:
-+        break;
-+    default:
-+        av_log(s->avctx, AV_LOG_INFO,
-+               "Skipping NAL unit %d\n", s->nal_unit_type);
-+    }
-+
-+    return 0;
-+fail:
-+    if (s->avctx->err_recognition & AV_EF_EXPLODE)
-+        return ret;
-+    return 0;
-+}
-+
-+static int decode_nal_units(HEVCRpiContext *s, const uint8_t *buf, int length)
-+{
-+    int i, ret = 0;
-+    int eos_at_start = 1;
-+
-+    s->ref = NULL;
-+    s->last_eos = s->eos;
-+    s->eos = 0;
-+
-+    /* split the input packet into NAL units, so we know the upper bound on the
-+     * number of slices in the frame */
-+    ret = ff_h2645_packet_split(&s->pkt, buf, length, s->avctx, s->is_nalff,
-+                                s->nal_length_size, s->avctx->codec_id, 0, 0);
-+    if (ret < 0) {
-+        av_log(s->avctx, AV_LOG_ERROR,
-+               "Error splitting the input into NAL units.\n");
-+        return ret;
-+    }
-+
-+    for (i = 0; i < s->pkt.nb_nals; i++) {
-+        if (s->pkt.nals[i].type == HEVC_NAL_EOB_NUT ||
-+            s->pkt.nals[i].type == HEVC_NAL_EOS_NUT) {
-+            if (eos_at_start) {
-+                s->last_eos = 1;
-+            } else {
-+                s->eos = 1;
-+            }
-+        } else {
-+            eos_at_start = 0;
-+        }
-+    }
-+
-+    /* decode the NAL units */
-+    for (i = 0; i < s->pkt.nb_nals; i++) {
-+        ret = decode_nal_unit(s, &s->pkt.nals[i]);
-+        if (ret < 0) {
-+            av_log(s->avctx, AV_LOG_WARNING,
-+                   "Error parsing NAL unit #%d.\n", i);
-+            goto fail;
-+        }
-+    }
-+
-+fail:  // Also success path
-+    if (s->ref != NULL) {
-+        if (s->used_for_ref && s->threads_type != 0) {
-+            ff_hevc_rpi_progress_signal_all_done(s);
-+        }
-+        else {
-+            // Flush frame to real memory as we expect to be able to pass
-+            // it straight on to mmal
-+            flush_frame(s, s->frame);
-+        }
-+    }
-+    return ret;
-+}
-+
-+static void print_md5(void *log_ctx, int level, uint8_t md5[16])
-+{
-+    int i;
-+    for (i = 0; i < 16; i++)
-+        av_log(log_ctx, level, "%02"PRIx8, md5[i]);
-+}
-+
-+static int verify_md5(HEVCRpiContext *s, AVFrame *frame)
-+{
-+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format);
-+    int pixel_shift;
-+    int i, j;
-+
-+    if (!desc)
-+        return AVERROR(EINVAL);
-+
-+    pixel_shift = desc->comp[0].depth > 8;
-+
-+    av_log(s->avctx, AV_LOG_DEBUG, "Verifying checksum for frame with POC %d: ",
-+           s->poc);
-+
-+    /* the checksums are LE, so we have to byteswap for >8bpp formats
-+     * on BE arches */
-+#if HAVE_BIGENDIAN
-+    if (pixel_shift && !s->checksum_buf) {
-+        av_fast_malloc(&s->checksum_buf, &s->checksum_buf_size,
-+                       FFMAX3(frame->linesize[0], frame->linesize[1],
-+                              frame->linesize[2]));
-+        if (!s->checksum_buf)
-+            return AVERROR(ENOMEM);
-+    }
-+#endif
-+
-+    for (i = 0; frame->data[i]; i++) {
-+        int width  = s->avctx->coded_width;
-+        int height = s->avctx->coded_height;
-+        int w = (i == 1 || i == 2) ? (width  >> desc->log2_chroma_w) : width;
-+        int h = (i == 1 || i == 2) ? (height >> desc->log2_chroma_h) : height;
-+        uint8_t md5[16];
-+
-+        av_md5_init(s->md5_ctx);
-+        for (j = 0; j < h; j++) {
-+            const uint8_t *src = frame->data[i] + j * frame_stride1(frame, 1);
-+#if HAVE_BIGENDIAN
-+            if (pixel_shift) {
-+                s->bdsp.bswap16_buf((uint16_t *) s->checksum_buf,
-+                                    (const uint16_t *) src, w);
-+                src = s->checksum_buf;
-+            }
-+#endif
-+            av_md5_update(s->md5_ctx, src, w << pixel_shift);
-+        }
-+        av_md5_final(s->md5_ctx, md5);
-+
-+        if (!memcmp(md5, s->sei.picture_hash.md5[i], 16)) {
-+            av_log   (s->avctx, AV_LOG_DEBUG, "plane %d - correct ", i);
-+            print_md5(s->avctx, AV_LOG_DEBUG, md5);
-+            av_log   (s->avctx, AV_LOG_DEBUG, "; ");
-+        } else {
-+            av_log   (s->avctx, AV_LOG_ERROR, "mismatching checksum of plane %d - ", i);
-+            print_md5(s->avctx, AV_LOG_ERROR, md5);
-+            av_log   (s->avctx, AV_LOG_ERROR, " != ");
-+            print_md5(s->avctx, AV_LOG_ERROR, s->sei.picture_hash.md5[i]);
-+            av_log   (s->avctx, AV_LOG_ERROR, "\n");
-+            return AVERROR_INVALIDDATA;
-+        }
-+    }
-+
-+    av_log(s->avctx, AV_LOG_DEBUG, "\n");
-+
-+    return 0;
-+}
-+
-+static int all_sps_supported(const HEVCRpiContext * const s)
-+{
-+    for (unsigned int i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++) {
-+        if (s->ps.sps_list[i] != NULL)
-+        {
-+            const HEVCRpiSPS * const sps = (const HEVCRpiSPS*)s->ps.sps_list[i]->data;
-+            if (!is_sps_supported(sps))
-+                return 0;
-+        }
-+    }
-+    return 1;
-+}
-+
-+static int hevc_rpi_decode_extradata(HEVCRpiContext *s, uint8_t *buf, int length, int first)
-+{
-+    int ret, i;
-+
-+    ret = ff_hevc_rpi_decode_extradata(buf, length, &s->ps, &s->sei, &s->is_nalff,
-+                                   &s->nal_length_size, s->avctx->err_recognition,
-+                                   s->apply_defdispwin, s->avctx);
-+    if (ret < 0)
-+        return ret;
-+
-+    /* export stream parameters from the first SPS */
-+    for (i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++) {
-+        if (first && s->ps.sps_list[i]) {
-+            const HEVCRpiSPS *sps = (const HEVCRpiSPS*)s->ps.sps_list[i]->data;
-+            export_stream_params(s->avctx, &s->ps, sps);
-+            break;
-+        }
-+    }
-+
-+    return 0;
-+}
-+
-+static int hevc_rpi_decode_frame(AVCodecContext *avctx, void *data, int *got_output,
-+                             AVPacket *avpkt)
-+{
-+    int ret;
-+    int new_extradata_size;
-+    uint8_t *new_extradata;
-+    HEVCRpiContext *s = avctx->priv_data;
-+
-+    if (!avpkt->size) {
-+        ret = ff_hevc_rpi_output_frame(s, data, 1);
-+        if (ret < 0)
-+            return ret;
-+
-+        *got_output = ret;
-+        return 0;
-+    }
-+
-+    new_extradata = av_packet_get_side_data(avpkt, AV_PKT_DATA_NEW_EXTRADATA,
-+                                            &new_extradata_size);
-+    if (new_extradata && new_extradata_size > 0) {
-+        ret = hevc_rpi_decode_extradata(s, new_extradata, new_extradata_size, 0);
-+        if (ret < 0)
-+            return ret;
-+    }
-+
-+    s->ref = NULL;
-+    ret    = decode_nal_units(s, avpkt->data, avpkt->size);
-+    if (ret < 0)
-+        return ret;
-+
-+    /* verify the SEI checksum */
-+    if (avctx->err_recognition & AV_EF_CRCCHECK && s->is_decoded &&
-+        s->sei.picture_hash.is_md5) {
-+        ret = verify_md5(s, s->ref->frame);
-+        if (ret < 0 && avctx->err_recognition & AV_EF_EXPLODE) {
-+            ff_hevc_rpi_unref_frame(s, s->ref, ~0);
-+            return ret;
-+        }
-+    }
-+    s->sei.picture_hash.is_md5 = 0;
-+
-+    if (s->is_decoded) {
-+        av_log(avctx, AV_LOG_DEBUG, "Decoded frame with POC %d.\n", s->poc);
-+        s->is_decoded = 0;
-+    }
-+
-+    if (s->output_frame->buf[0]) {
-+        av_frame_move_ref(data, s->output_frame);
-+        *got_output = 1;
-+    }
-+
-+    return avpkt->size;
-+}
-+
-+static int hevc_ref_frame(HEVCRpiContext *s, HEVCRpiFrame *dst, HEVCRpiFrame *src)
-+{
-+    int ret;
-+
-+    ret = ff_thread_ref_frame(&dst->tf, &src->tf);
-+    if (ret < 0)
-+        return ret;
-+
-+    if (src->col_mvf_buf != NULL)
-+    {
-+        dst->col_mvf_buf = av_buffer_ref(src->col_mvf_buf);
-+        if (!dst->col_mvf_buf)
-+            goto fail;
-+    }
-+    dst->col_mvf = src->col_mvf;
-+
-+    dst->poc        = src->poc;
-+    dst->flags      = src->flags;
-+    dst->sequence   = src->sequence;
-+    return 0;
-+
-+fail:
-+    ff_hevc_rpi_unref_frame(s, dst, ~0);
-+    return AVERROR(ENOMEM);
-+}
-+
-+
-+static av_cold int hevc_decode_free(AVCodecContext *avctx)
-+{
-+    HEVCRpiContext * const s = avctx->priv_data;
-+    int i;
-+
-+    pic_arrays_free(s);
-+
-+    av_freep(&s->md5_ctx);
-+
-+    av_freep(&s->cabac_save);
-+
-+#if RPI_EXTRA_BIT_THREADS
-+    bit_threads_kill(s);
-+#endif
-+
-+    hevc_exit_worker(s);
-+    for (i = 0; i != 2; ++i) {
-+        ff_hevc_rpi_progress_kill_state(s->progress_states + i);
-+    }
-+    job_lc_kill(s->HEVClc);
-+
-+    av_freep(&s->sao_pixel_buffer_h[0]);  // [1] & [2] allocated with [0]
-+    av_freep(&s->sao_pixel_buffer_v[0]);
-+    av_frame_free(&s->output_frame);
-+
-+    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
-+        ff_hevc_rpi_unref_frame(s, &s->DPB[i], ~0);
-+        av_frame_free(&s->DPB[i].frame);
-+    }
-+
-+    for (i = 0; i < FF_ARRAY_ELEMS(s->ps.vps_list); i++)
-+        av_buffer_unref(&s->ps.vps_list[i]);
-+    for (i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++)
-+        av_buffer_unref(&s->ps.sps_list[i]);
-+    for (i = 0; i < FF_ARRAY_ELEMS(s->ps.pps_list); i++)
-+        av_buffer_unref(&s->ps.pps_list[i]);
-+    s->ps.sps = NULL;
-+    s->ps.pps = NULL;
-+    s->ps.vps = NULL;
-+
-+    // Free separately from sLists as used that way by RPI WPP
-+    for (i = 0; i < MAX_NB_THREADS && s->HEVClcList[i] != NULL; ++i) {
-+        av_freep(s->HEVClcList + i);
-+    }
-+    s->HEVClc = NULL;  // Allocated as part of HEVClcList
-+
-+    ff_h2645_packet_uninit(&s->pkt);
-+
-+    if (s->qpu_init_ok)
-+        vpu_qpu_term();
-+    s->qpu_init_ok = 0;
-+
-+    return 0;
-+}
-+
-+
-+static av_cold int hevc_init_context(AVCodecContext *avctx)
-+{
-+    HEVCRpiContext *s = avctx->priv_data;
-+    int i;
-+
-+    s->avctx = avctx;
-+
-+    s->HEVClc = av_mallocz(sizeof(HEVCRpiLocalContext));
-+    if (!s->HEVClc)
-+        goto fail;
-+    s->HEVClcList[0] = s->HEVClc;
-+
-+    if (vpu_qpu_init() != 0)
-+        goto fail;
-+    s->qpu_init_ok = 1;
-+
-+#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C
-+    {
-+        static const uint32_t dframe[1] = {0x80808080};
-+        s->qpu_dummy_frame_emu = (const uint8_t *)dframe;
-+    }
-+#endif
-+#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C
-+    s->qpu_dummy_frame_qpu = qpu_dummy();
-+#endif
-+
-+    bt_lc_init(s, s->HEVClc, 0);
-+    job_lc_init(s->HEVClc);
-+
-+    for (i = 0; i != 2; ++i) {
-+        ff_hevc_rpi_progress_init_state(s->progress_states + i);
-+    }
-+
-+    if ((s->cabac_save = av_malloc(sizeof(*s->cabac_save))) == NULL)
-+        goto fail;
-+
-+     if ((s->output_frame = av_frame_alloc()) == NULL)
-+        goto fail;
-+
-+    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
-+        s->DPB[i].frame = av_frame_alloc();
-+        if (!s->DPB[i].frame)
-+            goto fail;
-+        s->DPB[i].tf.f = s->DPB[i].frame;
-+        s->DPB[i].dpb_no = i;
-+    }
-+
-+    s->max_ra = INT_MAX;
-+
-+    if ((s->md5_ctx = av_md5_alloc()) == NULL)
-+        goto fail;
-+
-+    s->context_initialized = 1;
-+    s->eos = 0;
-+
-+    ff_hevc_rpi_reset_sei(&s->sei);
-+
-+    return 0;
-+
-+fail:
-+    av_log(s->avctx, AV_LOG_ERROR, "%s: Failed\n", __func__);
-+    hevc_decode_free(avctx);
-+    return AVERROR(ENOMEM);
-+}
-+
-+#if HAVE_THREADS
-+static int hevc_update_thread_context(AVCodecContext *dst,
-+                                      const AVCodecContext *src)
-+{
-+    HEVCRpiContext *s  = dst->priv_data;
-+    HEVCRpiContext *s0 = src->priv_data;
-+    int i, ret;
-+
-+    av_assert0(s->context_initialized);
-+
-+    // dst == src can happen according to the comments and in that case
-+    // there is nothing to do here
-+    if (dst == src)
-+        return 0;
-+
-+    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
-+        ff_hevc_rpi_unref_frame(s, &s->DPB[i], ~0);
-+        if (s0->DPB[i].frame->buf[0]) {
-+            ret = hevc_ref_frame(s, &s->DPB[i], &s0->DPB[i]);
-+            if (ret < 0)
-+                return ret;
-+        }
-+    }
-+
-+    if (s->ps.sps != s0->ps.sps)
-+        s->ps.sps = NULL;
-+    for (i = 0; i < FF_ARRAY_ELEMS(s->ps.vps_list); i++) {
-+        av_buffer_unref(&s->ps.vps_list[i]);
-+        if (s0->ps.vps_list[i]) {
-+            s->ps.vps_list[i] = av_buffer_ref(s0->ps.vps_list[i]);
-+            if (!s->ps.vps_list[i])
-+                return AVERROR(ENOMEM);
-+        }
-+    }
-+
-+    for (i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++) {
-+        av_buffer_unref(&s->ps.sps_list[i]);
-+        if (s0->ps.sps_list[i]) {
-+            s->ps.sps_list[i] = av_buffer_ref(s0->ps.sps_list[i]);
-+            if (!s->ps.sps_list[i])
-+                return AVERROR(ENOMEM);
-+        }
-+    }
-+
-+    for (i = 0; i < FF_ARRAY_ELEMS(s->ps.pps_list); i++) {
-+        av_buffer_unref(&s->ps.pps_list[i]);
-+        if (s0->ps.pps_list[i]) {
-+            s->ps.pps_list[i] = av_buffer_ref(s0->ps.pps_list[i]);
-+            if (!s->ps.pps_list[i])
-+                return AVERROR(ENOMEM);
-+        }
-+    }
-+
-+    if (s->ps.sps != s0->ps.sps)
-+        if ((ret = set_sps(s, s0->ps.sps, src->pix_fmt)) < 0)
-+            return ret;
-+
-+    s->seq_decode = s0->seq_decode;
-+    s->seq_output = s0->seq_output;
-+    s->pocTid0    = s0->pocTid0;
-+    s->max_ra     = s0->max_ra;
-+    s->eos        = s0->eos;
-+    s->no_rasl_output_flag = s0->no_rasl_output_flag;
-+
-+    s->is_nalff        = s0->is_nalff;
-+    s->nal_length_size = s0->nal_length_size;
-+
-+    s->threads_type        = s0->threads_type;
-+
-+    if (s0->eos) {
-+        s->seq_decode = (s->seq_decode + 1) & 0xff;
-+        s->max_ra = INT_MAX;
-+    }
-+
-+    s->sei.frame_packing        = s0->sei.frame_packing;
-+    s->sei.display_orientation  = s0->sei.display_orientation;
-+    s->sei.mastering_display    = s0->sei.mastering_display;
-+    s->sei.content_light        = s0->sei.content_light;
-+    s->sei.alternative_transfer = s0->sei.alternative_transfer;
-+
-+    // * We do this here as it allows us to easily locate our parents
-+    //   global job pool, but there really should be a less nasty way
-+    if (s->jbc == NULL)
-+    {
-+        av_assert0((s->jbc = rpi_job_ctl_new(s0->jbc->jbg)) != NULL);
-+        hevc_init_worker(s);
-+    }
-+
-+    return 0;
-+}
-+#endif
-+
-+#include <sys/stat.h>
-+static int qpu_ok(void)
-+{
-+    static int is_pi3 = -1;
-+    if (is_pi3 == -1)
-+    {
-+        struct stat sb;
-+        is_pi3 = (stat("/dev/rpivid-intcmem", &sb) != 0);
-+    }
-+    return is_pi3;
-+}
-+
-+static av_cold int hevc_decode_init(AVCodecContext *avctx)
-+{
-+    HEVCRpiContext *s = avctx->priv_data;
-+    int ret;
-+
-+    if (!qpu_ok())
-+        return AVERROR_DECODER_NOT_FOUND;
-+
-+    if ((ret = hevc_init_context(avctx)) < 0)
-+        return ret;
-+
-+    // If we are a child context then stop now
-+    // Everything after this point is either 1st decode setup or global alloc
-+    // that must not be repeated
-+    // Global info will be copied into children in update_thread_context (we
-+    // can't do it here as we have no way of finding the parent context)
-+    if (avctx->internal->is_copy)
-+        return 0;
-+
-+    // Job allocation requires VCSM alloc to work so ensure that we have it
-+    // initialised by this point
-+    {
-+        HEVCRpiJobGlobal * const jbg = jbg_new(FFMAX(avctx->thread_count * 3, 5));
-+        if (jbg == NULL) {
-+            av_log(s->avctx, AV_LOG_ERROR, "%s: Job global init failed\n", __func__);
-+            ret = AVERROR(ENOMEM);
-+            goto fail;
-+        }
-+
-+        if ((s->jbc = rpi_job_ctl_new(jbg)) == NULL) {
-+            av_log(s->avctx, AV_LOG_ERROR, "%s: Job ctl init failed\n", __func__);
-+            ret = AVERROR(ENOMEM);
-+            goto fail;
-+        }
-+    }
-+
-+    hevc_init_worker(s);
-+
-+    s->eos = 1;
-+
-+    if (avctx->extradata_size > 0 && avctx->extradata) {
-+        if ((ret = hevc_rpi_decode_extradata(s, avctx->extradata, avctx->extradata_size, 1)) < 0)
-+            goto fail;
-+
-+        if (!all_sps_supported(s)) {
-+            ret = AVERROR_DECODER_NOT_FOUND;
-+            goto fail;
-+        }
-+    }
-+
-+    if((avctx->active_thread_type & FF_THREAD_FRAME) && avctx->thread_count > 1)
-+        s->threads_type = FF_THREAD_FRAME;
-+    else
-+        s->threads_type = 0;
-+
-+    return 0;
-+
-+fail:
-+    hevc_decode_free(avctx);
-+    return ret;
-+}
-+
-+static void hevc_decode_flush(AVCodecContext *avctx)
-+{
-+    HEVCRpiContext *s = avctx->priv_data;
-+    ff_hevc_rpi_flush_dpb(s);
-+    s->max_ra = INT_MAX;
-+    s->eos = 1;
-+}
-+
-+typedef struct  hwaccel_rpi3_qpu_env_s {
-+    const AVClass *av_class;
-+    AVZcEnvPtr zc;
-+} hwaccel_rpi3_qpu_env_t;
-+
-+static int hwaccel_alloc_frame(AVCodecContext *s, AVFrame *frame)
-+{
-+    hwaccel_rpi3_qpu_env_t * const r3 = s->internal->hwaccel_priv_data;
-+    int rv;
-+
-+    if (av_rpi_zc_in_use(s))
-+    {
-+        rv = s->get_buffer2(s, frame, 0);
-+    }
-+    else
-+    {
-+        rv = av_rpi_zc_get_buffer(r3->zc, frame);
-+        if (rv == 0)
-+            rv = av_rpi_zc_resolve_frame(frame, ZC_RESOLVE_ALLOC_VALID);  // actually do the alloc
-+    }
-+
-+    if (rv == 0 &&
-+        (rv = ff_attach_decode_data(frame)) < 0)
-+    {
-+        av_frame_unref(frame);
-+    }
-+
-+    return rv;
-+}
-+
-+static int hwaccel_rpi3_qpu_free(AVCodecContext *avctx)
-+{
-+    hwaccel_rpi3_qpu_env_t * const r3 = avctx->internal->hwaccel_priv_data;
-+    av_rpi_zc_int_env_freep(&r3->zc);
-+    return 0;
-+}
-+
-+static int hwaccel_rpi3_qpu_init(AVCodecContext *avctx)
-+{
-+    hwaccel_rpi3_qpu_env_t * const r3 = avctx->internal->hwaccel_priv_data;
-+
-+    if ((r3->zc = av_rpi_zc_int_env_alloc(avctx)) == NULL)
-+        goto fail;
-+
-+    return 0;
-+
-+fail:
-+    av_log(avctx, AV_LOG_ERROR, "Rpi3 QPU init failed\n");
-+    hwaccel_rpi3_qpu_free(avctx);
-+    return AVERROR(ENOMEM);
-+}
-+
-+
-+#define OFFSET(x) offsetof(HEVCRpiContext, x)
-+#define PAR (AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
-+
-+
-+// Private options of the decoder (referenced by hevc_rpi_decoder_class).
-+// Both entries are booleans (default 0) stored in the same field,
-+// apply_defdispwin, so "strict-displaywin" is a second name for
-+// "apply_defdispwin".
-+static const AVOption options[] = {
-+    { "apply_defdispwin", "Apply default display window from VUI", OFFSET(apply_defdispwin),
-+        AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, PAR },
-+    { "strict-displaywin", "strictly apply default display window size", OFFSET(apply_defdispwin),
-+        AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, PAR },
-+    { NULL },
-+};
-+
-+// AVClass for the decoder's private context; exposes the options[] table.
-+static const AVClass hevc_rpi_decoder_class = {
-+    .class_name = "HEVC RPI decoder",
-+    .item_name  = av_default_item_name,
-+    .option     = options,
-+    .version    = LIBAVUTIL_VERSION_INT,
-+};
-+
-+// Pixel formats advertised by the decoder (.pix_fmts);
-+// the list is terminated by AV_PIX_FMT_NONE.
-+static const enum AVPixelFormat hevc_rpi_pix_fmts[] = {
-+    AV_PIX_FMT_SAND128,
-+    AV_PIX_FMT_SAND64_10,
-+    AV_PIX_FMT_NONE
-+};
-+
-+
-+// Hwaccel descriptor shared by both hw configs below.
-+// Hooks up hwaccel_alloc_frame() as the frame allocator and
-+// hwaccel_rpi3_qpu_init()/hwaccel_rpi3_qpu_free() as init/uninit;
-+// its private data is a hwaccel_rpi3_qpu_env_t.
-+static const AVHWAccel hwaccel_rpi3_qpu = {
-+    .name           = "Pi3 QPU Hwaccel",
-+    .alloc_frame    = hwaccel_alloc_frame,
-+    .init           = hwaccel_rpi3_qpu_init,
-+    .uninit         = hwaccel_rpi3_qpu_free,
-+    .priv_data_size = sizeof(hwaccel_rpi3_qpu_env_t),
-+    .caps_internal  = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE,
-+};
-+
-+// HW config for AV_PIX_FMT_SAND128: ad-hoc method, no device type,
-+// served by hwaccel_rpi3_qpu.
-+static const AVCodecHWConfigInternal hevc_rpi_hw_config_sand128 =
-+{
-+    .public = {
-+        .pix_fmt = AV_PIX_FMT_SAND128,
-+        .methods = AV_CODEC_HW_CONFIG_METHOD_AD_HOC,
-+        .device_type = AV_HWDEVICE_TYPE_NONE,
-+    },
-+    .hwaccel = &hwaccel_rpi3_qpu
-+};
-+// HW config for AV_PIX_FMT_SAND64_10: ad-hoc method, no device type,
-+// served by the same hwaccel_rpi3_qpu as the SAND128 config.
-+static const AVCodecHWConfigInternal hevc_rpi_hw_config_sand64_10 =
-+{
-+    .public = {
-+        .pix_fmt = AV_PIX_FMT_SAND64_10,
-+        .methods = AV_CODEC_HW_CONFIG_METHOD_AD_HOC,
-+        .device_type = AV_HWDEVICE_TYPE_NONE,
-+    },
-+    .hwaccel = &hwaccel_rpi3_qpu
-+};
-+
-+
-+// NULL-terminated list of hw configs published through
-+// ff_hevc_rpi_decoder.hw_configs.
-+static const AVCodecHWConfigInternal *hevc_rpi_hw_configs[] = {
-+    &hevc_rpi_hw_config_sand128,
-+    &hevc_rpi_hw_config_sand64_10,
-+    NULL
-+};
-+
-+
-+// Codec descriptor for the "hevc_rpi" decoder (AV_CODEC_ID_HEVC).
-+// Private context is a HEVCRpiContext; output formats are those in
-+// hevc_rpi_pix_fmts and hw configs those in hevc_rpi_hw_configs.
-+//
-+// The last term of .capabilities is chosen by the #if below: the live
-+// (#else) arm adds AV_CODEC_CAP_FRAME_THREADS; switching the #if to 1
-+// replaces it with 0 and emits a compile-time warning.
-+AVCodec ff_hevc_rpi_decoder = {
-+    .name                  = "hevc_rpi",
-+    .long_name             = NULL_IF_CONFIG_SMALL("HEVC (rpi)"),
-+    .type                  = AVMEDIA_TYPE_VIDEO,
-+    .id                    = AV_CODEC_ID_HEVC,
-+    .priv_data_size        = sizeof(HEVCRpiContext),
-+    .priv_class            = &hevc_rpi_decoder_class,
-+    .init                  = hevc_decode_init,
-+    .close                 = hevc_decode_free,
-+    .decode                = hevc_rpi_decode_frame,
-+    .flush                 = hevc_decode_flush,
-+    .update_thread_context = ONLY_IF_THREADS_ENABLED(hevc_update_thread_context),
-+    .capabilities          = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY |
-+                             AV_CODEC_CAP_HARDWARE |
-+                             AV_CODEC_CAP_AVOID_PROBING |
-+#if 0
-+    // Debugging is often easier without threads getting in the way
-+                            0,
-+#warning H265 threading turned off
-+#else
-+    // We only have decent optimisation for frame - so only admit to that
-+                             AV_CODEC_CAP_FRAME_THREADS,
-+#endif
-+    .caps_internal         = FF_CODEC_CAP_INIT_THREADSAFE |
-+                             FF_CODEC_CAP_EXPORTS_CROPPING |
-+                             FF_CODEC_CAP_ALLOCATE_PROGRESS,
-+    .pix_fmts              = hevc_rpi_pix_fmts,
-+    .profiles              = NULL_IF_CONFIG_SMALL(ff_hevc_profiles),
-+    .hw_configs            = hevc_rpi_hw_configs,
-+//    .wrapper_name          = "hevc_rpi",
-+};
-+
---- /dev/null
-+++ b/libavcodec/rpi_hevcdec.h
-@@ -0,0 +1,1091 @@
-+/*
-+ * HEVC video decoder
-+ *
-+ * Copyright (C) 2012 - 2013 Guillaume Martres
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#ifndef AVCODEC_RPI_HEVCDEC_H
-+#define AVCODEC_RPI_HEVCDEC_H
-+
-+#include "config.h"
-+
-+#include <stdatomic.h>
-+
-+#include "libavutil/buffer.h"
-+
-+#include "avcodec.h"
-+#include "bswapdsp.h"
-+#include "cabac.h"
-+#include "get_bits.h"
-+#include "rpi_hevcpred.h"
-+#include "h2645_parse.h"
-+#include "hevc.h"
-+#include "rpi_hevc_mv.h"
-+#include "rpi_hevc_ps.h"
-+#include "rpi_hevc_sei.h"
-+#include "rpi_hevcdsp.h"
-+#include "internal.h"
-+#include "thread.h"
-+#include "videodsp.h"
-+
-+#if ARCH_ARM
-+#include "arm/rpi_hevc_misc_neon.h"
-+#endif
-+
-+#define MAX_NB_THREADS 16
-+#define SHIFT_CTB_WPP 2
-+
-+//TODO: check if this is really the maximum
-+#define MAX_TRANSFORM_DEPTH 5
-+
-+#define MAX_TB_SIZE 32
-+#define MAX_QP 51
-+#define DEFAULT_INTRA_TC_OFFSET 2
-+
-+#define HEVC_CONTEXTS 199
-+
-+#define MRG_MAX_NUM_CANDS     5
-+
-+#define HEVC_MAX_CTB_SIZE (1 << HEVC_MAX_LOG2_CTB_SIZE)  // 64
-+
-+// Size of DPB array
-+#define HEVC_DPB_ELS            32
-+
-+#define L0 0
-+#define L1 1
-+
-+#define EPEL_EXTRA_BEFORE 1
-+#define EPEL_EXTRA_AFTER  2
-+#define EPEL_EXTRA        3
-+#define QPEL_EXTRA_BEFORE 3
-+#define QPEL_EXTRA_AFTER  4
-+#define QPEL_EXTRA        7
-+
-+#define EDGE_EMU_BUFFER_STRIDE 80
-+
-+#include <semaphore.h>
-+#include "rpi_qpu.h"
-+
-+// Max jobs per frame thread. Actual usage will be limited by the size
-+// of the global job pool
-+// ?? Limits
-+#define RPI_MAX_JOBS            8
-+
-+// This is the number of _extra_ bit threads - we will have
-+// RPI_EXTRA_BIT_THREADS+1 threads actually doing the processing
-+//
-+// 0 is legitimate and will disable our WPP processing
-+//#define RPI_EXTRA_BIT_THREADS 0
-+#define RPI_EXTRA_BIT_THREADS   2
-+
-+// Number of separate threads/passes in worker
-+// 2 and 3 are the currently valid numbers
-+// At the moment 3 seems fractionally faster
-+//#define RPI_PASSES              2
-+#define RPI_PASSES              3
-+
-+// Print out various usage stats
-+#define RPI_TSTATS              0
-+
-+// Define RPI_COMPRESS_COEFFS to 1 to send coefficients in compressed form
-+#define RPI_COMPRESS_COEFFS     1
-+
-+// Wait for VPU/QPU to finish in worker pass 0
-+// If 0 then the wait is in pass 1
-+//
-+// One might expect the better place to wait would be in pass 1 however
-+// testing shows that pass 0 produces overall faster decode.
-+// Interestingly it is QPU/VPU limited streams that seem to suffer
-+// from pass 1 waits, CPU limited ones tend to show a very mild gain.
-+// This define exists so it is easy to test this.
-+#define RPI_WORKER_WAIT_PASS_0  1
-+
-+// Use ARM emulation of QPU pred
-+// These are for debug only as the emulation makes only limited
-+// effort to be fast
-+#define RPI_QPU_EMU_Y           0
-+#define RPI_QPU_EMU_C           0
-+
-+// Max width & height we are prepared to consider
-+// Sand frame shape calc becomes confused with large frames
-+// Some buffer alloc also depends on this
-+#define HEVC_RPI_MAX_WIDTH      2048
-+#define HEVC_RPI_MAX_HEIGHT     1088
-+
-+
-+// Min CTB size is 16
-+// Max number of CTBs in a frame: width and height are each rounded up to
-+// a whole number of 16-pel CTBs.  The whole product is parenthesised so
-+// the macro stays a single operand when used inside a larger expression
-+// (e.g. x / HEVC_RPI_MAX_CTBS).
-+#define HEVC_RPI_MAX_CTBS (((HEVC_RPI_MAX_WIDTH + 15) / 16) * ((HEVC_RPI_MAX_HEIGHT + 15) / 16))
-+
-+/**
-+ * Value of the luma sample at position (x, y) in the 2D array tab.
-+ */
-+#define SAMPLE(tab, x, y) ((tab)[(y) * s->sps->width + (x)])
-+#define SAMPLE_CTB(tab, x, y) ((tab)[(y) * min_cb_width + (x)])
-+
-+#define IS_IDR(s) ((s)->nal_unit_type == HEVC_NAL_IDR_W_RADL || (s)->nal_unit_type == HEVC_NAL_IDR_N_LP)
-+#define IS_BLA(s) ((s)->nal_unit_type == HEVC_NAL_BLA_W_RADL || (s)->nal_unit_type == HEVC_NAL_BLA_W_LP || \
-+                   (s)->nal_unit_type == HEVC_NAL_BLA_N_LP)
-+#define IS_IRAP(s) ((s)->nal_unit_type >= 16 && (s)->nal_unit_type <= 23)
-+
-+// Reference picture set categories, numbered consecutively from 0.
-+// NB_RPS_TYPE is the last enumerator and therefore equals the number of
-+// categories before it (5).
-+enum RPSType {
-+    ST_CURR_BEF = 0,
-+    ST_CURR_AFT,
-+    ST_FOLL,
-+    LT_CURR,
-+    LT_FOLL,
-+    NB_RPS_TYPE,
-+};
-+
-+enum SyntaxElement {
-+    SAO_MERGE_FLAG = 0,
-+    SAO_TYPE_IDX,
-+    SAO_EO_CLASS,
-+    SAO_BAND_POSITION,
-+    SAO_OFFSET_ABS,
-+    SAO_OFFSET_SIGN,
-+    END_OF_SLICE_FLAG,
-+    SPLIT_CODING_UNIT_FLAG,
-+    CU_TRANSQUANT_BYPASS_FLAG,
-+    SKIP_FLAG,
-+    CU_QP_DELTA,
-+    PRED_MODE_FLAG,
-+    PART_MODE,
-+    PCM_FLAG,
-+    PREV_INTRA_LUMA_PRED_FLAG,
-+    MPM_IDX,
-+    REM_INTRA_LUMA_PRED_MODE,
-+    INTRA_CHROMA_PRED_MODE,
-+    MERGE_FLAG,
-+    MERGE_IDX,
-+    INTER_PRED_IDC,
-+    REF_IDX_L0,
-+    REF_IDX_L1,
-+    ABS_MVD_GREATER0_FLAG,
-+    ABS_MVD_GREATER1_FLAG,
-+    ABS_MVD_MINUS2,
-+    MVD_SIGN_FLAG,
-+    MVP_LX_FLAG,
-+    NO_RESIDUAL_DATA_FLAG,
-+    SPLIT_TRANSFORM_FLAG,
-+    CBF_LUMA,
-+    CBF_CB_CR,
-+    TRANSFORM_SKIP_FLAG,
-+    EXPLICIT_RDPCM_FLAG,
-+    EXPLICIT_RDPCM_DIR_FLAG,
-+    LAST_SIGNIFICANT_COEFF_X_PREFIX,
-+    LAST_SIGNIFICANT_COEFF_Y_PREFIX,
-+    LAST_SIGNIFICANT_COEFF_X_SUFFIX,
-+    LAST_SIGNIFICANT_COEFF_Y_SUFFIX,
-+    SIGNIFICANT_COEFF_GROUP_FLAG,
-+    SIGNIFICANT_COEFF_FLAG,
-+    COEFF_ABS_LEVEL_GREATER1_FLAG,
-+    COEFF_ABS_LEVEL_GREATER2_FLAG,
-+    COEFF_ABS_LEVEL_REMAINING,
-+    COEFF_SIGN_FLAG,
-+    LOG2_RES_SCALE_ABS,
-+    RES_SCALE_SIGN_FLAG,
-+    CU_CHROMA_QP_OFFSET_FLAG,
-+    CU_CHROMA_QP_OFFSET_IDX,
-+};
-+
-+enum PartMode {
-+    PART_2Nx2N = 0,
-+    PART_2NxN  = 1,
-+    PART_Nx2N  = 2,
-+    PART_NxN   = 3,
-+    PART_2NxnU = 4,
-+    PART_2NxnD = 5,
-+    PART_nLx2N = 6,
-+    PART_nRx2N = 7,
-+};
-+
-+enum PredMode {
-+    MODE_INTER = 0,
-+    MODE_INTRA,
-+    MODE_SKIP,
-+};
-+
-+enum InterPredIdc {
-+    PRED_L0 = 0,
-+    PRED_L1,
-+    PRED_BI,
-+};
-+
-+enum PredFlag {
-+    PF_INTRA = 0,
-+    PF_L0,
-+    PF_L1,
-+    PF_BI,
-+};
-+
-+enum SAOType {
-+    SAO_NOT_APPLIED = 0,
-+    SAO_BAND,
-+    SAO_EDGE,
-+    SAO_APPLIED
-+};
-+
-+enum SAOEOClass {
-+    SAO_EO_HORIZ = 0,
-+    SAO_EO_VERT,
-+    SAO_EO_135D,
-+    SAO_EO_45D,
-+};
-+
-+enum ScanType {
-+    SCAN_DIAG = 0,
-+    SCAN_HORIZ,
-+    SCAN_VERT,
-+};
-+
-+typedef struct RefPicList {
-+    struct HEVCRpiFrame *ref[HEVC_MAX_REFS];
-+    int list[HEVC_MAX_REFS];
-+    uint8_t isLongTerm[HEVC_MAX_REFS];
-+    int nb_refs;
-+} RefPicList;
-+
-+typedef struct RefPicListTab {
-+    RefPicList refPicList[2];
-+} RefPicListTab;
-+
-+typedef struct RpiCodingUnit {
-+    unsigned int x;             // Passed to deblock
-+    unsigned int y;
-+    unsigned int x_split;
-+    unsigned int y_split;
-+
-+    enum PredMode pred_mode;    ///< PredMode
-+    enum PartMode part_mode;    ///< PartMode
-+
-+    // Inferred parameters
-+    uint8_t intra_split_flag;   ///< IntraSplitFlag
-+    uint8_t max_trafo_depth;    ///< MaxTrafoDepth
-+    uint8_t cu_transquant_bypass_flag;
-+} RpiCodingUnit;
-+
-+typedef struct RpiPredictionUnit {
-+    uint8_t intra_pred_mode[4];
-+    uint8_t intra_pred_mode_c[4];
-+    uint8_t chroma_mode_c[4];
-+    uint8_t merge_flag;
-+} RpiPredictionUnit;
-+
-+typedef struct HEVCRpiTransformUnit {
-+    int8_t cu_qp_delta;
-+
-+    // Inferred parameters;
-+    uint8_t intra_pred_mode;
-+    uint8_t intra_pred_mode_c;
-+    uint8_t chroma_mode_c;
-+    uint8_t is_cu_qp_delta_wanted;
-+    uint8_t cu_chroma_qp_offset_wanted;
-+    const int8_t * qp_divmod6[3];
-+} HEVCRpiTransformUnit;
-+
-+typedef struct DBParams {
-+    int8_t beta_offset; // -12 to +12
-+    int8_t tc_offset;   // -12 to +12
-+} DBParams;
-+
-+#define HEVC_FRAME_FLAG_OUTPUT    (1 << 0)
-+#define HEVC_FRAME_FLAG_SHORT_REF (1 << 1)
-+#define HEVC_FRAME_FLAG_LONG_REF  (1 << 2)
-+#define HEVC_FRAME_FLAG_BUMPING   (1 << 3)
-+
-+struct HEVCRpiJob;
-+
-+typedef struct HEVCRpiFrame {
-+    AVFrame *frame;
-+    ThreadFrame tf;
-+    ColMvField *col_mvf;
-+    int poc;
-+    struct HEVCRpiFrame *collocated_ref;
-+
-+    AVBufferRef *col_mvf_buf;
-+
-+    /**
-+     * A sequence counter, so that old frames are output first
-+     * after a POC reset
-+     */
-+    uint16_t sequence;
-+
-+    /**
-+     * A combination of HEVC_FRAME_FLAG_*
-+     */
-+    uint8_t flags;
-+
-+    // Entry no in DPB - can be used as a small unique
-+    // frame identifier (within the current thread)
-+    uint8_t dpb_no;
-+} HEVCRpiFrame;
-+
-+typedef struct HEVCRpiLocalContext {
-+    HEVCRpiTransformUnit tu;
-+
-+    CABACContext cc;
-+
-+    // Vars that allow us to locate everything from just an lc
-+    struct HEVCRpiContext * context;  // ??? make const ???
-+    unsigned int lc_n; // lc list el no
-+
-+    // Job wait links
-+    struct HEVCRpiLocalContext * jw_next;
-+    struct HEVCRpiLocalContext * jw_prev;
-+    struct HEVCRpiLocalContext * ljw_next;
-+    struct HEVCRpiLocalContext * ljw_prev;
-+    struct HEVCRpiJob * volatile jw_job;
-+    sem_t jw_sem;
-+
-+    // ?? Wrap in structure ??
-+    sem_t bt_sem_in;
-+    sem_t * bt_psem_out;
-+    volatile int bt_terminate;
-+    unsigned int ts;
-+    unsigned int bt_last_line;  // Last line in this bit_thread chunk
-+    unsigned int bt_line_no;
-+    unsigned int bt_line_width;
-+    unsigned int bt_line_inc;
-+
-+    struct HEVCRpiJob * jb0;
-+    char unit_done;  // Set once we have dealt with this slice
-+    char bt_is_tile;
-+    char last_progress_good;
-+    char cabac_init_req;
-+
-+    uint8_t cabac_state[HEVC_CONTEXTS];
-+    uint8_t stat_coeff[4];
-+    GetBitContext gb;
-+
-+    uint8_t ct_depth;
-+    int8_t qp_y;
-+    int8_t curr_qp_y;
-+    int8_t qPy_pred;
-+
-+// N.B. Used by asm (neon) - do not change
-+#define AVAIL_S_UR  0
-+#define AVAIL_S_U   1
-+#define AVAIL_S_UL  2
-+#define AVAIL_S_L   3
-+#define AVAIL_S_DL  4
-+
-+#define AVAIL_U     (1 << AVAIL_S_U)
-+#define AVAIL_L     (1 << AVAIL_S_L)
-+#define AVAIL_UL    (1 << AVAIL_S_UL)
-+#define AVAIL_UR    (1 << AVAIL_S_UR)
-+#define AVAIL_DL    (1 << AVAIL_S_DL)
-+
-+// Intra filters - same number space as avail
-+#define FILTER_LIGHT    0x40
-+#define FILTER_STRONG   0x80
-+#define FILTER_EITHER   (FILTER_LIGHT | FILTER_STRONG)
-+
-+    uint8_t ctb_avail;
-+    int     end_of_ctb_x;
-+    int     end_of_ctb_y;
-+
-+    RpiCodingUnit cu;
-+    RpiPredictionUnit pu;
-+
-+#define BOUNDARY_LEFT_SLICE     (1 << 0)
-+#define BOUNDARY_LEFT_TILE      (1 << 1)
-+#define BOUNDARY_UPPER_SLICE    (1 << 2)
-+#define BOUNDARY_UPPER_TILE     (1 << 3)
-+    /* properties of the boundary of the current CTB for the purposes
-+     * of the deblocking filter */
-+    unsigned int boundary_flags;
-+
-+#define IPM_TAB_SIZE (HEVC_MAX_CTB_SIZE >> LOG2_MIN_PU_SIZE)
-+    uint8_t ipm_left[IPM_TAB_SIZE];
-+    uint8_t ipm_up[IPM_TAB_SIZE];
-+
-+//#define MVF_STASH_WIDTH       128
-+#define MVF_STASH_WIDTH       64
-+#define MVF_STASH_HEIGHT      64
-+#define MVF_STASH_WIDTH_PU    (MVF_STASH_WIDTH >> LOG2_MIN_PU_SIZE)
-+#define MVF_STASH_HEIGHT_PU   (MVF_STASH_HEIGHT >> LOG2_MIN_PU_SIZE)
-+    HEVCRpiMvField mvf_ul[1];
-+    HEVCRpiMvField mvf_stash[MVF_STASH_WIDTH_PU * MVF_STASH_HEIGHT_PU];
-+
-+    /* +7 is for subpixel interpolation, *2 for high bit depths */
-+//    DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2];
-+    /* The extended size between the new edge emu buffer is abused by SAO */
-+//    DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer2)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2];
-+//    DECLARE_ALIGNED(32, int16_t, tmp [MAX_PB_SIZE * MAX_PB_SIZE]);
-+
-+} HEVCRpiLocalContext;
-+
-+// Each block can have an intra prediction and an add_residual command
-+// noof-cmds(2) * max-ctu height(64) / min-transform(4) * planes(3) * MAX_WIDTH
-+
-+// Sand only has 2 planes (Y/C)
-+#define RPI_MAX_PRED_CMDS (2*(HEVC_MAX_CTB_SIZE/4)*2*(HEVC_RPI_MAX_WIDTH/4))
-+
-+// Command for intra prediction and transform_add of predictions to coefficients
-+// Values are consecutive from 0; RPI_PRED_CMD_MAX is the number of commands.
-+enum rpi_pred_cmd_e
-+{
-+    RPI_PRED_ADD_RESIDUAL,
-+    // The next two were annotated "= RPI_PRED_TRANSFORM_ADD + c_idx", a name
-+    // that is not a member of this enum; numerically they are
-+    // RPI_PRED_ADD_RESIDUAL + 1 and + 2 (presumably c_idx 1 and 2 - confirm)
-+    RPI_PRED_ADD_RESIDUAL_U, // = RPI_PRED_ADD_RESIDUAL + c_idx
-+    RPI_PRED_ADD_RESIDUAL_V, // = RPI_PRED_ADD_RESIDUAL + c_idx
-+    RPI_PRED_ADD_RESIDUAL_C, // Merged U+V
-+    RPI_PRED_ADD_DC,
-+    RPI_PRED_ADD_DC_U,       // Both U & V are effectively C
-+    RPI_PRED_ADD_DC_V,
-+    RPI_PRED_INTRA,
-+    RPI_PRED_INTRA_C,
-+    RPI_PRED_I_PCM,
-+    RPI_PRED_CMD_MAX
-+};
-+
-+typedef struct HEVCPredCmd {
-+    uint8_t type;
-+    uint8_t size;  // log2 "size" used by all variants
-+    uint8_t avail; // i_pred - but left here as they pack well
-+    uint8_t dummy;
-+    union {
-+        struct {  // TRANSFORM_ADD
-+            uint8_t * dst;
-+            const int16_t * buf;
-+            uint16_t stride;  // Should be good enough for all pic fmts we use
-+            int16_t dc;
-+        } ta;
-+        struct {
-+            uint8_t * dst;
-+            uint32_t stride;
-+            int dc;
-+        } dc;
-+        struct {  // INTRA
-+            uint16_t x;
-+            uint16_t y;
-+            enum IntraPredMode mode;
-+        } i_pred;
-+        struct {  // I_PCM
-+            uint16_t x;
-+            uint16_t y;
-+            const void * src;
-+            uint32_t src_len;
-+        } i_pcm;
-+    };
-+} HEVCPredCmd;
-+
-+// Forward declarations of the QPU command types that the structures
-+// below only refer to by pointer.  The union tag matches the one used by
-+// HEVCRpiInterPredQ (qpu_mc_pred_cmd_u).
-+union qpu_mc_pred_cmd_u;
-+struct qpu_mc_pred_y_p_s;
-+struct qpu_mc_src_s;
-+
-+typedef struct HEVCRpiInterPredQ
-+{
-+    union qpu_mc_pred_cmd_u *qpu_mc_base;
-+    union qpu_mc_pred_cmd_u *qpu_mc_curr;
-+    struct qpu_mc_src_s *last_l0;
-+    struct qpu_mc_src_s *last_l1;
-+    unsigned int load;
-+    uint32_t code_setup;
-+    uint32_t code_sync;
-+    uint32_t code_exit;
-+} HEVCRpiInterPredQ;
-+
-+typedef struct HEVCRpiInterPredEnv
-+{
-+    HEVCRpiInterPredQ * q;
-+    uint8_t n;                  // Number of Qs
-+    uint8_t n_grp;              // Number of Q in a group
-+    uint8_t curr;               // Current Q number (0..n-1)
-+    uint8_t used;               // 0 if nothing in any Q, 1 otherwise
-+    uint8_t used_grp;           // 0 if nothing in any Q in the current group
-+    unsigned int max_fill;
-+    unsigned int min_gap;
-+    GPU_MEM_PTR_T gptr;
-+} HEVCRpiInterPredEnv;
-+
-+typedef struct HEVCRpiIntraPredEnv {
-+    unsigned int n;        // Number of commands
-+    HEVCPredCmd * cmds;
-+} HEVCRpiIntraPredEnv;
-+
-+typedef struct HEVCRpiCoeffEnv {
-+    unsigned int n;
-+#if RPI_COMPRESS_COEFFS
-+    unsigned int packed; // Equal to 1 if coefficients should be being packed
-+    unsigned int packed_n; // Value of n when packed was set equal to 0 (i.e. the amount that is sent compressed).  Only valid if packed==0
-+#endif
-+    int16_t * buf;
-+} HEVCRpiCoeffEnv;
-+
-+typedef struct HEVCRpiCoeffsEnv {
-+    HEVCRpiCoeffEnv s[4];
-+    GPU_MEM_PTR_T gptr;
-+    void * mptr;
-+} HEVCRpiCoeffsEnv;
-+
-+typedef struct HEVCRpiFrameProgressWait {
-+    int req;
-+    struct HEVCRpiFrameProgressWait * next;
-+    sem_t sem;
-+} HEVCRpiFrameProgressWait;
-+
-+typedef struct HEVCRpiFrameProgressState {
-+    struct HEVCRpiFrameProgressWait * first;
-+    struct HEVCRpiFrameProgressWait * last;
-+    pthread_mutex_t lock;
-+} HEVCRpiFrameProgressState;
-+
-+typedef struct RpiBlk
-+{
-+    unsigned int x;
-+    unsigned int y;
-+    unsigned int w;
-+    unsigned int h;
-+} RpiBlk;
-+
-+typedef struct HEVCRpiJob {
-+    struct HEVCRpiJob * next;  // Free chain
-+    struct HEVCRpiJobCtl * jbc_local;
-+    const HEVCRpiSPS * sps;       // sps used to set up this job
-+
-+    int waited;
-+    int ctu_ts_first;
-+    int ctu_ts_last;
-+    RpiBlk bounds;  // Bounding box of job
-+
-+    struct qpu_mc_pred_y_p_s * last_y8_p;
-+    struct qpu_mc_src_s * last_y8_l1;
-+    rpi_cache_flush_env_t * rfe;
-+
-+    HEVCRpiInterPredEnv chroma_ip;
-+    HEVCRpiInterPredEnv luma_ip;
-+    int16_t progress_req[HEVC_DPB_ELS]; // index by dpb_no
-+    HEVCRpiIntraPredEnv intra;
-+    HEVCRpiCoeffsEnv coeffs;
-+    HEVCRpiFrameProgressWait progress_wait;
-+    sem_t sem;
-+    rpi_cache_buf_t flush_buf;
-+} HEVCRpiJob;
-+
-+struct HEVCRpiContext;
-+
-+typedef void HEVCRpiWorkerFn(const struct HEVCRpiContext * const s, HEVCRpiJob * const jb);
-+
-+typedef struct HEVCRpiPassQueue
-+{
-+//    int pending;
-+    volatile int terminate;
-+    sem_t sem_in;
-+    sem_t * psem_out;
-+    unsigned int job_n;
-+    struct HEVCRpiContext * context; // Context pointer as we get to pass a single "void * this" to the thread
-+    HEVCRpiWorkerFn * worker;
-+    pthread_t thread;
-+    uint8_t pass_n;  // Pass number - debug
-+    uint8_t started;
-+} HEVCRpiPassQueue;
-+
-+
-+struct HEVCRpiJobGlobal;
-+
-+typedef struct HEVCRpiJobCtl
-+{
-+    sem_t sem_out;
-+
-+    HEVCRpiJob * volatile jb1;  // The job associated with this frame if unallocated - NULL if allocated
-+    struct HEVCRpiJobGlobal * jbg;
-+
-+    HEVCRpiLocalContext * lcw_head;
-+    HEVCRpiLocalContext * lcw_tail;
-+
-+    pthread_mutex_t in_lock;
-+    int offload_in;
-+
-+    HEVCRpiJob *offloadq[RPI_MAX_JOBS];
-+} HEVCRpiJobCtl;
-+
-+
-+// Job pool state: a chain of free jobs plus the list of local contexts
-+// waiting for one.  HEVCRpiJobCtl holds a pointer to this structure (jbg).
-+// NOTE(review): presumably all list fields are guarded by `lock` and the
-+// structure is shared via `ref_count` - confirm against the .c file.
-+typedef struct HEVCRpiJobGlobal
-+{
-+    intptr_t ref_count;
-+    pthread_mutex_t lock;
-+    HEVCRpiJob * free1;                 // Singly linked list of free jobs
-+    HEVCRpiLocalContext * wait_head;       // Doubly linked list of lcs waiting for a job
-+    HEVCRpiLocalContext * wait_good;  // Last good tail
-+    HEVCRpiLocalContext * wait_tail;
-+
-+} HEVCRpiJobGlobal;
-+
-+#define RPI_BIT_THREADS (RPI_EXTRA_BIT_THREADS + 1)
-+
-+#if RPI_TSTATS
-+typedef struct HEVCRpiStats {
-+    int y_pred1_y8_merge;
-+    int y_pred1_xy;
-+    int y_pred1_x0;
-+    int y_pred1_y0;
-+    int y_pred1_x0y0;
-+    int y_pred1_wle8;
-+    int y_pred1_wgt8;
-+    int y_pred1_hle16;
-+    int y_pred1_hgt16;
-+    int y_pred2_xy;
-+    int y_pred2_x0;
-+    int y_pred2_y0;
-+    int y_pred2_x0y0;
-+    int y_pred2_hle16;
-+    int y_pred2_hgt16;
-+} HEVCRpiStats;
-+#endif
-+
-+typedef struct HEVCRpiCabacState
-+{
-+    uint8_t rice[4];
-+    uint8_t state[HEVC_CONTEXTS];
-+} HEVCRpiCabacState;
-+
-+#define HEVC_RPI_BS_STRIDE1_PEL_SHIFT   6   // 64 pels
-+#define HEVC_RPI_BS_STRIDE1_PELS        (1U << HEVC_RPI_BS_STRIDE1_PEL_SHIFT)
-+#define HEVC_RPI_BS_STRIDE1_PEL_MASK    (HEVC_RPI_BS_STRIDE1_PELS - 1)
-+#define HEVC_RPI_BS_ELS_PER_BYTE_SHIFT  2   // 4 els per byte
-+#define HEVC_RPI_BS_PELS_PER_EL_SHIFT   2   // 4 pels per el
-+#define HEVC_RPI_BS_PELS_PER_BYTE_SHIFT (HEVC_RPI_BS_PELS_PER_EL_SHIFT + HEVC_RPI_BS_ELS_PER_BYTE_SHIFT)
-+#define HEVC_RPI_BS_STRIDE1_BYTE_SHIFT  (HEVC_RPI_BS_STRIDE1_PEL_SHIFT - HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)
-+#define HEVC_RPI_BS_STRIDE1_BYTES       (1U << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT)
-+#define HEVC_RPI_BS_Y_SHR               3   // 8 vertical pels per row
-+#define HEVC_RPI_BS_COL_BYTES_SHR       (HEVC_RPI_BS_Y_SHR - HEVC_RPI_BS_STRIDE1_BYTE_SHIFT)
-+
-+// Main private context of the hevc_rpi decoder (AVCodecContext.priv_data).
-+// The first member must stay the AVClass pointer so that private
-+// AVOptions (see OFFSET() users) work.
-+typedef struct HEVCRpiContext {
-+    const AVClass *c;  // needed by private avoptions
-+    AVCodecContext *avctx;
-+
-+    uint8_t             threads_type;
-+    char qpu_init_ok;
-+
-+    /** 1 if the independent slice segment header was successfully parsed */
-+    uint8_t slice_initialized;
-+    char used_for_ref;  // rpi
-+    char is_irap;
-+    char offload_recon;
-+    uint8_t eos;       ///< current packet contains an EOS/EOB NAL
-+    uint8_t last_eos;  ///< last packet contains an EOS/EOB NAL
-+    uint8_t no_backward_pred_flag;
-+    uint8_t is_decoded;
-+    uint8_t no_rasl_output_flag;
-+
-+
-+    /**
-+     * Sequence counters for decoded and output frames, so that old
-+     * frames are output first after a POC reset
-+     */
-+    uint16_t seq_decode;
-+    uint16_t seq_output;
-+
-+    int                 width;
-+    int                 height;
-+
-+    HEVCRpiJobCtl * jbc;
-+    // cabac stash
-+    // b0       skip flag
-+    // b1+      ct_depth
-+    uint8_t * cabac_stash_left;
-+    uint8_t * cabac_stash_up;
-+
-+    // Function pointers
-+    // NOTE(review): the members that follow are QPU dummy-frame handles and
-+    // the HEVCRpiQpu state; presumably the function pointers live inside
-+    // HEVCRpiQpu - confirm
-+#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C
-+    const uint8_t * qpu_dummy_frame_emu;
-+#endif
-+#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C
-+    uint32_t qpu_dummy_frame_qpu;  // Not a frame - just a bit of memory
-+#endif
-+    HEVCRpiQpu qpu;
-+
-+    HEVCRpiFrameProgressState progress_states[2];
-+
-+    HEVCRpiCabacState *cabac_save;
-+
-+    AVFrame *frame;
-+    AVFrame *output_frame;
-+    uint8_t *sao_pixel_buffer_h[3];
-+    uint8_t *sao_pixel_buffer_v[3];
-+
-+    unsigned int col_mvf_stride;
-+    AVBufferPool *col_mvf_pool;
-+
-+    RpiSAOParams *sao;
-+    DBParams *deblock;
-+    enum HEVCNALUnitType nal_unit_type;
-+    int temporal_id;  ///< temporal_id_plus1 - 1
-+    HEVCRpiFrame *ref;
-+    int poc;
-+    int pocTid0;
-+    int slice_idx; ///< number of the slice being currently decoded
-+    int max_ra;
-+
-+    int8_t *qp_y_tab;
-+
-+    // Deblocking block strength bitmaps
-+    unsigned int bs_stride2;
-+    unsigned int bs_size;
-+    uint8_t *bs_horizontal;
-+    uint8_t *bs_vertical;
-+    uint8_t *bsf_stash_up;
-+    uint8_t *bsf_stash_left;
-+
-+    // Element type of tab_slice_address is picked so that every CTB index
-+    // up to HEVC_RPI_MAX_CTBS fits, with the all-ones value reserved as
-+    // TAB_SLICE_ADDR_BROKEN.
-+    // NOTE(review): ~(uint16_t)0 has type int and value -1 after integer
-+    // promotion, so `elem == TAB_SLICE_ADDR_BROKEN` with a uint16_t elem
-+    // would never be true; it only yields 0xffff when converted back to
-+    // uint16_t.  Both macros are also unparenthesised.  Confirm how the
-+    // macro is used before relying on comparisons.
-+#if HEVC_RPI_MAX_CTBS >= 0xffff
-+#define TAB_SLICE_ADDR_BROKEN ~(uint32_t)0
-+    uint32_t *tab_slice_address;
-+#else
-+#define TAB_SLICE_ADDR_BROKEN ~(uint16_t)0
-+    uint16_t *tab_slice_address;
-+#endif
-+
-+    // Bitfield 1 bit per 8 pels (min pcm size)
-+    uint8_t *is_pcm;
-+    // Bitfield 1 bit per 8 pels (min cb size)
-+    // Only needed for CIP as CIP processing is async to the main thread
-+    uint8_t *is_intra;
-+
-+    // PU
-+    HEVCRpiMvField *mvf_up;
-+    HEVCRpiMvField *mvf_left;
-+
-+    const RefPicList **rpl_up;
-+    const RefPicList **rpl_left;
-+    RefPicList * refPicList;
-+
-+    // CTB-level flags affecting loop filter operation
-+    uint8_t *filter_slice_edges;
-+
-+    /** used on BE to byteswap the lines for checksumming */
-+    uint8_t *checksum_buf;
-+    int      checksum_buf_size;
-+
-+    const uint8_t *data;
-+
-+    H2645Packet pkt;
-+    // type of the first VCL NAL of the current frame
-+    enum HEVCNALUnitType first_nal_type;
-+
-+    uint8_t context_initialized;
-+    int is_nalff;           ///< this flag is != 0 if bitstream is encapsulated
-+                            ///< as a format defined in 14496-15
-+    int apply_defdispwin;
-+
-+    int nal_length_size;    ///< Number of bytes used for nal length (1, 2 or 4)
-+    int nuh_layer_id;
-+
-+    struct AVMD5 *md5_ctx;
-+
-+    RefPicListTab * rpl_tab;
-+    unsigned int rpl_tab_size;
-+
-+    uint8_t *is_intra_store;
-+
-+    RpiSliceHeader sh;
-+
-+    HEVCRpiParamSets ps;
-+
-+    HEVCRpiLocalContext    *HEVClc;
-+    HEVCRpiLocalContext    *HEVClcList[MAX_NB_THREADS];
-+
-+    HEVCRpiFrame DPB[HEVC_DPB_ELS];
-+
-+    ///< candidate references for the current frame
-+    RefPicList rps[5];
-+
-+    HEVCRpiPredContext hpc;
-+    HEVCDSPContext hevcdsp;
-+
-+    HEVCSEIContext sei;
-+
-+    // Put structures that allocate non-trivial storage at the end
-+    // These are mostly used indirectly so position in the structure doesn't matter
-+    HEVCRpiPassQueue passq[RPI_PASSES];
-+#if RPI_EXTRA_BIT_THREADS > 0
-+    int bt_started;
-+    // This simply contains thread descriptors - task setup is held elsewhere
-+    pthread_t bit_threads[RPI_EXTRA_BIT_THREADS];
-+#endif
-+#if RPI_TSTATS
-+    HEVCRpiStats tstats;
-+#endif
-+} HEVCRpiContext;
-+
-+/**
-+ * Mark all frames in DPB as unused for reference.
-+ */
-+void ff_hevc_rpi_clear_refs(HEVCRpiContext *s);
-+
-+/**
-+ * Drop all frames currently in DPB.
-+ */
-+void ff_hevc_rpi_flush_dpb(HEVCRpiContext *s);
-+
-+/**
-+ * Construct the reference picture sets for the current frame.
-+ */
-+int ff_hevc_rpi_frame_rps(HEVCRpiContext *s);
-+
-+/**
-+ * Construct the reference picture list(s) for the current slice.
-+ */
-+int ff_hevc_rpi_slice_rpl(HEVCRpiContext *s);
-+
-+
-+/**
-+ * Get the number of candidate references for the current frame.
-+ */
-+int ff_hevc_rpi_frame_nb_refs(HEVCRpiContext *s);
-+
-+int ff_hevc_rpi_set_new_ref(HEVCRpiContext *s, AVFrame **frame, int poc);
-+
-+/**
-+ * Find next frame in output order and put a reference to it in frame.
-+ * @return 1 if a frame was output, 0 otherwise
-+ */
-+int ff_hevc_rpi_output_frame(HEVCRpiContext *s, AVFrame *frame, int flush);
-+
-+void ff_hevc_rpi_bump_frame(HEVCRpiContext *s);
-+
-+void ff_hevc_rpi_unref_frame(HEVCRpiContext *s, HEVCRpiFrame *frame, int flags);
-+
-+unsigned int ff_hevc_rpi_tb_avail_flags(
-+    const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc,
-+    const unsigned int x, const unsigned int y, const unsigned int w, const unsigned int h);
-+
-+void ff_hevc_rpi_luma_mv_merge_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, int nPbW,
-+                                int nPbH, int log2_cb_size, int part_idx,
-+                                int merge_idx, HEVCRpiMvField * const mv);
-+void ff_hevc_rpi_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
-+    const unsigned int x0, const unsigned int y0,
-+    const unsigned int nPbW, const unsigned int nPbH,
-+    const unsigned int avail,
-+    HEVCRpiMvField * const mv,
-+    const unsigned int mvp_lx_flag, const unsigned int LX);
-+void ff_hevc_rpi_set_qPy(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int xBase, int yBase);
-+void ff_hevc_rpi_deblocking_boundary_strengths(const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc,
-+                                               const unsigned int x0, const unsigned int y0,
-+                                               const unsigned int log2_trafo_size, const int is_coded_block);
-+int ff_hevc_rpi_hls_filter_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int eot);
-+
-+extern const uint8_t ff_hevc_rpi_qpel_extra_before[4];
-+extern const uint8_t ff_hevc_rpi_qpel_extra_after[4];
-+extern const uint8_t ff_hevc_rpi_qpel_extra[4];
-+
-+int16_t * rpi_alloc_coeff_buf(HEVCRpiJob * const jb, const int buf_no, const int n);
-+
-+// arm/hevc_misc_neon.S
-+// Neon coeff zap fn
-+#if HAVE_NEON
-+extern void rpi_zap_coeff_vals_neon(int16_t * dst, unsigned int l2ts_m2);
-+#endif
-+
-+void ff_hevc_rpi_progress_wait_field(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
-+                                     const HEVCRpiFrame * const ref, const int val, const int field);
-+
-+void ff_hevc_rpi_progress_signal_field(HEVCRpiContext * const s, const int val, const int field);
-+
-+// All of these expect that s->threads_type == FF_THREAD_FRAME
-+
-+// Wait on progress field 1 of ref for row y.
-+// Does nothing when s->threads_type is 0.
-+static inline void ff_hevc_rpi_progress_wait_mv(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
-+                                     const HEVCRpiFrame * const ref, const int y)
-+{
-+    if (s->threads_type == 0)
-+        return;
-+
-+    ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 1);
-+}
-+
-+// Signal progress value y on field 1.
-+// Skipped unless the current frame is used for reference and
-+// s->threads_type is non-zero.
-+static inline void ff_hevc_rpi_progress_signal_mv(HEVCRpiContext * const s, const int y)
-+{
-+    if (!s->used_for_ref || s->threads_type == 0)
-+        return;
-+
-+    ff_hevc_rpi_progress_signal_field(s, y, 1);
-+}
-+
-+// Wait on progress field 0 of ref for row y.
-+// NOTE(review): unlike ff_hevc_rpi_progress_wait_mv() this forwards
-+// unconditionally, with no s->threads_type check - confirm that is
-+// intended (the callee may perform the check itself).
-+static inline void ff_hevc_rpi_progress_wait_recon(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
-+                                     const HEVCRpiFrame * const ref, const int y)
-+{
-+    ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 0);
-+}
-+
-+// Signal progress value y on field 0.
-+// Skipped unless the current frame is used for reference and
-+// s->threads_type is non-zero.
-+static inline void ff_hevc_rpi_progress_signal_recon(HEVCRpiContext * const s, const int y)
-+{
-+    if (!s->used_for_ref || s->threads_type == 0)
-+        return;
-+
-+    ff_hevc_rpi_progress_signal_field(s, y, 0);
-+}
-+
-+// Signal INT_MAX on both progress fields (0 first, then 1).
-+// Unlike the per-row signal helpers this is not conditional on
-+// used_for_ref or threads_type.
-+static inline void ff_hevc_rpi_progress_signal_all_done(HEVCRpiContext * const s)
-+{
-+    ff_hevc_rpi_progress_signal_field(s, INT_MAX, 0);
-+    ff_hevc_rpi_progress_signal_field(s, INT_MAX, 1);
-+}
-+
-+
-+// Set all done - signal nothing (used in missing refs)
-+// Works for both rpi & non-rpi
-+static inline void ff_hevc_rpi_progress_set_all_done(HEVCRpiFrame * const ref)
-+{
-+    int * progress;
-+
-+    // No progress buffer attached - nothing to mark
-+    if (ref->tf.progress == NULL)
-+        return;
-+
-+    // Write both progress slots directly; no waiter is woken here
-+    progress = (int *)ref->tf.progress->data;
-+    progress[0] = INT_MAX;
-+    progress[1] = INT_MAX;
-+}
-+
-+#define HEVC_RPI_420_ONLY 1
-+#define HEVC_RPI_SAND128_ONLY 1
-+
-+// Horizontal shift for plane cidx.
-+// With HEVC_RPI_420_ONLY set this is a constant: 0 for plane 0, 1 for any
-+// other plane; otherwise it is read from the active SPS.
-+static inline unsigned int ctx_hshift(const HEVCRpiContext * const s, const int cidx)
-+{
-+#if !HEVC_RPI_420_ONLY
-+    return s->ps.sps->hshift[cidx];
-+#else
-+    return cidx == 0 ? 0 : 1;
-+#endif
-+}
-+
-+static inline unsigned int ctx_vshift(const HEVCRpiContext * const s, const int cidx)
-+{
-+#if HEVC_RPI_420_ONLY
-+    return cidx == 0 ? 0 : 1;
-+#else
-+    return s->ps.sps->vshift[cidx];
-+#endif
-+}
-+
-+static inline int ctx_cfmt(const HEVCRpiContext * const s)
-+{
-+#if HEVC_RPI_420_ONLY
-+    return 1;
-+#else
-+    return s->ps.sps->chroma_format_idc;
-+#endif
-+}
-+
-+static inline int frame_stride1(const AVFrame * const frame, const int c_idx)
-+{
-+#if HEVC_RPI_SAND128_ONLY
-+    return 128;
-+#else
-+    return frame->linesize[c_idx];
-+#endif
-+}
-+
-+#if HEVC_RPI_SAND128_ONLY
-+// Propagate this decision to later zc includes
-+#define RPI_ZC_SAND128_ONLY 1
-+#endif
-+
-+#ifndef ff_hevc_rpi_copy_vert
-+static inline void ff_hevc_rpi_copy_vert(uint8_t *dst, const uint8_t *src,
-+                                         int pixel_shift, int height,
-+                                         ptrdiff_t stride_dst, ptrdiff_t stride_src)
-+{
-+    int i;
-+    switch (pixel_shift)
-+    {
-+        case 2:
-+            for (i = 0; i < height; i++) {
-+                *(uint32_t *)dst = *(uint32_t *)src;
-+                dst += stride_dst;
-+                src += stride_src;
-+            }
-+            break;
-+        case 1:
-+            for (i = 0; i < height; i++) {
-+                *(uint16_t *)dst = *(uint16_t *)src;
-+                dst += stride_dst;
-+                src += stride_src;
-+            }
-+            break;
-+        default:
-+            for (i = 0; i < height; i++) {
-+                *dst = *src;
-+                dst += stride_dst;
-+                src += stride_src;
-+            }
-+            break;
-+    }
-+}
-+#endif
-+
-+
-+#if MVF_STASH_WIDTH == 64
-+static inline HEVCRpiMvField* mvf_stash_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc,
-+                               const unsigned int x, const unsigned int y)
-+{
-+    const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size);
-+    return (HEVCRpiMvField*)(lc->mvf_stash + ((y & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE) * MVF_STASH_WIDTH_PU + ((x & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE));
-+}
-+
-+static inline HEVCRpiMvField* mvf_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc,
-+                               const unsigned int x0, const unsigned int y0,
-+                               const unsigned int x, const unsigned int y)
-+{
-+    const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size);
-+    const unsigned int x0_ctb = x0 & mask_cs_hi;
-+    const unsigned int y0_ctb = y0 & mask_cs_hi;
-+
-+    return (HEVCRpiMvField *)((y < y0_ctb) ?
-+        (x < x0_ctb ? lc->mvf_ul : s->mvf_up + (x >> LOG2_MIN_PU_SIZE)) :
-+        (x < x0_ctb ? s->mvf_left + (y >> LOG2_MIN_PU_SIZE) :
-+            lc->mvf_stash +
-+                ((y & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE) * MVF_STASH_WIDTH_PU +
-+                ((x & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE)));
-+}
-+
-+static inline unsigned int mvf_left_stride(const HEVCRpiContext *const s,
-+                               const unsigned int x0,
-+                               const unsigned int x)
-+{
-+    const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size);
-+    const unsigned int x0_ctb = x0 & mask_cs_hi;
-+    return x < x0_ctb ? 1 : MVF_STASH_WIDTH_PU;
-+}
-+
-+#else
-+static inline HEVCRpiMvField* mvf_stash_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc,
-+                               const unsigned int x, const unsigned int y)
-+{
-+    const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size);
-+    return (HEVCRpiMvField*)(lc->mvf_stash + ((y & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE) * MVF_STASH_WIDTH_PU + ((x >> LOG2_MIN_PU_SIZE) & (MVF_STASH_WIDTH_PU - 1)));
-+}
-+
-+static inline HEVCRpiMvField* mvf_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc,
-+                               const unsigned int x0, const unsigned int y0,
-+                               const unsigned int x, const unsigned int y)
-+{
-+    const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size);
-+
-+    const unsigned int x0_ctb = x0 & mask_cs_hi;
-+    const unsigned int y0_ctb = y0 & mask_cs_hi;
-+
-+    // If not in the same CTB for Y assume up
-+    if (y < y0_ctb) {
-+        // If not in the same CTB for X too assume up-left
-+        return (HEVCRpiMvField *)(x < x0_ctb ? lc->mvf_ul : s->mvf_up + (x >> LOG2_MIN_PU_SIZE));
-+    }
-+    return mvf_stash_ptr(s, lc, x, y);
-+}
-+
-+static inline unsigned int mvf_left_stride(const HEVCRpiContext *const s,
-+                               const unsigned int x0,
-+                               const unsigned int x)
-+{
-+    return MVF_STASH_WIDTH_PU;
-+}
-+#endif
-+
-+#endif /* AVCODEC_RPI_HEVCDEC_H */
---- /dev/null
-+++ b/libavcodec/rpi_hevcdsp.c
-@@ -0,0 +1,450 @@
-+/*
-+ * HEVC video decoder
-+ *
-+ * Copyright (C) 2012 - 2013 Guillaume Martres
-+ * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere
-+ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading)
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#include "rpi_hevcdsp.h"
-+#include "rpi_hevc_mv.h"
-+
-+static const int8_t transform[32][32] = {
-+    { 64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,
-+      64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64 },
-+    { 90,  90,  88,  85,  82,  78,  73,  67,  61,  54,  46,  38,  31,  22,  13,   4,
-+      -4, -13, -22, -31, -38, -46, -54, -61, -67, -73, -78, -82, -85, -88, -90, -90 },
-+    { 90,  87,  80,  70,  57,  43,  25,   9,  -9, -25, -43, -57, -70, -80, -87, -90,
-+     -90, -87, -80, -70, -57, -43, -25,  -9,   9,  25,  43,  57,  70,  80,  87,  90 },
-+    { 90,  82,  67,  46,  22,  -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13,
-+      13,  38,  61,  78,  88,  90,  85,  73,  54,  31,   4, -22, -46, -67, -82, -90 },
-+    { 89,  75,  50,  18, -18, -50, -75, -89, -89, -75, -50, -18,  18,  50,  75,  89,
-+      89,  75,  50,  18, -18, -50, -75, -89, -89, -75, -50, -18,  18,  50,  75,  89 },
-+    { 88,  67,  31, -13, -54, -82, -90, -78, -46, -4,   38,  73,  90,  85,  61,  22,
-+     -22, -61, -85, -90, -73, -38,   4,  46,  78,  90,  82,  54,  13, -31, -67, -88 },
-+    { 87,  57,   9, -43, -80, -90, -70, -25,  25,  70,  90,  80,  43,  -9, -57, -87,
-+     -87, -57,  -9,  43,  80,  90,  70,  25, -25, -70, -90, -80, -43,   9,  57,  87 },
-+    { 85,  46, -13, -67, -90, -73, -22,  38,  82,  88,  54,  -4, -61, -90, -78, -31,
-+      31,  78,  90,  61,   4, -54, -88, -82, -38,  22,  73,  90,  67,  13, -46, -85 },
-+    { 83,  36, -36, -83, -83, -36,  36,  83,  83,  36, -36, -83, -83, -36,  36,  83,
-+      83,  36, -36, -83, -83, -36,  36,  83,  83,  36, -36, -83, -83, -36,  36,  83 },
-+    { 82,  22, -54, -90, -61,  13,  78,  85,  31, -46, -90, -67,   4,  73,  88,  38,
-+     -38, -88, -73,  -4,  67,  90,  46, -31, -85, -78, -13,  61,  90,  54, -22, -82 },
-+    { 80,   9, -70, -87, -25,  57,  90,  43, -43, -90, -57,  25,  87,  70,  -9, -80,
-+     -80,  -9,  70,  87,  25, -57, -90, -43,  43,  90,  57, -25, -87, -70,   9,  80 },
-+    { 78,  -4, -82, -73,  13,  85,  67, -22, -88, -61,  31,  90,  54, -38, -90, -46,
-+      46,  90,  38, -54, -90, -31,  61,  88,  22, -67, -85, -13,  73,  82,   4, -78 },
-+    { 75, -18, -89, -50,  50,  89,  18, -75, -75,  18,  89,  50, -50, -89, -18,  75,
-+      75, -18, -89, -50,  50,  89,  18, -75, -75,  18,  89,  50, -50, -89, -18,  75 },
-+    { 73, -31, -90, -22,  78,  67, -38, -90, -13,  82,  61, -46, -88,  -4,  85,  54,
-+     -54, -85,   4,  88,  46, -61, -82,  13,  90,  38, -67, -78,  22,  90,  31, -73 },
-+    { 70, -43, -87,   9,  90,  25, -80, -57,  57,  80, -25, -90,  -9,  87,  43, -70,
-+     -70,  43,  87,  -9, -90, -25,  80,  57, -57, -80,  25,  90,   9, -87, -43,  70 },
-+    { 67, -54, -78,  38,  85, -22, -90,   4,  90,  13, -88, -31,  82,  46, -73, -61,
-+      61,  73, -46, -82,  31,  88, -13, -90,  -4,  90,  22, -85, -38,  78,  54, -67 },
-+    { 64, -64, -64,  64,  64, -64, -64,  64,  64, -64, -64,  64,  64, -64, -64,  64,
-+      64, -64, -64,  64,  64, -64, -64,  64,  64, -64, -64,  64,  64, -64, -64,  64 },
-+    { 61, -73, -46,  82,  31, -88, -13,  90,  -4, -90,  22,  85, -38, -78,  54,  67,
-+     -67, -54,  78,  38, -85, -22,  90,   4, -90,  13,  88, -31, -82,  46,  73, -61 },
-+    { 57, -80, -25,  90,  -9, -87,  43,  70, -70, -43,  87,   9, -90,  25,  80, -57,
-+     -57,  80,  25, -90,   9,  87, -43, -70,  70,  43, -87,  -9,  90, -25, -80,  57 },
-+    { 54, -85,  -4,  88, -46, -61,  82,  13, -90,  38,  67, -78, -22,  90, -31, -73,
-+      73,  31, -90,  22,  78, -67, -38,  90, -13, -82,  61,  46, -88,   4,  85, -54 },
-+    { 50, -89,  18,  75, -75, -18,  89, -50, -50,  89, -18, -75,  75,  18, -89,  50,
-+      50, -89,  18,  75, -75, -18,  89, -50, -50,  89, -18, -75,  75,  18, -89,  50 },
-+    { 46, -90,  38,  54, -90,  31,  61, -88,  22,  67, -85,  13,  73, -82,   4,  78,
-+     -78,  -4,  82, -73, -13,  85, -67, -22,  88, -61, -31,  90, -54, -38,  90, -46 },
-+    { 43, -90,  57,  25, -87,  70,   9, -80,  80,  -9, -70,  87, -25, -57,  90, -43,
-+     -43,  90, -57, -25,  87, -70,  -9,  80, -80,   9,  70, -87,  25,  57, -90,  43 },
-+    { 38, -88,  73,  -4, -67,  90, -46, -31,  85, -78,  13,  61, -90,  54,  22, -82,
-+      82, -22, -54,  90, -61, -13,  78, -85,  31,  46, -90,  67,   4, -73,  88, -38 },
-+    { 36, -83,  83, -36, -36,  83, -83,  36,  36, -83,  83, -36, -36,  83, -83,  36,
-+      36, -83,  83, -36, -36,  83, -83,  36,  36, -83,  83, -36, -36,  83, -83,  36 },
-+    { 31, -78,  90, -61,   4,  54, -88,  82, -38, -22,  73, -90,  67, -13, -46,  85,
-+     -85,  46,  13, -67,  90, -73,  22,  38, -82,  88, -54,  -4,  61, -90,  78, -31 },
-+    { 25, -70,  90, -80,  43,   9, -57,  87, -87,  57,  -9, -43,  80, -90,  70, -25,
-+     -25,  70, -90,  80, -43,  -9,  57, -87,  87, -57,   9,  43, -80,  90, -70,  25 },
-+    { 22, -61,  85, -90,  73, -38,  -4,  46, -78,  90, -82,  54, -13, -31,  67, -88,
-+      88, -67,  31,  13, -54,  82, -90,  78, -46,   4,  38, -73,  90, -85,  61, -22 },
-+    { 18, -50,  75, -89,  89, -75,  50, -18, -18,  50, -75,  89, -89,  75, -50,  18,
-+      18, -50,  75, -89,  89, -75,  50, -18, -18,  50, -75,  89, -89,  75, -50,  18 },
-+    { 13, -38,  61, -78,  88, -90,  85, -73,  54, -31,   4,  22, -46,  67, -82,  90,
-+     -90,  82, -67,  46, -22,  -4,  31, -54,  73, -85,  90, -88,  78, -61,  38, -13 },
-+    {  9, -25,  43, -57,  70, -80,  87, -90,  90, -87,  80, -70,  57, -43,  25, -9,
-+      -9,  25, -43,  57, -70,  80, -87,  90, -90,  87, -80,  70, -57,  43, -25,   9 },
-+    {  4, -13,  22, -31,  38, -46,  54, -61,  67, -73,  78, -82,  85, -88,  90, -90,
-+      90, -90,  88, -85,  82, -78,  73, -67,  61, -54,  46, -38,  31, -22,  13,  -4 },
-+};
-+
-+DECLARE_ALIGNED(16, const int8_t, ff_hevc_rpi_epel_filters[7][4]) = {
-+    { -2, 58, 10, -2},
-+    { -4, 54, 16, -2},
-+    { -6, 46, 28, -4},
-+    { -4, 36, 36, -4},
-+    { -4, 28, 46, -6},
-+    { -2, 16, 54, -4},
-+    { -2, 10, 58, -2},
-+};
-+
-+DECLARE_ALIGNED(16, const int8_t, ff_hevc_rpi_qpel_filters[3][16]) = {
-+    { -1,  4,-10, 58, 17, -5,  1,  0, -1,  4,-10, 58, 17, -5,  1,  0},
-+    { -1,  4,-11, 40, 40,-11,  4, -1, -1,  4,-11, 40, 40,-11,  4, -1},
-+    {  0,  1, -5, 17, 58,-10,  4, -1,  0,  1, -5, 17, 58,-10,  4, -1}
-+};
-+
-+#define BIT_DEPTH 8
-+#include "rpi_hevcdsp_template.c"
-+#undef BIT_DEPTH
-+
-+#define BIT_DEPTH 9
-+#include "rpi_hevcdsp_template.c"
-+#undef BIT_DEPTH
-+
-+#define BIT_DEPTH 10
-+#include "rpi_hevcdsp_template.c"
-+#undef BIT_DEPTH
-+
-+#define BIT_DEPTH 12
-+#include "rpi_hevcdsp_template.c"
-+#undef BIT_DEPTH
-+
-+static uint32_t hevc_deblocking_boundary_strengths(int pus, int dup, const HEVCRpiMvField *curr, const HEVCRpiMvField *neigh,
-+                                               const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
-+                                               int in_inc0, int in_inc1)
-+{
-+    int shift = 32;
-+    uint32_t bs = 0;
-+    for (; pus > 0; pus--) {
-+        int strength, out;
-+        int curr_refL0 = curr_rpl0[curr->ref_idx[0]];
-+        int curr_refL1 = curr_rpl1[curr->ref_idx[1]];
-+        int nr_idx0 = neigh->ref_idx[0];
-+        int nr_idx1 = neigh->ref_idx[1];
-+        int neigh_refL0 = neigh_rpl0[nr_idx0];
-+        int neigh_refL1 = neigh_rpl1[nr_idx1];
-+
-+        av_assert0(nr_idx0 >= 0 && nr_idx0 <=31);
-+        av_assert0(nr_idx1 >= 0 && nr_idx1 <=31);
-+
-+#if 1 // This more directly matches the original implementation
-+        if (curr->pred_flag == PF_BI &&  neigh->pred_flag == PF_BI) {
-+            // same L0 and L1
-+            if (curr_refL0 == neigh_refL0 &&
-+                curr_refL0 == curr_refL1 &&
-+                neigh_refL0 == neigh_refL1) {
-+                if ((FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[0])) >= 4 ||
-+                     FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[1])) >= 4) &&
-+                    (FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[0])) >= 4 ||
-+                     FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[1])) >= 4))
-+                    strength = 1;
-+                else
-+                    strength = 0;
-+            } else if (neigh_refL0 == curr_refL0 &&
-+                       neigh_refL1 == curr_refL1) {
-+                if (FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[0])) >= 4 ||
-+                    FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[1])) >= 4)
-+                    strength = 1;
-+                else
-+                    strength = 0;
-+            } else if (neigh_refL1 == curr_refL0 &&
-+                       neigh_refL0 == curr_refL1) {
-+                if (FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[0])) >= 4 ||
-+                    FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[1])) >= 4)
-+                    strength = 1;
-+                else
-+                    strength = 0;
-+            } else {
-+                strength = 1;
-+            }
-+        } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV
-+            MvXY curr_mv0, neigh_mv0;
-+
-+            if (curr->pred_flag & 1) {
-+                curr_mv0   = curr->xy[0];
-+            } else {
-+                curr_mv0   = curr->xy[1];
-+                curr_refL0 = curr_refL1;
-+            }
-+
-+            if (neigh->pred_flag & 1) {
-+                neigh_mv0   = neigh->xy[0];
-+            } else {
-+                neigh_mv0   = neigh->xy[1];
-+                neigh_refL0 = neigh_refL1;
-+            }
-+
-+            if (curr_refL0 == neigh_refL0) {
-+                if (FFABS(MV_X(curr_mv0) - MV_X(neigh_mv0)) >= 4 || FFABS(MV_Y(curr_mv0) - MV_Y(neigh_mv0)) >= 4)
-+                    strength = 1;
-+                else
-+                    strength = 0;
-+            } else
-+                strength = 1;
-+        } else
-+            strength = 1;
-+#else // This has exactly the same effect, but is more suitable for vectorisation
-+        MvXY curr_mv[2];
-+        MvXY neigh_mv[2];
-+        memcpy(curr_mv, curr->xy, sizeof curr_mv);
-+        memcpy(neigh_mv, neigh->xy, sizeof neigh_mv);
-+
-+        if (!(curr->pred_flag & 2)) {
-+            curr_mv[1] = curr_mv[0];
-+            curr_refL1 = curr_refL0;
-+        }
-+        if (!(neigh->pred_flag & 2)) {
-+            neigh_mv[1] = neigh_mv[0];
-+            neigh_refL1 = neigh_refL0;
-+        }
-+        if (!(curr->pred_flag & 1)) {
-+            curr_mv[0] = curr_mv[1];
-+            curr_refL0 = curr_refL1;
-+        }
-+        if (!(neigh->pred_flag & 1)) {
-+            neigh_mv[0] = neigh_mv[1];
-+            neigh_refL0 = neigh_refL1;
-+        }
-+
-+        strength = 1;
-+
-+        strength &= (neigh_refL0 != curr_refL0) | (neigh_refL1 != curr_refL1) |
-+                (FFABS(MV_X(neigh_mv[0]) - MV_X(curr_mv[0])) >= 4) | (FFABS(MV_Y(neigh_mv[0]) - MV_Y(curr_mv[0])) >= 4) |
-+                (FFABS(MV_X(neigh_mv[1]) - MV_X(curr_mv[1])) >= 4) | (FFABS(MV_Y(neigh_mv[1]) - MV_Y(curr_mv[1])) >= 4);
-+
-+        strength &= (neigh_refL1 != curr_refL0) | (neigh_refL0 != curr_refL1) |
-+                (FFABS(MV_X(neigh_mv[1]) - MV_X(curr_mv[0])) >= 4) | (FFABS(MV_Y(neigh_mv[1]) - MV_Y(curr_mv[0])) >= 4) |
-+                (FFABS(MV_X(neigh_mv[0]) - MV_X(curr_mv[1])) >= 4) | (FFABS(MV_Y(neigh_mv[0]) - MV_Y(curr_mv[1])) >= 4);
-+
-+        strength |= (((curr->pred_flag + 1) ^ (neigh->pred_flag + 1)) >> 2);
-+#endif
-+
-+        curr += in_inc0 / sizeof (HEVCRpiMvField);
-+        neigh += in_inc1 / sizeof (HEVCRpiMvField);
-+
-+        for (out = dup; out > 0; out--)
-+        {
-+            bs = (bs >> 2) | (strength << 30);
-+            shift -= 2;
-+        }
-+    }
-+    return bs >> shift;
-+}
-+
-+
-+static void cpy_blk(uint8_t *dst, unsigned int stride_dst, const uint8_t *src, unsigned stride_src, unsigned int width, unsigned int height)
-+{
-+    unsigned int i, j;
-+
-+    if (((intptr_t)dst | (intptr_t)src | stride_dst | stride_src) & 15) {
-+        for (i = 0; i < height; i++) {
-+            for (j = 0; j < width; j+=8)
-+                AV_COPY64U(dst+j, src+j);
-+            dst += stride_dst;
-+            src += stride_src;
-+        }
-+    } else {
-+        for (i = 0; i < height; i++) {
-+            for (j = 0; j < width; j+=16)
-+                AV_COPY128(dst+j, src+j);
-+            dst += stride_dst;
-+            src += stride_src;
-+        }
-+    }
-+}
-+
-+
-+
-+void ff_hevc_rpi_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
-+{
-+#undef FUNC
-+#define FUNC(a, depth) a ## _ ## depth
-+
-+#undef PEL_FUNC
-+#define PEL_FUNC(dst1, idx1, idx2, a, depth)                                   \
-+    for(i = 0 ; i < 10 ; i++)                                                  \
-+{                                                                              \
-+    hevcdsp->dst1[i][idx1][idx2] = a ## _ ## depth;                            \
-+}
-+
-+#undef EPEL_FUNCS
-+#define EPEL_FUNCS(depth)                                                     \
-+    PEL_FUNC(put_hevc_epel, 0, 0, put_hevc_pel_pixels, depth);                \
-+    PEL_FUNC(put_hevc_epel, 0, 1, put_hevc_epel_h, depth);                    \
-+    PEL_FUNC(put_hevc_epel, 1, 0, put_hevc_epel_v, depth);                    \
-+    PEL_FUNC(put_hevc_epel, 1, 1, put_hevc_epel_hv, depth)
-+
-+#undef EPEL_UNI_FUNCS
-+#define EPEL_UNI_FUNCS(depth)                                                 \
-+    PEL_FUNC(put_hevc_epel_uni, 0, 0, put_hevc_pel_uni_pixels, depth);        \
-+    PEL_FUNC(put_hevc_epel_uni, 0, 1, put_hevc_epel_uni_h, depth);            \
-+    PEL_FUNC(put_hevc_epel_uni, 1, 0, put_hevc_epel_uni_v, depth);            \
-+    PEL_FUNC(put_hevc_epel_uni, 1, 1, put_hevc_epel_uni_hv, depth);           \
-+    PEL_FUNC(put_hevc_epel_uni_w, 0, 0, put_hevc_pel_uni_w_pixels, depth);    \
-+    PEL_FUNC(put_hevc_epel_uni_w, 0, 1, put_hevc_epel_uni_w_h, depth);        \
-+    PEL_FUNC(put_hevc_epel_uni_w, 1, 0, put_hevc_epel_uni_w_v, depth);        \
-+    PEL_FUNC(put_hevc_epel_uni_w, 1, 1, put_hevc_epel_uni_w_hv, depth)
-+
-+#undef EPEL_BI_FUNCS
-+#define EPEL_BI_FUNCS(depth)                                                \
-+    PEL_FUNC(put_hevc_epel_bi, 0, 0, put_hevc_pel_bi_pixels, depth);        \
-+    PEL_FUNC(put_hevc_epel_bi, 0, 1, put_hevc_epel_bi_h, depth);            \
-+    PEL_FUNC(put_hevc_epel_bi, 1, 0, put_hevc_epel_bi_v, depth);            \
-+    PEL_FUNC(put_hevc_epel_bi, 1, 1, put_hevc_epel_bi_hv, depth);           \
-+    PEL_FUNC(put_hevc_epel_bi_w, 0, 0, put_hevc_pel_bi_w_pixels, depth);    \
-+    PEL_FUNC(put_hevc_epel_bi_w, 0, 1, put_hevc_epel_bi_w_h, depth);        \
-+    PEL_FUNC(put_hevc_epel_bi_w, 1, 0, put_hevc_epel_bi_w_v, depth);        \
-+    PEL_FUNC(put_hevc_epel_bi_w, 1, 1, put_hevc_epel_bi_w_hv, depth)
-+
-+#undef QPEL_FUNCS
-+#define QPEL_FUNCS(depth)                                                     \
-+    PEL_FUNC(put_hevc_qpel, 0, 0, put_hevc_pel_pixels, depth);                \
-+    PEL_FUNC(put_hevc_qpel, 0, 1, put_hevc_qpel_h, depth);                    \
-+    PEL_FUNC(put_hevc_qpel, 1, 0, put_hevc_qpel_v, depth);                    \
-+    PEL_FUNC(put_hevc_qpel, 1, 1, put_hevc_qpel_hv, depth)
-+
-+#undef QPEL_UNI_FUNCS
-+#define QPEL_UNI_FUNCS(depth)                                                 \
-+    PEL_FUNC(put_hevc_qpel_uni, 0, 0, put_hevc_pel_uni_pixels, depth);        \
-+    PEL_FUNC(put_hevc_qpel_uni, 0, 1, put_hevc_qpel_uni_h, depth);            \
-+    PEL_FUNC(put_hevc_qpel_uni, 1, 0, put_hevc_qpel_uni_v, depth);            \
-+    PEL_FUNC(put_hevc_qpel_uni, 1, 1, put_hevc_qpel_uni_hv, depth);           \
-+    PEL_FUNC(put_hevc_qpel_uni_w, 0, 0, put_hevc_pel_uni_w_pixels, depth);    \
-+    PEL_FUNC(put_hevc_qpel_uni_w, 0, 1, put_hevc_qpel_uni_w_h, depth);        \
-+    PEL_FUNC(put_hevc_qpel_uni_w, 1, 0, put_hevc_qpel_uni_w_v, depth);        \
-+    PEL_FUNC(put_hevc_qpel_uni_w, 1, 1, put_hevc_qpel_uni_w_hv, depth)
-+
-+#undef QPEL_BI_FUNCS
-+#define QPEL_BI_FUNCS(depth)                                                  \
-+    PEL_FUNC(put_hevc_qpel_bi, 0, 0, put_hevc_pel_bi_pixels, depth);          \
-+    PEL_FUNC(put_hevc_qpel_bi, 0, 1, put_hevc_qpel_bi_h, depth);              \
-+    PEL_FUNC(put_hevc_qpel_bi, 1, 0, put_hevc_qpel_bi_v, depth);              \
-+    PEL_FUNC(put_hevc_qpel_bi, 1, 1, put_hevc_qpel_bi_hv, depth);             \
-+    PEL_FUNC(put_hevc_qpel_bi_w, 0, 0, put_hevc_pel_bi_w_pixels, depth);      \
-+    PEL_FUNC(put_hevc_qpel_bi_w, 0, 1, put_hevc_qpel_bi_w_h, depth);          \
-+    PEL_FUNC(put_hevc_qpel_bi_w, 1, 0, put_hevc_qpel_bi_w_v, depth);          \
-+    PEL_FUNC(put_hevc_qpel_bi_w, 1, 1, put_hevc_qpel_bi_w_hv, depth)
-+
-+#define SLICED_ADD_RESIDUAL(depth)\
-+    hevcdsp->add_residual_u[0]      = FUNC(add_residual4x4_u, depth);         \
-+    hevcdsp->add_residual_u[1]      = FUNC(add_residual8x8_u, depth);         \
-+    hevcdsp->add_residual_u[2]      = FUNC(add_residual16x16_u, depth);       \
-+    hevcdsp->add_residual_u[3]      = FUNC(add_residual32x32_u, depth);       \
-+    hevcdsp->add_residual_v[0]      = FUNC(add_residual4x4_v, depth);         \
-+    hevcdsp->add_residual_v[1]      = FUNC(add_residual8x8_v, depth);         \
-+    hevcdsp->add_residual_v[2]      = FUNC(add_residual16x16_v, depth);       \
-+    hevcdsp->add_residual_v[3]      = FUNC(add_residual32x32_v, depth);       \
-+    hevcdsp->add_residual_c[0]      = FUNC(add_residual4x4_c, depth);         \
-+    hevcdsp->add_residual_c[1]      = FUNC(add_residual8x8_c, depth);         \
-+    hevcdsp->add_residual_c[2]      = FUNC(add_residual16x16_c, depth);       \
-+    hevcdsp->add_residual_c[3]      = FUNC(add_residual32x32_c, depth);       \
-+    hevcdsp->add_residual_dc_c[0]   = FUNC(add_residual4x4_dc_c, depth);         \
-+    hevcdsp->add_residual_dc_c[1]   = FUNC(add_residual8x8_dc_c, depth);         \
-+    hevcdsp->add_residual_dc_c[2]   = FUNC(add_residual16x16_dc_c, depth);       \
-+    hevcdsp->add_residual_dc_c[3]   = FUNC(add_residual32x32_dc_c, depth);       \
-+    hevcdsp->put_pcm_c              = FUNC(put_pcm_c, depth)
-+#define SLICED_LOOP_FILTERS(depth)\
-+    hevcdsp->hevc_h_loop_filter_luma2 = FUNC(hevc_h_loop_filter_luma2, depth); \
-+    hevcdsp->hevc_v_loop_filter_luma2 = FUNC(hevc_v_loop_filter_luma2, depth); \
-+    hevcdsp->hevc_h_loop_filter_uv    = FUNC(hevc_h_loop_filter_uv, depth);    \
-+    hevcdsp->hevc_v_loop_filter_uv2   = FUNC(hevc_v_loop_filter_uv2, depth)
-+#define SLICED_SAO(depth)\
-+    for (i = 0; i != SAO_FILTER_N; ++i) {                                     \
-+        hevcdsp->sao_band_filter_c[i] = FUNC(sao_band_filter_c, depth);       \
-+        hevcdsp->sao_edge_filter_c[i] = FUNC(sao_edge_filter_c, depth);       \
-+    }                                                                         \
-+    hevcdsp->sao_edge_restore_c[0] = FUNC(sao_edge_restore_c_0, depth);       \
-+    hevcdsp->sao_edge_restore_c[1] = FUNC(sao_edge_restore_c_1, depth)
-+
-+#define HEVC_DSP(depth)                                                     \
-+    hevcdsp->put_pcm                = FUNC(put_pcm, depth);                 \
-+    hevcdsp->add_residual[0]        = FUNC(add_residual4x4, depth);         \
-+    hevcdsp->add_residual[1]        = FUNC(add_residual8x8, depth);         \
-+    hevcdsp->add_residual[2]        = FUNC(add_residual16x16, depth);       \
-+    hevcdsp->add_residual[3]        = FUNC(add_residual32x32, depth);       \
-+    hevcdsp->add_residual_dc[0]     = FUNC(add_residual4x4_dc, depth);         \
-+    hevcdsp->add_residual_dc[1]     = FUNC(add_residual8x8_dc, depth);         \
-+    hevcdsp->add_residual_dc[2]     = FUNC(add_residual16x16_dc, depth);       \
-+    hevcdsp->add_residual_dc[3]     = FUNC(add_residual32x32_dc, depth);       \
-+    SLICED_ADD_RESIDUAL(depth);                                             \
-+    hevcdsp->dequant                = FUNC(dequant, depth);                 \
-+    hevcdsp->transform_rdpcm        = FUNC(transform_rdpcm, depth);         \
-+    hevcdsp->transform_4x4_luma     = FUNC(transform_4x4_luma, depth);      \
-+    hevcdsp->idct[0]                = FUNC(idct_4x4, depth);                \
-+    hevcdsp->idct[1]                = FUNC(idct_8x8, depth);                \
-+    hevcdsp->idct[2]                = FUNC(idct_16x16, depth);              \
-+    hevcdsp->idct[3]                = FUNC(idct_32x32, depth);              \
-+                                                                            \
-+    hevcdsp->idct_dc[0]             = FUNC(idct_4x4_dc, depth);             \
-+    hevcdsp->idct_dc[1]             = FUNC(idct_8x8_dc, depth);             \
-+    hevcdsp->idct_dc[2]             = FUNC(idct_16x16_dc, depth);           \
-+    hevcdsp->idct_dc[3]             = FUNC(idct_32x32_dc, depth);           \
-+                                                                            \
-+    for (i = 0; i != SAO_FILTER_N; ++i) {                                   \
-+        hevcdsp->sao_band_filter[i] = FUNC(sao_band_filter, depth);         \
-+        hevcdsp->sao_edge_filter[i] = FUNC(sao_edge_filter, depth);         \
-+    }                                                                       \
-+    hevcdsp->sao_edge_restore[0] = FUNC(sao_edge_restore_0, depth);            \
-+    hevcdsp->sao_edge_restore[1] = FUNC(sao_edge_restore_1, depth);            \
-+    SLICED_SAO(depth);                                                         \
-+                                                                               \
-+    QPEL_FUNCS(depth);                                                         \
-+    QPEL_UNI_FUNCS(depth);                                                     \
-+    QPEL_BI_FUNCS(depth);                                                      \
-+    EPEL_FUNCS(depth);                                                         \
-+    EPEL_UNI_FUNCS(depth);                                                     \
-+    EPEL_BI_FUNCS(depth);                                                      \
-+                                                                               \
-+    SLICED_LOOP_FILTERS(depth);                                                \
-+    hevcdsp->hevc_h_loop_filter_luma     = FUNC(hevc_h_loop_filter_luma, depth);   \
-+    hevcdsp->hevc_v_loop_filter_luma     = FUNC(hevc_v_loop_filter_luma, depth);   \
-+    hevcdsp->hevc_h_loop_filter_chroma   = FUNC(hevc_h_loop_filter_chroma, depth); \
-+    hevcdsp->hevc_v_loop_filter_chroma   = FUNC(hevc_v_loop_filter_chroma, depth); \
-+    hevcdsp->hevc_h_loop_filter_luma_c   = FUNC(hevc_h_loop_filter_luma, depth);   \
-+    hevcdsp->hevc_v_loop_filter_luma_c   = FUNC(hevc_v_loop_filter_luma, depth);   \
-+    hevcdsp->hevc_h_loop_filter_chroma_c = FUNC(hevc_h_loop_filter_chroma, depth); \
-+    hevcdsp->hevc_v_loop_filter_chroma_c = FUNC(hevc_v_loop_filter_chroma, depth)
-+int i = 0;
-+
-+    switch (bit_depth) {
-+    case 9:
-+        HEVC_DSP(9);
-+        break;
-+    case 10:
-+        HEVC_DSP(10);
-+        break;
-+    case 12:
-+        HEVC_DSP(12);
-+        break;
-+    default:
-+        HEVC_DSP(8);
-+        break;
-+    }
-+
-+    hevcdsp->hevc_deblocking_boundary_strengths = hevc_deblocking_boundary_strengths;
-+    hevcdsp->cpy_blk = cpy_blk;
-+
-+    if (ARCH_PPC)
-+        ff_hevc_rpi_dsp_init_ppc(hevcdsp, bit_depth);
-+    if (ARCH_X86)
-+        ff_hevc_rpi_dsp_init_x86(hevcdsp, bit_depth);
-+    if (ARCH_ARM)
-+        ff_hevcdsp_rpi_init_arm(hevcdsp, bit_depth);
-+    if (ARCH_MIPS)
-+        ff_hevc_rpi_dsp_init_mips(hevcdsp, bit_depth);
-+}
---- /dev/null
-+++ b/libavcodec/rpi_hevcdsp.h
-@@ -0,0 +1,177 @@
-+/*
-+ * HEVC video decoder
-+ *
-+ * Copyright (C) 2012 - 2013 Guillaume Martres
-+ * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere
-+ *
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#ifndef AVCODEC_RPI_HEVCDSP_H
-+#define AVCODEC_RPI_HEVCDSP_H
-+
-+#include "hevc.h"
-+#include "get_bits.h"
-+
-+struct HEVCRpiMvField;
-+
-+#define MAX_PB_SIZE 64
-+
-+#define RPI_HEVC_SAO_BUF_STRIDE 160
-+
-+
-+typedef struct RpiSAOParams {
-+    uint8_t band_position[3];   ///< sao_band_position (Y,U,V)
-+    uint8_t eo_class[3];        ///< sao_eo_class      (Y,U=V)
-+    uint8_t type_idx[3];        ///< sao_type_idx      (Y,U=V)
-+
-+    int16_t offset_val[3][5];   ///<SaoOffsetVal       (Y,U,V)
-+
-+} RpiSAOParams;
-+
-+
-+// This controls how many sao dsp functions there are
-+// N=5 has width = 8, 16, 32, 48, 64
-+// N=6 adds a function for width=24 (in fn array el 5 so existing code should
-+// still work)
-+#define SAO_FILTER_N 6
-+
-+
-+typedef struct HEVCDSPContext {
-+    void (*put_pcm)(uint8_t *_dst, ptrdiff_t _stride, int width, int height,
-+                    struct GetBitContext *gb, int pcm_bit_depth);
-+
-+    void (*add_residual[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride);
-+    void (*add_residual_dc[4])(uint8_t *dst, ptrdiff_t stride, int dc);
-+    void (*add_residual_u[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride, int dc_v);
-+    void (*add_residual_v[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride, int dc_u);
-+
-+    void (*add_residual_c[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
-+    void (*add_residual_dc_c[4])(uint8_t *dst, ptrdiff_t stride, int32_t dc_uv);
-+    void (*put_pcm_c)(uint8_t *_dst, ptrdiff_t _stride, int width, int height,
-+                    struct GetBitContext *gb, int pcm_bit_depth);
-+
-+    void (*dequant)(int16_t *coeffs, int16_t log2_size);
-+
-+    void (*transform_rdpcm)(int16_t *coeffs, int16_t log2_size, int mode);
-+
-+    void (*transform_4x4_luma)(int16_t *coeffs);
-+
-+    void (*idct[4])(int16_t *coeffs, int col_limit);
-+
-+    void (*idct_dc[4])(int16_t *coeffs);
-+
-+    void (*sao_band_filter[SAO_FILTER_N])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
-+                               int16_t *sao_offset_val, int sao_left_class, int width, int height);
-+    void (*sao_band_filter_c[SAO_FILTER_N])(uint8_t *_dst, const uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
-+                               const int16_t *sao_offset_val_u, int sao_left_class_u,
-+                               const int16_t *sao_offset_val_v, int sao_left_class_v,
-+                               int width, int height);
-+
-+    /* implicit stride_src parameter has value of 2 * MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE */
-+    void (*sao_edge_filter[SAO_FILTER_N])(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst,
-+                               int16_t *sao_offset_val, int sao_eo_class, int width, int height);
-+    void (*sao_edge_filter_c[SAO_FILTER_N])(uint8_t *_dst /* align 16 */, const uint8_t *_src /* align 32 */, ptrdiff_t stride_dst,
-+                               const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, int sao_eo_class, int width, int height);
-+
-+    void (*sao_edge_restore[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
-+                                struct RpiSAOParams *sao, int *borders, int _width, int _height, int c_idx,
-+                                uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge);
-+    void (*sao_edge_restore_c[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
-+                                struct RpiSAOParams *sao, int *borders, int _width, int _height, int c_idx,
-+                                uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge);
-+
-+    void (*put_hevc_qpel[10][2][2])(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
-+                                    int height, intptr_t mx, intptr_t my, int width);
-+    void (*put_hevc_qpel_uni[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
-+                                        int height, intptr_t mx, intptr_t my, int width);
-+    void (*put_hevc_qpel_uni_w[10][2][2])(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+                                          int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width);
-+
-+    void (*put_hevc_qpel_bi[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+                                       int16_t *src2,
-+                                       int height, intptr_t mx, intptr_t my, int width);
-+    void (*put_hevc_qpel_bi_w[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+                                         int16_t *src2,
-+                                         int height, int denom, int wx0, int wx1,
-+                                         int ox0, int ox1, intptr_t mx, intptr_t my, int width);
-+    void (*put_hevc_epel[10][2][2])(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
-+                                    int height, intptr_t mx, intptr_t my, int width);
-+
-+    void (*put_hevc_epel_uni[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+                                        int height, intptr_t mx, intptr_t my, int width);
-+    void (*put_hevc_epel_uni_w[10][2][2])(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+                                          int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width);
-+    void (*put_hevc_epel_bi[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+                                       int16_t *src2,
-+                                       int height, intptr_t mx, intptr_t my, int width);
-+    void (*put_hevc_epel_bi_w[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+                                         int16_t *src2,
-+                                         int height, int denom, int wx0, int ox0, int wx1,
-+                                         int ox1, intptr_t mx, intptr_t my, int width);
-+
-+    void (*hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
-+                                    int beta, int32_t *tc,
-+                                    uint8_t *no_p, uint8_t *no_q);
-+    void (*hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
-+                                    int beta, int32_t *tc,
-+                                    uint8_t *no_p, uint8_t *no_q);
-+    void (*hevc_h_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
-+                                      int32_t *tc, uint8_t *no_p, uint8_t *no_q);
-+    void (*hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
-+                                      int32_t *tc, uint8_t *no_p, uint8_t *no_q);
-+    void (*hevc_h_loop_filter_luma_c)(uint8_t *pix, ptrdiff_t stride,
-+                                      int beta, int32_t *tc,
-+                                      uint8_t *no_p, uint8_t *no_q);
-+    void (*hevc_v_loop_filter_luma_c)(uint8_t *pix, ptrdiff_t stride,
-+                                      int beta, int32_t *tc,
-+                                      uint8_t *no_p, uint8_t *no_q);
-+    void (*hevc_h_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride,
-+                                        int32_t *tc, uint8_t *no_p,
-+                                        uint8_t *no_q);
-+    void (*hevc_v_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride,
-+                                        int32_t *tc, uint8_t *no_p,
-+                                        uint8_t *no_q);
-+    void (*hevc_h_loop_filter_luma2)(uint8_t * _pix_r,
-+                                 unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f);
-+    void (*hevc_v_loop_filter_luma2)(uint8_t * _pix_r,
-+                                 unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f,
-+                                 uint8_t * _pix_l);
-+    void (*hevc_h_loop_filter_uv)(uint8_t * src, unsigned int stride, uint32_t tc4,
-+                                 unsigned int no_f);
-+    void (*hevc_v_loop_filter_uv2)(uint8_t * src_r, unsigned int stride, uint32_t tc4,
-+                                 uint8_t * src_l,
-+                                 unsigned int no_f);
-+
-+    uint32_t (*hevc_deblocking_boundary_strengths)(int pus, int dup, const struct HEVCRpiMvField *curr, const struct HEVCRpiMvField *neigh,
-+                                               const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
-+                                               int in_inc0, int inc_inc1);
-+
-+    void (* cpy_blk)(uint8_t * dst, unsigned int dst_stride, const uint8_t * src, unsigned int src_stride, unsigned int width, unsigned int height);
-+} HEVCDSPContext;
-+
-+void ff_hevc_rpi_dsp_init(HEVCDSPContext *hpc, int bit_depth);
-+
-+extern const int8_t ff_hevc_rpi_epel_filters[7][4];
-+extern const int8_t ff_hevc_rpi_qpel_filters[3][16];
-+
-+void ff_hevc_rpi_dsp_init_ppc(HEVCDSPContext *c, const int bit_depth);
-+void ff_hevc_rpi_dsp_init_x86(HEVCDSPContext *c, const int bit_depth);
-+void ff_hevcdsp_rpi_init_arm(HEVCDSPContext *c, const int bit_depth);
-+void ff_hevc_rpi_dsp_init_mips(HEVCDSPContext *c, const int bit_depth);
-+#endif /* AVCODEC_RPI_HEVCDSP_H */
---- /dev/null
-+++ b/libavcodec/rpi_hevcdsp_template.c
-@@ -0,0 +1,2279 @@
-+/*
-+ * HEVC video decoder
-+ *
-+ * Copyright (C) 2012 - 2013 Guillaume Martres
-+ * Copyright (C) 2018 John Cox for Raspberry Pi (Trading)
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#include "get_bits.h"
-+#include "rpi_hevcdec.h"
-+
-+#include "bit_depth_template.c"
-+#include "rpi_hevcdsp.h"
-+
-+#include "rpi_hevc_shader_template.h"
-+
-+static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height,
-+                          GetBitContext *gb, int pcm_bit_depth)
-+{
-+    int x, y;
-+    pixel *dst = (pixel *)_dst;
-+
-+    stride /= sizeof(pixel);
-+
-+    for (y = 0; y < height; y++) {
-+        for (x = 0; x < width; x++)
-+            dst[x] = get_bits(gb, pcm_bit_depth) << (BIT_DEPTH - pcm_bit_depth);
-+        dst += stride;
-+    }
-+}
-+
-+static void FUNC(put_pcm_c)(uint8_t *_dst, ptrdiff_t stride, int width, int height,
-+                          GetBitContext *gb, int pcm_bit_depth)
-+{
-+    int x, y;
-+    pixel *dst = (pixel *)_dst;
-+
-+    stride /= sizeof(pixel);
-+
-+    for (y = 0; y < height; y++) {
-+        for (x = 0; x < width; x++)
-+            dst[x*2] = get_bits(gb, pcm_bit_depth) << (BIT_DEPTH - pcm_bit_depth);
-+        dst += stride;
-+    }
-+
-+    dst = (pixel *)_dst + 1;
-+    for (y = 0; y < height; y++) {
-+        for (x = 0; x < width; x++)
-+            dst[x*2] = get_bits(gb, pcm_bit_depth) << (BIT_DEPTH - pcm_bit_depth);
-+        dst += stride;
-+    }
-+}
-+
-+static av_always_inline void FUNC(add_residual)(uint8_t *_dst, int16_t *res,
-+                                                ptrdiff_t stride, int size)
-+{
-+    int x, y;
-+    pixel *dst = (pixel *)_dst;
-+
-+    stride /= sizeof(pixel);
-+
-+    for (y = 0; y < size; y++) {
-+        for (x = 0; x < size; x++) {
-+            dst[x] = av_clip_pixel(dst[x] + *res);
-+            res++;
-+        }
-+        dst += stride;
-+    }
-+}
-+
-+static av_always_inline void FUNC(add_residual_dc)(uint8_t *_dst, ptrdiff_t stride, const int dc, int size)
-+{
-+    int x, y;
-+    pixel *dst = (pixel *)_dst;
-+
-+    stride /= sizeof(pixel);
-+
-+    for (y = 0; y < size; y++) {
-+        for (x = 0; x < size; x++) {
-+            dst[x] = av_clip_pixel(dst[x] + dc);
-+        }
-+        dst += stride;
-+    }
-+}
-+
-+
-+static av_always_inline void FUNC(add_residual_u)(uint8_t *_dst, const int16_t *res,
-+                                                ptrdiff_t stride, const int dc_v, int size)
-+{
-+    int x, y;
-+    pixel *dst = (pixel *)_dst;
-+
-+    stride /= sizeof(pixel);
-+
-+    for (y = 0; y < size; y++) {
-+        for (x = 0; x < size * 2; x += 2) {
-+            dst[x] = av_clip_pixel(dst[x] + *res);
-+            dst[x + 1] = av_clip_pixel(dst[x + 1] + dc_v);
-+            res++;
-+        }
-+        dst += stride;
-+    }
-+}
-+
-+static av_always_inline void FUNC(add_residual_v)(uint8_t *_dst, const int16_t *res,
-+                                                ptrdiff_t stride, const int dc_u, int size)
-+{
-+    int x, y;
-+    pixel *dst = (pixel *)_dst;
-+
-+    stride /= sizeof(pixel);
-+
-+    for (y = 0; y < size; y++) {
-+        for (x = 0; x < size * 2; x += 2) {
-+            dst[x] = av_clip_pixel(dst[x] + dc_u);
-+            dst[x + 1] = av_clip_pixel(dst[x + 1] + *res);
-+            res++;
-+        }
-+        dst += stride;
-+    }
-+}
-+
-+static av_always_inline void FUNC(add_residual_c)(uint8_t *_dst, const int16_t *res,
-+                                                ptrdiff_t stride, unsigned int size)
-+{
-+    unsigned int x, y;
-+    pixel *dst = (pixel *)_dst;
-+    const int16_t * ru = res;
-+    const int16_t * rv = res + size * size;
-+
-+//    rpi_sand_dump16("ARC In Pred", _dst, stride, 0, 0, 0, size, size, 1);
-+//    rpi_sand_dump16("ARC In RU", ru, size * 2, 0, 0, 0, size, size, 0);
-+//    rpi_sand_dump16("ARC In RV", rv, size * 2, 0, 0, 0, size, size, 0);
-+
-+    stride /= sizeof(pixel);
-+
-+    for (y = 0; y < size; y++) {
-+        for (x = 0; x < size * 2; x += 2) {
-+            dst[x + 0] = av_clip_pixel(dst[x + 0] + *ru++);
-+            dst[x + 1] = av_clip_pixel(dst[x + 1] + *rv++);
-+        }
-+        dst += stride;
-+    }
-+
-+//    rpi_sand_dump16("ARC Out", _dst, stride * 2, 0, 0, 0, size, size, 1);
-+}
-+
-+
-+static av_always_inline void FUNC(add_residual_dc_c)(uint8_t *_dst, ptrdiff_t stride, const int32_t dc, int size)
-+{
-+    int x, y;
-+    pixel *dst = (pixel *)_dst;
-+    const int dc_v = dc >> 16;
-+    const int dc_u = (dc << 16) >> 16;
-+
-+    stride /= sizeof(pixel);
-+
-+    for (y = 0; y < size; y++) {
-+        for (x = 0; x < size * 2; x += 2) {
-+            dst[x] = av_clip_pixel(dst[x] + dc_u);
-+            dst[x + 1] = av_clip_pixel(dst[x + 1] + dc_v);
-+        }
-+        dst += stride;
-+    }
-+}
-+
-+
-+static void FUNC(add_residual4x4)(uint8_t *_dst, int16_t *res,
-+                                  ptrdiff_t stride)
-+{
-+    FUNC(add_residual)(_dst, res, stride, 4);
-+}
-+
-+static void FUNC(add_residual8x8)(uint8_t *_dst, int16_t *res,
-+                                  ptrdiff_t stride)
-+{
-+    FUNC(add_residual)(_dst, res, stride, 8);
-+}
-+
-+static void FUNC(add_residual16x16)(uint8_t *_dst, int16_t *res,
-+                                    ptrdiff_t stride)
-+{
-+    FUNC(add_residual)(_dst, res, stride, 16);
-+}
-+
-+static void FUNC(add_residual32x32)(uint8_t *_dst, int16_t *res,
-+                                    ptrdiff_t stride)
-+{
-+    FUNC(add_residual)(_dst, res, stride, 32);
-+}
-+
-+static void FUNC(add_residual4x4_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
-+{
-+    FUNC(add_residual_dc)(_dst, stride, dc, 4);
-+}
-+
-+static void FUNC(add_residual8x8_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
-+{
-+    FUNC(add_residual_dc)(_dst, stride, dc, 8);
-+}
-+
-+static void FUNC(add_residual16x16_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
-+{
-+    FUNC(add_residual_dc)(_dst, stride, dc, 16);
-+}
-+
-+static void FUNC(add_residual32x32_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
-+{
-+    FUNC(add_residual_dc)(_dst, stride, dc, 32);
-+}
-+
-+// -- U -- (plaited)
-+
-+static void FUNC(add_residual4x4_u)(uint8_t *_dst, const int16_t * res,
-+                                  ptrdiff_t stride, int dc_u)
-+{
-+    FUNC(add_residual_u)(_dst, res, stride, dc_u, 4);
-+}
-+
-+static void FUNC(add_residual8x8_u)(uint8_t *_dst, const int16_t * res,
-+                                  ptrdiff_t stride, int dc_u)
-+{
-+    FUNC(add_residual_u)(_dst, res, stride, dc_u, 8);
-+}
-+
-+static void FUNC(add_residual16x16_u)(uint8_t *_dst, const int16_t * res,
-+                                    ptrdiff_t stride, int dc_u)
-+{
-+    FUNC(add_residual_u)(_dst, res, stride, dc_u, 16);
-+}
-+
-+static void FUNC(add_residual32x32_u)(uint8_t *_dst, const int16_t * res,
-+                                    ptrdiff_t stride, int dc_u)
-+{
-+    // Should never occur for 420, which is all that sand supports
-+    av_assert0(0);
-+}
-+
-+// -- V -- (plaited)
-+
-+static void FUNC(add_residual4x4_v)(uint8_t *_dst, const int16_t * res,
-+                                  ptrdiff_t stride, int dc_v)
-+{
-+    FUNC(add_residual_v)(_dst, res, stride, dc_v, 4);
-+}
-+
-+static void FUNC(add_residual8x8_v)(uint8_t *_dst, const int16_t * res,
-+                                  ptrdiff_t stride, int dc_v)
-+{
-+    FUNC(add_residual_v)(_dst, res, stride, dc_v, 8);
-+}
-+
-+static void FUNC(add_residual16x16_v)(uint8_t *_dst, const int16_t * res,
-+                                    ptrdiff_t stride, int dc_v)
-+{
-+    FUNC(add_residual_v)(_dst, res, stride, dc_v, 16);
-+}
-+
-+static void FUNC(add_residual32x32_v)(uint8_t *_dst, const int16_t * res,
-+                                    ptrdiff_t stride, int dc_v)
-+{
-+    // Should never occur for 420, which is all that sand supports
-+    av_assert0(0);
-+}
-+
-+// -- C -- (plaited - both U & V)
-+
-+static void FUNC(add_residual4x4_c)(uint8_t *_dst, const int16_t * res,
-+                                  ptrdiff_t stride)
-+{
-+    FUNC(add_residual_c)(_dst, res, stride, 4);
-+}
-+
-+static void FUNC(add_residual8x8_c)(uint8_t *_dst, const int16_t * res,
-+                                  ptrdiff_t stride)
-+{
-+    FUNC(add_residual_c)(_dst, res, stride, 8);
-+}
-+
-+static void FUNC(add_residual16x16_c)(uint8_t *_dst, const int16_t * res,
-+                                    ptrdiff_t stride)
-+{
-+    FUNC(add_residual_c)(_dst, res, stride, 16);
-+}
-+
-+static void FUNC(add_residual32x32_c)(uint8_t *_dst, const int16_t * res,
-+                                    ptrdiff_t stride)
-+{
-+    // Should never occur for 420, which is all that sand supports
-+    av_assert0(0);
-+}
-+
-+static void FUNC(add_residual4x4_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc)
-+{
-+    FUNC(add_residual_dc_c)(_dst, stride, dc, 4);
-+}
-+
-+static void FUNC(add_residual8x8_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc)
-+{
-+    FUNC(add_residual_dc_c)(_dst, stride, dc, 8);
-+}
-+
-+static void FUNC(add_residual16x16_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc)
-+{
-+    FUNC(add_residual_dc_c)(_dst, stride, dc, 16);
-+}
-+
-+static void FUNC(add_residual32x32_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc)
-+{
-+    // Should never occur for 420, which is all that sand supports
-+    av_assert0(0);
-+}
-+
-+
-+static void FUNC(transform_rdpcm)(int16_t *_coeffs, int16_t log2_size, int mode)
-+{
-+    int16_t *coeffs = (int16_t *) _coeffs;
-+    int x, y;
-+    int size = 1 << log2_size;
-+
-+    if (mode) {
-+        coeffs += size;
-+        for (y = 0; y < size - 1; y++) {
-+            for (x = 0; x < size; x++)
-+                coeffs[x] += coeffs[x - size];
-+            coeffs += size;
-+        }
-+    } else {
-+        for (y = 0; y < size; y++) {
-+            for (x = 1; x < size; x++)
-+                coeffs[x] += coeffs[x - 1];
-+            coeffs += size;
-+        }
-+    }
-+}
-+
-+static void FUNC(dequant)(int16_t *coeffs, int16_t log2_size)
-+{
-+    int shift  = 15 - BIT_DEPTH - log2_size;
-+    int x, y;
-+    int size = 1 << log2_size;
-+
-+    if (shift > 0) {
-+        int offset = 1 << (shift - 1);
-+        for (y = 0; y < size; y++) {
-+            for (x = 0; x < size; x++) {
-+                *coeffs = (*coeffs + offset) >> shift;
-+                coeffs++;
-+            }
-+        }
-+    } else {
-+        for (y = 0; y < size; y++) {
-+            for (x = 0; x < size; x++) {
-+                *coeffs = *coeffs << -shift;
-+                coeffs++;
-+            }
-+        }
-+    }
-+}
-+
-+#define SET(dst, x)   (dst) = (x)
-+#define SCALE(dst, x) (dst) = av_clip_int16(((x) + add) >> shift)
-+
-+#define TR_4x4_LUMA(dst, src, step, assign)                             \
-+    do {                                                                \
-+        int c0 = src[0 * step] + src[2 * step];                         \
-+        int c1 = src[2 * step] + src[3 * step];                         \
-+        int c2 = src[0 * step] - src[3 * step];                         \
-+        int c3 = 74 * src[1 * step];                                    \
-+                                                                        \
-+        assign(dst[2 * step], 74 * (src[0 * step] -                     \
-+                                    src[2 * step] +                     \
-+                                    src[3 * step]));                    \
-+        assign(dst[0 * step], 29 * c0 + 55 * c1 + c3);                  \
-+        assign(dst[1 * step], 55 * c2 - 29 * c1 + c3);                  \
-+        assign(dst[3 * step], 55 * c0 + 29 * c2 - c3);                  \
-+    } while (0)
-+
-+static void FUNC(transform_4x4_luma)(int16_t *coeffs)
-+{
-+    int i;
-+    int shift    = 7;
-+    int add      = 1 << (shift - 1);
-+    int16_t *src = coeffs;
-+
-+    for (i = 0; i < 4; i++) {
-+        TR_4x4_LUMA(src, src, 4, SCALE);
-+        src++;
-+    }
-+
-+    shift = 20 - BIT_DEPTH;
-+    add   = 1 << (shift - 1);
-+    for (i = 0; i < 4; i++) {
-+        TR_4x4_LUMA(coeffs, coeffs, 1, SCALE);
-+        coeffs += 4;
-+    }
-+}
-+
-+#undef TR_4x4_LUMA
-+
-+#define TR_4(dst, src, dstep, sstep, assign, end)                 \
-+    do {                                                          \
-+        const int e0 = 64 * src[0 * sstep] + 64 * src[2 * sstep]; \
-+        const int e1 = 64 * src[0 * sstep] - 64 * src[2 * sstep]; \
-+        const int o0 = 83 * src[1 * sstep] + 36 * src[3 * sstep]; \
-+        const int o1 = 36 * src[1 * sstep] - 83 * src[3 * sstep]; \
-+                                                                  \
-+        assign(dst[0 * dstep], e0 + o0);                          \
-+        assign(dst[1 * dstep], e1 + o1);                          \
-+        assign(dst[2 * dstep], e1 - o1);                          \
-+        assign(dst[3 * dstep], e0 - o0);                          \
-+    } while (0)
-+
-+#define TR_8(dst, src, dstep, sstep, assign, end)                 \
-+    do {                                                          \
-+        int i, j;                                                 \
-+        int e_8[4];                                               \
-+        int o_8[4] = { 0 };                                       \
-+        for (i = 0; i < 4; i++)                                   \
-+            for (j = 1; j < end; j += 2)                          \
-+                o_8[i] += transform[4 * j][i] * src[j * sstep];   \
-+        TR_4(e_8, src, 1, 2 * sstep, SET, 4);                     \
-+                                                                  \
-+        for (i = 0; i < 4; i++) {                                 \
-+            assign(dst[i * dstep], e_8[i] + o_8[i]);              \
-+            assign(dst[(7 - i) * dstep], e_8[i] - o_8[i]);        \
-+        }                                                         \
-+    } while (0)
-+
-+#define TR_16(dst, src, dstep, sstep, assign, end)                \
-+    do {                                                          \
-+        int i, j;                                                 \
-+        int e_16[8];                                              \
-+        int o_16[8] = { 0 };                                      \
-+        for (i = 0; i < 8; i++)                                   \
-+            for (j = 1; j < end; j += 2)                          \
-+                o_16[i] += transform[2 * j][i] * src[j * sstep];  \
-+        TR_8(e_16, src, 1, 2 * sstep, SET, 8);                    \
-+                                                                  \
-+        for (i = 0; i < 8; i++) {                                 \
-+            assign(dst[i * dstep], e_16[i] + o_16[i]);            \
-+            assign(dst[(15 - i) * dstep], e_16[i] - o_16[i]);     \
-+        }                                                         \
-+    } while (0)
-+
-+#define TR_32(dst, src, dstep, sstep, assign, end)                \
-+    do {                                                          \
-+        int i, j;                                                 \
-+        int e_32[16];                                             \
-+        int o_32[16] = { 0 };                                     \
-+        for (i = 0; i < 16; i++)                                  \
-+            for (j = 1; j < end; j += 2)                          \
-+                o_32[i] += transform[j][i] * src[j * sstep];      \
-+        TR_16(e_32, src, 1, 2 * sstep, SET, end / 2);             \
-+                                                                  \
-+        for (i = 0; i < 16; i++) {                                \
-+            assign(dst[i * dstep], e_32[i] + o_32[i]);            \
-+            assign(dst[(31 - i) * dstep], e_32[i] - o_32[i]);     \
-+        }                                                         \
-+    } while (0)
-+
-+#define IDCT_VAR4(H)                                              \
-+    int limit2 = FFMIN(col_limit + 4, H)
-+#define IDCT_VAR8(H)                                              \
-+    int limit  = FFMIN(col_limit, H);                             \
-+    int limit2 = FFMIN(col_limit + 4, H)
-+#define IDCT_VAR16(H)   IDCT_VAR8(H)
-+#define IDCT_VAR32(H)   IDCT_VAR8(H)
-+
-+#define IDCT(H)                                                   \
-+static void FUNC(idct_ ## H ## x ## H )(int16_t *coeffs,          \
-+                                        int col_limit)            \
-+{                                                                 \
-+    int i;                                                        \
-+    int      shift = 7;                                           \
-+    int      add   = 1 << (shift - 1);                            \
-+    int16_t *src   = coeffs;                                      \
-+    IDCT_VAR ## H(H);                                             \
-+                                                                  \
-+    for (i = 0; i < H; i++) {                                     \
-+        TR_ ## H(src, src, H, H, SCALE, limit2);                  \
-+        if (limit2 < H && i%4 == 0 && !!i)                        \
-+            limit2 -= 4;                                          \
-+        src++;                                                    \
-+    }                                                             \
-+                                                                  \
-+    shift = 20 - BIT_DEPTH;                                       \
-+    add   = 1 << (shift - 1);                                     \
-+    for (i = 0; i < H; i++) {                                     \
-+        TR_ ## H(coeffs, coeffs, 1, 1, SCALE, limit);             \
-+        coeffs += H;                                              \
-+    }                                                             \
-+}
-+
-+#define IDCT_DC(H)                                                \
-+static void FUNC(idct_ ## H ## x ## H ## _dc)(int16_t *coeffs)    \
-+{                                                                 \
-+    int i, j;                                                     \
-+    int shift = 14 - BIT_DEPTH;                                   \
-+    int add   = 1 << (shift - 1);                                 \
-+    int coeff = (((coeffs[0] + 1) >> 1) + add) >> shift;          \
-+                                                                  \
-+    for (j = 0; j < H; j++) {                                     \
-+        for (i = 0; i < H; i++) {                                 \
-+            coeffs[i + j * H] = coeff;                            \
-+        }                                                         \
-+    }                                                             \
-+}
-+
-+IDCT( 4)
-+IDCT( 8)
-+IDCT(16)
-+IDCT(32)
-+
-+IDCT_DC( 4)
-+IDCT_DC( 8)
-+IDCT_DC(16)
-+IDCT_DC(32)
-+
-+#undef TR_4
-+#undef TR_8
-+#undef TR_16
-+#undef TR_32
-+
-+#undef SET
-+#undef SCALE
-+
-+static void FUNC(sao_band_filter)(uint8_t *_dst, uint8_t *_src,
-+                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
-+                                  int16_t *sao_offset_val, int sao_left_class,
-+                                  int width, int height)
-+{
-+    pixel *dst = (pixel *)_dst;
-+    pixel *src = (pixel *)_src;
-+    int offset_table[32] = { 0 };
-+    int k, y, x;
-+    int shift  = BIT_DEPTH - 5;
-+
-+    stride_dst /= sizeof(pixel);
-+    stride_src /= sizeof(pixel);
-+
-+    for (k = 0; k < 4; k++)
-+        offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1];
-+    for (y = 0; y < height; y++) {
-+        for (x = 0; x < width; x++)
-+            dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
-+        dst += stride_dst;
-+        src += stride_src;
-+    }
-+}
-+
-+#define CMP(a, b) (((a) > (b)) - ((a) < (b)))
-+
-+static void FUNC(sao_edge_filter)(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,
-+                                  int eo, int width, int height) {
-+
-+    static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
-+    static const int8_t pos[4][2][2] = {
-+        { { -1,  0 }, {  1, 0 } }, // horizontal
-+        { {  0, -1 }, {  0, 1 } }, // vertical
-+        { { -1, -1 }, {  1, 1 } }, // 45 degree
-+        { {  1, -1 }, { -1, 1 } }, // 135 degree
-+    };
-+    pixel *dst = (pixel *)_dst;
-+    pixel *src = (pixel *)_src;
-+    int a_stride, b_stride;
-+    int x, y;
-+    const ptrdiff_t stride_src = RPI_HEVC_SAO_BUF_STRIDE / sizeof(pixel);
-+    stride_dst /= sizeof(pixel);
-+
-+    a_stride = pos[eo][0][0] + pos[eo][0][1] * stride_src;
-+    b_stride = pos[eo][1][0] + pos[eo][1][1] * stride_src;
-+    for (y = 0; y < height; y++) {
-+        for (x = 0; x < width; x++) {
-+            int diff0 = CMP(src[x], src[x + a_stride]);
-+            int diff1 = CMP(src[x], src[x + b_stride]);
-+            int offset_val        = edge_idx[2 + diff0 + diff1];
-+            dst[x] = av_clip_pixel(src[x] + sao_offset_val[offset_val]);
-+        }
-+        src += stride_src;
-+        dst += stride_dst;
-+    }
-+}
-+
-+
-+#if BIT_DEPTH == 10
-+// We need a 32 bit variation for the _c restores so hijack bit depth 10
-+#undef pixel
-+#undef BIT_DEPTH
-+#define pixel uint32_t
-+#define BIT_DEPTH 32
-+// All 16 bit variations are the same
-+#define sao_edge_restore_0_10 sao_edge_restore_0_9
-+#define sao_edge_restore_1_10 sao_edge_restore_1_9
-+#define sao_edge_restore_0_11 sao_edge_restore_0_9
-+#define sao_edge_restore_1_11 sao_edge_restore_1_9
-+#define sao_edge_restore_0_12 sao_edge_restore_0_9
-+#define sao_edge_restore_1_12 sao_edge_restore_1_9
-+#define sao_edge_restore_0_13 sao_edge_restore_0_9
-+#define sao_edge_restore_1_13 sao_edge_restore_1_9
-+#define sao_edge_restore_0_14 sao_edge_restore_0_9
-+#define sao_edge_restore_1_14 sao_edge_restore_1_9
-+#define sao_edge_restore_0_15 sao_edge_restore_0_9
-+#define sao_edge_restore_1_15 sao_edge_restore_1_9
-+#define sao_edge_restore_0_16 sao_edge_restore_0_9
-+#define sao_edge_restore_1_16 sao_edge_restore_1_9
-+#endif
-+#if BIT_DEPTH <= 9 || BIT_DEPTH == 32
-+static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src,
-+                                    ptrdiff_t stride_dst, ptrdiff_t stride_src, RpiSAOParams *sao,
-+                                    int *borders, int _width, int _height,
-+                                    int c_idx, uint8_t *vert_edge,
-+                                    uint8_t *horiz_edge, uint8_t *diag_edge)
-+{
-+    int x, y;
-+    pixel *dst = (pixel *)_dst;
-+    pixel *src = (pixel *)_src;
-+    int sao_eo_class    = sao->eo_class[c_idx];
-+    int init_x = 0, width = _width, height = _height;
-+
-+    stride_dst /= sizeof(pixel);
-+    stride_src /= sizeof(pixel);
-+
-+    if (sao_eo_class != SAO_EO_VERT) {
-+        if (borders[0]) {
-+            for (y = 0; y < height; y++) {
-+                dst[y * stride_dst] = src[y * stride_src];
-+            }
-+            init_x = 1;
-+        }
-+        if (borders[2]) {
-+            int offset     = width - 1;
-+            for (x = 0; x < height; x++) {
-+                dst[x * stride_dst + offset] = src[x * stride_src + offset];
-+            }
-+            width--;
-+        }
-+    }
-+    if (sao_eo_class != SAO_EO_HORIZ) {
-+        if (borders[1]) {
-+            for (x = init_x; x < width; x++)
-+                dst[x] = src[x];
-+        }
-+        if (borders[3]) {
-+            ptrdiff_t y_stride_dst = stride_dst * (height - 1);
-+            ptrdiff_t y_stride_src = stride_src * (height - 1);
-+            for (x = init_x; x < width; x++)
-+                dst[x + y_stride_dst] = src[x + y_stride_src];
-+            height--;
-+        }
-+    }
-+}
-+
-+static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src,
-+                                    ptrdiff_t stride_dst, ptrdiff_t stride_src, RpiSAOParams *sao,
-+                                    int *borders, int _width, int _height,
-+                                    int c_idx, uint8_t *vert_edge,
-+                                    uint8_t *horiz_edge, uint8_t *diag_edge)
-+{
-+    int x, y;
-+    pixel *dst = (pixel *)_dst;
-+    pixel *src = (pixel *)_src;
-+    int sao_eo_class    = sao->eo_class[c_idx];
-+    int init_x = 0, init_y = 0, width = _width, height = _height;
-+
-+    stride_dst /= sizeof(pixel);
-+    stride_src /= sizeof(pixel);
-+
-+    if (sao_eo_class != SAO_EO_VERT) {
-+        if (borders[0]) {
-+            for (y = 0; y < height; y++) {
-+                dst[y * stride_dst] = src[y * stride_src];
-+            }
-+            init_x = 1;
-+        }
-+        if (borders[2]) {
-+            int offset     = width - 1;
-+            for (x = 0; x < height; x++) {
-+                dst[x * stride_dst + offset] = src[x * stride_src + offset];
-+            }
-+            width--;
-+        }
-+    }
-+    if (sao_eo_class != SAO_EO_HORIZ) {
-+        if (borders[1]) {
-+            for (x = init_x; x < width; x++)
-+                dst[x] = src[x];
-+            init_y = 1;
-+        }
-+        if (borders[3]) {
-+            ptrdiff_t y_stride_dst = stride_dst * (height - 1);
-+            ptrdiff_t y_stride_src = stride_src * (height - 1);
-+            for (x = init_x; x < width; x++)
-+                dst[x + y_stride_dst] = src[x + y_stride_src];
-+            height--;
-+        }
-+    }
-+
-+    {
-+        int save_upper_left  = !diag_edge[0] && sao_eo_class == SAO_EO_135D && !borders[0] && !borders[1];
-+        int save_upper_right = !diag_edge[1] && sao_eo_class == SAO_EO_45D  && !borders[1] && !borders[2];
-+        int save_lower_right = !diag_edge[2] && sao_eo_class == SAO_EO_135D && !borders[2] && !borders[3];
-+        int save_lower_left  = !diag_edge[3] && sao_eo_class == SAO_EO_45D  && !borders[0] && !borders[3];
-+
-+        // Restore pixels that can't be modified
-+        if(vert_edge[0] && sao_eo_class != SAO_EO_VERT) {
-+            for(y = init_y+save_upper_left; y< height-save_lower_left; y++)
-+                dst[y*stride_dst] = src[y*stride_src];
-+        }
-+        if(vert_edge[1] && sao_eo_class != SAO_EO_VERT) {
-+            for(y = init_y+save_upper_right; y< height-save_lower_right; y++)
-+                dst[y*stride_dst+width-1] = src[y*stride_src+width-1];
-+        }
-+
-+        if(horiz_edge[0] && sao_eo_class != SAO_EO_HORIZ) {
-+            for(x = init_x+save_upper_left; x < width-save_upper_right; x++)
-+                dst[x] = src[x];
-+        }
-+        if(horiz_edge[1] && sao_eo_class != SAO_EO_HORIZ) {
-+            for(x = init_x+save_lower_left; x < width-save_lower_right; x++)
-+                dst[(height-1)*stride_dst+x] = src[(height-1)*stride_src+x];
-+        }
-+        if(diag_edge[0] && sao_eo_class == SAO_EO_135D)
-+            dst[0] = src[0];
-+        if(diag_edge[1] && sao_eo_class == SAO_EO_45D)
-+            dst[width-1] = src[width-1];
-+        if(diag_edge[2] && sao_eo_class == SAO_EO_135D)
-+            dst[stride_dst*(height-1)+width-1] = src[stride_src*(height-1)+width-1];
-+        if(diag_edge[3] && sao_eo_class == SAO_EO_45D)
-+            dst[stride_dst*(height-1)] = src[stride_src*(height-1)];
-+
-+    }
-+}
-+#endif
-+#if BIT_DEPTH == 32
-+#undef BIT_DEPTH
-+#undef pixel
-+#define BIT_DEPTH 10
-+#define pixel uint16_t
-+#endif
-+
-+// --- Plaited chroma versions
-+
-+static void FUNC(sao_band_filter_c)(uint8_t *_dst, const uint8_t *_src,
-+                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
-+                                  const int16_t *sao_offset_val_u, int sao_left_class_u,
-+                                  const int16_t *sao_offset_val_v, int sao_left_class_v,
-+                                  int width, int height)
-+{
-+    pixel *dst = (pixel *)_dst;
-+    pixel *src = (pixel *)_src;
-+    int offset_table_u[32] = { 0 };
-+    int offset_table_v[32] = { 0 };
-+    int k, y, x;
-+    int shift  = BIT_DEPTH - 5;
-+
-+    stride_dst /= sizeof(pixel);
-+    stride_src /= sizeof(pixel);
-+    width *= 2;
-+
-+    for (k = 0; k < 4; k++)
-+    {
-+        offset_table_u[(k + sao_left_class_u) & 31] = sao_offset_val_u[k + 1];
-+        offset_table_v[(k + sao_left_class_v) & 31] = sao_offset_val_v[k + 1];
-+    }
-+    for (y = 0; y < height; y++) {
-+        for (x = 0; x < width; x += 2)
-+        {
-+//            printf("dst=%p, src=%p, x=%d, shift=%d\n", dst, src, x, shift);
-+//            printf("offsets=%x,%x\n", src[x + 0], src[x + 1]);
-+            // *** & 31 shouldn't be wanted but just now we generate broken input that
-+            // crashes us in 10-bit world
-+            dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[(src[x + 0] >> shift) & 31]);
-+            dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[(src[x + 1] >> shift) & 31]);
-+        }
-+        dst += stride_dst;
-+        src += stride_src;
-+    }
-+}
-+
-+static void FUNC(sao_edge_filter_c)(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
-+                                  const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v,
-+                                  int eo, int width, int height) {
-+
-+    static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
-+    static const int8_t pos[4][2][2] = {
-+        { { -1,  0 }, {  1, 0 } }, // horizontal
-+        { {  0, -1 }, {  0, 1 } }, // vertical
-+        { { -1, -1 }, {  1, 1 } }, // 45 degree
-+        { {  1, -1 }, { -1, 1 } }, // 135 degree
-+    };
-+    pixel *dst = (pixel *)_dst;
-+    pixel *src = (pixel *)_src;
-+    int a_stride, b_stride;
-+    int x, y;
-+    const ptrdiff_t stride_src = RPI_HEVC_SAO_BUF_STRIDE / sizeof(pixel);
-+
-+    stride_dst /= sizeof(pixel);
-+    width *= 2;
-+
-+    av_assert0(width <= 64);
-+
-+    a_stride = pos[eo][0][0] * 2 + pos[eo][0][1] * stride_src;
-+    b_stride = pos[eo][1][0] * 2 + pos[eo][1][1] * stride_src;
-+    for (y = 0; y < height; y++) {
-+        for (x = 0; x < width; x += 2) {
-+            int diff0u = CMP(src[x], src[x + a_stride]);
-+            int diff1u = CMP(src[x], src[x + b_stride]);
-+            int offset_valu        = edge_idx[2 + diff0u + diff1u];
-+            int diff0v = CMP(src[x+1], src[x+1 + a_stride]);
-+            int diff1v = CMP(src[x+1], src[x+1 + b_stride]);
-+            int offset_valv        = edge_idx[2 + diff0v + diff1v];
-+            dst[x] = av_clip_pixel(src[x] + sao_offset_val_u[offset_valu]);
-+            dst[x+1] = av_clip_pixel(src[x+1] + sao_offset_val_v[offset_valv]);
-+        }
-+        src += stride_src;
-+        dst += stride_dst;
-+    }
-+}
-+
-+// Do once
-+#if BIT_DEPTH == 8
-+// Any old 2 byte 'normal' restore will work for these
-+#define sao_edge_restore_c_0_8  sao_edge_restore_0_16
-+#define sao_edge_restore_c_1_8  sao_edge_restore_1_16
-+// We need 32 bit for 9 bit+
-+#define sao_edge_restore_c_0_9  sao_edge_restore_0_32
-+#define sao_edge_restore_c_1_9  sao_edge_restore_1_32
-+#define sao_edge_restore_c_0_10 sao_edge_restore_0_32
-+#define sao_edge_restore_c_1_10 sao_edge_restore_1_32
-+#define sao_edge_restore_c_0_11 sao_edge_restore_0_32
-+#define sao_edge_restore_c_1_11 sao_edge_restore_1_32
-+#define sao_edge_restore_c_0_12 sao_edge_restore_0_32
-+#define sao_edge_restore_c_1_12 sao_edge_restore_1_32
-+#define sao_edge_restore_c_0_13 sao_edge_restore_0_32
-+#define sao_edge_restore_c_1_13 sao_edge_restore_1_32
-+#define sao_edge_restore_c_0_14 sao_edge_restore_0_32
-+#define sao_edge_restore_c_1_14 sao_edge_restore_1_32
-+#define sao_edge_restore_c_0_15 sao_edge_restore_0_32
-+#define sao_edge_restore_c_1_15 sao_edge_restore_1_32
-+#define sao_edge_restore_c_0_16 sao_edge_restore_0_32
-+#define sao_edge_restore_c_1_16 sao_edge_restore_1_32
-+#endif
-+
-+#undef CMP
-+
-+////////////////////////////////////////////////////////////////////////////////
-+//
-+////////////////////////////////////////////////////////////////////////////////
-+static void FUNC(put_hevc_pel_pixels)(int16_t *dst,
-+                                      uint8_t *_src, ptrdiff_t _srcstride,
-+                                      int height, intptr_t mx, intptr_t my, int width)
-+{
-+    int x, y;
-+    pixel *src          = (pixel *)_src;
-+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
-+
-+    for (y = 0; y < height; y++) {
-+        for (x = 0; x < width; x++)
-+            dst[x] = src[x] << (14 - BIT_DEPTH);
-+        src += srcstride;
-+        dst += MAX_PB_SIZE;
-+    }
-+}
-+
-+static void FUNC(put_hevc_pel_uni_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+                                          int height, intptr_t mx, intptr_t my, int width)
-+{
-+    int y;
-+    pixel *src          = (pixel *)_src;
-+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
-+    pixel *dst          = (pixel *)_dst;
-+    ptrdiff_t dststride = _dststride / sizeof(pixel);
-+
-+    for (y = 0; y < height; y++) {
-+        memcpy(dst, src, width * sizeof(pixel));
-+        src += srcstride;
-+        dst += dststride;
-+    }
-+}
-+
-+static void FUNC(put_hevc_pel_bi_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+                                         int16_t *src2,
-+                                         int height, intptr_t mx, intptr_t my, int width)
-+{
-+    int x, y;
-+    pixel *src          = (pixel *)_src;
-+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
-+    pixel *dst          = (pixel *)_dst;
-+    ptrdiff_t dststride = _dststride / sizeof(pixel);
-+
-+    int shift = 14  + 1 - BIT_DEPTH;
-+#if BIT_DEPTH < 14
-+    int offset = 1 << (shift - 1);
-+#else
-+    int offset = 0;
-+#endif
-+
-+    for (y = 0; y < height; y++) {
-+        for (x = 0; x < width; x++)
-+            dst[x] = av_clip_pixel(((src[x] << (14 - BIT_DEPTH)) + src2[x] + offset) >> shift);
-+        src  += srcstride;
-+        dst  += dststride;
-+        src2 += MAX_PB_SIZE;
-+    }
-+}
-+
-+static void FUNC(put_hevc_pel_uni_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+                                            int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
-+{
-+    int x, y;
-+    pixel *src          = (pixel *)_src;
-+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
-+    pixel *dst          = (pixel *)_dst;
-+    ptrdiff_t dststride = _dststride / sizeof(pixel);
-+    int shift = denom + 14 - BIT_DEPTH;
-+#if BIT_DEPTH < 14
-+    int offset = 1 << (shift - 1);
-+#else
-+    int offset = 0;
-+#endif
-+
-+    ox     = ox * (1 << (BIT_DEPTH - 8));
-+    for (y = 0; y < height; y++) {
-+        for (x = 0; x < width; x++)
-+            dst[x] = av_clip_pixel((((src[x] << (14 - BIT_DEPTH)) * wx + offset) >> shift) + ox);
-+        src += srcstride;
-+        dst += dststride;
-+    }
-+}
-+
-+static void FUNC(put_hevc_pel_bi_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+                                           int16_t *src2,
-+                                           int height, int denom, int wx0, int wx1,
-+                                           int ox0, int ox1, intptr_t mx, intptr_t my, int width)
-+{
-+    int x, y;
-+    pixel *src          = (pixel *)_src;
-+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
-+    pixel *dst          = (pixel *)_dst;
-+    ptrdiff_t dststride = _dststride / sizeof(pixel);
-+
-+    int shift = 14  + 1 - BIT_DEPTH;
-+    int log2Wd = denom + shift - 1;
-+
-+    ox0     = ox0 * (1 << (BIT_DEPTH - 8));
-+    ox1     = ox1 * (1 << (BIT_DEPTH - 8));
-+    for (y = 0; y < height; y++) {
-+        for (x = 0; x < width; x++) {
-+            dst[x] = av_clip_pixel(( (src[x] << (14 - BIT_DEPTH)) * wx1 + src2[x] * wx0 + (ox0 + ox1 + 1) * (1 << log2Wd)) >> (log2Wd + 1));
-+        }
-+        src  += srcstride;
-+        dst  += dststride;
-+        src2 += MAX_PB_SIZE;
-+    }
-+}
-+
-+////////////////////////////////////////////////////////////////////////////////
-+//
-+////////////////////////////////////////////////////////////////////////////////
-+#define QPEL_FILTER(src, stride)                                               \
-+    (filter[0] * src[x - 3 * stride] +                                         \
-+     filter[1] * src[x - 2 * stride] +                                         \
-+     filter[2] * src[x -     stride] +                                         \
-+     filter[3] * src[x             ] +                                         \
-+     filter[4] * src[x +     stride] +                                         \
-+     filter[5] * src[x + 2 * stride] +                                         \
-+     filter[6] * src[x + 3 * stride] +                                         \
-+     filter[7] * src[x + 4 * stride])
-+
-+static void FUNC(put_hevc_qpel_h)(int16_t *dst,
-+                                  uint8_t *_src, ptrdiff_t _srcstride,
-+                                  int height, intptr_t mx, intptr_t my, int width)
-+{
-+    int x, y;
-+    pixel        *src       = (pixel*)_src;
-+    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
-+    const int8_t *filter    = ff_hevc_rpi_qpel_filters[mx - 1];
-+    for (y = 0; y < height; y++) {
-+        for (x = 0; x < width; x++)
-+            dst[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
-+        src += srcstride;
-+        dst += MAX_PB_SIZE;
-+    }
-+}
-+
-+static void FUNC(put_hevc_qpel_v)(int16_t *dst,
-+                                  uint8_t *_src, ptrdiff_t _srcstride,
-+                                  int height, intptr_t mx, intptr_t my, int width)
-+{
-+    int x, y;
-+    pixel        *src       = (pixel*)_src;
-+    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
-+    const int8_t *filter    = ff_hevc_rpi_qpel_filters[my - 1];
-+    for (y = 0; y < height; y++)  {
-+        for (x = 0; x < width; x++)
-+            dst[x] = QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8);
-+        src += srcstride;
-+        dst += MAX_PB_SIZE;
-+    }
-+}
-+
-+static void FUNC(put_hevc_qpel_hv)(int16_t *dst,
-+                                   uint8_t *_src,
-+                                   ptrdiff_t _srcstride,
-+                                   int height, intptr_t mx,
-+                                   intptr_t my, int width)
-+{
-+    int x, y;
-+    const int8_t *filter;
-+    pixel *src = (pixel*)_src;
-+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
-+    int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
-+    int16_t *tmp = tmp_array;
-+
-+    src   -= QPEL_EXTRA_BEFORE * srcstride;
-+    filter = ff_hevc_rpi_qpel_filters[mx - 1];
-+    for (y = 0; y < height + QPEL_EXTRA; y++) {
-+        for (x = 0; x < width; x++)
-+            tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
-+        src += srcstride;
-+        tmp += MAX_PB_SIZE;
-+    }
-+
-+    tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
-+    filter = ff_hevc_rpi_qpel_filters[my - 1];
-+    for (y = 0; y < height; y++) {
-+        for (x = 0; x < width; x++)
-+            dst[x] = QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6;
-+        tmp += MAX_PB_SIZE;
-+        dst += MAX_PB_SIZE;
-+    }
-+}
-+
-+static void FUNC(put_hevc_qpel_uni_h)(uint8_t *_dst,  ptrdiff_t _dststride,
-+                                      uint8_t *_src, ptrdiff_t _srcstride,
-+                                      int height, intptr_t mx, intptr_t my, int width)
-+{
-+    int x, y;
-+    pixel        *src       = (pixel*)_src;
-+    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
-+    pixel *dst          = (pixel *)_dst;
-+    ptrdiff_t dststride = _dststride / sizeof(pixel);
-+    const int8_t *filter    = ff_hevc_rpi_qpel_filters[mx - 1];
-+    int shift = 14 - BIT_DEPTH;
-+
-+#if BIT_DEPTH < 14
-+    int offset = 1 << (shift - 1);
-+#else
-+    int offset = 0;
-+#endif
-+
-+    for (y = 0; y < height; y++) {
-+        for (x = 0; x < width; x++)
-+            dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift);
-+        src += srcstride;
-+        dst += dststride;
-+    }
-+}
-+
-+static void FUNC(put_hevc_qpel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+                                     int16_t *src2,
-+                                     int height, intptr_t mx, intptr_t my, int width)
-+{
-+    int x, y;
-+    pixel        *src       = (pixel*)_src;
-+    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
-+    pixel *dst          = (pixel *)_dst;
-+    ptrdiff_t dststride = _dststride / sizeof(pixel);
-+
-+    const int8_t *filter    = ff_hevc_rpi_qpel_filters[mx - 1];
-+
-+    int shift = 14  + 1 - BIT_DEPTH;
-+#if BIT_DEPTH < 14
-+    int offset = 1 << (shift - 1);
-+#else
-+    int offset = 0;
-+#endif
-+
-+    for (y = 0; y < height; y++) {
-+        for (x = 0; x < width; x++)
-+            dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
-+        src  += srcstride;
-+        dst  += dststride;
-+        src2 += MAX_PB_SIZE;
-+    }
-+}
-+
-+static void FUNC(put_hevc_qpel_uni_v)(uint8_t *_dst,  ptrdiff_t _dststride,
-+                                     uint8_t *_src, ptrdiff_t _srcstride,
-+                                     int height, intptr_t mx, intptr_t my, int width)
-+{
-+    int x, y;
-+    pixel        *src       = (pixel*)_src;
-+    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
-+    pixel *dst          = (pixel *)_dst;
-+    ptrdiff_t dststride = _dststride / sizeof(pixel);
-+    const int8_t *filter    = ff_hevc_rpi_qpel_filters[my - 1];
-+    int shift = 14 - BIT_DEPTH;
-+
-+#if BIT_DEPTH < 14
-+    int offset = 1 << (shift - 1);
-+#else
-+    int offset = 0;
-+#endif
-+
-+    for (y = 0; y < height; y++) {
-+        for (x = 0; x < width; x++)
-+            dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift);
-+        src += srcstride;
-+        dst += dststride;
-+    }
-+}
-+
-+
-+static void FUNC(put_hevc_qpel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+                                     int16_t *src2,
-+                                     int height, intptr_t mx, intptr_t my, int width)
-+{
-+    int x, y;
-+    pixel        *src       = (pixel*)_src;
-+    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
-+    pixel *dst          = (pixel *)_dst;
-+    ptrdiff_t dststride = _dststride / sizeof(pixel);
-+
-+    const int8_t *filter    = ff_hevc_rpi_qpel_filters[my - 1];
-+
-+    int shift = 14 + 1 - BIT_DEPTH;
-+#if BIT_DEPTH < 14
-+    int offset = 1 << (shift - 1);
-+#else
-+    int offset = 0;
-+#endif
-+
-+    for (y = 0; y < height; y++) {
-+        for (x = 0; x < width; x++)
-+            dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
-+        src  += srcstride;
-+        dst  += dststride;
-+        src2 += MAX_PB_SIZE;
-+    }
-+}
-+
-+static void FUNC(put_hevc_qpel_uni_hv)(uint8_t *_dst,  ptrdiff_t _dststride,
-+                                       uint8_t *_src, ptrdiff_t _srcstride,
-+                                       int height, intptr_t mx, intptr_t my, int width)
-+{
-+    int x, y;
-+    const int8_t *filter;
-+    pixel *src = (pixel*)_src;
-+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
-+    pixel *dst          = (pixel *)_dst;
-+    ptrdiff_t dststride = _dststride / sizeof(pixel);
-+    int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
-+    int16_t *tmp = tmp_array;
-+    int shift =  14 - BIT_DEPTH;
-+
-+#if BIT_DEPTH < 14
-+    int offset = 1 << (shift - 1);
-+#else
-+    int offset = 0;
-+#endif
-+
-+    src   -= QPEL_EXTRA_BEFORE * srcstride;
-+    filter = ff_hevc_rpi_qpel_filters[mx - 1];
-+    for (y = 0; y < height + QPEL_EXTRA; y++) {
-+        for (x = 0; x < width; x++)
-+            tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
-+        src += srcstride;
-+        tmp += MAX_PB_SIZE;
-+    }
-+
-+    tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
-+    filter = ff_hevc_rpi_qpel_filters[my - 1];
-+
-+    for (y = 0; y < height; y++) {
-+        for (x = 0; x < width; x++)
-+            dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift);
-+        tmp += MAX_PB_SIZE;
-+        dst += dststride;
-+    }
-+}
-+
-+static void FUNC(put_hevc_qpel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+                                      int16_t *src2,
-+                                      int height, intptr_t mx, intptr_t my, int width)
-+{
-+    int x, y;
-+    const int8_t *filter;
-+    pixel *src = (pixel*)_src;
-+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
-+    pixel *dst          = (pixel *)_dst;
-+    ptrdiff_t dststride = _dststride / sizeof(pixel);
-+    int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
-+    int16_t *tmp = tmp_array;
-+    int shift = 14 + 1 - BIT_DEPTH;
-+#if BIT_DEPTH < 14
-+    int offset = 1 << (shift - 1);
-+#else
-+    int offset = 0;
-+#endif
-+
-+    src   -= QPEL_EXTRA_BEFORE * srcstride;
-+    filter = ff_hevc_rpi_qpel_filters[mx - 1];
-+    for (y = 0; y < height + QPEL_EXTRA; y++) {
-+        for (x = 0; x < width; x++)
-+            tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
-+        src += srcstride;
-+        tmp += MAX_PB_SIZE;
-+    }
-+
-+    tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
-+    filter = ff_hevc_rpi_qpel_filters[my - 1];
-+
-+    for (y = 0; y < height; y++) {
-+        for (x = 0; x < width; x++)
-+            dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift);
-+        tmp  += MAX_PB_SIZE;
-+        dst  += dststride;
-+        src2 += MAX_PB_SIZE;
-+    }
-+}
-+
-+static void FUNC(put_hevc_qpel_uni_w_h)(uint8_t *_dst,  ptrdiff_t _dststride,
-+                                        uint8_t *_src, ptrdiff_t _srcstride,
-+                                        int height, int denom, int wx, int ox,
-+                                        intptr_t mx, intptr_t my, int width)
-+{
-+    int x, y;
-+    pixel        *src       = (pixel*)_src;
-+    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
-+    pixel *dst          = (pixel *)_dst;
-+    ptrdiff_t dststride = _dststride / sizeof(pixel);
-+    const int8_t *filter    = ff_hevc_rpi_qpel_filters[mx - 1];
-+    int shift = denom + 14 - BIT_DEPTH;
-+#if BIT_DEPTH < 14
-+    int offset = 1 << (shift - 1);
-+#else
-+    int offset = 0;
-+#endif
-+
-+    ox = ox * (1 << (BIT_DEPTH - 8));
-+    for (y = 0; y < height; y++) {
-+        for (x = 0; x < width; x++)
-+            dst[x] = av_clip_pixel((((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
-+        src += srcstride;
-+        dst += dststride;
-+    }
-+}
-+
-+static void FUNC(put_hevc_qpel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+                                       int16_t *src2,
-+                                       int height, int denom, int wx0, int wx1,
-+                                       int ox0, int ox1, intptr_t mx, intptr_t my, int width)
-+{
-+    int x, y;
-+    pixel        *src       = (pixel*)_src;
-+    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
-+    pixel *dst          = (pixel *)_dst;
-+    ptrdiff_t dststride = _dststride / sizeof(pixel);
-+
-+    const int8_t *filter    = ff_hevc_rpi_qpel_filters[mx - 1];
-+
-+    int shift = 14  + 1 - BIT_DEPTH;
-+    int log2Wd = denom + shift - 1;
-+
-+    ox0     = ox0 * (1 << (BIT_DEPTH - 8));
-+    ox1     = ox1 * (1 << (BIT_DEPTH - 8));
-+    for (y = 0; y < height; y++) {
-+        for (x = 0; x < width; x++)
-+            dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
-+                                    ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
-+        src  += srcstride;
-+        dst  += dststride;
-+        src2 += MAX_PB_SIZE;
-+    }
-+}
-+
-+static void FUNC(put_hevc_qpel_uni_w_v)(uint8_t *_dst,  ptrdiff_t _dststride,
-+                                        uint8_t *_src, ptrdiff_t _srcstride,
-+                                        int height, int denom, int wx, int ox,
-+                                        intptr_t mx, intptr_t my, int width)
-+{
-+    int x, y;
-+    pixel        *src       = (pixel*)_src;
-+    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
-+    pixel *dst          = (pixel *)_dst;
-+    ptrdiff_t dststride = _dststride / sizeof(pixel);
-+    const int8_t *filter    = ff_hevc_rpi_qpel_filters[my - 1];
-+    int shift = denom + 14 - BIT_DEPTH;
-+#if BIT_DEPTH < 14
-+    int offset = 1 << (shift - 1);
-+#else
-+    int offset = 0;
-+#endif
-+
-+    ox = ox * (1 << (BIT_DEPTH - 8));
-+    for (y = 0; y < height; y++) {
-+        for (x = 0; x < width; x++)
-+            dst[x] = av_clip_pixel((((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
-+        src += srcstride;
-+        dst += dststride;
-+    }
-+}
-+
-+static void FUNC(put_hevc_qpel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+                                       int16_t *src2,
-+                                       int height, int denom, int wx0, int wx1,
-+                                       int ox0, int ox1, intptr_t mx, intptr_t my, int width)
-+{
-+    int x, y;
-+    pixel        *src       = (pixel*)_src;
-+    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
-+    pixel *dst          = (pixel *)_dst;
-+    ptrdiff_t dststride = _dststride / sizeof(pixel);
-+
-+    const int8_t *filter    = ff_hevc_rpi_qpel_filters[my - 1];
-+
-+    int shift = 14 + 1 - BIT_DEPTH;
-+    int log2Wd = denom + shift - 1;
-+
-+    ox0     = ox0 * (1 << (BIT_DEPTH - 8));
-+    ox1     = ox1 * (1 << (BIT_DEPTH - 8));
-+    for (y = 0; y < height; y++) {
-+        for (x = 0; x < width; x++)
-+            dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
-+                                    ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
-+        src  += srcstride;
-+        dst  += dststride;
-+        src2 += MAX_PB_SIZE;
-+    }
-+}
-+
-+static void FUNC(put_hevc_qpel_uni_w_hv)(uint8_t *_dst,  ptrdiff_t _dststride,
-+                                         uint8_t *_src, ptrdiff_t _srcstride,
-+                                         int height, int denom, int wx, int ox,
-+                                         intptr_t mx, intptr_t my, int width)
-+{
-+    int x, y;
-+    const int8_t *filter;
-+    pixel *src = (pixel*)_src;
-+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
-+    pixel *dst          = (pixel *)_dst;
-+    ptrdiff_t dststride = _dststride / sizeof(pixel);
-+    int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
-+    int16_t *tmp = tmp_array;
-+    int shift = denom + 14 - BIT_DEPTH;
-+#if BIT_DEPTH < 14
-+    int offset = 1 << (shift - 1);
-+#else
-+    int offset = 0;
-+#endif
-+
-+    src   -= QPEL_EXTRA_BEFORE * srcstride;
-+    filter = ff_hevc_rpi_qpel_filters[mx - 1];
-+    for (y = 0; y < height + QPEL_EXTRA; y++) {
-+        for (x = 0; x < width; x++)
-+            tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
-+        src += srcstride;
-+        tmp += MAX_PB_SIZE;
-+    }
-+
-+    tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
-+    filter = ff_hevc_rpi_qpel_filters[my - 1];
-+
-+    ox = ox * (1 << (BIT_DEPTH - 8));
-+    for (y = 0; y < height; y++) {
-+        for (x = 0; x < width; x++)
-+            dst[x] = av_clip_pixel((((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox);
-+        tmp += MAX_PB_SIZE;
-+        dst += dststride;
-+    }
-+}
-+
-+static void FUNC(put_hevc_qpel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+                                        int16_t *src2,
-+                                        int height, int denom, int wx0, int wx1,
-+                                        int ox0, int ox1, intptr_t mx, intptr_t my, int width)
-+{
-+    int x, y;
-+    const int8_t *filter;
-+    pixel *src = (pixel*)_src;
-+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
-+    pixel *dst          = (pixel *)_dst;
-+    ptrdiff_t dststride = _dststride / sizeof(pixel);
-+    int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
-+    int16_t *tmp = tmp_array;
-+    int shift = 14 + 1 - BIT_DEPTH;
-+    int log2Wd = denom + shift - 1;
-+
-+    src   -= QPEL_EXTRA_BEFORE * srcstride;
-+    filter = ff_hevc_rpi_qpel_filters[mx - 1];
-+    for (y = 0; y < height + QPEL_EXTRA; y++) {
-+        for (x = 0; x < width; x++)
-+            tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
-+        src += srcstride;
-+        tmp += MAX_PB_SIZE;
-+    }
-+
-+    tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
-+    filter = ff_hevc_rpi_qpel_filters[my - 1];
-+
-+    ox0     = ox0 * (1 << (BIT_DEPTH - 8));
-+    ox1     = ox1 * (1 << (BIT_DEPTH - 8));
-+    for (y = 0; y < height; y++) {
-+        for (x = 0; x < width; x++)
-+            dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 +
-+                                    ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
-+        tmp  += MAX_PB_SIZE;
-+        dst  += dststride;
-+        src2 += MAX_PB_SIZE;
-+    }
-+}
-+
-+////////////////////////////////////////////////////////////////////////////////
-+//
-+////////////////////////////////////////////////////////////////////////////////
-+#define EPEL_FILTER(src, stride)                                               \
-+    (filter[0] * src[x - stride] +                                             \
-+     filter[1] * src[x]          +                                             \
-+     filter[2] * src[x + stride] +                                             \
-+     filter[3] * src[x + 2 * stride])
-+
-+static void FUNC(put_hevc_epel_h)(int16_t *dst,
-+                                  uint8_t *_src, ptrdiff_t _srcstride,
-+                                  int height, intptr_t mx, intptr_t my, int width)
-+{
-+    int x, y;
-+    pixel *src = (pixel *)_src;
-+    ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
-+    const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
-+    for (y = 0; y < height; y++) {
-+        for (x = 0; x < width; x++)
-+            dst[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
-+        src += srcstride;
-+        dst += MAX_PB_SIZE;
-+    }
-+}
-+
-+static void FUNC(put_hevc_epel_v)(int16_t *dst,
-+                                  uint8_t *_src, ptrdiff_t _srcstride,
-+                                  int height, intptr_t mx, intptr_t my, int width)
-+{
-+    int x, y;
-+    pixel *src = (pixel *)_src;
-+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
-+    const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1];
-+
-+    for (y = 0; y < height; y++) {
-+        for (x = 0; x < width; x++)
-+            dst[x] = EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8);
-+        src += srcstride;
-+        dst += MAX_PB_SIZE;
-+    }
-+}
-+
-+static void FUNC(put_hevc_epel_hv)(int16_t *dst,
-+                                   uint8_t *_src, ptrdiff_t _srcstride,
-+                                   int height, intptr_t mx, intptr_t my, int width)
-+{
-+    int x, y;
-+    pixel *src = (pixel *)_src;
-+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
-+    const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
-+    int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
-+    int16_t *tmp = tmp_array;
-+
-+    src -= EPEL_EXTRA_BEFORE * srcstride;
-+
-+    for (y = 0; y < height + EPEL_EXTRA; y++) {
-+        for (x = 0; x < width; x++)
-+            tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
-+        src += srcstride;
-+        tmp += MAX_PB_SIZE;
-+    }
-+
-+    tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
-+    filter = ff_hevc_rpi_epel_filters[my - 1];
-+
-+    for (y = 0; y < height; y++) {
-+        for (x = 0; x < width; x++)
-+            dst[x] = EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6;
-+        tmp += MAX_PB_SIZE;
-+        dst += MAX_PB_SIZE;
-+    }
-+}
-+
-+static void FUNC(put_hevc_epel_uni_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+                                      int height, intptr_t mx, intptr_t my, int width)
-+{
-+    int x, y;
-+    pixel *src = (pixel *)_src;
-+    ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
-+    pixel *dst          = (pixel *)_dst;
-+    ptrdiff_t dststride = _dststride / sizeof(pixel);
-+    const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
-+    int shift = 14 - BIT_DEPTH;
-+#if BIT_DEPTH < 14
-+    int offset = 1 << (shift - 1);
-+#else
-+    int offset = 0;
-+#endif
-+
-+    for (y = 0; y < height; y++) {
-+        for (x = 0; x < width; x++)
-+            dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift);
-+        src += srcstride;
-+        dst += dststride;
-+    }
-+}
-+
-+static void FUNC(put_hevc_epel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+                                     int16_t *src2,
-+                                     int height, intptr_t mx, intptr_t my, int width)
-+{
-+    int x, y;
-+    pixel *src = (pixel *)_src;
-+    ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
-+    pixel *dst          = (pixel *)_dst;
-+    ptrdiff_t dststride = _dststride / sizeof(pixel);
-+    const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
-+    int shift = 14 + 1 - BIT_DEPTH;
-+#if BIT_DEPTH < 14
-+    int offset = 1 << (shift - 1);
-+#else
-+    int offset = 0;
-+#endif
-+
-+    for (y = 0; y < height; y++) {
-+        for (x = 0; x < width; x++) {
-+            dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
-+        }
-+        dst  += dststride;
-+        src  += srcstride;
-+        src2 += MAX_PB_SIZE;
-+    }
-+}
-+
-+static void FUNC(put_hevc_epel_uni_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+                                      int height, intptr_t mx, intptr_t my, int width)
-+{
-+    int x, y;
-+    pixel *src = (pixel *)_src;
-+    ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
-+    pixel *dst          = (pixel *)_dst;
-+    ptrdiff_t dststride = _dststride / sizeof(pixel);
-+    const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1];
-+    int shift = 14 - BIT_DEPTH;
-+#if BIT_DEPTH < 14
-+    int offset = 1 << (shift - 1);
-+#else
-+    int offset = 0;
-+#endif
-+
-+    for (y = 0; y < height; y++) {
-+        for (x = 0; x < width; x++)
-+            dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift);
-+        src += srcstride;
-+        dst += dststride;
-+    }
-+}
-+
-+static void FUNC(put_hevc_epel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+                                     int16_t *src2,
-+                                     int height, intptr_t mx, intptr_t my, int width)
-+{
-+    int x, y;
-+    pixel *src = (pixel *)_src;
-+    ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
-+    const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1];
-+    pixel *dst          = (pixel *)_dst;
-+    ptrdiff_t dststride = _dststride / sizeof(pixel);
-+    int shift = 14 + 1 - BIT_DEPTH;
-+#if BIT_DEPTH < 14
-+    int offset = 1 << (shift - 1);
-+#else
-+    int offset = 0;
-+#endif
-+
-+    for (y = 0; y < height; y++) {
-+        for (x = 0; x < width; x++)
-+            dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
-+        dst  += dststride;
-+        src  += srcstride;
-+        src2 += MAX_PB_SIZE;
-+    }
-+}
-+
-+static void FUNC(put_hevc_epel_uni_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+                                       int height, intptr_t mx, intptr_t my, int width)
-+{
-+    int x, y;
-+    pixel *src = (pixel *)_src;
-+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
-+    pixel *dst          = (pixel *)_dst;
-+    ptrdiff_t dststride = _dststride / sizeof(pixel);
-+    const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
-+    int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
-+    int16_t *tmp = tmp_array;
-+    int shift = 14 - BIT_DEPTH;
-+#if BIT_DEPTH < 14
-+    int offset = 1 << (shift - 1);
-+#else
-+    int offset = 0;
-+#endif
-+
-+    src -= EPEL_EXTRA_BEFORE * srcstride;
-+
-+    for (y = 0; y < height + EPEL_EXTRA; y++) {
-+        for (x = 0; x < width; x++)
-+            tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
-+        src += srcstride;
-+        tmp += MAX_PB_SIZE;
-+    }
-+
-+    tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
-+    filter = ff_hevc_rpi_epel_filters[my - 1];
-+
-+    for (y = 0; y < height; y++) {
-+        for (x = 0; x < width; x++)
-+            dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift);
-+        tmp += MAX_PB_SIZE;
-+        dst += dststride;
-+    }
-+}
-+
-+static void FUNC(put_hevc_epel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+                                      int16_t *src2,
-+                                      int height, intptr_t mx, intptr_t my, int width)
-+{
-+    int x, y;
-+    pixel *src = (pixel *)_src;
-+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
-+    pixel *dst          = (pixel *)_dst;
-+    ptrdiff_t dststride = _dststride / sizeof(pixel);
-+    const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
-+    int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
-+    int16_t *tmp = tmp_array;
-+    int shift = 14 + 1 - BIT_DEPTH;
-+#if BIT_DEPTH < 14
-+    int offset = 1 << (shift - 1);
-+#else
-+    int offset = 0;
-+#endif
-+
-+    src -= EPEL_EXTRA_BEFORE * srcstride;
-+
-+    for (y = 0; y < height + EPEL_EXTRA; y++) {
-+        for (x = 0; x < width; x++)
-+            tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
-+        src += srcstride;
-+        tmp += MAX_PB_SIZE;
-+    }
-+
-+    tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
-+    filter = ff_hevc_rpi_epel_filters[my - 1];
-+
-+    for (y = 0; y < height; y++) {
-+        for (x = 0; x < width; x++)
-+            dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift);
-+        tmp  += MAX_PB_SIZE;
-+        dst  += dststride;
-+        src2 += MAX_PB_SIZE;
-+    }
-+}
-+
-+static void FUNC(put_hevc_epel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+                                        int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
-+{
-+    int x, y;
-+    pixel *src = (pixel *)_src;
-+    ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
-+    pixel *dst          = (pixel *)_dst;
-+    ptrdiff_t dststride = _dststride / sizeof(pixel);
-+    const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
-+    int shift = denom + 14 - BIT_DEPTH;
-+#if BIT_DEPTH < 14
-+    int offset = 1 << (shift - 1);
-+#else
-+    int offset = 0;
-+#endif
-+
-+    ox     = ox * (1 << (BIT_DEPTH - 8));
-+    for (y = 0; y < height; y++) {
-+        for (x = 0; x < width; x++) {
-+            dst[x] = av_clip_pixel((((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
-+        }
-+        dst += dststride;
-+        src += srcstride;
-+    }
-+}
-+
-+static void FUNC(put_hevc_epel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+                                       int16_t *src2,
-+                                       int height, int denom, int wx0, int wx1,
-+                                       int ox0, int ox1, intptr_t mx, intptr_t my, int width)
-+{
-+    int x, y;
-+    pixel *src = (pixel *)_src;
-+    ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
-+    pixel *dst          = (pixel *)_dst;
-+    ptrdiff_t dststride = _dststride / sizeof(pixel);
-+    const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
-+    int shift = 14 + 1 - BIT_DEPTH;
-+    int log2Wd = denom + shift - 1;
-+
-+    ox0     = ox0 * (1 << (BIT_DEPTH - 8));
-+    ox1     = ox1 * (1 << (BIT_DEPTH - 8));
-+    for (y = 0; y < height; y++) {
-+        for (x = 0; x < width; x++)
-+            dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
-+                                    ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
-+        src  += srcstride;
-+        dst  += dststride;
-+        src2 += MAX_PB_SIZE;
-+    }
-+}
-+
-+static void FUNC(put_hevc_epel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+                                        int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
-+{
-+    int x, y;
-+    pixel *src = (pixel *)_src;
-+    ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
-+    pixel *dst          = (pixel *)_dst;
-+    ptrdiff_t dststride = _dststride / sizeof(pixel);
-+    const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1];
-+    int shift = denom + 14 - BIT_DEPTH;
-+#if BIT_DEPTH < 14
-+    int offset = 1 << (shift - 1);
-+#else
-+    int offset = 0;
-+#endif
-+
-+    ox     = ox * (1 << (BIT_DEPTH - 8));
-+    for (y = 0; y < height; y++) {
-+        for (x = 0; x < width; x++) {
-+            dst[x] = av_clip_pixel((((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
-+        }
-+        dst += dststride;
-+        src += srcstride;
-+    }
-+}
-+
-+static void FUNC(put_hevc_epel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+                                       int16_t *src2,
-+                                       int height, int denom, int wx0, int wx1,
-+                                       int ox0, int ox1, intptr_t mx, intptr_t my, int width)
-+{
-+    int x, y;
-+    pixel *src = (pixel *)_src;
-+    ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
-+    const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1];
-+    pixel *dst          = (pixel *)_dst;
-+    ptrdiff_t dststride = _dststride / sizeof(pixel);
-+    int shift = 14 + 1 - BIT_DEPTH;
-+    int log2Wd = denom + shift - 1;
-+
-+    ox0     = ox0 * (1 << (BIT_DEPTH - 8));
-+    ox1     = ox1 * (1 << (BIT_DEPTH - 8));
-+    for (y = 0; y < height; y++) {
-+        for (x = 0; x < width; x++)
-+            dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
-+                                    ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
-+        src  += srcstride;
-+        dst  += dststride;
-+        src2 += MAX_PB_SIZE;
-+    }
-+}
-+
-+static void FUNC(put_hevc_epel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+                                         int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
-+{
-+    int x, y;
-+    pixel *src = (pixel *)_src;
-+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
-+    pixel *dst          = (pixel *)_dst;
-+    ptrdiff_t dststride = _dststride / sizeof(pixel);
-+    const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
-+    int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
-+    int16_t *tmp = tmp_array;
-+    int shift = denom + 14 - BIT_DEPTH;
-+#if BIT_DEPTH < 14
-+    int offset = 1 << (shift - 1);
-+#else
-+    int offset = 0;
-+#endif
-+
-+    src -= EPEL_EXTRA_BEFORE * srcstride;
-+
-+    for (y = 0; y < height + EPEL_EXTRA; y++) {
-+        for (x = 0; x < width; x++)
-+            tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
-+        src += srcstride;
-+        tmp += MAX_PB_SIZE;
-+    }
-+
-+    tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
-+    filter = ff_hevc_rpi_epel_filters[my - 1];
-+
-+    ox     = ox * (1 << (BIT_DEPTH - 8));
-+    for (y = 0; y < height; y++) {
-+        for (x = 0; x < width; x++)
-+            dst[x] = av_clip_pixel((((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox);
-+        tmp += MAX_PB_SIZE;
-+        dst += dststride;
-+    }
-+}
-+
-+static void FUNC(put_hevc_epel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+                                        int16_t *src2,
-+                                        int height, int denom, int wx0, int wx1,
-+                                        int ox0, int ox1, intptr_t mx, intptr_t my, int width)
-+{
-+    int x, y;
-+    pixel *src = (pixel *)_src;
-+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
-+    pixel *dst          = (pixel *)_dst;
-+    ptrdiff_t dststride = _dststride / sizeof(pixel);
-+    const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
-+    int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
-+    int16_t *tmp = tmp_array;
-+    int shift = 14 + 1 - BIT_DEPTH;
-+    int log2Wd = denom + shift - 1;
-+
-+    src -= EPEL_EXTRA_BEFORE * srcstride;
-+
-+    for (y = 0; y < height + EPEL_EXTRA; y++) {
-+        for (x = 0; x < width; x++)
-+            tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
-+        src += srcstride;
-+        tmp += MAX_PB_SIZE;
-+    }
-+
-+    tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
-+    filter = ff_hevc_rpi_epel_filters[my - 1];
-+
-+    ox0     = ox0 * (1 << (BIT_DEPTH - 8));
-+    ox1     = ox1 * (1 << (BIT_DEPTH - 8));
-+    for (y = 0; y < height; y++) {
-+        for (x = 0; x < width; x++)
-+            dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 +
-+                                    ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1));
-+        tmp  += MAX_PB_SIZE;
-+        dst  += dststride;
-+        src2 += MAX_PB_SIZE;
-+    }
-+}
-+
-+// line zero
-+#define P3 pix[-4 * xstride]
-+#define P2 pix[-3 * xstride]
-+#define P1 pix[-2 * xstride]
-+#define P0 pix[-1 * xstride]
-+#define Q0 pix[0 * xstride]
-+#define Q1 pix[1 * xstride]
-+#define Q2 pix[2 * xstride]
-+#define Q3 pix[3 * xstride]
-+
-+// line three. used only for deblocking decision
-+#define TP3 pix[-4 * xstride + 3 * ystride]
-+#define TP2 pix[-3 * xstride + 3 * ystride]
-+#define TP1 pix[-2 * xstride + 3 * ystride]
-+#define TP0 pix[-1 * xstride + 3 * ystride]
-+#define TQ0 pix[0  * xstride + 3 * ystride]
-+#define TQ1 pix[1  * xstride + 3 * ystride]
-+#define TQ2 pix[2  * xstride + 3 * ystride]
-+#define TQ3 pix[3  * xstride + 3 * ystride]
-+
-+static void FUNC(hevc_loop_filter_luma)(uint8_t *_pix,
-+                                        ptrdiff_t _xstride, ptrdiff_t _ystride,
-+                                        int beta, int *_tc,
-+                                        uint8_t *_no_p, uint8_t *_no_q)
-+{
-+    int d, j;
-+    pixel *pix        = (pixel *)_pix;
-+    ptrdiff_t xstride = _xstride / sizeof(pixel);
-+    ptrdiff_t ystride = _ystride / sizeof(pixel);
-+
-+    beta <<= BIT_DEPTH - 8;
-+
-+    for (j = 0; j < 2; j++) {
-+        const int dp0  = abs(P2  - 2 * P1  + P0);
-+        const int dq0  = abs(Q2  - 2 * Q1  + Q0);
-+        const int dp3  = abs(TP2 - 2 * TP1 + TP0);
-+        const int dq3  = abs(TQ2 - 2 * TQ1 + TQ0);
-+        const int d0   = dp0 + dq0;
-+        const int d3   = dp3 + dq3;
-+        const int tc   = _tc[j]   << (BIT_DEPTH - 8);
-+        const int no_p = _no_p[j];
-+        const int no_q = _no_q[j];
-+
-+        if (d0 + d3 >= beta) {
-+            pix += 4 * ystride;
-+            continue;
-+        } else {
-+            const int beta_3 = beta >> 3;
-+            const int beta_2 = beta >> 2;
-+            const int tc25   = ((tc * 5 + 1) >> 1);
-+
-+            if (abs(P3  -  P0) + abs(Q3  -  Q0) < beta_3 && abs(P0  -  Q0) < tc25 &&
-+                abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 &&
-+                                      (d0 << 1) < beta_2 &&      (d3 << 1) < beta_2) {
-+                // strong filtering
-+                const int tc2 = tc << 1;
-+                for (d = 0; d < 4; d++) {
-+                    const int p3 = P3;
-+                    const int p2 = P2;
-+                    const int p1 = P1;
-+                    const int p0 = P0;
-+                    const int q0 = Q0;
-+                    const int q1 = Q1;
-+                    const int q2 = Q2;
-+                    const int q3 = Q3;
-+                    if (!no_p) {
-+                        P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2);
-+                        P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2);
-+                        P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2);
-+                    }
-+                    if (!no_q) {
-+                        Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2);
-+                        Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2);
-+                        Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2);
-+                    }
-+                    pix += ystride;
-+                }
-+            } else { // normal filtering
-+                int nd_p = 1;
-+                int nd_q = 1;
-+                const int tc_2 = tc >> 1;
-+                if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3))
-+                    nd_p = 2;
-+                if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3))
-+                    nd_q = 2;
-+
-+                for (d = 0; d < 4; d++) {
-+                    const int p2 = P2;
-+                    const int p1 = P1;
-+                    const int p0 = P0;
-+                    const int q0 = Q0;
-+                    const int q1 = Q1;
-+                    const int q2 = Q2;
-+                    int delta0   = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4;
-+                    if (abs(delta0) < 10 * tc) {
-+                        delta0 = av_clip(delta0, -tc, tc);
-+                        if (!no_p)
-+                            P0 = av_clip_pixel(p0 + delta0);
-+                        if (!no_q)
-+                            Q0 = av_clip_pixel(q0 - delta0);
-+                        if (!no_p && nd_p > 1) {
-+                            const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2);
-+                            P1 = av_clip_pixel(p1 + deltap1);
-+                        }
-+                        if (!no_q && nd_q > 1) {
-+                            const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2);
-+                            Q1 = av_clip_pixel(q1 + deltaq1);
-+                        }
-+                    }
-+                    pix += ystride;
-+                }
-+            }
-+        }
-+    }
-+}
-+
-+static void FUNC(hevc_loop_filter_chroma)(uint8_t *_pix, ptrdiff_t _xstride,
-+                                          ptrdiff_t _ystride, int *_tc,
-+                                          uint8_t *_no_p, uint8_t *_no_q)
-+{
-+    int d, j, no_p, no_q;
-+    pixel *pix        = (pixel *)_pix;
-+    ptrdiff_t xstride = _xstride / sizeof(pixel);
-+    ptrdiff_t ystride = _ystride / sizeof(pixel);
-+
-+    for (j = 0; j < 2; j++) {
-+        const int tc = _tc[j] << (BIT_DEPTH - 8);
-+        if (tc <= 0) {
-+            pix += 4 * ystride;
-+            continue;
-+        }
-+        no_p = _no_p[j];
-+        no_q = _no_q[j];
-+
-+        for (d = 0; d < 4; d++) {
-+            int delta0;
-+            const int p1 = P1;
-+            const int p0 = P0;
-+            const int q0 = Q0;
-+            const int q1 = Q1;
-+            delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc);
-+            if (!no_p)
-+                P0 = av_clip_pixel(p0 + delta0);
-+            if (!no_q)
-+                Q0 = av_clip_pixel(q0 - delta0);
-+            pix += ystride;
-+        }
-+    }
-+}
-+
-+static void FUNC(hevc_h_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
-+                                            int32_t *tc, uint8_t *no_p,
-+                                            uint8_t *no_q)
-+{
-+    FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel), tc, no_p, no_q);
-+}
-+
-+static void FUNC(hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
-+                                            int32_t *tc, uint8_t *no_p,
-+                                            uint8_t *no_q)
-+{
-+    FUNC(hevc_loop_filter_chroma)(pix, sizeof(pixel), stride, tc, no_p, no_q);
-+}
-+
-+static void FUNC(hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
-+                                          int beta, int32_t *tc, uint8_t *no_p,
-+                                          uint8_t *no_q)
-+{
-+    FUNC(hevc_loop_filter_luma)(pix, stride, sizeof(pixel),
-+                                beta, tc, no_p, no_q);
-+}
-+
-+static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
-+                                          int beta, int32_t *tc, uint8_t *no_p,
-+                                          uint8_t *no_q)
-+{
-+    FUNC(hevc_loop_filter_luma)(pix, sizeof(pixel), stride,
-+                                beta, tc, no_p, no_q);
-+}
-+
-+#undef P3
-+#undef P2
-+#undef P1
-+#undef P0
-+#undef Q0
-+#undef Q1
-+#undef Q2
-+#undef Q3
-+
-+#undef TP3
-+#undef TP2
-+#undef TP1
-+#undef TP0
-+#undef TQ0
-+#undef TQ1
-+#undef TQ2
-+#undef TQ3
-+
-+// line zero
-+#define P3 pix_l[0 * xstride]
-+#define P2 pix_l[1 * xstride]
-+#define P1 pix_l[2 * xstride]
-+#define P0 pix_l[3 * xstride]
-+#define Q0 pix_r[0 * xstride]
-+#define Q1 pix_r[1 * xstride]
-+#define Q2 pix_r[2 * xstride]
-+#define Q3 pix_r[3 * xstride]
-+
-+// line three. used only for deblocking decision
-+#define TP3 pix_l[0 * xstride + 3 * ystride]
-+#define TP2 pix_l[1 * xstride + 3 * ystride]
-+#define TP1 pix_l[2 * xstride + 3 * ystride]
-+#define TP0 pix_l[3 * xstride + 3 * ystride]
-+#define TQ0 pix_r[0 * xstride + 3 * ystride]
-+#define TQ1 pix_r[1 * xstride + 3 * ystride]
-+#define TQ2 pix_r[2 * xstride + 3 * ystride]
-+#define TQ3 pix_r[3 * xstride + 3 * ystride]
-+
-+// This is identical to hevc_loop_filter_luma except that the P/Q
-+// components are on separate pointers
-+static void FUNC(hevc_v_loop_filter_luma2)(uint8_t * _pix_r,
-+                                 unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f,
-+                                 uint8_t * _pix_l)
-+{
-+    int d, j;
-+    pixel *pix_l        = (pixel *)_pix_l;
-+    pixel *pix_r        = (pixel *)_pix_r;
-+    const ptrdiff_t xstride = 1;
-+    const ptrdiff_t ystride = _stride / sizeof(pixel);
-+
-+    beta <<= BIT_DEPTH - 8;
-+
-+    for (j = 0; j < 2; j++) {
-+        const int dp0  = abs(P2  - 2 * P1  + P0);
-+        const int dq0  = abs(Q2  - 2 * Q1  + Q0);
-+        const int dp3  = abs(TP2 - 2 * TP1 + TP0);
-+        const int dq3  = abs(TQ2 - 2 * TQ1 + TQ0);
-+        const int d0   = dp0 + dq0;
-+        const int d3   = dp3 + dq3;
-+        const int tc   = ((tc2 >> (j << 4)) & 0xffff) << (BIT_DEPTH - 8);
-+        const int no_p = no_f & 1;
-+        const int no_q = no_f & 2;
-+
-+        if (d0 + d3 >= beta) {
-+            pix_l += 4 * ystride;
-+            pix_r += 4 * ystride;
-+            continue;
-+        } else {
-+            const int beta_3 = beta >> 3;
-+            const int beta_2 = beta >> 2;
-+            const int tc25   = ((tc * 5 + 1) >> 1);
-+
-+            if (abs(P3  -  P0) + abs(Q3  -  Q0) < beta_3 && abs(P0  -  Q0) < tc25 &&
-+                abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 &&
-+                                      (d0 << 1) < beta_2 &&      (d3 << 1) < beta_2) {
-+                // strong filtering
-+                const int tc2 = tc << 1;
-+                for (d = 0; d < 4; d++) {
-+                    const int p3 = P3;
-+                    const int p2 = P2;
-+                    const int p1 = P1;
-+                    const int p0 = P0;
-+                    const int q0 = Q0;
-+                    const int q1 = Q1;
-+                    const int q2 = Q2;
-+                    const int q3 = Q3;
-+                    if (!no_p) {
-+                        P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2);
-+                        P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2);
-+                        P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2);
-+                    }
-+                    if (!no_q) {
-+                        Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2);
-+                        Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2);
-+                        Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2);
-+                    }
-+                    pix_l += ystride;
-+                    pix_r += ystride;
-+                }
-+            } else { // normal filtering
-+                int nd_p = 1;
-+                int nd_q = 1;
-+                const int tc_2 = tc >> 1;
-+                if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3))
-+                    nd_p = 2;
-+                if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3))
-+                    nd_q = 2;
-+
-+                for (d = 0; d < 4; d++) {
-+                    const int p2 = P2;
-+                    const int p1 = P1;
-+                    const int p0 = P0;
-+                    const int q0 = Q0;
-+                    const int q1 = Q1;
-+                    const int q2 = Q2;
-+                    int delta0   = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4;
-+                    if (abs(delta0) < 10 * tc) {
-+                        delta0 = av_clip(delta0, -tc, tc);
-+                        if (!no_p)
-+                            P0 = av_clip_pixel(p0 + delta0);
-+                        if (!no_q)
-+                            Q0 = av_clip_pixel(q0 - delta0);
-+                        if (!no_p && nd_p > 1) {
-+                            const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2);
-+                            P1 = av_clip_pixel(p1 + deltap1);
-+                        }
-+                        if (!no_q && nd_q > 1) {
-+                            const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2);
-+                            Q1 = av_clip_pixel(q1 + deltaq1);
-+                        }
-+                    }
-+                    pix_l += ystride;
-+                    pix_r += ystride;
-+                }
-+            }
-+        }
-+    }
-+}
-+
-+static void FUNC(hevc_h_loop_filter_luma2)(uint8_t * _pix_r,
-+                                 unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f)
-+{
-+    // Just call the non-2 function having massaged the parameters
-+    int32_t tc[2] = {tc2 & 0xffff, tc2 >> 16};
-+    uint8_t no_p[2] = {no_f & 1, no_f & 1};
-+    uint8_t no_q[2] = {no_f & 2, no_f & 2};
-+    FUNC(hevc_h_loop_filter_luma)(_pix_r, _stride, beta, tc, no_p, no_q);
-+}
-+
-+#undef TP3
-+#undef TP2
-+#undef TP1
-+#undef TP0
-+#undef TQ0
-+#undef TQ1
-+#undef TQ2
-+#undef TQ3
-+
-+#undef P3
-+#undef P2
-+#undef P1
-+#undef P0
-+#undef Q0
-+#undef Q1
-+#undef Q2
-+#undef Q3
-+
-+#define P1 pix_l[0 * xstride]
-+#define P0 pix_l[1 * xstride]
-+#define Q0 pix_r[0 * xstride]
-+#define Q1 pix_r[1 * xstride]
-+
-+static void FUNC(hevc_loop_filter_uv2)(uint8_t *_pix_l, ptrdiff_t _xstride,
-+                                          ptrdiff_t _ystride, const int32_t *_tc,
-+                                          const uint8_t *_no_p, const uint8_t *_no_q, uint8_t *_pix_r)
-+{
-+    int d, j, no_p, no_q;
-+    pixel *pix_l        = (pixel *)_pix_l;
-+    pixel *pix_r        = (pixel *)_pix_r;
-+    ptrdiff_t xstride = _xstride / sizeof(pixel);
-+    ptrdiff_t ystride = _ystride / sizeof(pixel);
-+
-+    for (j = 0; j < 2; j++) {
-+        const int tc = _tc[j] << (BIT_DEPTH - 8);
-+        if (tc <= 0) {
-+            pix_l += 4 * ystride;
-+            pix_r += 4 * ystride;
-+            continue;
-+        }
-+        no_p = _no_p[j];
-+        no_q = _no_q[j];
-+
-+        for (d = 0; d < 4; d++) {
-+            int delta0;
-+            const int p1 = P1;
-+            const int p0 = P0;
-+            const int q0 = Q0;
-+            const int q1 = Q1;
-+            delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc);
-+            if (!no_p)
-+                P0 = av_clip_pixel(p0 + delta0);
-+            if (!no_q)
-+                Q0 = av_clip_pixel(q0 - delta0);
-+            pix_l += ystride;
-+            pix_r += ystride;
-+        }
-+    }
-+}
-+
-+static void FUNC(hevc_h_loop_filter_uv)(uint8_t * pix, unsigned int stride, uint32_t tc4,
-+                                 unsigned int no_f)
-+{
-+    uint8_t no_p[2] = {no_f & 1, no_f & 2};
-+    uint8_t no_q[2] = {no_f & 4, no_f & 8};
-+    int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24};
-+    FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel) * 2, tc, no_p, no_q);
-+    FUNC(hevc_loop_filter_chroma)(pix + sizeof(pixel), stride, sizeof(pixel) * 2, tc + 2, no_p, no_q);
-+}
-+
-+static void FUNC(hevc_v_loop_filter_uv2)(uint8_t * src_r, unsigned int stride, uint32_t tc4,
-+                                 uint8_t * src_l,
-+                                 unsigned int no_f)
-+{
-+    uint8_t no_p[2] = {no_f & 1, no_f & 2};
-+    uint8_t no_q[2] = {no_f & 4, no_f & 8};
-+    int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24};
-+    FUNC(hevc_loop_filter_uv2)(src_l, sizeof(pixel) * 2, stride, tc, no_p, no_q, src_r);
-+    FUNC(hevc_loop_filter_uv2)(src_l + sizeof(pixel), sizeof(pixel) * 2, stride, tc + 2, no_p, no_q, src_r + sizeof(pixel));
-+}
-+
-+#undef P1
-+#undef P0
-+#undef Q0
-+#undef Q1
-+
---- /dev/null
-+++ b/libavcodec/rpi_hevcpred.c
-@@ -0,0 +1,161 @@
-+/*
-+ * HEVC video Decoder
-+ *
-+ * Copyright (C) 2012 - 2013 Guillaume Martres
-+ * Copyright (C) 2018 John Cox for Raspberry Pi (Trading)
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#include "rpi_hevcdec.h"
-+
-+#include "rpi_hevcpred.h"
-+#if (ARCH_ARM)
-+#include "arm/rpi_hevcpred_arm.h"
-+#endif
-+
-+#define PRED_C 0 // first set of instances: template included with PRED_C == 0 for depths 8, 9, 10 and 12
-+#define BIT_DEPTH 8 // BIT_DEPTH is (re)defined before every include and removed again afterwards
-+#include "rpi_hevcpred_template.c"
-+#undef BIT_DEPTH
-+
-+#define BIT_DEPTH 9
-+#include "rpi_hevcpred_template.c"
-+#undef BIT_DEPTH
-+
-+#define BIT_DEPTH 10
-+#include "rpi_hevcpred_template.c"
-+#undef BIT_DEPTH
-+
-+#define BIT_DEPTH 12
-+#include "rpi_hevcpred_template.c"
-+#undef BIT_DEPTH
-+#undef PRED_C
-+
-+#define PRED_C 1 // second set of instances: same depths with PRED_C == 1 (the template then names its functions with a _c suffix)
-+#define BIT_DEPTH 8
-+#include "rpi_hevcpred_template.c"
-+#undef BIT_DEPTH
-+
-+#define BIT_DEPTH 9
-+#include "rpi_hevcpred_template.c"
-+#undef BIT_DEPTH
-+
-+#define BIT_DEPTH 10
-+#include "rpi_hevcpred_template.c"
-+#undef BIT_DEPTH
-+
-+#define BIT_DEPTH 12
-+#include "rpi_hevcpred_template.c"
-+#undef BIT_DEPTH
-+#undef PRED_C
-+
-+void ff_hevc_rpi_pred_init(HEVCRpiPredContext *hpc, int bit_depth) // Fill every function pointer in *hpc with the C instance for bit_depth, then run the arch-specific init
-+{
-+#undef FUNC
-+#define FUNC(a, depth) a ## _ ## depth // pastes to a_<depth>; used for the non-_c members
-+
-+#undef FUNCC
-+#define FUNCC(a, depth) a ## _ ## depth ## _c // pastes to a_<depth>_c; used for the *_c members
-+
-+#define HEVC_PRED_Y(depth)                                \
-+    hpc->intra_pred      = FUNC(intra_pred, depth);     \
-+    hpc->intra_filter[0] = FUNC(intra_filter_2, depth); \
-+    hpc->intra_filter[1] = FUNC(intra_filter_3, depth); \
-+    hpc->intra_filter[2] = FUNC(intra_filter_4, depth); \
-+    hpc->intra_filter[3] = FUNC(intra_filter_5, depth); \
-+    hpc->pred_planar[0]  = FUNC(pred_planar_0, depth);  \
-+    hpc->pred_planar[1]  = FUNC(pred_planar_1, depth);  \
-+    hpc->pred_planar[2]  = FUNC(pred_planar_2, depth);  \
-+    hpc->pred_planar[3]  = FUNC(pred_planar_3, depth);  \
-+    hpc->pred_dc[0]      = FUNC(pred_dc_0, depth);      \
-+    hpc->pred_dc[1]      = FUNC(pred_dc_1, depth);      \
-+    hpc->pred_dc[2]      = FUNC(pred_dc_2, depth);      \
-+    hpc->pred_dc[3]      = FUNC(pred_dc_3, depth);      \
-+    hpc->pred_vertical[0] = FUNC(pred_angular_0, depth); \
-+    hpc->pred_vertical[1] = FUNC(pred_angular_1, depth); \
-+    hpc->pred_vertical[2] = FUNC(pred_angular_2, depth); \
-+    hpc->pred_vertical[3] = FUNC(pred_angular_3, depth); \
-+    hpc->pred_horizontal[0] = FUNC(pred_angular_0, depth); \
-+    hpc->pred_horizontal[1] = FUNC(pred_angular_1, depth); \
-+    hpc->pred_horizontal[2] = FUNC(pred_angular_2, depth); \
-+    hpc->pred_horizontal[3] = FUNC(pred_angular_3, depth); \
-+    hpc->pred_angular[0] = FUNC(pred_angular_0, depth); \
-+    hpc->pred_angular[1] = FUNC(pred_angular_1, depth); \
-+    hpc->pred_angular[2] = FUNC(pred_angular_2, depth); \
-+    hpc->pred_angular[3] = FUNC(pred_angular_3, depth); \
-+    hpc->pred_dc0[0]     = FUNC(pred_dc0_0, depth);     \
-+    hpc->pred_dc0[1]     = FUNC(pred_dc0_1, depth);     \
-+    hpc->pred_dc0[2]     = FUNC(pred_dc0_2, depth);     \
-+    hpc->pred_dc0[3]     = FUNC(pred_dc0_3, depth); // note: pred_vertical[] and pred_horizontal[] above are given the pred_angular_N functions
-+
-+#define HEVC_PRED_C(depth)                                \
-+    hpc->intra_pred_c      = FUNCC(intra_pred, depth);     \
-+    hpc->intra_filter_c[0] = FUNCC(intra_filter_2, depth); \
-+    hpc->intra_filter_c[1] = FUNCC(intra_filter_3, depth); \
-+    hpc->intra_filter_c[2] = FUNCC(intra_filter_4, depth); \
-+    hpc->intra_filter_c[3] = FUNCC(intra_filter_5, depth); \
-+    hpc->pred_planar_c[0]  = FUNCC(pred_planar_0, depth);  \
-+    hpc->pred_planar_c[1]  = FUNCC(pred_planar_1, depth);  \
-+    hpc->pred_planar_c[2]  = FUNCC(pred_planar_2, depth);  \
-+    hpc->pred_planar_c[3]  = FUNCC(pred_planar_3, depth);  \
-+    hpc->pred_dc_c[0]      = FUNCC(pred_dc_0, depth);      \
-+    hpc->pred_dc_c[1]      = FUNCC(pred_dc_1, depth);      \
-+    hpc->pred_dc_c[2]      = FUNCC(pred_dc_2, depth);      \
-+    hpc->pred_dc_c[3]      = FUNCC(pred_dc_3, depth);      \
-+    hpc->pred_vertical_c[0] = FUNCC(pred_angular_0, depth); \
-+    hpc->pred_vertical_c[1] = FUNCC(pred_angular_1, depth); \
-+    hpc->pred_vertical_c[2] = FUNCC(pred_angular_2, depth); \
-+    hpc->pred_vertical_c[3] = FUNCC(pred_angular_3, depth); \
-+    hpc->pred_horizontal_c[0] = FUNCC(pred_angular_0, depth); \
-+    hpc->pred_horizontal_c[1] = FUNCC(pred_angular_1, depth); \
-+    hpc->pred_horizontal_c[2] = FUNCC(pred_angular_2, depth); \
-+    hpc->pred_horizontal_c[3] = FUNCC(pred_angular_3, depth); \
-+    hpc->pred_angular_c[0] = FUNCC(pred_angular_0, depth); \
-+    hpc->pred_angular_c[1] = FUNCC(pred_angular_1, depth); \
-+    hpc->pred_angular_c[2] = FUNCC(pred_angular_2, depth); \
-+    hpc->pred_angular_c[3] = FUNCC(pred_angular_3, depth); \
-+    hpc->pred_dc0_c[0]     = FUNCC(pred_dc0_0, depth);     \
-+    hpc->pred_dc0_c[1]     = FUNCC(pred_dc0_1, depth);     \
-+    hpc->pred_dc0_c[2]     = FUNCC(pred_dc0_2, depth);     \
-+    hpc->pred_dc0_c[3]     = FUNCC(pred_dc0_3, depth); // same layout as HEVC_PRED_Y, but for the *_c members
-+
-+#define HEVC_PRED(depth) \
-+    HEVC_PRED_Y(depth); \
-+    HEVC_PRED_C(depth);
-+
-+    switch (bit_depth) { // 9, 10 and 12 select their own instances
-+    case 9:
-+        HEVC_PRED(9);
-+        break;
-+    case 10:
-+        HEVC_PRED(10);
-+        break;
-+    case 12:
-+        HEVC_PRED(12);
-+        break;
-+    default: // every other bit_depth value gets the 8-bit functions
-+        HEVC_PRED(8);
-+        break;
-+    }
-+
-+#if (ARCH_ARM) // arch init runs after the C table is complete - presumably it overrides selected entries; confirm
-+    ff_hevc_rpi_pred_init_arm(hpc, bit_depth);
-+#elif (ARCH_MIPS) // NOTE(review): no declaration of ff_hevc_rpi_pred_init_mips is visible in this file; confirm it exists
-+    ff_hevc_rpi_pred_init_mips(hpc, bit_depth);
-+#endif
-+}
---- /dev/null
-+++ b/libavcodec/rpi_hevcpred.h
-@@ -0,0 +1,123 @@
-+/*
-+ * HEVC video Decoder
-+ *
-+ * Copyright (C) 2012 - 2013 Guillaume Martres
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#ifndef AVCODEC_RPI_HEVCPRED_H
-+#define AVCODEC_RPI_HEVCPRED_H
-+
-+#include <stddef.h>
-+#include <stdint.h>
-+#include "config.h"
-+
-+struct HEVCRpiContext; // forward declaration: only pointers to it appear in this header
-+struct HEVCRpiLocalContext; // forward declaration
-+
-+enum IntraPredMode { // Intra prediction mode numbers: planar, DC, then the angular modes
-+    INTRA_PLANAR = 0,
-+    INTRA_DC, // 1
-+    INTRA_ANGULAR_2, // 2; from here on every INTRA_ANGULAR_n has the value n
-+    INTRA_ANGULAR_3,
-+    INTRA_ANGULAR_4,
-+    INTRA_ANGULAR_5,
-+    INTRA_ANGULAR_6,
-+    INTRA_ANGULAR_7,
-+    INTRA_ANGULAR_8,
-+    INTRA_ANGULAR_9,
-+    INTRA_ANGULAR_10,
-+    INTRA_ANGULAR_11,
-+    INTRA_ANGULAR_12,
-+    INTRA_ANGULAR_13,
-+    INTRA_ANGULAR_14,
-+    INTRA_ANGULAR_15,
-+    INTRA_ANGULAR_16,
-+    INTRA_ANGULAR_17,
-+    INTRA_ANGULAR_18,
-+    INTRA_ANGULAR_19,
-+    INTRA_ANGULAR_20,
-+    INTRA_ANGULAR_21,
-+    INTRA_ANGULAR_22,
-+    INTRA_ANGULAR_23,
-+    INTRA_ANGULAR_24,
-+    INTRA_ANGULAR_25,
-+    INTRA_ANGULAR_26,
-+    INTRA_ANGULAR_27,
-+    INTRA_ANGULAR_28,
-+    INTRA_ANGULAR_29,
-+    INTRA_ANGULAR_30,
-+    INTRA_ANGULAR_31,
-+    INTRA_ANGULAR_32,
-+    INTRA_ANGULAR_33,
-+    INTRA_ANGULAR_34, // last mode, value 34 (35 modes in total)
-+};
-+#define INTRA_ANGULAR_HORIZONTAL INTRA_ANGULAR_10 // alias: mode 10 is the horizontal direction
-+#define INTRA_ANGULAR_VERTICAL   INTRA_ANGULAR_26 // alias: mode 26 is the vertical direction
-+
-+typedef void intra_filter_fn_t( // Function type of the intra_filter[] / intra_filter_c[] entries in HEVCRpiPredContext
-+        uint8_t * const left, uint8_t * const top, // output buffers
-+        const unsigned int req, const unsigned int avail, // presumably AVAIL_* (and FILTER_*) bit masks: what is wanted vs. what exists - confirm against callers
-+        const uint8_t * const src_l, const uint8_t * const src_u, const uint8_t * const src_ur, // read-only sources: left, up, up-right
-+        const unsigned int stride,
-+        const unsigned int top_right_size, const unsigned int down_left_size);
-+
-+typedef struct HEVCRpiPredContext { // Table of intra prediction entry points; filled in by ff_hevc_rpi_pred_init()
-+    void (*intra_pred)(const struct HEVCRpiContext * const s, // top-level intra prediction entry point (non-_c set)
-+                          const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0,
-+                          const unsigned int avail, const unsigned int log2_size);
-+
-+    intra_filter_fn_t *intra_filter[4]; // [0..3] are set from the intra_filter_2 .. intra_filter_5 instances - presumably indexed by log2 size - 2; confirm
-+    void (*pred_planar[4])(uint8_t *src, const uint8_t *top, // [0..3] are set from pred_planar_0 .. pred_planar_3
-+                           const uint8_t *left, ptrdiff_t stride);
-+    void (*pred_dc[4])(uint8_t *src, const uint8_t *top, const uint8_t *left, // [0..3] are set from pred_dc_0 .. pred_dc_3
-+                    ptrdiff_t stride);
-+    void (*pred_angular[4])(uint8_t *src, const uint8_t *top, // [0..3] are set from pred_angular_0 .. pred_angular_3
-+                            const uint8_t *left, ptrdiff_t stride,
-+                            int mode);
-+    void (*pred_vertical[4])(uint8_t *src, const uint8_t *top, // the C init stores the pred_angular_N functions here
-+                            const uint8_t *left, ptrdiff_t stride,
-+                            int mode);
-+    void (*pred_horizontal[4])(uint8_t *src, const uint8_t *top, // the C init stores the pred_angular_N functions here
-+                            const uint8_t *left, ptrdiff_t stride,
-+                            int mode);
-+    void (*pred_dc0[4])(uint8_t *src, ptrdiff_t stride); // takes no top/left neighbours, unlike pred_dc
-+
-+    void (*intra_pred_c)(const struct HEVCRpiContext * const s, // the *_c members mirror the ones above and receive the PRED_C (chroma) instances
-+                          const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0,
-+                          const unsigned int avail, const unsigned int log2_size);
-+    intra_filter_fn_t *intra_filter_c[4];
-+    void (*pred_planar_c[4])(uint8_t *src, const uint8_t *top,
-+                           const uint8_t *left, ptrdiff_t stride);
-+    void (*pred_dc_c[4])(uint8_t *src, const uint8_t *top, const uint8_t *left,
-+                    ptrdiff_t stride);
-+    void (*pred_angular_c[4])(uint8_t *src, const uint8_t *top,
-+                            const uint8_t *left, ptrdiff_t stride,
-+                            int mode);
-+    void (*pred_vertical_c[4])(uint8_t *src, const uint8_t *top,
-+                            const uint8_t *left, ptrdiff_t stride,
-+                            int mode);
-+    void (*pred_horizontal_c[4])(uint8_t *src, const uint8_t *top,
-+                            const uint8_t *left, ptrdiff_t stride,
-+                            int mode);
-+    void (*pred_dc0_c[4])(uint8_t *src, ptrdiff_t stride);
-+} HEVCRpiPredContext;
-+
-+void ff_hevc_rpi_pred_init(HEVCRpiPredContext *hpc, int bit_depth); // fills *hpc for bit_depth 9, 10 or 12; any other value selects the 8-bit functions
-+
-+#endif /* AVCODEC_RPI_HEVCPRED_H */
---- /dev/null
-+++ b/libavcodec/rpi_hevcpred_template.c
-@@ -0,0 +1,1407 @@
-+/*
-+ * HEVC video decoder
-+ *
-+ * Copyright (C) 2012 - 2013 Guillaume Martres
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#include "config.h"
-+#include "libavutil/pixdesc.h"
-+#include "libavutil/rpi_sand_fns.h"
-+#include "bit_depth_template.c"
-+
-+#include "rpi_hevcdec.h"
-+#include "rpi_hevcpred.h"
-+
-+#define DUMP_PRED 0 // set non-zero to compile in the dump_pred_uv() debug helper below
-+
-+#define POS(x, y) src[(x) + stride * (y)] // element (x, y) of a block; expects 'src' and 'stride' to be in scope at the point of use
-+
-+// INCLUDED_ONCE defined at EOF
-+#ifndef INCLUDED_ONCE
-+typedef uint8_t (* c8_dst_ptr_t)[2]; // pointer to writable pairs of 8-bit values
-+typedef const uint8_t (* c8_src_ptr_t)[2]; // pointer to read-only pairs of 8-bit values
-+typedef uint16_t (* c16_dst_ptr_t)[2]; // pointer to writable pairs of 16-bit values
-+typedef const uint16_t (* c16_src_ptr_t)[2]; // pointer to read-only pairs of 16-bit values
-+
-+// *** On ARM make these NEON registers
-+typedef struct pixel4_16 { // four 16-bit elements moved as one unit (used as 'pixel4' when PRED_C and BIT_DEPTH == 8)
-+    uint16_t x[4];
-+} pixel4_16;
-+typedef struct pixel4_32 { // four 32-bit elements moved as one unit (used as 'pixel4' when PRED_C and BIT_DEPTH != 8)
-+    uint32_t x[4];
-+} pixel4_32;
-+static inline pixel4_16 PIXEL_SPLAT_X4_16(const uint16_t x) // returns a pixel4_16 whose four elements all equal x
-+{
-+    pixel4_16 t = {{x, x, x, x}};
-+    return t;
-+}
-+static inline pixel4_32 PIXEL_SPLAT_X4_32(const uint32_t x) // returns a pixel4_32 whose four elements all equal x
-+{
-+    pixel4_32 t = {{x, x, x, x}};
-+    return t;
-+}
-+#endif
-+
-+#if PRED_C
-+// For chroma we double pixel size so we copy pairs
-+#undef pixel
-+#undef pixel2
-+#undef pixel4
-+#undef dctcoef
-+#undef INIT_CLIP
-+#undef no_rnd_avg_pixel4
-+#undef rnd_avg_pixel4
-+#undef AV_RN2P
-+#undef AV_RN4P
-+#undef AV_RN4PA
-+#undef AV_WN2P
-+#undef AV_WN4P
-+#undef AV_WN4PA
-+#undef CLIP
-+#undef FUNC
-+#undef FUNCC
-+#undef av_clip_pixel
-+#undef PIXEL_SPLAT_X4
-+
-+#if BIT_DEPTH == 8
-+#define pixel uint16_t
-+#define pixel4 pixel4_16
-+#define PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_16
-+#define cpel uint8_t
-+#define c_src_ptr_t  c8_src_ptr_t
-+#define c_dst_ptr_t  c8_dst_ptr_t
-+#else
-+#define pixel uint32_t
-+#define pixel4 pixel4_32
-+#define PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_32
-+#define cpel uint16_t
-+#define c_src_ptr_t c16_dst_ptr_t
-+#define c_dst_ptr_t c16_dst_ptr_t
-+#endif
-+#define AV_RN4P(p) (*(pixel4*)(p))
-+#define AV_WN4P(p,x) (*(pixel4*)(p) = (x))
-+#define FUNC(a) FUNC2(a, BIT_DEPTH, _c)
-+#endif
-+
-+
-+// Get PW prior to horrid PRED_C trickery
-+#if BIT_DEPTH == 8
-+#define PW 1 // depends on BIT_DEPTH only, not on PRED_C - presumably bytes per component; it selects the EXTEND width further down
-+#else
-+#define PW 2
-+#endif
-+
-+
-+#if DUMP_PRED && !defined(INCLUDED_ONCE) // debug helper: built only when DUMP_PRED is non-zero, and only on the first inclusion
-+static void dump_pred_uv(const uint8_t * data, const unsigned int stride, const unsigned int size) // prints a size x size grid taken from every second byte of data
-+{
-+    for (unsigned int y = 0; y != size; y++, data += stride * 2) { // each row advances by stride * 2 bytes
-+        for (unsigned int x = 0; x != size; x++) {
-+            printf("%4d", data[x * 2]); // even byte offsets only
-+        }
-+        printf("\n");
-+    }
-+    printf("\n");
-+}
-+#endif
-+
-+#ifndef INCLUDED_ONCE
-+static inline void extend_8(void * ptr, const unsigned int v, unsigned int n) // fill n bytes at ptr with v; n is rounded down to a multiple of 4
-+{
-+    if ((n >>= 2) != 0) { // n becomes the number of 32-bit words; nothing is written when n < 4
-+        uint32_t v4 = v | (v << 8); // v in the two low bytes (expects v to fit in 8 bits)
-+        uint32_t * p = (uint32_t *)ptr; // stores are 32 bits wide - presumably ptr is 4-byte aligned; confirm at call sites
-+        v4 = v4 | (v4 << 16); // v in all four bytes
-+        do {
-+            *p++ = v4;
-+        } while (--n != 0);
-+    }
-+}
-+
-+static inline void extend_16(void * ptr, const unsigned int v, unsigned int n) // fill n 16-bit elements at ptr with v; n is rounded down to a multiple of 4
-+{
-+    if ((n >>= 2) != 0) { // n becomes the number of 4-element groups; nothing is written when n < 4
-+        uint32_t v2 = v | (v << 16); // v in both halves of the word (expects v to fit in 16 bits)
-+        uint32_t * p = (uint32_t *)ptr; // stores are 32 bits wide - presumably ptr is 4-byte aligned; confirm at call sites
-+        do {
-+            *p++ = v2; // two words = four 16-bit elements per iteration
-+            *p++ = v2;
-+        } while (--n != 0);
-+    }
-+}
-+
-+static inline void extend_32(void * ptr, const unsigned int v, unsigned int n) // fill n 32-bit elements at ptr with v; n is rounded down to a multiple of 4
-+{
-+    if ((n >>= 2) != 0) { // n becomes the number of 4-element groups; nothing is written when n < 4
-+        uint32_t * p = (uint32_t *)ptr;
-+        do {
-+            *p++ = v; // four words per iteration
-+            *p++ = v;
-+            *p++ = v;
-+            *p++ = v;
-+        } while (--n != 0);
-+    }
-+}
-+
-+// Beware that this inverts the avail ordering
-+// For CIP it seems easier this way round
-+static unsigned int cip_avail_l(const uint8_t * is_intra, const int i_stride, const unsigned int i_mask, // returns a mask with one bit per 4-pel group of the DL, L and UL neighbours
-+                                const unsigned int log2_intra_bits, const unsigned int avail, unsigned int size, // size and s0 arrive in pels and are converted to 4-pel units below
-+                              unsigned int s0, unsigned int odd_s) // odd_s is used both as a flag and as the first bit index, so presumably 0 or 1
-+{
-+    const unsigned int n = 1 << log2_intra_bits; // number of mask bits covered by one is_intra entry
-+    unsigned int fa = 0; // mask being built
-+    unsigned int i;
-+
-+    size >>= 2;   // Now in 4-pel units
-+    s0 >>= 2;
-+
-+    if ((avail & AVAIL_DL) != 0)
-+        fa |= ((1 << s0) - 1) << (size - s0); // DL: s0 bits ending just below bit 'size'
-+    if ((avail & AVAIL_L) != 0)
-+        fa |= ((1 << size) - 1) << size; // L: bits size .. 2*size-1
-+    if ((avail & AVAIL_UL) != 0)
-+        fa |= 1 << (size << 1); // UL: the single bit 2*size
-+
-+    if (odd_s) { // bit 0 is checked against an is_intra entry of its own before the n-bit groups start
-+        if ((fa & 1) != 0 && (*is_intra & i_mask) == 0)
-+            fa &= ~1;
-+        is_intra += i_stride;
-+    }
-+
-+    for (i = odd_s; (fa >> i) != 0; i += n, is_intra += i_stride) { // stops once no set bits remain at or above bit i
-+        const unsigned int m = ((1 << n) - 1) << i; // the n bits governed by the current is_intra entry
-+        if ((fa & m) != 0 && (*is_intra & i_mask) == 0) // entry has no bit of i_mask set: clear the whole group
-+            fa &= ~m;
-+    }
-+
-+    return fa;
-+}
-+
-+static unsigned int cip_avail_u(const uint8_t * is_intra, unsigned int i_shift, // returns a mask with one bit per 4-pel group of the U and UR neighbours
-+                                const unsigned int log2_intra_bits, const unsigned int avail, unsigned int size, // size and s1 arrive in pels and are converted to 4-pel units below
-+                                unsigned int s1, unsigned int odd_s) // odd_s is used both as a flag and as the first bit index, so presumably 0 or 1
-+{
-+    if ((avail & (AVAIL_U | AVAIL_UR)) == 0) // neither U nor UR requested: empty mask, is_intra is not read
-+    {
-+        return 0;
-+    }
-+    else
-+    {
-+        const unsigned int n = 1 << log2_intra_bits; // number of mask bits covered by one bit of im
-+        unsigned int fa = 0; // mask being built
-+        unsigned int i;
-+        unsigned int im = ((is_intra[1] << 8) | (is_intra[0])) >> i_shift; // 16 flag bits from two bytes, shifted so that bit 0 is the first entry used
-+
-+        size >>= 2;   // Now in 4-pel units
-+        s1 >>= 2;
-+
-+        if ((avail & AVAIL_U) != 0)
-+            fa |= ((1 << size) - 1); // U: bits 0 .. size-1
-+        if ((avail & AVAIL_UR) != 0)
-+            fa |= ((1 << s1) - 1) << size; // UR: s1 bits starting at bit 'size'
-+
-+        if (odd_s) { // bit 0 is checked against an im bit of its own before the n-bit groups start
-+            fa &= im | ~1; // clears bit 0 of fa when bit 0 of im is 0
-+            im >>= 1;
-+        }
-+
-+        for (i = odd_s; (fa >> i) != 0; i += n, im >>= 1) { // stops once no set bits remain at or above bit i
-+            const unsigned int m = ((1 << n) - 1) << i; // the n bits governed by the current im bit
-+            if ((im & 1) == 0)
-+                fa &= ~m;
-+        }
-+        return fa;
-+    }
-+}
-+
-+
-+
-+static inline unsigned int rmbd(unsigned int x) // index of the lowest set bit of x; callers must pass x != 0 (__builtin_ctz(0) is undefined)
-+{
-+#if 1
-+    return __builtin_ctz(x); // count of trailing zero bits
-+#else
-+    unsigned int n = 0; // disabled fallback: binary search for the lowest set bit
-+    if ((x & 0xffff) == 0) {
-+        x >>= 16;
-+        n += 16;
-+    }
-+    if ((x & 0xff) == 0) {
-+        x >>= 8;
-+        n += 8;
-+    }
-+    if ((x & 0xf) == 0) {
-+        x >>= 4;
-+        n += 4;
-+    }
-+    if ((x & 0x3) == 0) {
-+        x >>= 2;
-+        n += 2;
-+    }
-+
-+    return (x & 1) == 0 ? n + 1 : n;
-+#endif
-+}
-+#endif
-+
-+
-+static void FUNC(cip_fill)(pixel * const left, pixel * const top, // Build the left[] / top[] reference arrays, substituting the nearest earlier sample wherever a 4-pel group is unavailable
-+    const unsigned int avail_l, const unsigned int avail_u, // one bit per 4-pel group; avail_l bit 0 is the bottom-most group, avail_u bit 0 the left-most
-+    const pixel * const src_l, const pixel * const src_u, const pixel * const src_ur, // sources: left column (walked with 'stride'), up row, up-right row
-+    const unsigned int stride, // distance between successive src_l samples, in pixels
-+    const unsigned int size) // left[] receives 2*size entries plus left[-1]; top[] receives 2*size entries
-+{
-+    pixel a; // last good sample seen; used as the substitute value
-+    unsigned int i;
-+
-+    // 1st find DL value
-+    if ((avail_l & 1) == 0) { // bottom group missing: seed 'a' now (otherwise the first loop pass below sets it before it is read)
-+        if (avail_l != 0)
-+            a = src_l[((int)size * 2 - 1 - (int)rmbd(avail_l)*4) * (int)stride]; // bottom sample of the lowest available left group
-+        else
-+        {
-+            // (avail_l | avail_u) != 0 so this must be good
-+            const unsigned int n = rmbd(avail_u)*4; // pel offset of the first available up / up-right group
-+            a = (n >= size) ? src_ur[n - size] : src_u[n];
-+        }
-+    }
-+
-+    // L
-+    {
-+        pixel * d = left + size * 2 - 1; // written from the bottom entry upwards
-+        const pixel * s = src_l + (size * 2 - 1) * stride;
-+        unsigned int x = avail_l;
-+        for (i = 0; i < size * 2; i += 4, x >>= 1) // one avail_l bit per 4 samples
-+        {
-+            if ((x & 1) != 0) {
-+                // Avail
-+                *d-- = *s;
-+                s -= stride;
-+                *d-- = *s;
-+                s -= stride;
-+                *d-- = *s;
-+                s -= stride;
-+                *d-- = a = *s; // remember the top sample of this group for later substitution
-+                s -= stride;
-+            }
-+            else
-+            {
-+                *d-- = a;
-+                *d-- = a;
-+                *d-- = a;
-+                *d-- = a;
-+                s -= stride * 4;
-+            }
-+        }
-+        // UL
-+        *d = a = (x & 1) != 0 ? *s : a; // d is left - 1 and s is src_l - stride at this point
-+    }
-+
-+    // U
-+    {
-+        pixel * d = top; // written left to right
-+        const pixel * s = src_u;
-+        unsigned int x = avail_u;
-+
-+        for (i = 0; i < size; i += 4, x >>= 1) // one avail_u bit per 4 samples
-+        {
-+            if ((x & 1) != 0) {
-+                // Avail
-+                *d++ = *s++;
-+                *d++ = *s++;
-+                *d++ = *s++;
-+                *d++ = a = *s++; // remember the right-most sample of this group
-+            }
-+            else
-+            {
-+                *d++ = a;
-+                *d++ = a;
-+                *d++ = a;
-+                *d++ = a;
-+                s += 4;
-+            }
-+        }
-+
-+        // UR
-+        s = src_ur; // d and x carry on: top[size..] is filled from the remaining avail_u bits
-+        for (i = 0; i < size; i += 4, x >>= 1)
-+        {
-+            if ((x & 1) != 0) {
-+                // Avail
-+                *d++ = *s++;
-+                *d++ = *s++;
-+                *d++ = *s++;
-+                *d++ = a = *s++;
-+            }
-+            else
-+            {
-+                *d++ = a;
-+                *d++ = a;
-+                *d++ = a;
-+                *d++ = a;
-+                s += 4;
-+            }
-+        }
-+    }
-+}
-+
-+
-+#if !PRED_C && PW == 1 // EXTEND picks the fill helper whose element width matches this instance
-+#define EXTEND(ptr, val, len) extend_8(ptr, val, len)
-+#elif (!PRED_C && PW == 2) || (PRED_C && PW == 1) // PW == 2 without PRED_C, or PW == 1 with PRED_C: 16-bit elements
-+#define EXTEND(ptr, val, len) extend_16(ptr, val, len)
-+#else // PRED_C with PW == 2: 32-bit elements
-+#define EXTEND(ptr, val, len) extend_32(ptr, val, len)
-+#endif
-+
-+// Reqs:
-+//
-+// Planar:  DL[0], L, ul, U, UR[0]
-+// DC:         dl, L, ul, U, ur
-+// A2-9:       DL, L, ul, u, ur
-+// A10:        dl, L, ul, u, ur
-+// A11-17      dl, L, UL, U, ur
-+// A18-25      dl, L, Ul, U, ur
-+// A26         dl, l, ul, U, ur
-+// A27-34      dl, l, ul, U, UR
-+
-+#ifndef INCLUDED_ONCE
-+
-+intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_8; // function declarations only; the definitions live elsewhere (presumably ARM NEON assembly - confirm)
-+intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_16;
-+intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_16;
-+
-+static const uint8_t req_avail_c[35] = // AVAIL_* bits needed by each of the 35 intra modes (index = mode number); no FILTER_* flags here - presumably the chroma table; confirm
-+{
-+    AVAIL_DL | AVAIL_L | 0         |  AVAIL_U | AVAIL_UR,  // Planar (DL[0] & UR[0] only needed)
-+               AVAIL_L | 0         |  AVAIL_U,             // DC
-+    AVAIL_DL | AVAIL_L,                                    // 2
-+    AVAIL_DL | AVAIL_L,                                    // 3
-+    AVAIL_DL | AVAIL_L,                                    // 4
-+    AVAIL_DL | AVAIL_L,                                    // 5
-+    AVAIL_DL | AVAIL_L,                                    // 6
-+    AVAIL_DL | AVAIL_L,                                    // 7
-+    AVAIL_DL | AVAIL_L,                                    // 8
-+    AVAIL_DL | AVAIL_L,                                    // 9
-+               AVAIL_L,                                    // 10 (H)
-+               AVAIL_L | AVAIL_UL | AVAIL_U,               // 11
-+               AVAIL_L | AVAIL_UL | AVAIL_U,               // 12
-+               AVAIL_L | AVAIL_UL | AVAIL_U,               // 13
-+               AVAIL_L | AVAIL_UL | AVAIL_U,               // 14
-+               AVAIL_L | AVAIL_UL | AVAIL_U,               // 15
-+               AVAIL_L | AVAIL_UL | AVAIL_U,               // 16
-+               AVAIL_L | AVAIL_UL | AVAIL_U,               // 17
-+               AVAIL_L | AVAIL_UL | AVAIL_U,               // 18
-+               AVAIL_L | AVAIL_UL | AVAIL_U,               // 19
-+               AVAIL_L | AVAIL_UL | AVAIL_U,               // 20
-+               AVAIL_L | AVAIL_UL | AVAIL_U,               // 21
-+               AVAIL_L | AVAIL_UL | AVAIL_U,               // 22
-+               AVAIL_L | AVAIL_UL | AVAIL_U,               // 23
-+               AVAIL_L | AVAIL_UL | AVAIL_U,               // 24
-+               AVAIL_L | AVAIL_UL | AVAIL_U,               // 25
-+                                    AVAIL_U,               // 26 (V)
-+                                    AVAIL_U | AVAIL_UR,    // 27
-+                                    AVAIL_U | AVAIL_UR,    // 28
-+                                    AVAIL_U | AVAIL_UR,    // 29
-+                                    AVAIL_U | AVAIL_UR,    // 30
-+                                    AVAIL_U | AVAIL_UR,    // 31
-+                                    AVAIL_U | AVAIL_UR,    // 32
-+                                    AVAIL_U | AVAIL_UR,    // 33
-+                                    AVAIL_U | AVAIL_UR     // 34
-+};
-+
-+static const uint8_t req_avail[4][35] = {
-+{
-+    AVAIL_DL | AVAIL_L | 0         |  AVAIL_U | AVAIL_UR,  // Planar (DL[0] & UR[0] only needed)
-+               AVAIL_L | 0         |  AVAIL_U,             // DC
-+    AVAIL_DL | AVAIL_L,                                    // 2
-+    AVAIL_DL | AVAIL_L,                                    // 3
-+    AVAIL_DL | AVAIL_L,                                    // 4
-+    AVAIL_DL | AVAIL_L,                                    // 5
-+    AVAIL_DL | AVAIL_L,                                    // 6
-+    AVAIL_DL | AVAIL_L,                                    // 7
-+    AVAIL_DL | AVAIL_L,                                    // 8
-+    AVAIL_DL | AVAIL_L,                                    // 9
-+               AVAIL_L | AVAIL_UL | AVAIL_U,               // 10 (H)
-+               AVAIL_L | AVAIL_UL | AVAIL_U,               // 11
-+               AVAIL_L | AVAIL_UL | AVAIL_U,               // 12
-+               AVAIL_L | AVAIL_UL | AVAIL_U,               // 13
-+               AVAIL_L | AVAIL_UL | AVAIL_U,               // 14
-+               AVAIL_L | AVAIL_UL | AVAIL_U,               // 15
-+               AVAIL_L | AVAIL_UL | AVAIL_U,               // 16
-+               AVAIL_L | AVAIL_UL | AVAIL_U,               // 17
-+               AVAIL_L | AVAIL_UL | AVAIL_U,               // 18
-+               AVAIL_L | AVAIL_UL | AVAIL_U,               // 19
-+               AVAIL_L | AVAIL_UL | AVAIL_U,               // 20
-+               AVAIL_L | AVAIL_UL | AVAIL_U,               // 21
-+               AVAIL_L | AVAIL_UL | AVAIL_U,               // 22
-+               AVAIL_L | AVAIL_UL | AVAIL_U,               // 23
-+               AVAIL_L | AVAIL_UL | AVAIL_U,               // 24
-+               AVAIL_L | AVAIL_UL | AVAIL_U,               // 25
-+               AVAIL_L | AVAIL_UL | AVAIL_U,               // 26 (V)
-+                                    AVAIL_U | AVAIL_UR,    // 27
-+                                    AVAIL_U | AVAIL_UR,    // 28
-+                                    AVAIL_U | AVAIL_UR,    // 29
-+                                    AVAIL_U | AVAIL_UR,    // 30
-+                                    AVAIL_U | AVAIL_UR,    // 31
-+                                    AVAIL_U | AVAIL_UR,    // 32
-+                                    AVAIL_U | AVAIL_UR,    // 33
-+                                    AVAIL_U | AVAIL_UR     // 34
-+},
-+{  // 3
-+    AVAIL_DL | AVAIL_L | 0        | AVAIL_U | AVAIL_UR | FILTER_LIGHT,  // Planar (DL[0] & UR[0] only needed)
-+               AVAIL_L | 0        | AVAIL_U,                            // DC
-+    AVAIL_DL | AVAIL_L                                 | FILTER_LIGHT,  // 2
-+    AVAIL_DL | AVAIL_L                                 | 0,             // 3
-+    AVAIL_DL | AVAIL_L                                 | 0,             // 4
-+    AVAIL_DL | AVAIL_L                                 | 0,             // 5
-+    AVAIL_DL | AVAIL_L                                 | 0,             // 6
-+    AVAIL_DL | AVAIL_L                                 | 0,             // 7
-+    AVAIL_DL | AVAIL_L                                 | 0,             // 8
-+    AVAIL_DL | AVAIL_L                                 | 0,             // 9
-+               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 10 (H)
-+               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 11
-+               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 12
-+               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 13
-+               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 14
-+               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 15
-+               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 16
-+               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 17
-+               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_LIGHT,  // 18
-+               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 19
-+               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 20
-+               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 21
-+               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 22
-+               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 23
-+               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 24
-+               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 25
-+               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 26 (V)
-+                                    AVAIL_U | AVAIL_UR | 0,             // 27
-+                                    AVAIL_U | AVAIL_UR | 0,             // 28
-+                                    AVAIL_U | AVAIL_UR | 0,             // 29
-+                                    AVAIL_U | AVAIL_UR | 0,             // 30
-+                                    AVAIL_U | AVAIL_UR | 0,             // 31
-+                                    AVAIL_U | AVAIL_UR | 0,             // 32
-+                                    AVAIL_U | AVAIL_UR | 0,             // 33
-+                                    AVAIL_U | AVAIL_UR | FILTER_LIGHT   // 34
-+},
-+{  // 4
-+    AVAIL_DL | AVAIL_L | 0        | AVAIL_U | AVAIL_UR | FILTER_LIGHT,  // Planar (DL[0] & UR[0] only needed)
-+               AVAIL_L | 0        | AVAIL_U,                            // DC
-+    AVAIL_DL | AVAIL_L                                 | FILTER_LIGHT,  // 2
-+    AVAIL_DL | AVAIL_L                                 | FILTER_LIGHT,  // 3
-+    AVAIL_DL | AVAIL_L                                 | FILTER_LIGHT,  // 4
-+    AVAIL_DL | AVAIL_L                                 | FILTER_LIGHT,  // 5
-+    AVAIL_DL | AVAIL_L                                 | FILTER_LIGHT,  // 6
-+    AVAIL_DL | AVAIL_L                                 | FILTER_LIGHT,  // 7
-+    AVAIL_DL | AVAIL_L                                 | FILTER_LIGHT,  // 8
-+    AVAIL_DL | AVAIL_L                                 | 0,             // 9
-+               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 10 (H)
-+               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 11
-+               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_LIGHT,  // 12
-+               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_LIGHT,  // 13
-+               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_LIGHT,  // 14
-+               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_LIGHT,  // 15
-+               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_LIGHT,  // 16
-+               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_LIGHT,  // 17
-+               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_LIGHT,  // 18
-+               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_LIGHT,  // 19
-+               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_LIGHT,  // 20
-+               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_LIGHT,  // 21
-+               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_LIGHT,  // 22
-+               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_LIGHT,  // 23
-+               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_LIGHT,  // 24
-+               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 25
-+               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 26 (V)
-+                                    AVAIL_U | AVAIL_UR | 0,             // 27
-+                                    AVAIL_U | AVAIL_UR | FILTER_LIGHT,  // 28
-+                                    AVAIL_U | AVAIL_UR | FILTER_LIGHT,  // 29
-+                                    AVAIL_U | AVAIL_UR | FILTER_LIGHT,  // 30
-+                                    AVAIL_U | AVAIL_UR | FILTER_LIGHT,  // 31
-+                                    AVAIL_U | AVAIL_UR | FILTER_LIGHT,  // 32
-+                                    AVAIL_U | AVAIL_UR | FILTER_LIGHT,  // 33
-+                                    AVAIL_U | AVAIL_UR | FILTER_LIGHT   // 34
-+},
-+{  // 5
-+    AVAIL_DL | AVAIL_L | 0        | AVAIL_U | AVAIL_UR | FILTER_EITHER, // Planar (DL[0] & UR[0] only needed)
-+               AVAIL_L | 0        | AVAIL_U,                            // DC
-+    AVAIL_DL | AVAIL_L                                 | FILTER_EITHER, // 2
-+    AVAIL_DL | AVAIL_L                                 | FILTER_EITHER, // 3
-+    AVAIL_DL | AVAIL_L                                 | FILTER_EITHER, // 4
-+    AVAIL_DL | AVAIL_L                                 | FILTER_EITHER, // 5
-+    AVAIL_DL | AVAIL_L                                 | FILTER_EITHER, // 6
-+    AVAIL_DL | AVAIL_L                                 | FILTER_EITHER, // 7
-+    AVAIL_DL | AVAIL_L                                 | FILTER_EITHER, // 8
-+    AVAIL_DL | AVAIL_L                                 | FILTER_EITHER, // 9
-+               AVAIL_L                                 | 0,             // 10 (H)
-+               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_EITHER, // 11
-+               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_EITHER, // 12
-+               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_EITHER, // 13
-+               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_EITHER, // 14
-+               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_EITHER, // 15
-+               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_EITHER, // 16
-+               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_EITHER, // 17
-+               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_EITHER, // 18
-+               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_EITHER, // 19
-+               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_EITHER, // 20
-+               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_EITHER, // 21
-+               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_EITHER, // 22
-+               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_EITHER, // 23
-+               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_EITHER, // 24
-+               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_EITHER, // 25
-+                                    AVAIL_U            | 0,             // 26 (V)
-+                                    AVAIL_U | AVAIL_UR | FILTER_EITHER, // 27
-+                                    AVAIL_U | AVAIL_UR | FILTER_EITHER, // 28
-+                                    AVAIL_U | AVAIL_UR | FILTER_EITHER, // 29
-+                                    AVAIL_U | AVAIL_UR | FILTER_EITHER, // 30
-+                                    AVAIL_U | AVAIL_UR | FILTER_EITHER, // 31
-+                                    AVAIL_U | AVAIL_UR | FILTER_EITHER, // 32
-+                                    AVAIL_U | AVAIL_UR | FILTER_EITHER, // 33
-+                                    AVAIL_U | AVAIL_UR | FILTER_EITHER  // 34
-+}
-+};
-+
-+
-+#endif
-+
-+#define filter_light1 FUNC(filter_light1)
-+static inline pixel filter_light1(pixel a, pixel b, pixel c)
-+{
-+    return (a + b*2 + c + 2) >> 2;
-+}
-+
-+#define filter_light FUNC(filter_light)
-+static inline void filter_light(pixel * dst, pixel p1, const pixel * src, const pixel pn, const int sstride, const unsigned int n)
-+{
-+    pixel p0;
-+    pixel p2 = *src;
-+    // Allow for final pel - it is just clearer to to have the call take the actual number of output pels
-+    unsigned int n_minus_1 = n - 1;
-+
-+    do
-+    {
-+        src += sstride;
-+        p0 = p1;
-+        p1 = p2;
-+        p2 = *src;
-+        *dst++ = filter_light1(p0, p1, p2);
-+    } while (--n_minus_1 != 0);
-+    *dst = filter_light1(p1, p2, pn);
-+}
-+
-+#define filter_strong FUNC(filter_strong)
-+static inline void filter_strong(pixel * dst, const unsigned int p0, const unsigned int p1, unsigned int n)
-+{
-+    unsigned int a = 64 * p0 + 32;
-+    const int v = p1 - p0;
-+
-+    do
-+    {
-+        *dst++ = (a += v) >> 6;
-+    } while (--n != 0);
-+}
-+
-+#define intra_filter FUNC(intra_filter)
-+static av_always_inline void intra_filter(
-+    pixel * const left, pixel * const top,
-+    const unsigned int req, const unsigned int avail,
-+    const pixel * const src_l, const pixel * const src_u, const pixel * const src_ur,
-+    const unsigned int stride,
-+    const unsigned int top_right_size, const unsigned int down_left_size,
-+    const unsigned int log2_size)
-+{
-+    const unsigned int strong_threshold = 1 << (BIT_DEPTH - 5);
-+    const unsigned int size = 1 << log2_size;
-+
-+    // a_ is the first pel in a section working round dl -> ur
-+    // b_ is the last
-+    // Beware that top & left work out from UL so usage of a_ & b_ may
-+    // swap between them.  It is a bad naming scheme but I have found no
-+    // better
-+    const pixel * a_dl = src_l + (down_left_size + size - 1) * stride;
-+    const pixel * b_dl = src_l + size * stride;
-+    const pixel * a_l  = src_l + (size - 1) * stride;
-+    const pixel * b_l  = src_l;
-+    const pixel * ab_ul = src_l - stride;
-+    const pixel * a_u = src_u;
-+    const pixel * b_u = src_u + size - 1;
-+    const pixel * a_ur = src_ur;
-+    const pixel * b_ur = src_ur + top_right_size - 1;
-+
-+    const unsigned int want = req & ~avail;
-+    const unsigned int have = req & avail;
-+    unsigned int i;
-+
-+    if ((avail & AVAIL_DL) == 0)
-+    {
-+        a_dl = a_ur;
-+        if ((avail & AVAIL_U) != 0)
-+            a_dl = a_u;
-+        if ((avail & AVAIL_UL) != 0)
-+            a_dl = ab_ul;
-+        if ((avail & AVAIL_L) != 0)
-+            a_dl = a_l;
-+        b_dl = a_dl;
-+    }
-+
-+    if ((avail & AVAIL_L) == 0)
-+    {
-+        a_l = b_dl;
-+        b_l = b_dl;
-+    }
-+    if ((avail & AVAIL_UL) == 0)
-+    {
-+        ab_ul = b_l;
-+    }
-+    if ((avail & AVAIL_U) == 0)
-+    {
-+        a_u = ab_ul;
-+        b_u = ab_ul;
-+    }
-+    if ((avail & AVAIL_UR) == 0)
-+    {
-+        a_ur = b_u;
-+        b_ur = b_u;
-+    }
-+
-+    if ((req & FILTER_LIGHT) == 0 || PRED_C || log2_size == 2)  // PRED_C, log2_size compiler opt hints
-+    {
-+        if ((req & AVAIL_UL) != 0)
-+            left[-1] = *ab_ul;
-+
-+        if ((want & AVAIL_L) != 0)
-+            EXTEND(left, *a_l, size);
-+        if ((want & AVAIL_DL) != 0)
-+            EXTEND(left + size, *a_dl, size);
-+        if ((want & AVAIL_U) != 0)
-+            EXTEND(top, *a_u, size);
-+        if ((want & AVAIL_UR) != 0)
-+            EXTEND(top + size, *a_ur, size);
-+
-+        if ((have & AVAIL_U) != 0)
-+            // Always good - even with sand
-+            memcpy(top, a_u, size * sizeof(pixel));
-+        if ((have & AVAIL_UR) != 0)
-+        {
-+            memcpy(top + size, a_ur, top_right_size * sizeof(pixel));
-+            EXTEND(top + size + top_right_size, *b_ur,
-+                   size - top_right_size);
-+        }
-+        if ((have & AVAIL_L) != 0)
-+        {
-+            for (i = 0; i < size; i++)
-+                left[i] = b_l[stride * i];
-+        }
-+        if ((have & AVAIL_DL) != 0)
-+        {
-+            for (i = 0; i < down_left_size; i++)
-+                left[i + size] = b_dl[stride * i];
-+            EXTEND(left + size + down_left_size, *a_dl,
-+                   size - down_left_size);
-+        }
-+    }
-+    else if ((req & FILTER_STRONG) != 0 && log2_size == 5 && // log2_size compiler opt hint
-+            FFABS((int)(*a_dl - *a_l * 2 + *ab_ul)) < strong_threshold &&
-+            FFABS((int)(*ab_ul - *b_u * 2 + *b_ur)) < strong_threshold)
-+    {
-+        if ((req & (AVAIL_U | AVAIL_UR)) != 0)
-+            filter_strong(top, *ab_ul, *b_ur, size * 2);
-+        left[-1] = *ab_ul;
-+        if ((req & (AVAIL_L | AVAIL_DL)) != 0)
-+            filter_strong(left, *ab_ul, *a_dl, size*2);
-+    }
-+    else
-+    {
-+        // Same code for both have & want for UL
-+        if ((req & AVAIL_UL) != 0)
-+        {
-+            left[-1] = filter_light1(*b_l, *ab_ul, *a_u);
-+        }
-+
-+        if ((want & AVAIL_L) != 0)
-+        {
-+            EXTEND(left, *a_l, size);
-+            left[0] = (*a_l * 3 + *ab_ul + 2) >> 2;
-+        }
-+        if ((want & AVAIL_DL) != 0)
-+        {
-+            // If we want DL then it cannot be avail so a_dl = a_l so no edge rounding
-+            EXTEND(left + size, *a_l, size);
-+        }
-+        if ((want & AVAIL_U) != 0)
-+        {
-+            EXTEND(top, *a_u, size);
-+            top[size - 1] = (*a_u * 3 + *a_ur + 2) >> 2;
-+        }
-+        if ((want & AVAIL_UR) != 0)
-+        {
-+            // If we want UR then it cannot be avail so a_ur = b_u so no edge rounding
-+            EXTEND(top + size, *a_ur, size);
-+        }
-+
-+        if ((have & AVAIL_U) != 0)
-+        {
-+            filter_light(top, *ab_ul, a_u, *a_ur, 1, size);
-+        }
-+        if ((have & AVAIL_UR) != 0) {
-+            filter_light(top + size, *b_u, a_ur, *b_ur, 1, top_right_size);
-+            top[size*2 - 1] = *b_ur;
-+            EXTEND(top + size + top_right_size, *b_ur, size - top_right_size);
-+        }
-+        if ((have & AVAIL_L) != 0)
-+        {
-+            filter_light(left, *ab_ul, b_l, *b_dl, stride, size);
-+        }
-+        if ((have & AVAIL_DL) != 0)
-+        {
-+            filter_light(left + size, *a_l, b_dl, *a_dl, stride, down_left_size);
-+            left[size*2 - 1] = *a_dl;
-+            EXTEND(left + size + down_left_size, *a_dl, size - down_left_size);
-+        }
-+    }
-+}
-+
-+#define INTRA_FILTER(log2_size) \
-+static void FUNC(intra_filter_ ## log2_size)( \
-+     uint8_t * const left, uint8_t * const top, \
-+     const unsigned int req, const unsigned int avail, \
-+     const uint8_t * const src_l, const uint8_t * const src_u, const uint8_t * const src_ur, \
-+     const unsigned int stride, \
-+     const unsigned int top_right_size, const unsigned int down_left_size) \
-+{ \
-+    intra_filter((pixel *)left, (pixel *)top, req, avail, \
-+        (const pixel *)src_l, (const pixel *)src_u, (const pixel *)src_ur, stride / sizeof(pixel), top_right_size, down_left_size, log2_size); \
-+}
-+
-+INTRA_FILTER(2)
-+INTRA_FILTER(3)
-+INTRA_FILTER(4)
-+INTRA_FILTER(5)
-+
-+#undef intra_filter
-+#undef INTRA_FILTER
-+
-+static void FUNC(intra_pred)(const HEVCRpiContext * const s,
-+                                              const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0, const unsigned int avail,
-+                                              const unsigned int log2_size)
-+{
-+    // c_idx will alaways be 1 for _c versions and 0 for y
-+    const unsigned int c_idx = PRED_C;
-+    const unsigned int hshift = ctx_hshift(s, c_idx);
-+    const unsigned int vshift = ctx_vshift(s, c_idx);
-+    const unsigned int size = (1 << log2_size);
-+    const unsigned int x = x0 >> hshift;
-+    const unsigned int y = y0 >> vshift;
-+
-+    const ptrdiff_t stride = frame_stride1(s->frame, c_idx) / sizeof(pixel);
-+    pixel *const src = c_idx == 0 ?
-+        (pixel *)av_rpi_sand_frame_pos_y(s->frame, x, y) :
-+        (pixel *)av_rpi_sand_frame_pos_c(s->frame, x, y);
-+
-+    // Align so we can do multiple loads in the asm
-+    // Padded to 16 byte boundary so as not to confuse anything
-+    DECLARE_ALIGNED(16, pixel, top[2 * MAX_TB_SIZE]);
-+    DECLARE_ALIGNED(16, pixel, left_array[2 * MAX_TB_SIZE + 16 / sizeof(pixel)]);
-+
-+    pixel  * const left  = left_array  + 16 / sizeof(pixel);
-+    const pixel * top_pred = top;
-+
-+    const pixel * src_l = src - 1;
-+    const pixel * src_u = src - stride;
-+    const pixel * src_ur = src_u + size;
-+#if !PRED_C
-+    const unsigned int req = req_avail[log2_size - 2][mode] & ~s->ps.sps->intra_filters_disable;
-+#else
-+    const unsigned int req = req_avail_c[mode];
-+#endif
-+
-+    // If we have nothing to pred from then fill with grey
-+    // This isn't a common case but dealing with it here means we don't have to
-+    // test for it later
-+    if (avail == 0)
-+    {
-+dc_only:
-+#if !PRED_C
-+        s->hpc.pred_dc0[log2_size - 2]((uint8_t *)src, stride);
-+#else
-+        s->hpc.pred_dc0_c[log2_size - 2]((uint8_t *)src, stride);
-+#endif
-+        return;
-+    }
-+
-+    {
-+        // N.B. stride is in pixels (not bytes) or in the case of chroma pixel-pairs
-+        const AVFrame * const frame = s->frame;
-+        const unsigned int mask = stride - 1; // For chroma pixel=uint16 so stride_c is stride_y / 2
-+        const unsigned int stripe_adj = (av_rpi_sand_frame_stride2(frame) - 1) * stride;
-+        if ((x & mask) == 0)
-+            src_l -= stripe_adj;
-+        if (((x + size) & mask) == 0)
-+            src_ur += stripe_adj;
-+    }
-+
-+    // Can deal with I-slices in 'normal' code even if CIP
-+    // This also means that we don't need to generate (elsewhere) is_intra
-+    // for IRAP frames
-+    if (s->ps.pps->constrained_intra_pred_flag == 1 &&
-+        s->sh.slice_type != HEVC_SLICE_I)
-+    {
-+        // * If we ever actually care about CIP performance then we should
-+        //   special case out size 4 stuff (can be done by 'normal') and
-+        //   have 8-pel avail masks
-+        unsigned int avail_l = cip_avail_l(s->is_intra + ((y + size * 2 - 1) >> (3 - vshift)) * s->ps.sps->pcm_width + ((x - 1) >> (6 - hshift)),
-+                                           -(int)(s->ps.sps->pcm_width),
-+                                           1 << (((x - 1) >> (3 - hshift)) & 7),
-+                                           1 - hshift,
-+                                           avail,
-+                                           size,
-+                                           FFMIN(size, ((s->ps.sps->height - y0) >> vshift) - size),
-+                                           vshift != 0 ? 0 : (y >> 2) & 1);
-+
-+        unsigned int avail_u = cip_avail_u(s->is_intra + ((y - 1) >> (3 - vshift)) * s->ps.sps->pcm_width + (x >> (6 - hshift)),
-+                                           (x >> (3 - hshift)) & 7,
-+                                           1 - hshift,
-+                                           avail,
-+                                           size,
-+                                           FFMIN(size, ((s->ps.sps->width - x0) >> hshift) - size),
-+                                           hshift != 0 ? 0 : (x >> 2) & 1);
-+
-+        // Anything left?
-+        if ((avail_l | avail_u) == 0)
-+            goto dc_only;
-+
-+        FUNC(cip_fill)(left, top, avail_l, avail_u, src_l, src_u, src_ur, stride, size);
-+
-+#if !PRED_C
-+        if ((req & FILTER_LIGHT) != 0)
-+        {
-+            const unsigned threshold = 1 << (BIT_DEPTH - 5);
-+            if ((req & FILTER_STRONG) != 0 &&
-+                (int)(FFABS(left[-1]  + top[63] - 2 * top[31]))  < threshold &&
-+                (int)(FFABS(left[-1] + left[63] - 2 * left[31])) < threshold)
-+            {
-+                filter_strong(top, left[-1], top[63], 64);
-+                filter_strong(left, left[-1], left[63], 64);
-+            } else
-+            {
-+                // LHS writes UL too so copy for top
-+                const pixel p_ul = left[-1];
-+                filter_light(left - 1, top[0], left - 1, left[2*size - 1], 1, 2*size);
-+                filter_light(top, p_ul, top, top[2*size - 1], 1, 2*size - 1);
-+            }
-+        }
-+#endif
-+    }
-+    else
-+    {
-+        const unsigned int ur_size = FFMIN(size, ((s->ps.sps->width - x0) >> hshift) - size);
-+        if ((req & ~((AVAIL_UR | AVAIL_U) & avail)) == 0 &&
-+            ((req & AVAIL_UR) == 0 || src_u + 2*size == src_ur + ur_size))
-+        {
-+            top_pred = src_u;
-+        }
-+        else
-+        {
-+#if !PRED_C
-+            s->hpc.intra_filter[log2_size - 2]
-+#else
-+            s->hpc.intra_filter_c[log2_size - 2]
-+#endif
-+                ((uint8_t *)left, (uint8_t *)top, req, avail,
-+                 (const uint8_t *)src_l, (const uint8_t *)src_u, (const uint8_t *)src_ur, stride * sizeof(pixel),
-+                              ur_size,
-+                              FFMIN(size, ((s->ps.sps->height - y0) >> vshift) - size));
-+        }
-+    }
-+
-+
-+#if !PRED_C
-+    switch (mode) {
-+    case INTRA_PLANAR:
-+        s->hpc.pred_planar[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
-+                                          (uint8_t *)left, stride);
-+        break;
-+    case INTRA_DC:
-+        s->hpc.pred_dc[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
-+                       (uint8_t *)left, stride);
-+        break;
-+    case INTRA_ANGULAR_HORIZONTAL:
-+        s->hpc.pred_horizontal[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
-+                                           (uint8_t *)left, stride,
-+                                           mode);
-+        break;
-+    case INTRA_ANGULAR_VERTICAL:
-+        s->hpc.pred_vertical[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
-+                                           (uint8_t *)left, stride,
-+                                           mode);
-+        break;
-+    default:
-+        s->hpc.pred_angular[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
-+                                           (uint8_t *)left, stride,
-+                                           mode);
-+        break;
-+    }
-+#else
-+    switch (mode) {
-+    case INTRA_PLANAR:
-+        s->hpc.pred_planar_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
-+                                          (uint8_t *)left, stride);
-+        break;
-+    case INTRA_DC:
-+        s->hpc.pred_dc_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
-+                       (uint8_t *)left, stride);
-+        break;
-+    case INTRA_ANGULAR_HORIZONTAL:
-+        s->hpc.pred_horizontal_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
-+                                           (uint8_t *)left, stride,
-+                                           mode);
-+        break;
-+    case INTRA_ANGULAR_VERTICAL:
-+        s->hpc.pred_vertical_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
-+                                           (uint8_t *)left, stride,
-+                                           mode);
-+        break;
-+    default:
-+        s->hpc.pred_angular_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
-+                                           (uint8_t *)left, stride,
-+                                           mode);
-+        break;
-+    }
-+
-+#if DUMP_PRED
-+    printf("U pred @ %d, %d: mode=%d\n", x, y, mode);
-+    dump_pred_uv((uint8_t *)src, stride, 1 << log2_size);
-+    printf("V pred @ %d, %d: mode=%d\n", x, y, mode);
-+    dump_pred_uv((uint8_t *)src + 1, stride, 1 << log2_size);
-+#endif
-+#endif
-+}
-+
-+#if !PRED_C
-+static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_top,
-+                                  const uint8_t *_left, ptrdiff_t stride,
-+                                  int trafo_size)
-+{
-+    int x, y;
-+    pixel *src        = (pixel *)_src;
-+    const pixel *top  = (const pixel *)_top;
-+    const pixel *left = (const pixel *)_left;
-+    int size = 1 << trafo_size;
-+    for (y = 0; y < size; y++)
-+        for (x = 0; x < size; x++)
-+            POS(x, y) = ((size - 1 - x) * left[y] + (x + 1) * top[size]  +
-+                         (size - 1 - y) * top[x]  + (y + 1) * left[size] + size) >> (trafo_size + 1);
-+}
-+#else
-+static av_always_inline void FUNC(pred_planar)(uint8_t * _src, const uint8_t * _top,
-+                                  const uint8_t * _left, ptrdiff_t stride,
-+                                  int trafo_size)
-+{
-+    int x, y;
-+    int size = 1 << trafo_size;
-+    c_dst_ptr_t src = (c_dst_ptr_t)_src;
-+    const c_src_ptr_t top = (c_src_ptr_t)_top;
-+    const c_src_ptr_t left = (c_src_ptr_t)_left;
-+
-+    for (y = 0; y < size; y++, src += stride)
-+    {
-+        for (x = 0; x < size; x++)
-+        {
-+            src[x][0] = ((size - 1 - x) * left[y][0] + (x + 1) * top[size][0]  +
-+                         (size - 1 - y) * top[x][0]  + (y + 1) * left[size][0] + size) >> (trafo_size + 1);
-+            src[x][1] = ((size - 1 - x) * left[y][1] + (x + 1) * top[size][1]  +
-+                         (size - 1 - y) * top[x][1]  + (y + 1) * left[size][1] + size) >> (trafo_size + 1);
-+        }
-+    }
-+}
-+#endif
-+
-+#define PRED_PLANAR(size)\
-+static void FUNC(pred_planar_ ## size)(uint8_t *src, const uint8_t *top,        \
-+                                       const uint8_t *left, ptrdiff_t stride)   \
-+{                                                                               \
-+    FUNC(pred_planar)(src, top, left, stride, size + 2);                        \
-+}
-+
-+PRED_PLANAR(0)
-+PRED_PLANAR(1)
-+PRED_PLANAR(2)
-+PRED_PLANAR(3)
-+
-+#undef PRED_PLANAR
-+
-+#if !PRED_C
-+static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top,
-+                          const uint8_t *_left,
-+                          ptrdiff_t stride, int log2_size)
-+{
-+    int i, j, x, y;
-+    int size          = (1 << log2_size);
-+    pixel *src        = (pixel *)_src;
-+    const pixel *top  = (const pixel *)_top;
-+    const pixel *left = (const pixel *)_left;
-+    int dc            = size;
-+    pixel4 a;
-+    for (i = 0; i < size; i++)
-+        dc += left[i] + top[i];
-+
-+    dc >>= log2_size + 1;
-+
-+    a = PIXEL_SPLAT_X4(dc);
-+
-+    for (i = 0; i < size; i++)
-+        for (j = 0; j < size; j+=4)
-+            AV_WN4P(&POS(j, i), a);
-+
-+//    if (c_idx == 0 && size < 32)
-+// As we now have separate fns for y & c - no need to test that
-+    if (size < 32)
-+    {
-+        POS(0, 0) = (left[0] + 2 * dc + top[0] + 2) >> 2;
-+        for (x = 1; x < size; x++)
-+            POS(x, 0) = (top[x] + 3 * dc + 2) >> 2;
-+        for (y = 1; y < size; y++)
-+            POS(0, y) = (left[y] + 3 * dc + 2) >> 2;
-+    }
-+}
-+#else
-+static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top,
-+                          const uint8_t *_left,
-+                          ptrdiff_t stride, int log2_size)
-+{
-+    unsigned int i, j;
-+    const unsigned int size = (1 << log2_size);
-+    c_dst_ptr_t src = (c_dst_ptr_t)_src;
-+    const c_src_ptr_t top = (c_src_ptr_t)_top;
-+    const c_src_ptr_t left = (c_src_ptr_t)_left;
-+    unsigned int dc0 = size;
-+    unsigned int dc1 = size;
-+
-+    for (i = 0; i < size; i++)
-+    {
-+        dc0 += left[i][0] + top[i][0];
-+        dc1 += left[i][1] + top[i][1];
-+    }
-+
-+    dc0 >>= log2_size + 1;
-+    dc1 >>= log2_size + 1;
-+
-+    for (i = 0; i < size; i++, src += stride)
-+    {
-+        for (j = 0; j < size; ++j)
-+        {
-+            src[j][0] = dc0;
-+            src[j][1] = dc1;
-+
-+        }
-+    }
-+}
-+#endif
-+
-+#define PRED_DC(size)\
-+static void FUNC(pred_dc_ ## size)(uint8_t *src, const uint8_t *top,        \
-+                                       const uint8_t *left, ptrdiff_t stride)   \
-+{                                                                               \
-+    FUNC(pred_dc)(src, top, left, stride, size + 2);                        \
-+}
-+
-+PRED_DC(0)
-+PRED_DC(1)
-+PRED_DC(2)
-+PRED_DC(3)
-+
-+#undef PRED_DC
-+
-+
-+
-+
-+#if !PRED_C
-+static void FUNC(pred_dc0)(uint8_t *_src, ptrdiff_t stride, int log2_size)
-+{
-+    int i, j;
-+    int size          = (1 << log2_size);
-+    pixel *src        = (pixel *)_src;
-+    pixel4 a = PIXEL_SPLAT_X4(1 << (BIT_DEPTH - 1));
-+
-+    for (i = 0; i < size; i++)
-+        for (j = 0; j < size; j+=4)
-+            AV_WN4P(&POS(j, i), a);
-+}
-+#else
-+static void FUNC(pred_dc0)(uint8_t *_src, ptrdiff_t stride, int log2_size)
-+{
-+    unsigned int i, j;
-+    const unsigned int size = (1 << log2_size);
-+    c_dst_ptr_t src = (c_dst_ptr_t)_src;
-+    const pixel a = (1 << (BIT_DEPTH - 1));
-+
-+    for (i = 0; i < size; i++, src += stride)
-+    {
-+        for (j = 0; j < size; ++j)
-+        {
-+            src[j][0] = a;
-+            src[j][1] = a;
-+        }
-+    }
-+}
-+#endif
-+
-+#define PRED_DC0(size)\
-+static void FUNC(pred_dc0_ ## size)(uint8_t *src, ptrdiff_t stride)   \
-+{                                                                               \
-+    FUNC(pred_dc0)(src, stride, size + 2);                        \
-+}
-+
-+PRED_DC0(0)
-+PRED_DC0(1)
-+PRED_DC0(2)
-+PRED_DC0(3)
-+
-+#undef PRED_DC0
-+
-+
-+
-+
-+#ifndef ANGLE_CONSTS
-+#define ANGLE_CONSTS
-+static const int intra_pred_angle[] = {
-+     32,  26,  21,  17, 13,  9,  5, 2, 0, -2, -5, -9, -13, -17, -21, -26, -32,
-+    -26, -21, -17, -13, -9, -5, -2, 0, 2,  5,  9, 13,  17,  21,  26,  32
-+};
-+static const int inv_angle[] = {
-+    -4096, -1638, -910, -630, -482, -390, -315, -256, -315, -390, -482,
-+    -630, -910, -1638, -4096
-+};
-+#endif
-+
-+#if !PRED_C
-+static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
-+                                                const uint8_t *_top,
-+                                                const uint8_t *_left,
-+                                                ptrdiff_t stride,
-+                                                int mode, int size)
-+{
-+    int x, y;
-+    pixel *src        = (pixel *)_src;
-+    const pixel *top  = (const pixel *)_top;
-+    const pixel *left = (const pixel *)_left;
-+
-+    int angle = intra_pred_angle[mode - 2];
-+    pixel ref_array[3 * MAX_TB_SIZE + 4];
-+    pixel *ref_tmp = ref_array + size;
-+    const pixel *ref;
-+    int last = (size * angle) >> 5;
-+
-+    if (mode >= 18) {
-+        ref = top - 1;
-+
-+        if (angle < 0)
-+        {
-+            memcpy(ref_tmp + 1, top, size * PW);
-+            ref_tmp[0] = left[-1];
-+
-+            for (x = last; x <= -1; x++)
-+                ref_tmp[x] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)];
-+            ref = ref_tmp;
-+        }
-+
-+        for (y = 0; y < size; y++) {
-+            int idx  = ((y + 1) * angle) >> 5;
-+            int fact = ((y + 1) * angle) & 31;
-+            if (fact) {
-+                for (x = 0; x < size; x += 4) {
-+                    POS(x    , y) = ((32 - fact) * ref[x + idx + 1] +
-+                                           fact  * ref[x + idx + 2] + 16) >> 5;
-+                    POS(x + 1, y) = ((32 - fact) * ref[x + 1 + idx + 1] +
-+                                           fact  * ref[x + 1 + idx + 2] + 16) >> 5;
-+                    POS(x + 2, y) = ((32 - fact) * ref[x + 2 + idx + 1] +
-+                                           fact  * ref[x + 2 + idx + 2] + 16) >> 5;
-+                    POS(x + 3, y) = ((32 - fact) * ref[x + 3 + idx + 1] +
-+                                           fact  * ref[x + 3 + idx + 2] + 16) >> 5;
-+                }
-+            } else {
-+                for (x = 0; x < size; x += 4)
-+                    AV_WN4P(&POS(x, y), AV_RN4P(&ref[x + idx + 1]));
-+            }
-+        }
-+        if (mode == 26 && size < 32) {
-+            for (y = 0; y < size; y++)
-+                POS(0, y) = av_clip_pixel(top[0] + ((left[y] - left[-1]) >> 1));
-+        }
-+
-+    } else {
-+        ref = left - 1;
-+        if (angle < 0 && last < -1) {
-+            for (x = 0; x <= size; x += 4)
-+                AV_WN4P(&ref_tmp[x], AV_RN4P(&left[x - 1]));
-+            // Inv angle <= -256 so top offset >= 0
-+            for (x = last; x <= -1; x++)
-+                ref_tmp[x] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)];
-+            ref = ref_tmp;
-+        }
-+
-+        for (x = 0; x < size; x++) {
-+            int idx  = ((x + 1) * angle) >> 5;
-+            int fact = ((x + 1) * angle) & 31;
-+            if (fact) {
-+                for (y = 0; y < size; y++) {
-+                    POS(x, y) = ((32 - fact) * ref[y + idx + 1] +
-+                                       fact  * ref[y + idx + 2] + 16) >> 5;
-+                }
-+            } else {
-+                for (y = 0; y < size; y++)
-+                    POS(x, y) = ref[y + idx + 1];
-+            }
-+        }
-+        if (mode == 10 && size < 32) {
-+            for (x = 0; x < size; x += 4) {
-+                POS(x,     0) = av_clip_pixel(left[0] + ((top[x    ] - left[-1]) >> 1));
-+                POS(x + 1, 0) = av_clip_pixel(left[0] + ((top[x + 1] - left[-1]) >> 1));
-+                POS(x + 2, 0) = av_clip_pixel(left[0] + ((top[x + 2] - left[-1]) >> 1));
-+                POS(x + 3, 0) = av_clip_pixel(left[0] + ((top[x + 3] - left[-1]) >> 1));
-+            }
-+        }
-+    }
-+}
-+#else
-+static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
-+                                                const uint8_t *_top,
-+                                                const uint8_t *_left,
-+                                                ptrdiff_t stride,
-+                                                int mode, int size)
-+{
-+    int x, y;
-+    c_dst_ptr_t src  = (c_dst_ptr_t)_src;
-+    c_src_ptr_t top  = (c_src_ptr_t)_top;
-+    c_src_ptr_t left = (c_src_ptr_t)_left;
-+
-+    const int angle = intra_pred_angle[mode - 2];
-+    cpel ref_array[3 * MAX_TB_SIZE + 4][2];
-+    c_dst_ptr_t ref_tmp = ref_array + size;
-+    c_src_ptr_t ref;
-+    const int last = (size * angle) >> 5;
-+
-+    if (mode >= 18) {
-+        ref = top - 1;
-+        if (angle < 0) {
-+            memcpy(ref_tmp + 1, top, size * 2 * PW);
-+            ref_tmp[0][0] = left[-1][0];
-+            ref_tmp[0][1] = left[-1][1];
-+            for (x = last; x <= -1; x++)
-+            {
-+                ref_tmp[x][0] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0];
-+                ref_tmp[x][1] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1];
-+            }
-+            ref = (c_src_ptr_t)ref_tmp;
-+        }
-+
-+        for (y = 0; y < size; y++, src += stride) {
-+            const int idx  = ((y + 1) * angle) >> 5;
-+            const int fact = ((y + 1) * angle) & 31;
-+            if (fact) {
-+                for (x = 0; x < size; ++x) {
-+                    src[x][0] = ((32 - fact) * ref[x + idx + 1][0] +
-+                                       fact  * ref[x + idx + 2][0] + 16) >> 5;
-+                    src[x][1] = ((32 - fact) * ref[x + idx + 1][1] +
-+                                       fact  * ref[x + idx + 2][1] + 16) >> 5;
-+                }
-+            } else {
-+                memcpy(src, ref + idx + 1, size * 2 * PW);
-+            }
-+        }
-+    } else {
-+        ref = left - 1;
-+        if (angle < 0 && last < -1) {
-+            memcpy(ref_tmp, left - 1, (size + 1) * 2 * PW);
-+            for (x = last; x <= -1; x++)
-+            {
-+                ref_tmp[x][0] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0];
-+                ref_tmp[x][1] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1];
-+            }
-+            ref = (c_src_ptr_t)ref_tmp;
-+        }
-+
-+        for (x = 0; x < size; x++, src++) {
-+            const int idx  = ((x + 1) * angle) >> 5;
-+            const int fact = ((x + 1) * angle) & 31;
-+            if (fact) {
-+                for (y = 0; y < size; y++) {
-+                    src[y * stride][0] = ((32 - fact) * ref[y + idx + 1][0] +
-+                                       fact  * ref[y + idx + 2][0] + 16) >> 5;
-+                    src[y * stride][1] = ((32 - fact) * ref[y + idx + 1][1] +
-+                                       fact  * ref[y + idx + 2][1] + 16) >> 5;
-+                }
-+            } else {
-+                for (y = 0; y < size; y++)
-+                {
-+                    src[y * stride][0] = ref[y + idx + 1][0];
-+                    src[y * stride][1] = ref[y + idx + 1][1];
-+                }
-+            }
-+        }
-+    }
-+}
-+#endif
-+
-+static void FUNC(pred_angular_0)(uint8_t *src, const uint8_t *top,
-+                                 const uint8_t *left,
-+                                 ptrdiff_t stride, int mode)
-+{
-+    FUNC(pred_angular)(src, top, left, stride, mode, 1 << 2);
-+}
-+
-+static void FUNC(pred_angular_1)(uint8_t *src, const uint8_t *top,
-+                                 const uint8_t *left,
-+                                 ptrdiff_t stride, int mode)
-+{
-+    FUNC(pred_angular)(src, top, left, stride, mode, 1 << 3);
-+}
-+
-+static void FUNC(pred_angular_2)(uint8_t *src, const uint8_t *top,
-+                                 const uint8_t *left,
-+                                 ptrdiff_t stride, int mode)
-+{
-+    FUNC(pred_angular)(src, top, left, stride, mode, 1 << 4);
-+}
-+
-+static void FUNC(pred_angular_3)(uint8_t *src, const uint8_t *top,
-+                                 const uint8_t *left,
-+                                 ptrdiff_t stride, int mode)
-+{
-+    FUNC(pred_angular)(src, top, left, stride, mode, 1 << 5);
-+}
-+
-+#undef cpel
-+#undef c_src_ptr_t
-+#undef c_dst_ptr_t
-+
-+#undef EXTEND
-+#undef POS
-+#undef PW
-+
-+#undef filter_light1
-+#undef filter_light
-+#undef filter_strong
-+#undef ref_gen
-+
-+#ifndef INCLUDED_ONCE
-+#define INCLUDED_ONCE
-+#endif
-+
---- /dev/null
-+++ b/libavcodec/rpi_mailbox.c
-@@ -0,0 +1,155 @@
-+/*
-+Copyright (c) 2012, Broadcom Europe Ltd.
-+All rights reserved.
-+
-+Redistribution and use in source and binary forms, with or without
-+modification, are permitted provided that the following conditions are met:
-+    * Redistributions of source code must retain the above copyright
-+      notice, this list of conditions and the following disclaimer.
-+    * Redistributions in binary form must reproduce the above copyright
-+      notice, this list of conditions and the following disclaimer in the
-+      documentation and/or other materials provided with the distribution.
-+    * Neither the name of the copyright holder nor the
-+      names of its contributors may be used to endorse or promote products
-+      derived from this software without specific prior written permission.
-+
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
-+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+*/
-+
-+#include <stdio.h>
-+#include <string.h>
-+#include <stdlib.h>
-+#include <fcntl.h>
-+#include <unistd.h>
-+#include <assert.h>
-+#include <stdint.h>
-+#include <sys/ioctl.h>
-+
-+#include <linux/ioctl.h>
-+
-+#define MAJOR_NUM 100
-+#define IOCTL_MBOX_PROPERTY _IOWR(MAJOR_NUM, 0, char *)
-+#define DEVICE_FILE_NAME "/dev/vcio"
-+
-+#include "rpi_mailbox.h"
-+//#include <interface/vctypes/vc_image_structs.h>
-+
-+/*
-+ * use ioctl to send mbox property message
-+ */
-+
-+static int mbox_property(int file_desc, void *buf)
-+{
-+   int ret_val = ioctl(file_desc, IOCTL_MBOX_PROPERTY, buf);
-+
-+   if (ret_val < 0) {
-+      printf("ioctl_set_msg failed:%d\n", ret_val);
-+   }
-+
-+#ifdef DEBUG
-+   unsigned *p = buf; int i; unsigned size = *(unsigned *)buf;
-+   for (i=0; i<size/4; i++)
-+      printf("%04x: 0x%08x\n", i*sizeof *p, p[i]);
-+#endif
-+   return ret_val;
-+}
-+
-+#define GET_VCIMAGE_PARAMS 0x30044
-+
-+int mbox_get_image_params(int fd, VC_IMAGE_T * img)
-+{
-+    uint32_t buf[sizeof(*img) / sizeof(uint32_t) + 32];
-+    uint32_t * p = buf;
-+    void * rimg;
-+    int rv;
-+
-+    *p++ = 0; // size
-+    *p++ = 0; // process request
-+    *p++ = GET_VCIMAGE_PARAMS;
-+    *p++ = sizeof(*img);
-+    *p++ = sizeof(*img);
-+    rimg = p;
-+    memcpy(p, img, sizeof(*img));
-+    p += sizeof(*img) / sizeof(*p);
-+    *p++ = 0;  // End tag
-+    buf[0] = (p - buf) * sizeof(*p);
-+
-+    rv = mbox_property(fd, buf);
-+    memcpy(img, rimg, sizeof(*img));
-+
-+    return rv;
-+}
-+
-+
-+#define SET_CLOCK_RATE 0x00038002
-+#define GET_MAX_CLOCK 0x00030004
-+#define CLOCK_HEVC 11
-+
-+static int mbox_property_generic(int fd, unsigned command, unsigned *word0, unsigned *word1)
-+{
-+    uint32_t buf[32];
-+    uint32_t * p = buf;
-+    int rv;
-+
-+    *p++ = 0; // size
-+    *p++ = 0; // process request
-+    *p++ = command;
-+    *p++ = 8;
-+    *p++ = 8;
-+    *p++ = *word0;
-+    *p++ = *word1;
-+    *p++ = 0;  // End tag
-+    buf[0] = (p - buf) * sizeof(*p);
-+
-+    rv = mbox_property(fd, buf);
-+    *word0 = buf[6];
-+    *word1 = buf[7];
-+    return rv;
-+}
-+
-+int mbox_open() {
-+   int file_desc;
-+
-+   // open a char device file used for communicating with kernel mbox driver
-+   file_desc = open(DEVICE_FILE_NAME, 0);
-+   if (file_desc < 0) {
-+      printf("Can't open device file: %s\n", DEVICE_FILE_NAME);
-+      printf("Try creating a device file with: sudo mknod %s c %d 0\n", DEVICE_FILE_NAME, MAJOR_NUM);
-+   }
-+   return file_desc;
-+}
-+
-+void mbox_close(int file_desc) {
-+  close(file_desc);
-+}
-+
-+int mbox_request_clock(int fd) {
-+   int rv;
-+   unsigned word0, word1 = 0;
-+   word0 = CLOCK_HEVC;
-+   rv = mbox_property_generic(fd, GET_MAX_CLOCK, &word0, &word1);
-+   if (rv != 0)
-+      return rv;
-+   word1 = word0;
-+   word0 = CLOCK_HEVC;
-+   rv = mbox_property_generic(fd, SET_CLOCK_RATE, &word0, &word1);
-+   return rv;
-+}
-+
-+int mbox_release_clock(int fd) {
-+  int rv;
-+  unsigned word0, word1 = 0;
-+  word0 = CLOCK_HEVC;
-+  word1 = 0;
-+  rv = mbox_property_generic(fd, SET_CLOCK_RATE, &word0, &word1);
-+  return rv;
-+}
---- /dev/null
-+++ b/libavcodec/rpi_mailbox.h
-@@ -0,0 +1,58 @@
-+#ifndef RPI_MAILBOX_H
-+#define RPI_MAILBOX_H
-+
-+/* The image structure. */
-+typedef struct vc_image_extra_uv_s {
-+  void *u, *v;
-+  int vpitch;
-+} VC_IMAGE_EXTRA_UV_T;
-+
-+typedef union {
-+    VC_IMAGE_EXTRA_UV_T uv;
-+//  VC_IMAGE_EXTRA_RGBA_T rgba;
-+//  VC_IMAGE_EXTRA_PAL_T pal;
-+//  VC_IMAGE_EXTRA_TF_T tf;
-+//  VC_IMAGE_EXTRA_BAYER_T bayer;
-+//  VC_IMAGE_EXTRA_MSBAYER_T msbayer;
-+//  VC_IMAGE_EXTRA_CODEC_T codec;
-+//  VC_IMAGE_EXTRA_OPENGL_T opengl;
-+} VC_IMAGE_EXTRA_T;
-+
-+
-+typedef struct VC_IMAGE_T {
-+  unsigned short                  type;           /* should restrict to 16 bits */
-+  unsigned short                  info;           /* format-specific info; zero for VC02 behaviour */
-+  unsigned short                  width;          /* width in pixels */
-+  unsigned short                  height;         /* height in pixels */
-+  int                             pitch;          /* pitch of image_data array in bytes */
-+  int                             size;           /* number of bytes available in image_data array */
-+  void                           *image_data;     /* pixel data */
-+  VC_IMAGE_EXTRA_T                extra;          /* extra data like palette pointer */
-+  void                           *metadata;       /* metadata header for the image */
-+  void                           *pool_object;    /* nonNULL if image was allocated from a vc_pool */
-+  int                             mem_handle;     /* the mem handle for relocatable memory storage */
-+  int                             metadata_size;  /* size of metadata of each channel in bytes */
-+  int                             channel_offset; /* offset of consecutive channels in bytes */
-+  uint32_t                        video_timestamp;/* 90000 Hz RTP times domain - derived from audio timestamp */
-+  uint8_t                         num_channels;   /* number of channels (2 for stereo) */
-+  uint8_t                         current_channel;/* the channel this header is currently pointing to */
-+  uint8_t                         linked_multichann_flag;/* Indicate the header has the linked-multichannel structure*/
-+  uint8_t                         is_channel_linked;     /* Track if the above structure is been used to link the header
-+                                                            into a linked-mulitchannel image */
-+  uint8_t                         channel_index;         /* index of the channel this header represents while
-+                                                            it is being linked. */
-+  uint8_t                         _dummy[3];      /* pad struct to 64 bytes */
-+} VC_IMAGE_T;
-+
-+typedef int vc_image_t_size_check[(sizeof(VC_IMAGE_T) == 64) * 2 - 1];
-+
-+
-+extern int mbox_open(void);
-+extern void mbox_close(int file_desc);
-+
-+int mbox_get_image_params(int fd, VC_IMAGE_T * img);
-+
-+int mbox_request_clock(int fd);
-+int mbox_release_clock(int fd);
-+
-+#endif
---- /dev/null
-+++ b/libavcodec/rpi_mem.c
-@@ -0,0 +1,326 @@
-+/*
-+Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
-+All rights reserved.
-+
-+Redistribution and use in source and binary forms, with or without
-+modification, are permitted provided that the following conditions are met:
-+    * Redistributions of source code must retain the above copyright
-+      notice, this list of conditions and the following disclaimer.
-+    * Redistributions in binary form must reproduce the above copyright
-+      notice, this list of conditions and the following disclaimer in the
-+      documentation and/or other materials provided with the distribution.
-+    * Neither the name of the copyright holder nor the
-+      names of its contributors may be used to endorse or promote products
-+      derived from this software without specific prior written permission.
-+
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
-+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+
-+Authors: John Cox
-+*/
-+
-+
-+#include <stdlib.h>
-+#include <string.h>
-+#include <stddef.h>
-+#include <stdint.h>
-+
-+#include "config.h"
-+
-+#include "libavutil/avassert.h"
-+#include "libavutil/rpi_sand_fns.h"
-+
-+#pragma GCC diagnostic push
-+// Many many redundant decls in the header files
-+#pragma GCC diagnostic ignored "-Wredundant-decls"
-+#include <bcm_host.h>
-+#include <interface/vctypes/vc_image_types.h>
-+#include <interface/vcsm/user-vcsm.h>
-+#pragma GCC diagnostic pop
-+
-+#include "rpi_mem.h"
-+#include "rpi_zc_frames.h"
-+
-+
-+#define OPT_PREFER_CMA 0
-+
-+struct rpi_cache_flush_env_s {
-+  struct vcsm_user_clean_invalid2_s v;
-+};
-+
-+
-+// GPU memory alloc fns (internal)
-+
-+static void gpu_free_internal(GPU_MEM_PTR_T * const p)
-+{
-+    if (p->arm != NULL)
-+        vcsm_unlock_ptr(p->arm);
-+    if (p->vcsm_handle != 0)
-+        vcsm_free(p->vcsm_handle);
-+    memset(p, 0, sizeof(*p));  // Ensure we crash hard if we try and use this again
-+}
-+
-+
-+static int gpu_malloc_internal(GPU_MEM_PTR_T * const p,
-+    const int numbytes, const unsigned int cache_type, const char * const name)
-+{
-+    memset(p, 0, sizeof(*p));
-+    p->numbytes = (numbytes + 255) & ~255;  // Round up
-+
-+    if ((p->vcsm_handle = vcsm_malloc_cache(p->numbytes, cache_type | 0x80, (char *)name)) == 0)
-+    {
-+        av_log(NULL, AV_LOG_ERROR, "Unable to alloc %d bytes from VCSM for %s\n", p->numbytes, name);
-+        goto fail;
-+    }
-+    if ((p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle)) == 0)
-+    {
-+        av_log(NULL, AV_LOG_ERROR, "Unable to VC handle from VCSM for %s\n", name);
-+        goto fail;
-+    }
-+    if ((p->arm = vcsm_lock(p->vcsm_handle)) == NULL)
-+    {
-+        av_log(NULL, AV_LOG_ERROR, "Unable to lock handle from VCSM for %s\n", name);
-+        goto fail;
-+    }
-+    if ((p->vc = vcsm_vc_addr_from_hdl(p->vcsm_handle)) == 0)
-+    {
-+        av_log(NULL, AV_LOG_ERROR, "Unable to get VC addr from VCSM for %s\n", name);
-+        goto fail;
-+    }
-+
-+    return 0;
-+
-+fail:
-+    gpu_free_internal(p);
-+    return AVERROR(ENOMEM);
-+}
-+
-+// Public gpu fns
-+
-+// Allocate memory on GPU
-+// Fills in structure <p> containing ARM pointer, videocore handle, videocore memory address, numbytes
-+// Returns 0 on success.
-+// This allocates memory that will not be cached in ARM's data cache.
-+// Therefore safe to use without data cache flushing.
-+int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p)
-+{
-+    return gpu_malloc_internal(p, numbytes, VCSM_CACHE_TYPE_NONE, "ffmpeg uncached");
-+}
-+
-+// This allocates data that will be
-+//    Cached in ARM L2
-+//    Uncached in VPU L2
-+int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p)
-+{
-+    return gpu_malloc_internal(p, numbytes, VCSM_CACHE_TYPE_HOST, "ffmpeg cached");
-+}
-+
-+void gpu_free(GPU_MEM_PTR_T * const p) {
-+    gpu_free_internal(p);
-+}
-+
-+void rpi_mem_gpu_uninit(void)
-+{
-+    vcsm_exit();
-+    bcm_host_deinit();
-+}
-+
-+int rpi_mem_gpu_init(const unsigned int flags)
-+{
-+    const int wants_cma = bcm_host_is_fkms_active();
-+    int use_cma;
-+
-+    (void)flags;
-+
-+    if (vcsm_init_ex(wants_cma ? 1 : 0, -1) == 0)
-+        use_cma = 1;
-+    else if (vcsm_init_ex(wants_cma ? 0 : 1, -1) == 0)
-+        use_cma = 0;
-+    else
-+        return AVERROR(EINVAL);
-+
-+    bcm_host_init();
-+
-+    return use_cma + 1;
-+}
-+
-+// ----------------------------------------------------------------------------
-+//
-+// Cache flush functions
-+
-+#define CACHE_EL_MAX ((sizeof(rpi_cache_buf_t) - sizeof (struct vcsm_user_clean_invalid2_s)) / sizeof (struct vcsm_user_clean_invalid2_block_s))
-+
-+rpi_cache_flush_env_t * rpi_cache_flush_init(rpi_cache_buf_t * const buf)
-+{
-+  rpi_cache_flush_env_t * const rfe = (rpi_cache_flush_env_t *)buf;
-+  *rfe = (rpi_cache_flush_env_t){.v={.op_count = 0}};
-+  return rfe;
-+}
-+
-+void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe)
-+{
-+  // Nothing needed
-+}
-+
-+int rpi_cache_flush_execute(rpi_cache_flush_env_t * const rfe)
-+{
-+    int rc = 0;
-+    if (rfe->v.op_count != 0) {
-+        if (vcsm_clean_invalid2(&rfe->v) != 0)
-+        {
-+          const int err = errno;
-+          av_log(NULL, AV_LOG_ERROR, "vcsm_clean_invalid2 failed: errno=%d\n", err);
-+          rc = AVERROR(err);
-+        }
-+        rfe->v.op_count = 0;
-+    }
-+    return rc;
-+}
-+
-+int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe)
-+{
-+  int rc = rpi_cache_flush_execute(rfe);;
-+
-+  return rc;
-+}
-+
-+inline void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode,
-+  const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride)
-+{
-+  struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++;
-+
-+  av_assert1(rfe->v.op_count <= CACHE_EL_MAX);
-+
-+  b->invalidate_mode = mode;
-+  b->block_count = blocks;
-+  b->start_address = gm->arm + offset0;
-+  b->block_size = block_size;
-+  b->inter_block_stride = block_stride;
-+}
-+
-+void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode,
-+  const unsigned int offset, const unsigned int size)
-+{
-+  // Deal with empty pointer trivially
-+  if (gm == NULL || size == 0)
-+    return;
-+
-+  av_assert1(offset <= gm->numbytes);
-+  av_assert1(size <= gm->numbytes);
-+  av_assert1(offset + size <= gm->numbytes);
-+
-+  rpi_cache_flush_add_gm_blocks(rfe, gm, mode, offset, size, 1, 0);
-+}
-+
-+void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode)
-+{
-+  rpi_cache_flush_add_gm_blocks(rfe, gm, mode, 0, gm->numbytes, 1, 0);
-+}
-+
-+
-+void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode)
-+{
-+#if !RPI_ONE_BUF
-+#error Fixme! (NIF)
-+#endif
-+  if (gpu_is_buf1(frame)) {
-+    rpi_cache_flush_add_gm_ptr(rfe, gpu_buf1_gmem(frame), mode);
-+  }
-+  else
-+  {
-+    rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 0), mode);
-+    rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 1), mode);
-+    rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 2), mode);
-+  }
-+}
-+
-+// Flush an area of a frame
-+// Width, height, x0, y0 in luma pels
-+void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode,
-+  const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height,
-+  const unsigned int uv_shift, const int do_luma, const int do_chroma)
-+{
-+  const unsigned int y_offset = frame->linesize[0] * y0;
-+  const unsigned int y_size = frame->linesize[0] * height;
-+  // Round UV up/down to get everything
-+  const unsigned int uv_rnd = (1U << uv_shift) >> 1;
-+  const unsigned int uv_offset = frame->linesize[1] * (y0 >> uv_shift);
-+  const unsigned int uv_size = frame->linesize[1] * ((y0 + height + uv_rnd) >> uv_shift) - uv_offset;
-+
-+#if 0
-+  // *** frame->height is cropped height so not good
-+  // As all unsigned they will also reject -ve
-+  // Test individually as well as added to reject overflow
-+  av_assert0(start_line <= (unsigned int)frame->height);  // ***** frame height cropped
-+  av_assert0(n <= (unsigned int)frame->height);
-+  av_assert0(start_line + n <= (unsigned int)frame->height);
-+#endif
-+
-+  if (!gpu_is_buf1(frame))
-+  {
-+    if (do_luma) {
-+      rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 0), mode, y_offset, y_size);
-+    }
-+    if (do_chroma) {
-+      rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 1), mode, uv_offset, uv_size);
-+      rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 2), mode, uv_offset, uv_size);
-+    }
-+  }
-+  else if (!av_rpi_is_sand_frame(frame))
-+  {
-+    const GPU_MEM_PTR_T * const gm = gpu_buf1_gmem(frame);
-+    if (do_luma) {
-+      rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[0] - gm->arm) + y_offset, y_size);
-+    }
-+    if (do_chroma) {
-+      rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[1] - gm->arm) + uv_offset, uv_size);
-+      rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[2] - gm->arm) + uv_offset, uv_size);
-+    }
-+  }
-+  else
-+  {
-+    const unsigned int stride1 = av_rpi_sand_frame_stride1(frame);
-+    const unsigned int stride2 = av_rpi_sand_frame_stride2(frame);
-+    const unsigned int xshl = av_rpi_sand_frame_xshl(frame);
-+    const unsigned int xleft = x0 & ~((stride1 >> xshl) - 1);
-+    const unsigned int block_count = (((x0 + width - xleft) << xshl) + stride1 - 1) / stride1;  // Same for Y & C
-+    av_assert1(rfe->v.op_count + do_chroma + do_luma < CACHE_EL_MAX);
-+
-+    if (do_chroma)
-+    {
-+      struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++;
-+      b->invalidate_mode = mode;
-+      b->block_count = block_count;
-+      b->start_address = av_rpi_sand_frame_pos_c(frame, xleft >> 1, y0 >> 1);
-+      b->block_size = uv_size;
-+      b->inter_block_stride = stride1 * stride2;
-+    }
-+    if (do_luma)
-+    {
-+      struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++;
-+      b->invalidate_mode = mode;
-+      b->block_count = block_count;
-+      b->start_address = av_rpi_sand_frame_pos_y(frame, xleft, y0);
-+      b->block_size = y_size;
-+      b->inter_block_stride = stride1 * stride2;
-+    }
-+  }
-+}
-+
-+// Call this to clean and invalidate a region of memory
-+void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T *const p, const rpi_cache_flush_mode_t mode)
-+{
-+  rpi_cache_buf_t cbuf;
-+  rpi_cache_flush_env_t * rfe = rpi_cache_flush_init(&cbuf);
-+  rpi_cache_flush_add_gm_ptr(rfe, p, mode);
-+  rpi_cache_flush_finish(rfe);
-+}
-+
---- /dev/null
-+++ b/libavcodec/rpi_mem.h
-@@ -0,0 +1,88 @@
-+/*
-+Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
-+All rights reserved.
-+
-+Redistribution and use in source and binary forms, with or without
-+modification, are permitted provided that the following conditions are met:
-+    * Redistributions of source code must retain the above copyright
-+      notice, this list of conditions and the following disclaimer.
-+    * Redistributions in binary form must reproduce the above copyright
-+      notice, this list of conditions and the following disclaimer in the
-+      documentation and/or other materials provided with the distribution.
-+    * Neither the name of the copyright holder nor the
-+      names of its contributors may be used to endorse or promote products
-+      derived from this software without specific prior written permission.
-+
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
-+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+
-+Authors: John Cox, Ben Avison
-+*/
-+
-+#ifndef RPI_MEM_H
-+#define RPI_MEM_H
-+
-+typedef struct gpu_mem_ptr_s {
-+  unsigned char *arm; // Pointer to memory mapped on ARM side
-+  int vc_handle;   // Videocore handle of relocatable memory
-+  int vcsm_handle; // Handle for use by VCSM
-+  int vc;       // Address for use in GPU code
-+  int numbytes; // Size of memory block
-+} GPU_MEM_PTR_T;
-+
-+// General GPU functions
-+
-+#define GPU_INIT_GPU 1
-+#define GPU_INIT_CMA 2
-+
-+extern int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p);
-+extern int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p);
-+extern void gpu_free(GPU_MEM_PTR_T * const p);
-+int rpi_mem_gpu_init(const unsigned int flags);
-+void rpi_mem_gpu_uninit(void);
-+
-+// Cache flush stuff
-+
-+struct rpi_cache_flush_env_s;
-+typedef struct rpi_cache_flush_env_s rpi_cache_flush_env_t;
-+
-+typedef struct {uint32_t t[33];} rpi_cache_buf_t;
-+
-+rpi_cache_flush_env_t * rpi_cache_flush_init(rpi_cache_buf_t * const buf);
-+// Free env without flushing
-+void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe);
-+// Do the accumulated flush & clear but do not free the env
-+int rpi_cache_flush_execute(rpi_cache_flush_env_t * const rfe);
-+// Do the accumulated flush & free the env
-+int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe);
-+
-+typedef enum
-+{
-+    RPI_CACHE_FLUSH_MODE_INVALIDATE     = 1,
-+    RPI_CACHE_FLUSH_MODE_WRITEBACK      = 2,
-+    RPI_CACHE_FLUSH_MODE_WB_INVALIDATE  = 3
-+} rpi_cache_flush_mode_t;
-+
-+struct AVFrame;
-+void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode);
-+void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode,
-+  const unsigned int offset, const unsigned int size);
-+void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode,
-+  const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride);
-+void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const struct AVFrame * const frame, const rpi_cache_flush_mode_t mode);
-+void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const struct AVFrame * const frame, const rpi_cache_flush_mode_t mode,
-+  const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height,
-+  const unsigned int uv_shift, const int do_luma, const int do_chroma);
-+
-+// init, add, finish for one gm ptr
-+void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T * const p, const rpi_cache_flush_mode_t mode);
-+
-+#endif
---- /dev/null
-+++ b/libavcodec/rpi_qpu.c
-@@ -0,0 +1,776 @@
-+/*
-+Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
-+All rights reserved.
-+
-+Redistribution and use in source and binary forms, with or without
-+modification, are permitted provided that the following conditions are met:
-+    * Redistributions of source code must retain the above copyright
-+      notice, this list of conditions and the following disclaimer.
-+    * Redistributions in binary form must reproduce the above copyright
-+      notice, this list of conditions and the following disclaimer in the
-+      documentation and/or other materials provided with the distribution.
-+    * Neither the name of the copyright holder nor the
-+      names of its contributors may be used to endorse or promote products
-+      derived from this software without specific prior written permission.
-+
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
-+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+
-+Authors: John Cox
-+*/
-+
-+
-+#include <stdio.h>
-+#include <stdlib.h>
-+#include <string.h>
-+#include <stddef.h>
-+#include <stdint.h>
-+#include "libavutil/avassert.h"
-+
-+#include "config.h"
-+
-+#include <pthread.h>
-+#include <time.h>
-+
-+#include <interface/vcsm/user-vcsm.h>
-+
-+#include "rpi_mailbox.h"
-+#include "rpi_mem.h"
-+#include "rpi_qpu.h"
-+#include "rpi_hevc_shader.h"
-+#include "rpi_hevc_transform8.h"
-+#include "rpi_hevc_transform10.h"
-+#include "libavutil/rpi_sand_fns.h"
-+
-+// Trace time spent waiting for GPU (VPU/QPU) (1=Yes, 0=No)
-+#define RPI_TRACE_TIME_VPU_QPU_WAIT     0
-+
-+// Add profile flags to all QPU requests - generates output in "vcdbg log msg"
-+// Beware this is expensive and will probably throw off all other timing by >10%
-+#define RPI_TRACE_QPU_PROFILE_ALL       0
-+
-+// QPU "noflush" flags
-+// a mixture of flushing & profiling
-+
-+#define QPU_FLAGS_NO_FLUSH_VPU          1       // If unset VPU cache will be flushed
-+#define QPU_FLAGS_PROF_CLEAR_AND_ENABLE 2       // Clear & Enable detailed QPU profiling registers
-+#define QPU_FLAGS_PROF_OUTPUT_COUNTS    4       // Print the results
-+#define QPU_FLAGS_OUTPUT_QPU_TIMES      8       // Print QPU times - independant of the profiling
-+#define QPU_FLAGS_NO_FLUSH_QPU          16      // If unset flush QPU caches & TMUs (uniforms always flushed)
-+
-+#define vcos_verify_ge0(x) ((x)>=0)
-+
-+// Size in 32bit words
-+#define QPU_CODE_SIZE 4098
-+#define VPU_CODE_SIZE 16384
-+
-+static const short rpi_transMatrix2even[32][16] = { // Even rows first
-+{64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64},
-+{90,  87,  80,  70,  57,  43,  25,   9,  -9, -25, -43, -57, -70, -80, -87, -90},
-+{89,  75,  50,  18, -18, -50, -75, -89, -89, -75, -50, -18,  18,  50,  75,  89},
-+{87,  57,   9, -43, -80, -90, -70, -25,  25,  70,  90,  80,  43,  -9, -57, -87},
-+{83,  36, -36, -83, -83, -36,  36,  83,  83,  36, -36, -83, -83, -36,  36,  83},
-+{80,   9, -70, -87, -25,  57,  90,  43, -43, -90, -57,  25,  87,  70,  -9, -80},
-+{75, -18, -89, -50,  50,  89,  18, -75, -75,  18,  89,  50, -50, -89, -18,  75},
-+{70, -43, -87,   9,  90,  25, -80, -57,  57,  80, -25, -90,  -9,  87,  43, -70},
-+{64, -64, -64,  64,  64, -64, -64,  64,  64, -64, -64,  64,  64, -64, -64,  64},
-+{57, -80, -25,  90,  -9, -87,  43,  70, -70, -43,  87,   9, -90,  25,  80, -57},
-+{50, -89,  18,  75, -75, -18,  89, -50, -50,  89, -18, -75,  75,  18, -89,  50},
-+{43, -90,  57,  25, -87,  70,   9, -80,  80,  -9, -70,  87, -25, -57,  90, -43},
-+{36, -83,  83, -36, -36,  83, -83,  36,  36, -83,  83, -36, -36,  83, -83,  36},
-+{25, -70,  90, -80,  43,   9, -57,  87, -87,  57,  -9, -43,  80, -90,  70, -25},
-+{18, -50,  75, -89,  89, -75,  50, -18, -18,  50, -75,  89, -89,  75, -50,  18},
-+{ 9, -25,  43, -57,  70, -80,  87, -90,  90, -87,  80, -70,  57, -43,  25,  -9},
-+// Odd rows
-+{90,  90,  88,  85,  82,  78,  73,  67,  61,  54,  46,  38,  31,  22,  13,   4},
-+{90,  82,  67,  46,  22,  -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13},
-+{88,  67,  31, -13, -54, -82, -90, -78, -46,  -4,  38,  73,  90,  85,  61,  22},
-+{85,  46, -13, -67, -90, -73, -22,  38,  82,  88,  54,  -4, -61, -90, -78, -31},
-+{82,  22, -54, -90, -61,  13,  78,  85,  31, -46, -90, -67,   4,  73,  88,  38},
-+{78,  -4, -82, -73,  13,  85,  67, -22, -88, -61,  31,  90,  54, -38, -90, -46},
-+{73, -31, -90, -22,  78,  67, -38, -90, -13,  82,  61, -46, -88,  -4,  85,  54},
-+{67, -54, -78,  38,  85, -22, -90,   4,  90,  13, -88, -31,  82,  46, -73, -61},
-+{61, -73, -46,  82,  31, -88, -13,  90,  -4, -90,  22,  85, -38, -78,  54,  67},
-+{54, -85,  -4,  88, -46, -61,  82,  13, -90,  38,  67, -78, -22,  90, -31, -73},
-+{46, -90,  38,  54, -90,  31,  61, -88,  22,  67, -85,  13,  73, -82,   4,  78},
-+{38, -88,  73,  -4, -67,  90, -46, -31,  85, -78,  13,  61, -90,  54,  22, -82},
-+{31, -78,  90, -61,   4,  54, -88,  82, -38, -22,  73, -90,  67, -13, -46,  85},
-+{22, -61,  85, -90,  73, -38,  -4,  46, -78,  90, -82,  54, -13, -31,  67, -88},
-+{13, -38,  61, -78,  88, -90,  85, -73,  54, -31,   4,  22, -46,  67, -82,  90},
-+{ 4, -13,  22, -31,  38, -46,  54, -61,  67, -73,  78, -82,  85, -88,  90, -90}
-+};
-+
-+// Code/constants on GPU
-+struct GPU
-+{
-+//  unsigned int qpu_code[QPU_CODE_SIZE];
-+    unsigned int vpu_code8[VPU_CODE_SIZE];
-+    unsigned int vpu_code10[VPU_CODE_SIZE];
-+    short transMatrix2even[16*16*2];
-+};
-+
-+#define WAIT_COUNT_MAX 16
-+
-+typedef struct trace_time_one_s
-+{
-+    int count;
-+    int64_t start[WAIT_COUNT_MAX];
-+    int64_t total[WAIT_COUNT_MAX];
-+} trace_time_one_t;
-+
-+typedef struct trace_time_wait_s
-+{
-+    unsigned int jcount;
-+    int64_t start0;
-+    int64_t last_update;
-+    trace_time_one_t active;
-+    trace_time_one_t wait;
-+} trace_time_wait_t;
-+
-+typedef struct vq_wait_s
-+{
-+    sem_t sem;
-+    struct vq_wait_s * next;
-+} vq_wait_t;
-+
-+#define VQ_WAIT_POOL_SIZE 16
-+typedef struct vq_wait_pool_s
-+{
-+    vq_wait_t * head;
-+    vq_wait_t pool[VQ_WAIT_POOL_SIZE];
-+} vq_wait_pool_t;
-+
-+static void vq_wait_pool_init(vq_wait_pool_t * const pool);
-+static void vq_wait_pool_deinit(vq_wait_pool_t * const pool);
-+
-+typedef struct gpu_env_s
-+{
-+    int open_count;
-+    int init_count;
-+    int vpu_i_cache_flushed;
-+    GPU_MEM_PTR_T qpu_code_gm_ptr;
-+    GPU_MEM_PTR_T code_gm_ptr;
-+    GPU_MEM_PTR_T dummy_gm_ptr;
-+    vq_wait_pool_t wait_pool;
-+#if RPI_TRACE_TIME_VPU_QPU_WAIT
-+    trace_time_wait_t ttw;
-+#endif
-+} gpu_env_t;
-+
-+// Stop more than one thread trying to allocate memory or use the processing resources at once
-+static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER;
-+static gpu_env_t * gpu = NULL;
-+
-+#if RPI_TRACE_TIME_VPU_QPU_WAIT
-+
-+static int64_t ns_time(void)
-+{
-+    struct timespec ts;
-+    clock_gettime(CLOCK_MONOTONIC, &ts);
-+    return (int64_t)ts.tv_sec * (int64_t)1000000000 + ts.tv_nsec;
-+}
-+
-+
-+#define WAIT_TIME_PRINT_PERIOD (int64_t)2000000000
-+
-+#define T_MS(t) ((unsigned int)((t)/(int64_t)1000000) % 1000U)
-+#define T_SEC(t) (unsigned int)((t)/(int64_t)1000000000)
-+#define T_ARG(t) T_SEC(t), T_MS(t)
-+#define T_FMT "%u.%03u"
-+
-+static void tto_print(trace_time_one_t * tto, const int64_t now, const int64_t start0, const char * const prefix)
-+{
-+    // Update totals for levels that are still pending
-+    for (int i = 0; i < tto->count; ++i) {
-+        tto->total[i] += now - tto->start[i];
-+        tto->start[i] = now;
-+    }
-+
-+    printf("%s: Idle:" T_FMT ", 1:" T_FMT ", 2:" T_FMT ", 3:" T_FMT ", 4:" T_FMT "\n",
-+         prefix,
-+         T_ARG(now - start0 - tto->total[0]),
-+         T_ARG(tto->total[0]),
-+         T_ARG(tto->total[1]),
-+         T_ARG(tto->total[2]),
-+         T_ARG(tto->total[3]));
-+}
-+
-+
-+static void tto_start(trace_time_one_t * const tto, const int64_t now)
-+{
-+    av_assert0(tto->count < WAIT_COUNT_MAX);
-+    tto->start[tto->count++] = now;
-+}
-+
-+static void tto_end(trace_time_one_t * const tto, const int64_t now)
-+{
-+    const int n = --tto->count;
-+    av_assert0(n >= 0);
-+    tto->total[n] += now - tto->start[n];
-+}
-+
-+static void ttw_print(trace_time_wait_t * const ttw, const int64_t now)
-+{
-+    printf("Jobs:%d, Total time=" T_FMT "\n", ttw->jcount, T_ARG(now - ttw->start0));
-+    tto_print(&ttw->active, now, ttw->start0, "Active");
-+    tto_print(&ttw->wait,   now, ttw->start0, "  Wait");
-+}
-+
-+#endif
-+
-+// GPU memory alloc fns (internal)
-+
-+static void gpu_free_internal(GPU_MEM_PTR_T * const p)
-+{
-+    if (p->arm != NULL)
-+        vcsm_unlock_ptr(p->arm);
-+    if (p->vcsm_handle != 0)
-+        vcsm_free(p->vcsm_handle);
-+    memset(p, 0, sizeof(*p));  // Ensure we crash hard if we try and use this again
-+}
-+
-+
-+static int gpu_malloc_internal(GPU_MEM_PTR_T * const p,
-+    const int numbytes, const unsigned int cache_type, const char * const name)
-+{
-+    memset(p, 0, sizeof(*p));
-+    p->numbytes = (numbytes + 255) & ~255;  // Round up
-+
-+    if ((p->vcsm_handle = vcsm_malloc_cache(p->numbytes, cache_type | 0x80, (char *)name)) == 0 ||
-+        (p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle)) == 0 ||
-+        (p->arm = vcsm_lock(p->vcsm_handle)) == NULL ||
-+        (p->vc = vcsm_vc_addr_from_hdl(p->vcsm_handle)) == 0)
-+    {
-+        gpu_free_internal(p);
-+        return AVERROR(ENOMEM);
-+    }
-+    return 0;
-+}
-+
-+
-+// GPU init, free, lock, unlock
-+
-+static void gpu_term(void)
-+{
-+    gpu_env_t * const ge = gpu;
-+
-+    // We have to hope that eveything has terminated...
-+    gpu = NULL;
-+
-+    vc_gpuserv_deinit();
-+
-+    gpu_free_internal(&ge->code_gm_ptr);
-+    gpu_free_internal(&ge->qpu_code_gm_ptr);
-+    gpu_free_internal(&ge->dummy_gm_ptr);
-+
-+    vcsm_exit();
-+
-+    vq_wait_pool_deinit(&ge->wait_pool);
-+
-+    free(ge);
-+}
-+
-+
-+// Connect to QPU, returns 0 on success.
-+static int gpu_init(gpu_env_t ** const gpu) {
-+    volatile struct GPU* ptr;
-+    gpu_env_t * const ge = calloc(1, sizeof(gpu_env_t));
-+    int rv;
-+    *gpu = NULL;
-+
-+    if (ge == NULL)
-+        return -1;
-+
-+    vq_wait_pool_init(&ge->wait_pool);
-+
-+    vcsm_init();
-+
-+    // Now copy over the QPU code into GPU memory
-+    if ((rv = gpu_malloc_internal(&ge->qpu_code_gm_ptr, QPU_CODE_SIZE * 4, VCSM_CACHE_TYPE_NONE, "ffmpeg qpu code")) != 0)
-+      return rv;
-+
-+    {
-+        int num_bytes = (char *)mc_end - (char *)ff_hevc_rpi_shader;
-+        av_assert0(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int));
-+        memcpy(ge->qpu_code_gm_ptr.arm, ff_hevc_rpi_shader, num_bytes);
-+        memset(ge->qpu_code_gm_ptr.arm + num_bytes, 0, QPU_CODE_SIZE*4 - num_bytes);
-+    }
-+
-+    // And the VPU code
-+    if ((rv = gpu_malloc_internal(&ge->code_gm_ptr, sizeof(struct GPU), VCSM_CACHE_TYPE_VC, "ffmpeg vpu code")) != 0)
-+        return rv;
-+    ptr = (volatile struct GPU*)ge->code_gm_ptr.arm;
-+
-+    // Zero everything so we have zeros between the code bits
-+    memset((void *)ptr, 0, sizeof(*ptr));
-+    {
-+        int num_bytes = sizeof(rpi_hevc_transform8);
-+        av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
-+        memcpy((void*)ptr->vpu_code8, rpi_hevc_transform8, num_bytes);
-+    }
-+    {
-+        int num_bytes = sizeof(rpi_hevc_transform10);
-+        av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
-+        memcpy((void*)ptr->vpu_code10, rpi_hevc_transform10, num_bytes);
-+    }
-+    // And the transform coefficients
-+    memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even));
-+
-+    // Generate a dummy "frame" & fill with 0x80
-+    // * Could reset to 1 <<bit_depth?
-+    if ((rv = gpu_malloc_internal(&ge->dummy_gm_ptr, 0x4000, VCSM_CACHE_TYPE_NONE, "ffmpeg dummy frame")) != 0)
-+        return rv;
-+    memset(ge->dummy_gm_ptr.arm, 0x80, 0x4000);
-+
-+    *gpu = ge;
-+    return 0;
-+}
-+
-+
-+
-+static void gpu_unlock(void) {
-+    pthread_mutex_unlock(&gpu_mutex);
-+}
-+
-+// Make sure we have exclusive access to the mailbox, and enable qpu if necessary.
-+static gpu_env_t * gpu_lock(void) {
-+    pthread_mutex_lock(&gpu_mutex);
-+
-+    av_assert1(gpu != NULL);
-+    return gpu;
-+}
-+
-+static gpu_env_t * gpu_lock_ref(void)
-+{
-+    pthread_mutex_lock(&gpu_mutex);
-+
-+    if (gpu == NULL) {
-+        int rv = gpu_init(&gpu);
-+        if (rv != 0) {
-+            gpu_unlock();
-+            return NULL;
-+        }
-+    }
-+
-+    ++gpu->open_count;
-+    return gpu;
-+}
-+
-+static void gpu_unlock_unref(gpu_env_t * const ge)
-+{
-+    if (--ge->open_count == 0)
-+        gpu_term();
-+
-+    gpu_unlock();
-+}
-+
-+static inline gpu_env_t * gpu_ptr(void)
-+{
-+    av_assert1(gpu != NULL);
-+    return gpu;
-+}
-+
-+unsigned int vpu_get_fn(const unsigned int bit_depth) {
-+  uint32_t a = 0;
-+
-+  // Make sure that the gpu is initialized
-+  av_assert1(gpu != NULL);
-+  switch (bit_depth){
-+    case 8:
-+      a = gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code8);
-+      break;
-+    case 10:
-+      a = gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code10);
-+      break;
-+    default:
-+      av_assert0(0);
-+  }
-+  return a;
-+}
-+
-+unsigned int vpu_get_constants(void) {
-+  av_assert1(gpu != NULL);
-+  return (gpu->code_gm_ptr.vc + offsetof(struct GPU,transMatrix2even));
-+}
-+
-+void gpu_ref(void)
-+{
-+  gpu_lock_ref();
-+  gpu_unlock();
-+}
-+
-+void gpu_unref(void)
-+{
-+  gpu_env_t * const ge = gpu_lock();
-+  gpu_unlock_unref(ge);
-+}
-+
-+// ----------------------------------------------------------------------------
-+
-+
-+// Wait abstractions - mostly so we can easily add profile code
-+static void vq_wait_pool_init(vq_wait_pool_t * const wp)
-+{
-+  unsigned int i;
-+  for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) {
-+    sem_init(&wp->pool[i].sem, 0, 0);
-+    wp->pool[i].next = wp->pool + i + 1;
-+  }
-+  wp->head = wp->pool + 0;
-+  wp->pool[VQ_WAIT_POOL_SIZE - 1].next = NULL;
-+}
-+
-+static void vq_wait_pool_deinit(vq_wait_pool_t * const wp)
-+{
-+  unsigned int i;
-+  wp->head = NULL;
-+  for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) {
-+    sem_destroy(&wp->pool[i].sem);
-+    wp->pool[i].next = NULL;
-+  }
-+}
-+
-+
-+// If sem_init actually takes time then maybe we want a pool...
-+static vq_wait_t * vq_wait_new(void)
-+{
-+  gpu_env_t * const ge = gpu_lock_ref();
-+  vq_wait_t * const wait = ge->wait_pool.head;
-+  ge->wait_pool.head = wait->next;
-+  wait->next = NULL;
-+
-+#if RPI_TRACE_TIME_VPU_QPU_WAIT
-+  tto_start(&ge->ttw.active, ns_time());
-+#endif
-+
-+  gpu_unlock();
-+  return wait;
-+}
-+
-+static void vq_wait_delete(vq_wait_t * const wait)
-+{
-+  gpu_env_t * const ge = gpu_lock();
-+  wait->next = ge->wait_pool.head;
-+  ge->wait_pool.head = wait;
-+
-+#if RPI_TRACE_TIME_VPU_QPU_WAIT
-+  {
-+    trace_time_wait_t * const ttw = &ge->ttw;
-+    const int64_t now = ns_time();
-+    ++ttw->jcount;
-+    tto_end(&ttw->wait, now);
-+
-+    if (ttw->start0 == 0)
-+    {
-+      ttw->start0 = ttw->active.start[0];
-+      ttw->last_update = ttw->start0;
-+    }
-+    if (now - ttw->last_update > WAIT_TIME_PRINT_PERIOD)
-+    {
-+      ttw->last_update += WAIT_TIME_PRINT_PERIOD;
-+      ttw_print(ttw, now);
-+    }
-+  }
-+#endif
-+  gpu_unlock_unref(ge);
-+}
-+
-+static void vq_wait_wait(vq_wait_t * const wait)
-+{
-+#if RPI_TRACE_TIME_VPU_QPU_WAIT
-+  {
-+      const int64_t now = ns_time();
-+      gpu_env_t * const ge = gpu_lock();
-+      tto_start(&ge->ttw.wait, now);
-+      gpu_unlock();
-+  }
-+#endif
-+
-+  while (sem_wait(&wait->sem) == -1 && errno == EINTR)
-+    /* loop */;
-+}
-+
-+static void vq_wait_post(vq_wait_t * const wait)
-+{
-+#if RPI_TRACE_TIME_VPU_QPU_WAIT
-+  {
-+    gpu_env_t *const ge = gpu_lock();
-+    tto_end(&ge->ttw.active, ns_time());
-+    gpu_unlock();
-+  }
-+#endif
-+
-+  sem_post(&wait->sem);
-+}
-+
-+
-+
-+// Header comments were wrong for these two
-+#define VPU_QPU_MASK_QPU  1
-+#define VPU_QPU_MASK_VPU  2
-+
-+typedef struct vpu_qpu_job_env_s vpu_qpu_job_env_t;
-+
-+vpu_qpu_job_env_t * vpu_qpu_job_init(vpu_qpu_job_env_t * const buf)
-+{
-+//  vpu_qpu_job_env_t * vqj = calloc(1, sizeof(vpu_qpu_job_env_t));
-+  vpu_qpu_job_env_t * vqj = buf;
-+//  memset(vqj, 0, sizeof(*vqj));
-+  vqj->n = 0;
-+  vqj->mask = 0;
-+  return vqj;
-+}
-+
-+void vpu_qpu_job_delete(vpu_qpu_job_env_t * const vqj)
-+{
-+//  memset(vqj, 0, sizeof(*vqj));
-+//  free(vqj);
-+}
-+
-+static inline struct gpu_job_s * new_job(vpu_qpu_job_env_t * const vqj)
-+{
-+  struct gpu_job_s * const j = vqj->j + vqj->n++;
-+  av_assert1(vqj->n <= VPU_QPU_JOB_MAX);
-+  return j;
-+}
-+
-+void vpu_qpu_job_add_vpu(vpu_qpu_job_env_t * const vqj, const uint32_t vpu_code,
-+  const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5)
-+{
-+  if (vpu_code != 0) {
-+    struct gpu_job_s *const j = new_job(vqj);
-+    vqj->mask |= VPU_QPU_MASK_VPU;
-+
-+    j->command = EXECUTE_VPU;
-+    j->callback.func = 0;
-+    j->callback.cookie = NULL;
-+    // The bottom two bits of the execute address contain no-flush flags
-+    // b0 will flush the VPU I-cache if unset so we nearly always want that set
-+    // as we never reload code
-+    j->u.v.q[0] = vpu_code | gpu->vpu_i_cache_flushed;
-+    j->u.v.q[1] = r0;
-+    j->u.v.q[2] = r1;
-+    j->u.v.q[3] = r2;
-+    j->u.v.q[4] = r3;
-+    j->u.v.q[5] = r4;
-+    j->u.v.q[6] = r5;
-+    gpu->vpu_i_cache_flushed = 1;
-+  }
-+}
-+
-+// flags are QPU_FLAGS_xxx
-+void vpu_qpu_job_add_qpu(vpu_qpu_job_env_t * const vqj, const unsigned int n, const uint32_t * const mail)
-+{
-+  if (n != 0) {
-+    struct gpu_job_s *const j = new_job(vqj);
-+    vqj->mask |= VPU_QPU_MASK_QPU;
-+
-+    j->command = EXECUTE_QPU;
-+    j->callback.func = 0;
-+    j->callback.cookie = NULL;
-+
-+    j->u.q.jobs = n;
-+#if RPI_TRACE_QPU_PROFILE_ALL
-+    j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU | QPU_FLAGS_PROF_CLEAR_AND_ENABLE | QPU_FLAGS_PROF_OUTPUT_COUNTS;
-+#else
-+    j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU;
-+#endif
-+    j->u.q.timeout = 5000;
-+    memcpy(j->u.q.control, mail, n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
-+  }
-+}
-+
-+// Convert callback to sem post
-+static void vpu_qpu_job_callback_wait(void * v)
-+{
-+  vq_wait_post(v);
-+}
-+
-+// Poke a user-supplied sem
-+static void vpu_qpu_job_callback_sem(void * v)
-+{
-+  sem_post((sem_t *)v);
-+}
-+
-+void vpu_qpu_job_add_sync_this(vpu_qpu_job_env_t * const vqj, vpu_qpu_wait_h * const wait_h)
-+{
-+  vq_wait_t * wait;
-+
-+  if (vqj->mask == 0) {
-+    *wait_h = NULL;
-+    return;
-+  }
-+
-+  // We are going to want a sync object
-+  wait = vq_wait_new();
-+
-+  // There are 2 VPU Qs & 1 QPU Q so we can collapse sync
-+  // If we only posted one thing or only QPU jobs
-+  if (vqj->n == 1 || vqj->mask == VPU_QPU_MASK_QPU)
-+  {
-+    struct gpu_job_s * const j = vqj->j + (vqj->n - 1);
-+    av_assert1(j->callback.func == 0);
-+
-+    j->callback.func = vpu_qpu_job_callback_wait;
-+    j->callback.cookie = wait;
-+  }
-+  else
-+  {
-+    struct gpu_job_s *const j = new_job(vqj);
-+
-+    j->command = EXECUTE_SYNC;
-+    j->u.s.mask = vqj->mask;
-+    j->callback.func = vpu_qpu_job_callback_wait;
-+    j->callback.cookie = wait;
-+  }
-+
-+  vqj->mask = 0;
-+  *wait_h = wait;
-+}
-+
-+// Returns 0 if no sync added ('cos Q empty), 1 if sync added
-+int vpu_qpu_job_add_sync_sem(vpu_qpu_job_env_t * const vqj, sem_t * const sem)
-+{
-+  // If nothing on q then just return
-+  if (vqj->mask == 0)
-+    return 0;
-+
-+  // There are 2 VPU Qs & 1 QPU Q so we can collapse sync
-+  // If we only posted one thing or only QPU jobs
-+  if (vqj->n == 1 || vqj->mask == VPU_QPU_MASK_QPU)
-+  {
-+    struct gpu_job_s * const j = vqj->j + (vqj->n - 1);
-+    av_assert1(j->callback.func == 0);
-+
-+    j->callback.func = vpu_qpu_job_callback_sem;
-+    j->callback.cookie = sem;
-+  }
-+  else
-+  {
-+    struct gpu_job_s *const j = new_job(vqj);
-+
-+    j->command = EXECUTE_SYNC;
-+    j->u.s.mask = vqj->mask;
-+    j->callback.func = vpu_qpu_job_callback_sem;
-+    j->callback.cookie = sem;
-+  }
-+
-+  vqj->mask = 0;
-+  return 1;
-+}
-+
-+
-+int vpu_qpu_job_start(vpu_qpu_job_env_t * const vqj)
-+{
-+  if (vqj->n == 0)
-+    return 0;
-+
-+  return vc_gpuserv_execute_code(vqj->n, vqj->j);
-+}
-+
-+// Simple wrapper of start + delete
-+int vpu_qpu_job_finish(vpu_qpu_job_env_t * const vqj)
-+{
-+  int rv;
-+  rv = vpu_qpu_job_start(vqj);
-+  vpu_qpu_job_delete(vqj);
-+  return rv;
-+}
-+
-+void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h)
-+{
-+  if (wait_h != NULL)
-+  {
-+    vq_wait_t * const wait = *wait_h;
-+    if (wait != NULL) {
-+      *wait_h = NULL;
-+      vq_wait_wait(wait);
-+      vq_wait_delete(wait);
-+    }
-+  }
-+}
-+
-+int vpu_qpu_init()
-+{
-+  gpu_env_t * const ge = gpu_lock_ref();
-+  if (ge == NULL)
-+    return -1;
-+
-+  if (ge->init_count++ == 0)
-+  {
-+    vc_gpuserv_init();
-+  }
-+
-+  gpu_unlock();
-+  return 0;
-+}
-+
-+void vpu_qpu_term()
-+{
-+  gpu_env_t * const ge = gpu_lock();
-+
-+  if (--ge->init_count == 0) {
-+    vc_gpuserv_deinit();
-+
-+#if RPI_TRACE_TIME_VPU_QPU_WAIT
-+    ttw_print(&ge->ttw, ns_time());
-+#endif
-+  }
-+
-+  gpu_unlock_unref(ge);
-+}
-+
-+uint32_t qpu_fn(const int * const mc_fn)
-+{
-+  return gpu->qpu_code_gm_ptr.vc + ((const char *)mc_fn - (const char *)ff_hevc_rpi_shader);
-+}
-+
-+uint32_t qpu_dummy(void)
-+{
-+  return gpu->dummy_gm_ptr.vc;
-+}
-+
-+int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth)
-+{
-+  // Dummy values we can catch with emulation
-+  qf->y_pxx = ~1U;
-+  qf->y_bxx = ~2U;
-+  qf->y_p00 = ~3U;
-+  qf->y_b00 = ~4U;
-+  qf->c_pxx = ~5U;
-+  qf->c_bxx = ~6U;
-+
-+  switch (bit_depth) {
-+    case 8:
-+      qf->y_pxx = qpu_fn(mc_filter_y_pxx);
-+      qf->y_pxx = qpu_fn(mc_filter_y_pxx);
-+      qf->y_bxx = qpu_fn(mc_filter_y_bxx);
-+      qf->y_p00 = qpu_fn(mc_filter_y_p00);
-+      qf->y_b00 = qpu_fn(mc_filter_y_b00);
-+      qf->c_pxx = qpu_fn(mc_filter_c_p);
-+      qf->c_pxx_l1 = qpu_fn(mc_filter_c_p_l1);
-+      qf->c_bxx = qpu_fn(mc_filter_c_b);
-+      break;
-+    case 10:
-+      qf->c_pxx = qpu_fn(mc_filter_c10_p);
-+      qf->c_pxx_l1 = qpu_fn(mc_filter_c10_p_l1);
-+      qf->c_bxx = qpu_fn(mc_filter_c10_b);
-+      qf->y_pxx = qpu_fn(mc_filter_y10_pxx);
-+      qf->y_bxx = qpu_fn(mc_filter_y10_bxx);
-+      qf->y_p00 = qpu_fn(mc_filter_y10_p00);
-+      qf->y_b00 = qpu_fn(mc_filter_y10_b00);
-+      break;
-+    default:
-+      return -1;
-+  }
-+  return 0;
-+}
-+
---- /dev/null
-+++ b/libavcodec/rpi_qpu.h
-@@ -0,0 +1,103 @@
-+/*
-+Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
-+All rights reserved.
-+
-+Redistribution and use in source and binary forms, with or without
-+modification, are permitted provided that the following conditions are met:
-+    * Redistributions of source code must retain the above copyright
-+      notice, this list of conditions and the following disclaimer.
-+    * Redistributions in binary form must reproduce the above copyright
-+      notice, this list of conditions and the following disclaimer in the
-+      documentation and/or other materials provided with the distribution.
-+    * Neither the name of the copyright holder nor the
-+      names of its contributors may be used to endorse or promote products
-+      derived from this software without specific prior written permission.
-+
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
-+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+
-+Authors: John Cox, Ben Avison
-+*/
-+
-+#ifndef RPI_QPU_H
-+#define RPI_QPU_H
-+
-+#include "rpi_mem.h"
-+#include "rpi_zc_frames.h"
-+
-+#pragma GCC diagnostic push
-+// Many many redundant decls in the header files
-+#pragma GCC diagnostic ignored "-Wredundant-decls"
-+#pragma GCC diagnostic ignored "-Wstrict-prototypes"
-+#include "interface/vmcs_host/vc_vchi_gpuserv.h"  // for gpu_job_s
-+#pragma GCC diagnostic pop
-+
-+// QPU specific functions
-+
-+typedef struct HEVCRpiQpu {
-+    uint32_t c_pxx;
-+    uint32_t c_pxx_l1;
-+    uint32_t c_bxx;
-+    uint32_t y_pxx;
-+    uint32_t y_bxx;
-+    uint32_t y_p00;
-+    uint32_t y_b00;
-+} HEVCRpiQpu;
-+
-+int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth);
-+
-+uint32_t qpu_fn(const int * const mc_fn);
-+uint32_t qpu_dummy(void);
-+
-+#define QPU_N_GRP    4
-+#define QPU_N_MAX    12
-+
-+#define QPU_MAIL_EL_VALS  2
-+
-+struct vpu_qpu_wait_s;
-+typedef struct vq_wait_s * vpu_qpu_wait_h;
-+
-+// VPU specific functions
-+
-+struct vpu_qpu_job_env_s;
-+typedef struct vpu_qpu_job_env_s * vpu_qpu_job_h;
-+
-+#define VPU_QPU_JOB_MAX 4
-+struct vpu_qpu_job_env_s
-+{
-+  unsigned int n;
-+  unsigned int mask;
-+  struct gpu_job_s j[VPU_QPU_JOB_MAX];
-+};
-+typedef struct vpu_qpu_job_env_s vpu_qpu_job_env_t;
-+
-+vpu_qpu_job_h vpu_qpu_job_init(vpu_qpu_job_env_t * const buf);
-+void vpu_qpu_job_delete(const vpu_qpu_job_h vqj);
-+void vpu_qpu_job_add_vpu(const vpu_qpu_job_h vqj, const uint32_t vpu_code,
-+  const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5);
-+void vpu_qpu_job_add_qpu(const vpu_qpu_job_h vqj, const unsigned int n, const uint32_t * const mail);
-+void vpu_qpu_job_add_sync_this(const vpu_qpu_job_h vqj, vpu_qpu_wait_h * const wait_h);
-+int vpu_qpu_job_add_sync_sem(vpu_qpu_job_env_t * const vqj, sem_t * const sem);
-+int vpu_qpu_job_start(const vpu_qpu_job_h vqj);
-+int vpu_qpu_job_finish(const vpu_qpu_job_h vqj);
-+
-+extern unsigned int vpu_get_fn(const unsigned int bit_depth);
-+extern unsigned int vpu_get_constants(void);
-+
-+// Waits for previous post_codee to complete and Will null out *wait_h after use
-+void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h);
-+int vpu_qpu_init(void);
-+void vpu_qpu_term(void);
-+
-+void gpu_ref(void);
-+void gpu_unref(void);
-+
-+#endif
---- /dev/null
-+++ b/libavcodec/rpi_zc.c
-@@ -0,0 +1,1227 @@
-+#include "config.h"
-+
-+#include "libavcodec/avcodec.h"
-+#include "rpi_mem.h"
-+#include "rpi_mailbox.h"
-+#include "rpi_zc.h"
-+#include "libavutil/avassert.h"
-+#include <pthread.h>
-+
-+#include "libavutil/buffer_internal.h"
-+
-+#pragma GCC diagnostic push
-+// Many many redundant decls in the header files
-+#pragma GCC diagnostic ignored "-Wredundant-decls"
-+#include <interface/vctypes/vc_image_types.h>
-+#include <interface/vcsm/user-vcsm.h>
-+#pragma GCC diagnostic pop
-+
-+#define TRACE_ALLOC 0
-+#define DEBUG_ALWAYS_KEEP_LOCKED 0
-+
-+struct ZcPoolEnt;
-+
-+typedef struct ZcPool
-+{
-+    size_t numbytes;
-+    struct ZcPoolEnt * head;
-+    pthread_mutex_t lock;
-+} ZcPool;
-+
-+typedef struct ZcPoolEnt
-+{
-+    size_t numbytes;
-+
-+    unsigned int vcsm_handle;
-+    unsigned int vc_handle;
-+    void * map_arm;
-+    unsigned int map_vc;
-+
-+    struct ZcPoolEnt * next;
-+    struct ZcPool * pool;
-+} ZcPoolEnt;
-+
-+typedef struct ZcOldCtxVals
-+{
-+    int thread_safe_callbacks;
-+    int (*get_buffer2)(struct AVCodecContext *s, AVFrame *frame, int flags);
-+    void * opaque;
-+} ZcOldCtxVals;
-+
-+typedef struct AVZcEnv
-+{
-+    unsigned int refcount;
-+    ZcOldCtxVals old;
-+
-+    void * pool_env;
-+    av_rpi_zc_alloc_buf_fn_t * alloc_buf;
-+    av_rpi_zc_free_pool_fn_t * free_pool;
-+
-+    unsigned int pool_size;
-+} ZcEnv;
-+
-+typedef struct ZcUserBufEnv {
-+    void * v;
-+    const av_rpi_zc_buf_fn_tab_t * fn;
-+    size_t numbytes;
-+    int offset;
-+} ZcUserBufEnv;
-+
-+#define ZC_BUF_INVALID  0
-+#define ZC_BUF_VALID    1
-+#define ZC_BUF_NEVER    2
-+
-+typedef struct ZcBufEnv {
-+    GPU_MEM_PTR_T gmem;
-+    AVZcEnvPtr zc;
-+    int is_valid;
-+    AVBufferRef * user;
-+    AVRpiZcFrameGeometry geo;
-+    size_t size_y;
-+    size_t size_c;
-+    size_t size_pic;
-+    ssize_t offset;
-+    pthread_mutex_t lock;
-+    pthread_cond_t cond;
-+} ZcBufEnv;
-+
-+
-+
-+
-+
-+
-+#define ALLOC_PAD       0
-+#define ALLOC_ROUND     0x1000
-+#define STRIDE_ROUND    64
-+#define STRIDE_OR       0
-+
-+#define DEBUG_ZAP0_BUFFERS 0
-+
-+static inline int av_rpi_is_sand_format(const int format)
-+{
-+    return (format >= AV_PIX_FMT_SAND128 && format <= AV_PIX_FMT_SAND64_16) ||
-+        (format == AV_PIX_FMT_RPI4_8 || format == AV_PIX_FMT_RPI4_10);
-+}
-+
-+static inline int av_rpi_is_sand_frame(const AVFrame * const frame)
-+{
-+    return av_rpi_is_sand_format(frame->format);
-+}
-+
-+//----------------------------------------------------------------------------
-+//
-+// Internal pool stuff
-+
-+// Pool entry functions
-+
-+static ZcPoolEnt * zc_pool_ent_alloc(ZcPool * const pool, const size_t req_size)
-+{
-+    ZcPoolEnt * const zp = av_mallocz(sizeof(ZcPoolEnt));
-+
-+    // Round up to 4k & add 4k
-+    const unsigned int alloc_size = (req_size + ALLOC_PAD + ALLOC_ROUND - 1) & ~(ALLOC_ROUND - 1);
-+
-+    if (zp == NULL) {
-+        av_log(NULL, AV_LOG_ERROR, "av_malloc(ZcPoolEnt) failed\n");
-+        goto fail0;
-+    }
-+
-+    // The 0x80 here maps all pages here rather than waiting for lazy mapping
-+    // BEWARE that in GPU land a later unlock/lock pair will put us back into
-+    // lazy mode - which will also break cache invalidate calls.
-+    if ((zp->vcsm_handle = vcsm_malloc_cache(alloc_size, VCSM_CACHE_TYPE_HOST | 0x80, "ffmpeg_rpi_zc")) == 0)
-+    {
-+        av_log(NULL, AV_LOG_ERROR, "av_gpu_malloc_cached(%d) failed\n", alloc_size);
-+        goto fail1;
-+    }
-+
-+#if TRACE_ALLOC
-+    printf("%s: Alloc %#x bytes @ h=%d\n", __func__, alloc_size, zp->vcsm_handle);
-+#endif
-+
-+    zp->numbytes = alloc_size;
-+    zp->pool = pool;
-+    return zp;
-+
-+fail1:
-+    av_free(zp);
-+fail0:
-+    return NULL;
-+}
-+
-+static void zc_pool_ent_free(ZcPoolEnt * const zp)
-+{
-+#if TRACE_ALLOC
-+    printf("%s: Free %#x bytes @ h=%d\n", __func__, zp->numbytes, zp->vcsm_handle);
-+#endif
-+
-+    if (zp->vcsm_handle != 0)
-+    {
-+        // VC addr & handle need no dealloc
-+        if (zp->map_arm != NULL)
-+            vcsm_unlock_hdl(zp->vcsm_handle);
-+        vcsm_free(zp->vcsm_handle);
-+    }
-+    av_free(zp);
-+}
-+
-+//----------------------------------------------------------------------------
-+//
-+// Pool functions
-+
-+static void zc_pool_free_ent_list(ZcPoolEnt * p)
-+{
-+    while (p != NULL)
-+    {
-+        ZcPoolEnt * const zp = p;
-+        p = p->next;
-+        zc_pool_ent_free(zp);
-+    }
-+}
-+
-+static void zc_pool_flush(ZcPool * const pool)
-+{
-+    ZcPoolEnt * p = pool->head;
-+    pool->head = NULL;
-+    pool->numbytes = ~0U;
-+    zc_pool_free_ent_list(p);
-+}
-+
-+static ZcPoolEnt * zc_pool_get_ent(ZcPool * const pool, const size_t req_bytes)
-+{
-+    ZcPoolEnt * zp = NULL;
-+    ZcPoolEnt * flush_list = NULL;
-+    size_t numbytes;
-+
-+    pthread_mutex_lock(&pool->lock);
-+
-+    numbytes = pool->numbytes;
-+
-+    // If size isn't close then dump the pool
-+    // Close in this context means within 128k
-+    if (req_bytes > numbytes || req_bytes + 0x20000 < numbytes)
-+    {
-+        flush_list = pool->head;
-+        pool->head = NULL;
-+        pool->numbytes = numbytes = req_bytes;
-+    }
-+    else if (pool->head != NULL)
-+    {
-+        zp = pool->head;
-+        pool->head = zp->next;
-+    }
-+
-+    pthread_mutex_unlock(&pool->lock);
-+
-+    zc_pool_free_ent_list(flush_list);
-+
-+    if (zp == NULL)
-+        zp = zc_pool_ent_alloc(pool, numbytes);
-+
-+    return zp;
-+}
-+
-+static void zc_pool_put_ent(ZcPoolEnt * const zp)
-+{
-+    ZcPool * const pool = zp == NULL ? NULL : zp->pool;
-+    if (zp != NULL)
-+    {
-+        pthread_mutex_lock(&pool->lock);
-+#if TRACE_ALLOC
-+        printf("%s: Recycle %#x, %#x\n", __func__, pool->numbytes, zp->numbytes);
-+#endif
-+
-+        if (pool->numbytes == zp->numbytes)
-+        {
-+            zp->next = pool->head;
-+            pool->head = zp;
-+            pthread_mutex_unlock(&pool->lock);
-+        }
-+        else
-+        {
-+            pthread_mutex_unlock(&pool->lock);
-+            zc_pool_ent_free(zp);
-+        }
-+    }
-+}
-+
-+static ZcPool *
-+zc_pool_new(void)
-+{
-+    ZcPool * const pool = av_mallocz(sizeof(*pool));
-+    if (pool == NULL)
-+        return NULL;
-+
-+    pool->numbytes = -1;
-+    pool->head = NULL;
-+    pthread_mutex_init(&pool->lock, NULL);
-+    return pool;
-+}
-+
-+static void
-+zc_pool_delete(ZcPool * const pool)
-+{
-+    if (pool != NULL)
-+    {
-+        pool->numbytes = -1;
-+        zc_pool_flush(pool);
-+        pthread_mutex_destroy(&pool->lock);
-+        av_free(pool);
-+    }
-+}
-+
-+//============================================================================
-+//
-+// ZC implementation using above pool implementation
-+//
-+// Fn table fns...
-+
-+static void zc_pool_free_v(void * v)
-+{
-+    zc_pool_put_ent(v);
-+}
-+
-+static unsigned int zc_pool_ent_vcsm_handle_v(void * v)
-+{
-+    ZcPoolEnt * zp = v;
-+    return zp->vcsm_handle;
-+}
-+
-+static unsigned int zc_pool_ent_vc_handle_v(void * v)
-+{
-+    ZcPoolEnt * zp = v;
-+    if (zp->vc_handle == 0)
-+    {
-+        if ((zp->vc_handle = vcsm_vc_hdl_from_hdl(zp->vcsm_handle)) == 0)
-+            av_log(NULL, AV_LOG_ERROR, "%s: Failed to map VCSM handle %d to VC handle\n",
-+                   __func__, zp->vcsm_handle);
-+    }
-+    return zp->vc_handle;
-+}
-+
-+static void * zc_pool_ent_map_arm_v(void * v)
-+{
-+    ZcPoolEnt * zp = v;
-+    if (zp->map_arm == NULL)
-+    {
-+        if ((zp->map_arm = vcsm_lock(zp->vcsm_handle)) == NULL)
-+            av_log(NULL, AV_LOG_ERROR, "%s: Failed to map VCSM handle %d to ARM address\n",
-+                   __func__, zp->vcsm_handle);
-+    }
-+    return zp->map_arm;
-+}
-+
-+static unsigned int zc_pool_ent_map_vc_v(void * v)
-+{
-+    ZcPoolEnt * zp = v;
-+    if (zp->map_vc == 0)
-+    {
-+        if ((zp->map_vc = vcsm_vc_addr_from_hdl(zp->vcsm_handle)) == 0)
-+            av_log(NULL, AV_LOG_ERROR, "%s: Failed to map VCSM handle %d to VC address\n",
-+                   __func__, zp->vcsm_handle);
-+    }
-+    return zp->map_vc;
-+}
-+
-+static const av_rpi_zc_buf_fn_tab_t zc_pool_buf_fns = {
-+    .free        = zc_pool_free_v,
-+    .vcsm_handle = zc_pool_ent_vcsm_handle_v,
-+    .vc_handle   = zc_pool_ent_vc_handle_v,
-+    .map_arm     = zc_pool_ent_map_arm_v,
-+    .map_vc      = zc_pool_ent_map_vc_v,
-+};
-+
-+// ZC Env fns
-+
-+// Delete pool
-+// All buffers guaranteed freed by now
-+static void
-+zc_pool_delete_v(void * v)
-+{
-+    zc_pool_delete((ZcPool *)v);
-+    rpi_mem_gpu_uninit();
-+}
-+
-+// Allocate a new ZC buffer
-+static AVBufferRef *
-+zc_pool_buf_alloc(void * v, size_t size, const AVRpiZcFrameGeometry * geo)
-+{
-+    ZcPool * const pool = v;
-+    ZcPoolEnt *const zp = zc_pool_get_ent(pool, size);
-+    AVBufferRef * buf;
-+
-+    (void)geo;  // geo ignored here
-+
-+    if (zp == NULL) {
-+        av_log(NULL, AV_LOG_ERROR, "zc_pool_alloc(%d) failed\n", size);
-+        goto fail0;
-+    }
-+
-+    if ((buf = av_rpi_zc_buf(size, 0, zp, &zc_pool_buf_fns)) == NULL)
-+    {
-+        av_log(NULL, AV_LOG_ERROR, "av_rpi_zc_buf() failed\n");
-+        goto fail2;
-+    }
-+
-+    return buf;
-+
-+fail2:
-+    zc_pool_put_ent(zp);
-+fail0:
-+    return NULL;
-+}
-+
-+// Init wrappers - the public fns
-+
-+AVZcEnvPtr
-+av_rpi_zc_int_env_alloc(void * logctx)
-+{
-+    ZcEnv * zc;
-+    ZcPool * pool_env;
-+
-+    if (rpi_mem_gpu_init(0) < 0)
-+        return NULL;
-+
-+    if ((pool_env = zc_pool_new()) == NULL)
-+        goto fail1;
-+
-+    if ((zc = av_rpi_zc_env_alloc(logctx, pool_env, zc_pool_buf_alloc, zc_pool_delete_v)) == NULL)
-+        goto fail2;
-+
-+    return zc;
-+
-+fail2:
-+    zc_pool_delete(pool_env);
-+fail1:
-+    rpi_mem_gpu_uninit();
-+    return NULL;
-+}
-+
-+void
-+av_rpi_zc_int_env_freep(AVZcEnvPtr * zcp)
-+{
-+    const AVZcEnvPtr zc = *zcp;
-+    *zcp = NULL;
-+    if (zc != NULL)
-+        av_rpi_zc_env_release(zc);
-+}
-+
-+//============================================================================
-+//
-+// Geometry
-+//
-+// This is a separate chunck to the rest
-+
-+// Get mailbox fd - should be in a lock when called
-+// Rely on process close to close it
-+static int mbox_fd(void)
-+{
-+    static int fd = -1;
-+    if (fd != -1)
-+        return fd;
-+    return (fd = mbox_open());
-+}
-+
-+AVRpiZcFrameGeometry av_rpi_zc_frame_geometry(
-+    const int format, const unsigned int video_width, const unsigned int video_height)
-+{
-+    static pthread_mutex_t sand_lock = PTHREAD_MUTEX_INITIALIZER;
-+
-+    AVRpiZcFrameGeometry geo = {
-+        .format       = format,
-+        .video_width  = video_width,
-+        .video_height = video_height
-+    };
-+
-+    switch (format)
-+    {
-+        case AV_PIX_FMT_YUV420P:
-+            geo.stride_y = ((video_width + 32 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR;
-+            geo.stride_c = geo.stride_y / 2;
-+            geo.height_y = (video_height + 32 + 31) & ~31;
-+            geo.height_c = geo.height_y / 2;
-+            geo.planes_c = 2;
-+            geo.stripes = 1;
-+            geo.bytes_per_pel = 1;
-+            geo.stripe_is_yc = 1;
-+            break;
-+
-+        case AV_PIX_FMT_YUV420P10:
-+            geo.stride_y = ((video_width * 2 + 64 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR;
-+            geo.stride_c = geo.stride_y / 2;
-+            geo.height_y = (video_height + 32 + 31) & ~31;
-+            geo.height_c = geo.height_y / 2;
-+            geo.planes_c = 2;
-+            geo.stripes = 1;
-+            geo.bytes_per_pel = 2;
-+            geo.stripe_is_yc = 1;
-+            break;
-+
-+        case AV_PIX_FMT_SAND128:
-+        case AV_PIX_FMT_RPI4_8:
-+        {
-+            const unsigned int stripe_w = 128;
-+
-+            static VC_IMAGE_T img = {0};
-+
-+            // Given the overhead of calling the mailbox keep a stashed
-+            // copy as we will almost certainly just want the same numbers again
-+            // but that means we need a lock
-+            pthread_mutex_lock(&sand_lock);
-+
-+            if (img.width != video_width || img.height != video_height)
-+            {
-+                VC_IMAGE_T new_img = {
-+                    .type = VC_IMAGE_YUV_UV,
-+                    .width = video_width,
-+                    .height = video_height
-+                };
-+
-+                mbox_get_image_params(mbox_fd(), &new_img);
-+                img = new_img;
-+            }
-+
-+            geo.stride_y = stripe_w;
-+            geo.stride_c = stripe_w;
-+            geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w;
-+            geo.height_c = img.pitch / stripe_w - geo.height_y;
-+            geo.stripe_is_yc = 1;
-+            if (geo.height_y * stripe_w > img.pitch)
-+            {
-+                // "tall" sand - all C blocks now follow Y
-+                geo.height_y = img.pitch / stripe_w;
-+                geo.height_c = geo.height_y;
-+                geo.stripe_is_yc = 0;
-+            }
-+            geo.planes_c = 1;
-+            geo.stripes = (video_width + stripe_w - 1) / stripe_w;
-+            geo.bytes_per_pel = 1;
-+
-+            pthread_mutex_unlock(&sand_lock);
-+#if 0
-+            printf("Req: %dx%d: stride=%d/%d, height=%d/%d, stripes=%d, img.pitch=%d\n",
-+                   video_width, video_height,
-+                   geo.stride_y, geo.stride_c,
-+                   geo.height_y, geo.height_c,
-+                   geo.stripes, img.pitch);
-+#endif
-+            av_assert0((int)geo.height_y > 0 && (int)geo.height_c > 0);
-+            av_assert0(geo.height_y >= video_height && geo.height_c >= video_height / 2);
-+            break;
-+        }
-+
-+        case AV_PIX_FMT_RPI4_10:
-+        {
-+            const unsigned int stripe_w = 128;  // bytes
-+
-+            static pthread_mutex_t sand_lock = PTHREAD_MUTEX_INITIALIZER;
-+            static VC_IMAGE_T img = {0};
-+
-+            // Given the overhead of calling the mailbox keep a stashed
-+            // copy as we will almost certainly just want the same numbers again
-+            // but that means we need a lock
-+            pthread_mutex_lock(&sand_lock);
-+
-+            if (img.width != video_width || img.height != video_height)
-+            {
-+                VC_IMAGE_T new_img = {
-+                    .type = VC_IMAGE_YUV10COL,
-+                    .width = video_width,
-+                    .height = video_height
-+                };
-+
-+                mbox_get_image_params(mbox_fd(), &new_img);
-+                img = new_img;
-+            }
-+
-+            geo.stride_y = stripe_w;
-+            geo.stride_c = stripe_w;
-+            geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w;
-+            geo.height_c = img.pitch / stripe_w - geo.height_y;
-+            geo.planes_c = 1;
-+            geo.stripes = ((video_width * 4 + 2) / 3 + stripe_w - 1) / stripe_w;
-+            geo.bytes_per_pel = 1;
-+            geo.stripe_is_yc = 1;
-+
-+            pthread_mutex_unlock(&sand_lock);
-+
-+#if 0
-+            printf("Req: %dx%d: stride=%d/%d, height=%d/%d, stripes=%d, img.pitch=%d\n",
-+                   video_width, video_height,
-+                   geo.stride_y, geo.stride_c,
-+                   geo.height_y, geo.height_c,
-+                   geo.stripes, img.pitch);
-+#endif
-+            av_assert0((int)geo.height_y > 0 && (int)geo.height_c > 0);
-+            av_assert0(geo.height_y >= video_height && geo.height_c >= video_height / 2);
-+            break;
-+        }
-+
-+        case AV_PIX_FMT_SAND64_16:
-+        case AV_PIX_FMT_SAND64_10:
-+        {
-+            const unsigned int stripe_w = 128;  // bytes
-+
-+            static pthread_mutex_t sand_lock = PTHREAD_MUTEX_INITIALIZER;
-+            static VC_IMAGE_T img = {0};
-+
-+            // Given the overhead of calling the mailbox keep a stashed
-+            // copy as we will almost certainly just want the same numbers again
-+            // but that means we need a lock
-+            pthread_mutex_lock(&sand_lock);
-+
-+             if (img.width != video_width || img.height != video_height)
-+            {
-+                VC_IMAGE_T new_img = {
-+                    .type = VC_IMAGE_YUV_UV_16,
-+                    .width = video_width,
-+                    .height = video_height
-+                };
-+
-+                mbox_get_image_params(mbox_fd(), &new_img);
-+                img = new_img;
-+            }
-+
-+            geo.stride_y = stripe_w;
-+            geo.stride_c = stripe_w;
-+            geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w;
-+            geo.height_c = img.pitch / stripe_w - geo.height_y;
-+            geo.planes_c = 1;
-+            geo.stripes = (video_width * 2 + stripe_w - 1) / stripe_w;
-+            geo.bytes_per_pel = 2;
-+            geo.stripe_is_yc = 1;
-+
-+            pthread_mutex_unlock(&sand_lock);
-+            break;
-+        }
-+
-+        default:
-+            break;
-+    }
-+    return geo;
-+}
-+
-+//============================================================================
-+//
-+// ZC Env fns
-+//
-+// Frame copy fns
-+
-+static AVBufferRef * zc_copy(const AVZcEnvPtr zc,
-+    const AVFrame * const src)
-+{
-+    AVFrame dest_frame;
-+    AVFrame * const dest = &dest_frame;
-+    unsigned int i;
-+    uint8_t * psrc, * pdest;
-+
-+    dest->format = src->format;
-+    dest->width = src->width;
-+    dest->height = src->height;
-+
-+    if (av_rpi_zc_get_buffer(zc, dest) != 0 ||
-+        av_rpi_zc_resolve_frame(dest, ZC_RESOLVE_ALLOC_VALID) != 0)
-+    {
-+        return NULL;
-+    }
-+
-+    for (i = 0, psrc = src->data[0], pdest = dest->data[0];
-+         i != dest->height;
-+         ++i, psrc += src->linesize[0], pdest += dest->linesize[0])
-+    {
-+        memcpy(pdest, psrc, dest->width);
-+    }
-+    for (i = 0, psrc = src->data[1], pdest = dest->data[1];
-+         i != dest->height / 2;
-+         ++i, psrc += src->linesize[1], pdest += dest->linesize[1])
-+    {
-+        memcpy(pdest, psrc, dest->width / 2);
-+    }
-+    for (i = 0, psrc = src->data[2], pdest = dest->data[2];
-+         i != dest->height / 2;
-+         ++i, psrc += src->linesize[2], pdest += dest->linesize[2])
-+    {
-+        memcpy(pdest, psrc, dest->width / 2);
-+    }
-+
-+    return dest->buf[0];
-+}
-+
-+
-+static AVBufferRef * zc_420p10_to_sand128(const AVZcEnvPtr zc,
-+    const AVFrame * const src)
-+{
-+    assert(0);
-+    return NULL;
-+}
-+
-+
-+static AVBufferRef * zc_sand64_16_to_sand128(const AVZcEnvPtr zc,
-+    const AVFrame * const src, const unsigned int src_bits)
-+{
-+    assert(0);
-+    return NULL;
-+}
-+
-+//----------------------------------------------------------------------------
-+//
-+// Public info extraction calls
-+
-+static void zc_buf_env_free_cb(void * opaque, uint8_t * data);
-+
-+static inline ZcBufEnv * pic_zbe_ptr(AVBufferRef *const buf)
-+{
-+    // Kludge where we check the free fn to check this is really
-+    // one of our buffers - can't think of a better way
-+    return buf == NULL || buf->buffer->free != zc_buf_env_free_cb ? NULL :
-+        av_buffer_get_opaque(buf);
-+}
-+
-+static inline GPU_MEM_PTR_T * pic_gm_ptr(AVBufferRef * const buf)
-+{
-+    // As gmem is the first el NULL should be preserved
-+    return &pic_zbe_ptr(buf)->gmem;
-+}
-+
-+unsigned int av_rpi_zc_vcsm_handle(const AVRpiZcRefPtr fr_ref)
-+{
-+    const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref);
-+    return p == NULL ? 0 : p->vcsm_handle;
-+}
-+
-+int av_rpi_zc_vc_handle(const AVRpiZcRefPtr fr_ref)
-+{
-+    const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref);
-+    return p == NULL ? -1 : p->vc_handle;
-+}
-+
-+int av_rpi_zc_offset(const AVRpiZcRefPtr fr_ref)
-+{
-+    const ZcBufEnv * const zbe = pic_zbe_ptr(fr_ref);
-+    return zbe == NULL ? 0 : zbe->offset;
-+}
-+
-+int av_rpi_zc_length(const AVRpiZcRefPtr fr_ref)
-+{
-+    const ZcBufEnv * const zbe = pic_zbe_ptr(fr_ref);
-+    return zbe == NULL ? 0 : zbe->size_pic;
-+}
-+
-+int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref)
-+{
-+    const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref);
-+    return p == NULL ? 0 : p->numbytes;
-+}
-+
-+const AVRpiZcFrameGeometry * av_rpi_zc_geometry(const AVRpiZcRefPtr fr_ref)
-+{
-+    const ZcBufEnv * const zbe = pic_zbe_ptr(fr_ref);
-+    return zbe == NULL ? NULL : &zbe->geo;
-+}
-+
-+AVRpiZcRefPtr av_rpi_zc_ref(void * const logctx, const AVZcEnvPtr zc,
-+    const AVFrame * const frame, const enum AVPixelFormat expected_format, const int maycopy)
-+{
-+    av_assert0(!maycopy || zc != NULL);
-+
-+    if (frame->format != AV_PIX_FMT_YUV420P &&
-+        frame->format != AV_PIX_FMT_YUV420P10 &&
-+        !av_rpi_is_sand_frame(frame))
-+    {
-+        av_log(logctx, AV_LOG_WARNING, "%s: *** Format not SAND/YUV420P: %d\n", __func__, frame->format);
-+        return NULL;
-+    }
-+
-+    if (frame->buf[1] != NULL || frame->format != expected_format)
-+    {
-+#if RPI_ZC_SAND_8_IN_10_BUF
-+        if (frame->format == AV_PIX_FMT_SAND64_10 && expected_format == AV_PIX_FMT_SAND128 && frame->buf[RPI_ZC_SAND_8_IN_10_BUF] != NULL)
-+        {
-+//            av_log(s, AV_LOG_INFO, "%s: --- found buf[4]\n", __func__);
-+            return av_buffer_ref(frame->buf[RPI_ZC_SAND_8_IN_10_BUF]);
-+        }
-+#endif
-+
-+        if (maycopy)
-+        {
-+            if (frame->buf[1] != NULL)
-+                av_log(logctx, AV_LOG_INFO, "%s: *** Not a single buf frame: copying\n", __func__);
-+            else
-+                av_log(logctx, AV_LOG_INFO, "%s: *** Unexpected frame format %d: copying to %d\n", __func__, frame->format, expected_format);
-+
-+            switch (frame->format)
-+            {
-+                case AV_PIX_FMT_YUV420P10:
-+                    return zc_420p10_to_sand128(zc, frame);
-+
-+                case AV_PIX_FMT_SAND64_10:
-+                    return zc_sand64_16_to_sand128(zc, frame, 10);
-+
-+                default:
-+                    return zc_copy(zc, frame);
-+            }
-+        }
-+        else
-+        {
-+            if (frame->buf[1] != NULL)
-+                av_log(logctx, AV_LOG_WARNING, "%s: *** Not a single buf frame: buf[1] != NULL\n", __func__);
-+            else
-+                av_log(logctx, AV_LOG_INFO, "%s: *** Unexpected frame format: %d != %d\n", __func__, frame->format, expected_format);
-+            return NULL;
-+        }
-+    }
-+
-+    if (pic_gm_ptr(frame->buf[0]) == NULL)
-+    {
-+        if (maycopy)
-+        {
-+            av_log(logctx, AV_LOG_INFO, "%s: *** Not one of our buffers: copying\n", __func__);
-+            return zc_copy(zc, frame);
-+        }
-+        else
-+        {
-+            av_log(logctx, AV_LOG_WARNING, "%s: *** Not one of our buffers: NULL\n", __func__);
-+            return NULL;
-+        }
-+    }
-+
-+    return av_buffer_ref(frame->buf[0]);
-+}
-+
-+void av_rpi_zc_unref(AVRpiZcRefPtr fr_ref)
-+{
-+    if (fr_ref != NULL)
-+    {
-+        av_buffer_unref(&fr_ref);
-+    }
-+}
-+
-+//----------------------------------------------------------------------------
-+
-+// Extract user environment from an AVBufferRef
-+void * av_rpi_zc_buf_v(AVBufferRef * const buf)
-+{
-+    ZcBufEnv * const zbe = pic_zbe_ptr(buf);
-+    if (zbe != NULL && zbe->user != NULL)
-+    {
-+        const ZcUserBufEnv * const zub = (const ZcUserBufEnv *)zbe->user->data;
-+        return zub == NULL ? NULL : zub->v;
-+    }
-+    return NULL;
-+}
-+
-+// AV buffer pre-free callback
-+static void zc_user_buf_free_cb(void * opaque, uint8_t * data)
-+{
-+    if (opaque != NULL)
-+    {
-+        ZcUserBufEnv * const zub = opaque;
-+
-+        if (zub->fn->free)
-+            zub->fn->free(zub->v);
-+
-+        av_free(zub);
-+    }
-+}
-+
-+static void zc_buf_env_free_cb(void * opaque, uint8_t * data)
-+{
-+    if (opaque != NULL)
-+    {
-+        ZcBufEnv * const zbe = opaque;
-+
-+        av_buffer_unref(&zbe->user);
-+
-+        if (zbe->zc != NULL)
-+            av_rpi_zc_env_release(zbe->zc);
-+
-+        pthread_cond_destroy(&zbe->cond);
-+        pthread_mutex_destroy(&zbe->lock);
-+        av_free(zbe);
-+    }
-+}
-+
-+
-+// Wrap the various ZC bits in an AV Buffer and resolve those things we want
-+// resolved now.
-+// Currently we resolve everything, but in future we might not
-+AVBufferRef * av_rpi_zc_buf(size_t numbytes, int addr_offset, void * v, const av_rpi_zc_buf_fn_tab_t * fn_tab)
-+{
-+    AVBufferRef *buf;
-+    ZcUserBufEnv * zub;
-+
-+    if ((zub = av_malloc(sizeof(ZcUserBufEnv))) == NULL)
-+        return NULL;
-+
-+    zub->fn = fn_tab;
-+    zub->v = v;
-+    zub->numbytes = numbytes;
-+    zub->offset = addr_offset;
-+
-+    if ((buf = av_buffer_create((uint8_t*)zub, sizeof(*zub), zc_user_buf_free_cb, zub, 0)) == NULL)
-+    {
-+        av_log(NULL, AV_LOG_ERROR, "ZC: Failed av_buffer_create\n");
-+        av_free(zub);
-+        return NULL;
-+    }
-+
-+    return buf;
-+}
-+
-+int av_rpi_zc_resolve_buffer(AVBufferRef * const buf, const int alloc_mode)
-+{
-+    ZcBufEnv * const zbe = pic_zbe_ptr(buf);
-+
-+    if (zbe == NULL)
-+        return AVERROR(EINVAL);
-+
-+    if (alloc_mode == ZC_RESOLVE_FAIL && !zbe->is_valid)
-+        return AVERROR(EAGAIN);
-+
-+    if (alloc_mode == ZC_RESOLVE_WAIT_VALID && !zbe->is_valid)
-+    {
-+        pthread_mutex_lock(&zbe->lock);
-+        while (!zbe->is_valid)
-+            pthread_cond_wait(&zbe->cond, &zbe->lock);
-+        pthread_mutex_unlock(&zbe->lock);
-+    }
-+
-+    if (zbe->is_valid == ZC_BUF_NEVER)
-+        return AVERROR(EINVAL);
-+
-+    // Do alloc if we need it
-+    if (zbe->user == NULL)
-+    {
-+        ZcEnv * const zc = zbe->zc;
-+        const ZcUserBufEnv * zub;
-+
-+        av_assert0(alloc_mode == ZC_RESOLVE_ALLOC || alloc_mode == ZC_RESOLVE_ALLOC_VALID);
-+
-+        if ((zbe->user = zc->alloc_buf(zc->pool_env, zbe->size_pic, &zbe->geo)) == NULL)
-+        {
-+            av_log(NULL, AV_LOG_ERROR, "rpi_get_display_buffer: Failed to get buffer from pool\n");
-+            goto fail;
-+        }
-+        zub = (const ZcUserBufEnv *)zbe->user->data;
-+
-+        // Track
-+
-+        zbe->offset = zub->offset;
-+        zbe->gmem.numbytes = zub->numbytes;
-+        if ((zbe->gmem.arm =  zub->fn->map_arm(zub->v)) == NULL)
-+        {
-+            av_log(NULL, AV_LOG_ERROR, "ZC: Failed to lock vcsm_handle %u\n", zbe->gmem.vcsm_handle);
-+            goto fail;
-+        }
-+
-+        if ((zbe->gmem.vcsm_handle = zub->fn->vcsm_handle(zub->v)) == 0)
-+        {
-+            av_log(NULL, AV_LOG_ERROR, "ZC: Failed to get vcsm_handle\n");
-+            goto fail;
-+        }
-+
-+        if ((zbe->gmem.vc_handle = zub->fn->vc_handle(zub->v)) == 0)
-+        {
-+            av_log(NULL, AV_LOG_ERROR, "ZC: Failed to get vc handle from vcsm_handle %u\n", zbe->gmem.vcsm_handle);
-+            goto fail;
-+        }
-+        if ((zbe->gmem.vc = zub->fn->map_vc(zub->v)) == 0)
-+        {
-+            av_log(NULL, AV_LOG_ERROR, "ZC: Failed to get vc addr from vcsm_handle %u\n", zbe->gmem.vcsm_handle);
-+            goto fail;
-+        }
-+
-+        buf->buffer->data = zbe->gmem.arm + zbe->offset;
-+        buf->buffer->size = zbe->size_pic;
-+
-+        // In this mode we shouldn't have anyone waiting for us
-+        // so no need to signal
-+        if (alloc_mode == ZC_RESOLVE_ALLOC_VALID)
-+            zbe->is_valid = 1;
-+    }
-+
-+    // Just overwrite - no point in testing
-+    buf->data = zbe->gmem.arm + zbe->offset;
-+    buf->size = zbe->size_pic;
-+    return 0;
-+
-+fail:
-+    av_buffer_unref(&zbe->user);
-+    return AVERROR(ENOMEM);
-+}
-+
-+int av_rpi_zc_resolve_frame(AVFrame * const frame, const int may_alloc)
-+{
-+    int rv;
-+
-+    // Do alloc if we need it
-+    if ((rv = av_rpi_zc_resolve_buffer(frame->buf[0], may_alloc)) != 0)
-+        return rv;
-+
-+    // If we are a framebuf copy then the alloc can be done but we haven't
-+    // imported its results yet
-+    if (frame->data[0] == NULL)
-+    {
-+        const ZcBufEnv * const zbe = pic_zbe_ptr(frame->buf[0]);
-+
-+        frame->linesize[0] = zbe->geo.stride_y;
-+        frame->linesize[1] = zbe->geo.stride_c;
-+        frame->linesize[2] = zbe->geo.stride_c;
-+        // abuse: linesize[3] = "stripe stride"
-+        // stripe_stride is NOT the stride between slices it is (that / geo.stride_y).
-+        // In a general case this makes the calculation an xor and multiply rather
-+        // than a divide and multiply
-+        if (zbe->geo.stripes > 1)
-+            frame->linesize[3] = zbe->geo.stripe_is_yc ? zbe->geo.height_y + zbe->geo.height_c : zbe->geo.height_y;
-+
-+        frame->data[0] = frame->buf[0]->data;
-+        frame->data[1] = frame->data[0] + (zbe->geo.stripe_is_yc ? zbe->size_y : zbe->size_y * zbe->geo.stripes);
-+        if (zbe->geo.planes_c > 1)
-+            frame->data[2] = frame->data[1] + zbe->size_c;
-+
-+        frame->extended_data = frame->data;
-+        // Leave extended buf alone
-+    }
-+
-+    return 0;
-+}
-+
-+int av_rpi_zc_set_valid_frame(AVFrame * const frame)
-+{
-+    ZcBufEnv * const zbe = pic_zbe_ptr(frame->buf[0]);
-+
-+    if (zbe == NULL)
-+        return AVERROR(EINVAL);
-+
-+    zbe->is_valid = ZC_BUF_VALID;
-+    pthread_cond_broadcast(&zbe->cond);
-+
-+    return 0;
-+}
-+
-+int av_rpi_zc_set_broken_frame(AVFrame * const frame)
-+{
-+    ZcBufEnv * const zbe = pic_zbe_ptr(frame->buf[0]);
-+
-+    if (zbe == NULL)
-+        return AVERROR(EINVAL);
-+
-+    zbe->is_valid = ZC_BUF_NEVER;
-+    pthread_cond_broadcast(&zbe->cond);
-+
-+    return 0;
-+}
-+
-+void av_rpi_zc_set_decoder_pool_size(ZcEnv *const zc, const unsigned int pool_size)
-+{
-+    zc->pool_size = pool_size;
-+}
-+
-+unsigned int av_rpi_zc_get_decoder_pool_size(ZcEnv *const zc)
-+{
-+    return zc->pool_size;
-+}
-+
-+int av_rpi_zc_get_buffer(ZcEnv *const zc, AVFrame * const frame)
-+{
-+#if 1
-+    ZcBufEnv * zbe = av_mallocz(sizeof(*zbe));
-+
-+    for (unsigned int i = 0; i < AV_NUM_DATA_POINTERS; i++) {
-+        frame->buf[i] = NULL;
-+        frame->data[i] = NULL;
-+        frame->linesize[i] = 0;
-+    }
-+
-+    if (zbe == NULL)
-+        return AVERROR(ENOMEM);
-+
-+    if ((frame->buf[0] = av_buffer_create((uint8_t *)zbe, sizeof(*zbe), zc_buf_env_free_cb, zbe, 0)) == NULL)
-+    {
-+        av_free(zbe);
-+        return AVERROR(ENOMEM);
-+    }
-+
-+    pthread_mutex_init(&zbe->lock, NULL);
-+    pthread_cond_init(&zbe->cond, NULL);
-+    zbe->zc = zc;
-+    atomic_fetch_add(&zc->refcount, 1);
-+
-+    zbe->geo = av_rpi_zc_frame_geometry(frame->format, frame->width, frame->height);  // Note geometry for later use
-+    zbe->size_y = zbe->geo.stride_y * zbe->geo.height_y;
-+    zbe->size_c = zbe->geo.stride_c * zbe->geo.height_c;
-+    zbe->size_pic = (zbe->size_y + zbe->size_c * zbe->geo.planes_c) * zbe->geo.stripes;
-+
-+#else
-+    const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(frame->format, frame->width, frame->height);
-+    const unsigned int size_y = geo.stride_y * geo.height_y;
-+    const unsigned int size_c = geo.stride_c * geo.height_c;
-+    const unsigned int size_pic = (size_y + size_c * geo.planes_c) * geo.stripes;
-+    AVBufferRef * buf;
-+    unsigned int i;
-+
-+//    printf("Do local alloc: format=%#x, %dx%d: %u\n", frame->format, frame->width, frame->height, size_pic);
-+
-+    if ((buf = zc->alloc_buf(zc->pool_env, size_pic, &geo)) == NULL)
-+    {
-+        av_log(NULL, AV_LOG_ERROR, "rpi_get_display_buffer: Failed to get buffer from pool\n");
-+        return AVERROR(ENOMEM);
-+    }
-+
-+    // Track
-+    atomic_fetch_add(&zc->refcount, 1);
-+    pic_zbe_ptr(buf)->zc = zc;
-+
-+    for (i = 0; i < AV_NUM_DATA_POINTERS; i++) {
-+        frame->buf[i] = NULL;
-+        frame->data[i] = NULL;
-+        frame->linesize[i] = 0;
-+    }
-+
-+    frame->buf[0] = buf;
-+
-+    frame->linesize[0] = geo.stride_y;
-+    frame->linesize[1] = geo.stride_c;
-+    frame->linesize[2] = geo.stride_c;
-+    // abuse: linesize[3] = "stripe stride"
-+    // stripe_stride is NOT the stride between slices it is (that / geo.stride_y).
-+    // In a general case this makes the calculation an xor and multiply rather
-+    // than a divide and multiply
-+    if (geo.stripes > 1)
-+        frame->linesize[3] = geo.stripe_is_yc ? geo.height_y + geo.height_c : geo.height_y;
-+
-+    frame->data[0] = buf->data;
-+    frame->data[1] = frame->data[0] + (geo.stripe_is_yc ? size_y : size_y * geo.stripes);
-+    if (geo.planes_c > 1)
-+        frame->data[2] = frame->data[1] + size_c;
-+
-+    frame->extended_data = frame->data;
-+    // Leave extended buf alone
-+
-+#if RPI_ZC_SAND_8_IN_10_BUF != 0
-+    // *** If we intend to use this for real we will want a 2nd buffer pool
-+    frame->buf[RPI_ZC_SAND_8_IN_10_BUF] = zc_pool_buf_alloc(&zc->pool, size_pic);  // *** 2 * wanted size - kludge
-+#endif
-+#endif
-+
-+    return 0;
-+}
-+
-+void av_rpi_zc_env_release(const AVZcEnvPtr zc)
-+{
-+    const int n = atomic_fetch_add(&zc->refcount, -1);
-+    if (n == 1)  // was 1, now 0
-+    {
-+        zc->free_pool(zc->pool_env);
-+        av_free(zc);
-+    }
-+}
-+
-+AVZcEnvPtr av_rpi_zc_env_alloc(void * logctx,
-+                    void * pool_env,
-+                    av_rpi_zc_alloc_buf_fn_t * alloc_buf_fn,
-+                    av_rpi_zc_free_pool_fn_t * free_pool_fn)
-+{
-+    ZcEnv * zc;
-+
-+    if ((zc = av_mallocz(sizeof(ZcEnv))) == NULL)
-+    {
-+        av_log(logctx, AV_LOG_ERROR, "av_rpi_zc_env_alloc: Context allocation failed\n");
-+        return NULL;
-+    }
-+
-+    *zc = (ZcEnv){
-+        .refcount = ATOMIC_VAR_INIT(1),
-+        .pool_env = pool_env,
-+        .alloc_buf = alloc_buf_fn,
-+        .free_pool = free_pool_fn,
-+        .pool_size = 0
-+    };
-+
-+    return zc;
-+}
-+
-+//============================================================================
-+//
-+// External ZC initialisation
-+
-+#define RPI_GET_BUFFER2 1
-+
-+
-+static int zc_get_buffer2(struct AVCodecContext *s, AVFrame *frame, int flags)
-+{
-+#if !RPI_GET_BUFFER2
-+    return avcodec_default_get_buffer2(s, frame, flags);
-+#else
-+    int rv;
-+
-+    if ((s->codec->capabilities & AV_CODEC_CAP_DR1) == 0)
-+    {
-+//        printf("Do default alloc: format=%#x\n", frame->format);
-+        rv = avcodec_default_get_buffer2(s, frame, flags);
-+    }
-+    else if (frame->format == AV_PIX_FMT_YUV420P ||
-+             av_rpi_is_sand_frame(frame))
-+    {
-+        if ((rv = av_rpi_zc_get_buffer(s->opaque, frame)) == 0)
-+            rv = av_rpi_zc_resolve_frame(frame, ZC_RESOLVE_ALLOC_VALID);
-+    }
-+    else
-+    {
-+        rv = avcodec_default_get_buffer2(s, frame, flags);
-+    }
-+
-+#if 0
-+    printf("%s: fmt:%d, %dx%d lsize=%d/%d/%d/%d data=%p/%p/%p bref=%p/%p/%p opaque[0]=%p\n", __func__,
-+        frame->format, frame->width, frame->height,
-+        frame->linesize[0], frame->linesize[1], frame->linesize[2], frame->linesize[3],
-+        frame->data[0], frame->data[1], frame->data[2],
-+        frame->buf[0], frame->buf[1], frame->buf[2],
-+        av_buffer_get_opaque(frame->buf[0]));
-+#endif
-+    return rv;
-+#endif
-+}
-+
-+int av_rpi_zc_in_use(const struct AVCodecContext * const s)
-+{
-+    return s->get_buffer2 == zc_get_buffer2;
-+}
-+
-+int av_rpi_zc_init2(struct AVCodecContext * const s,
-+                    void * pool_env,
-+                    av_rpi_zc_alloc_buf_fn_t * alloc_buf_fn,
-+                    av_rpi_zc_free_pool_fn_t * free_pool_fn)
-+{
-+    ZcEnv * zc;
-+
-+    av_assert0(!av_rpi_zc_in_use(s));
-+
-+    if ((zc = av_rpi_zc_env_alloc(s, pool_env, alloc_buf_fn, free_pool_fn)) == NULL)
-+        return AVERROR(ENOMEM);
-+
-+    zc->old = (ZcOldCtxVals){
-+        .opaque = s->opaque,
-+        .get_buffer2 = s->get_buffer2,
-+        .thread_safe_callbacks = s->thread_safe_callbacks
-+    };
-+
-+    s->opaque = zc;
-+    s->get_buffer2 = zc_get_buffer2;
-+    s->thread_safe_callbacks = 1;
-+    return 0;
-+}
-+
-+void av_rpi_zc_uninit2(struct AVCodecContext * const s)
-+{
-+    ZcEnv * const zc = s->opaque;
-+
-+    av_assert0(av_rpi_zc_in_use(s));
-+
-+    s->get_buffer2 = zc->old.get_buffer2;
-+    s->opaque = zc->old.opaque;
-+    s->thread_safe_callbacks = zc->old.thread_safe_callbacks;
-+
-+    av_rpi_zc_env_release(zc);
-+}
-+
---- /dev/null
-+++ b/libavcodec/rpi_zc.h
-@@ -0,0 +1,228 @@
-+/*
-+Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
-+All rights reserved.
-+
-+Redistribution and use in source and binary forms, with or without
-+modification, are permitted provided that the following conditions are met:
-+    * Redistributions of source code must retain the above copyright
-+      notice, this list of conditions and the following disclaimer.
-+    * Redistributions in binary form must reproduce the above copyright
-+      notice, this list of conditions and the following disclaimer in the
-+      documentation and/or other materials provided with the distribution.
-+    * Neither the name of the copyright holder nor the
-+      names of its contributors may be used to endorse or promote products
-+      derived from this software without specific prior written permission.
-+
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
-+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+
-+Authors: John Cox
-+*/
-+
-+#ifndef LIBAVCODEC_RPI_ZC_H
-+#define LIBAVCODEC_RPI_ZC_H
-+
-+// Zero-Copy frame code for RPi
-+// RPi needs Y/U/V planes to be contiguous for display.  By default
-+// ffmpeg will allocate separated planes so a memcpy is needed before
-+// display.  This code provides a method a making ffmpeg allocate a single
-+// bit of memory for the frame when can then be reference counted until
-+// display has finished with it.
-+
-+// Frame buffer number in which to stuff an 8-bit copy of a 16-bit frame
-+// 0 disables
-+// *** This option still in development
-+//     Only works if SAO active
-+//     Allocates buffers that are twice the required size
-+#define RPI_ZC_SAND_8_IN_10_BUF  0
-+
-+struct AVBufferRef;
-+struct AVFrame;
-+struct AVCodecContext;
-+enum AVPixelFormat;
-+
-+// "Opaque" pointer to whatever we are using as a buffer reference
-+typedef struct AVBufferRef * AVRpiZcRefPtr;
-+
-+struct AVZcEnv;
-+typedef struct AVZcEnv * AVZcEnvPtr;
-+
-+typedef struct AVRpiZcFrameGeometry
-+{
-+    unsigned int stride_y;  // Luma stride (bytes)
-+    unsigned int height_y;  // Luma height (lines)
-+    unsigned int stride_c;  // Chroma stride (bytes)
-+    unsigned int height_c;  // Chroma stride (lines)
-+    unsigned int planes_c;  // Chroma plane count (U, V = 2, interleaved = 1)
-+    unsigned int stripes;   // Number of stripes (sand)
-+    unsigned int bytes_per_pel;
-+    int stripe_is_yc;       // A single stripe is Y then C (false for tall sand)
-+
-+    int format;                 // Requested format
-+    unsigned int video_width;   // Requested width
-+    unsigned int video_height;  // Requested height
-+} AVRpiZcFrameGeometry;
-+
-+// Get expected MMAL geometry for a given format, width & height
-+AVRpiZcFrameGeometry av_rpi_zc_frame_geometry(
-+    const int format,
-+    const unsigned int video_width, const unsigned int video_height);
-+
-+//----------------------------------------------------------------------------
-+//
-+// Calls that extract info from a ZC frame whether internally or externally
-+// allocated
-+
-+// Generate a ZC reference to the buffer(s) in this frame
-+// If the buffer doesn't appear to be one allocated by ZC
-+// then the behaviour depends on maycopy:
-+//   If maycopy=0 then return NULL
-+//   If maycopy=1 && the src frame is in a form where we can easily copy
-+//     the data, then allocate a new buffer and copy the data into it
-+//   Otherwise return NULL
-+// If maycopy == 0 then ZC may be NULL
-+AVRpiZcRefPtr av_rpi_zc_ref(void * const logging_context, const AVZcEnvPtr zc,
-+    const struct AVFrame * const frame, const enum AVPixelFormat expected_format, const int maycopy);
-+
-+// Unreference the buffer refed/allocated by _zc_ref
-+// If fr_ref is NULL then this will NOP
-+void av_rpi_zc_unref(AVRpiZcRefPtr fr_ref);
-+
-+// Get the vc_handle from the frame ref
-+// Returns -1 if ref doesn't look valid
-+int av_rpi_zc_vc_handle(const AVRpiZcRefPtr fr_ref);
-+// Get the vcsm_handle from the frame ref
-+// Returns 0 if ref doesn't look valid
-+unsigned int av_rpi_zc_vcsm_handle(const AVRpiZcRefPtr fr_ref);
-+// Get offset from the start of the memory referenced
-+// by the vc_handle to valid data
-+int av_rpi_zc_offset(const AVRpiZcRefPtr fr_ref);
-+// Length of buffer data
-+int av_rpi_zc_length(const AVRpiZcRefPtr fr_ref);
-+// Get the number of bytes allocated from the frame ref
-+// Returns 0 if ref doesn't look valid
-+int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref);
-+// Geometry this frame was allocated with
-+const AVRpiZcFrameGeometry * av_rpi_zc_geometry(const AVRpiZcRefPtr fr_ref);
-+
-+//----------------------------------------------------------------------------
-+//
-+// Calls for external frame allocation
-+
-+// Callbacks registered in av_rpi_zc_init2
-+
-+// Callback to allocate a buf for a frame
-+// The frame itself is generated in the calling code
-+//
-+// Parameters:
-+//   pool_env  value passed to av-rpi_zc_init2
-+//   size      size wanted
-+//   geo       geometry of the frame to be allocated
-+// Returns:
-+//   NULL      Alloc failed
-+//   ptr       AVBufferBuf* of allocated buffer
-+//             In most cases av_rpi_zc_buf will be called by this function
-+//             and this will be the buf returned by that.
-+typedef AVBufferRef * av_rpi_zc_alloc_buf_fn_t(void * pool_env, size_t size,
-+                                               const AVRpiZcFrameGeometry * geo);
-+
-+// Callback once ffmpeg is completely done with this pool
-+// Called once all allocated buffers have been derefed and ffmpegs ref to this
-+// pool has been dropped
-+typedef void av_rpi_zc_free_pool_fn_t(void * pool_env);
-+
-+// Init ZC into a context
-+// Sets opaque, get_buffer2, thread_safe_callbacks
-+// Use if you want to allocate your own pools and/or create ZC buffers for
-+// all decoders
-+// RPI HEVC decoders will allocate appropriate VCSM buffers which can be taken
-+// apart by av_rpi_zc_xxx calls without this
-+int av_rpi_zc_init2(struct AVCodecContext * const s,
-+                    void * pool_env, av_rpi_zc_alloc_buf_fn_t * alloc_buf_fn,
-+                    av_rpi_zc_free_pool_fn_t * free_pool_fn);
-+
-+// Free ZC from a context
-+void av_rpi_zc_uninit2(struct AVCodecContext * const s);
-+
-+// Get minimum pool size in frames - valid by the time the first alloc request
-+// occurs.  Takes into account thread requests and DPB sizes derived from SPS
-+// rather than just adding a worst case DPB size.
-+unsigned int av_rpi_zc_get_decoder_pool_size(const AVZcEnvPtr zc);
-+
-+typedef struct av_rpi_zc_buf_fn_tab_s {
-+    // This AVBuffer is being freed by ffmpeg - return memory
-+    // to external pool. Memory may be, but need not be, unmapped.
-+    // v is the ptr passed in av_rpi_zc_buf
-+    void (* free)(void * v);
-+
-+    // Return appropriate handles / mappings
-+    // v is the ptr passed in av_rpi_zc_buf
-+    unsigned int (* vcsm_handle)(void * v);
-+    unsigned int (* vc_handle)(void * v);
-+    void * (* map_arm)(void * v);
-+    unsigned int (* map_vc)(void * v);
-+} av_rpi_zc_buf_fn_tab_t;
-+
-+// Allocate a ZC AVBufferRef and set its callback table
-+// Doesn't take a buffer address directly - relies on callbacks to return
-+// addresses as they are required.  Mappings need not be generated until
-+// the map callbacks are called but they should persist from then until
-+// the buffer is freed.
-+//
-+// Parameters:
-+//   numbytes    Size of the buffer
-+//   addr_offset Offset to first usable byte of buffer (for alignment)
-+//               normally 0
-+//   v           Pointer passed to callbacks
-+//   fn_tab      Function table
-+AVBufferRef * av_rpi_zc_buf(size_t numbytes, int addr_offset, void * v, const av_rpi_zc_buf_fn_tab_t * fn_tab);
-+
-+// Get v ptr set in in av_rpi_zc_buf
-+void * av_rpi_zc_buf_v(AVBufferRef * const buf);
-+
-+//----------------------------------------------------------------------------
-+//
-+// Mostly internal calls but might possibly be wanted by outside code
-+
-+void av_rpi_zc_int_env_freep(AVZcEnvPtr * zc);
-+AVZcEnvPtr av_rpi_zc_int_env_alloc(void * const logctx);
-+void av_rpi_zc_set_decoder_pool_size(const AVZcEnvPtr zc, const unsigned int pool_size);
-+
-+// Test to see if the context is using zc (checks get_buffer2)
-+int av_rpi_zc_in_use(const struct AVCodecContext * const s);
-+
-+// Get buffer generates placeholders for later alloc
-+int av_rpi_zc_get_buffer(const AVZcEnvPtr zc, AVFrame * const frame);
-+// Resolve actually does the alloc (noop if already alloced)
-+// Set data pointers on a buffer/frame that was copied before the alloc
-+// accured
-+#define ZC_RESOLVE_FAIL         0  // return error on invalid
-+#define ZC_RESOLVE_ALLOC        1  // alloc as invalid
-+#define ZC_RESOLVE_WAIT_VALID   2  // wait for valid
-+#define ZC_RESOLVE_ALLOC_VALID  3  // alloc as valid
-+int av_rpi_zc_resolve_buffer(AVBufferRef * const buf, const int may_alloc);
-+int av_rpi_zc_resolve_frame(AVFrame * const frame, const int may_alloc);
-+
-+int av_rpi_zc_set_valid_frame(AVFrame * const frame);
-+int av_rpi_zc_set_broken_frame(AVFrame * const frame);
-+
-+
-+
-+
-+AVZcEnvPtr av_rpi_zc_env_alloc(void * logctx,
-+                    void * pool_env,
-+                    av_rpi_zc_alloc_buf_fn_t * alloc_buf_fn,
-+                    av_rpi_zc_free_pool_fn_t * free_pool_fn);
-+void av_rpi_zc_env_release(const AVZcEnvPtr zc);
-+
-+
-+#endif
-+
---- /dev/null
-+++ b/libavcodec/rpi_zc_frames.h
-@@ -0,0 +1,142 @@
-+/*
-+Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
-+All rights reserved.
-+
-+Redistribution and use in source and binary forms, with or without
-+modification, are permitted provided that the following conditions are met:
-+    * Redistributions of source code must retain the above copyright
-+      notice, this list of conditions and the following disclaimer.
-+    * Redistributions in binary form must reproduce the above copyright
-+      notice, this list of conditions and the following disclaimer in the
-+      documentation and/or other materials provided with the distribution.
-+    * Neither the name of the copyright holder nor the
-+      names of its contributors may be used to endorse or promote products
-+      derived from this software without specific prior written permission.
-+
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
-+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+
-+Authors: John Cox, Ben Avison
-+*/
-+
-+#ifndef RPI_ZC_FRAMES_H
-+#define RPI_ZC_FRAMES_H
-+
-+#define RPI_ONE_BUF 1
-+
-+#include "rpi_mem.h"  // for GPU_MEM_PTR_T
-+#include "libavutil/frame.h"
-+
-+#if !RPI_ONE_BUF
-+static inline uint32_t get_vc_address_y(const AVFrame * const frame) {
-+    GPU_MEM_PTR_T *p = av_buffer_pool_buffer_get_opaque(frame->buf[0]);
-+    return p->vc;
-+}
-+
-+static inline uint32_t get_vc_address_u(const AVFrame * const frame) {
-+    GPU_MEM_PTR_T *p = av_buffer_pool_buffer_get_opaque(frame->buf[1]);
-+    return p->vc;
-+}
-+
-+static inline uint32_t get_vc_address_v(const AVFrame * const frame) {
-+    GPU_MEM_PTR_T *p = av_buffer_pool_buffer_get_opaque(frame->buf[2]);
-+    return p->vc;
-+}
-+
-+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) {
-+    return *(GPU_MEM_PTR_T *)av_buffer_pool_buffer_get_opaque(frame->buf[0]);
-+}
-+
-+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) {
-+    return *(GPU_MEM_PTR_T *)av_buffer_pool_buffer_get_opaque(frame->buf[1]);
-+}
-+
-+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) {
-+    return *(GPU_MEM_PTR_T *)av_buffer_pool_buffer_get_opaque(frame->buf[2]);
-+}
-+
-+#else
-+
-+static inline int gpu_is_buf1(const AVFrame * const frame)
-+{
-+    return frame->buf[1] == NULL;
-+}
-+
-+static inline GPU_MEM_PTR_T * gpu_buf1_gmem(const AVFrame * const frame)
-+{
-+    return av_buffer_get_opaque(frame->buf[0]);
-+}
-+
-+static inline GPU_MEM_PTR_T * gpu_buf3_gmem(const AVFrame * const frame, const unsigned int n)
-+{
-+    return av_buffer_pool_buffer_get_opaque(frame->buf[n]);
-+}
-+
-+static inline uint32_t get_vc_address3(const AVFrame * const frame, const unsigned int n)
-+{
-+    const GPU_MEM_PTR_T * const gm = gpu_is_buf1(frame) ? gpu_buf1_gmem(frame) : gpu_buf3_gmem(frame, n);
-+    return gm->vc + (frame->data[n] - gm->arm);
-+}
-+
-+
-+static inline uint32_t get_vc_address_y(const AVFrame * const frame) {
-+    return get_vc_address3(frame, 0);
-+}
-+
-+static inline uint32_t get_vc_address_u(const AVFrame * const frame) {
-+    return get_vc_address3(frame, 1);
-+}
-+
-+static inline uint32_t get_vc_address_v(const AVFrame * const frame) {
-+    return get_vc_address3(frame, 2);
-+}
-+
-+#if 0
-+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) {
-+    if (gpu_is_buf1(frame))
-+    {
-+        GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
-+        g.numbytes = frame->data[1] - frame->data[0];
-+        return g;
-+    }
-+    else
-+        return *gpu_buf3_gmem(frame, 0);
-+}
-+
-+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) {
-+    if (gpu_is_buf1(frame))
-+    {
-+        GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
-+        g.arm += frame->data[1] - frame->data[0];
-+        g.vc += frame->data[1] - frame->data[0];
-+        g.numbytes = frame->data[2] - frame->data[1];  // chroma size
-+        return g;
-+    }
-+    else
-+        return *gpu_buf3_gmem(frame, 1);
-+}
-+
-+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) {
-+    if (gpu_is_buf1(frame))
-+    {
-+        GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
-+        g.arm += frame->data[2] - frame->data[0];
-+        g.vc += frame->data[2] - frame->data[0];
-+        g.numbytes = frame->data[2] - frame->data[1];  // chroma size
-+        return g;
-+    }
-+    else
-+        return *gpu_buf3_gmem(frame, 2);
-+}
-+#endif
-+#endif
-+
-+#endif
---- /dev/null
-+++ b/libavcodec/rpivid_hevc.c
-@@ -0,0 +1,2128 @@
-+// FFMPEG HEVC decoder hardware accelerator
-+// Andrew Holme, Argon Design Ltd
-+// Copyright (c) June 2017 Raspberry Pi Ltd
-+
-+#include <stdio.h>
-+#include <fcntl.h>
-+#include <pthread.h>
-+#include <semaphore.h>
-+#include <unistd.h>
-+#include <sys/mman.h>
-+
-+#include "fftools/ffmpeg.h"
-+#include "libavutil/avassert.h"
-+#include "libavutil/imgutils.h"
-+#include "avcodec.h"
-+#include "hwconfig.h"
-+#include "decode.h"
-+
-+#include "hevc.h"
-+#include "hevcdec.h"
-+#include "rpi_zc.h"
-+#include "rpi_mem.h"
-+#include "rpi_zc_frames.h"
-+#include "rpi_mailbox.h"
-+
-+
-+#define OPT_PHASE_TIMING 0      // Generate stats for phase usage
-+
-+#define OPT_EMU 0
-+
-+#define TRACE_DEV 0
-+#define TRACE_ENTRY 0
-+
-+#define NUM_SCALING_FACTORS 4064
-+
-+#define AXI_BASE64 0
-+
-+#define PROB_BACKUP ((20<<12) + (20<<6) + (0<<0))
-+#define PROB_RELOAD ((20<<12) + (20<<0) + (0<<6))
-+
-+#define RPIVID_COL_PICS 17                 // 16 ref & current
-+
-+#define RPIVID_BITBUFS          2          // Bit + Cmd bufs (phase 0 & 1)
-+#define RPIVID_BITBUF_SIZE      (4 << 20)  // Bit + Cmd buf size
-+
-+#define RPIVID_COEFFBUFS        3          // PU + Coeff bufs (phase 1 & 2)
-+#define RPIVID_COEFFBUF_SIZE    (16 << 20) // PU + Coeff buf size
-+
-+//////////////////////////////////////////////////////////////////////////////
-+//
-+// Register offsets
-+
-+#define RPI_SPS0         0
-+#define RPI_SPS1         4
-+#define RPI_PPS          8
-+#define RPI_SLICE        12
-+#define RPI_TILESTART    16
-+#define RPI_TILEEND      20
-+#define RPI_SLICESTART   24
-+#define RPI_MODE         28
-+#define RPI_LEFT0        32
-+#define RPI_LEFT1        36
-+#define RPI_LEFT2        40
-+#define RPI_LEFT3        44
-+#define RPI_QP           48
-+#define RPI_CONTROL      52
-+#define RPI_STATUS       56
-+#define RPI_VERSION      60
-+#define RPI_BFBASE       64
-+#define RPI_BFNUM        68
-+#define RPI_BFCONTROL    72
-+#define RPI_BFSTATUS     76
-+#define RPI_PUWBASE      80
-+#define RPI_PUWSTRIDE    84
-+#define RPI_COEFFWBASE   88
-+#define RPI_COEFFWSTRIDE 92
-+#define RPI_SLICECMDS    96
-+#define RPI_BEGINTILEEND 100
-+#define RPI_TRANSFER     104
-+#define RPI_CFBASE       108
-+#define RPI_CFNUM        112
-+#define RPI_CFSTATUS     116
-+
-+#define RPI_PURBASE       0x8000
-+#define RPI_PURSTRIDE     0x8004
-+#define RPI_COEFFRBASE    0x8008
-+#define RPI_COEFFRSTRIDE  0x800C
-+#define RPI_NUMROWS       0x8010
-+#define RPI_CONFIG2       0x8014
-+#define RPI_OUTYBASE      0x8018
-+#define RPI_OUTYSTRIDE    0x801C
-+#define RPI_OUTCBASE      0x8020
-+#define RPI_OUTCSTRIDE    0x8024
-+#define RPI_STATUS2       0x8028
-+#define RPI_FRAMESIZE     0x802C
-+#define RPI_MVBASE        0x8030
-+#define RPI_MVSTRIDE      0x8034
-+#define RPI_COLBASE       0x8038
-+#define RPI_COLSTRIDE     0x803C
-+#define RPI_CURRPOC       0x8040
-+
-+//////////////////////////////////////////////////////////////////////////////
-+
-+// Unused but left here to illustrate the diffrences between FFmpegs prob
-+// structure and the rpivid one
-+
-+struct FFM_PROB {
-+    uint8_t  sao_merge_flag                   [ 1];
-+    uint8_t  sao_type_idx                     [ 1];
-+    uint8_t  split_coding_unit_flag           [ 3];
-+    uint8_t  cu_transquant_bypass_flag        [ 1];
-+    uint8_t  skip_flag                        [ 3];
-+    uint8_t  cu_qp_delta                      [ 3];
-+    uint8_t  pred_mode_flag                   [ 1];
-+    uint8_t  part_mode                        [ 4];
-+    uint8_t  prev_intra_luma_pred_flag        [ 1];
-+    uint8_t  intra_chroma_pred_mode           [ 2];
-+    uint8_t  merge_flag                       [ 1];
-+    uint8_t  merge_idx                        [ 1];
-+    uint8_t  inter_pred_idc                   [ 5];
-+    uint8_t  ref_idx_l0                       [ 2];
-+    uint8_t  ref_idx_l1                       [ 2];
-+    uint8_t  abs_mvd_greater0_flag            [ 2];
-+    uint8_t  abs_mvd_greater1_flag            [ 2];
-+    uint8_t  mvp_lx_flag                      [ 1];
-+    uint8_t  no_residual_data_flag            [ 1];
-+    uint8_t  split_transform_flag             [ 3];
-+    uint8_t  cbf_luma                         [ 2];
-+    uint8_t  cbf_cb_cr                        [ 4];
-+    uint8_t  transform_skip_flag/*[][]*/      [ 2];
-+    uint8_t  explicit_rdpcm_flag/*[][]*/      [ 2];
-+    uint8_t  explicit_rdpcm_dir_flag/*[][]*/  [ 2];
-+    uint8_t  last_significant_coeff_x_prefix  [18];
-+    uint8_t  last_significant_coeff_y_prefix  [18];
-+    uint8_t  significant_coeff_group_flag     [ 4];
-+    uint8_t  significant_coeff_flag           [44];
-+    uint8_t  coeff_abs_level_greater1_flag    [24];
-+    uint8_t  coeff_abs_level_greater2_flag    [ 6];
-+    uint8_t  log2_res_scale_abs               [ 8];
-+    uint8_t  res_scale_sign_flag              [ 2];
-+    uint8_t  cu_chroma_qp_offset_flag         [ 1];
-+    uint8_t  cu_chroma_qp_offset_idx          [ 1];
-+} __attribute__((packed));
-+
-+//////////////////////////////////////////////////////////////////////////////
-+
-+struct RPI_PROB {
-+    uint8_t  SAO_MERGE_FLAG             [ 1];
-+    uint8_t  SAO_TYPE_IDX               [ 1];
-+    uint8_t  SPLIT_FLAG                 [ 3];
-+    uint8_t  CU_SKIP_FLAG               [ 3];
-+    uint8_t  CU_TRANSQUANT_BYPASS_FLAG  [ 1];
-+    uint8_t  PRED_MODE                  [ 1];
-+    uint8_t  PART_SIZE                  [ 4];
-+    uint8_t  INTRA_PRED_MODE            [ 1];
-+    uint8_t  CHROMA_PRED_MODE           [ 1];
-+    uint8_t  MERGE_FLAG_EXT             [ 1];
-+    uint8_t  MERGE_IDX_EXT              [ 1];
-+    uint8_t  INTER_DIR                  [ 5];
-+    uint8_t  REF_PIC                    [ 2];
-+    uint8_t  MVP_IDX                    [ 1];
-+    uint8_t  MVD                        [ 2];
-+    uint8_t  QT_ROOT_CBF                [ 1];
-+    uint8_t  TRANS_SUBDIV_FLAG          [ 3];
-+    uint8_t  QT_CBF                     [ 6];
-+    uint8_t  DQP                        [ 2];
-+    uint8_t  ONE_FLAG                   [24];
-+    uint8_t  LASTX                      [18];
-+    uint8_t  LASTY                      [18];
-+    uint8_t  SIG_CG_FLAG                [ 4];
-+    uint8_t  ABS_FLAG                   [ 6];
-+    uint8_t  TRANSFORMSKIP_FLAG         [ 2];
-+    uint8_t  SIG_FLAG                   [42];
-+    uint8_t  SIG_FLAG_unused            [ 2];
-+} __attribute__((packed));
-+
-+//////////////////////////////////////////////////////////////////////////////
-+
-+struct RPI_CMD {
-+    uint32_t addr;
-+    uint32_t data;
-+} __attribute__((packed));
-+
-+struct RPI_BIT {
-+    int         cmd;
-+    const void *ptr;
-+    int         len;
-+};
-+
-+//////////////////////////////////////////////////////////////////////////////
-+
-+struct RPI_T;
-+
-+// Actual addressability is 38bits but we can only alloc in the bottom 32
-+// currently - when passed to rpivid h/w the address is always >> 6 so will
-+// fit in 32 bit there
-+// At some point we may weant to make this uint64_t
-+typedef uint32_t vid_vc_addr_t;
-+
-+typedef enum rpivid_decode_state_e {
-+    RPIVID_DECODE_NEW = 0,
-+    RPIVID_DECODE_START,
-+    RPIVID_DECODE_SLICE,
-+    RPIVID_DECODE_END,
-+} rpivid_decode_state_t;
-+
-+#define RPI_PROB_VALS 154U
-+#define RPI_PROB_ARRAY_SIZE ((154 + 3) & ~3)
-+
-+typedef struct dec_env_s {
-+    const AVCodecContext * avctx;
-+
-+    rpivid_decode_state_t state;
-+    unsigned int    decode_order;
-+
-+    int             phase_no;           // Current phase (i.e. the last one we waited for)
-+    struct dec_env_s * phase_wait_q_next;
-+    sem_t           phase_wait;
-+
-+    struct RPI_BIT *bit_fifo;
-+    struct RPI_CMD *cmd_fifo;
-+    unsigned int    bit_len, bit_max;
-+    unsigned int    cmd_len, cmd_max;
-+    unsigned int    num_slice_msgs;
-+    unsigned int    PicWidthInCtbsY;
-+    unsigned int    PicHeightInCtbsY;
-+    unsigned int    dpbno_col;
-+    uint32_t        reg_slicestart;
-+    unsigned int    wpp_entry_x;
-+    unsigned int    wpp_entry_y;
-+
-+    const uint8_t * nal_buffer;
-+    size_t          nal_size;
-+
-+    uint16_t        slice_msgs[2*HEVC_MAX_REFS*8+3];
-+    uint8_t         scaling_factors[NUM_SCALING_FACTORS];
-+//    unsigned int    RefPicList[2][HEVC_MAX_REFS];
-+} dec_env_t;
-+
-+#define RPIVID_PHASES 3
-+#define RPIVID_PHASE_NEW (RPIVID_PHASES) // Phase before we have inced decode order
-+#define RPIVID_PHASE_START (-1)          // Phase after we have inced decode_order
-+
-+#if OPT_PHASE_TIMING
-+static const unsigned int time_thresholds[8] = {
-+    10, 15, 20, 30, 45, 60, 75, 90
-+};
-+#endif
-+
-+typedef struct phase_wait_env_s {
-+    unsigned int    last_order;
-+    dec_env_t *     q;
-+#if OPT_PHASE_TIMING
-+    uint64_t phase_time;
-+    uint64_t max_phase_time;
-+    uint64_t time_in_phase;
-+    uint64_t time_out_phase;
-+    unsigned int max_time_decode_order;
-+    unsigned int time_bins[9];
-+    unsigned int time_bins3[9];
-+    unsigned int time_bins5[9];
-+    uint64_t time_stash[16];
-+    unsigned int i3;
-+#endif
-+} phase_wait_env_t;                      // Single linked list of threads waiting for this phase
-+
-+typedef struct RPI_T {
-+    atomic_int      ref_count;
-+    sem_t           ref_zero;
-+
-+    dec_env_t **    dec_envs;
-+    AVZcEnvPtr      zc;
-+
-+    pthread_mutex_t phase_lock;
-+    phase_wait_env_t phase_reqs[RPIVID_PHASES];
-+
-+    volatile uint32_t * regs;
-+    volatile uint32_t * ints;
-+
-+    GPU_MEM_PTR_T   gcolbuf;
-+    unsigned int    col_stride;
-+    size_t          col_picsize;
-+
-+    unsigned int    bitbuf_no;
-+    sem_t           bitbuf_sem;
-+    GPU_MEM_PTR_T   gbitbufs[RPIVID_BITBUFS];
-+
-+    unsigned int    max_pu_msgs;
-+    unsigned int    coeffbuf_no;
-+    sem_t           coeffbuf_sem;
-+    GPU_MEM_PTR_T   gcoeffbufs[RPIVID_COEFFBUFS];
-+
-+    unsigned int    decode_order;
-+    int             mbox_fd;
-+    int             gpu_init_type;
-+} RPI_T;
-+
-+#if OPT_PHASE_TIMING
-+static uint64_t tus64(void)
-+{
-+    struct timespec ts;
-+    clock_gettime(CLOCK_MONOTONIC, &ts);
-+    return (uint64_t)ts.tv_sec * 1000000 + ts.tv_nsec / 1000;
-+}
-+#endif
-+
-+static inline unsigned int rnd64(unsigned int x)
-+{
-+    return (x + 63) & ~63;
-+}
-+
-+static inline int rpi_sem_wait(sem_t * const sem)
-+{
-+    int rv;
-+    while ((rv = sem_wait(sem)) != 0 && errno == EINTR)
-+        /* Loop */;
-+    return rv;
-+}
-+
-+//============================================================================
-+
-+#define REGS_NAME "/dev/rpivid-hevcmem"
-+#define REGS_SIZE 0x10000
-+#define INTS_NAME "/dev/rpivid-intcmem"
-+#define INTS_SIZE 0x10000  // 4 is probably enough but we are going to alloc a page anyway
-+
-+static volatile uint32_t * map_dev(AVCodecContext * const avctx, const char * const dev_name, size_t size)
-+{
-+    void *gpio_map;
-+    int  mem_fd;
-+
-+    /* open /dev/mem */
-+    if ((mem_fd = open(dev_name, O_RDWR|O_SYNC) ) < 0) {
-+        av_log(avctx, AV_LOG_WARNING, "can't open %s\n", dev_name);
-+        return NULL;
-+    }
-+
-+    // Now map it
-+    gpio_map = mmap(
-+       NULL,
-+       size,
-+       PROT_READ|PROT_WRITE,
-+       MAP_SHARED,
-+       mem_fd,
-+       0
-+    );
-+
-+    close(mem_fd);  // No longer need the FD
-+
-+    if (gpio_map == MAP_FAILED) {
-+        av_log(avctx, AV_LOG_WARNING, "GPIO mapping failed");
-+        return NULL;
-+    }
-+
-+    return (volatile uint32_t *)gpio_map;
-+}
-+
-+static void unmap_devp(volatile uint32_t ** const p_gpio_map, size_t size)
-+{
-+    volatile uint32_t * const gpio_map = *p_gpio_map;
-+    if (gpio_map != NULL) {
-+        *p_gpio_map = NULL;
-+        munmap((void *)gpio_map, size);
-+    }
-+}
-+
-+#define MANGLE(x) ((x) &~0xc0000000)          // ** If x is ever a 64 bit thing this will need fixing!
-+#define MANGLE64(x) (uint32_t)(MANGLE(x)>>6)
-+
-+static inline void apb_write_vc_addr(const RPI_T *const rpi, const uint32_t addr, const vid_vc_addr_t data)
-+{
-+#if TRACE_DEV
-+    printf("W %x %08x\n", addr, MANGLE64(data));
-+#endif
-+
-+    rpi->regs[addr >> 2] = MANGLE64(data);
-+}
-+
-+static inline void apb_write_vc_len(const RPI_T *const rpi, const uint32_t addr, const unsigned int data)
-+{
-+#if TRACE_DEV
-+    printf("W %x %08x\n", addr, data >> 6);
-+#endif
-+
-+    rpi->regs[addr >> 2] = data >> 6;  // ?? rnd64 - but not currently needed
-+}
-+
-+static inline void apb_write(const RPI_T * const rpi, const uint32_t addr, const uint32_t data)
-+{
-+#if TRACE_DEV
-+    printf("W %x %08x\n", addr, data);
-+#endif
-+
-+    rpi->regs[addr >> 2] = data;
-+}
-+
-+static inline uint32_t apb_read(const RPI_T * const rpi, const uint32_t addr)
-+{
-+    const uint32_t v = rpi->regs[addr >> 2];
-+#if TRACE_DEV
-+    printf("R %x (=%x)\n", addr, v);
-+#endif
-+    return v;
-+}
-+
-+#define ARG_IC_ICTRL_ACTIVE1_INT_SET                   0x00000001
-+#define ARG_IC_ICTRL_ACTIVE1_EDGE_SET                  0x00000002
-+#define ARG_IC_ICTRL_ACTIVE1_EN_SET                    0x00000004
-+#define ARG_IC_ICTRL_ACTIVE1_STATUS_SET                0x00000008
-+#define ARG_IC_ICTRL_ACTIVE2_INT_SET                   0x00000010
-+#define ARG_IC_ICTRL_ACTIVE2_EDGE_SET                  0x00000020
-+#define ARG_IC_ICTRL_ACTIVE2_EN_SET                    0x00000040
-+#define ARG_IC_ICTRL_ACTIVE2_STATUS_SET                0x00000080
-+
-+static inline void int_wait(const RPI_T * const rpi, const unsigned int phase)
-+{
-+    const uint32_t mask_reset = phase == 1 ? ~ARG_IC_ICTRL_ACTIVE2_INT_SET : ~ARG_IC_ICTRL_ACTIVE1_INT_SET;
-+    const uint32_t mask_done = phase == 1 ? ARG_IC_ICTRL_ACTIVE1_INT_SET : ARG_IC_ICTRL_ACTIVE2_INT_SET;
-+    uint32_t ival;
-+    while (((ival = rpi->ints[0]) & mask_done) == 0) {
-+        usleep(1000);
-+    }
-+    rpi->ints[0] = ival & mask_reset;
-+}
-+
-+#if TRACE_DEV && 0
-+static void apb_dump_regs(const RPI_T * const rpi, uint16_t addr, int num) {
-+    int i;
-+
-+    for (i=0; i<num; i++)
-+    {
-+        if ((i%4)==0)
-+          printf("%08x: ", 0x7eb00000 + addr + 4*i);
-+
-+        printf("%08x", rpi->regs[(addr>>2)+i]);
-+
-+        if ((i%4)==3 || i+1 == num)
-+            printf("\n");
-+        else
-+            printf(" ");
-+    }
-+}
-+
-+static void axi_dump(const dec_env_t * const de, uint64_t addr, uint32_t size) {
-+    int i;
-+
-+    for (i=0; i<size>>2; i++)
-+    {
-+        if ((i%4)==0)
-+            printf("%08x: ", MANGLE(de->gbuf.vc) + (uint32_t)addr + 4*i);
-+
-+        printf("%08x", ((uint32_t*)de->gbuf.arm)[(addr>>2)+i]);
-+
-+        if ((i%4)==3 || i+1 == size>>2)
-+            printf("\n");
-+        else
-+            printf(" ");
-+    }
-+}
-+#endif
-+
-+//////////////////////////////////////////////////////////////////////////////
-+
-+static inline size_t round_up_size(const size_t x)
-+{
-+    /* Admit no size < 256 */
-+    const unsigned int n = x < 256 ? 8 : av_log2(x) - 1;
-+
-+    return x >= (3 << n) ? 4 << n : (3 << n);
-+}
-+
-+//////////////////////////////////////////////////////////////////////////////
-+// Scaling factors
-+
-+static void expand_scaling_list(
-+    const unsigned int sizeID,
-+    const unsigned int matrixID,
-+    uint8_t * const dst0,
-+    const uint8_t * const src0,
-+    uint8_t dc)
-+{
-+    switch (sizeID) {
-+        case 0:
-+            memcpy(dst0, src0, 16);
-+            break;
-+        case 1:
-+            memcpy(dst0, src0, 64);
-+            break;
-+        case 2:
-+        {
-+            uint8_t * d = dst0;
-+            for (unsigned int y=0; y != 16; y++) {
-+                const uint8_t * s = src0 + (y >> 1) * 8;
-+                for (unsigned int x = 0; x != 8; ++x) {
-+                    *d++ = *s;
-+                    *d++ = *s++;
-+                }
-+            }
-+            dst0[0] = dc;
-+            break;
-+        }
-+        default:
-+        {
-+            uint8_t * d = dst0;
-+            for (unsigned int y=0; y != 32; y++) {
-+                const uint8_t * s = src0 + (y >> 2) * 8;
-+                for (unsigned int x = 0; x != 8; ++x) {
-+                    *d++ = *s;
-+                    *d++ = *s;
-+                    *d++ = *s;
-+                    *d++ = *s++;
-+                }
-+            }
-+            dst0[0] = dc;
-+            break;
-+        }
-+    }
-+}
-+
-+static void populate_scaling_factors(dec_env_t * const de, const HEVCContext * const s) {
-+    // Array of constants for scaling factors
-+    static const uint32_t scaling_factor_offsets[4][6] = {
-+        // MID0    MID1    MID2    MID3    MID4    MID5
-+        {0x0000, 0x0010, 0x0020, 0x0030, 0x0040, 0x0050},   // SID0 (4x4)
-+        {0x0060, 0x00A0, 0x00E0, 0x0120, 0x0160, 0x01A0},   // SID1 (8x8)
-+        {0x01E0, 0x02E0, 0x03E0, 0x04E0, 0x05E0, 0x06E0},   // SID2 (16x16)
-+        {0x07E0,      0,      0, 0x0BE0,      0,      0}};  // SID3 (32x32)
-+
-+    // ffmpeg places SID3,MID1 where matrixID 3 normally is
-+    const ScalingList * const sl =
-+        s->ps.pps->scaling_list_data_present_flag ? &s->ps.pps->scaling_list
-+                                                  : &s->ps.sps->scaling_list;
-+    unsigned int mid;
-+
-+    for (mid=0; mid<6; mid++)
-+        expand_scaling_list(0, mid,
-+            de->scaling_factors + scaling_factor_offsets[0][mid],
-+            sl->sl[0][mid], 0);
-+    for (mid=0; mid<6; mid++)
-+        expand_scaling_list(1, mid,
-+            de->scaling_factors + scaling_factor_offsets[1][mid],
-+            sl->sl[1][mid], 0);
-+    for (mid=0; mid<6; mid++)
-+        expand_scaling_list(2, mid,
-+            de->scaling_factors + scaling_factor_offsets[2][mid],
-+            sl->sl[2][mid],
-+            sl->sl_dc[0][mid]);
-+    // second scaling matrix for 32x32 is at matrixID 3 not 1 in ffmpeg
-+    for (mid=0; mid<6; mid += 3)
-+        expand_scaling_list(3, mid,
-+            de->scaling_factors + scaling_factor_offsets[3][mid],
-+            sl->sl[3][mid],
-+            sl->sl_dc[1][mid]);
-+}
-+
-+//////////////////////////////////////////////////////////////////////////////
-+// Probabilities
-+
-+static const uint8_t prob_init[3][156] = {
-+	{
-+		 153, 200, 139, 141, 157, 154, 154, 154,
-+		 154, 154, 184, 154, 154, 154, 184,  63,
-+		 154, 154, 154, 154, 154, 154, 154, 154,
-+		 154, 154, 154, 154, 154, 153, 138, 138,
-+		 111, 141,  94, 138, 182, 154, 154, 154,
-+		 140,  92, 137, 138, 140, 152, 138, 139,
-+		 153,  74, 149,  92, 139, 107, 122, 152,
-+		 140, 179, 166, 182, 140, 227, 122, 197,
-+		 110, 110, 124, 125, 140, 153, 125, 127,
-+		 140, 109, 111, 143, 127, 111,  79, 108,
-+		 123,  63, 110, 110, 124, 125, 140, 153,
-+		 125, 127, 140, 109, 111, 143, 127, 111,
-+		  79, 108, 123,  63,  91, 171, 134, 141,
-+		 138, 153, 136, 167, 152, 152, 139, 139,
-+		 111, 111, 125, 110, 110,  94, 124, 108,
-+		 124, 107, 125, 141, 179, 153, 125, 107,
-+		 125, 141, 179, 153, 125, 107, 125, 141,
-+		 179, 153, 125, 140, 139, 182, 182, 152,
-+		 136, 152, 136, 153, 136, 139, 111, 136,
-+		 139, 111,   0,   0,	},
-+	{
-+		 153, 185, 107, 139, 126, 197, 185, 201,
-+		 154, 149, 154, 139, 154, 154, 154, 152,
-+		 110, 122,  95,  79,  63,  31,  31, 153,
-+		 153, 168, 140, 198,  79, 124, 138,  94,
-+		 153, 111, 149, 107, 167, 154, 154, 154,
-+		 154, 196, 196, 167, 154, 152, 167, 182,
-+		 182, 134, 149, 136, 153, 121, 136, 137,
-+		 169, 194, 166, 167, 154, 167, 137, 182,
-+		 125, 110,  94, 110,  95,  79, 125, 111,
-+		 110,  78, 110, 111, 111,  95,  94, 108,
-+		 123, 108, 125, 110,  94, 110,  95,  79,
-+		 125, 111, 110,  78, 110, 111, 111,  95,
-+		  94, 108, 123, 108, 121, 140,  61, 154,
-+		 107, 167,  91, 122, 107, 167, 139, 139,
-+		 155, 154, 139, 153, 139, 123, 123,  63,
-+		 153, 166, 183, 140, 136, 153, 154, 166,
-+		 183, 140, 136, 153, 154, 166, 183, 140,
-+		 136, 153, 154, 170, 153, 123, 123, 107,
-+		 121, 107, 121, 167, 151, 183, 140, 151,
-+		 183, 140,   0,   0,	},
-+	{
-+		 153, 160, 107, 139, 126, 197, 185, 201,
-+		 154, 134, 154, 139, 154, 154, 183, 152,
-+		 154, 137,  95,  79,  63,  31,  31, 153,
-+		 153, 168, 169, 198,  79, 224, 167, 122,
-+		 153, 111, 149,  92, 167, 154, 154, 154,
-+		 154, 196, 167, 167, 154, 152, 167, 182,
-+		 182, 134, 149, 136, 153, 121, 136, 122,
-+		 169, 208, 166, 167, 154, 152, 167, 182,
-+		 125, 110, 124, 110,  95,  94, 125, 111,
-+		 111,  79, 125, 126, 111, 111,  79, 108,
-+		 123,  93, 125, 110, 124, 110,  95,  94,
-+		 125, 111, 111,  79, 125, 126, 111, 111,
-+		  79, 108, 123,  93, 121, 140,  61, 154,
-+		 107, 167,  91, 107, 107, 167, 139, 139,
-+		 170, 154, 139, 153, 139, 123, 123,  63,
-+		 124, 166, 183, 140, 136, 153, 154, 166,
-+		 183, 140, 136, 153, 154, 166, 183, 140,
-+		 136, 153, 154, 170, 153, 138, 138, 122,
-+		 121, 122, 121, 167, 151, 183, 140, 151,
-+		 183, 140,   0,   0,	},
-+};
-+
-+
-+//////////////////////////////////////////////////////////////////////////////
-+// Phase 1 command and bit FIFOs
-+
-+// ???? uint16_t addr - put in uint32_t
-+static int p1_apb_write(dec_env_t * const de, const uint16_t addr, const uint32_t data) {
-+    if (de->cmd_len==de->cmd_max)
-+        av_assert0(de->cmd_fifo = realloc(de->cmd_fifo, (de->cmd_max*=2)*sizeof(struct RPI_CMD)));
-+
-+#if TRACE_DEV
-+    printf("[%02x] %x %x\n", de->cmd_len, addr, data);
-+#endif
-+
-+    de->cmd_fifo[de->cmd_len].addr = addr;
-+    de->cmd_fifo[de->cmd_len].data = data;
-+    return de->cmd_len++;
-+}
-+
-+static void p1_axi_write(dec_env_t * const de, const uint32_t len, const void * const ptr, const int cmd_idx) {
-+    if (de->bit_len==de->bit_max)
-+        av_assert0(de->bit_fifo = realloc(de->bit_fifo, (de->bit_max*=2)*sizeof(struct RPI_BIT)));
-+    de->bit_fifo[de->bit_len].cmd = cmd_idx;
-+    de->bit_fifo[de->bit_len].ptr = ptr;
-+    de->bit_fifo[de->bit_len].len = len;
-+    de->bit_len++;
-+}
-+
-+//////////////////////////////////////////////////////////////////////////////
-+// Write probability and scaling factor memories
-+
-+#if 0
-+static void WriteProb(dec_env_t * const de) {
-+    int i;
-+    const uint8_t *p = (uint8_t *) &de->probabilities;
-+    for (i=0; i<sizeof(struct RPI_PROB); i+=4, p+=4)
-+        p1_apb_write(de, 0x1000+i, p[0] + (p[1]<<8) + (p[2]<<16) + (p[3]<<24));
-+}
-+#endif
-+
-+static void WriteProb(dec_env_t * const de, const HEVCContext * const s) {
-+    uint8_t dst[RPI_PROB_ARRAY_SIZE];
-+
-+    const unsigned int init_type = (s->sh.cabac_init_flag && s->sh.slice_type != HEVC_SLICE_I) ?
-+        s->sh.slice_type + 1 : 2 - s->sh.slice_type;
-+    const uint8_t * p = prob_init[init_type];
-+    const int q = av_clip(s->sh.slice_qp, 0, 51);
-+    unsigned int i;
-+
-+    for (i = 0; i < RPI_PROB_VALS; i++) {
-+        int init_value = p[i];
-+        int m = (init_value >> 4) * 5 - 45;
-+        int n = ((init_value & 15) << 3) - 16;
-+        int pre = 2 * (((m * q) >> 4) + n) - 127;
-+
-+        pre ^= pre >> 31;
-+        if (pre > 124)
-+            pre = 124 + (pre & 1);
-+        dst[i] = pre;
-+    }
-+    for (i = RPI_PROB_VALS; i != RPI_PROB_ARRAY_SIZE; ++i) {
-+        dst[i] = 0;
-+    }
-+
-+    for (i=0; i < RPI_PROB_ARRAY_SIZE; i+=4)
-+        p1_apb_write(de, 0x1000+i, dst[i] + (dst[i+1]<<8) + (dst[i+2]<<16) + (dst[i+3]<<24));
-+
-+}
-+
-+
-+static void WriteScalingFactors(dec_env_t * const de) {
-+    int i;
-+    const uint8_t *p = (uint8_t *) de->scaling_factors;
-+    for (i=0; i<NUM_SCALING_FACTORS; i+=4, p+=4)
-+        p1_apb_write(de, 0x2000+i, p[0] + (p[1]<<8) + (p[2]<<16) + (p[3]<<24));
-+}
-+
-+//////////////////////////////////////////////////////////////////////////////
-+
-+static int ctb_to_tile (unsigned int ctb, unsigned int *bd, int num) {
-+    int i;
-+    for (i=1; ctb >= bd[i]; i++); // bd[] has num+1 elements; bd[0]=0; see hevc_ps.c
-+    return i-1;
-+}
-+
-+// Return the size to use for the CTB at index ctb along one axis: ctb_size
-+// if ctb lies before bd[num-1], otherwise the remainder of width modulo
-+// ctb_size (or ctb_size again when width is an exact multiple).
-+static int ctb_to_slice_w_h (unsigned int ctb, int ctb_size, int width, unsigned int *bd, int num) {
-+    int partial;
-+
-+    if (ctb < bd[num-1])
-+        return ctb_size;
-+
-+    partial = width % ctb_size;
-+    return partial != 0 ? partial : ctb_size;
-+}
-+
-+//////////////////////////////////////////////////////////////////////////////
-+// Handle PU and COEFF stream overflow
-+
-+
-+// Classify the result of a phase 1 run from the hardware registers.
-+// Returns:
-+// -2 Other error
-+// -1 Out of coeff space
-+//  0  OK
-+//  1  Out of PU space
-+
-+static int check_status(const RPI_T * const rpi, dec_env_t * const de) {
-+    // Successful completion of phase 1 is defined as CFSTATUS == CFNUM:
-+    // it assures that status register is zero and all blocks in each tile have completed
-+    if (apb_read(rpi, RPI_CFSTATUS) != apb_read(rpi, RPI_CFNUM)) {
-+        const uint32_t status = apb_read(rpi, RPI_STATUS);
-+
-+        if (status & 8)
-+            return -1;
-+        if (status & 0x10)
-+            return 1;
-+        return -2;
-+    }
-+
-+    return 0;
-+}
-+
-+//////////////////////////////////////////////////////////////////////////////
-+// Write STATUS register with expected end CTU address of previous slice
-+
-+// Queue a STATUS write carrying the x/y CTB position of the CTB that
-+// precedes ctb_addr_ts in tile-scan order.
-+static void end_previous_slice(dec_env_t * const de, const HEVCContext * const s, const int ctb_addr_ts) {
-+    const HEVCPPS * const pps = s->ps.pps;
-+    const int prev_x = pps->ctb_addr_ts_to_rs[ctb_addr_ts-1] % de->PicWidthInCtbsY;
-+    const int prev_y = pps->ctb_addr_ts_to_rs[ctb_addr_ts-1] / de->PicWidthInCtbsY;
-+
-+    p1_apb_write(de, RPI_STATUS, 1 + (prev_x<<5) + (prev_y<<18));
-+}
-+
-+// Queue the four register writes (STATUS, TRANSFER, MODE, CONTROL) used to
-+// pause at ctb_row in wavefront mode.  The MODE value differs for the last
-+// CTB row of the picture.
-+static void wpp_pause(dec_env_t * const de, int ctb_row) {
-+    const int is_last_row = (ctb_row==de->PicHeightInCtbsY-1);
-+
-+    p1_apb_write(de, RPI_STATUS, (ctb_row<<18) + 0x25);
-+    p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP);
-+    p1_apb_write(de, RPI_MODE, is_last_row ? 0x70000 : 0x30000);
-+    p1_apb_write(de, RPI_CONTROL, (ctb_row<<16) + 2);
-+}
-+
-+// Wavefront-mode variant of end_previous_slice(): optionally pauses on the
-+// row of the previous CTB, writes STATUS with that CTB's position, and
-+// optionally requests a probability backup, depending on where the new
-+// slice starts relative to the last WPP entry point.
-+static void wpp_end_previous_slice(dec_env_t * const de, const HEVCContext * const s, int ctb_addr_ts) {
-+    const HEVCPPS * const pps = s->ps.pps;
-+    const int start_x = s->sh.slice_ctb_addr_rs % de->PicWidthInCtbsY;
-+    const int start_y = s->sh.slice_ctb_addr_rs / de->PicWidthInCtbsY;
-+    const int prev_x = pps->ctb_addr_ts_to_rs[ctb_addr_ts-1] % de->PicWidthInCtbsY;
-+    const int prev_y = pps->ctb_addr_ts_to_rs[ctb_addr_ts-1] / de->PicWidthInCtbsY;
-+
-+    if (de->wpp_entry_x<2 && (de->wpp_entry_y<start_y || start_x>2) && de->PicWidthInCtbsY>2) {
-+        wpp_pause(de, prev_y);
-+    }
-+
-+    p1_apb_write(de, RPI_STATUS, 1 + (prev_x<<5) + (prev_y<<18));
-+
-+    // && binds tighter than ||; parentheses only make that grouping explicit
-+    if (start_x==2 || (de->PicWidthInCtbsY==2 && de->wpp_entry_y<start_y)) {
-+        p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP);
-+    }
-+}
-+
-+//////////////////////////////////////////////////////////////////////////////
-+
-+// Queue the per-slice-segment register writes: SPS0, SPS1 and PPS packed
-+// from the active parameter sets, the scaling factor table when scaling
-+// lists are enabled, and finally SLICESTART.
-+// de->reg_slicestart is only recomputed for independent slice segments; a
-+// dependent segment re-writes the value stored by the previous one.
-+static void new_slice_segment(dec_env_t * const de, const HEVCContext * const s)
-+{
-+    const HEVCSPS *sps = s->ps.sps;
-+    const HEVCPPS *pps = s->ps.pps;
-+
-+    // sps->bit_depth is written to two fields (bits 16 and 20)
-+    p1_apb_write(de, RPI_SPS0,
-+        (sps->log2_min_cb_size                    <<  0) +
-+        (sps->log2_ctb_size                       <<  4) +
-+        (sps->log2_min_tb_size                    <<  8) +
-+        (sps->log2_max_trafo_size                 << 12) +
-+        (sps->bit_depth                           << 16) +
-+        (sps->bit_depth                           << 20) +
-+        (sps->max_transform_hierarchy_depth_intra << 24) +
-+        (sps->max_transform_hierarchy_depth_inter << 28));
-+
-+    // NOTE: on the separate_colour_plane_flag line << binds tighter than ?:,
-+    // so the term evaluates as flag ? 0 : (chroma_format_idc << 16)
-+    p1_apb_write(de, RPI_SPS1,
-+        (sps->pcm.bit_depth                                        <<  0) +
-+        (sps->pcm.bit_depth_chroma                                 <<  4) +
-+        (sps->pcm.log2_min_pcm_cb_size                             <<  8) +
-+        (sps->pcm.log2_max_pcm_cb_size                             << 12) +
-+        (sps->separate_colour_plane_flag? 0:sps->chroma_format_idc << 16) +
-+        (sps->amp_enabled_flag                                     << 18) +
-+        (sps->pcm_enabled_flag                                     << 19) +
-+        (sps->scaling_list_enable_flag                             << 20) +
-+        (sps->sps_strong_intra_smoothing_enable_flag               << 21));
-+
-+    // The first term is (log2_ctb_size - diff_cu_qp_delta_depth) << 0 as
-+    // binary minus binds tighter than <<.  The cb/cr QP offsets are the sum
-+    // of the PPS and slice offsets, masked to 8 bits.
-+    p1_apb_write(de, RPI_PPS,
-+        (sps->log2_ctb_size - pps->diff_cu_qp_delta_depth   <<  0) +
-+        (pps->cu_qp_delta_enabled_flag                      <<  4) +
-+        (pps->transquant_bypass_enable_flag                 <<  5) +
-+        (pps->transform_skip_enabled_flag                   <<  6) +
-+        (pps->sign_data_hiding_flag                         <<  7) +
-+      (((pps->cb_qp_offset + s->sh.slice_cb_qp_offset)&255) <<  8) +
-+      (((pps->cr_qp_offset + s->sh.slice_cr_qp_offset)&255) << 16) +
-+        (pps->constrained_intra_pred_flag                   << 24));
-+
-+    if (s->ps.sps->scaling_list_enable_flag) WriteScalingFactors(de);
-+
-+    if (!s->sh.dependent_slice_segment_flag) {
-+        // Slice start position: CTB column in the low half, row in the high half
-+        int ctb_col = s->sh.slice_ctb_addr_rs % de->PicWidthInCtbsY;
-+        int ctb_row = s->sh.slice_ctb_addr_rs / de->PicWidthInCtbsY;
-+        de->reg_slicestart = (ctb_col<<0) + (ctb_row<<16);
-+    }
-+
-+    p1_apb_write(de, RPI_SLICESTART, de->reg_slicestart);
-+}
-+
-+//////////////////////////////////////////////////////////////////////////////
-+
-+// Compose the RPI_SLICE register from the slice header and the given slice
-+// width/height, then queue the write.  Merge candidate and reference counts
-+// are only included for P and B slices; mvd_l1_zero_flag only for B slices.
-+static void write_slice(dec_env_t * const de, const HEVCContext * const s,
-+                        const unsigned int slice_w, const unsigned int slice_h) {
-+    const int is_b_slice = (s->sh.slice_type==HEVC_SLICE_B);
-+    const int is_inter = is_b_slice || s->sh.slice_type==HEVC_SLICE_P;
-+    uint32_t reg =
-+          (s->sh.slice_type                           << 12)
-+        + (s->sh.slice_sample_adaptive_offset_flag[0] << 14)
-+        + (s->sh.slice_sample_adaptive_offset_flag[1] << 15)
-+        + (slice_w                                    << 17)
-+        + (slice_h                                    << 24);
-+
-+    if (is_inter) {
-+        reg |= (s->sh.max_num_merge_cand << 0)
-+             + (s->sh.nb_refs[L0]        << 4)
-+             + (s->sh.nb_refs[L1]        << 8);
-+    }
-+
-+    if (is_b_slice) {
-+        reg |= s->sh.mvd_l1_zero_flag<<16;
-+    }
-+
-+    p1_apb_write(de, RPI_SLICE, reg);
-+}
-+
-+//////////////////////////////////////////////////////////////////////////////
-+// Wavefront mode
-+
-+// Queue the register writes for a wavefront-mode entry point at
-+// ctb_addr_ts.  Records the entry CTB position in de->wpp_entry_x/y.
-+//   do_bte    when non-zero also write BEGINTILEEND
-+//   resetQPY  when non-zero also write the QP register
-+static void wpp_entry_point(dec_env_t * const de, const HEVCContext * const s,
-+                            const int do_bte, const int resetQPY, const int ctb_addr_ts) {
-+    const HEVCSPS * const sps = s->ps.sps;
-+    const HEVCPPS * const pps = s->ps.pps;
-+
-+    const int ctb_size = 1<<sps->log2_ctb_size;
-+    const int ctb_addr_rs = pps->ctb_addr_ts_to_rs[ctb_addr_ts];
-+    int ctb_col, ctb_row;
-+    int endx, endy, tile_end;
-+    int is_last_row;
-+    uint8_t slice_w, slice_h;
-+
-+    de->wpp_entry_x = ctb_addr_rs % de->PicWidthInCtbsY;
-+    ctb_col = de->wpp_entry_x;
-+    de->wpp_entry_y = ctb_addr_rs / de->PicWidthInCtbsY;
-+    ctb_row = de->wpp_entry_y;
-+
-+    // The region programmed is the whole of the current CTB row
-+    endx = de->PicWidthInCtbsY-1;
-+    endy = ctb_row;
-+    tile_end = endx + (endy<<16);
-+    is_last_row = (ctb_row==de->PicHeightInCtbsY-1);
-+
-+    slice_w = ctb_to_slice_w_h(ctb_col, ctb_size, sps->width,  pps->col_bd, pps->num_tile_columns);
-+    slice_h = ctb_to_slice_w_h(ctb_row, ctb_size, sps->height, pps->row_bd, pps->num_tile_rows);
-+
-+    p1_apb_write(de, RPI_TILESTART, 0);
-+    p1_apb_write(de, RPI_TILEEND, tile_end);
-+
-+    if (do_bte) {
-+        p1_apb_write(de, RPI_BEGINTILEEND, tile_end);
-+    }
-+
-+    // Only the last row uses the computed slice height
-+    write_slice(de, s, slice_w, is_last_row ? slice_h : ctb_size);
-+
-+    if (resetQPY) {
-+        p1_apb_write(de, RPI_QP, sps->qp_bd_offset + s->sh.slice_qp);
-+    }
-+
-+    p1_apb_write(de, RPI_MODE, is_last_row ? 0x60001 : 0x20001);
-+    p1_apb_write(de, RPI_CONTROL, (ctb_col<<0) + (ctb_row<<16));
-+}
-+
-+//////////////////////////////////////////////////////////////////////////////
-+// Tiles mode
-+
-+// Queue the register writes for a tiles-mode entry point at ctb_addr_ts:
-+// tile start/end, slice register, optional QP, MODE and CONTROL.
-+//   do_bte    when non-zero also write BEGINTILEEND
-+//   resetQPY  when non-zero also write the QP register
-+static void new_entry_point(dec_env_t * const de, const HEVCContext * const s,
-+                            const int do_bte, const int resetQPY, const int ctb_addr_ts) {
-+    const HEVCSPS * const sps = s->ps.sps;
-+    const HEVCPPS * const pps = s->ps.pps;
-+
-+    const int ctb_col = pps->ctb_addr_ts_to_rs[ctb_addr_ts] % de->PicWidthInCtbsY;
-+    const int ctb_row = pps->ctb_addr_ts_to_rs[ctb_addr_ts] / de->PicWidthInCtbsY;
-+
-+    // Tile containing the entry CTB and that tile's last CTB column/row
-+    const int tile_x = ctb_to_tile (ctb_col, pps->col_bd, pps->num_tile_columns);
-+    const int tile_y = ctb_to_tile (ctb_row, pps->row_bd, pps->num_tile_rows);
-+    const int endx = pps->col_bd[tile_x+1] - 1;
-+    const int endy = pps->row_bd[tile_y+1] - 1;
-+    const int tile_end = endx + (endy<<16);
-+
-+    const int in_last_tile_col = (tile_x==pps->num_tile_columns-1);
-+    const int in_last_tile_row = (tile_y==pps->num_tile_rows-1);
-+
-+    const uint8_t slice_w = ctb_to_slice_w_h(ctb_col, 1<<sps->log2_ctb_size, sps->width,  pps->col_bd, pps->num_tile_columns);
-+    const uint8_t slice_h = ctb_to_slice_w_h(ctb_row, 1<<sps->log2_ctb_size, sps->height, pps->row_bd, pps->num_tile_rows);
-+
-+    p1_apb_write(de, RPI_TILESTART, pps->col_bd[tile_x] + (pps->row_bd[tile_y]<<16));
-+    p1_apb_write(de, RPI_TILEEND, tile_end);
-+
-+    if (do_bte) {
-+        p1_apb_write(de, RPI_BEGINTILEEND, tile_end);
-+    }
-+
-+    write_slice(de, s, slice_w, slice_h);
-+
-+    if (resetQPY) {
-+        p1_apb_write(de, RPI_QP, sps->qp_bd_offset + s->sh.slice_qp);
-+    }
-+
-+    // Bits 17/18 flag the last tile column/row
-+    p1_apb_write(de, RPI_MODE, 0xFFFF + (in_last_tile_col << 17) + (in_last_tile_row << 18));
-+
-+    p1_apb_write(de, RPI_CONTROL, (ctb_col<<0) + (ctb_row<<16));
-+}
-+
-+//////////////////////////////////////////////////////////////////////////////
-+
-+// Free a decode environment together with its command and bit FIFOs and
-+// its phase semaphore.
-+// Doesn't attempt to remove from context as we should only do this at the end
-+// of time or on create error
-+static void
-+dec_env_delete(dec_env_t * const de)
-+{
-+//    gpu_free(&de->gbuf);
-+
-+    // The FIFOs are obtained with plain malloc() in dec_env_new(), so they
-+    // must be returned with free(); av_freep() is only valid for memory from
-+    // the av_malloc() family.  free(NULL) is a no-op so a partially
-+    // constructed environment is handled too.
-+    free(de->cmd_fifo);
-+    de->cmd_fifo = NULL;
-+    free(de->bit_fifo);
-+    de->bit_fifo = NULL;
-+
-+    sem_destroy(&de->phase_wait);
-+    av_free(de);
-+}
-+
-+// Allocate a decode environment for avctx and register it in the first free
-+// slot of rpi->dec_envs (only the first avctx->thread_count slots are
-+// considered).  Returns NULL if any allocation fails or no slot is free.
-+static dec_env_t *
-+dec_env_new(AVCodecContext * const avctx, RPI_T * const rpi)
-+{
-+    dec_env_t * const de = av_mallocz(sizeof(*de));
-+    int slot = 0;
-+
-+    if (de == NULL)
-+        return NULL;
-+
-+    de->avctx = avctx;
-+    de->phase_no = RPIVID_PHASE_NEW;
-+
-+    sem_init(&de->phase_wait, 0, 0);
-+
-+    // Both FIFOs start with room for 1024 entries
-+    de->cmd_max = 1024;
-+    de->cmd_fifo = malloc(de->cmd_max * sizeof(struct RPI_CMD));
-+    if (de->cmd_fifo == NULL)
-+        goto fail;
-+
-+    de->bit_max = 1024;
-+    de->bit_fifo = malloc(de->bit_max * sizeof(struct RPI_BIT));
-+    if (de->bit_fifo == NULL)
-+        goto fail;
-+
-+    pthread_mutex_lock(&rpi->phase_lock); // Abuse - not worth creating a lock just for this
-+    while (slot != avctx->thread_count && rpi->dec_envs[slot] != NULL)
-+        ++slot;
-+    if (slot != avctx->thread_count)
-+        rpi->dec_envs[slot] = de;
-+    pthread_mutex_unlock(&rpi->phase_lock);
-+
-+    if (slot == avctx->thread_count) {
-+        av_log(avctx, AV_LOG_ERROR, "Failed to find a slot for hw thread context\n");
-+        goto fail;
-+    }
-+
-+    return de;
-+
-+fail:
-+    dec_env_delete(de);
-+    return NULL;
-+}
-+
-+
-+// Take a reference on rpi and return the decode environment belonging to
-+// avctx, creating one in the first empty slot if none exists yet.
-+// On success the caller holds a reference and must call dec_env_release().
-+// Returns NULL if the context is already dead or no environment could be
-+// found/created; callers do not call dec_env_release() in that case.
-+static dec_env_t *
-+dec_env_get(AVCodecContext * const avctx, RPI_T * const rpi)
-+{
-+    dec_env_t * de = NULL;
-+    const int ref_count = atomic_fetch_add(&rpi->ref_count, 1);
-+
-+    if (ref_count <= 0) {
-+        // Already dead
-+        av_log(avctx, AV_LOG_ERROR, "RPIVID called whilst dead\n");
-+        return NULL;
-+    }
-+
-+    for (int i = 0; i != avctx->thread_count; ++i) {
-+        if (rpi->dec_envs[i] == NULL)
-+        {
-+            de = dec_env_new(avctx, rpi);
-+            break;
-+        }
-+        if (rpi->dec_envs[i]->avctx == avctx)
-+        {
-+            de = rpi->dec_envs[i];
-+            break;
-+        }
-+    }
-+
-+    if (de == NULL) {
-+        // Nothing to hand back, so drop the reference taken above ourselves
-+        // (same logic as dec_env_release()); otherwise it would never be
-+        // released as callers return immediately on NULL
-+        if (atomic_fetch_sub(&rpi->ref_count, 1) == 1)
-+            sem_post(&rpi->ref_zero);
-+    }
-+    return de;
-+}
-+
-+// Call at end of fn
-+// Used to ensure we aren't in a worker thread when killed
-+// Drops one reference on rpi and posts rpi->ref_zero when the count reaches zero
-+static void
-+dec_env_release(RPI_T * const rpi, dec_env_t * const de)
-+{
-+    // atomic_fetch_sub returns the value held before the decrement
-+    if (atomic_fetch_sub(&rpi->ref_count, 1) != 1)
-+        return;
-+
-+    sem_post(&rpi->ref_zero);
-+}
-+
-+//----------------------------------------------------------------------------
-+
-+// Wait for a slot in the given phase
-+// If this env is not next in decode order for the phase it is pushed on the
-+// phase's wait queue and blocks on its own semaphore until posted.
-+// Returns 0 on success or an AVERROR if sem_wait fails with anything other
-+// than EINTR.  Any error return is probably fatal
-+static int
-+wait_phase(RPI_T * const rpi, dec_env_t * const de, const int phase_no)
-+{
-+    phase_wait_env_t *const phase = &rpi->phase_reqs[phase_no];
-+    int must_block;
-+
-+    pthread_mutex_lock(&rpi->phase_lock);
-+    must_block = (phase->last_order + 1 != de->decode_order);
-+    if (must_block) {
-+        // Push onto the head of the wait queue
-+        de->phase_wait_q_next = phase->q;
-+        phase->q = de;
-+    }
-+    pthread_mutex_unlock(&rpi->phase_lock);
-+
-+    if (must_block) {
-+        // Retry if interrupted by a signal
-+        while (sem_wait(&de->phase_wait) == -1)
-+        {
-+            const int err = errno;
-+            if (err != EINTR)
-+                return AVERROR(err);
-+        }
-+    }
-+
-+    de->phase_no = phase_no;
-+    return 0;
-+}
-+
-+// Mark the given phase as complete for de and wake the queued env (if any)
-+// whose decode_order is the next one in sequence.
-+static void
-+post_phase(RPI_T * const rpi, dec_env_t * const de, const int phase_no)
-+{
-+    phase_wait_env_t *const phase = &rpi->phase_reqs[phase_no];
-+    dec_env_t * wake_de = NULL;
-+    dec_env_t ** link;
-+
-+    pthread_mutex_lock(&rpi->phase_lock);
-+
-+    phase->last_order = de->decode_order;
-+
-+    // Walk the queue via the link pointers so the match can be unlinked in place
-+    for (link = &phase->q; *link != NULL; link = &(*link)->phase_wait_q_next) {
-+        dec_env_t * const cand = *link;
-+
-+        if (cand->decode_order == phase->last_order + 1) {
-+            // Next in order - remove from Q
-+            *link = cand->phase_wait_q_next;
-+            cand->phase_wait_q_next = NULL; // Tidy
-+            wake_de = cand;
-+            break;
-+        }
-+    }
-+
-+    pthread_mutex_unlock(&rpi->phase_lock);
-+
-+    // Post outside the lock
-+    if (wake_de != NULL)
-+        sem_post(&wake_de->phase_wait);
-+}
-+
-+// Wait & signal stuff s.t. threads in other phases can continue
-+// Steps through every phase after the current one, waiting for and then
-+// immediately posting each, and leaves de in RPIVID_PHASE_NEW.
-+static void
-+abort_phases(RPI_T * const rpi, dec_env_t * const de)
-+{
-+    int phase = de->phase_no + 1;
-+
-+    while (phase < RPIVID_PHASE_NEW) {
-+        wait_phase(rpi, de, phase);
-+        post_phase(rpi, de, phase);
-+        ++phase;
-+    }
-+    de->phase_no = RPIVID_PHASE_NEW;
-+}
-+
-+// Start timing for phase
-+// Adds the time since the phase was last stamped to its out-of-phase total
-+// and re-stamps it.  Compiled to nothing unless OPT_PHASE_TIMING is set.
-+// Stats only - no actual effect
-+static inline void tstart_phase(RPI_T * const rpi, const int phase_no)
-+{
-+#if OPT_PHASE_TIMING
-+    phase_wait_env_t *const phase = &rpi->phase_reqs[phase_no];
-+    const int64_t t_now = tus64();
-+
-+    // A zero phase_time means there is no previous stamp to measure from
-+    if (phase->phase_time != 0) {
-+        phase->time_out_phase += t_now - phase->phase_time;
-+    }
-+    phase->phase_time = t_now;
-+#endif
-+}
-+
-+#if OPT_PHASE_TIMING
-+// Sum the avg_n most recent entries of the phase's time stash and return the
-+// index (0..9) of the first threshold that the sum falls below, or 9 if none
-+static unsigned int tavg_bin_phase(phase_wait_env_t *const p, const unsigned int avg_n)
-+{
-+    uint64_t total = 0;
-+    unsigned int n = 0;
-+    unsigned int bin = 0;
-+
-+    // The stash is a 16 entry ring; step backwards from the current index
-+    while (n != avg_n) {
-+        total += p->time_stash[(p->i3 - n) & 15];
-+        ++n;
-+    }
-+
-+    while (bin != 9) {
-+        if (time_thresholds[bin] * 1000 * avg_n > total)
-+            break;
-+        ++bin;
-+    }
-+    return bin;
-+}
-+#endif
-+
-+// End timing for phase
-+// Accumulates the time spent in the phase, tracks the maximum and updates
-+// the 1/3/5 sample histograms.  Compiled to nothing unless OPT_PHASE_TIMING
-+// is set.
-+// Stats only - no actual effect
-+static inline void tend_phase(RPI_T * const rpi, const int phase_no)
-+{
-+#if OPT_PHASE_TIMING
-+    phase_wait_env_t *const phase = &rpi->phase_reqs[phase_no];
-+    const uint64_t t_now = tus64();
-+    const uint64_t elapsed = t_now - phase->phase_time;
-+
-+    phase->time_in_phase += elapsed;
-+    phase->phase_time = t_now;
-+    phase->time_stash[phase->i3] = elapsed;
-+
-+    if (elapsed > phase->max_phase_time) {
-+        phase->max_phase_time = elapsed;
-+        phase->max_time_decode_order = phase->last_order;
-+    }
-+
-+    // Histogram the latest sample and the sums over the last 3 and 5 samples
-+    phase->time_bins[tavg_bin_phase(phase, 1)] += 1;
-+    phase->time_bins3[tavg_bin_phase(phase, 3)] += 1;
-+    phase->time_bins5[tavg_bin_phase(phase, 5)] += 1;
-+
-+    // Advance the 16 entry ring index
-+    phase->i3 = (phase->i3 + 1) & 15;
-+#endif
-+}
-+
-+//////////////////////////////////////////////////////////////////////////////
-+// Start frame
-+
-+// Begin decoding a frame: obtain this thread's decode environment, assign
-+// it the next decode order, and reset the per-frame command/bit FIFO
-+// lengths and picture size in CTBs.
-+// buffer and size are unused here.
-+// Returns 0 on success, -1 if no environment is available or the
-+// environment is not in a state from which a new frame may start.
-+static int rpi_hevc_start_frame(
-+    AVCodecContext * avctx,
-+    const uint8_t *buffer,
-+    uint32_t size) {
-+
-+    RPI_T * const rpi = avctx->internal->hwaccel_priv_data;
-+    dec_env_t * const de = dec_env_get(avctx, rpi);
-+    const HEVCContext * const s = avctx->priv_data;
-+    const HEVCSPS * const sps = s->ps.sps;
-+    const unsigned int CtbSizeY = 1U << sps->log2_ctb_size;
-+
-+#if TRACE_ENTRY
-+    printf("<<< %s[%p]\n", __func__, de);
-+#endif
-+
-+    if (de == NULL) {
-+        av_log(avctx, AV_LOG_ERROR, "%s: Cannot find find context for thread\n", __func__);
-+        return -1;
-+    }
-+
-+    de->phase_no = RPIVID_PHASE_START;
-+    de->decode_order = ++rpi->decode_order;  // *** atomic?
-+
-+    ff_thread_finish_setup(avctx); // Allow next thread to enter rpi_hevc_start_frame
-+
-+    if (de->state != RPIVID_DECODE_NEW && de->state != RPIVID_DECODE_END) {
-+        av_log(avctx, AV_LOG_ERROR, "%s: Unexpected state transition: %d\n", __func__, de->state);
-+        // dec_env_get() succeeded so we hold a reference - drop it on this
-+        // error path too, as on the normal exit below
-+        dec_env_release(rpi, de);
-+        return -1;
-+    }
-+    de->state = RPIVID_DECODE_START;
-+
-+    // Picture size in CTBs, rounding up partial CTBs
-+    de->PicWidthInCtbsY  = (sps->width + CtbSizeY - 1) / CtbSizeY;  //7-15
-+    de->PicHeightInCtbsY = (sps->height + CtbSizeY - 1) / CtbSizeY;  //7-17
-+    de->bit_len = 0;
-+    de->cmd_len = 0;
-+
-+#if TRACE_ENTRY
-+    printf(">>> %s[%p]\n", __func__, de);
-+#endif
-+
-+    dec_env_release(rpi, de);
-+    return 0;
-+}
-+
-+//////////////////////////////////////////////////////////////////////////////
-+// Slice messages
-+
-+// Append one 16-bit message to the end of de->slice_msgs
-+static void msg_slice(dec_env_t * const de, const uint16_t msg) {
-+    de->slice_msgs[de->num_slice_msgs] = msg;
-+    de->num_slice_msgs++;
-+}
-+
-+// Queue writes of the accumulated slice messages: first the message count
-+// combined with the slice id in SLICECMDS, then one write per message at
-+// 0x4000 + 4 * index.
-+static void program_slicecmds(dec_env_t * const de, const int sliceid) {
-+    int n = 0;
-+
-+    p1_apb_write(de, RPI_SLICECMDS, de->num_slice_msgs+(sliceid<<8));
-+
-+    while (n < de->num_slice_msgs) {
-+        p1_apb_write(de, 0x4000+4*n, de->slice_msgs[n] & 0xffff);
-+        n++;
-+    }
-+}
-+
-+// Build the list of slice messages (de->slice_msgs) for the current slice:
-+// the slice command word, then for P/B slices a description of every
-+// reference picture (with weighted prediction tables when enabled), and
-+// finally the deblock and QP offset messages.
-+// Also sets de->dpbno_col, the DPB index of the collocated picture
-+// (left as 0 unless temporal MVP is enabled in the SPS for a P/B slice).
-+static void pre_slice_decode(dec_env_t * const de, const HEVCContext * const s) {
-+    const HEVCSPS * const sps = s->ps.sps;
-+    const HEVCPPS * const pps = s->ps.pps;
-+    const SliceHeader *sh = &s->sh;
-+
-+    int weightedPredFlag, i, rIdx;
-+    uint16_t cmd_slice;
-+    unsigned int collocated_from_l0_flag;
-+
-+    de->num_slice_msgs=0;
-+    de->dpbno_col = 0;
-+    // Slice command word: the low bits carry the slice type as I=1, P=2, B=3
-+    cmd_slice = 0;
-+    if (sh->slice_type==HEVC_SLICE_I) cmd_slice = 1;
-+    if (sh->slice_type==HEVC_SLICE_P) cmd_slice = 2;
-+    if (sh->slice_type==HEVC_SLICE_B) cmd_slice = 3;
-+
-+    // Reference counts for L0 (from bit 2) and L1 (from bit 6)
-+    if (sh->slice_type!=HEVC_SLICE_I) {
-+        cmd_slice += sh->nb_refs[L0]<<2;
-+        cmd_slice += sh->nb_refs[L1]<<6;
-+    }
-+
-+    // Merge candidate count from bit 11
-+    if (sh->slice_type==HEVC_SLICE_P ||  sh->slice_type==HEVC_SLICE_B)
-+        cmd_slice |= sh->max_num_merge_cand<<11;
-+
-+    // Bit 14: 0 if slice temporal MVP is disabled; for B slices whether the
-+    // collocated list is L0; otherwise 1 for P slices and 0 for I slices
-+    collocated_from_l0_flag =
-+        !sh->slice_temporal_mvp_enabled_flag ?
-+            0 :
-+        sh->slice_type == HEVC_SLICE_B ?
-+            (sh->collocated_list == L0) :
-+            (sh->slice_type==HEVC_SLICE_P);
-+    cmd_slice |= collocated_from_l0_flag<<14;
-+
-+    if (sh->slice_type==HEVC_SLICE_P || sh->slice_type==HEVC_SLICE_B) {
-+
-+        int NoBackwardPredFlag = 1; // Flag to say all reference pictures are from the past
-+        // Cleared as soon as any reference has a POC greater than the current picture's
-+        for(i=L0; i<=L1; i++) {
-+            for(rIdx=0; rIdx <sh->nb_refs[i]; rIdx++) {
-+                HEVCFrame *f = s->ref->refPicList[i].ref[rIdx];
-+                HEVCFrame *c = s->ref; // CurrentPicture
-+                if (c->poc < f->poc) NoBackwardPredFlag = 0;
-+            }
-+        }
-+
-+        if (sps->sps_temporal_mvp_enabled_flag)
-+        {
-+            // Collocated picture comes from list 0 unless this is a B slice
-+            // with collocated_from_l0_flag clear; store its index in the DPB
-+            const RefPicList *rpl = (sh->slice_type != HEVC_SLICE_B || collocated_from_l0_flag) ?
-+                s->ref->refPicList + 0 :
-+                s->ref->refPicList + 1;
-+            de->dpbno_col = rpl->ref[sh->collocated_ref_idx] - s->DPB;
-+        }
-+
-+        // Bit 10: NoBackwardPredFlag; the command word is now complete
-+        cmd_slice += NoBackwardPredFlag<<10;
-+        msg_slice(de, cmd_slice);
-+
-+        // Write reference picture descriptions
-+        weightedPredFlag = sh->slice_type==HEVC_SLICE_P? pps->weighted_pred_flag : pps->weighted_bipred_flag;
-+
-+        for(i=L0; i<=L1; i++)
-+            for(rIdx=0; rIdx <sh->nb_refs[i]; rIdx++) {
-+                HEVCFrame *f = s->ref->refPicList[i].ref[rIdx];
-+                HEVCFrame *c = s->ref; // CurrentPicture
-+                int pic = f - s->DPB;
-+                // Make sure pictures are in range 0 to 15
-+                // (DPB entries after the current picture are shifted down by one)
-+                int adjusted_pic = f<c? pic : pic-1;
-+                int lt = s->ref->refPicList[i].isLongTerm[rIdx];
-+                // Per reference: picture index, long-term flag (bit 4) and
-+                // the weighted prediction flag in both bit 5 and bit 6,
-+                // followed by the reference POC
-+                msg_slice(de, adjusted_pic+(lt<<4)+(weightedPredFlag<<5)+(weightedPredFlag<<6));
-+                msg_slice(de, f->poc);
-+                if (weightedPredFlag) {
-+                    // Six messages: weight (with log2 denominator in the low
-+                    // 3 bits) then offset, for luma, chroma[0] and chroma[1];
-+                    // i selects the L1 tables when non-zero
-+                    msg_slice(de,   s->sh.luma_log2_weight_denom+(((i?s->  sh.luma_weight_l1:  s->sh.luma_weight_l0)[rIdx]   &0x1ff)<<3));
-+                    msg_slice(de,                                  (i?s->  sh.luma_offset_l1:  s->sh.luma_offset_l0)[rIdx]   & 0xff);
-+                    msg_slice(de, s->sh.chroma_log2_weight_denom+(((i?s->sh.chroma_weight_l1:s->sh.chroma_weight_l0)[rIdx][0]&0x1ff)<<3));
-+                    msg_slice(de,                                  (i?s->sh.chroma_offset_l1:s->sh.chroma_offset_l0)[rIdx][0]& 0xff);
-+                    msg_slice(de, s->sh.chroma_log2_weight_denom+(((i?s->sh.chroma_weight_l1:s->sh.chroma_weight_l0)[rIdx][1]&0x1ff)<<3));
-+                    msg_slice(de,                                  (i?s->sh.chroma_offset_l1:s->sh.chroma_offset_l0)[rIdx][1]& 0xff);
-+                }
-+            }
-+    }
-+    else
-+        // I slice: the command word is the only slice-specific message
-+        msg_slice(de, cmd_slice);
-+
-+    // Deblocking parameters: halved beta/tc offsets (4 bits each) plus flags
-+    msg_slice(de, ((sh->beta_offset/2)&15)
-+        + (((sh->tc_offset/2)&15)                           <<  4)
-+        + (sh->disable_deblocking_filter_flag               <<  8)
-+        + (sh->slice_loop_filter_across_slices_enabled_flag <<  9)
-+        + (pps->loop_filter_across_tiles_enabled_flag       << 10)); // CMD_DEBLOCK
-+
-+    // Slice level chroma QP offsets, 5 bits each (cb low, cr high)
-+    msg_slice(de, ((sh->slice_cr_qp_offset&31)<<5) + (sh->slice_cb_qp_offset&31)); // CMD_QPOFF
-+}
-+
-+
-+//////////////////////////////////////////////////////////////////////////////
-+
-+// Abandon the current frame for this thread: log according to the state
-+// the decode environment was in, step through any remaining phases so
-+// that other threads are not left waiting, and reset the state to
-+// RPIVID_DECODE_NEW.
-+static void rpi_hevc_abort_frame(AVCodecContext * const avctx) {
-+    RPI_T * const rpi = avctx->internal->hwaccel_priv_data;
-+    dec_env_t * const de = dec_env_get(avctx,  rpi);
-+
-+#if TRACE_ENTRY
-+    printf("<<< %s[%p]\n", __func__, de);
-+#endif
-+
-+    if (de == NULL) {
-+        av_log(avctx, AV_LOG_ERROR, "%s: Cannot find find context for thread\n", __func__);
-+        return;
-+    }
-+
-+    switch (de->state) {
-+        case RPIVID_DECODE_NEW:
-+        case RPIVID_DECODE_END:
-+            // Expected transition
-+            break;
-+
-+        case RPIVID_DECODE_SLICE:
-+            // Error transition
-+            av_log(avctx, AV_LOG_INFO, "Error in decode - aborting\n");
-+            break;
-+
-+        case RPIVID_DECODE_START:
-+        default:
-+            // Message is newline terminated like every other log line here
-+            av_log(avctx, AV_LOG_ERROR, "%s: Unexpected state transition: %d\n", __func__, de->state);
-+            break;
-+    }
-+
-+    abort_phases(rpi, de);
-+    de->state = RPIVID_DECODE_NEW;
-+
-+    dec_env_release(rpi, de);
-+}
-+
-+//////////////////////////////////////////////////////////////////////////////
-+// End frame
-+
-+static int rpi_hevc_end_frame(AVCodecContext * const avctx) {
-+    RPI_T * const rpi = avctx->internal->hwaccel_priv_data;
-+    const HEVCContext * const s = avctx->priv_data;
-+    const HEVCPPS * const pps = s->ps.pps;
-+    const HEVCSPS * const sps = s->ps.sps;
-+    dec_env_t * const de = dec_env_get(avctx,  rpi);
-+    AVFrame * const f = s->ref->frame;
-+    const unsigned int dpbno_cur = s->ref - s->DPB;
-+    vid_vc_addr_t cmds_vc;
-+    vid_vc_addr_t pu_base_vc;
-+    unsigned int pu_stride;
-+    vid_vc_addr_t coeff_base_vc;
-+    unsigned int coeff_stride;
-+    unsigned int i;
-+    int rv = 0;
-+    int status = 0;
-+    int coeffbuf_sem_claimed = 0;
-+
-+#if TRACE_ENTRY
-+    fprintf("<<< %s[%p]\n", __func__, de);
-+#endif
-+
-+    if (de == NULL) {
-+        av_log(avctx, AV_LOG_ERROR, "%s: Cannot find find context for thread\n", __func__);
-+        return AVERROR_BUG;  // Should never happen
-+    }
-+
-+    if (de->state != RPIVID_DECODE_SLICE) {
-+        av_log(avctx, AV_LOG_ERROR, "%s: Unexpected state: %d\n", __func__, de->state);
-+        rv = AVERROR_UNKNOWN;
-+        goto fail;
-+    }
-+    de->state = RPIVID_DECODE_END;
-+
-+    // End of command compilation
-+    {
-+        const unsigned int last_x = pps->col_bd[pps->num_tile_columns]-1;
-+        const unsigned int last_y = pps->row_bd[pps->num_tile_rows]-1;
-+        if (pps->entropy_coding_sync_enabled_flag) {
-+            if (de->wpp_entry_x<2 && de->PicWidthInCtbsY>2)
-+                wpp_pause(de, last_y);
-+        }
-+        p1_apb_write(de, RPI_STATUS, 1 + (last_x<<5) + (last_y<<18));
-+    }
-+
-+    // Phase 0 ---------------------------------------------------------------
-+
-+    wait_phase(rpi, de, 0);
-+    rpi_sem_wait(&rpi->bitbuf_sem);
-+    tstart_phase(rpi, 0);
-+
-+    // Copy cmds & bits into gpu side buffer
-+    // Layout: CMDS, BITS
-+    {
-+        uint8_t * const armbase = rpi->gbitbufs[rpi->bitbuf_no].arm;
-+        vid_vc_addr_t vcbase = rpi->gbitbufs[rpi->bitbuf_no].vc;
-+        unsigned int cmd_bytes = de->cmd_len * sizeof(struct RPI_CMD);
-+
-+        uint8_t * p = armbase + rnd64(cmd_bytes);
-+        uint8_t * const eobits = armbase + rpi->gbitbufs[rpi->bitbuf_no].numbytes;
-+
-+        cmds_vc = vcbase;
-+
-+        // Copy all the bits & update bitstream cmds to point at the right bits
-+        for (i = 0; i < de->bit_len; ++i)
-+        {
-+            const unsigned int seg_len = de->bit_fifo[i].len;
-+
-+            if (p + seg_len > eobits) {
-+                status = -1;
-+                break;
-+            }
-+
-+            memcpy(p, de->bit_fifo[i].ptr, seg_len);
-+            de->cmd_fifo[de->bit_fifo[i].cmd].data = MANGLE64((p - armbase) + vcbase);
-+
-+            p += rnd64(seg_len);
-+        }
-+
-+        memcpy(armbase, de->cmd_fifo, cmd_bytes);
-+    }
-+
-+    if (status == 0)
-+    {
-+        if (++rpi->bitbuf_no >= RPIVID_BITBUFS)
-+            rpi->bitbuf_no = 0;
-+    }
-+    else
-+    {
-+        sem_post(&rpi->bitbuf_sem);
-+        av_log(avctx, AV_LOG_ERROR, "Out of HEVC bit/cmd memory\n");
-+        rv = AVERROR_BUFFER_TOO_SMALL;
-+    }
-+
-+    tend_phase(rpi, 0);
-+    post_phase(rpi, de, 0);
-+
-+    if (status < 0)
-+        goto fail;
-+
-+    // Phase 1 ---------------------------------------------------------------
-+
-+    wait_phase(rpi, de, 1);
-+    rpi_sem_wait(&rpi->coeffbuf_sem);
-+    coeffbuf_sem_claimed = 1;
-+    tstart_phase(rpi, 1);
-+
-+    status = 0;
-+    for (;;)
-+    {
-+        // (Re-)allocate PU/COEFF stream space
-+        const unsigned int total_size = rpi->gcoeffbufs[rpi->coeffbuf_no].numbytes;
-+        unsigned int pu_size;
-+
-+        pu_base_vc = rpi->gcoeffbufs[rpi->coeffbuf_no].vc;
-+        pu_stride = rnd64(rpi->max_pu_msgs * 2 * de->PicWidthInCtbsY);
-+        pu_size = pu_stride * de->PicHeightInCtbsY;
-+
-+        if (pu_size >= total_size || status == -1) {
-+            GPU_MEM_PTR_T newbuf;
-+
-+            if (gpu_malloc_uncached(round_up_size(total_size + 1), &newbuf) != 0)
-+            {
-+                av_log(avctx, AV_LOG_ERROR, "Failed to reallocate coeffbuf\n");
-+                status = -1;
-+                break;
-+            }
-+            gpu_free(rpi->gcoeffbufs + rpi->coeffbuf_no);
-+            rpi->gcoeffbufs[rpi->coeffbuf_no] = newbuf;
-+            status = 0;
-+            continue;
-+        }
-+
-+        // Allocate all remaining space to coeff
-+        coeff_base_vc = pu_base_vc + pu_size;
-+        coeff_stride = ((total_size - pu_size) / de->PicHeightInCtbsY) & ~63;  // Round down to multiple of 64
-+
-+        apb_write_vc_addr(rpi, RPI_PUWBASE, pu_base_vc);
-+        apb_write_vc_len(rpi, RPI_PUWSTRIDE, pu_stride);
-+        apb_write_vc_addr(rpi, RPI_COEFFWBASE, coeff_base_vc);
-+        apb_write_vc_len(rpi, RPI_COEFFWSTRIDE, coeff_stride);
-+
-+        // Trigger command FIFO
-+        apb_write(rpi, RPI_CFNUM, de->cmd_len);
-+#if TRACE_DEV && 0
-+        apb_dump_regs(rpi, 0x0, 32);
-+        apb_dump_regs(rpi, 0x8000, 24);
-+        axi_dump(de, ((uint64_t)a64)<<6, de->cmd_len * sizeof(struct RPI_CMD));
-+#endif
-+        apb_write_vc_addr(rpi, RPI_CFBASE, cmds_vc);
-+
-+        int_wait(rpi, 1);
-+
-+        status = check_status(rpi, de);
-+
-+        if (status == -1)
-+            continue;
-+        else if (status != 1)
-+            break;
-+
-+        // Status 1 means out of PU space so try again with more
-+        // If we ran out of Coeff space then we are out of memory - we could possibly realloc?
-+        rpi->max_pu_msgs += rpi->max_pu_msgs / 2;
-+    }
-+
-+    // Inc inside the phase 1 lock, but only inc if we succeeded otherwise we
-+    // may reuse a live buffer when we kick the coeff sem
-+    if (status == 0)
-+    {
-+        if (++rpi->coeffbuf_no >= RPIVID_COEFFBUFS)
-+            rpi->coeffbuf_no = 0;
-+    }
-+    else
-+    {
-+        if (status == -1)
-+        {
-+            av_log(avctx, AV_LOG_ERROR, "Out of pu + coeff intermediate memory: pus=%d\n", rpi->max_pu_msgs);
-+            rv = AVERROR_BUFFER_TOO_SMALL;
-+        }
-+        else
-+        {
-+            av_log(avctx, AV_LOG_WARNING, "Phase 1 decode error\n");
-+            rv = AVERROR_INVALIDDATA;
-+        }
-+    }
-+
-+    tend_phase(rpi, 1);
-+    sem_post(&rpi->bitbuf_sem);
-+    post_phase(rpi, de, 1);
-+
-+    if (status != 0)
-+        goto fail;
-+
-+    // Phase 2 ---------------------------------------------------------------
-+
-+    wait_phase(rpi, de, 2);
-+
-+    if ((rv = av_rpi_zc_resolve_frame(f, ZC_RESOLVE_ALLOC)) != 0)
-+    {
-+        // As we are in phase 2 already here we don't need to worry about
-+        // ceoffbuf_no despite the early exit
-+        post_phase(rpi, de, 2);
-+        av_log(avctx, AV_LOG_ERROR, "Failed to allocate output frame\n");
-+        goto fail;
-+    }
-+
-+    tstart_phase(rpi, 2);
-+
-+    apb_write_vc_addr(rpi, RPI_PURBASE, pu_base_vc);
-+    apb_write_vc_len(rpi, RPI_PURSTRIDE, pu_stride);
-+    apb_write_vc_addr(rpi, RPI_COEFFRBASE, coeff_base_vc);
-+    apb_write_vc_len(rpi, RPI_COEFFRSTRIDE, coeff_stride);
-+
-+    apb_write_vc_addr(rpi, RPI_OUTYBASE, get_vc_address_y(f));
-+    apb_write_vc_addr(rpi, RPI_OUTCBASE, get_vc_address_u(f));
-+    apb_write_vc_len(rpi, RPI_OUTYSTRIDE, f->linesize[3] * 128);
-+    apb_write_vc_len(rpi, RPI_OUTCSTRIDE, f->linesize[3] * 128);
-+
-+    // Keep the last thing we resolved as fallback for any ref we fail to
-+    // resolve.  As a final fallback use our current frame.  The pels might
-+    // not be there yet but at least the memory is valid.
-+    //
-+    // Attempt to resolve the entire DPB - we could note what we have used
-+    // in ref lists but probably simpler and more reliable to set the whole thing
-+    {
-+        AVFrame * fallback_frame = f;
-+        for (i = 0; i != 16; ++i) {
-+            // Avoid current frame
-+            const HEVCFrame * hevc_fr = (s->DPB + i >= s->ref) ? s->DPB + i + 1 : s->DPB + i;
-+            AVFrame * fr = hevc_fr->frame;
-+
-+            if (fr != NULL &&
-+                av_rpi_zc_resolve_frame(fr, ZC_RESOLVE_FAIL) == 0)
-+            {
-+                fallback_frame = fr;
-+            }
-+            else
-+            {
-+                fr = fallback_frame;
-+            }
-+
-+            apb_write_vc_addr(rpi, 0x9000+16*i, get_vc_address_y(fr));
-+            apb_write(rpi, 0x9004+16*i, 0);
-+            apb_write_vc_addr(rpi, 0x9008+16*i, get_vc_address_u(fr));
-+            apb_write(rpi, 0x900C+16*i, 0);
-+        }
-+    }
-+
-+    apb_write(rpi, RPI_CONFIG2,
-+          (sps->bit_depth                             << 0) // BitDepthY
-+        + (sps->bit_depth                             << 4) // BitDepthC
-+       + ((sps->bit_depth>8)                          << 8) // BitDepthY
-+       + ((sps->bit_depth>8)                          << 9) // BitDepthC
-+        + (sps->log2_ctb_size                         <<10)
-+        + (pps->constrained_intra_pred_flag           <<13)
-+        + (sps->sps_strong_intra_smoothing_enable_flag<<14)
-+        + (sps->sps_temporal_mvp_enabled_flag         <<15)
-+        + (pps->log2_parallel_merge_level             <<16)
-+        + (s->sh.slice_temporal_mvp_enabled_flag      <<19)
-+        + (sps->pcm.loop_filter_disable_flag          <<20)
-+       + ((pps->cb_qp_offset&31)                      <<21)
-+       + ((pps->cr_qp_offset&31)                      <<26));
-+
-+    apb_write(rpi, RPI_FRAMESIZE, (sps->height<<16) + sps->width);
-+    apb_write(rpi, RPI_CURRPOC, s->poc);
-+
-+    // collocated reads/writes
-+    if (sps->sps_temporal_mvp_enabled_flag) {
-+        av_assert0(de->dpbno_col < RPIVID_COL_PICS);
-+        av_assert0(dpbno_cur < RPIVID_COL_PICS);
-+
-+        apb_write_vc_len(rpi, RPI_COLSTRIDE, rpi->col_stride);
-+        apb_write_vc_len(rpi, RPI_MVSTRIDE,  rpi->col_stride);
-+        apb_write_vc_addr(rpi, RPI_MVBASE,  rpi->gcolbuf.vc + dpbno_cur * rpi->col_picsize);
-+        apb_write_vc_addr(rpi, RPI_COLBASE, rpi->gcolbuf.vc + de->dpbno_col * rpi->col_picsize);
-+    }
-+
-+#if TRACE_DEV && 0
-+    apb_dump_regs(rpi, 0x0, 32);
-+    apb_dump_regs(rpi, 0x8000, 24);
-+#endif
-+
-+    apb_write(rpi, RPI_NUMROWS, de->PicHeightInCtbsY);
-+    apb_read(rpi, RPI_NUMROWS); // Read back to confirm write has reached block
-+
-+    int_wait(rpi, 2);
-+
-+    tend_phase(rpi, 2);
-+    coeffbuf_sem_claimed = 0;
-+    sem_post(&rpi->coeffbuf_sem);
-+    // Set valid here to avoid race in resolving in any pending phase 2
-+    av_rpi_zc_set_valid_frame(f);
-+
-+    post_phase(rpi, de, 2);
-+
-+    // Flush frame for CPU access
-+    // Arguably the best place would be at the start of phase 2 but here
-+    // will overlap with the wait
-+    //
-+    // * Even better would be to have better lock/unlock control in ZC for external access
-+    if (rpi->gpu_init_type == GPU_INIT_GPU)  // * CMA is currently always uncached
-+    {
-+        rpi_cache_buf_t cbuf;
-+        rpi_cache_flush_env_t * const fe = rpi_cache_flush_init(&cbuf);
-+        rpi_cache_flush_add_frame(fe, f, RPI_CACHE_FLUSH_MODE_INVALIDATE);
-+        rpi_cache_flush_finish(fe);
-+    }
-+
-+#if TRACE_ENTRY
-+    printf(">>> %s[%p] OK\n", __func__, de);
-+#endif
-+
-+    dec_env_release(rpi, de);
-+    return 0;
-+
-+fail:
-+    av_rpi_zc_set_broken_frame(f);
-+    if (coeffbuf_sem_claimed)
-+        sem_post(&rpi->coeffbuf_sem);
-+    abort_phases(rpi, de);  // Dummy any unresolved phases
-+
-+#if TRACE_ENTRY
-+    printf(">>> %s[%p] FAIL\n", __func__, de);
-+#endif
-+
-+    dec_env_release(rpi, de);
-+    return rv;
-+}
-+
-+//////////////////////////////////////////////////////////////////////////////
-+
-+
-+#if TRACE_DEV
-+static void dump_data(const uint8_t * p, size_t len)
-+{
-+    size_t i;
-+    for (i = 0; i < len; i += 16) {
-+        size_t j;
-+        printf("%04x", i);
-+        for (j = 0; j != 16; ++j) {
-+            printf("%c%02x", i == 8 ? '-' : ' ', p[i+j]);
-+        }
-+        printf("\n");
-+    }
-+}
-+#endif
-+
-+#if OPT_EMU
-+static const uint8_t * ptr_from_index(const uint8_t * b, unsigned int idx)
-+{
-+    unsigned int z = 0;
-+    while (idx--) {
-+        if (*b++ == 0) {
-+            ++z;
-+            if (z >= 2 && *b == 3) {
-+                ++b;
-+                z = 0;
-+            }
-+        }
-+        else {
-+            z = 0;
-+        }
-+    }
-+    return b;
-+}
-+#endif
-+
-+static void WriteBitstream(dec_env_t * const de, const HEVCContext * const s) {
-+    const int rpi_use_emu = OPT_EMU; // FFmpeg removes emulation prevention bytes
-+    const int offset = 0; // Always 64-byte aligned in sim, need not be on real hardware
-+    const GetBitContext *gb = &s->HEVClc->gb;
-+
-+#if OPT_EMU
-+    const uint8_t *ptr = ptr_from_index(de->nal_buffer, gb->index/8 + 1);
-+    const int len = de->nal_size - (ptr - de->nal_buffer);
-+#else
-+    const int len = 1 + gb->size_in_bits/8 - gb->index/8;
-+    const void *ptr = &gb->buffer[gb->index/8];
-+#endif
-+
-+#if TRACE_DEV
-+    printf("Index=%d, /8=%#x\n", gb->index, gb->index/8);
-+    dump_data(de->nal_buffer, 128);
-+#endif
-+
-+    p1_axi_write(de, len, ptr, p1_apb_write(de, RPI_BFBASE, 0)); // BFBASE set later
-+    p1_apb_write(de, RPI_BFNUM, len);
-+    p1_apb_write(de, RPI_BFCONTROL, offset + (1<<7)); // Stop
-+    p1_apb_write(de, RPI_BFCONTROL, offset + (rpi_use_emu<<6));
-+}
-+
-+//////////////////////////////////////////////////////////////////////////////
-+// Wavefront mode
-+
-+static void wpp_decode_slice(dec_env_t * const de, const HEVCContext * const s, int ctb_addr_ts)
-+{
-+    const HEVCPPS * const pps = s->ps.pps;
-+
-+    int i, resetQPY=1;
-+    int indep = !s->sh.dependent_slice_segment_flag;
-+    int ctb_col = s->sh.slice_ctb_addr_rs % de->PicWidthInCtbsY;
-+
-+    if (ctb_addr_ts)
-+        wpp_end_previous_slice(de, s, ctb_addr_ts);
-+    pre_slice_decode(de, s);
-+    WriteBitstream(de, s);
-+    if (ctb_addr_ts==0 || indep || de->PicWidthInCtbsY==1)
-+        WriteProb(de, s);
-+    else if (ctb_col==0)
-+        p1_apb_write(de, RPI_TRANSFER, PROB_RELOAD);
-+    else
-+        resetQPY=0;
-+    program_slicecmds(de, s->slice_idx);
-+    new_slice_segment(de, s);
-+    wpp_entry_point(de, s, indep, resetQPY, ctb_addr_ts);
-+    for (i=0; i<s->sh.num_entry_point_offsets; i++) {
-+        int ctb_addr_rs = pps->ctb_addr_ts_to_rs[ctb_addr_ts];
-+        int ctb_row = ctb_addr_rs / de->PicWidthInCtbsY;
-+        int last_x = de->PicWidthInCtbsY-1;
-+        if (de->PicWidthInCtbsY>2)
-+            wpp_pause(de, ctb_row);
-+        p1_apb_write(de, RPI_STATUS, (ctb_row<<18) + (last_x<<5) + 2);
-+        if (de->PicWidthInCtbsY==2)
-+            p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP);
-+        if (de->PicWidthInCtbsY==1)
-+            WriteProb(de, s);
-+        else
-+            p1_apb_write(de, RPI_TRANSFER, PROB_RELOAD);
-+        ctb_addr_ts += pps->column_width[0];
-+        wpp_entry_point(de, s, 0, 1, ctb_addr_ts);
-+    }
-+}
-+
-+//////////////////////////////////////////////////////////////////////////////
-+// Tiles mode
-+
-+static void decode_slice(dec_env_t * const de, const HEVCContext * const s, int ctb_addr_ts) {
-+    const HEVCPPS * const pps = s->ps.pps;
-+    int i, resetQPY;
-+
-+    if (ctb_addr_ts) end_previous_slice(de, s, ctb_addr_ts);
-+    pre_slice_decode(de, s);
-+    WriteBitstream(de, s);
-+    resetQPY = ctb_addr_ts==0
-+            || pps->tile_id[ctb_addr_ts]!=pps->tile_id[ctb_addr_ts-1]
-+            || !s->sh.dependent_slice_segment_flag;
-+    if (resetQPY) WriteProb(de, s);
-+    program_slicecmds(de, s->slice_idx);
-+    new_slice_segment(de, s);
-+    new_entry_point(de, s, !s->sh.dependent_slice_segment_flag, resetQPY, ctb_addr_ts);
-+    for (i=0; i<s->sh.num_entry_point_offsets; i++) {
-+        int ctb_addr_rs = pps->ctb_addr_ts_to_rs[ctb_addr_ts];
-+        int ctb_col = ctb_addr_rs % de->PicWidthInCtbsY;
-+        int ctb_row = ctb_addr_rs / de->PicWidthInCtbsY;
-+        int tile_x = ctb_to_tile (ctb_col, pps->col_bd, pps->num_tile_columns);
-+        int tile_y = ctb_to_tile (ctb_row, pps->row_bd, pps->num_tile_rows);
-+        int last_x = pps->col_bd[tile_x+1]-1;
-+        int last_y = pps->row_bd[tile_y+1]-1;
-+        p1_apb_write(de, RPI_STATUS, 2 + (last_x<<5) + (last_y<<18));
-+        WriteProb(de, s);
-+        ctb_addr_ts += pps->column_width[tile_x] * pps->row_height[tile_y];
-+        new_entry_point(de, s, 0, 1, ctb_addr_ts);
-+    }
-+}
-+
-+//////////////////////////////////////////////////////////////////////////////
-+
-+static int cabac_start_align(HEVCContext *s)
-+{
-+    GetBitContext *gb = &s->HEVClc->gb;
-+    skip_bits(gb, 1);
-+    align_get_bits(gb);
-+    // Should look at getting rid of this
-+    return ff_init_cabac_decoder(&s->HEVClc->cc,
-+                          gb->buffer + get_bits_count(gb) / 8,
-+                          (get_bits_left(gb) + 7) / 8);
-+}
-+
-+static int rpi_hevc_decode_slice(
-+    AVCodecContext *avctx,
-+    const uint8_t *buffer,
-+    uint32_t size)
-+{
-+    RPI_T * const rpi = avctx->internal->hwaccel_priv_data;
-+    HEVCContext * const s = avctx->priv_data;
-+    dec_env_t * const de = dec_env_get(avctx, rpi);
-+    const HEVCPPS *pps = s->ps.pps;
-+    int ctb_addr_ts = pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs];
-+
-+#if TRACE_ENTRY
-+    printf("<<< %s[%p]\n", __func__, de);
-+#endif
-+    if (de == NULL) {
-+        av_log(avctx, AV_LOG_ERROR, "%s: Cannot find find context for thread\n", __func__);
-+        return -1;
-+    }
-+
-+    if (de->state != RPIVID_DECODE_START && de->state != RPIVID_DECODE_SLICE) {
-+        av_log(avctx, AV_LOG_ERROR, "%s: Unexpected state: %d\n", __func__, de->state);
-+        return -1;
-+    }
-+    de->state = RPIVID_DECODE_SLICE;
-+
-+    de->nal_buffer = buffer;
-+    de->nal_size   = size;
-+
-+#if !OPT_EMU
-+//    ff_hevc_cabac_init(s, ctb_addr_ts);
-+    cabac_start_align(s);
-+#endif
-+    if (s->ps.sps->scaling_list_enable_flag)
-+        populate_scaling_factors(de, s);
-+    pps->entropy_coding_sync_enabled_flag? wpp_decode_slice(de, s, ctb_addr_ts)
-+                                             : decode_slice(de, s, ctb_addr_ts);
-+#if TRACE_ENTRY
-+    printf(">>> %s[%p]\n", __func__, de);
-+#endif
-+    dec_env_release(rpi, de);
-+    return 0;
-+}
-+
-+//////////////////////////////////////////////////////////////////////////////
-+
-+static int rpivid_retrieve_data(void *logctx, AVFrame *frame)
-+{
-+    int rv;
-+    if ((rv = av_rpi_zc_resolve_frame(frame, ZC_RESOLVE_WAIT_VALID)) != 0)
-+        av_log(logctx, AV_LOG_ERROR, "Unable to resolve output frame\n");
-+    return rv;
-+}
-+
-+static int rpivid_hevc_alloc_frame(AVCodecContext * avctx, AVFrame *frame)
-+{
-+    RPI_T * const rpi = avctx->internal->hwaccel_priv_data;
-+    HEVCContext * const s = avctx->priv_data;
-+    // Frame buffering + 1 output.  Would need thread_count extra but we now
-+    // alloc at the start of phase 2 so that is the only thread we need the
-+    // extra buffer for.
-+    const unsigned int pool_req = s->ps.sps->temporal_layer[s->ps.sps->max_sub_layers - 1].max_dec_pic_buffering + 1;
-+    int rv;
-+
-+    if (av_rpi_zc_in_use(avctx))
-+    {
-+        const AVZcEnvPtr zc = avctx->opaque;
-+        av_rpi_zc_set_decoder_pool_size(zc, pool_req);
-+        rv = av_rpi_zc_get_buffer(zc, frame);   // get_buffer2 would alloc
-+    }
-+    else
-+    {
-+        if (rpi->zc == NULL) {
-+            pthread_mutex_lock(&rpi->phase_lock); // Abuse - not worth creating a lock just for this
-+            // Alloc inside lock to make sure we only ever alloc one
-+            if (rpi->zc == NULL) {
-+                rpi->zc = av_rpi_zc_int_env_alloc(s);
-+            }
-+            pthread_mutex_unlock(&rpi->phase_lock);
-+        }
-+        av_rpi_zc_set_decoder_pool_size(rpi->zc, pool_req); // Ignored by local allocator, but set anyway :-)
-+        rv = (rpi->zc == NULL) ? AVERROR(ENOMEM) :
-+            av_rpi_zc_get_buffer(rpi->zc, frame);
-+    }
-+
-+    if (rv == 0 &&
-+        (rv = ff_attach_decode_data(frame)) < 0)
-+    {
-+        av_frame_unref(frame);
-+    }
-+
-+    if (rv == 0)
-+    {
-+        FrameDecodeData *fdd = (FrameDecodeData*)frame->private_ref->data;
-+        fdd->post_process = rpivid_retrieve_data;
-+    }
-+
-+    return rv;
-+}
-+
-+#if OPT_PHASE_TIMING
-+static void log_bin_phase(AVCodecContext * const avctx, const unsigned int * const bins)
-+{
-+    av_log(avctx, AV_LOG_INFO, "%7d %7d %7d %7d %7d %7d %7d %7d %7d\n",
-+           bins[0],  bins[1], bins[2], bins[3],
-+           bins[4],  bins[5], bins[6], bins[7], bins[8]);
-+}
-+#endif
-+
-+//////////////////////////////////////////////////////////////////////////////
-+
-+static int rpi_hevc_free(AVCodecContext *avctx) {
-+    RPI_T * const rpi = avctx->internal->hwaccel_priv_data;
-+
-+#if TRACE_ENTRY
-+    printf("<<< %s\n", __func__);
-+#endif
-+
-+    dec_env_release(rpi, NULL);
-+
-+    // Wait for everything else to stop
-+    {
-+        struct timespec tt;
-+        clock_gettime(CLOCK_REALTIME, &tt);
-+        tt.tv_sec += 2;
-+        while (sem_timedwait(&rpi->ref_zero, &tt) == -1) {
-+            const int err = errno;
-+            if (err == ETIMEDOUT) {
-+                av_log(avctx, AV_LOG_FATAL, "Rpivid worker threads still running\n");
-+                return -1;
-+            }
-+            if (err != EINTR) {
-+                av_log(avctx, AV_LOG_ERROR, "Unexpected error %d waiting for work thread to stop\n", err);
-+                break;
-+            }
-+        }
-+    }
-+
-+#if OPT_PHASE_TIMING
-+    {
-+        unsigned int i;
-+        for (i = 0; i != RPIVID_PHASES; ++i) {
-+            const phase_wait_env_t * const p = rpi->phase_reqs + i;
-+            av_log(avctx, AV_LOG_INFO, "Phase %u: In %3u.%06u, Out %3u.%06u\n", i,
-+                   (unsigned int)(p->time_in_phase / 1000000), (unsigned int)(p->time_in_phase % 1000000),
-+                   (unsigned int)(p->time_out_phase / 1000000), (unsigned int)(p->time_out_phase % 1000000));
-+            av_log(avctx, AV_LOG_INFO, "%7d %7d %7d %7d %7d %7d %7d %7d        >\n",
-+                   time_thresholds[0], time_thresholds[1], time_thresholds[2], time_thresholds[3],
-+                   time_thresholds[4], time_thresholds[5], time_thresholds[6], time_thresholds[7]);
-+            log_bin_phase(avctx, p->time_bins);
-+            log_bin_phase(avctx, p->time_bins3);
-+            log_bin_phase(avctx, p->time_bins5);
-+            av_log(avctx, AV_LOG_INFO, "Longest duraction: %ums @ frame %u\n",
-+                   (unsigned int)(p->max_phase_time / 1000),
-+                   p->max_time_decode_order);
-+        }
-+        av_log(avctx, AV_LOG_INFO, "PU max=%d\n", rpi->max_pu_msgs);
-+    }
-+#endif
-+
-+    if (rpi->dec_envs != NULL)
-+    {
-+        for (int i; i < avctx->thread_count && rpi->dec_envs[i] != NULL; ++i) {
-+            dec_env_delete(rpi->dec_envs[i]);
-+        }
-+        av_freep(&rpi->dec_envs);
-+    }
-+
-+    av_rpi_zc_int_env_freep(&rpi->zc);
-+
-+    gpu_free(&rpi->gcolbuf);
-+
-+    for (unsigned int i = 0; i != RPIVID_BITBUFS; ++i) {
-+        gpu_free(rpi->gbitbufs + i);
-+    }
-+    for (unsigned int i = 0; i != RPIVID_COEFFBUFS; ++i) {
-+        gpu_free(rpi->gcoeffbufs + i);
-+    }
-+
-+    unmap_devp(&rpi->regs, REGS_SIZE);
-+    unmap_devp(&rpi->ints, INTS_SIZE);
-+
-+    if (rpi->gpu_init_type > 0)
-+        rpi_mem_gpu_uninit();
-+
-+    if (rpi->mbox_fd >= 0) {
-+        mbox_release_clock(rpi->mbox_fd);
-+        mbox_close(rpi->mbox_fd);
-+    }
-+
-+    sem_destroy(&rpi->ref_zero);
-+    sem_destroy(&rpi->coeffbuf_sem);
-+    sem_destroy(&rpi->bitbuf_sem);
-+
-+#if TRACE_ENTRY
-+    printf(">>> %s\n", __func__);
-+#endif
-+    return 0;
-+}
-+
-+//////////////////////////////////////////////////////////////////////////////
-+
-+static int rpi_hevc_init(AVCodecContext *avctx) {
-+    RPI_T * const rpi = avctx->internal->hwaccel_priv_data;
-+//    const char *err;
-+
-+#if TRACE_ENTRY
-+    printf("<<< %s\n", __func__);
-+#endif
-+
-+    if (avctx->width>4096 || avctx->height>4096) {
-+        av_log(NULL, AV_LOG_FATAL, "Picture size %dx%d exceeds 4096x4096 maximum for HWAccel\n", avctx->width, avctx->height);
-+        return AVERROR(ENOTSUP);
-+    }
-+
-+    memset(rpi, 0, sizeof(*rpi));
-+
-+    rpi->mbox_fd = -1;
-+    rpi->decode_order = 0;
-+
-+    // Initial PU/COEFF stream buffer split chosen as worst case seen so far
-+    rpi->max_pu_msgs = 768; // 7.2 says at most 1611 messages per CTU
-+
-+
-+    atomic_store(&rpi->ref_count, 1);
-+    sem_init(&rpi->ref_zero, 0, 0);
-+
-+    sem_init(&rpi->bitbuf_sem,   0, RPIVID_BITBUFS);
-+    sem_init(&rpi->coeffbuf_sem, 0, RPIVID_COEFFBUFS);
-+
-+    pthread_mutex_init(&rpi->phase_lock, NULL);
-+
-+    if ((rpi->mbox_fd = mbox_open()) < 0)
-+    {
-+        av_log(avctx, AV_LOG_ERROR, "Failed to open mailbox\n");
-+        goto fail;
-+    }
-+    mbox_request_clock(rpi->mbox_fd);
-+
-+    if ((rpi->regs = map_dev(avctx, REGS_NAME, REGS_SIZE)) == NULL ||
-+        (rpi->ints = map_dev(avctx, INTS_NAME, INTS_SIZE)) == NULL) {
-+        av_log(avctx, AV_LOG_ERROR, "Failed to open rpivid devices\n");
-+        goto fail;
-+    }
-+
-+    if ((rpi->gpu_init_type = rpi_mem_gpu_init(0)) < 0) {
-+        av_log(avctx, AV_LOG_ERROR, "Failed to init GPU\n");
-+        goto fail;
-+    }
-+
-+    if ((rpi->dec_envs = av_mallocz(sizeof(dec_env_t *) * avctx->thread_count)) == NULL) {
-+        av_log(avctx, AV_LOG_ERROR, "Failed to alloc %d dec envs\n", avctx->thread_count);
-+        goto fail;
-+    }
-+
-+    rpi->col_stride = rnd64(avctx->width);
-+    rpi->col_picsize = rpi->col_stride * (((avctx->height + 63) & ~63) >> 4);
-+    if (gpu_malloc_uncached(rpi->col_picsize * RPIVID_COL_PICS, &rpi->gcolbuf) != 0)
-+    {
-+        av_log(avctx, AV_LOG_ERROR, "Failed to allocate col mv buffer\n");
-+        goto fail;
-+    }
-+
-+    for (unsigned int i = 0; i != RPIVID_BITBUFS; ++i) {
-+        if (gpu_malloc_uncached(RPIVID_BITBUF_SIZE, rpi->gbitbufs + i) != 0)
-+        {
-+            av_log(avctx, AV_LOG_ERROR, "Failed to allocate bitbuf %d\n", i);
-+            goto fail;
-+        }
-+    }
-+
-+    for (unsigned int i = 0; i != RPIVID_COEFFBUFS; ++i) {
-+        if (gpu_malloc_uncached(RPIVID_COEFFBUF_SIZE, rpi->gcoeffbufs + i) != 0)
-+        {
-+            av_log(avctx, AV_LOG_ERROR, "Failed to allocate coeffbuf %d\n", i);
-+            goto fail;
-+        }
-+    }
-+
-+    av_log(avctx, AV_LOG_INFO, "RPI HEVC h/w accel init OK\n");
-+
-+    return 0;
-+
-+fail:
-+    rpi_hevc_free(avctx);
-+    return AVERROR_EXTERNAL;
-+}
-+
-+//////////////////////////////////////////////////////////////////////////////
-+
-+const AVHWAccel ff_hevc_rpi4_8_hwaccel = {
-+    .name           = "hevc_rpi4_8",
-+    .type           = AVMEDIA_TYPE_VIDEO,
-+    .id             = AV_CODEC_ID_HEVC,
-+    .pix_fmt        = AV_PIX_FMT_RPI4_8,
-+    .alloc_frame    = rpivid_hevc_alloc_frame,
-+    .start_frame    = rpi_hevc_start_frame,
-+    .end_frame      = rpi_hevc_end_frame,
-+    .abort_frame    = rpi_hevc_abort_frame,
-+    .decode_slice   = rpi_hevc_decode_slice,
-+    .init           = rpi_hevc_init,
-+    .uninit         = rpi_hevc_free,
-+    .priv_data_size = sizeof(RPI_T),
-+    .caps_internal  = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE,
-+};
-+
-+const AVHWAccel ff_hevc_rpi4_10_hwaccel = {
-+    .name           = "hevc_rpi4_10",
-+    .type           = AVMEDIA_TYPE_VIDEO,
-+    .id             = AV_CODEC_ID_HEVC,
-+    .pix_fmt        = AV_PIX_FMT_RPI4_10,
-+    .alloc_frame    = rpivid_hevc_alloc_frame,
-+    .start_frame    = rpi_hevc_start_frame,
-+    .end_frame      = rpi_hevc_end_frame,
-+    .abort_frame    = rpi_hevc_abort_frame,
-+    .decode_slice   = rpi_hevc_decode_slice,
-+    .init           = rpi_hevc_init,
-+    .uninit         = rpi_hevc_free,
-+    .priv_data_size = sizeof(RPI_T),
-+    .caps_internal  = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE,
-+};
-+
---- a/libavcodec/v4l2_buffers.c
-+++ b/libavcodec/v4l2_buffers.c
-@@ -21,6 +21,7 @@
-  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-  */
- 
-+#include <drm_fourcc.h>
- #include <linux/videodev2.h>
- #include <sys/ioctl.h>
- #include <sys/mman.h>
-@@ -29,57 +30,82 @@
- #include <poll.h>
- #include "libavcodec/avcodec.h"
- #include "libavcodec/internal.h"
-+#include "libavutil/avassert.h"
- #include "libavutil/pixdesc.h"
-+#include "libavutil/hwcontext.h"
- #include "v4l2_context.h"
- #include "v4l2_buffers.h"
- #include "v4l2_m2m.h"
-+#include "weak_link.h"
- 
- #define USEC_PER_SEC 1000000
--static AVRational v4l2_timebase = { 1, USEC_PER_SEC };
-+static const AVRational v4l2_timebase = { 1, USEC_PER_SEC };
- 
--static inline V4L2m2mContext *buf_to_m2mctx(V4L2Buffer *buf)
-+static inline V4L2m2mContext *buf_to_m2mctx(const V4L2Buffer * const buf)
- {
-     return V4L2_TYPE_IS_OUTPUT(buf->context->type) ?
-         container_of(buf->context, V4L2m2mContext, output) :
-         container_of(buf->context, V4L2m2mContext, capture);
- }
- 
--static inline AVCodecContext *logger(V4L2Buffer *buf)
-+static inline AVCodecContext *logger(const V4L2Buffer * const buf)
- {
-     return buf_to_m2mctx(buf)->avctx;
- }
- 
--static inline AVRational v4l2_get_timebase(V4L2Buffer *avbuf)
-+static inline AVRational v4l2_get_timebase(const V4L2Buffer * const avbuf)
- {
--    V4L2m2mContext *s = buf_to_m2mctx(avbuf);
--
--    if (s->avctx->pkt_timebase.num)
--        return s->avctx->pkt_timebase;
--    return s->avctx->time_base;
-+    const V4L2m2mContext *s = buf_to_m2mctx(avbuf);
-+    const AVRational tb = s->avctx->pkt_timebase.num ?
-+        s->avctx->pkt_timebase :
-+        s->avctx->time_base;
-+    return tb.num && tb.den ? tb : v4l2_timebase;
- }
- 
--static inline void v4l2_set_pts(V4L2Buffer *out, int64_t pts)
-+static inline struct timeval tv_from_int(const int64_t t)
- {
--    int64_t v4l2_pts;
-+    return (struct timeval){
-+        .tv_usec = t % USEC_PER_SEC,
-+        .tv_sec  = t / USEC_PER_SEC
-+    };
-+}
- 
--    if (pts == AV_NOPTS_VALUE)
--        pts = 0;
-+static inline int64_t int_from_tv(const struct timeval t)
-+{
-+    return (int64_t)t.tv_sec * USEC_PER_SEC + t.tv_usec;
-+}
- 
-+static inline void v4l2_set_pts(V4L2Buffer * const out, const int64_t pts)
-+{
-     /* convert pts to v4l2 timebase */
--    v4l2_pts = av_rescale_q(pts, v4l2_get_timebase(out), v4l2_timebase);
--    out->buf.timestamp.tv_usec = v4l2_pts % USEC_PER_SEC;
--    out->buf.timestamp.tv_sec = v4l2_pts / USEC_PER_SEC;
-+    const int64_t v4l2_pts =
-+        pts == AV_NOPTS_VALUE ? 0 :
-+            av_rescale_q(pts, v4l2_get_timebase(out), v4l2_timebase);
-+    out->buf.timestamp = tv_from_int(v4l2_pts);
- }
- 
--static inline int64_t v4l2_get_pts(V4L2Buffer *avbuf)
-+static inline int64_t v4l2_get_pts(const V4L2Buffer * const avbuf)
- {
--    int64_t v4l2_pts;
--
-+    const int64_t v4l2_pts = int_from_tv(avbuf->buf.timestamp);
-+    return v4l2_pts != 0 ? v4l2_pts : AV_NOPTS_VALUE;
-+#if 0
-     /* convert pts back to encoder timebase */
--    v4l2_pts = (int64_t)avbuf->buf.timestamp.tv_sec * USEC_PER_SEC +
--                        avbuf->buf.timestamp.tv_usec;
-+    return
-+        avbuf->context->no_pts_rescale ? v4l2_pts :
-+        v4l2_pts == 0 ? AV_NOPTS_VALUE :
-+            av_rescale_q(v4l2_pts, v4l2_timebase, v4l2_get_timebase(avbuf));
-+#endif
-+}
- 
--    return av_rescale_q(v4l2_pts, v4l2_timebase, v4l2_get_timebase(avbuf));
-+static void set_buf_length(V4L2Buffer *out, unsigned int plane, uint32_t bytesused, uint32_t length)
-+{
-+    if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) {
-+        out->planes[plane].bytesused = bytesused;
-+        out->planes[plane].length = length;
-+    } else {
-+        out->buf.bytesused = bytesused;
-+        out->buf.length = length;
-+    }
- }
- 
- static enum AVColorPrimaries v4l2_get_color_primaries(V4L2Buffer *buf)
-@@ -116,6 +142,105 @@ static enum AVColorPrimaries v4l2_get_co
-     return AVCOL_PRI_UNSPECIFIED;
- }
- 
-+static void v4l2_set_color(V4L2Buffer *buf,
-+                           const enum AVColorPrimaries avcp,
-+                           const enum AVColorSpace avcs,
-+                           const enum AVColorTransferCharacteristic avxc)
-+{
-+    enum v4l2_ycbcr_encoding ycbcr = V4L2_YCBCR_ENC_DEFAULT;
-+    enum v4l2_colorspace cs = V4L2_COLORSPACE_DEFAULT;
-+    enum v4l2_xfer_func xfer = V4L2_XFER_FUNC_DEFAULT;
-+
-+    switch (avcp) {
-+    case AVCOL_PRI_BT709:
-+        cs = V4L2_COLORSPACE_REC709;
-+        ycbcr = V4L2_YCBCR_ENC_709;
-+        break;
-+    case AVCOL_PRI_BT470M:
-+        cs = V4L2_COLORSPACE_470_SYSTEM_M;
-+        ycbcr = V4L2_YCBCR_ENC_601;
-+        break;
-+    case AVCOL_PRI_BT470BG:
-+        cs = V4L2_COLORSPACE_470_SYSTEM_BG;
-+        break;
-+    case AVCOL_PRI_SMPTE170M:
-+        cs = V4L2_COLORSPACE_SMPTE170M;
-+        break;
-+    case AVCOL_PRI_SMPTE240M:
-+        cs = V4L2_COLORSPACE_SMPTE240M;
-+        break;
-+    case AVCOL_PRI_BT2020:
-+        cs = V4L2_COLORSPACE_BT2020;
-+        break;
-+    case AVCOL_PRI_SMPTE428:
-+    case AVCOL_PRI_SMPTE431:
-+    case AVCOL_PRI_SMPTE432:
-+    case AVCOL_PRI_EBU3213:
-+    case AVCOL_PRI_RESERVED:
-+    case AVCOL_PRI_FILM:
-+    case AVCOL_PRI_UNSPECIFIED:
-+    default:
-+        break;
-+    }
-+
-+    switch (avcs) {
-+    case AVCOL_SPC_RGB:
-+        cs = V4L2_COLORSPACE_SRGB;
-+        break;
-+    case AVCOL_SPC_BT709:
-+        cs = V4L2_COLORSPACE_REC709;
-+        break;
-+    case AVCOL_SPC_FCC:
-+        cs = V4L2_COLORSPACE_470_SYSTEM_M;
-+        break;
-+    case AVCOL_SPC_BT470BG:
-+        cs = V4L2_COLORSPACE_470_SYSTEM_BG;
-+        break;
-+    case AVCOL_SPC_SMPTE170M:
-+        cs = V4L2_COLORSPACE_SMPTE170M;
-+        break;
-+    case AVCOL_SPC_SMPTE240M:
-+        cs = V4L2_COLORSPACE_SMPTE240M;
-+        break;
-+    case AVCOL_SPC_BT2020_CL:
-+        cs = V4L2_COLORSPACE_BT2020;
-+        ycbcr = V4L2_YCBCR_ENC_BT2020_CONST_LUM;
-+        break;
-+    case AVCOL_SPC_BT2020_NCL:
-+        cs = V4L2_COLORSPACE_BT2020;
-+        break;
-+    default:
-+        break;
-+    }
-+
-+    switch (xfer) {
-+    case AVCOL_TRC_BT709:
-+        xfer = V4L2_XFER_FUNC_709;
-+        break;
-+    case AVCOL_TRC_IEC61966_2_1:
-+        xfer = V4L2_XFER_FUNC_SRGB;
-+        break;
-+    case AVCOL_TRC_SMPTE240M:
-+        xfer = V4L2_XFER_FUNC_SMPTE240M;
-+        break;
-+    case AVCOL_TRC_SMPTE2084:
-+        xfer = V4L2_XFER_FUNC_SMPTE2084;
-+        break;
-+    default:
-+        break;
-+    }
-+
-+    if (V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type)) {
-+        buf->context->format.fmt.pix_mp.colorspace = cs;
-+        buf->context->format.fmt.pix_mp.ycbcr_enc = ycbcr;
-+        buf->context->format.fmt.pix_mp.xfer_func = xfer;
-+    } else {
-+        buf->context->format.fmt.pix.colorspace = cs;
-+        buf->context->format.fmt.pix.ycbcr_enc = ycbcr;
-+        buf->context->format.fmt.pix.xfer_func = xfer;
-+    }
-+}
-+
- static enum AVColorRange v4l2_get_color_range(V4L2Buffer *buf)
- {
-     enum v4l2_quantization qt;
-@@ -134,6 +259,20 @@ static enum AVColorRange v4l2_get_color_
-      return AVCOL_RANGE_UNSPECIFIED;
- }
- 
-+static void v4l2_set_color_range(V4L2Buffer *buf, const enum AVColorRange avcr)
-+{
-+    const enum v4l2_quantization q =
-+        avcr == AVCOL_RANGE_MPEG ? V4L2_QUANTIZATION_LIM_RANGE :
-+        avcr == AVCOL_RANGE_JPEG ? V4L2_QUANTIZATION_FULL_RANGE :
-+            V4L2_QUANTIZATION_DEFAULT;
-+
-+    if (V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type)) {
-+        buf->context->format.fmt.pix_mp.quantization = q;
-+    } else {
-+        buf->context->format.fmt.pix.quantization = q;
-+    }
-+}
-+
- static enum AVColorSpace v4l2_get_color_space(V4L2Buffer *buf)
- {
-     enum v4l2_ycbcr_encoding ycbcr;
-@@ -210,73 +349,165 @@ static enum AVColorTransferCharacteristi
-     return AVCOL_TRC_UNSPECIFIED;
- }
- 
--static void v4l2_free_buffer(void *opaque, uint8_t *unused)
-+static int v4l2_buf_is_interlaced(const V4L2Buffer * const buf)
- {
--    V4L2Buffer* avbuf = opaque;
--    V4L2m2mContext *s = buf_to_m2mctx(avbuf);
--
--    if (atomic_fetch_sub(&avbuf->context_refcount, 1) == 1) {
--        atomic_fetch_sub_explicit(&s->refcount, 1, memory_order_acq_rel);
-+    return V4L2_FIELD_IS_INTERLACED(buf->buf.field);
-+}
- 
--        if (s->reinit) {
--            if (!atomic_load(&s->refcount))
--                sem_post(&s->refsync);
--        } else {
--            if (s->draining && V4L2_TYPE_IS_OUTPUT(avbuf->context->type)) {
--                /* no need to queue more buffers to the driver */
--                avbuf->status = V4L2BUF_AVAILABLE;
--            }
--            else if (avbuf->context->streamon)
--                ff_v4l2_buffer_enqueue(avbuf);
--        }
-+static int v4l2_buf_is_top_first(const V4L2Buffer * const buf)
-+{
-+    return buf->buf.field == V4L2_FIELD_INTERLACED_TB;
-+}
- 
--        av_buffer_unref(&avbuf->context_ref);
--    }
-+static void v4l2_set_interlace(V4L2Buffer * const buf, const int is_interlaced, const int is_tff)
-+{
-+    buf->buf.field = !is_interlaced ? V4L2_FIELD_NONE :
-+        is_tff ? V4L2_FIELD_INTERLACED_TB : V4L2_FIELD_INTERLACED_BT;
- }
- 
--static int v4l2_buf_increase_ref(V4L2Buffer *in)
-+static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf)
- {
--    V4L2m2mContext *s = buf_to_m2mctx(in);
-+    AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame;
-+    AVDRMLayerDescriptor *layer;
- 
--    if (in->context_ref)
--        atomic_fetch_add(&in->context_refcount, 1);
--    else {
--        in->context_ref = av_buffer_ref(s->self_ref);
--        if (!in->context_ref)
--            return AVERROR(ENOMEM);
-+    /* fill the DRM frame descriptor */
-+    drm_desc->nb_objects = avbuf->num_planes;
-+    drm_desc->nb_layers = 1;
- 
--        in->context_refcount = 1;
-+    layer = &drm_desc->layers[0];
-+    layer->nb_planes = avbuf->num_planes;
-+
-+    for (int i = 0; i < avbuf->num_planes; i++) {
-+        layer->planes[i].object_index = i;
-+        layer->planes[i].offset = 0;
-+        layer->planes[i].pitch = avbuf->plane_info[i].bytesperline;
-     }
- 
--    in->status = V4L2BUF_RET_USER;
--    atomic_fetch_add_explicit(&s->refcount, 1, memory_order_relaxed);
-+    switch (avbuf->context->av_pix_fmt) {
-+    case AV_PIX_FMT_YUYV422:
-+
-+        layer->format = DRM_FORMAT_YUYV;
-+        layer->nb_planes = 1;
- 
--    return 0;
-+        break;
-+
-+    case AV_PIX_FMT_NV12:
-+    case AV_PIX_FMT_NV21:
-+
-+        layer->format = avbuf->context->av_pix_fmt == AV_PIX_FMT_NV12 ?
-+            DRM_FORMAT_NV12 : DRM_FORMAT_NV21;
-+
-+        if (avbuf->num_planes > 1)
-+            break;
-+
-+        layer->nb_planes = 2;
-+
-+        layer->planes[1].object_index = 0;
-+        layer->planes[1].offset = avbuf->plane_info[0].bytesperline *
-+            avbuf->context->format.fmt.pix.height;
-+        layer->planes[1].pitch = avbuf->plane_info[0].bytesperline;
-+        break;
-+
-+    case AV_PIX_FMT_YUV420P:
-+
-+        layer->format = DRM_FORMAT_YUV420;
-+
-+        if (avbuf->num_planes > 1)
-+            break;
-+
-+        layer->nb_planes = 3;
-+
-+        layer->planes[1].object_index = 0;
-+        layer->planes[1].offset = avbuf->plane_info[0].bytesperline *
-+            avbuf->context->format.fmt.pix.height;
-+        layer->planes[1].pitch = avbuf->plane_info[0].bytesperline >> 1;
-+
-+        layer->planes[2].object_index = 0;
-+        layer->planes[2].offset = layer->planes[1].offset +
-+            ((avbuf->plane_info[0].bytesperline *
-+              avbuf->context->format.fmt.pix.height) >> 2);
-+        layer->planes[2].pitch = avbuf->plane_info[0].bytesperline >> 1;
-+        break;
-+
-+    default:
-+        drm_desc->nb_layers = 0;
-+        break;
-+    }
-+
-+    return (uint8_t *) drm_desc;
- }
- 
--static int v4l2_buf_to_bufref(V4L2Buffer *in, int plane, AVBufferRef **buf)
-+static void v4l2_free_bufref(void *opaque, uint8_t *data)
- {
--    int ret;
-+    AVBufferRef * bufref = (AVBufferRef *)data;
-+    V4L2Buffer *avbuf = (V4L2Buffer *)bufref->data;
-+    struct V4L2Context *ctx = ff_weak_link_lock(&avbuf->context_wl);
- 
--    if (plane >= in->num_planes)
--        return AVERROR(EINVAL);
-+    if (ctx != NULL) {
-+        // Buffer still attached to context
-+        V4L2m2mContext *s = buf_to_m2mctx(avbuf);
- 
--    /* even though most encoders return 0 in data_offset encoding vp8 does require this value */
--    *buf = av_buffer_create((char *)in->plane_info[plane].mm_addr + in->planes[plane].data_offset,
--                            in->plane_info[plane].length, v4l2_free_buffer, in, 0);
--    if (!*buf)
--        return AVERROR(ENOMEM);
-+        ff_mutex_lock(&ctx->lock);
- 
--    ret = v4l2_buf_increase_ref(in);
--    if (ret)
--        av_buffer_unref(buf);
-+        ff_v4l2_buffer_set_avail(avbuf);
- 
--    return ret;
-+        if (s->draining && V4L2_TYPE_IS_OUTPUT(ctx->type)) {
-+            av_log(logger(avbuf), AV_LOG_DEBUG, "%s: Buffer avail\n", ctx->name);
-+            /* no need to queue more buffers to the driver */
-+        }
-+        else if (ctx->streamon) {
-+            av_log(logger(avbuf), AV_LOG_DEBUG, "%s: Buffer requeue\n", ctx->name);
-+            avbuf->buf.timestamp.tv_sec = 0;
-+            avbuf->buf.timestamp.tv_usec = 0;
-+            ff_v4l2_buffer_enqueue(avbuf);  // will set to IN_DRIVER
-+        }
-+        else {
-+            av_log(logger(avbuf), AV_LOG_DEBUG, "%s: Buffer freed but streamoff\n", ctx->name);
-+        }
-+
-+        ff_mutex_unlock(&ctx->lock);
-+    }
-+
-+    ff_weak_link_unlock(avbuf->context_wl);
-+    av_buffer_unref(&bufref);
- }
- 
--static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, int size, int offset, AVBufferRef* bref)
-+static int v4l2_buffer_export_drm(V4L2Buffer* avbuf)
-+{
-+    struct v4l2_exportbuffer expbuf;
-+    int i, ret;
-+
-+    for (i = 0; i < avbuf->num_planes; i++) {
-+        memset(&expbuf, 0, sizeof(expbuf));
-+
-+        expbuf.index = avbuf->buf.index;
-+        expbuf.type = avbuf->buf.type;
-+        expbuf.plane = i;
-+
-+        ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_EXPBUF, &expbuf);
-+        if (ret < 0)
-+            return AVERROR(errno);
-+
-+        if (V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type)) {
-+            /* drm frame */
-+            avbuf->drm_frame.objects[i].size = avbuf->buf.m.planes[i].length;
-+            avbuf->drm_frame.objects[i].fd = expbuf.fd;
-+            avbuf->drm_frame.objects[i].format_modifier = DRM_FORMAT_MOD_LINEAR;
-+        } else {
-+            /* drm frame */
-+            avbuf->drm_frame.objects[0].size = avbuf->buf.length;
-+            avbuf->drm_frame.objects[0].fd = expbuf.fd;
-+            avbuf->drm_frame.objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR;
-+        }
-+    }
-+
-+    return 0;
-+}
-+
-+static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, int size, int offset)
- {
-     unsigned int bytesused, length;
-+    int rv = 0;
- 
-     if (plane >= out->num_planes)
-         return AVERROR(EINVAL);
-@@ -284,32 +515,57 @@ static int v4l2_bufref_to_buf(V4L2Buffer
-     length = out->plane_info[plane].length;
-     bytesused = FFMIN(size+offset, length);
- 
--    memcpy((uint8_t*)out->plane_info[plane].mm_addr+offset, data, FFMIN(size, length-offset));
--
--    if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) {
--        out->planes[plane].bytesused = bytesused;
--        out->planes[plane].length = length;
--    } else {
--        out->buf.bytesused = bytesused;
--        out->buf.length = length;
-+    if (size > length - offset) {
-+        size = length - offset;
-+        rv = AVERROR(ENOMEM);
-     }
- 
--    return 0;
-+    memcpy((uint8_t*)out->plane_info[plane].mm_addr+offset, data, size);
-+
-+    set_buf_length(out, plane, bytesused, length);
-+
-+    return rv;
-+}
-+
-+static AVBufferRef * wrap_avbuf(V4L2Buffer * const avbuf)
-+{
-+    AVBufferRef * bufref = av_buffer_ref(avbuf->context->bufrefs[avbuf->buf.index]);
-+    AVBufferRef * newbuf;
-+
-+    if (!bufref)
-+        return NULL;
-+
-+    newbuf = av_buffer_create((uint8_t *)bufref, sizeof(*bufref), v4l2_free_bufref, NULL, 0);
-+    if (newbuf == NULL)
-+        av_buffer_unref(&bufref);
-+
-+    avbuf->status = V4L2BUF_RET_USER;
-+    return newbuf;
- }
- 
- static int v4l2_buffer_buf_to_swframe(AVFrame *frame, V4L2Buffer *avbuf)
- {
--    int i, ret;
-+    int i;
- 
-     frame->format = avbuf->context->av_pix_fmt;
- 
--    for (i = 0; i < avbuf->num_planes; i++) {
--        ret = v4l2_buf_to_bufref(avbuf, i, &frame->buf[i]);
--        if (ret)
--            return ret;
-+    frame->buf[0] = wrap_avbuf(avbuf);
-+    if (frame->buf[0] == NULL)
-+        return AVERROR(ENOMEM);
-+
-+    if (buf_to_m2mctx(avbuf)->output_drm) {
-+        /* 1. get references to the actual data */
-+        frame->data[0] = (uint8_t *) v4l2_get_drm_frame(avbuf);
-+        frame->format = AV_PIX_FMT_DRM_PRIME;
-+        frame->hw_frames_ctx = av_buffer_ref(avbuf->context->frames_ref);
-+        return 0;
-+    }
- 
-+
-+    /* 1. get references to the actual data */
-+    for (i = 0; i < avbuf->num_planes; i++) {
-+        frame->data[i] = (uint8_t *)avbuf->plane_info[i].mm_addr + avbuf->planes[i].data_offset;
-         frame->linesize[i] = avbuf->plane_info[i].bytesperline;
--        frame->data[i] = frame->buf[i]->data;
-     }
- 
-     /* fixup special cases */
-@@ -318,17 +574,17 @@ static int v4l2_buffer_buf_to_swframe(AV
-     case AV_PIX_FMT_NV21:
-         if (avbuf->num_planes > 1)
-             break;
--        frame->linesize[1] = avbuf->plane_info[0].bytesperline;
--        frame->data[1] = frame->buf[0]->data + avbuf->plane_info[0].bytesperline * avbuf->context->format.fmt.pix_mp.height;
-+        frame->linesize[1] = frame->linesize[0];
-+        frame->data[1] = frame->data[0] + frame->linesize[0] * ff_v4l2_get_format_height(&avbuf->context->format);
-         break;
- 
-     case AV_PIX_FMT_YUV420P:
-         if (avbuf->num_planes > 1)
-             break;
--        frame->linesize[1] = avbuf->plane_info[0].bytesperline >> 1;
--        frame->linesize[2] = avbuf->plane_info[0].bytesperline >> 1;
--        frame->data[1] = frame->buf[0]->data + avbuf->plane_info[0].bytesperline * avbuf->context->format.fmt.pix_mp.height;
--        frame->data[2] = frame->data[1] + ((avbuf->plane_info[0].bytesperline * avbuf->context->format.fmt.pix_mp.height) >> 2);
-+        frame->linesize[1] = frame->linesize[0] / 2;
-+        frame->linesize[2] = frame->linesize[1];
-+        frame->data[1] = frame->data[0] + frame->linesize[0] * ff_v4l2_get_format_height(&avbuf->context->format);
-+        frame->data[2] = frame->data[1] + frame->linesize[1] * ff_v4l2_get_format_height(&avbuf->context->format) / 2;
-         break;
- 
-     default:
-@@ -338,68 +594,127 @@ static int v4l2_buffer_buf_to_swframe(AV
-     return 0;
- }
- 
-+static void cpy_2d(uint8_t * dst, int dst_stride, const uint8_t * src, int src_stride, int w, int h)
-+{
-+    if (dst_stride == src_stride && w + 32 >= dst_stride) {
-+        memcpy(dst, src, dst_stride * h);
-+    }
-+    else {
-+        while (--h >= 0) {
-+            memcpy(dst, src, w);
-+            dst += dst_stride;
-+            src += src_stride;
-+        }
-+    }
-+}
-+
-+static int is_chroma(const AVPixFmtDescriptor *desc, int i, int num_planes)
-+{
-+    return i != 0  && !(i == num_planes - 1 && (desc->flags & AV_PIX_FMT_FLAG_ALPHA));
-+}
-+
-+static int v4l2_buffer_primeframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
-+{
-+    const AVDRMFrameDescriptor *const src = (const AVDRMFrameDescriptor *)frame->data[0];
-+
-+    if (frame->format != AV_PIX_FMT_DRM_PRIME || !src)
-+        return AVERROR(EINVAL);
-+
-+    av_assert0(out->buf.memory == V4L2_MEMORY_DMABUF);
-+
-+    if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) {
-+        // Only currently cope with single buffer types
-+        if (out->buf.length != 1)
-+            return AVERROR_PATCHWELCOME;
-+        if (src->nb_objects != 1)
-+            return AVERROR(EINVAL);
-+
-+        out->planes[0].m.fd = src->objects[0].fd;
-+    }
-+    else {
-+        if (src->nb_objects != 1)
-+            return AVERROR(EINVAL);
-+
-+        out->buf.m.fd      = src->objects[0].fd;
-+    }
-+
-+    // No need to copy src AVDescriptor and if we did then we may confuse
-+    // fd close on free
-+    out->ref_buf = av_buffer_ref(frame->buf[0]);
-+
-+    return 0;
-+}
-+
- static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
- {
--    int i, ret;
--    struct v4l2_format fmt = out->context->format;
--    int pixel_format = V4L2_TYPE_IS_MULTIPLANAR(fmt.type) ?
--                       fmt.fmt.pix_mp.pixelformat : fmt.fmt.pix.pixelformat;
--    int height       = V4L2_TYPE_IS_MULTIPLANAR(fmt.type) ?
--                       fmt.fmt.pix_mp.height : fmt.fmt.pix.height;
--    int is_planar_format = 0;
--
--    switch (pixel_format) {
--    case V4L2_PIX_FMT_YUV420M:
--    case V4L2_PIX_FMT_YVU420M:
--#ifdef V4L2_PIX_FMT_YUV422M
--    case V4L2_PIX_FMT_YUV422M:
--#endif
--#ifdef V4L2_PIX_FMT_YVU422M
--    case V4L2_PIX_FMT_YVU422M:
--#endif
--#ifdef V4L2_PIX_FMT_YUV444M
--    case V4L2_PIX_FMT_YUV444M:
--#endif
--#ifdef V4L2_PIX_FMT_YVU444M
--    case V4L2_PIX_FMT_YVU444M:
--#endif
--    case V4L2_PIX_FMT_NV12M:
--    case V4L2_PIX_FMT_NV21M:
--    case V4L2_PIX_FMT_NV12MT_16X16:
--    case V4L2_PIX_FMT_NV12MT:
--    case V4L2_PIX_FMT_NV16M:
--    case V4L2_PIX_FMT_NV61M:
--        is_planar_format = 1;
--    }
--
--    if (!is_planar_format) {
--        const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format);
--        int planes_nb = 0;
--        int offset = 0;
--
--        for (i = 0; i < desc->nb_components; i++)
--            planes_nb = FFMAX(planes_nb, desc->comp[i].plane + 1);
--
--        for (i = 0; i < planes_nb; i++) {
--            int size, h = height;
--            if (i == 1 || i == 2) {
-+    int i;
-+    int num_planes = 0;
-+    int pel_strides[4] = {0};
-+
-+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format);
-+
-+    if ((desc->flags & AV_PIX_FMT_FLAG_HWACCEL) != 0) {
-+        av_log(NULL, AV_LOG_ERROR, "%s: HWACCEL cannot be copied\n", __func__);
-+        return -1;
-+    }
-+
-+    for (i = 0; i != desc->nb_components; ++i) {
-+        if (desc->comp[i].plane >= num_planes)
-+            num_planes = desc->comp[i].plane + 1;
-+        pel_strides[desc->comp[i].plane] = desc->comp[i].step;
-+    }
-+
-+    if (out->num_planes > 1) {
-+        if (num_planes != out->num_planes) {
-+            av_log(NULL, AV_LOG_ERROR, "%s: Num planes mismatch: %d != %d\n", __func__, num_planes, out->num_planes);
-+            return -1;
-+        }
-+        for (i = 0; i != num_planes; ++i) {
-+            int w = frame->width;
-+            int h = frame->height;
-+            if (is_chroma(desc, i, num_planes)) {
-+                w = AV_CEIL_RSHIFT(w, desc->log2_chroma_w);
-                 h = AV_CEIL_RSHIFT(h, desc->log2_chroma_h);
-             }
--            size = frame->linesize[i] * h;
--            ret = v4l2_bufref_to_buf(out, 0, frame->data[i], size, offset, frame->buf[i]);
--            if (ret)
--                return ret;
--            offset += size;
-+
-+            cpy_2d(out->plane_info[i].mm_addr, out->plane_info[i].bytesperline,
-+                   frame->data[i], frame->linesize[i],
-+                   w * pel_strides[i], h);
-+            set_buf_length(out, i, out->plane_info[i].bytesperline * h, out->plane_info[i].length);
-         }
--        return 0;
-     }
-+    else
-+    {
-+        unsigned int offset = 0;
-+
-+        for (i = 0; i != num_planes; ++i) {
-+            int w = frame->width;
-+            int h = frame->height;
-+            int dst_stride = out->plane_info[0].bytesperline;
-+            uint8_t * const dst = (uint8_t *)out->plane_info[0].mm_addr + offset;
-+
-+            if (is_chroma(desc, i, num_planes)) {
-+                // Is chroma
-+                dst_stride >>= desc->log2_chroma_w;
-+                offset += dst_stride * (out->context->height >> desc->log2_chroma_h);
-+                w = AV_CEIL_RSHIFT(w, desc->log2_chroma_w);
-+                h = AV_CEIL_RSHIFT(h, desc->log2_chroma_h);
-+            }
-+            else {
-+                // Is luma or alpha
-+                offset += dst_stride * out->context->height;
-+            }
-+            if (offset > out->plane_info[0].length) {
-+                av_log(NULL, AV_LOG_ERROR, "%s: Plane total %u > buffer size %zu\n", __func__, offset, out->plane_info[0].length);
-+                return -1;
-+            }
- 
--    for (i = 0; i < out->num_planes; i++) {
--        ret = v4l2_bufref_to_buf(out, i, frame->buf[i]->data, frame->buf[i]->size, 0, frame->buf[i]);
--        if (ret)
--            return ret;
-+            cpy_2d(dst, dst_stride,
-+                   frame->data[i], frame->linesize[i],
-+                   w * pel_strides[i], h);
-+        }
-+        set_buf_length(out, 0, offset, out->plane_info[0].length);
-     }
--
-     return 0;
- }
- 
-@@ -409,16 +724,31 @@ static int v4l2_buffer_swframe_to_buf(co
-  *
-  ******************************************************************************/
- 
--int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
-+int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out, const int64_t track_ts)
- {
--    v4l2_set_pts(out, frame->pts);
--
--    return v4l2_buffer_swframe_to_buf(frame, out);
-+    out->buf.flags = frame->key_frame ?
-+        (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME) :
-+        (out->buf.flags & ~V4L2_BUF_FLAG_KEYFRAME);
-+    // Beware that colour info is held in format rather than the actual
-+    // v4l2 buffer struct so this may not be as useful as you might hope
-+    v4l2_set_color(out, frame->color_primaries, frame->colorspace, frame->color_trc);
-+    v4l2_set_color_range(out, frame->color_range);
-+    // PTS & interlace are buffer vars
-+    if (track_ts)
-+        out->buf.timestamp = tv_from_int(track_ts);
-+    else
-+        v4l2_set_pts(out, frame->pts);
-+    v4l2_set_interlace(out, frame->interlaced_frame, frame->top_field_first);
-+
-+    return frame->format == AV_PIX_FMT_DRM_PRIME ?
-+        v4l2_buffer_primeframe_to_buf(frame, out) :
-+        v4l2_buffer_swframe_to_buf(frame, out);
- }
- 
- int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf)
- {
-     int ret;
-+    V4L2Context * const ctx = avbuf->context;
- 
-     av_frame_unref(frame);
- 
-@@ -429,17 +759,32 @@ int ff_v4l2_buffer_buf_to_avframe(AVFram
- 
-     /* 2. get frame information */
-     frame->key_frame = !!(avbuf->buf.flags & V4L2_BUF_FLAG_KEYFRAME);
-+    frame->pict_type = frame->key_frame ? AV_PICTURE_TYPE_I :
-+        (avbuf->buf.flags & V4L2_BUF_FLAG_PFRAME) != 0 ? AV_PICTURE_TYPE_P :
-+        (avbuf->buf.flags & V4L2_BUF_FLAG_BFRAME) != 0 ? AV_PICTURE_TYPE_B :
-+            AV_PICTURE_TYPE_NONE;
-     frame->color_primaries = v4l2_get_color_primaries(avbuf);
-     frame->colorspace = v4l2_get_color_space(avbuf);
-     frame->color_range = v4l2_get_color_range(avbuf);
-     frame->color_trc = v4l2_get_color_trc(avbuf);
-     frame->pts = v4l2_get_pts(avbuf);
-     frame->pkt_dts = AV_NOPTS_VALUE;
-+    frame->interlaced_frame = v4l2_buf_is_interlaced(avbuf);
-+    frame->top_field_first = v4l2_buf_is_top_first(avbuf);
- 
-     /* these values are updated also during re-init in v4l2_process_driver_event */
--    frame->height = avbuf->context->height;
--    frame->width = avbuf->context->width;
--    frame->sample_aspect_ratio = avbuf->context->sample_aspect_ratio;
-+    frame->height = ctx->height;
-+    frame->width = ctx->width;
-+    frame->sample_aspect_ratio = ctx->sample_aspect_ratio;
-+
-+    if (ctx->selection.height && ctx->selection.width) {
-+        frame->crop_left = ctx->selection.left < frame->width ? ctx->selection.left : 0;
-+        frame->crop_top  = ctx->selection.top < frame->height ? ctx->selection.top  : 0;
-+        frame->crop_right = ctx->selection.left + ctx->selection.width < frame->width ?
-+            frame->width - (ctx->selection.left + ctx->selection.width) : 0;
-+        frame->crop_bottom = ctx->selection.top + ctx->selection.height < frame->height ?
-+            frame->height - (ctx->selection.top + ctx->selection.height) : 0;
-+    }
- 
-     /* 3. report errors upstream */
-     if (avbuf->buf.flags & V4L2_BUF_FLAG_ERROR) {
-@@ -452,15 +797,15 @@ int ff_v4l2_buffer_buf_to_avframe(AVFram
- 
- int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *avbuf)
- {
--    int ret;
--
-     av_packet_unref(pkt);
--    ret = v4l2_buf_to_bufref(avbuf, 0, &pkt->buf);
--    if (ret)
--        return ret;
-+
-+    pkt->buf = wrap_avbuf(avbuf);
-+    if (pkt->buf == NULL)
-+        return AVERROR(ENOMEM);
- 
-     pkt->size = V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type) ? avbuf->buf.m.planes[0].bytesused : avbuf->buf.bytesused;
--    pkt->data = pkt->buf->data;
-+    pkt->data = (uint8_t*)avbuf->plane_info[0].mm_addr + avbuf->planes[0].data_offset;
-+    pkt->flags = 0;
- 
-     if (avbuf->buf.flags & V4L2_BUF_FLAG_KEYFRAME)
-         pkt->flags |= AV_PKT_FLAG_KEY;
-@@ -475,31 +820,91 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket
-     return 0;
- }
- 
--int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out)
-+int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket * const pkt, V4L2Buffer * const out,
-+                                    const void *extdata, size_t extlen,
-+                                    const int64_t timestamp)
- {
-     int ret;
- 
--    ret = v4l2_bufref_to_buf(out, 0, pkt->data, pkt->size, 0, pkt->buf);
--    if (ret)
-+    if (extlen) {
-+        ret = v4l2_bufref_to_buf(out, 0, extdata, extlen, 0);
-+        if (ret)
-+            return ret;
-+    }
-+
-+    ret = v4l2_bufref_to_buf(out, 0, pkt->data, pkt->size, extlen);
-+    if (ret && ret != AVERROR(ENOMEM))
-         return ret;
- 
--    v4l2_set_pts(out, pkt->pts);
-+    if (timestamp)
-+        out->buf.timestamp = tv_from_int(timestamp);
-+    else
-+        v4l2_set_pts(out, pkt->pts);
-+
-+    out->buf.flags = (pkt->flags & AV_PKT_FLAG_KEY) != 0 ?
-+        (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME) :
-+        (out->buf.flags & ~V4L2_BUF_FLAG_KEYFRAME);
- 
--    if (pkt->flags & AV_PKT_FLAG_KEY)
--        out->flags = V4L2_BUF_FLAG_KEYFRAME;
-+    return ret;
-+}
- 
--    return 0;
-+int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out)
-+{
-+    return ff_v4l2_buffer_avpkt_to_buf_ext(pkt, out, NULL, 0, 0);
-+}
-+
-+
-+static void v4l2_buffer_buffer_free(void *opaque, uint8_t *data)
-+{
-+    V4L2Buffer * const avbuf = (V4L2Buffer *)data;
-+    int i;
-+
-+    for (i = 0; i != FF_ARRAY_ELEMS(avbuf->plane_info); ++i) {
-+        struct V4L2Plane_info *p = avbuf->plane_info + i;
-+        if (p->mm_addr != NULL)
-+            munmap(p->mm_addr, p->length);
-+    }
-+
-+    for (i = 0; i != FF_ARRAY_ELEMS(avbuf->drm_frame.objects); ++i) {
-+        if (avbuf->drm_frame.objects[i].fd != -1)
-+            close(avbuf->drm_frame.objects[i].fd);
-+    }
-+
-+    av_buffer_unref(&avbuf->ref_buf);
-+
-+    ff_weak_link_unref(&avbuf->context_wl);
-+
-+    av_free(avbuf);
- }
- 
--int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index)
-+
-+int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ctx, enum v4l2_memory mem)
- {
--    V4L2Context *ctx = avbuf->context;
-     int ret, i;
-+    V4L2Buffer * const avbuf = av_mallocz(sizeof(*avbuf));
-+    AVBufferRef * bufref;
-+
-+    *pbufref = NULL;
-+    if (avbuf == NULL)
-+        return AVERROR(ENOMEM);
-+
-+    bufref = av_buffer_create((uint8_t*)avbuf, sizeof(*avbuf), v4l2_buffer_buffer_free, NULL, 0);
-+    if (bufref == NULL) {
-+        av_free(avbuf);
-+        return AVERROR(ENOMEM);
-+    }
- 
--    avbuf->buf.memory = V4L2_MEMORY_MMAP;
-+    avbuf->context = ctx;
-+    avbuf->buf.memory = mem;
-     avbuf->buf.type = ctx->type;
-     avbuf->buf.index = index;
- 
-+    for (i = 0; i != FF_ARRAY_ELEMS(avbuf->drm_frame.objects); ++i) {
-+        avbuf->drm_frame.objects[i].fd = -1;
-+    }
-+
-+    avbuf->context_wl = ff_weak_link_ref(ctx->wl_master);
-+
-     if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
-         avbuf->buf.length = VIDEO_MAX_PLANES;
-         avbuf->buf.m.planes = avbuf->planes;
-@@ -507,7 +912,7 @@ int ff_v4l2_buffer_initialize(V4L2Buffer
- 
-     ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_QUERYBUF, &avbuf->buf);
-     if (ret < 0)
--        return AVERROR(errno);
-+        goto fail;
- 
-     if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
-         avbuf->num_planes = 0;
-@@ -520,6 +925,8 @@ int ff_v4l2_buffer_initialize(V4L2Buffer
-         avbuf->num_planes = 1;
- 
-     for (i = 0; i < avbuf->num_planes; i++) {
-+        const int want_mmap = avbuf->buf.memory == V4L2_MEMORY_MMAP &&
-+            (V4L2_TYPE_IS_OUTPUT(ctx->type) || !buf_to_m2mctx(avbuf)->output_drm);
- 
-         avbuf->plane_info[i].bytesperline = V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ?
-             ctx->format.fmt.pix_mp.plane_fmt[i].bytesperline :
-@@ -527,25 +934,29 @@ int ff_v4l2_buffer_initialize(V4L2Buffer
- 
-         if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
-             avbuf->plane_info[i].length = avbuf->buf.m.planes[i].length;
--            avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length,
--                                           PROT_READ | PROT_WRITE, MAP_SHARED,
--                                           buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset);
-+
-+            if (want_mmap)
-+                avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length,
-+                                               PROT_READ | PROT_WRITE, MAP_SHARED,
-+                                               buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset);
-         } else {
-             avbuf->plane_info[i].length = avbuf->buf.length;
--            avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length,
--                                          PROT_READ | PROT_WRITE, MAP_SHARED,
--                                          buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.offset);
-+
-+            if (want_mmap)
-+                avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length,
-+                                               PROT_READ | PROT_WRITE, MAP_SHARED,
-+                                               buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.offset);
-         }
- 
--        if (avbuf->plane_info[i].mm_addr == MAP_FAILED)
--            return AVERROR(ENOMEM);
-+        if (avbuf->plane_info[i].mm_addr == MAP_FAILED) {
-+            avbuf->plane_info[i].mm_addr = NULL;
-+            ret = AVERROR(ENOMEM);
-+            goto fail;
-+        }
-     }
- 
-     avbuf->status = V4L2BUF_AVAILABLE;
- 
--    if (V4L2_TYPE_IS_OUTPUT(ctx->type))
--        return 0;
--
-     if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
-         avbuf->buf.m.planes = avbuf->planes;
-         avbuf->buf.length   = avbuf->num_planes;
-@@ -555,20 +966,51 @@ int ff_v4l2_buffer_initialize(V4L2Buffer
-         avbuf->buf.length    = avbuf->planes[0].length;
-     }
- 
--    return ff_v4l2_buffer_enqueue(avbuf);
-+    if (!V4L2_TYPE_IS_OUTPUT(ctx->type)) {
-+        if (buf_to_m2mctx(avbuf)->output_drm) {
-+            ret = v4l2_buffer_export_drm(avbuf);
-+            if (ret)
-+                    goto fail;
-+        }
-+    }
-+
-+    *pbufref = bufref;
-+    return 0;
-+
-+fail:
-+    av_buffer_unref(&bufref);
-+    return ret;
- }
- 
- int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf)
- {
-     int ret;
-+    int qc;
- 
--    avbuf->buf.flags = avbuf->flags;
-+    if (avbuf->buf.timestamp.tv_sec || avbuf->buf.timestamp.tv_usec) {
-+        av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s pre VIDIOC_QBUF: index %d, ts=%ld.%06ld count=%d\n",
-+               avbuf->context->name, avbuf->buf.index,
-+               avbuf->buf.timestamp.tv_sec, avbuf->buf.timestamp.tv_usec,
-+               avbuf->context->q_count);
-+    }
- 
-     ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_QBUF, &avbuf->buf);
--    if (ret < 0)
--        return AVERROR(errno);
-+    if (ret < 0) {
-+        int err = errno;
-+        av_log(logger(avbuf), AV_LOG_ERROR, "--- %s VIDIOC_QBUF: index %d FAIL err %d (%s)\n",
-+               avbuf->context->name, avbuf->buf.index,
-+               err, strerror(err));
-+        return AVERROR(err);
-+    }
- 
-+    // Lock not wanted - if called from buffer free then lock already obtained
-+    qc = atomic_fetch_add(&avbuf->context->q_count, 1) + 1;
-     avbuf->status = V4L2BUF_IN_DRIVER;
-+    pthread_cond_broadcast(&avbuf->context->cond);
-+
-+    av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s VIDIOC_QBUF: index %d, ts=%ld.%06ld count=%d\n",
-+           avbuf->context->name, avbuf->buf.index,
-+           avbuf->buf.timestamp.tv_sec, avbuf->buf.timestamp.tv_usec, qc);
- 
-     return 0;
- }
---- a/libavcodec/v4l2_buffers.h
-+++ b/libavcodec/v4l2_buffers.h
-@@ -27,25 +27,38 @@
- #include <stdatomic.h>
- #include <linux/videodev2.h>
- 
-+#include "libavutil/hwcontext_drm.h"
- #include "avcodec.h"
- 
- enum V4L2Buffer_status {
-     V4L2BUF_AVAILABLE,
-     V4L2BUF_IN_DRIVER,
-+    V4L2BUF_IN_USE,
-     V4L2BUF_RET_USER,
- };
- 
- /**
-  * V4L2Buffer (wrapper for v4l2_buffer management)
-  */
-+struct V4L2Context;
-+struct ff_weak_link_client;
-+
- typedef struct V4L2Buffer {
--    /* each buffer needs to have a reference to its context */
-+    /* each buffer needs to have a reference to its context
-+     * The pointer is good enough for most operation but once the buffer has
-+     * been passed to the user the buffer may become orphaned so for free ops
-+     * the weak link must be used to ensure that the context is actually
-+     * there
-+     */
-     struct V4L2Context *context;
-+    struct ff_weak_link_client *context_wl;
- 
--    /* This object is refcounted per-plane, so we need to keep track
--     * of how many context-refs we are holding. */
--    AVBufferRef *context_ref;
--    atomic_uint context_refcount;
-+    /* DRM descriptor */
-+    AVDRMFrameDescriptor drm_frame;
-+    /* For DRM_PRIME encode - need to keep a ref to the source buffer till we
-+     * are done
-+     */
-+    AVBufferRef * ref_buf;
- 
-     /* keep track of the mmap address and mmap length */
-     struct V4L2Plane_info {
-@@ -60,7 +73,6 @@ typedef struct V4L2Buffer {
-     struct v4l2_buffer buf;
-     struct v4l2_plane planes[VIDEO_MAX_PLANES];
- 
--    int flags;
-     enum V4L2Buffer_status status;
- 
- } V4L2Buffer;
-@@ -98,6 +110,10 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket
-  */
- int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out);
- 
-+int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket * const pkt, V4L2Buffer * const out,
-+                                    const void *extdata, size_t extlen,
-+                                    const int64_t timestamp);
-+
- /**
-  * Extracts the data from an AVFrame to a V4L2Buffer
-  *
-@@ -106,7 +122,7 @@ int ff_v4l2_buffer_avpkt_to_buf(const AV
-  *
-  * @returns 0 in case of success, a negative AVERROR code otherwise
-  */
--int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out);
-+int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out, const int64_t track_ts);
- 
- /**
-  * Initializes a V4L2Buffer
-@@ -116,7 +132,7 @@ int ff_v4l2_buffer_avframe_to_buf(const
-  *
-  * @returns 0 in case of success, a negative AVERROR code otherwise
-  */
--int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index);
-+int ff_v4l2_buffer_initialize(AVBufferRef **avbuf, int index, struct V4L2Context *ctx, enum v4l2_memory mem);
- 
- /**
-  * Enqueues a V4L2Buffer
-@@ -127,5 +143,12 @@ int ff_v4l2_buffer_initialize(V4L2Buffer
-  */
- int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf);
- 
-+static inline void
-+ff_v4l2_buffer_set_avail(V4L2Buffer* const avbuf)
-+{
-+    avbuf->status = V4L2BUF_AVAILABLE;
-+    av_buffer_unref(&avbuf->ref_buf);
-+}
-+
- 
- #endif // AVCODEC_V4L2_BUFFERS_H
---- a/libavcodec/v4l2_context.c
-+++ b/libavcodec/v4l2_context.c
-@@ -27,11 +27,13 @@
- #include <unistd.h>
- #include <fcntl.h>
- #include <poll.h>
-+#include "libavutil/avassert.h"
- #include "libavcodec/avcodec.h"
- #include "libavcodec/internal.h"
- #include "v4l2_buffers.h"
- #include "v4l2_fmt.h"
- #include "v4l2_m2m.h"
-+#include "weak_link.h"
- 
- struct v4l2_format_update {
-     uint32_t v4l2_fmt;
-@@ -41,26 +43,168 @@ struct v4l2_format_update {
-     int update_avfmt;
- };
- 
--static inline V4L2m2mContext *ctx_to_m2mctx(V4L2Context *ctx)
-+
-+static inline int64_t track_to_pts(AVCodecContext *avctx, unsigned int n)
- {
--    return V4L2_TYPE_IS_OUTPUT(ctx->type) ?
--        container_of(ctx, V4L2m2mContext, output) :
--        container_of(ctx, V4L2m2mContext, capture);
-+    return (int64_t)n;
- }
- 
--static inline AVCodecContext *logger(V4L2Context *ctx)
-+static inline unsigned int pts_to_track(AVCodecContext *avctx, const int64_t pts)
- {
--    return ctx_to_m2mctx(ctx)->avctx;
-+    return (unsigned int)pts;
- }
- 
--static inline unsigned int v4l2_get_width(struct v4l2_format *fmt)
-+// FFmpeg requires us to propagate a number of vars from the coded pkt into
-+// the decoded frame. The only thing that tracks like that in V4L2 stateful
-+// is timestamp. PTS maps to timestamp for this decode. FFmpeg makes no
-+// guarantees about PTS being unique or specified for every frame so replace
-+// the supplied PTS with a simple incrementing number and keep a circular
-+// buffer of all the things we want preserved (including the original PTS)
-+// indexed by the tracking no.
-+static int64_t
-+xlat_pts_pkt_in(AVCodecContext *const avctx, xlat_track_t *const x, const AVPacket *const avpkt)
- {
--    return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width;
-+    int64_t track_pts;
-+
-+    // Avoid 0
-+    if (++x->track_no == 0)
-+        x->track_no = 1;
-+
-+    track_pts = track_to_pts(avctx, x->track_no);
-+
-+    av_log(avctx, AV_LOG_TRACE, "In pkt PTS=%" PRId64 ", DTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", avpkt->pts, avpkt->dts, track_pts, x->track_no);
-+    x->track_els[x->track_no  % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){
-+        .discard          = 0,
-+        .pending          = 1,
-+        .pkt_size         = avpkt->size,
-+        .pts              = avpkt->pts,
-+        .dts              = avpkt->dts,
-+        .reordered_opaque = avctx->reordered_opaque,
-+        .pkt_pos          = avpkt->pos,
-+        .pkt_duration     = avpkt->duration,
-+        .track_pts        = track_pts
-+    };
-+    return track_pts;
- }
- 
--static inline unsigned int v4l2_get_height(struct v4l2_format *fmt)
-+static int64_t
-+xlat_pts_frame_in(AVCodecContext *const avctx, xlat_track_t *const x, const AVFrame *const frame)
- {
--    return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.height : fmt->fmt.pix.height;
-+    int64_t track_pts;
-+
-+    // Avoid 0
-+    if (++x->track_no == 0)
-+        x->track_no = 1;
-+
-+    track_pts = track_to_pts(avctx, x->track_no);
-+
-+    av_log(avctx, AV_LOG_TRACE, "In frame PTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", frame->pts, track_pts, x->track_no);
-+    x->track_els[x->track_no  % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){
-+        .discard          = 0,
-+        .pending          = 1,
-+        .pkt_size         = 0,
-+        .pts              = frame->pts,
-+        .dts              = AV_NOPTS_VALUE,
-+        .reordered_opaque = frame->reordered_opaque,
-+        .pkt_pos          = frame->pkt_pos,
-+        .pkt_duration     = frame->pkt_duration,
-+        .track_pts        = track_pts
-+    };
-+    return track_pts;
-+}
-+
-+
-+// Returns -1 if we should discard the frame
-+static int
-+xlat_pts_frame_out(AVCodecContext *const avctx,
-+             xlat_track_t * const x,
-+             AVFrame *const frame)
-+{
-+    unsigned int n = pts_to_track(avctx, frame->pts) % FF_V4L2_M2M_TRACK_SIZE;
-+    V4L2m2mTrackEl *const t = x->track_els + n;
-+    if (frame->pts == AV_NOPTS_VALUE || frame->pts != t->track_pts)
-+    {
-+        av_log(avctx, frame->pts == AV_NOPTS_VALUE ? AV_LOG_DEBUG : AV_LOG_WARNING,
-+               "Frame tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts);
-+        frame->pts              = AV_NOPTS_VALUE;
-+        frame->pkt_dts          = AV_NOPTS_VALUE;
-+        frame->reordered_opaque = x->last_opaque;
-+        frame->pkt_pos          = -1;
-+        frame->pkt_duration     = 0;
-+        frame->pkt_size         = -1;
-+    }
-+    else if (!t->discard)
-+    {
-+        frame->pts              = t->pending ? t->pts : AV_NOPTS_VALUE;
-+        frame->pkt_dts          = t->dts;
-+        frame->reordered_opaque = t->reordered_opaque;
-+        frame->pkt_pos          = t->pkt_pos;
-+        frame->pkt_duration     = t->pkt_duration;
-+        frame->pkt_size         = t->pkt_size;
-+
-+        x->last_opaque = x->track_els[n].reordered_opaque;
-+        if (frame->pts != AV_NOPTS_VALUE)
-+            x->last_pts = frame->pts;
-+        t->pending = 0;
-+    }
-+    else
-+    {
-+        av_log(avctx, AV_LOG_DEBUG, "Discard frame (flushed): pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts);
-+        return -1;
-+    }
-+
-+    av_log(avctx, AV_LOG_TRACE, "Out frame PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 ", track=%"PRId64", n=%d\n",
-+           frame->pts, frame->best_effort_timestamp, frame->pkt_dts, t->track_pts, n);
-+    return 0;
-+}
-+
-+// Returns -1 if we should discard the frame
-+static int
-+xlat_pts_pkt_out(AVCodecContext *const avctx,
-+             xlat_track_t * const x,
-+             AVPacket *const pkt)
-+{
-+    unsigned int n = pts_to_track(avctx, pkt->pts) % FF_V4L2_M2M_TRACK_SIZE;
-+    V4L2m2mTrackEl *const t = x->track_els + n;
-+    if (pkt->pts == AV_NOPTS_VALUE || pkt->pts != t->track_pts)
-+    {
-+        av_log(avctx, pkt->pts == AV_NOPTS_VALUE ? AV_LOG_DEBUG : AV_LOG_WARNING,
-+               "Pkt tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", pkt->pts, n, t->track_pts);
-+        pkt->pts                = AV_NOPTS_VALUE;
-+    }
-+    else if (!t->discard)
-+    {
-+        pkt->pts                = t->pending ? t->pts : AV_NOPTS_VALUE;
-+
-+        x->last_opaque = x->track_els[n].reordered_opaque;
-+        if (pkt->pts != AV_NOPTS_VALUE)
-+            x->last_pts = pkt->pts;
-+        t->pending = 0;
-+    }
-+    else
-+    {
-+        av_log(avctx, AV_LOG_DEBUG, "Discard packet (flushed): pts=%" PRId64 ", track[%d]=%" PRId64 "\n", pkt->pts, n, t->track_pts);
-+        return -1;
-+    }
-+
-+    // * Would like something much better than this...xlat(offset + out_count)?
-+    pkt->dts = pkt->pts;
-+    av_log(avctx, AV_LOG_TRACE, "Out pkt PTS=%" PRId64 ", track=%"PRId64", n=%d\n",
-+           pkt->pts, t->track_pts, n);
-+    return 0;
-+}
-+
-+
-+static inline V4L2m2mContext *ctx_to_m2mctx(const V4L2Context *ctx)
-+{
-+    return V4L2_TYPE_IS_OUTPUT(ctx->type) ?
-+        container_of(ctx, V4L2m2mContext, output) :
-+        container_of(ctx, V4L2m2mContext, capture);
-+}
-+
-+static inline AVCodecContext *logger(const V4L2Context *ctx)
-+{
-+    return ctx_to_m2mctx(ctx)->avctx;
- }
- 
- static AVRational v4l2_get_sar(V4L2Context *ctx)
-@@ -81,21 +225,29 @@ static AVRational v4l2_get_sar(V4L2Conte
-     return sar;
- }
- 
--static inline unsigned int v4l2_resolution_changed(V4L2Context *ctx, struct v4l2_format *fmt2)
-+static inline int ctx_buffers_alloced(const V4L2Context * const ctx)
-+{
-+    return ctx->bufrefs != NULL;
-+}
-+
-+// Width/Height changed or we don't have an alloc in the first place?
-+static int ctx_resolution_changed(const V4L2Context *ctx, const struct v4l2_format *fmt2)
- {
--    struct v4l2_format *fmt1 = &ctx->format;
--    int ret =  V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ?
--        fmt1->fmt.pix_mp.width != fmt2->fmt.pix_mp.width ||
--        fmt1->fmt.pix_mp.height != fmt2->fmt.pix_mp.height
--        :
--        fmt1->fmt.pix.width != fmt2->fmt.pix.width ||
--        fmt1->fmt.pix.height != fmt2->fmt.pix.height;
-+    const struct v4l2_format *fmt1 = &ctx->format;
-+    int ret = !ctx_buffers_alloced(ctx) ||
-+        (V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ?
-+            fmt1->fmt.pix_mp.width != fmt2->fmt.pix_mp.width ||
-+            fmt1->fmt.pix_mp.height != fmt2->fmt.pix_mp.height
-+            :
-+            fmt1->fmt.pix.width != fmt2->fmt.pix.width ||
-+            fmt1->fmt.pix.height != fmt2->fmt.pix.height);
- 
-     if (ret)
--        av_log(logger(ctx), AV_LOG_DEBUG, "%s changed (%dx%d) -> (%dx%d)\n",
-+        av_log(logger(ctx), AV_LOG_DEBUG, "V4L2 %s changed: alloc=%d (%dx%d) -> (%dx%d)\n",
-             ctx->name,
--            v4l2_get_width(fmt1), v4l2_get_height(fmt1),
--            v4l2_get_width(fmt2), v4l2_get_height(fmt2));
-+            ctx_buffers_alloced(ctx),
-+            ff_v4l2_get_format_width(fmt1), ff_v4l2_get_format_height(fmt1),
-+            ff_v4l2_get_format_width(fmt2), ff_v4l2_get_format_height(fmt2));
- 
-     return ret;
- }
-@@ -153,90 +305,110 @@ static inline void v4l2_save_to_context(
-     }
- }
- 
--/**
-- * handle resolution change event and end of stream event
-- * returns 1 if reinit was successful, negative if it failed
-- * returns 0 if reinit was not executed
-- */
--static int v4l2_handle_event(V4L2Context *ctx)
-+static int get_default_selection(V4L2Context * const ctx, struct v4l2_rect *r)
- {
--    V4L2m2mContext *s = ctx_to_m2mctx(ctx);
--    struct v4l2_format cap_fmt = s->capture.format;
--    struct v4l2_format out_fmt = s->output.format;
--    struct v4l2_event evt = { 0 };
--    int full_reinit, reinit, ret;
-+    V4L2m2mContext * const s = ctx_to_m2mctx(ctx);
-+    struct v4l2_selection selection = {
-+        .type = V4L2_BUF_TYPE_VIDEO_CAPTURE,
-+        .target = V4L2_SEL_TGT_COMPOSE
-+    };
- 
--    ret = ioctl(s->fd, VIDIOC_DQEVENT, &evt);
--    if (ret < 0) {
--        av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_DQEVENT\n", ctx->name);
--        return 0;
--    }
-+    memset(r, 0, sizeof(*r));
-+    if (ioctl(s->fd, VIDIOC_G_SELECTION, &selection))
-+        return AVERROR(errno);
- 
--    if (evt.type == V4L2_EVENT_EOS) {
--        ctx->done = 1;
--        return 0;
--    }
-+    *r = selection.r;
-+    return 0;
-+}
- 
--    if (evt.type != V4L2_EVENT_SOURCE_CHANGE)
--        return 0;
-+static int do_source_change(V4L2m2mContext * const s)
-+{
-+    AVCodecContext *const avctx = s->avctx;
- 
--    ret = ioctl(s->fd, VIDIOC_G_FMT, &out_fmt);
--    if (ret) {
--        av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT\n", s->output.name);
--        return 0;
--    }
-+    int ret;
-+    int reinit;
-+    struct v4l2_format cap_fmt = s->capture.format;
-+
-+    s->capture.done = 0;
- 
-     ret = ioctl(s->fd, VIDIOC_G_FMT, &cap_fmt);
-     if (ret) {
--        av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT\n", s->capture.name);
-+        av_log(avctx, AV_LOG_ERROR, "%s VIDIOC_G_FMT failed\n", s->capture.name);
-         return 0;
-     }
- 
--    full_reinit = v4l2_resolution_changed(&s->output, &out_fmt);
--    if (full_reinit) {
--        s->output.height = v4l2_get_height(&out_fmt);
--        s->output.width = v4l2_get_width(&out_fmt);
--        s->output.sample_aspect_ratio = v4l2_get_sar(&s->output);
--    }
-+    get_default_selection(&s->capture, &s->capture.selection);
-+
-+    reinit = ctx_resolution_changed(&s->capture, &cap_fmt);
-+    if ((s->quirks & FF_V4L2_QUIRK_REINIT_ALWAYS) != 0)
-+        reinit = 1;
- 
--    reinit = v4l2_resolution_changed(&s->capture, &cap_fmt);
-+    s->capture.format = cap_fmt;
-     if (reinit) {
--        s->capture.height = v4l2_get_height(&cap_fmt);
--        s->capture.width = v4l2_get_width(&cap_fmt);
--        s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture);
-+        s->capture.height = ff_v4l2_get_format_height(&cap_fmt);
-+        s->capture.width = ff_v4l2_get_format_width(&cap_fmt);
-     }
- 
--    if (full_reinit || reinit)
--        s->reinit = 1;
--
--    if (full_reinit) {
--        ret = ff_v4l2_m2m_codec_full_reinit(s);
--        if (ret) {
--            av_log(logger(ctx), AV_LOG_ERROR, "v4l2_m2m_codec_full_reinit\n");
--            return AVERROR(EINVAL);
--        }
--        goto reinit_run;
-+    // If we don't support selection (or it is bust) and we obviously have HD then kludge
-+    if ((s->capture.selection.width == 0 || s->capture.selection.height == 0) &&
-+        (s->capture.height == 1088 && s->capture.width == 1920)) {
-+        s->capture.selection = (struct v4l2_rect){.width = 1920, .height = 1080};
-     }
- 
-+    s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture);
-+
-+    av_log(avctx, AV_LOG_DEBUG, "Source change: SAR: %d/%d, wxh %dx%d crop %dx%d @ %d,%d, reinit=%d\n",
-+           s->capture.sample_aspect_ratio.num, s->capture.sample_aspect_ratio.den,
-+           s->capture.width, s->capture.height,
-+           s->capture.selection.width, s->capture.selection.height,
-+           s->capture.selection.left, s->capture.selection.top, reinit);
-+
-     if (reinit) {
--        if (s->avctx)
--            ret = ff_set_dimensions(s->avctx, s->capture.width, s->capture.height);
-+        if (avctx)
-+            ret = ff_set_dimensions(s->avctx,
-+                                    s->capture.selection.width != 0 ? s->capture.selection.width : s->capture.width,
-+                                    s->capture.selection.height != 0 ? s->capture.selection.height : s->capture.height);
-         if (ret < 0)
--            av_log(logger(ctx), AV_LOG_WARNING, "update avcodec height and width\n");
-+            av_log(avctx, AV_LOG_WARNING, "update avcodec height and width failed\n");
- 
-         ret = ff_v4l2_m2m_codec_reinit(s);
-         if (ret) {
--            av_log(logger(ctx), AV_LOG_ERROR, "v4l2_m2m_codec_reinit\n");
-+            av_log(avctx, AV_LOG_ERROR, "v4l2_m2m_codec_reinit failed\n");
-             return AVERROR(EINVAL);
-         }
-+
-+        if (s->capture.width > ff_v4l2_get_format_width(&s->capture.format) ||
-+            s->capture.height > ff_v4l2_get_format_height(&s->capture.format)) {
-+            av_log(avctx, AV_LOG_ERROR, "Format post reinit too small: wanted %dx%d > got %dx%d\n",
-+                   s->capture.width, s->capture.height,
-+                   ff_v4l2_get_format_width(&s->capture.format), ff_v4l2_get_format_height(&s->capture.format));
-+            return AVERROR(EINVAL);
-+        }
-+
-+        // Update pixel format - should only actually do something on initial change
-+        s->capture.av_pix_fmt =
-+            ff_v4l2_format_v4l2_to_avfmt(ff_v4l2_get_format_pixelformat(&s->capture.format), AV_CODEC_ID_RAWVIDEO);
-+        if (s->output_drm) {
-+            avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME;
-+            avctx->sw_pix_fmt = s->capture.av_pix_fmt;
-+        }
-+        else
-+            avctx->pix_fmt = s->capture.av_pix_fmt;
-+
-         goto reinit_run;
-     }
- 
--    /* dummy event received */
--    return 0;
-+    /* Buffers are OK so just stream off to ack */
-+    av_log(avctx, AV_LOG_DEBUG, "%s: Parameters only - restart decode\n", __func__);
-+
-+    ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF);
-+    if (ret)
-+        av_log(avctx, AV_LOG_ERROR, "capture VIDIOC_STREAMOFF failed\n");
-+    s->draining = 0;
- 
-     /* reinit executed */
- reinit_run:
-+    ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMON);
-     return 1;
- }
- 
-@@ -280,171 +452,277 @@ static int v4l2_stop_encode(V4L2Context
-     return 0;
- }
- 
--static V4L2Buffer* v4l2_dequeue_v4l2buf(V4L2Context *ctx, int timeout)
--{
--    struct v4l2_plane planes[VIDEO_MAX_PLANES];
--    struct v4l2_buffer buf = { 0 };
--    V4L2Buffer *avbuf;
--    struct pollfd pfd = {
--        .events =  POLLIN | POLLRDNORM | POLLPRI | POLLOUT | POLLWRNORM, /* default blocking capture */
--        .fd = ctx_to_m2mctx(ctx)->fd,
-+// DQ a buffer
-+// Amalgamates all the various ways there are of signalling EOS/Event to
-+// generate a consistant EPIPE.
-+//
-+// Sets ctx->flag_last if next dq would produce EPIPE (i.e. stream has stopped)
-+//
-+// Returns:
-+//  0               Success
-+//  AVERROR(EPIPE)  Nothing more to read
-+//  AVERROR(ENOSPC) No buffers in Q to put result in
-+//  *               AVERROR(..)
-+
-+ static int
-+dq_buf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf)
-+{
-+    V4L2m2mContext * const m = ctx_to_m2mctx(ctx);
-+    AVCodecContext * const avctx = m->avctx;
-+    V4L2Buffer * avbuf;
-+    const int is_mp = V4L2_TYPE_IS_MULTIPLANAR(ctx->type);
-+
-+    struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}};
-+
-+    struct v4l2_buffer buf = {
-+        .type = ctx->type,
-+        .memory = V4L2_MEMORY_MMAP,
-     };
--    int i, ret;
- 
--    if (!V4L2_TYPE_IS_OUTPUT(ctx->type) && ctx->buffers) {
--        for (i = 0; i < ctx->num_buffers; i++) {
--            if (ctx->buffers[i].status == V4L2BUF_IN_DRIVER)
--                break;
--        }
--        if (i == ctx->num_buffers)
--            av_log(logger(ctx), AV_LOG_WARNING, "All capture buffers returned to "
--                                                "userspace. Increase num_capture_buffers "
--                                                "to prevent device deadlock or dropped "
--                                                "packets/frames.\n");
--    }
--
--    /* if we are draining and there are no more capture buffers queued in the driver we are done */
--    if (!V4L2_TYPE_IS_OUTPUT(ctx->type) && ctx_to_m2mctx(ctx)->draining) {
--        for (i = 0; i < ctx->num_buffers; i++) {
--            /* capture buffer initialization happens during decode hence
--             * detection happens at runtime
--             */
--            if (!ctx->buffers)
--                break;
--
--            if (ctx->buffers[i].status == V4L2BUF_IN_DRIVER)
--                goto start;
--        }
--        ctx->done = 1;
--        return NULL;
--    }
--
--start:
--    if (V4L2_TYPE_IS_OUTPUT(ctx->type))
--        pfd.events =  POLLOUT | POLLWRNORM;
--    else {
--        /* no need to listen to requests for more input while draining */
--        if (ctx_to_m2mctx(ctx)->draining)
--            pfd.events =  POLLIN | POLLRDNORM | POLLPRI;
-+    *ppavbuf = NULL;
-+
-+    if (ctx->flag_last)
-+        return AVERROR(EPIPE);
-+
-+    if (is_mp) {
-+        buf.length = VIDEO_MAX_PLANES;
-+        buf.m.planes = planes;
-     }
- 
--    for (;;) {
--        ret = poll(&pfd, 1, timeout);
--        if (ret > 0)
--            break;
--        if (errno == EINTR)
--            continue;
--        return NULL;
-+    while (ioctl(m->fd, VIDIOC_DQBUF, &buf) != 0) {
-+        const int err = errno;
-+        av_assert0(AVERROR(err) < 0);
-+        if (err != EINTR) {
-+            av_log(avctx, AV_LOG_DEBUG, "%s VIDIOC_DQBUF, errno (%s)\n",
-+                ctx->name, av_err2str(AVERROR(err)));
-+
-+            if (err == EPIPE)
-+                ctx->flag_last = 1;
-+
-+            return AVERROR(err);
-+        }
-     }
-+    atomic_fetch_sub(&ctx->q_count, 1);
- 
--    /* 0. handle errors */
--    if (pfd.revents & POLLERR) {
--        /* if we are trying to get free buffers but none have been queued yet
--           no need to raise a warning */
--        if (timeout == 0) {
--            for (i = 0; i < ctx->num_buffers; i++) {
--                if (ctx->buffers[i].status != V4L2BUF_AVAILABLE)
--                    av_log(logger(ctx), AV_LOG_WARNING, "%s POLLERR\n", ctx->name);
--            }
-+    avbuf = (V4L2Buffer *)ctx->bufrefs[buf.index]->data;
-+    ff_v4l2_buffer_set_avail(avbuf);
-+    avbuf->buf = buf;
-+    if (is_mp) {
-+        memcpy(avbuf->planes, planes, sizeof(planes));
-+        avbuf->buf.m.planes = avbuf->planes;
-+    }
-+    // Done with any attached buffer
-+    av_buffer_unref(&avbuf->ref_buf);
-+
-+    if (V4L2_TYPE_IS_CAPTURE(ctx->type)) {
-+        // Zero length cap buffer return == EOS
-+        if ((is_mp ? buf.m.planes[0].bytesused : buf.bytesused) == 0) {
-+            av_log(avctx, AV_LOG_DEBUG, "Buffer empty - reQ\n");
-+
-+            // Must reQ so we don't leak
-+            // May not matter if the next thing we do is release all the
-+            // buffers but better to be tidy.
-+            ff_v4l2_buffer_enqueue(avbuf);
-+
-+            ctx->flag_last = 1;
-+            return AVERROR(EPIPE);
-         }
--        else
--            av_log(logger(ctx), AV_LOG_WARNING, "%s POLLERR\n", ctx->name);
- 
--        return NULL;
-+#ifdef V4L2_BUF_FLAG_LAST
-+        // If flag_last set then this contains data but is the last frame
-+        // so remember that but return OK
-+        if ((buf.flags & V4L2_BUF_FLAG_LAST) != 0)
-+            ctx->flag_last = 1;
-+#endif
-     }
- 
--    /* 1. handle resolution changes */
--    if (pfd.revents & POLLPRI) {
--        ret = v4l2_handle_event(ctx);
--        if (ret < 0) {
--            /* if re-init failed, abort */
--            ctx->done = 1;
--            return NULL;
--        }
--        if (ret) {
--            /* if re-init was successful drop the buffer (if there was one)
--             * since we had to reconfigure capture (unmap all buffers)
--             */
--            return NULL;
-+    *ppavbuf = avbuf;
-+    return 0;
-+}
-+
-+/**
-+ * handle resolution change event and end of stream event
-+ * Expects to be called after the stream has stopped
-+ *
-+ * returns 1 if reinit was successful, negative if it failed
-+ * returns 0 if reinit was not executed
-+ */
-+static int
-+get_event(V4L2m2mContext * const m)
-+{
-+    AVCodecContext * const avctx = m->avctx;
-+    struct v4l2_event evt = { 0 };
-+
-+    while (ioctl(m->fd, VIDIOC_DQEVENT, &evt) != 0) {
-+        const int rv = AVERROR(errno);
-+        if (rv == AVERROR(EINTR))
-+            continue;
-+        if (rv == AVERROR(EAGAIN)) {
-+            av_log(avctx, AV_LOG_WARNING, "V4L2 failed to get expected event - assume EOS\n");
-+            return AVERROR_EOF;
-         }
-+        av_log(avctx, AV_LOG_ERROR, "V4L2 VIDIOC_DQEVENT: %s\n", av_err2str(rv));
-+        return rv;
-     }
- 
--    /* 2. dequeue the buffer */
--    if (pfd.revents & (POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM)) {
-+    av_log(avctx, AV_LOG_DEBUG, "Dq event %d\n", evt.type);
- 
--        if (!V4L2_TYPE_IS_OUTPUT(ctx->type)) {
--            /* there is a capture buffer ready */
--            if (pfd.revents & (POLLIN | POLLRDNORM))
--                goto dequeue;
-+    if (evt.type == V4L2_EVENT_EOS) {
-+        av_log(avctx, AV_LOG_TRACE, "V4L2 VIDIOC_EVENT_EOS\n");
-+        return AVERROR_EOF;
-+    }
-+
-+    if (evt.type == V4L2_EVENT_SOURCE_CHANGE)
-+        return do_source_change(m);
-+
-+    return 0;
-+}
-+
-+
-+// Get a buffer
-+// If output then just gets the buffer in the expected way
-+// If capture then runs the capture state m/c to deal with res change etc.
-+// If return value == 0 then *ppavbuf != NULL
-+
-+static int
-+get_qbuf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf, const int timeout)
-+{
-+    V4L2m2mContext * const m = ctx_to_m2mctx(ctx);
-+    AVCodecContext * const avctx = m->avctx;
-+    const int is_cap = V4L2_TYPE_IS_CAPTURE(ctx->type);
-+
-+    const unsigned int poll_cap = (POLLIN | POLLRDNORM);
-+    const unsigned int poll_out = (POLLOUT | POLLWRNORM);
-+    const unsigned int poll_event = POLLPRI;
-+
-+    *ppavbuf = NULL;
- 
--            /* the driver is ready to accept more input; instead of waiting for the capture
--             * buffer to complete we return NULL so input can proceed (we are single threaded)
--             */
--            if (pfd.revents & (POLLOUT | POLLWRNORM))
--                return NULL;
-+    for (;;) {
-+        struct pollfd pfd = {
-+            .fd = m->fd,
-+            // If capture && stream not started then assume we are waiting for the initial event
-+            .events = !is_cap ? poll_out :
-+                !ff_v4l2_ctx_eos(ctx) && ctx->streamon ? poll_cap :
-+                    poll_event,
-+        };
-+        int ret;
-+
-+        if (ctx->done) {
-+            av_log(avctx, AV_LOG_TRACE, "V4L2 %s already done\n", ctx->name);
-+            return AVERROR_EOF;
-         }
- 
--dequeue:
--        memset(&buf, 0, sizeof(buf));
--        buf.memory = V4L2_MEMORY_MMAP;
--        buf.type = ctx->type;
--        if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
--            memset(planes, 0, sizeof(planes));
--            buf.length = VIDEO_MAX_PLANES;
--            buf.m.planes = planes;
-+        // If capture && timeout == -1 then also wait for rx buffer free
-+        if (is_cap && timeout == -1 && m->output.streamon && !m->draining)
-+            pfd.events |= poll_out;
-+
-+        // If nothing Qed all we will get is POLLERR - avoid that
-+        if ((pfd.events == poll_out && atomic_load(&m->output.q_count) == 0) ||
-+            (pfd.events == poll_cap && atomic_load(&m->capture.q_count) == 0) ||
-+            (pfd.events == (poll_cap | poll_out) && atomic_load(&m->capture.q_count) == 0 && atomic_load(&m->output.q_count) == 0)) {
-+            av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s empty\n", ctx->name);
-+            return AVERROR(ENOSPC);
-         }
- 
--        ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_DQBUF, &buf);
--        if (ret) {
--            if (errno != EAGAIN) {
--                ctx->done = 1;
--                if (errno != EPIPE)
--                    av_log(logger(ctx), AV_LOG_DEBUG, "%s VIDIOC_DQBUF, errno (%s)\n",
--                        ctx->name, av_err2str(AVERROR(errno)));
-+        // Timeout kludged s.t. "forever" eventually gives up & produces logging
-+        // If waiting for an event when we have seen a last_frame then we expect
-+        //   it to be ready already so force a short timeout
-+        ret = poll(&pfd, 1,
-+                   ff_v4l2_ctx_eos(ctx) ? 10 :
-+                   timeout == -1 ? 3000 : timeout);
-+        if (ret < 0) {
-+            ret = AVERROR(errno);  // Remember errno before logging etc.
-+            av_assert0(ret < 0);
-+        }
-+
-+        av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s ret=%d, timeout=%d, events=%#x, revents=%#x\n",
-+               ctx->name, ret, timeout, pfd.events, pfd.revents);
-+
-+        if (ret < 0) {
-+            if (ret == AVERROR(EINTR))
-+                continue;
-+            av_log(avctx, AV_LOG_ERROR, "V4L2 %s poll error %d (%s)\n", ctx->name, AVUNERROR(ret), av_err2str(ret));
-+            return ret;
-+        }
-+
-+        if (ret == 0) {
-+            if (timeout == -1)
-+                av_log(avctx, AV_LOG_ERROR, "V4L2 %s poll unexpected timeout: events=%#x\n", ctx->name, pfd.events);
-+            if (ff_v4l2_ctx_eos(ctx)) {
-+                av_log(avctx, AV_LOG_WARNING, "V4L2 %s poll event timeout\n", ctx->name);
-+                ret = get_event(m);
-+                if (ret < 0) {
-+                    ctx->done = 1;
-+                    return ret;
-+                }
-             }
--            return NULL;
-+            return AVERROR(EAGAIN);
-         }
- 
--        if (ctx_to_m2mctx(ctx)->draining && !V4L2_TYPE_IS_OUTPUT(ctx->type)) {
--            int bytesused = V4L2_TYPE_IS_MULTIPLANAR(buf.type) ?
--                            buf.m.planes[0].bytesused : buf.bytesused;
--            if (bytesused == 0) {
-+        if ((pfd.revents & POLLERR) != 0) {
-+            av_log(avctx, AV_LOG_WARNING, "V4L2 %s POLLERR\n", ctx->name);
-+            return AVERROR_UNKNOWN;
-+        }
-+
-+        if ((pfd.revents & poll_event) != 0) {
-+            ret = get_event(m);
-+            if (ret < 0) {
-                 ctx->done = 1;
--                return NULL;
-+                return ret;
-             }
--#ifdef V4L2_BUF_FLAG_LAST
--            if (buf.flags & V4L2_BUF_FLAG_LAST)
--                ctx->done = 1;
--#endif
-+            continue;
-+        }
-+
-+        if ((pfd.revents & poll_cap) != 0) {
-+            ret = dq_buf(ctx, ppavbuf);
-+            if (ret == AVERROR(EPIPE))
-+                continue;
-+            return ret;
-         }
- 
--        avbuf = &ctx->buffers[buf.index];
--        avbuf->status = V4L2BUF_AVAILABLE;
--        avbuf->buf = buf;
--        if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
--            memcpy(avbuf->planes, planes, sizeof(planes));
--            avbuf->buf.m.planes = avbuf->planes;
-+        if ((pfd.revents & poll_out) != 0) {
-+            if (is_cap)
-+                return AVERROR(EAGAIN);
-+            return dq_buf(ctx, ppavbuf);
-         }
--        return avbuf;
-+
-+        av_log(avctx, AV_LOG_ERROR, "V4L2 poll unexpected events=%#x, revents=%#x\n", pfd.events, pfd.revents);
-+        return AVERROR_UNKNOWN;
-     }
-+}
- 
--    return NULL;
-+// Clear out flags and timestamps that should should be set by the user
-+// Returns the passed avbuf
-+static V4L2Buffer *
-+clean_v4l2_buffer(V4L2Buffer * const avbuf)
-+{
-+    struct v4l2_buffer *const buf = &avbuf->buf;
-+
-+    buf->flags = 0;
-+    buf->field = V4L2_FIELD_ANY;
-+    buf->timestamp = (struct timeval){0};
-+    buf->timecode = (struct v4l2_timecode){0};
-+    buf->sequence = 0;
-+
-+    return avbuf;
- }
- 
- static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx)
- {
--    int timeout = 0; /* return when no more buffers to dequeue */
-     int i;
- 
-     /* get back as many output buffers as possible */
-     if (V4L2_TYPE_IS_OUTPUT(ctx->type)) {
--          do {
--          } while (v4l2_dequeue_v4l2buf(ctx, timeout));
-+        V4L2Buffer * avbuf;
-+        do {
-+            get_qbuf(ctx, &avbuf, 0);
-+        } while (avbuf);
-     }
- 
-     for (i = 0; i < ctx->num_buffers; i++) {
--        if (ctx->buffers[i].status == V4L2BUF_AVAILABLE)
--            return &ctx->buffers[i];
-+        V4L2Buffer * const avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data;
-+        if (avbuf->status == V4L2BUF_AVAILABLE)
-+            return clean_v4l2_buffer(avbuf);
-     }
- 
-     return NULL;
-@@ -452,25 +730,45 @@ static V4L2Buffer* v4l2_getfree_v4l2buf(
- 
- static int v4l2_release_buffers(V4L2Context* ctx)
- {
--    struct v4l2_requestbuffers req = {
--        .memory = V4L2_MEMORY_MMAP,
--        .type = ctx->type,
--        .count = 0, /* 0 -> unmaps buffers from the driver */
--    };
--    int i, j;
-+    int i;
-+    int ret = 0;
-+    const int fd = ctx_to_m2mctx(ctx)->fd;
- 
--    for (i = 0; i < ctx->num_buffers; i++) {
--        V4L2Buffer *buffer = &ctx->buffers[i];
-+    // Orphan any buffers in the wild
-+    ff_weak_link_break(&ctx->wl_master);
- 
--        for (j = 0; j < buffer->num_planes; j++) {
--            struct V4L2Plane_info *p = &buffer->plane_info[j];
--            if (p->mm_addr && p->length)
--                if (munmap(p->mm_addr, p->length) < 0)
--                    av_log(logger(ctx), AV_LOG_ERROR, "%s unmap plane (%s))\n", ctx->name, av_err2str(AVERROR(errno)));
-+    if (ctx->bufrefs) {
-+        for (i = 0; i < ctx->num_buffers; i++)
-+            av_buffer_unref(ctx->bufrefs + i);
-+    }
-+
-+    if (fd != -1) {
-+        struct v4l2_requestbuffers req = {
-+            .memory = V4L2_MEMORY_MMAP,
-+            .type = ctx->type,
-+            .count = 0, /* 0 -> unmap all buffers from the driver */
-+        };
-+
-+        while ((ret = ioctl(fd, VIDIOC_REQBUFS, &req)) == -1) {
-+            if (errno == EINTR)
-+                continue;
-+
-+            ret = AVERROR(errno);
-+
-+            av_log(logger(ctx), AV_LOG_ERROR, "release all %s buffers (%s)\n",
-+                ctx->name, av_err2str(AVERROR(errno)));
-+
-+            if (ctx_to_m2mctx(ctx)->output_drm)
-+                av_log(logger(ctx), AV_LOG_ERROR,
-+                    "Make sure the DRM client releases all FB/GEM objects before closing the codec (ie):\n"
-+                    "for all buffers: \n"
-+                    "  1. drmModeRmFB(..)\n"
-+                    "  2. drmIoctl(.., DRM_IOCTL_GEM_CLOSE,... )\n");
-         }
-     }
-+    atomic_store(&ctx->q_count, 0);
- 
--    return ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_REQBUFS, &req);
-+    return ret;
- }
- 
- static inline int v4l2_try_raw_format(V4L2Context* ctx, enum AVPixelFormat pixfmt)
-@@ -499,6 +797,8 @@ static inline int v4l2_try_raw_format(V4
- 
- static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p)
- {
-+    V4L2m2mContext* s = ctx_to_m2mctx(ctx);
-+    V4L2m2mPriv *priv = s->avctx->priv_data;
-     enum AVPixelFormat pixfmt = ctx->av_pix_fmt;
-     struct v4l2_fmtdesc fdesc;
-     int ret;
-@@ -517,6 +817,13 @@ static int v4l2_get_raw_format(V4L2Conte
-         if (ret)
-             return AVERROR(EINVAL);
- 
-+        if (priv->pix_fmt != AV_PIX_FMT_NONE) {
-+            if (fdesc.pixelformat != ff_v4l2_format_avfmt_to_v4l2(priv->pix_fmt)) {
-+                fdesc.index++;
-+                continue;
-+            }
-+        }
-+
-         pixfmt = ff_v4l2_format_v4l2_to_avfmt(fdesc.pixelformat, AV_CODEC_ID_RAWVIDEO);
-         ret = v4l2_try_raw_format(ctx, pixfmt);
-         if (ret){
-@@ -569,30 +876,99 @@ static int v4l2_get_coded_format(V4L2Con
-   *
-   *****************************************************************************/
- 
-+
-+static void flush_all_buffers_status(V4L2Context* const ctx)
-+{
-+    int i;
-+
-+    if (!ctx->bufrefs)
-+        return;
-+
-+    for (i = 0; i < ctx->num_buffers; ++i) {
-+        struct V4L2Buffer * const buf = (struct V4L2Buffer *)ctx->bufrefs[i]->data;
-+        if (buf->status == V4L2BUF_IN_DRIVER)
-+            ff_v4l2_buffer_set_avail(buf);
-+    }
-+    atomic_store(&ctx->q_count, 0);
-+}
-+
-+static int stuff_all_buffers(AVCodecContext * avctx, V4L2Context* ctx)
-+{
-+    int i;
-+    int rv;
-+
-+    if (!ctx->bufrefs) {
-+        rv = ff_v4l2_context_init(ctx);
-+        if (rv) {
-+            av_log(avctx, AV_LOG_ERROR, "can't request capture buffers\n");
-+            return rv;
-+        }
-+    }
-+
-+    for (i = 0; i < ctx->num_buffers; ++i) {
-+        struct V4L2Buffer * const buf = (struct V4L2Buffer *)ctx->bufrefs[i]->data;
-+        if (buf->status == V4L2BUF_AVAILABLE) {
-+            rv = ff_v4l2_buffer_enqueue(buf);
-+            if (rv < 0)
-+                return rv;
-+        }
-+    }
-+    return 0;
-+}
-+
- int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd)
- {
-     int type = ctx->type;
--    int ret;
-+    int ret = 0;
-+    AVCodecContext * const avctx = logger(ctx);
- 
--    ret = ioctl(ctx_to_m2mctx(ctx)->fd, cmd, &type);
--    if (ret < 0)
--        return AVERROR(errno);
-+    // Avoid doing anything if there is nothing we can do
-+    if (cmd == VIDIOC_STREAMOFF && !ctx_buffers_alloced(ctx) && !ctx->streamon)
-+        return 0;
- 
--    ctx->streamon = (cmd == VIDIOC_STREAMON);
-+    ff_mutex_lock(&ctx->lock);
- 
--    return 0;
-+    if (cmd == VIDIOC_STREAMON && !V4L2_TYPE_IS_OUTPUT(ctx->type))
-+        stuff_all_buffers(avctx, ctx);
-+
-+    if (ioctl(ctx_to_m2mctx(ctx)->fd, cmd, &type) < 0) {
-+        const int err = errno;
-+        av_log(avctx, AV_LOG_ERROR, "%s set status %d (%s) failed: err=%d\n", ctx->name,
-+               cmd, (cmd == VIDIOC_STREAMON) ? "ON" : "OFF", err);
-+        ret = AVERROR(err);
-+    }
-+    else
-+    {
-+        if (cmd == VIDIOC_STREAMOFF)
-+            flush_all_buffers_status(ctx);
-+        else
-+            ctx->first_buf = 1;
-+
-+        ctx->streamon = (cmd == VIDIOC_STREAMON);
-+        av_log(avctx, AV_LOG_DEBUG, "%s set status %d (%s) OK\n", ctx->name,
-+               cmd, (cmd == VIDIOC_STREAMON) ? "ON" : "OFF");
-+    }
-+
-+    // Both stream off & on effectively clear flag_last
-+    ctx->flag_last = 0;
-+
-+    ff_mutex_unlock(&ctx->lock);
-+
-+    return ret;
- }
- 
- int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame)
- {
--    V4L2m2mContext *s = ctx_to_m2mctx(ctx);
-+    V4L2m2mContext *const s = ctx_to_m2mctx(ctx);
-+    AVCodecContext *const avctx = s->avctx;
-+    int64_t track_ts;
-     V4L2Buffer* avbuf;
-     int ret;
- 
-     if (!frame) {
-         ret = v4l2_stop_encode(ctx);
-         if (ret)
--            av_log(logger(ctx), AV_LOG_ERROR, "%s stop_encode\n", ctx->name);
-+            av_log(avctx, AV_LOG_ERROR, "%s stop_encode\n", ctx->name);
-         s->draining= 1;
-         return 0;
-     }
-@@ -601,23 +977,29 @@ int ff_v4l2_context_enqueue_frame(V4L2Co
-     if (!avbuf)
-         return AVERROR(ENOMEM);
- 
--    ret = ff_v4l2_buffer_avframe_to_buf(frame, avbuf);
-+    track_ts = xlat_pts_frame_in(avctx, &s->xlat, frame);
-+
-+    ret = ff_v4l2_buffer_avframe_to_buf(frame, avbuf, track_ts);
-     if (ret)
-         return ret;
- 
-     return ff_v4l2_buffer_enqueue(avbuf);
- }
- 
--int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt)
-+int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt,
-+                                   const void * extdata, size_t extlen)
- {
-     V4L2m2mContext *s = ctx_to_m2mctx(ctx);
-+    AVCodecContext *const avctx = s->avctx;
-     V4L2Buffer* avbuf;
-     int ret;
-+    int64_t track_ts;
- 
-     if (!pkt->size) {
-         ret = v4l2_stop_decode(ctx);
-+        // Log but otherwise ignore stop failure
-         if (ret)
--            av_log(logger(ctx), AV_LOG_ERROR, "%s stop_decode\n", ctx->name);
-+            av_log(avctx, AV_LOG_ERROR, "%s stop_decode failed: err=%d\n", ctx->name, ret);
-         s->draining = 1;
-         return 0;
-     }
-@@ -626,8 +1008,13 @@ int ff_v4l2_context_enqueue_packet(V4L2C
-     if (!avbuf)
-         return AVERROR(EAGAIN);
- 
--    ret = ff_v4l2_buffer_avpkt_to_buf(pkt, avbuf);
--    if (ret)
-+    track_ts = xlat_pts_pkt_in(avctx, &s->xlat, pkt);
-+
-+    ret = ff_v4l2_buffer_avpkt_to_buf_ext(pkt, avbuf, extdata, extlen, track_ts);
-+    if (ret == AVERROR(ENOMEM))
-+        av_log(logger(ctx), AV_LOG_ERROR, "Buffer overflow in %s: pkt->size=%d > buf->length=%d\n",
-+               __func__, pkt->size, avbuf->planes[0].length);
-+    else if (ret)
-         return ret;
- 
-     return ff_v4l2_buffer_enqueue(avbuf);
-@@ -635,42 +1022,36 @@ int ff_v4l2_context_enqueue_packet(V4L2C
- 
- int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout)
- {
-+    V4L2m2mContext *s = ctx_to_m2mctx(ctx);
-+    AVCodecContext *const avctx = s->avctx;
-     V4L2Buffer *avbuf;
-+    int rv;
- 
--    /*
--     * timeout=-1 blocks until:
--     *  1. decoded frame available
--     *  2. an input buffer is ready to be dequeued
--     */
--    avbuf = v4l2_dequeue_v4l2buf(ctx, timeout);
--    if (!avbuf) {
--        if (ctx->done)
--            return AVERROR_EOF;
--
--        return AVERROR(EAGAIN);
--    }
-+    do {
-+        if ((rv = get_qbuf(ctx, &avbuf, timeout)) != 0)
-+            return rv;
-+        if ((rv = ff_v4l2_buffer_buf_to_avframe(frame, avbuf)) != 0)
-+            return rv;
-+    } while (xlat_pts_frame_out(avctx, &s->xlat, frame) != 0);
- 
--    return ff_v4l2_buffer_buf_to_avframe(frame, avbuf);
-+   return 0;
- }
- 
- int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt)
- {
-+    V4L2m2mContext *s = ctx_to_m2mctx(ctx);
-+    AVCodecContext *const avctx = s->avctx;
-     V4L2Buffer *avbuf;
-+    int rv;
- 
--    /*
--     * blocks until:
--     *  1. encoded packet available
--     *  2. an input buffer ready to be dequeued
--     */
--    avbuf = v4l2_dequeue_v4l2buf(ctx, -1);
--    if (!avbuf) {
--        if (ctx->done)
--            return AVERROR_EOF;
-+    do {
-+        if ((rv = get_qbuf(ctx, &avbuf, -1)) != 0)
-+            return rv == AVERROR(ENOSPC) ? AVERROR(EAGAIN) : rv;  // Caller not currently expecting ENOSPC
-+        if ((rv = ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf)) != 0)
-+            return rv;
-+    } while (xlat_pts_pkt_out(avctx, &s->xlat, pkt) != 0);
- 
--        return AVERROR(EAGAIN);
--    }
--
--    return ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf);
-+    return 0;
- }
- 
- int ff_v4l2_context_get_format(V4L2Context* ctx, int probe)
-@@ -702,78 +1083,160 @@ int ff_v4l2_context_get_format(V4L2Conte
- 
- int ff_v4l2_context_set_format(V4L2Context* ctx)
- {
--    return ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_S_FMT, &ctx->format);
-+    int ret;
-+
-+    ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_S_FMT, &ctx->format);
-+    if (ret != 0)
-+        return ret;
-+
-+    // Check returned size against min size and if smaller have another go
-+    // Only worry about plane[0] as this is meant to enforce limits for
-+    // encoded streams where we might know a bit more about the shape
-+    // than the driver
-+    if (V4L2_TYPE_IS_MULTIPLANAR(ctx->format.type)) {
-+        if (ctx->min_buf_size <= ctx->format.fmt.pix_mp.plane_fmt[0].sizeimage)
-+            return 0;
-+        ctx->format.fmt.pix_mp.plane_fmt[0].sizeimage = ctx->min_buf_size;
-+    }
-+    else {
-+        if (ctx->min_buf_size <= ctx->format.fmt.pix.sizeimage)
-+            return 0;
-+        ctx->format.fmt.pix.sizeimage = ctx->min_buf_size;
-+    }
-+
-+    ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_S_FMT, &ctx->format);
-+    return ret;
- }
- 
- void ff_v4l2_context_release(V4L2Context* ctx)
- {
-     int ret;
- 
--    if (!ctx->buffers)
-+    if (!ctx->bufrefs)
-         return;
- 
-     ret = v4l2_release_buffers(ctx);
-     if (ret)
-         av_log(logger(ctx), AV_LOG_WARNING, "V4L2 failed to unmap the %s buffers\n", ctx->name);
- 
--    av_freep(&ctx->buffers);
-+    av_freep(&ctx->bufrefs);
-+    av_buffer_unref(&ctx->frames_ref);
-+
-+    ff_mutex_destroy(&ctx->lock);
-+    pthread_cond_destroy(&ctx->cond);
- }
- 
--int ff_v4l2_context_init(V4L2Context* ctx)
-+
-+static int create_buffers(V4L2Context* const ctx, const unsigned int req_buffers, const enum v4l2_memory mem)
- {
--    V4L2m2mContext *s = ctx_to_m2mctx(ctx);
-+    V4L2m2mContext * const s = ctx_to_m2mctx(ctx);
-     struct v4l2_requestbuffers req;
--    int ret, i;
--
--    if (!v4l2_type_supported(ctx)) {
--        av_log(logger(ctx), AV_LOG_ERROR, "type %i not supported\n", ctx->type);
--        return AVERROR_PATCHWELCOME;
--    }
-+    int ret;
-+    int i;
- 
--    ret = ioctl(s->fd, VIDIOC_G_FMT, &ctx->format);
--    if (ret)
--        av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT failed\n", ctx->name);
-+    av_assert0(ctx->bufrefs == NULL);
- 
-     memset(&req, 0, sizeof(req));
--    req.count = ctx->num_buffers;
--    req.memory = V4L2_MEMORY_MMAP;
-+    req.count = req_buffers;
-+    req.memory = mem;
-     req.type = ctx->type;
--    ret = ioctl(s->fd, VIDIOC_REQBUFS, &req);
--    if (ret < 0) {
--        av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_REQBUFS failed: %s\n", ctx->name, strerror(errno));
--        return AVERROR(errno);
-+    while ((ret = ioctl(s->fd, VIDIOC_REQBUFS, &req)) == -1) {
-+        if (errno != EINTR) {
-+            ret = AVERROR(errno);
-+            av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_REQBUFS failed: %s\n", ctx->name, av_err2str(ret));
-+            return ret;
-+        }
-     }
- 
-     ctx->num_buffers = req.count;
--    ctx->buffers = av_mallocz(ctx->num_buffers * sizeof(V4L2Buffer));
--    if (!ctx->buffers) {
-+    ctx->bufrefs = av_mallocz(ctx->num_buffers * sizeof(*ctx->bufrefs));
-+    if (!ctx->bufrefs) {
-         av_log(logger(ctx), AV_LOG_ERROR, "%s malloc enomem\n", ctx->name);
--        return AVERROR(ENOMEM);
-+        goto fail_release;
-     }
- 
--    for (i = 0; i < req.count; i++) {
--        ctx->buffers[i].context = ctx;
--        ret = ff_v4l2_buffer_initialize(&ctx->buffers[i], i);
--        if (ret < 0) {
-+    ctx->wl_master = ff_weak_link_new(ctx);
-+    if (!ctx->wl_master) {
-+        ret = AVERROR(ENOMEM);
-+        goto fail_release;
-+    }
-+
-+    for (i = 0; i < ctx->num_buffers; i++) {
-+        ret = ff_v4l2_buffer_initialize(&ctx->bufrefs[i], i, ctx, mem);
-+        if (ret) {
-             av_log(logger(ctx), AV_LOG_ERROR, "%s buffer[%d] initialization (%s)\n", ctx->name, i, av_err2str(ret));
--            goto error;
-+            goto fail_release;
-         }
-     }
- 
-     av_log(logger(ctx), AV_LOG_DEBUG, "%s: %s %02d buffers initialized: %04ux%04u, sizeimage %08u, bytesperline %08u\n", ctx->name,
-         V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? av_fourcc2str(ctx->format.fmt.pix_mp.pixelformat) : av_fourcc2str(ctx->format.fmt.pix.pixelformat),
-         req.count,
--        v4l2_get_width(&ctx->format),
--        v4l2_get_height(&ctx->format),
-+        ff_v4l2_get_format_width(&ctx->format),
-+        ff_v4l2_get_format_height(&ctx->format),
-         V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? ctx->format.fmt.pix_mp.plane_fmt[0].sizeimage : ctx->format.fmt.pix.sizeimage,
-         V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? ctx->format.fmt.pix_mp.plane_fmt[0].bytesperline : ctx->format.fmt.pix.bytesperline);
- 
-     return 0;
- 
--error:
-+fail_release:
-     v4l2_release_buffers(ctx);
-+    av_freep(&ctx->bufrefs);
-+    return ret;
-+}
-+
-+int ff_v4l2_context_init(V4L2Context* ctx)
-+{
-+    V4L2m2mContext * const s = ctx_to_m2mctx(ctx);
-+    int ret;
-+
-+    // It is not valid to reinit a context without a previous release
-+    av_assert0(ctx->bufrefs == NULL);
- 
--    av_freep(&ctx->buffers);
-+    if (!v4l2_type_supported(ctx)) {
-+        av_log(logger(ctx), AV_LOG_ERROR, "type %i not supported\n", ctx->type);
-+        return AVERROR_PATCHWELCOME;
-+    }
-+
-+    ff_mutex_init(&ctx->lock, NULL);
-+    pthread_cond_init(&ctx->cond, NULL);
-+    atomic_init(&ctx->q_count, 0);
-+
-+    if (s->output_drm) {
-+        AVHWFramesContext *hwframes;
-+
-+        ctx->frames_ref = av_hwframe_ctx_alloc(s->device_ref);
-+        if (!ctx->frames_ref) {
-+            ret = AVERROR(ENOMEM);
-+            goto fail_unlock;
-+        }
-+
-+        hwframes = (AVHWFramesContext*)ctx->frames_ref->data;
-+        hwframes->format = AV_PIX_FMT_DRM_PRIME;
-+        hwframes->sw_format = ctx->av_pix_fmt;
-+        hwframes->width = ctx->width != 0 ? ctx->width : s->avctx->width;
-+        hwframes->height = ctx->height != 0 ? ctx->height : s->avctx->height;
-+        ret = av_hwframe_ctx_init(ctx->frames_ref);
-+        if (ret < 0)
-+            goto fail_unref_hwframes;
-+    }
-+
-+    ret = ioctl(s->fd, VIDIOC_G_FMT, &ctx->format);
-+    if (ret) {
-+        ret = AVERROR(errno);
-+        av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT failed: %s\n", ctx->name, av_err2str(ret));
-+        goto fail_unref_hwframes;
-+    }
-+
-+    ret = create_buffers(ctx, ctx->num_buffers, ctx->buf_mem);
-+    if (ret < 0)
-+        goto fail_unref_hwframes;
-+
-+    return 0;
- 
-+fail_unref_hwframes:
-+    av_buffer_unref(&ctx->frames_ref);
-+fail_unlock:
-+    ff_mutex_destroy(&ctx->lock);
-     return ret;
- }
---- a/libavcodec/v4l2_context.h
-+++ b/libavcodec/v4l2_context.h
-@@ -31,6 +31,7 @@
- #include "libavutil/pixfmt.h"
- #include "libavutil/frame.h"
- #include "libavutil/buffer.h"
-+#include "libavutil/thread.h"
- #include "v4l2_buffers.h"
- 
- typedef struct V4L2Context {
-@@ -70,11 +71,18 @@ typedef struct V4L2Context {
-      */
-     int width, height;
-     AVRational sample_aspect_ratio;
-+    struct v4l2_rect selection;
- 
-     /**
--     * Indexed array of V4L2Buffers
-+     * If the default size of buffer is less than this then try to
-+     * set to this.
-      */
--    V4L2Buffer *buffers;
-+    uint32_t min_buf_size;
-+
-+    /**
-+     * Indexed array of pointers to V4L2Buffers
-+     */
-+    AVBufferRef **bufrefs;
- 
-     /**
-      * Readonly after init.
-@@ -82,16 +90,38 @@ typedef struct V4L2Context {
-     int num_buffers;
- 
-     /**
-+     * Buffer memory type V4L2_MEMORY_MMAP or V4L2_MEMORY_DMABUF
-+     */
-+    enum v4l2_memory buf_mem;
-+
-+    /**
-      * Whether the stream has been started (VIDIOC_STREAMON has been sent).
-      */
-     int streamon;
- 
-+    /* 1st buffer after stream on */
-+    int first_buf;
-+
-     /**
-      *  Either no more buffers available or an unrecoverable error was notified
-      *  by the V4L2 kernel driver: once set the context has to be exited.
-      */
-     int done;
- 
-+    int flag_last;
-+
-+    /**
-+     * If NZ then when Qing frame/pkt use this rather than the
-+     * "real" PTS
-+     */
-+    uint64_t track_ts;
-+
-+    AVBufferRef *frames_ref;
-+    atomic_int q_count;
-+    struct ff_weak_link_master *wl_master;
-+
-+    AVMutex lock;
-+    pthread_cond_t cond;
- } V4L2Context;
- 
- /**
-@@ -156,7 +186,10 @@ int ff_v4l2_context_dequeue_packet(V4L2C
-  * @param[in] ctx The V4L2Context to dequeue from.
-  * @param[inout] f The AVFrame to dequeue to.
-  * @param[in] timeout The timeout for dequeue (-1 to block, 0 to return immediately, or milliseconds)
-+ *
-  * @return 0 in case of success, AVERROR(EAGAIN) if no buffer was ready, another negative error in case of error.
-+ *                AVERROR(ENOSPC) if no buffer availible to put
-+ *                the frame in
-  */
- int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout);
- 
-@@ -170,7 +203,7 @@ int ff_v4l2_context_dequeue_frame(V4L2Co
-  * @param[in] pkt A pointer to an AVPacket.
-  * @return 0 in case of success, a negative error otherwise.
-  */
--int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt);
-+int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, const void * ext_data, size_t ext_size);
- 
- /**
-  * Enqueues a buffer to a V4L2Context from an AVFrame
---- a/libavcodec/v4l2_m2m.c
-+++ b/libavcodec/v4l2_m2m.c
-@@ -36,6 +36,14 @@
- #include "v4l2_fmt.h"
- #include "v4l2_m2m.h"
- 
-+static void
-+xlat_init(xlat_track_t * const x)
-+{
-+    memset(x, 0, sizeof(*x));
-+    x->last_pts = AV_NOPTS_VALUE;
-+}
-+
-+
- static inline int v4l2_splane_video(struct v4l2_capability *cap)
- {
-     if (cap->capabilities & (V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_VIDEO_OUTPUT) &&
-@@ -68,7 +76,9 @@ static int v4l2_prepare_contexts(V4L2m2m
- 
-     s->capture.done = s->output.done = 0;
-     s->capture.name = "capture";
-+    s->capture.buf_mem = V4L2_MEMORY_MMAP;
-     s->output.name = "output";
-+    s->output.buf_mem = s->input_drm ? V4L2_MEMORY_DMABUF : V4L2_MEMORY_MMAP;
-     atomic_init(&s->refcount, 0);
-     sem_init(&s->refsync, 0, 0);
- 
-@@ -215,13 +225,7 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mCont
-         av_log(log_ctx, AV_LOG_ERROR, "capture VIDIOC_STREAMOFF\n");
- 
-     /* 2. unmap the capture buffers (v4l2 and ffmpeg):
--     *    we must wait for all references to be released before being allowed
--     *    to queue new buffers.
-      */
--    av_log(log_ctx, AV_LOG_DEBUG, "waiting for user to release AVBufferRefs\n");
--    if (atomic_load(&s->refcount))
--        while(sem_wait(&s->refsync) == -1 && errno == EINTR);
--
-     ff_v4l2_context_release(&s->capture);
- 
-     /* 3. get the new capture format */
-@@ -240,7 +244,6 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mCont
- 
-     /* 5. complete reinit */
-     s->draining = 0;
--    s->reinit = 0;
- 
-     return 0;
- }
-@@ -274,7 +277,6 @@ int ff_v4l2_m2m_codec_full_reinit(V4L2m2
- 
-     /* start again now that we know the stream dimensions */
-     s->draining = 0;
--    s->reinit = 0;
- 
-     ret = ff_v4l2_context_get_format(&s->output, 0);
-     if (ret) {
-@@ -328,7 +330,13 @@ static void v4l2_m2m_destroy_context(voi
-     ff_v4l2_context_release(&s->capture);
-     sem_destroy(&s->refsync);
- 
--    close(s->fd);
-+    if (s->fd != -1)
-+        close(s->fd);
-+
-+    av_packet_unref(&s->buf_pkt);
-+    av_freep(&s->extdata_data);
-+
-+    av_log(s->avctx, AV_LOG_DEBUG, "V4L2 Context destroyed\n");
- 
-     av_free(s);
- }
-@@ -338,17 +346,34 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *p
-     V4L2m2mContext *s = priv->context;
-     int ret;
- 
--    ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMOFF);
--    if (ret)
--        av_log(s->avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s\n", s->output.name);
-+    if (!s)
-+        return 0;
- 
--    ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF);
--    if (ret)
--        av_log(s->avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s\n", s->capture.name);
-+    av_log(s->avctx, AV_LOG_DEBUG, "V4L2 Codec end\n");
-+
-+    if (av_codec_is_decoder(s->avctx->codec))
-+        av_packet_unref(&s->buf_pkt);
-+
-+    if (s->fd >= 0) {
-+        ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMOFF);
-+        if (ret)
-+            av_log(s->avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s\n", s->output.name);
-+
-+        ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF);
-+        if (ret)
-+            av_log(s->avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s\n", s->capture.name);
-+    }
- 
-     ff_v4l2_context_release(&s->output);
- 
-+    close(s->fd);
-+    s->fd = -1;
-+
-     s->self_ref = NULL;
-+    // This is only called on avctx close so after this point we don't have that
-+    // Crash sooner if we find we are using it (can still log with avctx = NULL)
-+    s->avctx = NULL;
-+    priv->context = NULL;
-     av_buffer_unref(&priv->context_ref);
- 
-     return 0;
-@@ -392,28 +417,33 @@ int ff_v4l2_m2m_codec_init(V4L2m2mPriv *
-     return v4l2_configure_contexts(s);
- }
- 
--int ff_v4l2_m2m_create_context(V4L2m2mPriv *priv, V4L2m2mContext **s)
-+int ff_v4l2_m2m_create_context(V4L2m2mPriv *priv, V4L2m2mContext **pps)
- {
--    *s = av_mallocz(sizeof(V4L2m2mContext));
--    if (!*s)
-+    V4L2m2mContext * const s = av_mallocz(sizeof(V4L2m2mContext));
-+
-+    *pps = NULL;
-+    if (!s)
-         return AVERROR(ENOMEM);
- 
--    priv->context_ref = av_buffer_create((uint8_t *) *s, sizeof(V4L2m2mContext),
-+    priv->context_ref = av_buffer_create((uint8_t *)s, sizeof(*s),
-                                          &v4l2_m2m_destroy_context, NULL, 0);
-     if (!priv->context_ref) {
--        av_freep(s);
-+        av_free(s);
-         return AVERROR(ENOMEM);
-     }
- 
-     /* assign the context */
--    priv->context = *s;
--    (*s)->priv = priv;
-+    priv->context = s;
-+    s->priv = priv;
- 
-     /* populate it */
--    priv->context->capture.num_buffers = priv->num_capture_buffers;
--    priv->context->output.num_buffers  = priv->num_output_buffers;
--    priv->context->self_ref = priv->context_ref;
--    priv->context->fd = -1;
-+    s->capture.num_buffers = priv->num_capture_buffers;
-+    s->output.num_buffers  = priv->num_output_buffers;
-+    s->self_ref = priv->context_ref;
-+    s->fd = -1;
-+
-+    xlat_init(&s->xlat);
- 
-+    *pps = s;
-     return 0;
- }
---- a/libavcodec/v4l2_m2m.h
-+++ b/libavcodec/v4l2_m2m.h
-@@ -30,6 +30,7 @@
- #include <linux/videodev2.h>
- 
- #include "libavcodec/avcodec.h"
-+#include "libavutil/pixfmt.h"
- #include "v4l2_context.h"
- 
- #define container_of(ptr, type, member) ({ \
-@@ -38,7 +39,37 @@
- 
- #define V4L_M2M_DEFAULT_OPTS \
-     { "num_output_buffers", "Number of buffers in the output context",\
--        OFFSET(num_output_buffers), AV_OPT_TYPE_INT, { .i64 = 16 }, 6, INT_MAX, FLAGS }
-+        OFFSET(num_output_buffers), AV_OPT_TYPE_INT, { .i64 = 16 }, 2, INT_MAX, FLAGS }
-+
-+#define FF_V4L2_M2M_TRACK_SIZE 128
-+typedef struct V4L2m2mTrackEl {
-+    int     discard;   // If we see this buffer its been flushed, so discard
-+    int     pending;
-+    int     pkt_size;
-+    int64_t pts;
-+    int64_t dts;
-+    int64_t reordered_opaque;
-+    int64_t pkt_pos;
-+    int64_t pkt_duration;
-+    int64_t track_pts;
-+} V4L2m2mTrackEl;
-+
-+typedef struct pts_stats_s
-+{
-+    void * logctx;
-+    const char * name;  // For debug
-+    unsigned int last_count;
-+    unsigned int last_interval;
-+    int64_t last_pts;
-+    int64_t guess;
-+} pts_stats_t;
-+
-+typedef struct xlat_track_s {
-+    unsigned int track_no;
-+    int64_t last_pts;
-+    int64_t last_opaque;
-+    V4L2m2mTrackEl track_els[FF_V4L2_M2M_TRACK_SIZE];
-+} xlat_track_t;
- 
- typedef struct V4L2m2mContext {
-     char devname[PATH_MAX];
-@@ -52,7 +83,6 @@ typedef struct V4L2m2mContext {
-     AVCodecContext *avctx;
-     sem_t refsync;
-     atomic_uint refcount;
--    int reinit;
- 
-     /* null frame/packet received */
-     int draining;
-@@ -63,6 +93,36 @@ typedef struct V4L2m2mContext {
- 
-     /* reference back to V4L2m2mPriv */
-     void *priv;
-+
-+    AVBufferRef *device_ref;
-+
-+    /* generate DRM frames */
-+    int output_drm;
-+
-+    /* input frames are drmprime */
-+    int input_drm;
-+
-+    /* Frame tracking */
-+    xlat_track_t xlat;
-+    int pending_hw;
-+    int pending_n;
-+
-+    pts_stats_t pts_stat;
-+
-+    /* req pkt */
-+    int req_pkt;
-+
-+    /* Ext data sent */
-+    int extdata_sent;
-+    /* Ext data sent in packet - overrides ctx */
-+    uint8_t * extdata_data;
-+    size_t extdata_size;
-+
-+#define FF_V4L2_QUIRK_REINIT_ALWAYS             1
-+#define FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN    2
-+    /* Quirks */
-+    unsigned int quirks;
-+
- } V4L2m2mContext;
- 
- typedef struct V4L2m2mPriv {
-@@ -73,6 +133,7 @@ typedef struct V4L2m2mPriv {
- 
-     int num_output_buffers;
-     int num_capture_buffers;
-+    enum AVPixelFormat pix_fmt;
- } V4L2m2mPriv;
- 
- /**
-@@ -126,4 +187,26 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mCont
-  */
- int ff_v4l2_m2m_codec_full_reinit(V4L2m2mContext *ctx);
- 
-+
-+static inline unsigned int ff_v4l2_get_format_width(const struct v4l2_format * const fmt)
-+{
-+    return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width;
-+}
-+
-+static inline unsigned int ff_v4l2_get_format_height(const struct v4l2_format * const fmt)
-+{
-+    return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.height : fmt->fmt.pix.height;
-+}
-+
-+static inline uint32_t ff_v4l2_get_format_pixelformat(const struct v4l2_format * const fmt)
-+{
-+    return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.pixelformat : fmt->fmt.pix.pixelformat;
-+}
-+
-+static inline int ff_v4l2_ctx_eos(const V4L2Context * const ctx)
-+{
-+    return ctx->flag_last;
-+}
-+
-+
- #endif /* AVCODEC_V4L2_M2M_H */
---- a/libavcodec/v4l2_m2m_dec.c
-+++ b/libavcodec/v4l2_m2m_dec.c
-@@ -23,6 +23,10 @@
- 
- #include <linux/videodev2.h>
- #include <sys/ioctl.h>
-+
-+#include "libavutil/avassert.h"
-+#include "libavutil/hwcontext.h"
-+#include "libavutil/hwcontext_drm.h"
- #include "libavutil/pixfmt.h"
- #include "libavutil/pixdesc.h"
- #include "libavutil/opt.h"
-@@ -30,75 +34,107 @@
- #include "libavcodec/decode.h"
- #include "libavcodec/internal.h"
- 
-+#include "libavcodec/hwaccels.h"
-+#include "libavcodec/internal.h"
-+#include "libavcodec/hwconfig.h"
-+
- #include "v4l2_context.h"
- #include "v4l2_m2m.h"
- #include "v4l2_fmt.h"
- 
--static int v4l2_try_start(AVCodecContext *avctx)
-+// Pick 64 for max last count - that is >1sec at 60fps
-+#define STATS_LAST_COUNT_MAX 64
-+#define STATS_INTERVAL_MAX (1 << 30)
-+
-+static int64_t pts_stats_guess(const pts_stats_t * const stats)
- {
--    V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context;
--    V4L2Context *const capture = &s->capture;
--    V4L2Context *const output = &s->output;
--    struct v4l2_selection selection = { 0 };
--    int ret;
-+    if (stats->last_pts == AV_NOPTS_VALUE ||
-+            stats->last_interval == 0 ||
-+            stats->last_count >= STATS_LAST_COUNT_MAX)
-+        return AV_NOPTS_VALUE;
-+    return stats->last_pts + (int64_t)(stats->last_count - 1) * (int64_t)stats->last_interval;
-+}
- 
--    /* 1. start the output process */
--    if (!output->streamon) {
--        ret = ff_v4l2_context_set_status(output, VIDIOC_STREAMON);
--        if (ret < 0) {
--            av_log(avctx, AV_LOG_DEBUG, "VIDIOC_STREAMON on output context\n");
--            return ret;
-+static void pts_stats_add(pts_stats_t * const stats, int64_t pts)
-+{
-+    if (pts == AV_NOPTS_VALUE || pts == stats->last_pts) {
-+        if (stats->last_count < STATS_LAST_COUNT_MAX)
-+            ++stats->last_count;
-+        return;
-+    }
-+
-+    if (stats->last_pts != AV_NOPTS_VALUE) {
-+        const int64_t interval = pts - stats->last_pts;
-+
-+        if (interval < 0 || interval >= STATS_INTERVAL_MAX ||
-+            stats->last_count >= STATS_LAST_COUNT_MAX) {
-+            if (stats->last_interval != 0)
-+                av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: Bad interval: %" PRId64 "/%d\n",
-+                       __func__, stats->name, interval, stats->last_count);
-+            stats->last_interval = 0;
-+        }
-+        else {
-+            const int64_t frame_time = interval / (int64_t)stats->last_count;
-+
-+            if (frame_time != stats->last_interval)
-+                av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: New interval: %u->%" PRId64 "/%d=%" PRId64 "\n",
-+                       __func__, stats->name, stats->last_interval, interval, stats->last_count, frame_time);
-+            stats->last_interval = frame_time;
-         }
-     }
- 
--    if (capture->streamon)
-+    stats->last_pts = pts;
-+    stats->last_count = 1;
-+}
-+
-+static void pts_stats_init(pts_stats_t * const stats, void * logctx, const char * name)
-+{
-+    *stats = (pts_stats_t){
-+        .logctx = logctx,
-+        .name = name,
-+        .last_count = 1,
-+        .last_interval = 0,
-+        .last_pts = AV_NOPTS_VALUE
-+    };
-+}
-+
-+static int check_output_streamon(AVCodecContext *const avctx, V4L2m2mContext *const s)
-+{
-+    int ret;
-+    struct v4l2_decoder_cmd cmd = {
-+        .cmd = V4L2_DEC_CMD_START,
-+        .flags = 0,
-+    };
-+
-+    if (s->output.streamon)
-         return 0;
- 
--    /* 2. get the capture format */
--    capture->format.type = capture->type;
--    ret = ioctl(s->fd, VIDIOC_G_FMT, &capture->format);
--    if (ret) {
--        av_log(avctx, AV_LOG_WARNING, "VIDIOC_G_FMT ioctl\n");
-+    ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMON);
-+    if (ret != 0) {
-+        av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMON on output context: %s\n", av_err2str(ret));
-         return ret;
-     }
- 
--    /* 2.1 update the AVCodecContext */
--    avctx->pix_fmt = ff_v4l2_format_v4l2_to_avfmt(capture->format.fmt.pix_mp.pixelformat, AV_CODEC_ID_RAWVIDEO);
--    capture->av_pix_fmt = avctx->pix_fmt;
--
--    /* 3. set the crop parameters */
--    selection.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
--    selection.r.height = avctx->coded_height;
--    selection.r.width = avctx->coded_width;
--    ret = ioctl(s->fd, VIDIOC_S_SELECTION, &selection);
--    if (!ret) {
--        ret = ioctl(s->fd, VIDIOC_G_SELECTION, &selection);
--        if (ret) {
--            av_log(avctx, AV_LOG_WARNING, "VIDIOC_G_SELECTION ioctl\n");
--        } else {
--            av_log(avctx, AV_LOG_DEBUG, "crop output %dx%d\n", selection.r.width, selection.r.height);
--            /* update the size of the resulting frame */
--            capture->height = selection.r.height;
--            capture->width  = selection.r.width;
--        }
-+    // STREAMON should do implicit START so this just for those that don't.
-+    // It is optional so don't worry if it fails
-+    if (ioctl(s->fd, VIDIOC_DECODER_CMD, &cmd) < 0) {
-+        ret = AVERROR(errno);
-+        av_log(avctx, AV_LOG_WARNING, "VIDIOC_DECODER_CMD start error: %s\n", av_err2str(ret));
-     }
--
--    /* 4. init the capture context now that we have the capture format */
--    if (!capture->buffers) {
--        ret = ff_v4l2_context_init(capture);
--        if (ret) {
--            av_log(avctx, AV_LOG_ERROR, "can't request capture buffers\n");
--            return AVERROR(ENOMEM);
--        }
-+    else {
-+        av_log(avctx, AV_LOG_TRACE, "VIDIOC_DECODER_CMD start OK\n");
-     }
-+    return 0;
-+}
- 
--    /* 5. start the capture process */
--    ret = ff_v4l2_context_set_status(capture, VIDIOC_STREAMON);
--    if (ret) {
--        av_log(avctx, AV_LOG_DEBUG, "VIDIOC_STREAMON, on capture context\n");
--        return ret;
--    }
-+static int v4l2_try_start(AVCodecContext *avctx)
-+{
-+    V4L2m2mContext * const s = ((V4L2m2mPriv*)avctx->priv_data)->context;
-+    int ret;
- 
-+    /* 1. start the output process */
-+    if ((ret = check_output_streamon(avctx, s)) != 0)
-+        return ret;
-     return 0;
- }
- 
-@@ -133,52 +169,522 @@ static int v4l2_prepare_decoder(V4L2m2mC
-     return 0;
- }
- 
--static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
-+static void
-+set_best_effort_pts(AVCodecContext *const avctx,
-+             pts_stats_t * const ps,
-+             AVFrame *const frame)
-+{
-+    pts_stats_add(ps, frame->pts);
-+
-+#if FF_API_PKT_PTS
-+FF_DISABLE_DEPRECATION_WARNINGS
-+    frame->pkt_pts = frame->pts;
-+FF_ENABLE_DEPRECATION_WARNINGS
-+#endif
-+    frame->best_effort_timestamp = pts_stats_guess(ps);
-+    // If we can't guess from just PTS - try DTS
-+    if (frame->best_effort_timestamp == AV_NOPTS_VALUE)
-+        frame->best_effort_timestamp = frame->pkt_dts;
-+
-+    // We can't emulate what s/w does in a useful manner and using the
-+    // "correct" answer seems to just confuse things.
-+    frame->pkt_dts               = frame->pts;
-+    av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 "\n",
-+           frame->pts, frame->best_effort_timestamp, frame->pkt_dts);
-+}
-+
-+static void
-+xlat_flush(xlat_track_t * const x)
-+{
-+    unsigned int i;
-+    for (i = 0; i != FF_V4L2_M2M_TRACK_SIZE; ++i) {
-+        x->track_els[i].pending = 0;
-+        x->track_els[i].discard = 1;
-+    }
-+    x->last_pts = AV_NOPTS_VALUE;
-+}
-+
-+static int
-+xlat_pending(const xlat_track_t * const x)
-+{
-+    unsigned int n = x->track_no % FF_V4L2_M2M_TRACK_SIZE;
-+    unsigned int i;
-+    int r = 0;
-+    int64_t now = AV_NOPTS_VALUE;
-+
-+    for (i = 0; i < 32; ++i, n = (n - 1) % FF_V4L2_M2M_TRACK_SIZE) {
-+        const V4L2m2mTrackEl * const t = x->track_els + n;
-+
-+        if (!t->pending)
-+            continue;
-+
-+        if (now == AV_NOPTS_VALUE)
-+            now = t->dts;
-+
-+        if (t->pts == AV_NOPTS_VALUE ||
-+            ((now == AV_NOPTS_VALUE || t->pts <= now) &&
-+             (x->last_pts == AV_NOPTS_VALUE || t->pts > x->last_pts)))
-+            ++r;
-+    }
-+
-+    // If we never get any ideas about PTS vs DTS allow a lot more buffer
-+    if (now == AV_NOPTS_VALUE)
-+        r -= 16;
-+
-+    return r;
-+}
-+
-+static inline int stream_started(const V4L2m2mContext * const s) {
-+    return s->output.streamon;
-+}
-+
-+#define NQ_OK        0
-+#define NQ_Q_FULL    1
-+#define NQ_SRC_EMPTY 2
-+#define NQ_NONE      3
-+#define NQ_DRAINING  4
-+#define NQ_DEAD      5
-+
-+#define TRY_DQ(nq_status) ((nq_status) >= NQ_OK && (nq_status) <= NQ_DRAINING)
-+#define RETRY_NQ(nq_status) ((nq_status) == NQ_Q_FULL || (nq_status) == NQ_NONE)
-+
-+// do_not_get      If true then no new packet will be got but status will
-+//                  be set appropriately
-+
-+// AVERROR_EOF     Flushing an already flushed stream
-+// -ve             Error (all errors except EOF are unexpected)
-+// NQ_OK (0)       OK
-+// NQ_Q_FULL       Dst full (retry if we think V4L2 Q has space now)
-+// NQ_SRC_EMPTY    Src empty (do not retry)
-+// NQ_NONE         Enqueue not attempted
-+// NQ_DRAINING     At EOS, dQ dest until EOS there too
-+// NQ_DEAD         Not running (do not retry, do not attempt capture dQ)
-+
-+static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const s, const int do_not_get)
- {
--    V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context;
--    V4L2Context *const capture = &s->capture;
--    V4L2Context *const output = &s->output;
--    AVPacket avpkt = {0};
-     int ret;
- 
--    if (s->buf_pkt.size) {
--        avpkt = s->buf_pkt;
--        memset(&s->buf_pkt, 0, sizeof(AVPacket));
--    } else {
--        ret = ff_decode_get_packet(avctx, &avpkt);
--        if (ret < 0 && ret != AVERROR_EOF)
-+    // If we don't already have a coded packet - get a new one
-+    // We will already have a coded pkt if the output Q was full last time we
-+    // tried to Q it
-+    if (!s->buf_pkt.size && !do_not_get) {
-+        unsigned int i;
-+
-+        for (i = 0; i < 256; ++i) {
-+            uint8_t * side_data;
-+            size_t side_size;
-+
-+            ret = ff_decode_get_packet(avctx, &s->buf_pkt);
-+            if (ret != 0)
-+                break;
-+
-+            // New extradata is the only side-data we understand
-+            side_data = av_packet_get_side_data(&s->buf_pkt, AV_PKT_DATA_NEW_EXTRADATA, &side_size);
-+            if (side_data) {
-+                av_log(avctx, AV_LOG_DEBUG, "New extradata\n");
-+                av_freep(&s->extdata_data);
-+                if ((s->extdata_data = av_malloc(side_size ? side_size : 1)) == NULL) {
-+                    av_log(avctx, AV_LOG_ERROR, "Failed to alloc %zu bytes of extra data\n", side_size);
-+                    return AVERROR(ENOMEM);
-+                }
-+                memcpy(s->extdata_data, side_data, side_size);
-+                s->extdata_size = side_size;
-+                s->extdata_sent = 0;
-+            }
-+
-+            if (s->buf_pkt.size != 0)
-+                break;
-+
-+            if (s->buf_pkt.side_data_elems == 0) {
-+                av_log(avctx, AV_LOG_WARNING, "Empty pkt from ff_decode_get_packet - treating as EOF\n");
-+                ret = AVERROR_EOF;
-+                break;
-+            }
-+
-+            // Retry a side-data only pkt
-+        }
-+        // If i >= 256 something has gone wrong
-+        if (i >= 256) {
-+            av_log(avctx, AV_LOG_ERROR, "Too many side-data only packets\n");
-+            return AVERROR(EIO);
-+        }
-+
-+        if (ret == AVERROR(EAGAIN)) {
-+            if (!stream_started(s)) {
-+                av_log(avctx, AV_LOG_TRACE, "%s: receive_frame before 1st coded packet\n", __func__);
-+                return NQ_DEAD;
-+            }
-+            return NQ_SRC_EMPTY;
-+        }
-+
-+        if (ret == AVERROR_EOF) {
-+            // EOF - enter drain mode
-+            av_log(avctx, AV_LOG_TRACE, "--- EOS req: ret=%d, size=%d, started=%d, drain=%d\n",
-+                   ret, s->buf_pkt.size, stream_started(s), s->draining);
-+            if (!stream_started(s)) {
-+                av_log(avctx, AV_LOG_DEBUG, "EOS on flushed stream\n");
-+                s->draining = 1;
-+                s->capture.done = 1;
-+                return AVERROR_EOF;
-+            }
-+
-+            if (!s->draining) {
-+                // Calling enqueue with an empty pkt starts drain
-+                av_assert0(s->buf_pkt.size == 0);
-+                ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0);
-+                if (ret) {
-+                    av_log(avctx, AV_LOG_ERROR, "Failed to start drain: ret=%d\n", ret);
-+                    return ret;
-+                }
-+            }
-+            return NQ_DRAINING;
-+        }
-+
-+        if (ret < 0) {
-+            av_log(avctx, AV_LOG_ERROR, "Failed to get coded packet: err=%d\n", ret);
-             return ret;
-+        }
-     }
- 
--    if (s->draining)
--        goto dequeue;
-+    if (s->draining) {
-+        if (s->buf_pkt.size) {
-+            av_log(avctx, AV_LOG_WARNING, "Unexpected input whilst draining\n");
-+            av_packet_unref(&s->buf_pkt);
-+        }
-+        return NQ_DRAINING;
-+    }
- 
--    ret = ff_v4l2_context_enqueue_packet(output, &avpkt);
--    if (ret < 0) {
--        if (ret != AVERROR(EAGAIN))
--           return ret;
-+    if (!s->buf_pkt.size)
-+        return NQ_NONE;
- 
--        s->buf_pkt = avpkt;
--        /* no input buffers available, continue dequeing */
--    }
-+    if ((ret = check_output_streamon(avctx, s)) != 0)
-+        return ret;
-+
-+    if (s->extdata_sent)
-+        ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0);
-+    else if (s->extdata_data)
-+        ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, s->extdata_data, s->extdata_size);
-+    else
-+        ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, avctx->extradata, avctx->extradata_size);
-+
-+    if (ret == AVERROR(EAGAIN)) {
-+        // Out of input buffers - keep packet
-+        ret = NQ_Q_FULL;
-+    }
-+    else {
-+        // In all other cases we are done with this packet
-+        av_packet_unref(&s->buf_pkt);
-+        s->extdata_sent = 1;
- 
--    if (avpkt.size) {
--        ret = v4l2_try_start(avctx);
-         if (ret) {
--            av_packet_unref(&avpkt);
-+            av_log(avctx, AV_LOG_ERROR, "Packet enqueue failure: err=%d\n", ret);
-+            return ret;
-+        }
-+    }
- 
--            /* cant recover */
--            if (ret == AVERROR(ENOMEM))
--                return ret;
-+    // Start if we haven't
-+    {
-+        const int ret2 = v4l2_try_start(avctx);
-+        if (ret2) {
-+            av_log(avctx, AV_LOG_DEBUG, "Start failure: err=%d\n", ret2);
-+            ret = (ret2 == AVERROR(ENOMEM)) ? ret2 : NQ_DEAD;
-+        }
-+    }
-+
-+    return ret;
-+}
-+
-+static int qbuf_wait(AVCodecContext * const avctx, V4L2Context * const ctx)
-+{
-+    int rv = 0;  // 0 on normal wake-up, AVERROR code if the condition wait fails
- 
--            return 0;
-+    ff_mutex_lock(&ctx->lock);
-+
-+    while (atomic_load(&ctx->q_count) == 0 && ctx->streamon) {  // sleep while q_count is 0 and still streaming
-+        if ((rv = pthread_cond_wait(&ctx->cond, &ctx->lock)) != 0) {
-+            rv = AVERROR(rv);  // pthread_* return the error number directly; errno is not set
-+            av_log(avctx, AV_LOG_ERROR, "Cond wait failure: %s\n", av_err2str(rv));
-+            break;
-         }
-     }
- 
--dequeue:
--    if (!s->buf_pkt.size)
--        av_packet_unref(&avpkt);
--    return ff_v4l2_context_dequeue_frame(capture, frame, -1);
-+    ff_mutex_unlock(&ctx->lock);
-+    return rv;
-+}
-+
-+// Number of frames over what xlat_pending returns that we keep *16
-+// This is a min value - if it appears to be too small the threshold should
-+// adjust dynamically.
-+#define PENDING_HW_MIN      (3 * 16)
-+// Offset to use when setting dynamically
-+// Set to %16 == 15 to avoid the threshold changing immediately as we relax
-+#define PENDING_HW_OFFSET   (PENDING_HW_MIN - 1)
-+// Number of consecutive times we've failed to get a frame when we prefer it
-+// before we increase the prefer threshold (5ms * N = max expected decode
-+// time)
-+#define PENDING_N_THRESHOLD 6
-+
-+static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
-+{
-+    V4L2m2mContext *const s = ((V4L2m2mPriv*)avctx->priv_data)->context;
-+    int src_rv = NQ_OK;
-+    int dst_rv = 1;  // Neither 0 (frame obtained) nor negative (error): no result yet
-+    unsigned int i = 0;
-+
-+    do {
-+        const int pending = xlat_pending(&s->xlat);
-+        const int prefer_dq = (pending > s->pending_hw / 16);
-+        const int last_src_rv = src_rv;
-+
-+        // Enqueue another pkt for decode if
-+        // (a) We don't have a lot of stuff in the buffer already OR
-+        // (b) ... we (think we) do but we've failed to get a frame already OR
-+        // (c) We've dequeued a lot of frames without asking for input
-+        src_rv = try_enqueue_src(avctx, s, !(!prefer_dq || i != 0 || s->req_pkt > 2));
-+
-+        // If we got a frame last time or we've already tried to get a frame and
-+        // we have nothing to enqueue then return now. rv will be AVERROR(EAGAIN)
-+        // indicating that we want more input.
-+        // This should mean that once decode starts we enter a stable state where
-+        // we alternately ask for input and produce output
-+        if ((i != 0 || s->req_pkt) && src_rv == NQ_SRC_EMPTY)
-+            break;
-+
-+        if (src_rv == NQ_Q_FULL && last_src_rv == NQ_Q_FULL) {
-+            av_log(avctx, AV_LOG_WARNING, "Poll thinks src Q has space; none found\n");
-+            break;
-+        }
-+
-+        // Try to get a new frame if
-+        // (a) we haven't already got one AND
-+        // (b) enqueue returned a status indicating that decode should be attempted
-+        if (dst_rv != 0 && TRY_DQ(src_rv)) {
-+            // Pick a timeout depending on state
-+            const int t =
-+                src_rv == NQ_DRAINING ? 300 :
-+                prefer_dq ? 5 :
-+                src_rv == NQ_Q_FULL ? -1 : 0;
-+
-+            // Dequeue frame will unref any previous contents of frame
-+            // if it returns success so we don't need an explicit unref
-+            // when discarding
-+            // This returns AVERROR(EAGAIN) on timeout or if
-+            // there is room in the input Q and timeout == -1
-+            dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t);
-+
-+            // Failure due to no buffer in Q?
-+            if (dst_rv == AVERROR(ENOSPC)) {
-+                // Wait & retry
-+                if ((dst_rv = qbuf_wait(avctx, &s->capture)) == 0) {
-+                    dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t);
-+                }
-+            }
-+
-+            // Adjust dynamic pending threshold
-+            if (dst_rv == 0) {
-+                if (--s->pending_hw < PENDING_HW_MIN)
-+                    s->pending_hw = PENDING_HW_MIN;
-+                s->pending_n = 0;
-+
-+                set_best_effort_pts(avctx, &s->pts_stat, frame);
-+            }
-+            else if (dst_rv == AVERROR(EAGAIN)) {
-+                if (prefer_dq && ++s->pending_n > PENDING_N_THRESHOLD) {
-+                    s->pending_hw = pending * 16 + PENDING_HW_OFFSET;
-+                    s->pending_n = 0;
-+                }
-+            }
-+
-+            if (dst_rv == AVERROR(EAGAIN) && src_rv == NQ_DRAINING) {
-+                av_log(avctx, AV_LOG_WARNING, "Timeout in drain - assume EOF\n");
-+                dst_rv = AVERROR_EOF;
-+                s->capture.done = 1;
-+            }
-+            else if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done))
-+                av_log(avctx, AV_LOG_DEBUG, "Dequeue EOF: draining=%d, cap.done=%d\n",
-+                       s->draining, s->capture.done);
-+            else if (dst_rv && dst_rv != AVERROR(EAGAIN))
-+                av_log(avctx, AV_LOG_ERROR, "Packet dequeue failure: draining=%d, cap.done=%d, err=%d\n",
-+                       s->draining, s->capture.done, dst_rv);
-+        }
-+
-+        ++i;
-+        if (i >= 256) {
-+            av_log(avctx, AV_LOG_ERROR, "Unexpectedly large retry count: %u\n", i);
-+            src_rv = AVERROR(EIO);
-+        }
-+
-+        // Continue trying to enqueue packets if either
-+        // (a) we succeeded last time OR
-+        // (b) we didn't ret a frame and we can retry the input
-+    } while (src_rv == NQ_OK || (dst_rv == AVERROR(EAGAIN) && RETRY_NQ(src_rv)));
-+
-+    // Ensure that the frame contains nothing if we aren't returning a frame
-+    // (might happen when discarding)
-+    if (dst_rv)
-+        av_frame_unref(frame);
-+
-+    // If we got a frame this time ask for a pkt next time
-+    s->req_pkt = (dst_rv == 0) ? s->req_pkt + 1 : 0;
-+
-+#if 0
-+    if (dst_rv == 0)
-+    {
-+        static int z = 0;
-+        if (++z > 50) {
-+            av_log(avctx, AV_LOG_ERROR, "Streamoff and die?\n");
-+            ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF);
-+            return -1;
-+        }
-+    }
-+#endif
-+
-+    return dst_rv == 0 ? 0 :
-+        src_rv < 0 ? src_rv :
-+        dst_rv < 0 ? dst_rv :
-+            AVERROR(EAGAIN);
-+}
-+
-+#if 0  // Disabled timing wrapper: it redefines v4l2_receive_frame and calls v4l2_receive_frame2, which is not defined in this patch
-+#include <time.h>
-+static int64_t us_time(void)  // CLOCK_MONOTONIC reading converted to microseconds
-+{
-+    struct timespec ts;
-+    clock_gettime(CLOCK_MONOTONIC, &ts);
-+    return (int64_t)ts.tv_sec * 1000000 + ts.tv_nsec / 1000;
-+}
-+
-+static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
-+{
-+    int ret;
-+    const int64_t now = us_time();
-+    int64_t done;
-+    av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__);
-+    ret = v4l2_receive_frame2(avctx, frame);
-+    done = us_time();
-+    av_log(avctx, AV_LOG_TRACE, ">>> %s: rx time=%" PRId64 ", rv=%d\n", __func__, done - now, ret);
-+    return ret;
-+}
-+#endif
-+
-+static int
-+check_size(AVCodecContext * const avctx, V4L2m2mContext * const s)
-+{
-+    unsigned int i;
-+    const uint32_t fcc = ff_v4l2_get_format_pixelformat(&s->capture.format);
-+    const uint32_t w = avctx->coded_width;
-+    const uint32_t h = avctx->coded_height;
-+
-+    if (w == 0 || h == 0 || fcc == 0) {
-+        av_log(avctx, AV_LOG_TRACE, "%s: Size %dx%d or fcc %s empty\n", __func__, w, h, av_fourcc2str(fcc));
-+        return 0;
-+    }
-+    if ((s->quirks & FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN) != 0) {
-+        av_log(avctx, AV_LOG_TRACE, "%s: Skipped (quirk): Size %dx%d, fcc %s\n", __func__, w, h, av_fourcc2str(fcc));
-+        return 0;
-+    }
-+
-+    for (i = 0;; ++i) {  // walk the driver's frame size list until a match or the list ends (EINVAL)
-+        struct v4l2_frmsizeenum fs = {
-+            .index = i,
-+            .pixel_format = fcc,
-+        };
-+
-+        while (ioctl(s->fd, VIDIOC_ENUM_FRAMESIZES, &fs) != 0) {
-+            const int err = AVERROR(errno);
-+            if (err == AVERROR(EINTR))
-+                continue;
-+            if (i == 0 && err == AVERROR(ENOTTY)) {
-+                av_log(avctx, AV_LOG_DEBUG, "Framesize enum not supported\n");
-+                return 0;
-+            }
-+            if (err != AVERROR(EINVAL)) {
-+                av_log(avctx, AV_LOG_ERROR, "Failed to enum framesizes: %s\n", av_err2str(err));
-+                return err;
-+            }
-+            av_log(avctx, AV_LOG_WARNING, "Failed to find Size=%dx%d, fmt=%s in %u frame size enums\n",
-+                   w, h, av_fourcc2str(fcc), i);
-+            return err;
-+        }
-+
-+        switch (fs.type) {
-+            case V4L2_FRMSIZE_TYPE_DISCRETE:
-+                av_log(avctx, AV_LOG_TRACE, "%s[%d]: Discrete: %dx%d\n", __func__, i,
-+                       fs.discrete.width,fs.discrete.height);
-+                if (w == fs.discrete.width && h == fs.discrete.height)
-+                    return 0;
-+                break;
-+            case V4L2_FRMSIZE_TYPE_STEPWISE:
-+                av_log(avctx, AV_LOG_TRACE, "%s[%d]: Stepwise: Min: %dx%d Max: %dx%d, Step: %dx%d\n", __func__, i,
-+                       fs.stepwise.min_width, fs.stepwise.min_height,
-+                       fs.stepwise.max_width, fs.stepwise.max_height,
-+                       fs.stepwise.step_width,fs.stepwise.step_height);
-+                if (w >= fs.stepwise.min_width && w <= fs.stepwise.max_width &&  // a zero step is treated as "no step constraint" to avoid dividing by 0
-+                    h >= fs.stepwise.min_height && h <= fs.stepwise.max_height &&
-+                    (fs.stepwise.step_width == 0 || (w - fs.stepwise.min_width) % fs.stepwise.step_width == 0) &&
-+                    (fs.stepwise.step_height == 0 || (h - fs.stepwise.min_height) % fs.stepwise.step_height == 0))
-+                    return 0;
-+                break;
-+            case V4L2_FRMSIZE_TYPE_CONTINUOUS:
-+                av_log(avctx, AV_LOG_TRACE, "%s[%d]: Continuous: Min: %dx%d Max: %dx%d, Step: %dx%d\n", __func__, i,
-+                       fs.stepwise.min_width, fs.stepwise.min_height,
-+                       fs.stepwise.max_width, fs.stepwise.max_height,
-+                       fs.stepwise.step_width,fs.stepwise.step_height);
-+                if (w >= fs.stepwise.min_width && w <= fs.stepwise.max_width &&
-+                    h >= fs.stepwise.min_height && h <= fs.stepwise.max_height)
-+                    return 0;
-+                break;
-+            default:
-+                av_log(avctx, AV_LOG_ERROR, "Unexpected framesize enum: %d\n", fs.type);
-+                return AVERROR(EINVAL);
-+        }
-+    }
-+}
-+
-+static int
-+get_quirks(AVCodecContext * const avctx, V4L2m2mContext * const s)
-+{
-+    struct v4l2_capability cap;
-+
-+    memset(&cap, 0, sizeof(cap));
-+    while (ioctl(s->fd, VIDIOC_QUERYCAP, &cap) != 0) {
-+        int err = errno;
-+        if (err == EINTR)
-+            continue;  // interrupted by a signal: retry the ioctl
-+        av_log(avctx, AV_LOG_ERROR, "V4L2: Failed to get capabilities: %s\n", strerror(err));
-+        return AVERROR(err);
-+    }
-+
-+    // Could be made table-driven if more quirks appear, but right now there
-+    // seems to be no point
-+
-+    // Meson (amlogic) always gives a resolution changed event after output
-+    // streamon and userspace must (re)allocate capture buffers and streamon
-+    // capture to clear the event even if the capture buffers were the right
-+    // size in the first place.
-+    if (strcmp(cap.driver, "meson-vdec") == 0)
-+        s->quirks |= FF_V4L2_QUIRK_REINIT_ALWAYS | FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN;
-+
-+    av_log(avctx, AV_LOG_DEBUG, "Driver '%s': Quirks=%#x\n", cap.driver, s->quirks);
-+    return 0;
-+}
-+
-+// Estimated max coded frame size in bytes; the heuristic is for H264 but is used for everything
-+static uint32_t max_coded_size(const AVCodecContext * const avctx)
-+{
-+    uint32_t wxh = avctx->coded_width * avctx->coded_height;
-+    uint32_t size;
-+
-+    size = wxh * 3 / 2;  // w * h * 3/2 (presumably the 8-bit 4:2:0 raw frame size)
-+    // H.264 Annex A table A-1 gives minCR which is either 2 or 4
-+    // unfortunately that doesn't yield an actually useful limit
-+    // and it should be noted that frame 0 is special cased to allow
-+    // a bigger number which really isn't helpful for us. So just pick
-+    // frame_size / 2
-+    size /= 2;
-+    // Add 64k to allow for any overheads and/or encoder hopefulness
-+    // with small WxH
-+    return size + (1 << 16);
- }
- 
- static av_cold int v4l2_decode_init(AVCodecContext *avctx)
-@@ -186,12 +692,29 @@ static av_cold int v4l2_decode_init(AVCo
-     V4L2Context *capture, *output;
-     V4L2m2mContext *s;
-     V4L2m2mPriv *priv = avctx->priv_data;
-+    int gf_pix_fmt;
-     int ret;
- 
-+    av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__);
-+
-+    if (avctx->codec_id == AV_CODEC_ID_H264) {
-+        if (avctx->ticks_per_frame == 1) {
-+            if(avctx->time_base.den < INT_MAX/2) {
-+                avctx->time_base.den *= 2;
-+            } else
-+                avctx->time_base.num /= 2;
-+        }
-+        avctx->ticks_per_frame = 2;
-+    }
-+
-+    av_log(avctx, AV_LOG_INFO, "level=%d\n", avctx->level);
-     ret = ff_v4l2_m2m_create_context(priv, &s);
-     if (ret < 0)
-         return ret;
- 
-+    pts_stats_init(&s->pts_stat, avctx, "decoder");
-+    s->pending_hw = PENDING_HW_MIN;
-+
-     capture = &s->capture;
-     output = &s->output;
- 
-@@ -199,34 +722,127 @@ static av_cold int v4l2_decode_init(AVCo
-      * by the v4l2 driver; this event will trigger a full pipeline reconfig and
-      * the proper values will be retrieved from the kernel driver.
-      */
--    output->height = capture->height = avctx->coded_height;
--    output->width = capture->width = avctx->coded_width;
-+//    output->height = capture->height = avctx->coded_height;
-+//    output->width = capture->width = avctx->coded_width;
-+    output->height = capture->height = 0;
-+    output->width = capture->width = 0;
- 
-     output->av_codec_id = avctx->codec_id;
-     output->av_pix_fmt  = AV_PIX_FMT_NONE;
-+    output->min_buf_size = max_coded_size(avctx);
- 
-     capture->av_codec_id = AV_CODEC_ID_RAWVIDEO;
-     capture->av_pix_fmt = avctx->pix_fmt;
-+    capture->min_buf_size = 0;
-+
-+    /* the client requests the codec to generate DRM frames:
-+     *   - data[0] will therefore point to the returned AVDRMFrameDescriptor
-+     *       check the ff_v4l2_buffer_to_avframe conversion function.
-+     *   - the DRM frame format is passed in the DRM frame descriptor layer.
-+     *       check the v4l2_get_drm_frame function.
-+     */
-+
-+    avctx->sw_pix_fmt = avctx->pix_fmt;
-+    gf_pix_fmt = ff_get_format(avctx, avctx->codec->pix_fmts);
-+    av_log(avctx, AV_LOG_DEBUG, "avctx requested=%d (%s) %dx%d; get_format requested=%d (%s)\n",
-+           avctx->pix_fmt, av_get_pix_fmt_name(avctx->pix_fmt),
-+           avctx->coded_width, avctx->coded_height,
-+           gf_pix_fmt, av_get_pix_fmt_name(gf_pix_fmt));
-+
-+    if (gf_pix_fmt == AV_PIX_FMT_DRM_PRIME || avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME) {
-+        avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME;
-+        s->output_drm = 1;
-+    }
-+    else {
-+        capture->av_pix_fmt = gf_pix_fmt;
-+        s->output_drm = 0;
-+    }
-+
-+    s->device_ref = av_hwdevice_ctx_alloc(AV_HWDEVICE_TYPE_DRM);
-+    if (!s->device_ref) {
-+        ret = AVERROR(ENOMEM);
-+        return ret;
-+    }
-+
-+    ret = av_hwdevice_ctx_init(s->device_ref);
-+    if (ret < 0)
-+        return ret;
- 
-     s->avctx = avctx;
-     ret = ff_v4l2_m2m_codec_init(priv);
-     if (ret) {
-         av_log(avctx, AV_LOG_ERROR, "can't configure decoder\n");
--        s->self_ref = NULL;
--        av_buffer_unref(&priv->context_ref);
--
-         return ret;
-     }
- 
--    return v4l2_prepare_decoder(s);
-+    if ((ret = v4l2_prepare_decoder(s)) < 0)
-+        return ret;
-+
-+    if ((ret = get_quirks(avctx, s)) != 0)
-+        return ret;
-+
-+    if ((ret = check_size(avctx, s)) != 0)
-+        return ret;
-+
-+    return 0;
- }
- 
- static av_cold int v4l2_decode_close(AVCodecContext *avctx)
- {
--    V4L2m2mPriv *priv = avctx->priv_data;
--    V4L2m2mContext *s = priv->context;
-+    int rv;  // status of ff_v4l2_m2m_codec_end(), returned to the caller unchanged
-+    av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__);
-+    rv = ff_v4l2_m2m_codec_end(avctx->priv_data);
-+    av_log(avctx, AV_LOG_TRACE, ">>> %s: rv=%d\n", __func__, rv);
-+    return rv;
-+}
-+
-+static void v4l2_decode_flush(AVCodecContext *avctx)
-+{
-+    // An alternative and more drastic form of flush is to simply do this:
-+    //    v4l2_decode_close(avctx);
-+    //    v4l2_decode_init(avctx);
-+    // The downside is that this keeps a decoder open until all the frames
-+    // associated with it have been returned.  This is a bit wasteful on
-+    // possibly limited h/w resources and fails on a Pi for this reason unless
-+    // more GPU mem is allocated than is the default.
-+
-+    V4L2m2mPriv * const priv = avctx->priv_data;
-+    V4L2m2mContext * const s = priv->context;
-+    V4L2Context * const output = &s->output;
-+    V4L2Context * const capture = &s->capture;
-+
-+    av_log(avctx, AV_LOG_TRACE, "<<< %s: streamon=%d\n", __func__, output->streamon);
-+
-+    // Reflushing everything is benign, quick and avoids having to worry about
-+    // states like EOS processing so don't try to optimize out (having got it
-+    // wrong once)
-+
-+    ff_v4l2_context_set_status(output, VIDIOC_STREAMOFF);
-+
-+    // Clear any buffered input packet
-     av_packet_unref(&s->buf_pkt);
--    return ff_v4l2_m2m_codec_end(priv);
-+
-+    // Clear a pending EOS
-+    if (ff_v4l2_ctx_eos(capture)) {
-+        // Arguably we could delay this but this is easy and doesn't require
-+        // thought or extra vars
-+        ff_v4l2_context_set_status(capture, VIDIOC_STREAMOFF);
-+        ff_v4l2_context_set_status(capture, VIDIOC_STREAMON);
-+    }
-+
-+    // V4L2 makes no guarantees about whether decoded frames are flushed or not
-+    // so mark all frames we are tracking to be discarded if they appear
-+    xlat_flush(&s->xlat);
-+
-+    // resend extradata
-+    s->extdata_sent = 0;
-+    // clear EOS status vars
-+    s->draining = 0;
-+    output->done = 0;
-+    capture->done = 0;
-+
-+    // Stream on will occur when we actually submit a new frame
-+    av_log(avctx, AV_LOG_TRACE, ">>> %s\n", __func__);
- }
- 
- #define OFFSET(x) offsetof(V4L2m2mPriv, x)
-@@ -235,10 +851,16 @@ static av_cold int v4l2_decode_close(AVC
- static const AVOption options[] = {
-     V4L_M2M_DEFAULT_OPTS,
-     { "num_capture_buffers", "Number of buffers in the capture context",
--        OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 20}, 20, INT_MAX, FLAGS },
-+        OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 20}, 2, INT_MAX, FLAGS },
-+    { "pixel_format", "Pixel format to be used by the decoder", OFFSET(pix_fmt), AV_OPT_TYPE_PIXEL_FMT, {.i64 = AV_PIX_FMT_NONE}, AV_PIX_FMT_NONE, AV_PIX_FMT_NB, FLAGS },
-     { NULL},
- };
- 
-+static const AVCodecHWConfigInternal *v4l2_m2m_hw_configs[] = {
-+    HW_CONFIG_INTERNAL(DRM_PRIME),
-+    NULL
-+};
-+
- #define M2MDEC_CLASS(NAME) \
-     static const AVClass v4l2_m2m_ ## NAME ## _dec_class = { \
-         .class_name = #NAME "_v4l2m2m_decoder", \
-@@ -259,9 +881,15 @@ static const AVOption options[] = {
-         .init           = v4l2_decode_init, \
-         .receive_frame  = v4l2_receive_frame, \
-         .close          = v4l2_decode_close, \
-+        .flush          = v4l2_decode_flush, \
-         .bsfs           = bsf_name, \
-         .capabilities   = AV_CODEC_CAP_HARDWARE | AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AVOID_PROBING, \
--        .caps_internal  = FF_CODEC_CAP_SETS_PKT_DTS, \
-+        .caps_internal  = FF_CODEC_CAP_SETS_PKT_DTS | FF_CODEC_CAP_INIT_CLEANUP, \
-+        .pix_fmts       = (const enum AVPixelFormat[]) { AV_PIX_FMT_DRM_PRIME, \
-+                                                         AV_PIX_FMT_NV12, \
-+                                                         AV_PIX_FMT_YUV420P, \
-+                                                         AV_PIX_FMT_NONE}, \
-+        .hw_configs     = v4l2_m2m_hw_configs, \
-         .wrapper_name   = "v4l2m2m", \
-     }
- 
---- a/libavcodec/v4l2_m2m_enc.c
-+++ b/libavcodec/v4l2_m2m_enc.c
-@@ -24,6 +24,8 @@
- #include <linux/videodev2.h>
- #include <sys/ioctl.h>
- #include <search.h>
-+#include <drm_fourcc.h>
-+
- #include "libavcodec/avcodec.h"
- #include "libavcodec/internal.h"
- #include "libavutil/pixdesc.h"
-@@ -37,6 +39,34 @@
- #define MPEG_CID(x) V4L2_CID_MPEG_VIDEO_##x
- #define MPEG_VIDEO(x) V4L2_MPEG_VIDEO_##x
- 
-+// P030 should be defined in drm_fourcc.h and hopefully will be sometime
-+// in the future but until then...
-+#ifndef DRM_FORMAT_P030
-+#define DRM_FORMAT_P030 fourcc_code('P', '0', '3', '0')
-+#endif
-+
-+#ifndef DRM_FORMAT_NV15
-+#define DRM_FORMAT_NV15 fourcc_code('N', 'V', '1', '5')
-+#endif
-+
-+#ifndef DRM_FORMAT_NV20
-+#define DRM_FORMAT_NV20 fourcc_code('N', 'V', '2', '0')
-+#endif
-+
-+#ifndef V4L2_CID_CODEC_BASE
-+#define V4L2_CID_CODEC_BASE V4L2_CID_MPEG_BASE
-+#endif
-+
-+// V4L2_PIX_FMT_NV12_10_COL128 and V4L2_PIX_FMT_NV12_COL128 should be defined
-+// in videodev2.h and hopefully will be sometime in the future but until then...
-+#ifndef V4L2_PIX_FMT_NV12_10_COL128
-+#define V4L2_PIX_FMT_NV12_10_COL128 v4l2_fourcc('N', 'C', '3', '0')
-+#endif
-+
-+#ifndef V4L2_PIX_FMT_NV12_COL128
-+#define V4L2_PIX_FMT_NV12_COL128 v4l2_fourcc('N', 'C', '1', '2') /* 12  Y/CbCr 4:2:0 128 pixel wide column */
-+#endif
-+
- static inline void v4l2_set_timeperframe(V4L2m2mContext *s, unsigned int num, unsigned int den)
- {
-     struct v4l2_streamparm parm = { 0 };
-@@ -147,15 +177,14 @@ static inline int v4l2_mpeg4_profile_fro
- static int v4l2_check_b_frame_support(V4L2m2mContext *s)
- {
-     if (s->avctx->max_b_frames)
--        av_log(s->avctx, AV_LOG_WARNING, "Encoder does not support b-frames yet\n");
-+        av_log(s->avctx, AV_LOG_WARNING, "Encoder does not support %d b-frames yet\n", s->avctx->max_b_frames);
- 
--    v4l2_set_ext_ctrl(s, MPEG_CID(B_FRAMES), 0, "number of B-frames", 0);
-+    v4l2_set_ext_ctrl(s, MPEG_CID(B_FRAMES), s->avctx->max_b_frames, "number of B-frames", 1);  // request the configured count; the read-back below presumably overwrites max_b_frames with the driver's value
-     v4l2_get_ext_ctrl(s, MPEG_CID(B_FRAMES), &s->avctx->max_b_frames, "number of B-frames", 0);
-     if (s->avctx->max_b_frames == 0)
-         return 0;
- 
-     avpriv_report_missing_feature(s->avctx, "DTS/PTS calculation for V4L2 encoding");
--
-     return AVERROR_PATCHWELCOME;
- }
- 
-@@ -270,13 +299,184 @@ static int v4l2_prepare_encoder(V4L2m2mC
-     return 0;
- }
- 
-+static int avdrm_to_v4l2(struct v4l2_format * const format, const AVFrame * const frame)
-+{
-+    const AVDRMFrameDescriptor *const src = (const AVDRMFrameDescriptor *)frame->data[0];
-+
-+    const uint32_t drm_fmt = src->layers[0].format;
-+    // Treat INVALID as LINEAR
-+    const uint64_t mod = src->objects[0].format_modifier == DRM_FORMAT_MOD_INVALID ?
-+        DRM_FORMAT_MOD_LINEAR : src->objects[0].format_modifier;
-+    uint32_t pix_fmt = 0;
-+    uint32_t w = 0;
-+    uint32_t h = 0;
-+    uint32_t bpl = src->layers[0].planes[0].pitch;
-+
-+    // We really don't expect multiple layers
-+    // All formats that we currently cope with are single object
-+    // A zero pitch is rejected too: the linear cases below divide by it
-+    if (src->nb_layers != 1 || src->nb_objects != 1 || bpl == 0)
-+        return AVERROR(EINVAL);
-+
-+    switch (drm_fmt) {
-+        case DRM_FORMAT_YUV420:
-+            if (mod == DRM_FORMAT_MOD_LINEAR) {
-+                if (src->layers[0].nb_planes != 3)
-+                    break;
-+                pix_fmt = V4L2_PIX_FMT_YUV420;
-+                h = src->layers[0].planes[1].offset / bpl;
-+                w = bpl;
-+            }
-+            break;
-+
-+        case DRM_FORMAT_NV12:
-+            if (mod == DRM_FORMAT_MOD_LINEAR) {
-+                if (src->layers[0].nb_planes != 2)
-+                    break;
-+                pix_fmt = V4L2_PIX_FMT_NV12;
-+                h = src->layers[0].planes[1].offset / bpl;
-+                w = bpl;
-+            }
-+            else if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) {
-+                if (src->layers[0].nb_planes != 2)
-+                    break;
-+                pix_fmt = V4L2_PIX_FMT_NV12_COL128;
-+                w = bpl;
-+                h = src->layers[0].planes[1].offset / 128;
-+                bpl = fourcc_mod_broadcom_param(mod);
-+            }
-+            break;
-+
-+        case DRM_FORMAT_P030:
-+            if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) {
-+                if (src->layers[0].nb_planes != 2)
-+                    break;
-+                pix_fmt = V4L2_PIX_FMT_NV12_10_COL128;
-+                w = bpl / 2;  // Matching lie to how we construct this
-+                h = src->layers[0].planes[1].offset / 128;
-+                bpl = fourcc_mod_broadcom_param(mod);
-+            }
-+            break;
-+
-+        default:
-+            break;
-+    }
-+
-+    if (!pix_fmt)  // unsupported format/modifier combination or unexpected plane count
-+        return AVERROR(EINVAL);
-+
-+    if (V4L2_TYPE_IS_MULTIPLANAR(format->type)) {
-+        struct v4l2_pix_format_mplane *const pix = &format->fmt.pix_mp;
-+
-+        pix->width = w;
-+        pix->height = h;
-+        pix->pixelformat = pix_fmt;
-+        pix->plane_fmt[0].bytesperline = bpl;
-+        pix->num_planes = 1;
-+    }
-+    else {
-+        struct v4l2_pix_format *const pix = &format->fmt.pix;
-+
-+        pix->width = w;
-+        pix->height = h;
-+        pix->pixelformat = pix_fmt;
-+        pix->bytesperline = bpl;
-+    }
-+
-+    return 0;
-+}
-+
-+// Returns 1 when a and b agree on type, pixelformat and bytesperline (per plane), else 0
-+static int fmt_eq(const struct v4l2_format * const a, const struct v4l2_format * const b)
-+{
-+    unsigned int plane = 0;
-+
-+    if (a->type != b->type)
-+        return 0;
-+
-+    // Single-planar API: one pixelformat and one bytesperline to compare
-+    if (!V4L2_TYPE_IS_MULTIPLANAR(a->type)) {
-+        const struct v4l2_pix_format *const lhs = &a->fmt.pix;
-+        const struct v4l2_pix_format *const rhs = &b->fmt.pix;
-+        return lhs->pixelformat == rhs->pixelformat &&
-+               lhs->bytesperline == rhs->bytesperline;
-+    }
-+
-+    // Multi-planar API: plane counts must agree, then every plane's bytesperline
-+    if (a->fmt.pix_mp.pixelformat != b->fmt.pix_mp.pixelformat ||
-+        a->fmt.pix_mp.num_planes != b->fmt.pix_mp.num_planes)
-+        return 0;
-+    while (plane != a->fmt.pix_mp.num_planes &&
-+           a->fmt.pix_mp.plane_fmt[plane].bytesperline ==
-+               b->fmt.pix_mp.plane_fmt[plane].bytesperline)
-+        ++plane;
-+
-+    return plane == a->fmt.pix_mp.num_planes;
-+}
-+
-+
- static int v4l2_send_frame(AVCodecContext *avctx, const AVFrame *frame)
- {
-     V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context;
-     V4L2Context *const output = &s->output;
- 
-+    // Signal EOF if needed
-+    if (!frame) {
-+        return ff_v4l2_context_enqueue_frame(output, frame);
-+    }
-+
-+    if (s->input_drm && !output->streamon) {
-+        int rv;
-+        struct v4l2_format req_format = {.type = output->format.type};
-+
-+        // Set format when we first get a buffer
-+        if ((rv = avdrm_to_v4l2(&req_format, frame)) != 0) {
-+            av_log(avctx, AV_LOG_ERROR, "Failed to get V4L2 format from DRM_PRIME frame\n");
-+            return rv;
-+        }
-+
-+        ff_v4l2_context_release(output);
-+
-+        output->format = req_format;
-+
-+        if ((rv = ff_v4l2_context_set_format(output)) != 0) {
-+            av_log(avctx, AV_LOG_ERROR, "Failed to set V4L2 format\n");
-+            return rv;
-+        }
-+
-+        if (!fmt_eq(&req_format, &output->format)) {
-+            av_log(avctx, AV_LOG_ERROR, "Format mismatch after setup\n");
-+            return AVERROR(EINVAL);
-+        }
-+
-+        output->selection.top = frame->crop_top;
-+        output->selection.left = frame->crop_left;
-+        output->selection.width = av_frame_cropped_width(frame);
-+        output->selection.height = av_frame_cropped_height(frame);
-+
-+        if ((rv = ff_v4l2_context_init(output)) != 0) {
-+            av_log(avctx, AV_LOG_ERROR, "Failed to (re)init context\n");
-+            return rv;
-+        }
-+
-+        {
-+            struct v4l2_selection selection = {
-+                .type = V4L2_BUF_TYPE_VIDEO_OUTPUT,
-+                .target = V4L2_SEL_TGT_CROP,
-+                .r = output->selection
-+            };
-+            if (ioctl(s->fd, VIDIOC_S_SELECTION, &selection) != 0) {
-+                av_log(avctx, AV_LOG_WARNING, "S_SELECTION (CROP) %dx%d @ %d,%d failed: %s\n",
-+                       selection.r.width, selection.r.height, selection.r.left, selection.r.top,
-+                       av_err2str(AVERROR(errno)));
-+            }
-+            av_log(avctx, AV_LOG_TRACE, "S_SELECTION (CROP) %dx%d @ %d,%d OK\n",
-+                   selection.r.width, selection.r.height, selection.r.left, selection.r.top);
-+        }
-+    }
-+
- #ifdef V4L2_CID_MPEG_VIDEO_FORCE_KEY_FRAME
--    if (frame && frame->pict_type == AV_PICTURE_TYPE_I)
-+    if (frame->pict_type == AV_PICTURE_TYPE_I)
-         v4l2_set_ext_ctrl(s, MPEG_CID(FORCE_KEY_FRAME), 0, "force key frame", 1);
- #endif
- 
-@@ -310,7 +510,70 @@ static int v4l2_receive_packet(AVCodecCo
-     }
- 
- dequeue:
--    return ff_v4l2_context_dequeue_packet(capture, avpkt);
-+    if ((ret = ff_v4l2_context_dequeue_packet(capture, avpkt)) != 0)
-+        return ret;
-+
-+    if (capture->first_buf == 1) {
-+        uint8_t * data;
-+        const int len = avpkt->size;
-+
-+        // 1st buffer after streamon should be SPS/PPS
-+        capture->first_buf = 2;
-+
-+        // Clear both possible stores so there is no chance of confusion
-+        av_freep(&s->extdata_data);
-+        s->extdata_size = 0;
-+        av_freep(&avctx->extradata);
-+        avctx->extradata_size = 0;
-+
-+        if ((data = av_malloc(len + AV_INPUT_BUFFER_PADDING_SIZE)) != NULL)
-+            memcpy(data, avpkt->data, len);
-+
-+        av_packet_unref(avpkt);
-+
-+        if (data == NULL)
-+            return AVERROR(ENOMEM);
-+
-+        // We need to copy the header, but keep local if not global
-+        if ((avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) != 0) {
-+            avctx->extradata = data;
-+            avctx->extradata_size = len;
-+        }
-+        else {
-+            s->extdata_data = data;
-+            s->extdata_size = len;
-+        }
-+
-+        if ((ret = ff_v4l2_context_dequeue_packet(capture, avpkt)) != 0)
-+            return ret;
-+    }
-+
-+    // First frame must be key so mark as such even if encoder forgot
-+    if (capture->first_buf == 2)
-+        avpkt->flags |= AV_PKT_FLAG_KEY;
-+
-+    // Add SPS/PPS to the start of every key frame if non-global headers
-+    if ((avpkt->flags & AV_PKT_FLAG_KEY) != 0 && s->extdata_size != 0) {
-+        const size_t newlen = s->extdata_size + avpkt->size;
-+        AVBufferRef * const buf = av_buffer_alloc(newlen + AV_INPUT_BUFFER_PADDING_SIZE);
-+
-+        if (buf == NULL) {
-+            av_packet_unref(avpkt);
-+            return AVERROR(ENOMEM);
-+        }
-+
-+        memcpy(buf->data, s->extdata_data, s->extdata_size);
-+        memcpy(buf->data + s->extdata_size, avpkt->data, avpkt->size);
-+
-+        av_buffer_unref(&avpkt->buf);
-+        avpkt->buf = buf;
-+        avpkt->data = buf->data;
-+        avpkt->size = newlen;
-+    }
-+
-+//    av_log(avctx, AV_LOG_INFO, "%s: PTS out=%"PRId64", size=%d, ret=%d\n", __func__, avpkt->pts, avpkt->size, ret);
-+    capture->first_buf = 0;
-+    return 0;
- }
- 
- static av_cold int v4l2_encode_init(AVCodecContext *avctx)
-@@ -322,6 +585,8 @@ static av_cold int v4l2_encode_init(AVCo
-     uint32_t v4l2_fmt_output;
-     int ret;
- 
-+    av_log(avctx, AV_LOG_INFO, " <<< %s: fmt=%d/%d\n", __func__, avctx->pix_fmt, avctx->sw_pix_fmt);
-+
-     ret = ff_v4l2_m2m_create_context(priv, &s);
-     if (ret < 0)
-         return ret;
-@@ -329,13 +594,17 @@ static av_cold int v4l2_encode_init(AVCo
-     capture = &s->capture;
-     output  = &s->output;
- 
-+    s->input_drm = (avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME);
-+
-     /* common settings output/capture */
-     output->height = capture->height = avctx->height;
-     output->width = capture->width = avctx->width;
- 
-     /* output context */
-     output->av_codec_id = AV_CODEC_ID_RAWVIDEO;
--    output->av_pix_fmt = avctx->pix_fmt;
-+    output->av_pix_fmt = !s->input_drm ? avctx->pix_fmt :
-+            avctx->sw_pix_fmt != AV_PIX_FMT_NONE ? avctx->sw_pix_fmt :
-+            AV_PIX_FMT_YUV420P;
- 
-     /* capture context */
-     capture->av_codec_id = avctx->codec_id;
-@@ -354,7 +623,7 @@ static av_cold int v4l2_encode_init(AVCo
-         v4l2_fmt_output = output->format.fmt.pix.pixelformat;
- 
-     pix_fmt_output = ff_v4l2_format_v4l2_to_avfmt(v4l2_fmt_output, AV_CODEC_ID_RAWVIDEO);
--    if (pix_fmt_output != avctx->pix_fmt) {
-+    if (!s->input_drm && pix_fmt_output != avctx->pix_fmt) {
-         const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt_output);
-         av_log(avctx, AV_LOG_ERROR, "Encoder requires %s pixel format.\n", desc->name);
-         return AVERROR(EINVAL);
---- /dev/null
-+++ b/libavcodec/v4l2_req_decode_q.c
-@@ -0,0 +1,84 @@
-+#include <memory.h>
-+#include <semaphore.h>
-+#include <pthread.h>
-+
-+#include "v4l2_req_decode_q.h"
-+
-+int decode_q_in_q(const req_decode_ent * const d)
-+{
-+    return d->in_q;
-+}
-+
-+void decode_q_add(req_decode_q * const q, req_decode_ent * const d)
-+{
-+    pthread_mutex_lock(&q->q_lock);
-+    if (!q->head) {
-+        q->head = d;
-+        q->tail = d;
-+        d->prev = NULL;
-+    }
-+    else {
-+        q->tail->next = d;
-+        d->prev = q->tail;
-+        q->tail = d;
-+    }
-+    d->next = NULL;
-+    d->in_q = 1;
-+    pthread_mutex_unlock(&q->q_lock);
-+}
-+
-+// Remove entry from Q - if head wake-up anything that was waiting
-+void decode_q_remove(req_decode_q * const q, req_decode_ent * const d)
-+{
-+    int try_signal = 0;
-+
-+    if (!d->in_q)
-+        return;
-+
-+    pthread_mutex_lock(&q->q_lock);
-+    if (d->prev)
-+        d->prev->next = d->next;
-+    else {
-+        try_signal = 1;  // Only need to signal if we were head
-+        q->head = d->next;
-+    }
-+
-+    if (d->next)
-+        d->next->prev = d->prev;
-+    else
-+        q->tail = d->prev;
-+
-+    // Not strictly needed but makes debug easier
-+    d->next = NULL;
-+    d->prev = NULL;
-+    d->in_q = 0;
-+    pthread_mutex_unlock(&q->q_lock);
-+
-+    if (try_signal)
-+        pthread_cond_broadcast(&q->q_cond);
-+}
-+
-+void decode_q_wait(req_decode_q * const q, req_decode_ent * const d)
-+{
-+    pthread_mutex_lock(&q->q_lock);
-+
-+    while (q->head != d)
-+        pthread_cond_wait(&q->q_cond, &q->q_lock);
-+
-+    pthread_mutex_unlock(&q->q_lock);
-+}
-+
-+void decode_q_uninit(req_decode_q * const q)
-+{
-+    pthread_mutex_destroy(&q->q_lock);
-+    pthread_cond_destroy(&q->q_cond);
-+}
-+
-+void decode_q_init(req_decode_q * const q)
-+{
-+    memset(q, 0, sizeof(*q));
-+    pthread_mutex_init(&q->q_lock, NULL);
-+    pthread_cond_init(&q->q_cond, NULL);
-+}
-+
-+
---- /dev/null
-+++ b/libavcodec/v4l2_req_decode_q.h
-@@ -0,0 +1,25 @@
-+#ifndef AVCODEC_V4L2_REQ_DECODE_Q_H
-+#define AVCODEC_V4L2_REQ_DECODE_Q_H
-+
-+typedef struct req_decode_ent {
-+    struct req_decode_ent * next;
-+    struct req_decode_ent * prev;
-+    int in_q;
-+} req_decode_ent;
-+
-+typedef struct req_decode_q {
-+    pthread_mutex_t q_lock;
-+    pthread_cond_t q_cond;
-+    req_decode_ent * head;
-+    req_decode_ent * tail;
-+} req_decode_q;
-+
-+int decode_q_in_q(const req_decode_ent * const d);
-+void decode_q_add(req_decode_q * const q, req_decode_ent * const d);
-+void decode_q_remove(req_decode_q * const q, req_decode_ent * const d);
-+void decode_q_wait(req_decode_q * const q, req_decode_ent * const d);
-+void decode_q_uninit(req_decode_q * const q);
-+void decode_q_init(req_decode_q * const q);
-+
-+#endif
-+
---- /dev/null
-+++ b/libavcodec/v4l2_req_devscan.c
-@@ -0,0 +1,449 @@
-+#include <errno.h>
-+#include <fcntl.h>
-+#include <libudev.h>
-+#include <stdlib.h>
-+#include <string.h>
-+#include <unistd.h>
-+
-+#include <sys/ioctl.h>
-+#include <sys/sysmacros.h>
-+
-+#include <linux/media.h>
-+#include <linux/videodev2.h>
-+
-+#include "v4l2_req_devscan.h"
-+#include "v4l2_req_utils.h"
-+
-+struct decdev {
-+    enum v4l2_buf_type src_type;
-+    uint32_t src_fmt_v4l2;
-+    const char * vname;
-+    const char * mname;
-+};
-+
-+struct devscan {
-+    struct decdev env;
-+    unsigned int dev_size;
-+    unsigned int dev_count;
-+    struct decdev *devs;
-+};
-+
-+static int video_src_pixfmt_supported(uint32_t fmt)
-+{
-+    return 1;
-+}
-+
-+static void v4l2_setup_format(struct v4l2_format *format, unsigned int type,
-+                  unsigned int width, unsigned int height,
-+                  unsigned int pixelformat)
-+{
-+    unsigned int sizeimage;
-+
-+    memset(format, 0, sizeof(*format));
-+    format->type = type;
-+
-+    sizeimage = V4L2_TYPE_IS_OUTPUT(type) ? 4 * 1024 * 1024 : 0;
-+
-+    if (V4L2_TYPE_IS_MULTIPLANAR(type)) {
-+        format->fmt.pix_mp.width = width;
-+        format->fmt.pix_mp.height = height;
-+        format->fmt.pix_mp.plane_fmt[0].sizeimage = sizeimage;
-+        format->fmt.pix_mp.pixelformat = pixelformat;
-+    } else {
-+        format->fmt.pix.width = width;
-+        format->fmt.pix.height = height;
-+        format->fmt.pix.sizeimage = sizeimage;
-+        format->fmt.pix.pixelformat = pixelformat;
-+    }
-+}
-+
-+static int v4l2_set_format(int video_fd, unsigned int type, unsigned int pixelformat,
-+            unsigned int width, unsigned int height)
-+{
-+    struct v4l2_format format;
-+
-+    v4l2_setup_format(&format, type, width, height, pixelformat);
-+
-+    return ioctl(video_fd, VIDIOC_S_FMT, &format) ? -errno : 0;
-+}
-+
-+static int v4l2_query_capabilities(int video_fd, unsigned int *capabilities)
-+{
-+    struct v4l2_capability capability = { 0 };
-+    int rc;
-+
-+    rc = ioctl(video_fd, VIDIOC_QUERYCAP, &capability);
-+    if (rc < 0)
-+        return -errno;
-+
-+    if (capabilities != NULL) {
-+        if ((capability.capabilities & V4L2_CAP_DEVICE_CAPS) != 0)
-+            *capabilities = capability.device_caps;
-+        else
-+            *capabilities = capability.capabilities;
-+    }
-+
-+    return 0;
-+}
-+
-+static int devscan_add(struct devscan *const scan,
-+                       enum v4l2_buf_type src_type,
-+                       uint32_t src_fmt_v4l2,
-+                       const char * vname,
-+                       const char * mname)
-+{
-+    struct decdev *d;
-+
-+    if (scan->dev_size <= scan->dev_count) {
-+        unsigned int n = !scan->dev_size ? 4 : scan->dev_size * 2;
-+        d = realloc(scan->devs, n * sizeof(*d));
-+        if (!d)
-+            return -ENOMEM;
-+        scan->devs = d;
-+        scan->dev_size = n;
-+    }
-+
-+    d = scan->devs + scan->dev_count;
-+    d->src_type = src_type;
-+    d->src_fmt_v4l2 = src_fmt_v4l2;
-+    d->vname = strdup(vname);
-+    if (!d->vname)
-+        return -ENOMEM;
-+    d->mname = strdup(mname);
-+    if (!d->mname) {
-+        free((char *)d->vname);
-+        return -ENOMEM;
-+    }
-+    ++scan->dev_count;
-+    return 0;
-+}
-+
-+void devscan_delete(struct devscan **const pScan)
-+{
-+    unsigned int i;
-+    struct devscan * const scan = *pScan;
-+
-+    if (!scan)
-+        return;
-+    *pScan = NULL;
-+
-+    for (i = 0; i < scan->dev_count; ++i) {
-+        free((char*)scan->devs[i].mname);
-+        free((char*)scan->devs[i].vname);
-+    }
-+    free(scan->devs);
-+    free(scan);
-+}
-+
-+#define REQ_BUF_CAPS (\
-+    V4L2_BUF_CAP_SUPPORTS_DMABUF |\
-+    V4L2_BUF_CAP_SUPPORTS_REQUESTS |\
-+    V4L2_BUF_CAP_SUPPORTS_M2M_HOLD_CAPTURE_BUF)
-+
-+static void probe_formats(void * const dc,
-+              struct devscan *const scan,
-+              const int fd,
-+              const unsigned int type_v4l2,
-+              const char *const mpath,
-+              const char *const vpath)
-+{
-+    unsigned int i;
-+    for (i = 0;; ++i) {
-+        struct v4l2_fmtdesc fmtdesc = {
-+            .index = i,
-+            .type = type_v4l2
-+        };
-+        struct v4l2_requestbuffers rbufs = {
-+            .count = 0,
-+            .type = type_v4l2,
-+            .memory = V4L2_MEMORY_MMAP
-+        };
-+        while (ioctl(fd, VIDIOC_ENUM_FMT, &fmtdesc)) {
-+            if (errno == EINTR)
-+                continue;
-+            if (errno != EINVAL)
-+                request_err(dc, "Enum[%d] failed for type=%d\n", i, type_v4l2);
-+            return;
-+        }
-+        if (!video_src_pixfmt_supported(fmtdesc.pixelformat))
-+            continue;
-+
-+        if (v4l2_set_format(fd, type_v4l2, fmtdesc.pixelformat, 720, 480)) {
-+            request_debug(dc, "Set failed for type=%d, pf=%.4s\n", type_v4l2, (char*)&fmtdesc.pixelformat);
-+            continue;
-+        }
-+
-+        while (ioctl(fd, VIDIOC_REQBUFS, &rbufs)) {
-+            if (errno != EINTR) {
-+                request_debug(dc, "%s: Reqbufs failed\n", vpath);
-+                continue;
-+            }
-+        }
-+
-+        if ((rbufs.capabilities & REQ_BUF_CAPS) != REQ_BUF_CAPS) {
-+            request_debug(dc, "%s: Buf caps %#x insufficient\n", vpath, rbufs.capabilities);
-+            continue;
-+        }
-+
-+        request_debug(dc, "Adding: %s,%s pix=%#x, type=%d\n",
-+                 mpath, vpath, fmtdesc.pixelformat, type_v4l2);
-+        devscan_add(scan, type_v4l2, fmtdesc.pixelformat, vpath, mpath);
-+    }
-+}
-+
-+
-+static int probe_video_device(void * const dc,
-+                   struct udev_device *const device,
-+                   struct devscan *const scan,
-+                   const char *const mpath)
-+{
-+    int ret;
-+    unsigned int capabilities = 0;
-+    int video_fd = -1;
-+
-+    const char *path = udev_device_get_devnode(device);
-+    if (!path) {
-+        request_err(dc, "%s: get video device devnode failed\n", __func__);
-+        ret = -EINVAL;
-+        goto fail;
-+    }
-+
-+    video_fd = open(path, O_RDWR, 0);
-+    if (video_fd == -1) {
-+        ret = -errno;
-+        request_err(dc, "%s: opening %s failed, %s (%d)\n", __func__, path, strerror(errno), errno);
-+        goto fail;
-+    }
-+
-+    ret = v4l2_query_capabilities(video_fd, &capabilities);
-+    if (ret < 0) {
-+        request_err(dc, "%s: get video capability failed, %s (%d)\n", __func__, strerror(-ret), -ret);
-+        goto fail;
-+    }
-+
-+    request_debug(dc, "%s: path=%s capabilities=%#x\n", __func__, path, capabilities);
-+
-+    if (!(capabilities & V4L2_CAP_STREAMING)) {
-+        request_debug(dc, "%s: missing required streaming capability\n", __func__);
-+        ret = -EINVAL;
-+        goto fail;
-+    }
-+
-+    if (!(capabilities & (V4L2_CAP_VIDEO_M2M_MPLANE | V4L2_CAP_VIDEO_M2M))) {
-+        request_debug(dc, "%s: missing required mem2mem capability\n", __func__);
-+        ret = -EINVAL;
-+        goto fail;
-+    }
-+
-+    /* Should check capture formats too... */
-+    if ((capabilities & V4L2_CAP_VIDEO_M2M) != 0)
-+        probe_formats(dc, scan, video_fd, V4L2_BUF_TYPE_VIDEO_OUTPUT, mpath, path);
-+    if ((capabilities & V4L2_CAP_VIDEO_M2M_MPLANE) != 0)
-+        probe_formats(dc, scan, video_fd, V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE, mpath, path);
-+
-+    close(video_fd);
-+    return 0;
-+
-+fail:
-+    if (video_fd >= 0)
-+        close(video_fd);
-+    return ret;
-+}
-+
-+static int probe_media_device(void * const dc,
-+                   struct udev_device *const device,
-+                   struct devscan *const scan)
-+{
-+    int ret;
-+    int rv;
-+    struct media_device_info device_info = { 0 };
-+    struct media_v2_topology topology = { 0 };
-+    struct media_v2_interface *interfaces = NULL;
-+    struct udev *udev = udev_device_get_udev(device);
-+    struct udev_device *video_device;
-+    dev_t devnum;
-+    int media_fd = -1;
-+
-+    const char *path = udev_device_get_devnode(device);
-+    if (!path) {
-+        request_err(dc, "%s: get media device devnode failed\n", __func__);
-+        ret = -EINVAL;
-+        goto fail;
-+    }
-+
-+    media_fd = open(path, O_RDWR, 0);
-+    if (media_fd < 0) {
-+        ret = -errno;
-+        request_err(dc, "%s: opening %s failed, %s (%d)\n", __func__, path, strerror(-ret), -ret);
-+        goto fail;
-+    }
-+
-+    rv = ioctl(media_fd, MEDIA_IOC_DEVICE_INFO, &device_info);
-+    if (rv < 0) {
-+        ret = -errno;
-+        request_err(dc, "%s: get media device info failed, %s (%d)\n", __func__, strerror(-ret), -ret);
-+        goto fail;
-+    }
-+
-+    rv = ioctl(media_fd, MEDIA_IOC_G_TOPOLOGY, &topology);
-+    if (rv < 0) {
-+        ret = -errno;
-+        request_err(dc, "%s: get media topology failed, %s (%d)\n", __func__, strerror(-ret), -ret);
-+        goto fail;
-+    }
-+
-+    if (topology.num_interfaces <= 0) {
-+        request_err(dc, "%s: media device has no interfaces\n", __func__);
-+        ret = -EINVAL;
-+        goto fail;
-+    }
-+
-+    interfaces = calloc(topology.num_interfaces, sizeof(*interfaces));
-+    if (!interfaces) {
-+        request_err(dc, "%s: allocating media interface struct failed\n", __func__);
-+        ret = -ENOMEM;
-+        goto fail;
-+    }
-+
-+    topology.ptr_interfaces = (__u64)(uintptr_t)interfaces;
-+    rv = ioctl(media_fd, MEDIA_IOC_G_TOPOLOGY, &topology);
-+    if (rv < 0) {
-+        ret = -errno;
-+        request_err(dc, "%s: get media topology failed, %s (%d)\n", __func__, strerror(-ret), -ret);
-+        goto fail;
-+    }
-+
-+    for (int i = 0; i < topology.num_interfaces; i++) {
-+        if (interfaces[i].intf_type != MEDIA_INTF_T_V4L_VIDEO)
-+            continue;
-+
-+        devnum = makedev(interfaces[i].devnode.major, interfaces[i].devnode.minor);
-+        video_device = udev_device_new_from_devnum(udev, 'c', devnum);
-+        if (!video_device) {
-+            ret = -errno;
-+            request_err(dc, "%s: video_device[%d]=%p\n", __func__, i, video_device);
-+            continue;
-+        }
-+
-+        ret = probe_video_device(dc, video_device, scan, path);
-+        udev_device_unref(video_device);
-+
-+        if (ret != 0)
-+            goto fail;
-+    }
-+
-+fail:
-+    free(interfaces);
-+    if (media_fd != -1)
-+        close(media_fd);
-+    return ret;
-+}
-+
-+const char *decdev_media_path(const struct decdev *const dev)
-+{
-+    return !dev ? NULL : dev->mname;
-+}
-+
-+const char *decdev_video_path(const struct decdev *const dev)
-+{
-+    return !dev ? NULL : dev->vname;
-+}
-+
-+enum v4l2_buf_type decdev_src_type(const struct decdev *const dev)
-+{
-+    return !dev ? 0 : dev->src_type;
-+}
-+
-+uint32_t decdev_src_pixelformat(const struct decdev *const dev)
-+{
-+    return !dev ? 0 : dev->src_fmt_v4l2;
-+}
-+
-+
-+const struct decdev *devscan_find(struct devscan *const scan,
-+                  const uint32_t src_fmt_v4l2)
-+{
-+    unsigned int i;
-+
-+    if (scan->env.mname && scan->env.vname)
-+        return &scan->env;
-+
-+    if (!src_fmt_v4l2)
-+        return scan->dev_count ? scan->devs + 0 : NULL;
-+
-+    for (i = 0; i != scan->dev_count; ++i) {
-+        if (scan->devs[i].src_fmt_v4l2 == src_fmt_v4l2)
-+            return scan->devs + i;
-+    }
-+    return NULL;
-+}
-+
-+int devscan_build(void * const dc, struct devscan **pscan)
-+{
-+    int ret;
-+    struct udev *udev;
-+    struct udev_enumerate *enumerate;
-+    struct udev_list_entry *devices;
-+    struct udev_list_entry *entry;
-+    struct udev_device *device;
-+    struct devscan * scan;
-+
-+    *pscan = NULL;
-+
-+    scan = calloc(1, sizeof(*scan));
-+    if (!scan) {
-+        ret = -ENOMEM;
-+        goto fail;
-+    }
-+
-+    scan->env.mname = getenv("LIBVA_V4L2_REQUEST_MEDIA_PATH");
-+    scan->env.vname = getenv("LIBVA_V4L2_REQUEST_VIDEO_PATH");
-+    if (scan->env.mname && scan->env.vname) {
-+        request_info(dc, "Media/video device env overrides found: %s,%s\n",
-+                 scan->env.mname, scan->env.vname);
-+        *pscan = scan;
-+        return 0;
-+    }
-+
-+    udev = udev_new();
-+    if (!udev) {
-+        request_err(dc, "%s: allocating udev context failed\n", __func__);
-+        ret = -ENOMEM;
-+        goto fail;
-+    }
-+
-+    enumerate = udev_enumerate_new(udev);
-+    if (!enumerate) {
-+        request_err(dc, "%s: allocating udev enumerator failed\n", __func__);
-+        ret = -ENOMEM;
-+        goto fail;
-+    }
-+
-+    udev_enumerate_add_match_subsystem(enumerate, "media");
-+    udev_enumerate_scan_devices(enumerate);
-+
-+    devices = udev_enumerate_get_list_entry(enumerate);
-+    udev_list_entry_foreach(entry, devices) {
-+        const char *path = udev_list_entry_get_name(entry);
-+        if (!path)
-+            continue;
-+
-+        device = udev_device_new_from_syspath(udev, path);
-+        if (!device)
-+            continue;
-+
-+        probe_media_device(dc, device, scan);
-+        udev_device_unref(device);
-+    }
-+
-+    udev_enumerate_unref(enumerate);
-+
-+    *pscan = scan;
-+    return 0;
-+
-+fail:
-+    udev_unref(udev);
-+    devscan_delete(&scan);
-+    return ret;
-+}
-+
---- /dev/null
-+++ b/libavcodec/v4l2_req_devscan.h
-@@ -0,0 +1,21 @@
-+#ifndef _DEVSCAN_H_
-+#define _DEVSCAN_H_
-+
-+struct devscan;
-+struct decdev;
-+enum v4l2_buf_type;
-+
-+/* These return pointers to data in the devscan structure and so are vaild
-+ * for the lifetime of that
-+ */
-+const char *decdev_media_path(const struct decdev *const dev);
-+const char *decdev_video_path(const struct decdev *const dev);
-+enum v4l2_buf_type decdev_src_type(const struct decdev *const dev);
-+uint32_t decdev_src_pixelformat(const struct decdev *const dev);
-+
-+const struct decdev *devscan_find(struct devscan *const scan, const uint32_t src_fmt_v4l2);
-+
-+int devscan_build(void * const dc, struct devscan **pscan);
-+void devscan_delete(struct devscan **const pScan);
-+
-+#endif
---- /dev/null
-+++ b/libavcodec/v4l2_req_dmabufs.c
-@@ -0,0 +1,266 @@
-+#include <stdio.h>
-+#include <stdlib.h>
-+#include <unistd.h>
-+#include <inttypes.h>
-+#include <fcntl.h>
-+#include <errno.h>
-+#include <string.h>
-+#include <sys/ioctl.h>
-+#include <sys/mman.h>
-+#include <linux/mman.h>
-+#include <linux/dma-buf.h>
-+#include <linux/dma-heap.h>
-+
-+#include "v4l2_req_dmabufs.h"
-+#include "v4l2_req_utils.h"
-+
-+#define DMABUF_NAME1  "/dev/dma_heap/linux,cma"
-+#define DMABUF_NAME2  "/dev/dma_heap/reserved"
-+
-+#define TRACE_ALLOC 0
-+
-+struct dmabufs_ctl {
-+    int fd;
-+    size_t page_size;
-+};
-+
-+struct dmabuf_h {
-+    int fd;
-+    size_t size;
-+    size_t len;
-+    void * mapptr;
-+};
-+
-+#if TRACE_ALLOC
-+static unsigned int total_bufs = 0;
-+static size_t total_size = 0;
-+#endif
-+
-+struct dmabuf_h * dmabuf_import(int fd, size_t size)
-+{
-+    struct dmabuf_h *dh;
-+
-+    fd = dup(fd);
-+    if (fd < 0  || size == 0)
-+        return NULL;
-+
-+    dh = malloc(sizeof(*dh));
-+    if (!dh) {
-+        close(fd);
-+        return NULL;
-+    }
-+
-+    *dh = (struct dmabuf_h) {
-+        .fd = fd,
-+        .size = size,
-+        .mapptr = MAP_FAILED
-+    };
-+
-+#if TRACE_ALLOC
-+    ++total_bufs;
-+    total_size += dh->size;
-+    request_log("%s: Import: %zd, total=%zd, bufs=%d\n", __func__, dh->size, total_size, total_bufs);
-+#endif
-+
-+    return dh;
-+}
-+
-+struct dmabuf_h * dmabuf_realloc(struct dmabufs_ctl * dbsc, struct dmabuf_h * old, size_t size)
-+{
-+    struct dmabuf_h * dh;
-+    struct dma_heap_allocation_data data = {
-+        .len = (size + dbsc->page_size - 1) & ~(dbsc->page_size - 1),
-+        .fd = 0,
-+        .fd_flags = O_RDWR,
-+        .heap_flags = 0
-+    };
-+
-+    if (old != NULL) {
-+        if (old->size == data.len) {
-+            return old;
-+        }
-+        dmabuf_free(old);
-+    }
-+
-+    if (size == 0 ||
-+        (dh = malloc(sizeof(*dh))) == NULL)
-+        return NULL;
-+
-+    while (ioctl(dbsc->fd, DMA_HEAP_IOCTL_ALLOC, &data)) {
-+        int err = errno;
-+        request_log("Failed to alloc %" PRIu64 " from dma-heap(fd=%d): %d (%s)\n",
-+                (uint64_t)data.len,
-+                dbsc->fd,
-+                err,
-+                strerror(err));
-+        if (err == EINTR)
-+            continue;
-+        goto fail;
-+    }
-+
-+    *dh = (struct dmabuf_h){
-+        .fd = data.fd,
-+        .size = (size_t)data.len,
-+        .mapptr = MAP_FAILED
-+    };
-+
-+#if TRACE_ALLOC
-+    ++total_bufs;
-+    total_size += dh->size;
-+    request_log("%s: Alloc: %zd, total=%zd, bufs=%d\n", __func__, dh->size, total_size, total_bufs);
-+#endif
-+
-+    return dh;
-+
-+fail:
-+    free(dh);
-+    return NULL;
-+}
-+
-+int dmabuf_sync(struct dmabuf_h * const dh, unsigned int flags)
-+{
-+    struct dma_buf_sync sync = {
-+        .flags = flags
-+    };
-+    while (ioctl(dh->fd, DMA_BUF_IOCTL_SYNC, &sync) == -1) {
-+        const int err = errno;
-+        if (errno == EINTR)
-+            continue;
-+        request_log("%s: ioctl failed: flags=%#x\n", __func__, flags);
-+        return -err;
-+    }
-+    return 0;
-+}
-+
-+int dmabuf_write_start(struct dmabuf_h * const dh)
-+{
-+    return dmabuf_sync(dh, DMA_BUF_SYNC_START | DMA_BUF_SYNC_WRITE);
-+}
-+
-+int dmabuf_write_end(struct dmabuf_h * const dh)
-+{
-+    return dmabuf_sync(dh, DMA_BUF_SYNC_END | DMA_BUF_SYNC_WRITE);
-+}
-+
-+int dmabuf_read_start(struct dmabuf_h * const dh)
-+{
-+    if (!dmabuf_map(dh))
-+        return -1;
-+    return dmabuf_sync(dh, DMA_BUF_SYNC_START | DMA_BUF_SYNC_READ);
-+}
-+
-+int dmabuf_read_end(struct dmabuf_h * const dh)
-+{
-+    return dmabuf_sync(dh, DMA_BUF_SYNC_END | DMA_BUF_SYNC_READ);
-+}
-+
-+
-+void * dmabuf_map(struct dmabuf_h * const dh)
-+{
-+    if (!dh)
-+        return NULL;
-+    if (dh->mapptr != MAP_FAILED)
-+        return dh->mapptr;
-+    dh->mapptr = mmap(NULL, dh->size,
-+              PROT_READ | PROT_WRITE,
-+              MAP_SHARED | MAP_POPULATE,
-+              dh->fd, 0);
-+    if (dh->mapptr == MAP_FAILED) {
-+        request_log("%s: Map failed\n", __func__);
-+        return NULL;
-+    }
-+    return dh->mapptr;
-+}
-+
-+int dmabuf_fd(const struct dmabuf_h * const dh)
-+{
-+    if (!dh)
-+        return -1;
-+    return dh->fd;
-+}
-+
-+size_t dmabuf_size(const struct dmabuf_h * const dh)
-+{
-+    if (!dh)
-+        return 0;
-+    return dh->size;
-+}
-+
-+size_t dmabuf_len(const struct dmabuf_h * const dh)
-+{
-+    if (!dh)
-+        return 0;
-+    return dh->len;
-+}
-+
-+void dmabuf_len_set(struct dmabuf_h * const dh, const size_t len)
-+{
-+    dh->len = len;
-+}
-+
-+
-+
-+void dmabuf_free(struct dmabuf_h * dh)
-+{
-+    if (!dh)
-+        return;
-+
-+#if TRACE_ALLOC
-+    --total_bufs;
-+    total_size -= dh->size;
-+    request_log("%s: Free: %zd, total=%zd, bufs=%d\n", __func__, dh->size, total_size, total_bufs);
-+#endif
-+
-+    if (dh->mapptr != MAP_FAILED)
-+        munmap(dh->mapptr, dh->size);
-+    while (close(dh->fd) == -1 && errno == EINTR)
-+        /* loop */;
-+    free(dh);
-+}
-+
-+struct dmabufs_ctl * dmabufs_ctl_new(void)
-+{
-+    struct dmabufs_ctl * dbsc = malloc(sizeof(*dbsc));
-+
-+    if (!dbsc)
-+        return NULL;
-+
-+    while ((dbsc->fd = open(DMABUF_NAME1, O_RDWR)) == -1 &&
-+           errno == EINTR)
-+        /* Loop */;
-+
-+    if (dbsc->fd == -1) {
-+        while ((dbsc->fd = open(DMABUF_NAME2, O_RDWR)) == -1 &&
-+               errno == EINTR)
-+            /* Loop */;
-+        if (dbsc->fd == -1) {
-+            request_log("Unable to open either %s or %s\n",
-+                    DMABUF_NAME1, DMABUF_NAME2);
-+            goto fail;
-+        }
-+    }
-+
-+    dbsc->page_size = (size_t)sysconf(_SC_PAGE_SIZE);
-+
-+    return dbsc;
-+
-+fail:
-+    free(dbsc);
-+    return NULL;
-+}
-+
-+void dmabufs_ctl_delete(struct dmabufs_ctl ** const pDbsc)
-+{
-+    struct dmabufs_ctl * const dbsc = *pDbsc;
-+
-+    if (!dbsc)
-+        return;
-+    *pDbsc = NULL;
-+
-+    while (close(dbsc->fd) == -1 && errno == EINTR)
-+        /* loop */;
-+
-+    free(dbsc);
-+}
-+
-+
---- /dev/null
-+++ b/libavcodec/v4l2_req_dmabufs.h
-@@ -0,0 +1,38 @@
-+#ifndef DMABUFS_H
-+#define DMABUFS_H
-+
-+struct dmabufs_ctl;
-+struct dmabuf_h;
-+
-+struct dmabufs_ctl * dmabufs_ctl_new(void);
-+void dmabufs_ctl_delete(struct dmabufs_ctl ** const pdbsc);
-+
-+// Need not preserve old contents
-+// On NULL return old buffer is freed
-+struct dmabuf_h * dmabuf_realloc(struct dmabufs_ctl * dbsc, struct dmabuf_h *, size_t size);
-+
-+static inline struct dmabuf_h * dmabuf_alloc(struct dmabufs_ctl * dbsc, size_t size) {
-+    return dmabuf_realloc(dbsc, NULL, size);
-+}
-+/* Create from existing fd - dups(fd) */
-+struct dmabuf_h * dmabuf_import(int fd, size_t size);
-+void * dmabuf_map(struct dmabuf_h * const dh);
-+
-+/* flags from linux/dmabuf.h DMA_BUF_SYNC_xxx */
-+int dmabuf_sync(struct dmabuf_h * const dh, unsigned int flags);
-+
-+int dmabuf_write_start(struct dmabuf_h * const dh);
-+int dmabuf_write_end(struct dmabuf_h * const dh);
-+int dmabuf_read_start(struct dmabuf_h * const dh);
-+int dmabuf_read_end(struct dmabuf_h * const dh);
-+
-+int dmabuf_fd(const struct dmabuf_h * const dh);
-+/* Allocated size */
-+size_t dmabuf_size(const struct dmabuf_h * const dh);
-+/* Bytes in use */
-+size_t dmabuf_len(const struct dmabuf_h * const dh);
-+/* Set bytes in use */
-+void dmabuf_len_set(struct dmabuf_h * const dh, const size_t len);
-+void dmabuf_free(struct dmabuf_h * dh);
-+
-+#endif
---- /dev/null
-+++ b/libavcodec/v4l2_req_hevc_v1.c
-@@ -0,0 +1,3 @@
-+#define HEVC_CTRLS_VERSION 1
-+#include "v4l2_req_hevc_vx.c"
-+
---- /dev/null
-+++ b/libavcodec/v4l2_req_hevc_v2.c
-@@ -0,0 +1,3 @@
-+#define HEVC_CTRLS_VERSION 2
-+#include "v4l2_req_hevc_vx.c"
-+
---- /dev/null
-+++ b/libavcodec/v4l2_req_hevc_v3.c
-@@ -0,0 +1,3 @@
-+#define HEVC_CTRLS_VERSION 3
-+#include "v4l2_req_hevc_vx.c"
-+
---- /dev/null
-+++ b/libavcodec/v4l2_req_hevc_vx.c
-@@ -0,0 +1,1228 @@
-+// File included by v4l2_req_hevc_v* - not compiled on its own
-+
-+#include "decode.h"
-+#include "hevcdec.h"
-+#include "hwconfig.h"
-+
-+#include "v4l2_request_hevc.h"
-+
-+#if HEVC_CTRLS_VERSION == 1
-+#include "hevc-ctrls-v1.h"
-+
-+// Fixup renamed entries
-+#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT
-+
-+#elif HEVC_CTRLS_VERSION == 2
-+#include "hevc-ctrls-v2.h"
-+#elif HEVC_CTRLS_VERSION == 3
-+#include "hevc-ctrls-v3.h"
-+#else
-+#error Unknown HEVC_CTRLS_VERSION
-+#endif
-+
-+#include "libavutil/hwcontext_drm.h"
-+
-+#include <semaphore.h>
-+#include <pthread.h>
-+
-+#include "v4l2_req_devscan.h"
-+#include "v4l2_req_dmabufs.h"
-+#include "v4l2_req_pollqueue.h"
-+#include "v4l2_req_media.h"
-+#include "v4l2_req_utils.h"
-+
-+// Attached to buf[0] in frame
-+// Pooled in hwcontext so generally create once - 1/frame
-+typedef struct V4L2MediaReqDescriptor {
-+    AVDRMFrameDescriptor drm;
-+
-+    // Media
-+    uint64_t timestamp;
-+    struct qent_dst * qe_dst;
-+
-+    // Decode only - should be NULL by the time we emit the frame
-+    struct req_decode_ent decode_ent;
-+
-+    struct media_request *req;
-+    struct qent_src *qe_src;
-+
-+#if HEVC_CTRLS_VERSION >= 2
-+    struct v4l2_ctrl_hevc_decode_params dec;
-+#endif
-+
-+    size_t num_slices;
-+    size_t alloced_slices;
-+    struct v4l2_ctrl_hevc_slice_params * slice_params;
-+    struct slice_info * slices;
-+
-+} V4L2MediaReqDescriptor;
-+
-+struct slice_info {
-+    const uint8_t * ptr;
-+    size_t len; // bytes
-+};
-+
-+// Handy container for accumulating controls before setting
-+struct req_controls {
-+    int has_scaling;
-+    struct timeval tv;
-+    struct v4l2_ctrl_hevc_sps sps;
-+    struct v4l2_ctrl_hevc_pps pps;
-+    struct v4l2_ctrl_hevc_scaling_matrix scaling_matrix;
-+};
-+
-+//static uint8_t nalu_slice_start_code[] = { 0x00, 0x00, 0x01 };
-+
-+
-+// Get an FFmpeg format from the v4l2 format
-+static enum AVPixelFormat pixel_format_from_format(const struct v4l2_format *const format)
-+{
-+    switch (V4L2_TYPE_IS_MULTIPLANAR(format->type) ?
-+            format->fmt.pix_mp.pixelformat : format->fmt.pix.pixelformat) {
-+    case V4L2_PIX_FMT_YUV420:
-+        return AV_PIX_FMT_YUV420P;
-+    case V4L2_PIX_FMT_NV12:
-+        return AV_PIX_FMT_NV12;
-+#if CONFIG_SAND
-+    case V4L2_PIX_FMT_NV12_COL128:
-+        return AV_PIX_FMT_RPI4_8;
-+    case V4L2_PIX_FMT_NV12_10_COL128:
-+        return AV_PIX_FMT_RPI4_10;
-+#endif
-+    default:
-+        break;
-+    }
-+    return AV_PIX_FMT_NONE;
-+}
-+
-+static inline uint64_t frame_capture_dpb(const AVFrame * const frame)
-+{
-+    const V4L2MediaReqDescriptor *const rd = (V4L2MediaReqDescriptor *)frame->data[0];
-+    return rd->timestamp;
-+}
-+
-+static inline void frame_set_capture_dpb(AVFrame * const frame, const uint64_t dpb_stamp)
-+{
-+    V4L2MediaReqDescriptor *const rd = (V4L2MediaReqDescriptor *)frame->data[0];
-+    rd->timestamp = dpb_stamp;
-+}
-+
-+static void fill_pred_table(const HEVCContext *h, struct v4l2_hevc_pred_weight_table *table)
-+{
-+    int32_t luma_weight_denom, chroma_weight_denom;
-+    const SliceHeader *sh = &h->sh;
-+
-+    if (sh->slice_type == HEVC_SLICE_I ||
-+        (sh->slice_type == HEVC_SLICE_P && !h->ps.pps->weighted_pred_flag) ||
-+        (sh->slice_type == HEVC_SLICE_B && !h->ps.pps->weighted_bipred_flag))
-+        return;
-+
-+    table->luma_log2_weight_denom = sh->luma_log2_weight_denom;
-+
-+    if (h->ps.sps->chroma_format_idc)
-+        table->delta_chroma_log2_weight_denom = sh->chroma_log2_weight_denom - sh->luma_log2_weight_denom;
-+
-+    luma_weight_denom = (1 << sh->luma_log2_weight_denom);
-+    chroma_weight_denom = (1 << sh->chroma_log2_weight_denom);
-+
-+    for (int i = 0; i < 15 && i < sh->nb_refs[L0]; i++) {
-+        table->delta_luma_weight_l0[i] = sh->luma_weight_l0[i] - luma_weight_denom;
-+        table->luma_offset_l0[i] = sh->luma_offset_l0[i];
-+        table->delta_chroma_weight_l0[i][0] = sh->chroma_weight_l0[i][0] - chroma_weight_denom;
-+        table->delta_chroma_weight_l0[i][1] = sh->chroma_weight_l0[i][1] - chroma_weight_denom;
-+        table->chroma_offset_l0[i][0] = sh->chroma_offset_l0[i][0];
-+        table->chroma_offset_l0[i][1] = sh->chroma_offset_l0[i][1];
-+    }
-+
-+    if (sh->slice_type != HEVC_SLICE_B)
-+        return;
-+
-+    for (int i = 0; i < 15 && i < sh->nb_refs[L1]; i++) {
-+        table->delta_luma_weight_l1[i] = sh->luma_weight_l1[i] - luma_weight_denom;
-+        table->luma_offset_l1[i] = sh->luma_offset_l1[i];
-+        table->delta_chroma_weight_l1[i][0] = sh->chroma_weight_l1[i][0] - chroma_weight_denom;
-+        table->delta_chroma_weight_l1[i][1] = sh->chroma_weight_l1[i][1] - chroma_weight_denom;
-+        table->chroma_offset_l1[i][0] = sh->chroma_offset_l1[i][0];
-+        table->chroma_offset_l1[i][1] = sh->chroma_offset_l1[i][1];
-+    }
-+}
-+
-+#if HEVC_CTRLS_VERSION <= 2
-+static int find_frame_rps_type(const HEVCContext *h, uint64_t timestamp)
-+{
-+    const HEVCFrame *frame;
-+    int i;
-+
-+    for (i = 0; i < h->rps[ST_CURR_BEF].nb_refs; i++) {
-+        frame = h->rps[ST_CURR_BEF].ref[i];
-+        if (frame && timestamp == frame_capture_dpb(frame->frame))
-+            return V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE;
-+    }
-+
-+    for (i = 0; i < h->rps[ST_CURR_AFT].nb_refs; i++) {
-+        frame = h->rps[ST_CURR_AFT].ref[i];
-+        if (frame && timestamp == frame_capture_dpb(frame->frame))
-+            return V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER;
-+    }
-+
-+    for (i = 0; i < h->rps[LT_CURR].nb_refs; i++) {
-+        frame = h->rps[LT_CURR].ref[i];
-+        if (frame && timestamp == frame_capture_dpb(frame->frame))
-+            return V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR;
-+    }
-+
-+    return 0;
-+}
-+#endif
-+
-+static unsigned int
-+get_ref_pic_index(const HEVCContext *h, const HEVCFrame *frame,
-+                  const struct v4l2_hevc_dpb_entry * const entries,
-+                  const unsigned int num_entries)
-+{
-+    uint64_t timestamp;
-+
-+    if (!frame)
-+        return 0;
-+
-+    timestamp = frame_capture_dpb(frame->frame);
-+
-+    for (unsigned int i = 0; i < num_entries; i++) {
-+        if (entries[i].timestamp == timestamp)
-+            return i;
-+    }
-+
-+    return 0;
-+}
-+
-+static const uint8_t * ptr_from_index(const uint8_t * b, unsigned int idx)
-+{
-+    unsigned int z = 0;
-+    while (idx--) {
-+        if (*b++ == 0) {
-+            ++z;
-+            if (z >= 2 && *b == 3) {
-+                ++b;
-+                z = 0;
-+            }
-+        }
-+        else {
-+            z = 0;
-+        }
-+    }
-+    return b;
-+}
-+
-+static int slice_add(V4L2MediaReqDescriptor * const rd)
-+{
-+    if (rd->num_slices >= rd->alloced_slices) {
-+        struct v4l2_ctrl_hevc_slice_params * p2;
-+        struct slice_info * s2;
-+        size_t n2 = rd->num_slices == 0 ? 8 : rd->num_slices * 2;
-+
-+        p2 = av_realloc_array(rd->slice_params, n2, sizeof(*p2));
-+        if (p2 == NULL)
-+            return AVERROR(ENOMEM);
-+        rd->slice_params = p2;
-+
-+        s2 = av_realloc_array(rd->slices, n2, sizeof(*s2));
-+        if (s2 == NULL)
-+            return AVERROR(ENOMEM);
-+        rd->slices = s2;
-+
-+        rd->alloced_slices = n2;
-+    }
-+    ++rd->num_slices;
-+    return 0;
-+}
-+
-+static unsigned int
-+fill_dpb_entries(const HEVCContext * const h, struct v4l2_hevc_dpb_entry * const entries)
-+{
-+    unsigned int i;
-+    unsigned int n = 0;
-+    const HEVCFrame * const pic = h->ref;
-+
-+    for (i = 0; i < FF_ARRAY_ELEMS(h->DPB); i++) {
-+        const HEVCFrame * const frame = &h->DPB[i];
-+        if (frame != pic && (frame->flags & (HEVC_FRAME_FLAG_LONG_REF | HEVC_FRAME_FLAG_SHORT_REF))) {
-+            struct v4l2_hevc_dpb_entry * const entry = entries + n++;
-+
-+            entry->timestamp = frame_capture_dpb(frame->frame);
-+#if HEVC_CTRLS_VERSION <= 2
-+            entry->rps = find_frame_rps_type(h, entry->timestamp);
-+#else
-+            entry->flags = (frame->flags & HEVC_FRAME_FLAG_LONG_REF) == 0 ? 0 :
-+                V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE;
-+#endif
-+            entry->field_pic = frame->frame->interlaced_frame;
-+
-+            /* TODO: Interleaved: Get the POC for each field. */
-+            entry->pic_order_cnt[0] = frame->poc;
-+            entry->pic_order_cnt[1] = frame->poc;
-+        }
-+    }
-+    return n;
-+}
-+
-+static void fill_slice_params(const HEVCContext * const h,
-+#if HEVC_CTRLS_VERSION >= 2
-+                              const struct v4l2_ctrl_hevc_decode_params * const dec,
-+#endif
-+                              struct v4l2_ctrl_hevc_slice_params *slice_params,
-+                              uint32_t bit_size, uint32_t bit_offset)
-+{
-+    const SliceHeader * const sh = &h->sh;
-+#if HEVC_CTRLS_VERSION >= 2
-+    const struct v4l2_hevc_dpb_entry *const dpb = dec->dpb;
-+    const unsigned int dpb_n = dec->num_active_dpb_entries;
-+#else
-+    struct v4l2_hevc_dpb_entry *const dpb = slice_params->dpb;
-+    unsigned int dpb_n;
-+#endif
-+    unsigned int i;
-+    RefPicList *rpl;
-+
-+    *slice_params = (struct v4l2_ctrl_hevc_slice_params) {
-+        .bit_size = bit_size,
-+        .data_bit_offset = bit_offset,
-+
-+        /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
-+        .slice_segment_addr = sh->slice_segment_addr,
-+
-+        /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */
-+        .nal_unit_type = h->nal_unit_type,
-+        .nuh_temporal_id_plus1 = h->temporal_id + 1,
-+
-+        /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
-+        .slice_type = sh->slice_type,
-+        .colour_plane_id = sh->colour_plane_id,
-+        .slice_pic_order_cnt = h->ref->poc,
-+        .num_ref_idx_l0_active_minus1 = sh->nb_refs[L0] ? sh->nb_refs[L0] - 1 : 0,
-+        .num_ref_idx_l1_active_minus1 = sh->nb_refs[L1] ? sh->nb_refs[L1] - 1 : 0,
-+        .collocated_ref_idx = sh->slice_temporal_mvp_enabled_flag ? sh->collocated_ref_idx : 0,
-+        .five_minus_max_num_merge_cand = sh->slice_type == HEVC_SLICE_I ? 0 : 5 - sh->max_num_merge_cand,
-+        .slice_qp_delta = sh->slice_qp_delta,
-+        .slice_cb_qp_offset = sh->slice_cb_qp_offset,
-+        .slice_cr_qp_offset = sh->slice_cr_qp_offset,
-+        .slice_act_y_qp_offset = 0,
-+        .slice_act_cb_qp_offset = 0,
-+        .slice_act_cr_qp_offset = 0,
-+        .slice_beta_offset_div2 = sh->beta_offset / 2,
-+        .slice_tc_offset_div2 = sh->tc_offset / 2,
-+
-+        /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */
-+        .pic_struct = h->sei.picture_timing.picture_struct,
-+
-+#if HEVC_CTRLS_VERSION < 2
-+        /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
-+        .num_rps_poc_st_curr_before = h->rps[ST_CURR_BEF].nb_refs,
-+        .num_rps_poc_st_curr_after = h->rps[ST_CURR_AFT].nb_refs,
-+        .num_rps_poc_lt_curr = h->rps[LT_CURR].nb_refs,
-+#endif
-+    };
-+
-+    if (sh->slice_sample_adaptive_offset_flag[0])
-+        slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA;
-+
-+    if (sh->slice_sample_adaptive_offset_flag[1])
-+        slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA;
-+
-+    if (sh->slice_temporal_mvp_enabled_flag)
-+        slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED;
-+
-+    if (sh->mvd_l1_zero_flag)
-+        slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO;
-+
-+    if (sh->cabac_init_flag)
-+        slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT;
-+
-+    if (sh->collocated_list == L0)
-+        slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0;
-+
-+    if (sh->disable_deblocking_filter_flag)
-+        slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED;
-+
-+    if (sh->slice_loop_filter_across_slices_enabled_flag)
-+        slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED;
-+
-+    if (sh->dependent_slice_segment_flag)
-+        slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT;
-+
-+#if HEVC_CTRLS_VERSION < 2
-+    dpb_n = fill_dpb_entries(h, dpb);
-+    slice_params->num_active_dpb_entries = dpb_n;
-+#endif
-+
-+    if (sh->slice_type != HEVC_SLICE_I) {
-+        rpl = &h->ref->refPicList[0];
-+        for (i = 0; i < rpl->nb_refs; i++)
-+            slice_params->ref_idx_l0[i] = get_ref_pic_index(h, rpl->ref[i], dpb, dpb_n);
-+    }
-+
-+    if (sh->slice_type == HEVC_SLICE_B) {
-+        rpl = &h->ref->refPicList[1];
-+        for (i = 0; i < rpl->nb_refs; i++)
-+            slice_params->ref_idx_l1[i] = get_ref_pic_index(h, rpl->ref[i], dpb, dpb_n);
-+    }
-+
-+    fill_pred_table(h, &slice_params->pred_weight_table);
-+
-+    slice_params->num_entry_point_offsets = sh->num_entry_point_offsets;
-+    if (slice_params->num_entry_point_offsets > 256) {
-+        slice_params->num_entry_point_offsets = 256;
-+        av_log(NULL, AV_LOG_ERROR, "%s: Currently only 256 entry points are supported, but slice has %d entry points.\n", __func__, sh->num_entry_point_offsets);
-+    }
-+
-+    for (i = 0; i < slice_params->num_entry_point_offsets; i++)
-+        slice_params->entry_point_offset_minus1[i] = sh->entry_point_offset[i] - 1;
-+}
-+
-+#if HEVC_CTRLS_VERSION >= 2
-+static void
-+fill_decode_params(const HEVCContext * const h,
-+                   struct v4l2_ctrl_hevc_decode_params * const dec)
-+{
-+    unsigned int i;
-+
-+    *dec = (struct v4l2_ctrl_hevc_decode_params){
-+        .pic_order_cnt_val = h->poc,
-+        .num_poc_st_curr_before = h->rps[ST_CURR_BEF].nb_refs,
-+        .num_poc_st_curr_after = h->rps[ST_CURR_AFT].nb_refs,
-+        .num_poc_lt_curr = h->rps[LT_CURR].nb_refs,
-+    };
-+
-+    dec->num_active_dpb_entries = fill_dpb_entries(h, dec->dpb);
-+
-+    // The docn does seem to ask that we fit our 32 bit signed POC into
-+    // a U8 so... (To be fair 16 bits would be enough)
-+    // Luckily we (Pi) don't use these fields
-+    for (i = 0; i != h->rps[ST_CURR_BEF].nb_refs; ++i)
-+        dec->poc_st_curr_before[i] = h->rps[ST_CURR_BEF].ref[i]->poc;
-+    for (i = 0; i != h->rps[ST_CURR_AFT].nb_refs; ++i)
-+        dec->poc_st_curr_after[i] = h->rps[ST_CURR_AFT].ref[i]->poc;
-+    for (i = 0; i != h->rps[LT_CURR].nb_refs; ++i)
-+        dec->poc_lt_curr[i] = h->rps[LT_CURR].ref[i]->poc;
-+
-+    if (IS_IRAP(h))
-+        dec->flags |= V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC;
-+    if (IS_IDR(h))
-+        dec->flags |= V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC;
-+    if (h->sh.no_output_of_prior_pics_flag)
-+        dec->flags |= V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR;
-+
-+}
-+#endif
-+
-+static void fill_sps(struct v4l2_ctrl_hevc_sps *ctrl, const HEVCSPS *sps)
-+{
-+    /* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */
-+    *ctrl = (struct v4l2_ctrl_hevc_sps) {
-+        .chroma_format_idc = sps->chroma_format_idc,
-+        .pic_width_in_luma_samples = sps->width,
-+        .pic_height_in_luma_samples = sps->height,
-+        .bit_depth_luma_minus8 = sps->bit_depth - 8,
-+        .bit_depth_chroma_minus8 = sps->bit_depth - 8,
-+        .log2_max_pic_order_cnt_lsb_minus4 = sps->log2_max_poc_lsb - 4,
-+        .sps_max_dec_pic_buffering_minus1 = sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering - 1,
-+        .sps_max_num_reorder_pics = sps->temporal_layer[sps->max_sub_layers - 1].num_reorder_pics,
-+        .sps_max_latency_increase_plus1 = sps->temporal_layer[sps->max_sub_layers - 1].max_latency_increase + 1,
-+        .log2_min_luma_coding_block_size_minus3 = sps->log2_min_cb_size - 3,
-+        .log2_diff_max_min_luma_coding_block_size = sps->log2_diff_max_min_coding_block_size,
-+        .log2_min_luma_transform_block_size_minus2 = sps->log2_min_tb_size - 2,
-+        .log2_diff_max_min_luma_transform_block_size = sps->log2_max_trafo_size - sps->log2_min_tb_size,
-+        .max_transform_hierarchy_depth_inter = sps->max_transform_hierarchy_depth_inter,
-+        .max_transform_hierarchy_depth_intra = sps->max_transform_hierarchy_depth_intra,
-+        .pcm_sample_bit_depth_luma_minus1 = sps->pcm.bit_depth - 1,
-+        .pcm_sample_bit_depth_chroma_minus1 = sps->pcm.bit_depth_chroma - 1,
-+        .log2_min_pcm_luma_coding_block_size_minus3 = sps->pcm.log2_min_pcm_cb_size - 3,
-+        .log2_diff_max_min_pcm_luma_coding_block_size = sps->pcm.log2_max_pcm_cb_size - sps->pcm.log2_min_pcm_cb_size,
-+        .num_short_term_ref_pic_sets = sps->nb_st_rps,
-+        .num_long_term_ref_pics_sps = sps->num_long_term_ref_pics_sps,
-+        .chroma_format_idc = sps->chroma_format_idc,
-+        .sps_max_sub_layers_minus1 = sps->max_sub_layers - 1,
-+    };
-+
-+    if (sps->separate_colour_plane_flag)
-+        ctrl->flags |= V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE;
-+
-+    if (sps->scaling_list_enable_flag)
-+        ctrl->flags |= V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED;
-+
-+    if (sps->amp_enabled_flag)
-+        ctrl->flags |= V4L2_HEVC_SPS_FLAG_AMP_ENABLED;
-+
-+    if (sps->sao_enabled)
-+        ctrl->flags |= V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET;
-+
-+    if (sps->pcm_enabled_flag)
-+        ctrl->flags |= V4L2_HEVC_SPS_FLAG_PCM_ENABLED;
-+
-+    if (sps->pcm.loop_filter_disable_flag)
-+        ctrl->flags |= V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED;
-+
-+    if (sps->long_term_ref_pics_present_flag)
-+        ctrl->flags |= V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT;
-+
-+    if (sps->sps_temporal_mvp_enabled_flag)
-+        ctrl->flags |= V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED;
-+
-+    if (sps->sps_strong_intra_smoothing_enable_flag)
-+        ctrl->flags |= V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED;
-+}
-+
-+static void fill_scaling_matrix(const ScalingList * const sl,
-+                                struct v4l2_ctrl_hevc_scaling_matrix * const sm)
-+{
-+    unsigned int i;
-+
-+    for (i = 0; i < 6; i++) {
-+        unsigned int j;
-+
-+        for (j = 0; j < 16; j++)
-+            sm->scaling_list_4x4[i][j] = sl->sl[0][i][j];
-+        for (j = 0; j < 64; j++) {
-+            sm->scaling_list_8x8[i][j]   = sl->sl[1][i][j];
-+            sm->scaling_list_16x16[i][j] = sl->sl[2][i][j];
-+            if (i < 2)
-+                sm->scaling_list_32x32[i][j] = sl->sl[3][i * 3][j];
-+        }
-+        sm->scaling_list_dc_coef_16x16[i] = sl->sl_dc[0][i];
-+        if (i < 2)
-+            sm->scaling_list_dc_coef_32x32[i] = sl->sl_dc[1][i * 3];
-+    }
-+}
-+
-+static void fill_pps(struct v4l2_ctrl_hevc_pps * const ctrl, const HEVCPPS * const pps)
-+{
-+    uint64_t flags = 0;
-+
-+    if (pps->dependent_slice_segments_enabled_flag)
-+        flags |= V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED;
-+
-+    if (pps->output_flag_present_flag)
-+        flags |= V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT;
-+
-+    if (pps->sign_data_hiding_flag)
-+        flags |= V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED;
-+
-+    if (pps->cabac_init_present_flag)
-+        flags |= V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT;
-+
-+    if (pps->constrained_intra_pred_flag)
-+        flags |= V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED;
-+
-+    if (pps->transform_skip_enabled_flag)
-+        flags |= V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED;
-+
-+    if (pps->cu_qp_delta_enabled_flag)
-+        flags |= V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED;
-+
-+    if (pps->pic_slice_level_chroma_qp_offsets_present_flag)
-+        flags |= V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT;
-+
-+    if (pps->weighted_pred_flag)
-+        flags |= V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED;
-+
-+    if (pps->weighted_bipred_flag)
-+        flags |= V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED;
-+
-+    if (pps->transquant_bypass_enable_flag)
-+        flags |= V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED;
-+
-+    if (pps->tiles_enabled_flag)
-+        flags |= V4L2_HEVC_PPS_FLAG_TILES_ENABLED;
-+
-+    if (pps->entropy_coding_sync_enabled_flag)
-+        flags |= V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED;
-+
-+    if (pps->loop_filter_across_tiles_enabled_flag)
-+        flags |= V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED;
-+
-+    if (pps->seq_loop_filter_across_slices_enabled_flag)
-+        flags |= V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED;
-+
-+    if (pps->deblocking_filter_override_enabled_flag)
-+        flags |= V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED;
-+
-+    if (pps->disable_dbf)
-+        flags |= V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER;
-+
-+    if (pps->lists_modification_present_flag)
-+        flags |= V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT;
-+
-+    if (pps->slice_header_extension_present_flag)
-+        flags |= V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT;
-+
-+    /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */
-+    *ctrl = (struct v4l2_ctrl_hevc_pps) {
-+        .num_extra_slice_header_bits = pps->num_extra_slice_header_bits,
-+        .init_qp_minus26 = pps->pic_init_qp_minus26,
-+        .diff_cu_qp_delta_depth = pps->diff_cu_qp_delta_depth,
-+        .pps_cb_qp_offset = pps->cb_qp_offset,
-+        .pps_cr_qp_offset = pps->cr_qp_offset,
-+        .pps_beta_offset_div2 = pps->beta_offset / 2,
-+        .pps_tc_offset_div2 = pps->tc_offset / 2,
-+        .log2_parallel_merge_level_minus2 = pps->log2_parallel_merge_level - 2,
-+        .flags = flags
-+    };
-+
-+
-+    if (pps->tiles_enabled_flag) {
-+        ctrl->num_tile_columns_minus1 = pps->num_tile_columns - 1;
-+        ctrl->num_tile_rows_minus1 = pps->num_tile_rows - 1;
-+
-+        for (int i = 0; i < pps->num_tile_columns; i++)
-+            ctrl->column_width_minus1[i] = pps->column_width[i] - 1;
-+
-+        for (int i = 0; i < pps->num_tile_rows; i++)
-+            ctrl->row_height_minus1[i] = pps->row_height[i] - 1;
-+    }
-+}
-+
-+// Called before finally returning the frame to the user
-+// Set corrupt flag here as this is actually the frame structure that
-+// is going to the user (in MT land each thread has its own pool)
-+static int frame_post_process(void *logctx, AVFrame *frame)
-+{
-+    V4L2MediaReqDescriptor *rd = (V4L2MediaReqDescriptor*)frame->data[0];
-+
-+//    av_log(NULL, AV_LOG_INFO, "%s\n", __func__);
-+    frame->flags &= ~AV_FRAME_FLAG_CORRUPT;
-+    if (rd->qe_dst) {
-+        MediaBufsStatus stat = qent_dst_wait(rd->qe_dst);
-+        if (stat != MEDIABUFS_STATUS_SUCCESS) {
-+            av_log(logctx, AV_LOG_ERROR, "%s: Decode fail\n", __func__);
-+            frame->flags |= AV_FRAME_FLAG_CORRUPT;
-+        }
-+    }
-+
-+    return 0;
-+}
-+
-+static inline struct timeval cvt_dpb_to_tv(uint64_t t)
-+{
-+    t /= 1000;
-+    return (struct timeval){
-+        .tv_usec = t % 1000000,
-+        .tv_sec = t / 1000000
-+    };
-+}
-+
-+static inline uint64_t cvt_timestamp_to_dpb(const unsigned int t)
-+{
-+    return (uint64_t)t * 1000;
-+}
-+
-+static int v4l2_request_hevc_start_frame(AVCodecContext *avctx,
-+                                         av_unused const uint8_t *buffer,
-+                                         av_unused uint32_t size)
-+{
-+    const HEVCContext *h = avctx->priv_data;
-+    V4L2MediaReqDescriptor *const rd = (V4L2MediaReqDescriptor *)h->ref->frame->data[0];
-+    V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
-+
-+//    av_log(NULL, AV_LOG_INFO, "%s\n", __func__);
-+    decode_q_add(&ctx->decode_q, &rd->decode_ent);
-+
-+    rd->num_slices = 0;
-+    ctx->timestamp++;
-+    rd->timestamp = cvt_timestamp_to_dpb(ctx->timestamp);
-+
-+    {
-+        FrameDecodeData * const fdd = (FrameDecodeData*)h->ref->frame->private_ref->data;
-+        fdd->post_process = frame_post_process;
-+    }
-+
-+    // qe_dst needs to be bound to the data buffer and only returned when that is
-+    if (!rd->qe_dst)
-+    {
-+        if ((rd->qe_dst = mediabufs_dst_qent_alloc(ctx->mbufs, ctx->dbufs)) == NULL) {
-+            av_log(avctx, AV_LOG_ERROR, "%s: Failed to get dst buffer\n", __func__);
-+            return AVERROR(ENOMEM);
-+        }
-+    }
-+
-+    ff_thread_finish_setup(avctx); // Allow next thread to enter rpi_hevc_start_frame
-+
-+    return 0;
-+}
-+
-+// Object fd & size will be zapped by this & need setting later
-+static int drm_from_format(AVDRMFrameDescriptor * const desc, const struct v4l2_format * const format)
-+{
-+    AVDRMLayerDescriptor *layer = &desc->layers[0];
-+    unsigned int width;
-+    unsigned int height;
-+    unsigned int bpl;
-+    uint32_t pixelformat;
-+
-+    if (V4L2_TYPE_IS_MULTIPLANAR(format->type)) {
-+        width       = format->fmt.pix_mp.width;
-+        height      = format->fmt.pix_mp.height;
-+        pixelformat = format->fmt.pix_mp.pixelformat;
-+        bpl         = format->fmt.pix_mp.plane_fmt[0].bytesperline;
-+    }
-+    else {
-+        width       = format->fmt.pix.width;
-+        height      = format->fmt.pix.height;
-+        pixelformat = format->fmt.pix.pixelformat;
-+        bpl         = format->fmt.pix.bytesperline;
-+    }
-+
-+    switch (pixelformat) {
-+    case V4L2_PIX_FMT_NV12:
-+        layer->format = DRM_FORMAT_NV12;
-+        desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR;
-+        break;
-+#if CONFIG_SAND
-+    case V4L2_PIX_FMT_NV12_COL128:
-+        layer->format = DRM_FORMAT_NV12;
-+        desc->objects[0].format_modifier = DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(bpl);
-+        break;
-+    case V4L2_PIX_FMT_NV12_10_COL128:
-+        layer->format = DRM_FORMAT_P030;
-+        desc->objects[0].format_modifier = DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(bpl);
-+        break;
-+#endif
-+#ifdef DRM_FORMAT_MOD_ALLWINNER_TILED
-+    case V4L2_PIX_FMT_SUNXI_TILED_NV12:
-+        layer->format = DRM_FORMAT_NV12;
-+        desc->objects[0].format_modifier = DRM_FORMAT_MOD_ALLWINNER_TILED;
-+        break;
-+#endif
-+#if defined(V4L2_PIX_FMT_NV15) && defined(DRM_FORMAT_NV15)
-+    case V4L2_PIX_FMT_NV15:
-+        layer->format = DRM_FORMAT_NV15;
-+        desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR;
-+        break;
-+#endif
-+    case V4L2_PIX_FMT_NV16:
-+        layer->format = DRM_FORMAT_NV16;
-+        desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR;
-+        break;
-+#if defined(V4L2_PIX_FMT_NV20) && defined(DRM_FORMAT_NV20)
-+    case V4L2_PIX_FMT_NV20:
-+        layer->format = DRM_FORMAT_NV20;
-+        desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR;
-+        break;
-+#endif
-+    default:
-+        return -1;
-+    }
-+
-+    desc->nb_objects = 1;
-+    desc->objects[0].fd = -1;
-+    desc->objects[0].size = 0;
-+
-+    desc->nb_layers = 1;
-+    layer->nb_planes = 2;
-+
-+    layer->planes[0].object_index = 0;
-+    layer->planes[0].offset = 0;
-+    layer->planes[0].pitch = bpl;
-+#if CONFIG_SAND
-+    if (pixelformat == V4L2_PIX_FMT_NV12_COL128) {
-+        layer->planes[1].object_index = 0;
-+        layer->planes[1].offset = height * 128;
-+        layer->planes[0].pitch = width;
-+        layer->planes[1].pitch = width;
-+    }
-+    else if (pixelformat == V4L2_PIX_FMT_NV12_10_COL128) {
-+        layer->planes[1].object_index = 0;
-+        layer->planes[1].offset = height * 128;
-+        layer->planes[0].pitch = width * 2; // Lies but it keeps DRM import happy
-+        layer->planes[1].pitch = width * 2;
-+    }
-+    else
-+#endif
-+    {
-+        layer->planes[1].object_index = 0;
-+        layer->planes[1].offset = layer->planes[0].pitch * height;
-+        layer->planes[1].pitch = layer->planes[0].pitch;
-+    }
-+
-+    return 0;
-+}
-+
-+static int
-+set_req_ctls(V4L2RequestContextHEVC *ctx, struct media_request * const mreq,
-+    struct req_controls *const controls,
-+#if HEVC_CTRLS_VERSION >= 2
-+    struct v4l2_ctrl_hevc_decode_params * const dec,
-+#endif
-+    struct v4l2_ctrl_hevc_slice_params * const slices,
-+    const unsigned int slice_no,
-+    const unsigned int slice_count)
-+{
-+    int rv;
-+
-+    struct v4l2_ext_control control[] = {
-+        {
-+            .id = V4L2_CID_MPEG_VIDEO_HEVC_SPS,
-+            .ptr = &controls->sps,
-+            .size = sizeof(controls->sps),
-+        },
-+        {
-+            .id = V4L2_CID_MPEG_VIDEO_HEVC_PPS,
-+            .ptr = &controls->pps,
-+            .size = sizeof(controls->pps),
-+        },
-+#if HEVC_CTRLS_VERSION >= 2
-+        {
-+            .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS,
-+            .ptr = dec,
-+            .size = sizeof(*dec),
-+        },
-+#endif
-+        {
-+            .id = V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS,
-+            .ptr = slices + slice_no,
-+            .size = sizeof(*slices) * slice_count,
-+        },
-+        // Optional
-+        {
-+            .id = V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX,
-+            .ptr = &controls->scaling_matrix,
-+            .size = sizeof(controls->scaling_matrix),
-+        },
-+    };
-+
-+    rv = mediabufs_ctl_set_ext_ctrls(ctx->mbufs, mreq, control,
-+            controls->has_scaling ?
-+                FF_ARRAY_ELEMS(control) :
-+                FF_ARRAY_ELEMS(control) - 1);
-+
-+    return rv;
-+}
-+
-+static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size)
-+{
-+    const HEVCContext * const h = avctx->priv_data;
-+    V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
-+    V4L2MediaReqDescriptor * const rd = (V4L2MediaReqDescriptor*)h->ref->frame->data[0];
-+    int bcount = get_bits_count(&h->HEVClc->gb);
-+    uint32_t boff = (ptr_from_index(buffer, bcount/8 + 1) - (buffer + bcount/8 + 1)) * 8 + bcount;
-+
-+    int rv;
-+    struct slice_info * si;
-+
-+    if ((rv = slice_add(rd)) != 0)
-+        return rv;
-+
-+    si = rd->slices + rd->num_slices - 1;
-+    si->ptr = buffer;
-+    si->len = size;
-+
-+    if (ctx->multi_slice && rd->num_slices > 1) {
-+        struct slice_info *const si0 = rd->slices;
-+        const size_t offset = (buffer - si0->ptr);
-+        boff += offset * 8;
-+        size += offset;
-+        si0->len = si->len + offset;
-+    }
-+
-+#if HEVC_CTRLS_VERSION >= 2
-+    if (rd->num_slices == 1)
-+        fill_decode_params(h, &rd->dec);
-+    fill_slice_params(h, &rd->dec, rd->slice_params + rd->num_slices - 1, size * 8, boff);
-+#else
-+    fill_slice_params(h, rd->slice_params + rd->num_slices - 1, size * 8, boff);
-+#endif
-+
-+    return 0;
-+}
-+
-+static void v4l2_request_hevc_abort_frame(AVCodecContext * const avctx)
-+{
-+    const HEVCContext * const h = avctx->priv_data;
-+    if (h->ref != NULL) {
-+        V4L2MediaReqDescriptor *const rd = (V4L2MediaReqDescriptor *)h->ref->frame->data[0];
-+        V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
-+
-+        media_request_abort(&rd->req);
-+        mediabufs_src_qent_abort(ctx->mbufs, &rd->qe_src);
-+
-+        decode_q_remove(&ctx->decode_q, &rd->decode_ent);
-+    }
-+}
-+
-+static int send_slice(AVCodecContext * const avctx,
-+                      V4L2MediaReqDescriptor * const rd,
-+                      struct req_controls *const controls,
-+                      const unsigned int i, const unsigned int j)
-+{
-+    V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
-+
-+    struct slice_info *const si = rd->slices + i;
-+    struct media_request * req = NULL;
-+    struct qent_src * src = NULL;
-+    MediaBufsStatus stat;
-+
-+    if ((req = media_request_get(ctx->mpool)) == NULL) {
-+        av_log(avctx, AV_LOG_ERROR, "%s: Failed to alloc media request\n", __func__);
-+        return AVERROR(ENOMEM);
-+    }
-+
-+    if (set_req_ctls(ctx, req,
-+                     controls,
-+#if HEVC_CTRLS_VERSION >= 2
-+                     &rd->dec,
-+#endif
-+                     rd->slice_params,
-+                     i, j - i)) {
-+        av_log(avctx, AV_LOG_ERROR, "%s: Failed to set req ctls\n", __func__);
-+        goto fail1;
-+    }
-+
-+    if ((src = mediabufs_src_qent_get(ctx->mbufs)) == NULL) {
-+        av_log(avctx, AV_LOG_ERROR, "%s: Failed to get src buffer\n", __func__);
-+        goto fail1;
-+    }
-+
-+    if (qent_src_data_copy(src, 0, si->ptr, si->len, ctx->dbufs) != 0) {
-+        av_log(avctx, AV_LOG_ERROR, "%s: Failed data copy\n", __func__);
-+        goto fail2;
-+    }
-+
-+    if (qent_src_params_set(src, &controls->tv)) {
-+        av_log(avctx, AV_LOG_ERROR, "%s: Failed src param set\n", __func__);
-+        goto fail2;
-+    }
-+
-+#warning ANNEX_B start code
-+//        if (ctx->start_code == V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B) {
-+//        }
-+
-+    stat = mediabufs_start_request(ctx->mbufs, &req, &src,
-+                                   i == 0 ? rd->qe_dst : NULL,
-+                                   j == rd->num_slices);
-+
-+    if (stat != MEDIABUFS_STATUS_SUCCESS) {
-+        av_log(avctx, AV_LOG_ERROR, "%s: Failed to start request\n", __func__);
-+        return AVERROR_UNKNOWN;
-+    }
-+    return 0;
-+
-+fail2:
-+    mediabufs_src_qent_abort(ctx->mbufs, &src);
-+fail1:
-+    media_request_abort(&req);
-+    return AVERROR_UNKNOWN;
-+}
-+
-+/*
-+ * hwaccel end_frame: build the per-frame controls, wait for this frame's
-+ * turn in the decode queue, make sure a destination buffer exists, submit
-+ * the stored slices and fill in the DRM-prime descriptor.  The frame is
-+ * removed from the decode queue on every path that got as far as waiting.
-+ * Returns 0 on success or a negative AVERROR code.
-+ */
-+static int v4l2_request_hevc_end_frame(AVCodecContext *avctx)
-+{
-+    const HEVCContext * const h = avctx->priv_data;
-+    V4L2MediaReqDescriptor *rd = (V4L2MediaReqDescriptor*)h->ref->frame->data[0];
-+    V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data;
-+    const ScalingList *sl = NULL;
-+    struct req_controls rc;
-+    unsigned int slice_no;
-+    int rv = 0;
-+
-+    // An end_frame can turn up without a preceding start_frame (possibly a
-+    // bug elsewhere).  Nothing is in the decode Q in that case so give up.
-+    if (!decode_q_in_q(&rd->decode_ent)) {
-+        av_log(avctx, AV_LOG_DEBUG, "%s: Frame not in decode Q\n", __func__);
-+        return AVERROR_INVALIDDATA;
-+    }
-+
-+    // A scaling list in the PPS takes priority over one in the SPS
-+    if (h->ps.pps->scaling_list_data_present_flag)
-+        sl = &h->ps.pps->scaling_list;
-+    else if (h->ps.sps->scaling_list_enable_flag)
-+        sl = &h->ps.sps->scaling_list;
-+
-+    // Controls shared by every slice request of this frame
-+    memset(&rc, 0, sizeof(rc));
-+    rc.tv = cvt_dpb_to_tv(rd->timestamp);
-+    fill_sps(&rc.sps, h->ps.sps);
-+    fill_pps(&rc.pps, h->ps.pps);
-+    if (sl != NULL) {
-+        rc.has_scaling = 1;
-+        fill_scaling_matrix(sl, &rc.scaling_matrix);
-+    }
-+
-+    decode_q_wait(&ctx->decode_q, &rd->decode_ent);
-+
-+    // qe_dst needs to be bound to the data buffer and only returned when that is
-+    // Alloc almost certainly wants to be serialised if there is any chance of blocking
-+    // so we get the next frame to be free in the thread that needs it for decode first.
-+    //
-+    // In our current world this probably isn't a concern but put it here anyway
-+    if (rd->qe_dst == NULL) {
-+        rd->qe_dst = mediabufs_dst_qent_alloc(ctx->mbufs, ctx->dbufs);
-+        if (rd->qe_dst == NULL) {
-+            av_log(avctx, AV_LOG_ERROR, "%s: Failed to get dst buffer\n", __func__);
-+            rv = AVERROR(ENOMEM);
-+            goto done;
-+        }
-+    }
-+
-+    // Send as slices: all in one request if the driver takes a slice array,
-+    // otherwise one request per slice
-+    if (ctx->multi_slice) {
-+        rv = send_slice(avctx, rd, &rc, 0, rd->num_slices);
-+        if (rv != 0)
-+            goto done;
-+    } else {
-+        for (slice_no = 0; slice_no != rd->num_slices; ++slice_no) {
-+            rv = send_slice(avctx, rd, &rc, slice_no, slice_no + 1);
-+            if (rv != 0)
-+                goto done;
-+        }
-+    }
-+
-+    // Set the drm_prime descriptor
-+    drm_from_format(&rd->drm, mediabufs_dst_fmt(ctx->mbufs));
-+    rd->drm.objects[0].fd = dmabuf_fd(qent_dst_dmabuf(rd->qe_dst, 0));
-+    rd->drm.objects[0].size = dmabuf_size(qent_dst_dmabuf(rd->qe_dst, 0));
-+
-+done:
-+    // rv is 0 here on success and the failure code otherwise
-+    decode_q_remove(&ctx->decode_q, &rd->decode_ent);
-+    return rv;
-+}
-+
-+// Initial check & init
-+static int
-+probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
-+{
-+    const HEVCContext * const h = avctx->priv_data;
-+    struct v4l2_ctrl_hevc_sps ctrl_sps;
-+    unsigned int i;
-+
-+    // Controls required by this version of the API.  Entry 0 (slice params)
-+    // is also used at the end to detect a variable-length slice array.
-+    struct v4l2_query_ext_ctrl qc[] = {
-+        { .id = V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS },
-+        { .id = V4L2_CID_MPEG_VIDEO_HEVC_SPS },
-+        { .id = V4L2_CID_MPEG_VIDEO_HEVC_PPS },
-+        { .id = V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX },
-+#if HEVC_CTRLS_VERSION >= 2
-+        { .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS },
-+#endif
-+    };
-+    // Payload sizes we were compiled with - order & count must match qc[]
-+    static const size_t ctrl_sizes[] = {
-+        sizeof(struct v4l2_ctrl_hevc_slice_params),
-+        sizeof(struct v4l2_ctrl_hevc_sps),
-+        sizeof(struct v4l2_ctrl_hevc_pps),
-+        sizeof(struct v4l2_ctrl_hevc_scaling_matrix),
-+#if HEVC_CTRLS_VERSION >= 2
-+        sizeof(struct v4l2_ctrl_hevc_decode_params),
-+#endif
-+    };
-+    const unsigned int noof_ctrls = FF_ARRAY_ELEMS(qc);
-+
-+    // V2 is only accepted on drivers older than 5.18.0, V3 only on 5.18.0+
-+#if HEVC_CTRLS_VERSION == 2
-+    if (mediabufs_ctl_driver_version(ctx->mbufs) >= MEDIABUFS_DRIVER_VERSION(5, 18, 0))
-+        return AVERROR(EINVAL);
-+#elif HEVC_CTRLS_VERSION == 3
-+    if (mediabufs_ctl_driver_version(ctx->mbufs) < MEDIABUFS_DRIVER_VERSION(5, 18, 0))
-+        return AVERROR(EINVAL);
-+#endif
-+
-+    if (mediabufs_ctl_query_ext_ctrls(ctx->mbufs, qc, noof_ctrls)) {
-+        av_log(avctx, AV_LOG_DEBUG, "Probed V%d control missing\n", HEVC_CTRLS_VERSION);
-+        return AVERROR(EINVAL);
-+    }
-+
-+    // Every control must have exactly the element size we expect
-+    for (i = 0; i < noof_ctrls; i++) {
-+        const size_t driver_size = (size_t)qc[i].elem_size;
-+
-+        if (driver_size == ctrl_sizes[i])
-+            continue;
-+
-+        av_log(avctx, AV_LOG_DEBUG, "Probed V%d control %d size mismatch %zu != %zu\n",
-+               HEVC_CTRLS_VERSION, i, ctrl_sizes[i], driver_size);
-+        return AVERROR(EINVAL);
-+    }
-+
-+    // Final check: the driver must accept an SPS built from the stream
-+    fill_sps(&ctrl_sps, h->ps.sps);
-+
-+    if (mediabufs_set_ext_ctrl(ctx->mbufs, NULL, V4L2_CID_MPEG_VIDEO_HEVC_SPS, &ctrl_sps, sizeof(ctrl_sps))) {
-+        av_log(avctx, AV_LOG_ERROR, "Failed to set initial SPS\n");
-+        return AVERROR(EINVAL);
-+    }
-+
-+    // A dynamic-array slice control lets all slices go in one request
-+    ctx->multi_slice = (qc[0].flags & V4L2_CTRL_FLAG_DYNAMIC_ARRAY) != 0;
-+    return 0;
-+}
-+
-+// Final init
-+static int
-+set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
-+{
-+    // Positions of the controls within querys[] / ctrls[]
-+    enum { Q_DECODE_MODE = 0, Q_START_CODE = 1, Q_SLICE_PARAMS = 2 };
-+
-+    struct v4l2_query_ext_ctrl querys[] = {
-+        [Q_DECODE_MODE]  = { .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE, },
-+        [Q_START_CODE]   = { .id = V4L2_CID_MPEG_VIDEO_HEVC_START_CODE, },
-+        [Q_SLICE_PARAMS] = { .id = V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS, },
-+    };
-+    struct v4l2_ext_control ctrls[] = {
-+        [Q_DECODE_MODE] = { .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE, },
-+        [Q_START_CODE]  = { .id = V4L2_CID_MPEG_VIDEO_HEVC_START_CODE, },
-+    };
-+    int ret;
-+
-+    // NOTE(review): the result of this query is not checked; on failure the
-+    // values read below are whatever the call left in querys[] - confirm
-+    // that is intended.
-+    mediabufs_ctl_query_ext_ctrls(ctx->mbufs, querys, FF_ARRAY_ELEMS(querys));
-+
-+    // Adopt the driver's default decode mode if it is one we can handle
-+    ctx->decode_mode = querys[Q_DECODE_MODE].default_value;
-+    switch (ctx->decode_mode) {
-+    case V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED:
-+    case V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED:
-+        break;
-+    default:
-+        av_log(avctx, AV_LOG_ERROR, "%s: unsupported decode mode, %d\n", __func__, ctx->decode_mode);
-+        return AVERROR(EINVAL);
-+    }
-+
-+    // Likewise for the start code setting
-+    ctx->start_code = querys[Q_START_CODE].default_value;
-+    switch (ctx->start_code) {
-+    case V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE:
-+    case V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B:
-+        break;
-+    default:
-+        av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code, %d\n", __func__, ctx->start_code);
-+        return AVERROR(EINVAL);
-+    }
-+
-+    // The slice control's element count bounds the slices per request
-+    ctx->max_slices = querys[Q_SLICE_PARAMS].elems;
-+    if (ctx->max_slices > MAX_SLICES) {
-+        av_log(avctx, AV_LOG_ERROR, "%s: unsupported max slices, %d\n", __func__, ctx->max_slices);
-+        return AVERROR(EINVAL);
-+    }
-+
-+    // Write the chosen mode & start code back to the driver
-+    ctrls[Q_DECODE_MODE].value = ctx->decode_mode;
-+    ctrls[Q_START_CODE].value = ctx->start_code;
-+
-+    ret = mediabufs_ctl_set_ext_ctrls(ctx->mbufs, NULL, ctrls, FF_ARRAY_ELEMS(ctrls));
-+    if (ret == 0)
-+        return 0;
-+    return AVERROR(-ret);
-+}
-+
-+/*
-+ * AVBuffer free callback installed by v4l2_req_frame_alloc().
-+ * data is the V4L2MediaReqDescriptor backing the frame; opaque is the
-+ * AVCodecContext that was passed at creation (only used for logging here).
-+ * Drops the destination buffer reference and frees the slice arrays and
-+ * the descriptor itself.
-+ */
-+static void v4l2_req_frame_free(void *opaque, uint8_t *data)
-+{
-+    AVCodecContext *avctx = opaque;
-+    V4L2MediaReqDescriptor * const rd = (V4L2MediaReqDescriptor*)data;
-+
-+    av_log(NULL, AV_LOG_DEBUG, "%s: avctx=%p data=%p\n", __func__, avctx, data);
-+
-+    qent_dst_unref(&rd->qe_dst);
-+
-+    // We don't expect req or qe_src to be set
-+    // The pointers are passed in the order the message names them
-+    // (qe_src first, then req); they used to be swapped.
-+    if (rd->req || rd->qe_src)
-+        av_log(NULL, AV_LOG_ERROR, "%s: qe_src %p or req %p not NULL\n", __func__, rd->qe_src, rd->req);
-+
-+    av_freep(&rd->slices);
-+    av_freep(&rd->slice_params);
-+
-+    av_free(rd);
-+}
-+
-+/*
-+ * Allocate a zeroed block of size bytes and wrap it in an AVBufferRef whose
-+ * free callback is v4l2_req_frame_free().  opaque (the AVCodecContext) is
-+ * handed to that callback.  Returns NULL if either allocation fails.
-+ */
-+static AVBufferRef *v4l2_req_frame_alloc(void *opaque, int size)
-+{
-+    AVCodecContext * const avctx = opaque;
-+    uint8_t *data = av_mallocz(size);
-+    AVBufferRef *ref;
-+
-+    if (data == NULL)
-+        return NULL;
-+
-+    av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p size=%d data=%p\n", __func__, avctx, size, data);
-+
-+    ref = av_buffer_create(data, size, v4l2_req_frame_free, avctx, 0);
-+    if (ref == NULL)
-+        av_freep(&data);    // No ref was made so the block is still ours to free
-+
-+    return ref;
-+}
-+
-+#if 0
-+static void v4l2_req_pool_free(void *opaque)
-+{
-+    av_log(NULL, AV_LOG_DEBUG, "%s: opaque=%p\n", __func__, opaque);
-+}
-+
-+static void v4l2_req_hwframe_ctx_free(AVHWFramesContext *hwfc)
-+{
-+    av_log(NULL, AV_LOG_DEBUG, "%s: hwfc=%p pool=%p\n", __func__, hwfc, hwfc->pool);
-+
-+    av_buffer_pool_uninit(&hwfc->pool);
-+}
-+#endif
-+
-+/*
-+ * Fill in the hw frames context from the current destination format:
-+ * DRM-prime frames whose sw format and dimensions come from the V4L2
-+ * format held by the mediabufs control.  Always returns 0.
-+ */
-+static int frame_params(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx)
-+{
-+    V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
-+    AVHWFramesContext * const hwfc = (AVHWFramesContext*)hw_frames_ctx->data;
-+    const struct v4l2_format * const vfmt = mediabufs_dst_fmt(ctx->mbufs);
-+
-+    hwfc->format = AV_PIX_FMT_DRM_PRIME;
-+    hwfc->sw_format = pixel_format_from_format(vfmt);
-+
-+    // Width & height live in different union members for the two buffer types
-+    if (!V4L2_TYPE_IS_MULTIPLANAR(vfmt->type)) {
-+        hwfc->width = vfmt->fmt.pix.width;
-+        hwfc->height = vfmt->fmt.pix.height;
-+    } else {
-+        hwfc->width = vfmt->fmt.pix_mp.width;
-+        hwfc->height = vfmt->fmt.pix_mp.height;
-+    }
-+#if 0
-+    hwfc->pool = av_buffer_pool_init2(sizeof(V4L2MediaReqDescriptor), avctx, v4l2_req_frame_alloc, v4l2_req_pool_free);
-+    if (!hwfc->pool)
-+        return AVERROR(ENOMEM);
-+
-+    hwfc->free = v4l2_req_hwframe_ctx_free;
-+
-+    hwfc->initial_pool_size = 1;
-+
-+    switch (avctx->codec_id) {
-+    case AV_CODEC_ID_VP9:
-+        hwfc->initial_pool_size += 8;
-+        break;
-+    case AV_CODEC_ID_VP8:
-+        hwfc->initial_pool_size += 3;
-+        break;
-+    default:
-+        hwfc->initial_pool_size += 2;
-+    }
-+#endif
-+    av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p ctx=%p hw_frames_ctx=%p hwfc=%p pool=%p width=%d height=%d initial_pool_size=%d\n", __func__, avctx, ctx, hw_frames_ctx, hwfc, hwfc->pool, hwfc->width, hwfc->height, hwfc->initial_pool_size);
-+
-+    return 0;
-+}
-+
-+/*
-+ * Set up frame for this hwaccel: buf[0]/data[0] hold a zeroed
-+ * V4L2MediaReqDescriptor, hw_frames_ctx references the codec's frames
-+ * context and the generic decode data is attached.
-+ * Returns 0 on success or a negative AVERROR code; on failure after
-+ * buf[0] was set the frame is unreferenced again.
-+ */
-+static int alloc_frame(AVCodecContext * avctx, AVFrame *frame)
-+{
-+    int rv;
-+
-+    frame->buf[0] = v4l2_req_frame_alloc(avctx, sizeof(V4L2MediaReqDescriptor));
-+    if (!frame->buf[0])
-+        return AVERROR(ENOMEM);
-+
-+    frame->data[0] = frame->buf[0]->data;
-+
-+    // av_buffer_ref() returns NULL when out of memory; don't hand back a
-+    // hw frame that has no frames context attached.
-+    frame->hw_frames_ctx = av_buffer_ref(avctx->hw_frames_ctx);
-+    if (!frame->hw_frames_ctx) {
-+        av_log(avctx, AV_LOG_ERROR, "Failed to reference hw frames context\n");
-+        av_frame_unref(frame);
-+        return AVERROR(ENOMEM);
-+    }
-+
-+    if ((rv = ff_attach_decode_data(frame)) != 0) {
-+        av_log(avctx, AV_LOG_ERROR, "Failed to attach decode data to frame\n");
-+        av_frame_unref(frame);
-+        return rv;
-+    }
-+
-+    return 0;
-+}
-+
-+const v4l2_req_decode_fns V(ff_v4l2_req_hevc) = {   // Entry points for this HEVC_CTRLS_VERSION; V() and STR() are macros defined outside this chunk
-+    .src_pix_fmt_v4l2 = V4L2_PIX_FMT_HEVC_SLICE,
-+    .name = "V4L2 HEVC stateless V" STR(HEVC_CTRLS_VERSION),
-+    .probe = probe,                 // Initial check & init
-+    .set_controls = set_controls,   // Final init
-+
-+    .start_frame    = v4l2_request_hevc_start_frame,
-+    .decode_slice   = v4l2_request_hevc_decode_slice,
-+    .end_frame      = v4l2_request_hevc_end_frame,      // Submits the slices & fills the DRM descriptor
-+    .abort_frame    = v4l2_request_hevc_abort_frame,
-+    .frame_params   = frame_params,                     // Fills an AVHWFramesContext from the dst format
-+    .alloc_frame    = alloc_frame,                      // Attaches a V4L2MediaReqDescriptor to a frame
-+};
-+
---- /dev/null
-+++ b/libavcodec/v4l2_req_media.c
-@@ -0,0 +1,1601 @@
-+/*
-+ * Copyright (C) 2018 Paul Kocialkowski <paul.kocialkowski@bootlin.com>
-+ *
-+ * Permission is hereby granted, free of charge, to any person obtaining a
-+ * copy of this software and associated documentation files (the
-+ * "Software"), to deal in the Software without restriction, including
-+ * without limitation the rights to use, copy, modify, merge, publish,
-+ * distribute, sub license, and/or sell copies of the Software, and to
-+ * permit persons to whom the Software is furnished to do so, subject to
-+ * the following conditions:
-+ *
-+ * The above copyright notice and this permission notice (including the
-+ * next paragraph) shall be included in all copies or substantial portions
-+ * of the Software.
-+ *
-+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
-+ * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
-+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-+ */
-+
-+#include <errno.h>
-+#include <fcntl.h>
-+#include <poll.h>
-+#include <pthread.h>
-+#include <semaphore.h>
-+#include <stdatomic.h>
-+#include <stdbool.h>
-+#include <stdlib.h>
-+#include <string.h>
-+#include <unistd.h>
-+#include <linux/media.h>
-+#include <sys/ioctl.h>
-+#include <sys/select.h>
-+#include <sys/ioctl.h>
-+
-+#include <linux/videodev2.h>
-+
-+#include "v4l2_req_dmabufs.h"
-+#include "v4l2_req_media.h"
-+#include "v4l2_req_pollqueue.h"
-+#include "v4l2_req_utils.h"
-+#include "weak_link.h"
-+
-+
-+/* floor(log2(x)) */
-+static unsigned int log2_size(size_t x)
-+{
-+    unsigned int n = 0;
-+    unsigned int step;
-+
-+    /*
-+     * Binary search for the top set bit: at each step (16, 8, 4, 2, 1) if
-+     * anything is set at or above bit 'step' then count those bits and
-+     * discard the low ones.  As there is no step above 16 the result is
-+     * only floor(log2(x)) for values that fit in 32 bits.
-+     */
-+    for (step = 16; step != 0; step >>= 1) {
-+        const size_t low_bits = ((size_t)1 << step) - 1;
-+
-+        if ((x & ~low_bits) != 0) {
-+            n += step;
-+            x >>= step;
-+        }
-+    }
-+    return n;
-+}
-+
-+/*
-+ * Pick a buffer size for a request of x bytes.
-+ * With k = floor(log2(x)) the result is 3 * 2^(k-1) if x is below that,
-+ * otherwise 2^(k+1), so it is always strictly greater than x.
-+ * Any x below 256 yields 768.
-+ * The shifts are done in size_t: as plain int constants "3 << n" and
-+ * "4 << n" overflowed int for very large requests.
-+ */
-+static size_t round_up_size(const size_t x)
-+{
-+    /* Admit no size < 256 */
-+    const unsigned int n = x < 256 ? 8 : log2_size(x) - 1;
-+    const size_t three_halves = (size_t)3 << n;
-+
-+    return x >= three_halves ? (size_t)4 << n : three_halves;
-+}
-+
-+struct media_request;
-+
-+struct media_pool {
-+    int fd;                             /* Media device fd, opened from media_path in media_pool_new() */
-+    sem_t sem;                          /* Waited on in media_request_get(), posted in media_request_done(); initialised to the request count */
-+    pthread_mutex_t lock;               /* Held while free_reqs is read or changed */
-+    struct media_request * free_reqs;   /* Idle requests, singly linked through ->next */
-+    struct pollqueue * pq;              /* Poll queue given to media_pool_new() */
-+};
-+
-+struct media_request {
-+    struct media_request * next;    /* Next request on the pool's free_reqs list */
-+    struct media_pool * mp;         /* Pool this request returns to */
-+    int fd;                         /* Request fd from MEDIA_IOC_REQUEST_ALLOC; -1 until allocated */
-+    struct polltask * pt;           /* POLLPRI poll task whose callback is media_request_done() */
-+};
-+
-+
-+/*
-+ * sem_trywait() that retries when interrupted by a signal.
-+ * Returns 0 if the semaphore was taken, else -errno.
-+ */
-+static inline int do_trywait(sem_t *const sem)
-+{
-+    for (;;) {
-+        if (sem_trywait(sem) == 0)
-+            return 0;
-+        if (errno != EINTR)
-+            return -errno;
-+    }
-+}
-+
-+/*
-+ * sem_wait() that retries when interrupted by a signal.
-+ * Returns 0 once the semaphore has been taken, else -errno.
-+ */
-+static inline int do_wait(sem_t *const sem)
-+{
-+    for (;;) {
-+        if (sem_wait(sem) == 0)
-+            return 0;
-+        if (errno != EINTR)
-+            return -errno;
-+    }
-+}
-+
-+/*
-+ * Issue VIDIOC_REQBUFS on video_fd for buffers_count buffers of the given
-+ * buffer type and memory model.  Returns 0 on success, else -errno.
-+ */
-+static int request_buffers(int video_fd, unsigned int type,
-+                           enum v4l2_memory memory, unsigned int buffers_count)
-+{
-+    struct v4l2_requestbuffers reqbufs;
-+    int err;
-+
-+    memset(&reqbufs, 0, sizeof(reqbufs));
-+    reqbufs.count = buffers_count;
-+    reqbufs.type = type;
-+    reqbufs.memory = memory;
-+
-+    if (ioctl(video_fd, VIDIOC_REQBUFS, &reqbufs) >= 0)
-+        return 0;
-+
-+    /* Take a copy of errno before anything else gets a chance to change it */
-+    err = -errno;
-+    request_log("Unable to request %d type %d buffers: %s\n", buffers_count, type, strerror(-err));
-+    return err;
-+}
-+
-+
-+/*
-+ * Turn streaming on (enable) or off (!enable) for buffer type 'type' on
-+ * video_fd.  Returns 0 on success, else -errno.
-+ */
-+static int set_stream(int video_fd, unsigned int type, bool enable)
-+{
-+    enum v4l2_buf_type buf_type = type;
-+    int err;
-+
-+    if (ioctl(video_fd, enable ? VIDIOC_STREAMON : VIDIOC_STREAMOFF,
-+              &buf_type) >= 0)
-+        return 0;
-+
-+    /* Take a copy of errno before anything else gets a chance to change it */
-+    err = -errno;
-+    request_log("Unable to %sable stream: %s\n",
-+                enable ? "en" : "dis", strerror(-err));
-+    return err;
-+}
-+
-+
-+
-+/*
-+ * Take a request from the pool's free list, blocking on the pool
-+ * semaphore until one is available.  Returns NULL if the wait fails.
-+ */
-+struct media_request * media_request_get(struct media_pool * const mp)
-+{
-+    struct media_request *req;
-+
-+    /* Timeout handled by poll code */
-+    if (do_wait(&mp->sem) != 0)
-+        return NULL;
-+
-+    /* Pop the head of the free list */
-+    pthread_mutex_lock(&mp->lock);
-+    req = mp->free_reqs;
-+    if (req != NULL) {
-+        mp->free_reqs = req->next;
-+        req->next = NULL;
-+    }
-+    pthread_mutex_unlock(&mp->lock);
-+
-+    return req;
-+}
-+
-+/* Accessor: the request fd that MEDIA_IOC_REQUEST_ALLOC gave this request */
-+int media_request_fd(const struct media_request * const req)
-+{
-+    const int request_fd = req->fd;
-+
-+    return request_fd;
-+}
-+
-+/*
-+ * Queue the request (MEDIA_REQUEST_IOC_QUEUE, retried on EINTR) and then
-+ * schedule its poll task with a timeout argument of 2000.
-+ * Returns 0 on success or -errno if the queue ioctl fails.
-+ */
-+int media_request_start(struct media_request * const req)
-+{
-+    for (;;) {
-+        int err;
-+
-+        if (ioctl(req->fd, MEDIA_REQUEST_IOC_QUEUE, NULL) != -1)
-+            break;
-+
-+        err = errno;
-+        if (err == EINTR)
-+            continue;
-+
-+        request_log("%s: Failed to Q media: (%d) %s\n", __func__, err, strerror(err));
-+        return -err;
-+    }
-+
-+    pollqueue_add_task(req->pt, 2000);
-+    return 0;
-+}
-+
-+/*
-+ * Poll-task callback (also called directly by media_request_abort()):
-+ * re-initialise the request and push it back onto its pool's free list,
-+ * then post the pool semaphore.  revents is not looked at.
-+ */
-+static void media_request_done(void *v, short revents)
-+{
-+    struct media_request *const req = v;
-+    struct media_pool *const pool = req->mp;
-+
-+    /* ** Not sure what to do about timeout */
-+
-+    /* A reinit failure is logged but the request is recycled regardless */
-+    if (ioctl(req->fd, MEDIA_REQUEST_IOC_REINIT, NULL) < 0) {
-+        request_log("Unable to reinit media request: %s\n",
-+                    strerror(errno));
-+    }
-+
-+    pthread_mutex_lock(&pool->lock);
-+    req->next = pool->free_reqs;
-+    pool->free_reqs = req;
-+    pthread_mutex_unlock(&pool->lock);
-+
-+    sem_post(&pool->sem);
-+}
-+
-+/*
-+ * Give an unused request back to its pool.  *preq is set to NULL; a NULL
-+ * *preq is a no-op.  Always returns 0.
-+ */
-+int media_request_abort(struct media_request ** const preq)
-+{
-+    struct media_request * const req = *preq;
-+
-+    if (req != NULL) {
-+        *preq = NULL;
-+        /* Same recycling path as a completed request */
-+        media_request_done(req, 0);
-+    }
-+    return 0;
-+}
-+
-+/*
-+ * Free every request on a ->next linked chain: delete its poll task (if
-+ * any), close its fd (if allocated) and free the request itself.
-+ */
-+static void delete_req_chain(struct media_request * const chain)
-+{
-+    struct media_request * req = chain;
-+
-+    while (req != NULL) {
-+        /* Grab the link before the node goes away */
-+        struct media_request * const following = req->next;
-+
-+        if (req->pt)
-+            polltask_delete(&req->pt);
-+        if (req->fd != -1)
-+            close(req->fd);
-+        free(req);
-+
-+        req = following;
-+    }
-+}
-+
-+/*
-+ * Create a pool of n media requests on the media device at media_path.
-+ * Each request gets its own request fd and a POLLPRI poll task on pq that
-+ * recycles it via media_request_done().
-+ * Returns the new pool, or NULL on any failure with everything that had
-+ * been set up released again.
-+ */
-+struct media_pool * media_pool_new(const char * const media_path,
-+                   struct pollqueue * const pq,
-+                   const unsigned int n)
-+{
-+    struct media_pool * const mp = calloc(1, sizeof(*mp));
-+    unsigned int i;
-+
-+    if (!mp)
-+        goto fail0;
-+
-+    mp->pq = pq;
-+    pthread_mutex_init(&mp->lock, NULL);
-+    mp->fd = open(media_path, O_RDWR | O_NONBLOCK);
-+    if (mp->fd == -1) {
-+        request_log("Failed to open '%s': %s\n", media_path, strerror(errno));
-+        goto fail1;
-+    }
-+
-+    for (i = 0; i != n; ++i) {
-+        struct media_request * req = malloc(sizeof(*req));
-+        if (!req)
-+            goto fail4;
-+
-+        /* Link in at once so that delete_req_chain() cleans it up on failure */
-+        *req = (struct media_request){
-+            .next = mp->free_reqs,
-+            .mp = mp,
-+            .fd = -1
-+        };
-+        mp->free_reqs = req;
-+
-+        if (ioctl(mp->fd, MEDIA_IOC_REQUEST_ALLOC, &req->fd) == -1) {
-+            request_log("Failed to alloc request %d: %s\n", i, strerror(errno));
-+            goto fail4;
-+        }
-+
-+        req->pt = polltask_new(pq, req->fd, POLLPRI, media_request_done, req);
-+        if (!req->pt)
-+            goto fail4;
-+    }
-+
-+    /* One count per free request; a pool without a working semaphore is unusable */
-+    if (sem_init(&mp->sem, 0, n) != 0) {
-+        request_log("Failed to init request semaphore: %s\n", strerror(errno));
-+        goto fail4;
-+    }
-+
-+    return mp;
-+
-+fail4:
-+    delete_req_chain(mp->free_reqs);
-+    close(mp->fd);
-+fail1:
-+    /* The mutex is initialised before open() so the open-failure path must destroy it too */
-+    pthread_mutex_destroy(&mp->lock);
-+    free(mp);
-+fail0:
-+    return NULL;
-+}
-+
-+/*
-+ * Destroy a pool made by media_pool_new() and set *pMp to NULL.
-+ * A NULL *pMp is a no-op.  Only requests currently on the free list are
-+ * released.
-+ */
-+void media_pool_delete(struct media_pool ** pMp)
-+{
-+    struct media_pool * const pool = *pMp;
-+
-+    if (pool != NULL) {
-+        *pMp = NULL;
-+
-+        delete_req_chain(pool->free_reqs);
-+        close(pool->fd);
-+        sem_destroy(&pool->sem);
-+        pthread_mutex_destroy(&pool->lock);
-+        free(pool);
-+    }
-+}
-+
-+
-+#define INDEX_UNSET (~(uint32_t)0)  /* All-ones value given to qent_base.index by QENT_BASE_INITIALIZER */
-+
-+enum qent_status {
-+    QENT_NEW = 0,       // Initial state - shouldn't last
-+    QENT_FREE,          // On free chain (set by queue_put_free)
-+    QENT_PENDING,       // User has ent
-+    QENT_WAITING,       // On inuse list (set by queue_put_inuse)
-+    QENT_DONE,          // Dequeued without V4L2_BUF_FLAG_ERROR (set by qe_dequeue)
-+    QENT_ERROR,         // Dequeued with V4L2_BUF_FLAG_ERROR (set by qe_dequeue)
-+    QENT_IMPORT         // NOTE(review): not set by any code visible in this chunk - meaning to be confirmed
-+};
-+
-+struct qent_base {
-+    atomic_int ref_count;                   /* Starts at 0 (QENT_BASE_INITIALIZER) */
-+    struct qent_base *next;                 /* List links used by ql_add_tail() / ql_extract() */
-+    struct qent_base *prev;
-+    enum qent_status status;                /* Where the entry currently is - see enum qent_status */
-+    uint32_t index;                         /* Sent as v4l2_buffer.index when queued; INDEX_UNSET initially */
-+    struct dmabuf_h *dh[VIDEO_MAX_PLANES];  /* One dmabuf handle per plane; unused slots are NULL */
-+    struct timeval timestamp;               /* Copied to/from v4l2_buffer.timestamp on queue/dequeue */
-+};
-+
-+struct qent_src {
-+    struct qent_base base;  /* Must stay first: base_to_src() casts between the two */
-+    int fixed_size;         /* Non-zero stops qent_src_data_copy() from reallocating the buffer */
-+};
-+
-+struct qent_dst {
-+    struct qent_base base;                  /* Must stay first: base_to_dst() casts between the two */
-+    bool waiting;                           /* Set by qe_dst_waiting(), cleared by qe_dst_done(); guarded by lock */
-+    pthread_mutex_t lock;                   /* Protects waiting */
-+    pthread_cond_t cond;                    /* Broadcast by qe_dst_done() when waiting is cleared */
-+    struct ff_weak_link_client * mbc_wl;    /* Weak link taken in qe_dst_new(), dropped in qe_dst_free() */
-+};
-+
-+struct qe_list_head {
-+    struct qent_base *head; /* First entry, NULL when the list is empty */
-+    struct qent_base *tail; /* Last entry, NULL when the list is empty */
-+};
-+
-+struct buf_pool {
-+    pthread_mutex_t lock;           /* Taken by the queue_*() functions around list changes */
-+    sem_t free_sem;                 /* Posted once per entry put on the free list */
-+    enum v4l2_buf_type buf_type;    /* NOTE(review): not assigned by any code visible in this chunk */
-+    struct qe_list_head free;       /* Entries available to queue_get_free() / queue_tryget_free() */
-+    struct qe_list_head inuse;      /* Entries queued to the driver, waiting for dequeue */
-+};
-+
-+
-+/* Downcast: valid because base is the first member of struct qent_dst */
-+static inline struct qent_dst *base_to_dst(struct qent_base *be)
-+{
-+    struct qent_dst *const be_dst = (struct qent_dst *)be;
-+
-+    return be_dst;
-+}
-+
-+/* Downcast: valid because base is the first member of struct qent_src */
-+static inline struct qent_src *base_to_src(struct qent_base *be)
-+{
-+    struct qent_src *const be_src = (struct qent_src *)be;
-+
-+    return be_src;
-+}
-+
-+
-+#define QENT_BASE_INITIALIZER { /* Initial qent_base: members not named here are zeroed */\
-+    .ref_count = ATOMIC_VAR_INIT(0),\
-+    .status = QENT_NEW,\
-+    .index  = INDEX_UNSET\
-+}
-+
-+/* Release every plane's dmabuf handle and leave the slots NULL */
-+static void qe_base_uninit(struct qent_base *const be)
-+{
-+    struct dmabuf_h **const end = be->dh + VIDEO_MAX_PLANES;
-+    struct dmabuf_h **slot;
-+
-+    for (slot = be->dh; slot != end; ++slot) {
-+        dmabuf_free(*slot);
-+        *slot = NULL;
-+    }
-+}
-+
-+/* Free a source entry and its dmabufs; NULL is accepted and ignored */
-+static void qe_src_free(struct qent_src *const be_src)
-+{
-+    if (be_src != NULL) {
-+        qe_base_uninit(&be_src->base);
-+        free(be_src);
-+    }
-+}
-+
-+/*
-+ * Allocate a source entry with a freshly initialised base and no buffers.
-+ * Returns NULL if out of memory.
-+ */
-+static struct qent_src * qe_src_new(void)
-+{
-+    struct qent_src *const be_src = malloc(sizeof(*be_src));
-+
-+    if (be_src != NULL) {
-+        *be_src = (struct qent_src){
-+            .base = QENT_BASE_INITIALIZER
-+        };
-+    }
-+    return be_src;
-+}
-+
-+/*
-+ * Free a destination entry: drop its weak link, destroy its cond & mutex,
-+ * release its dmabufs and free it.  NULL is accepted and ignored.
-+ */
-+static void qe_dst_free(struct qent_dst *const be_dst)
-+{
-+    if (be_dst != NULL) {
-+        ff_weak_link_unref(&be_dst->mbc_wl);
-+        pthread_cond_destroy(&be_dst->cond);
-+        pthread_mutex_destroy(&be_dst->lock);
-+        qe_base_uninit(&be_dst->base);
-+        free(be_dst);
-+    }
-+}
-+
-+/*
-+ * Allocate a destination entry with a freshly initialised base, statically
-+ * initialised mutex & cond and a new client reference on the weak link wl.
-+ * Returns NULL if out of memory.
-+ */
-+static struct qent_dst* qe_dst_new(struct ff_weak_link_master * const wl)
-+{
-+    struct qent_dst *const be_dst = malloc(sizeof(*be_dst));
-+
-+    if (be_dst != NULL) {
-+        *be_dst = (struct qent_dst){
-+            .base = QENT_BASE_INITIALIZER,
-+            .lock = PTHREAD_MUTEX_INITIALIZER,
-+            .cond = PTHREAD_COND_INITIALIZER,
-+            .mbc_wl = ff_weak_link_ref(wl)
-+        };
-+    }
-+    return be_dst;
-+}
-+
-+/* Append be to the end of list ql.  No locking is done here. */
-+static void ql_add_tail(struct qe_list_head * const ql, struct qent_base * be)
-+{
-+    struct qent_base * const old_tail = ql->tail;
-+
-+    if (old_tail == NULL)
-+        ql->head = be;          /* List was empty so be is also the head */
-+    else
-+        old_tail->next = be;
-+
-+    be->prev = old_tail;
-+    be->next = NULL;
-+    ql->tail = be;
-+}
-+
-+/*
-+ * Unlink be from list ql and clear its links.  be must be on ql (or be
-+ * NULL, in which case NULL is returned).  Returns be.  No locking is
-+ * done here.
-+ */
-+static struct qent_base * ql_extract(struct qe_list_head * const ql, struct qent_base * be)
-+{
-+    if (be != NULL) {
-+        /* Fix up the forward neighbour, or the tail if be was last */
-+        if (be->next == NULL)
-+            ql->tail = be->prev;
-+        else
-+            be->next->prev = be->prev;
-+
-+        /* Fix up the backward neighbour, or the head if be was first */
-+        if (be->prev == NULL)
-+            ql->head = be->next;
-+        else
-+            be->prev->next = be->next;
-+
-+        be->next = NULL;
-+        be->prev = NULL;
-+    }
-+    return be;
-+}
-+
-+
-+/* Append be to the pool's free list.  No locking is done here. */
-+static void bq_put_free(struct buf_pool *const bp, struct qent_base * be)
-+{
-+    struct qe_list_head *const free_list = &bp->free;
-+
-+    ql_add_tail(free_list, be);
-+}
-+
-+/* Remove & return the first free entry, NULL if there is none.  No locking is done here. */
-+static struct qent_base * bq_get_free(struct buf_pool *const bp)
-+{
-+    struct qe_list_head *const free_list = &bp->free;
-+
-+    return ql_extract(free_list, free_list->head);
-+}
-+
-+/* Remove a specific entry from the inuse list & return it.  No locking is done here. */
-+static struct qent_base * bq_extract_inuse(struct buf_pool *const bp, struct qent_base *const be)
-+{
-+    struct qe_list_head *const inuse_list = &bp->inuse;
-+
-+    return ql_extract(inuse_list, be);
-+}
-+
-+/* Remove & return the first inuse entry, NULL if there is none.  No locking is done here. */
-+static struct qent_base * bq_get_inuse(struct buf_pool *const bp)
-+{
-+    struct qe_list_head *const inuse_list = &bp->inuse;
-+
-+    return ql_extract(inuse_list, inuse_list->head);
-+}
-+
-+/* Drain the free list of a source pool, freeing each entry */
-+static void bq_free_all_free_src(struct buf_pool *const bp)
-+{
-+    for (;;) {
-+        struct qent_base *const be = bq_get_free(bp);
-+
-+        if (be == NULL)
-+            break;
-+        qe_src_free(base_to_src(be));
-+    }
-+}
-+
-+/* Drain the inuse list of a source pool, freeing each entry */
-+static void bq_free_all_inuse_src(struct buf_pool *const bp)
-+{
-+    for (;;) {
-+        struct qent_base *const be = bq_get_inuse(bp);
-+
-+        if (be == NULL)
-+            break;
-+        qe_src_free(base_to_src(be));
-+    }
-+}
-+
-+/* Drain the free list of a destination pool, freeing each entry */
-+static void bq_free_all_free_dst(struct buf_pool *const bp)
-+{
-+    for (;;) {
-+        struct qent_base *const be = bq_get_free(bp);
-+
-+        if (be == NULL)
-+            break;
-+        qe_dst_free(base_to_dst(be));
-+    }
-+}
-+
-+/*
-+ * Return an entry to the pool's free list: its timestamp and the data
-+ * length of each attached dmabuf are reset, status becomes QENT_FREE and
-+ * the free semaphore is posted once the pool lock has been dropped.
-+ */
-+static void queue_put_free(struct buf_pool *const bp, struct qent_base *be)
-+{
-+    unsigned int plane = 0;
-+
-+    pthread_mutex_lock(&bp->lock);
-+
-+    /* Clear out state vars */
-+    be->timestamp.tv_sec = 0;
-+    be->timestamp.tv_usec = 0;
-+    be->status = QENT_FREE;
-+
-+    /* Planes are filled from 0 so stop at the first empty slot */
-+    while (plane < VIDEO_MAX_PLANES && be->dh[plane]) {
-+        dmabuf_len_set(be->dh[plane], 0);
-+        ++plane;
-+    }
-+
-+    bq_put_free(bp, be);
-+    pthread_mutex_unlock(&bp->lock);
-+
-+    sem_post(&bp->free_sem);
-+}
-+
-+/* True if the inuse list has at least one entry.  The pool lock is not taken. */
-+static bool queue_is_inuse(const struct buf_pool *const bp)
-+{
-+    const struct qent_base *const last = bp->inuse.tail;
-+
-+    return last != NULL;
-+}
-+
-+/*
-+ * Append be to the pool's inuse list under the pool lock and mark it
-+ * QENT_WAITING.  A NULL be is ignored.
-+ */
-+static void queue_put_inuse(struct buf_pool *const bp, struct qent_base *be)
-+{
-+    if (be != NULL) {
-+        pthread_mutex_lock(&bp->lock);
-+        ql_add_tail(&bp->inuse, be);
-+        be->status = QENT_WAITING;
-+        pthread_mutex_unlock(&bp->lock);
-+    }
-+}
-+
-+/*
-+ * Take an entry from the free list, blocking on the free semaphore until
-+ * one has been posted.  Returns NULL if the wait fails.
-+ */
-+static struct qent_base *queue_get_free(struct buf_pool *const bp)
-+{
-+    struct qent_base *be = NULL;
-+
-+    if (do_wait(&bp->free_sem) == 0) {
-+        pthread_mutex_lock(&bp->lock);
-+        be = bq_get_free(bp);
-+        pthread_mutex_unlock(&bp->lock);
-+    }
-+    return be;
-+}
-+
-+/*
-+ * Non-blocking version of queue_get_free(): returns NULL at once if the
-+ * free semaphore cannot be taken.
-+ */
-+static struct qent_base *queue_tryget_free(struct buf_pool *const bp)
-+{
-+    struct qent_base *be = NULL;
-+
-+    if (do_trywait(&bp->free_sem) == 0) {
-+        pthread_mutex_lock(&bp->lock);
-+        be = bq_get_free(bp);
-+        pthread_mutex_unlock(&bp->lock);
-+    }
-+    return be;
-+}
-+
-+/*
-+ * Find the inuse entry whose plane-0 dmabuf fd is fd, unlink it from the
-+ * inuse list and return it.  Returns NULL if no entry matches.
-+ */
-+static struct qent_base * queue_find_extract_fd(struct buf_pool *const bp, const int fd)
-+{
-+    struct qent_base *be;
-+
-+    pthread_mutex_lock(&bp->lock);
-+
-+    /* Expect 1st in Q, but allow anywhere */
-+    be = bp->inuse.head;
-+    while (be != NULL && dmabuf_fd(be->dh[0]) != fd)
-+        be = be->next;
-+
-+    if (be != NULL)
-+        bq_extract_inuse(bp, be);
-+
-+    pthread_mutex_unlock(&bp->lock);
-+
-+    return be;
-+}
-+
-+/*
-+ * Destroy the pool's semaphore & mutex and free the pool structure.
-+ * Entries still on its lists are not touched here.
-+ */
-+static void queue_delete(struct buf_pool *const bp)
-+{
-+    sem_t *const free_sem = &bp->free_sem;
-+    pthread_mutex_t *const lock = &bp->lock;
-+
-+    sem_destroy(free_sem);
-+    pthread_mutex_destroy(lock);
-+    free(bp);
-+}
-+
-+/*
-+ * Allocate an empty, zeroed buffer pool with its lock initialised and a
-+ * free semaphore count of 0.  vfd is not used.
-+ * Returns NULL if out of memory.
-+ */
-+static struct buf_pool* queue_new(const int vfd)
-+{
-+    struct buf_pool *const bp = calloc(1, sizeof(*bp));
-+
-+    if (bp != NULL) {
-+        pthread_mutex_init(&bp->lock, NULL);
-+        sem_init(&bp->free_sem, 0, 0);
-+    }
-+    return bp;
-+}
-+
-+
-+struct mediabufs_ctl {
-+    atomic_int ref_count;  /* 0 is single ref for easier atomics */
-+    void * dc;                  // Context handed to request_err() / request_info() when logging
-+    int vfd;                    // Video device fd used for VIDIOC_QBUF / VIDIOC_DQBUF
-+    bool stream_on;             // NOTE(review): not referenced by any code visible in this chunk
-+    bool polling;               // True while pt has been scheduled on the poll queue; changed with lock held
-+    bool dst_fixed;             // Dst Q is fixed size
-+    pthread_mutex_t lock;       // Held by mediabufs_poll_cb() & mediabufs_start_request() around queue/dequeue
-+    struct buf_pool * src;      // Pool of source (bitstream) entries
-+    struct buf_pool * dst;      // Pool of destination (frame) entries
-+    struct polltask * pt;       // Poll task re-armed while either pool has inuse entries
-+    struct pollqueue * pq;
-+    struct ff_weak_link_master * this_wlm;  // NOTE(review): presumably the master for qent_dst.mbc_wl - confirm
-+
-+    struct v4l2_format src_fmt; // Format passed to qe_v4l2_queue() / qe_dequeue() for src buffers
-+    struct v4l2_format dst_fmt; // Format passed to qe_v4l2_queue() / qe_dequeue() for dst buffers
-+    struct v4l2_capability capability;
-+};
-+
-+/*
-+ * Queue entry be to the driver with VIDIOC_QBUF as a DMABUF buffer of the
-+ * type given by fmt.
-+ *  - is_dst: the entry is a destination buffer; its dmabuf data lengths
-+ *    and its timestamp are zeroed before queuing.
-+ *  - mreq: for source buffers only, the media request to attach (may be
-+ *    NULL for none); hold_flag then also sets
-+ *    V4L2_BUF_FLAG_M2M_HOLD_CAPTURE_BUF.
-+ * Returns 0 on success or -errno from the ioctl (EINTR is retried).
-+ */
-+static int qe_v4l2_queue(struct qent_base *const be,
-+               const int vfd, struct media_request *const mreq,
-+               const struct v4l2_format *const fmt,
-+               const bool is_dst, const bool hold_flag)
-+{
-+    struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}};
-+    struct v4l2_buffer buffer = {
-+        .type = fmt->type,
-+        .memory = V4L2_MEMORY_DMABUF,
-+        .index = be->index
-+    };
-+
-+    if (!V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
-+        /* Single planar: everything is described by the buffer itself */
-+        struct dmabuf_h *const dh = be->dh[0];
-+
-+        if (is_dst)
-+            dmabuf_len_set(dh, 0);
-+
-+        buffer.bytesused = dmabuf_len(dh);
-+        buffer.length = dmabuf_size(dh);
-+        buffer.m.fd = dmabuf_fd(dh);
-+    }
-+    else {
-+        /* Multi planar: one v4l2_plane per attached dmabuf */
-+        unsigned int n = 0;
-+
-+        while (n < VIDEO_MAX_PLANES && be->dh[n]) {
-+            struct dmabuf_h *const dh = be->dh[n];
-+
-+            if (is_dst)
-+                dmabuf_len_set(dh, 0);
-+
-+            /* *** Really need a pixdesc rather than a format so we can fill in data_offset */
-+            planes[n].length = dmabuf_size(dh);
-+            planes[n].bytesused = dmabuf_len(dh);
-+            planes[n].m.fd = dmabuf_fd(dh);
-+            ++n;
-+        }
-+        buffer.m.planes = planes;
-+        buffer.length = n;
-+    }
-+
-+    /* Only source buffers are ever tied to a media request */
-+    if (mreq != NULL && !is_dst) {
-+        buffer.flags |= V4L2_BUF_FLAG_REQUEST_FD;
-+        buffer.request_fd = media_request_fd(mreq);
-+        if (hold_flag)
-+            buffer.flags |= V4L2_BUF_FLAG_M2M_HOLD_CAPTURE_BUF;
-+    }
-+
-+    if (is_dst)
-+        be->timestamp = (struct timeval){0,0};
-+
-+    buffer.timestamp = be->timestamp;
-+
-+    for (;;) {
-+        int err;
-+
-+        if (ioctl(vfd, VIDIOC_QBUF, &buffer) == 0)
-+            return 0;
-+
-+        err = errno;
-+        if (err == EINTR)
-+            continue;
-+
-+        request_log("%s: Failed to Q buffer: err=%d (%s)\n", __func__, err, strerror(err));
-+        return -err;
-+    }
-+}
-+
-+/*
-+ * Dequeue one buffer of format f's type from the driver (VIDIOC_DQBUF,
-+ * retried on EINTR), find the matching entry on bp's inuse list by its
-+ * plane-0 dmabuf fd and remove it from that list.
-+ * The entry gets the driver's timestamp and a status of QENT_ERROR or
-+ * QENT_DONE depending on V4L2_BUF_FLAG_ERROR.
-+ * Returns NULL if the ioctl fails or no entry matches.
-+ */
-+static struct qent_base * qe_dequeue(struct buf_pool *const bp,
-+                     const int vfd,
-+                     const struct v4l2_format * const f)
-+{
-+    const bool is_mplane = V4L2_TYPE_IS_MULTIPLANAR(f->type);
-+    struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}};
-+    struct v4l2_buffer buffer = {
-+        .type =  f->type,
-+        .memory = V4L2_MEMORY_DMABUF
-+    };
-+    struct qent_base *be;
-+    int buf_fd;
-+    int rc;
-+
-+    if (is_mplane) {
-+        buffer.length = f->fmt.pix_mp.num_planes;
-+        buffer.m.planes = planes;
-+    }
-+
-+    do {
-+        rc = ioctl(vfd, VIDIOC_DQBUF, &buffer);
-+    } while (rc != 0 && errno == EINTR);
-+
-+    if (rc != 0) {
-+        request_log("Error DQing buffer type %d: %s\n", f->type, strerror(errno));
-+        return NULL;
-+    }
-+
-+    /* The fd comes back in the plane array for multi-planar types */
-+    buf_fd = is_mplane ? planes[0].m.fd : buffer.m.fd;
-+
-+    be = queue_find_extract_fd(bp, buf_fd);
-+    if (be == NULL) {
-+        request_log("Failed to find fd %d in Q\n", buf_fd);
-+        return NULL;
-+    }
-+
-+    be->timestamp = buffer.timestamp;
-+    if (buffer.flags & V4L2_BUF_FLAG_ERROR)
-+        be->status = QENT_ERROR;
-+    else
-+        be->status = QENT_DONE;
-+    return be;
-+}
-+
-+/*
-+ * Mark a destination entry as no longer waiting, wake everything blocked
-+ * on its condition variable and then drop one reference to it.
-+ */
-+static void qe_dst_done(struct qent_dst * dst_be)
-+{
-+    pthread_mutex_t *const lock = &dst_be->lock;
-+
-+    pthread_mutex_lock(lock);
-+    dst_be->waiting = false;
-+    pthread_cond_broadcast(&dst_be->cond);
-+    pthread_mutex_unlock(lock);
-+
-+    /* Drop a reference now that the entry has been signalled */
-+    qent_dst_unref(&dst_be);
-+}
-+
-+/*
-+ * Test-and-set of the waiting flag under the entry's lock: the flag is
-+ * left true and its previous value is returned.
-+ */
-+static bool qe_dst_waiting(struct qent_dst *const dst_be)
-+{
-+    bool was_waiting;
-+
-+    pthread_mutex_lock(&dst_be->lock);
-+    was_waiting = dst_be->waiting;
-+    dst_be->waiting = true;
-+    pthread_mutex_unlock(&dst_be->lock);
-+
-+    return was_waiting;
-+}
-+
-+
-+/* Polling is wanted while either pool still has entries on its inuse list */
-+static bool mediabufs_wants_poll(const struct mediabufs_ctl *const mbc)
-+{
-+    if (queue_is_inuse(mbc->src))
-+        return true;
-+    return queue_is_inuse(mbc->dst);
-+}
-+
-+/*
-+ * Poll-task callback for the video fd.  revents == 0 is reported as a
-+ * timeout.  With the control lock held a finished source buffer (POLLOUT)
-+ * and/or destination buffer (POLLIN) is dequeued and the task is re-armed
-+ * if buffers are still outstanding.  After the lock is dropped the source
-+ * entry goes back to the free list and the destination entry is
-+ * signalled as done.
-+ */
-+static void mediabufs_poll_cb(void * v, short revents)
-+{
-+    struct mediabufs_ctl *const mbc = v;
-+    struct qent_base *src_done = NULL;
-+    struct qent_base *dst_done = NULL;
-+
-+    if (revents == 0)
-+        request_err(mbc->dc, "%s: Timeout\n", __func__);
-+
-+    pthread_mutex_lock(&mbc->lock);
-+    mbc->polling = false;
-+
-+    if (revents & POLLOUT)
-+        src_done = qe_dequeue(mbc->src, mbc->vfd, &mbc->src_fmt);
-+    if (revents & POLLIN)
-+        dst_done = qe_dequeue(mbc->dst, mbc->vfd, &mbc->dst_fmt);
-+
-+    /* Reschedule */
-+    if (mediabufs_wants_poll(mbc)) {
-+        mbc->polling = true;
-+        pollqueue_add_task(mbc->pt, 2000);
-+    }
-+    pthread_mutex_unlock(&mbc->lock);
-+
-+    /* Hand the buffers on only once the control lock has been released */
-+    if (src_done != NULL)
-+        queue_put_free(mbc->src, src_done);
-+    if (dst_done != NULL)
-+        qe_dst_done(base_to_dst(dst_done));
-+}
-+
-+/*
-+ * Store *timestamp in the source entry; it is copied into the
-+ * v4l2_buffer when the entry is queued.  Always returns 0.
-+ */
-+int qent_src_params_set(struct qent_src *const be_src, const struct timeval * timestamp)
-+{
-+    be_src->base.timestamp = *timestamp;
-+    return 0;
-+}
-+
-+/* Accessor: a copy of the timestamp currently held by the destination entry */
-+struct timeval qent_dst_timestamp_get(const struct qent_dst *const be_dst)
-+{
-+    const struct qent_base *const be = &be_dst->base;
-+
-+    return be->timestamp;
-+}
-+
-+/*
-+ * Make sure plane 0 of be has a dmabuf of at least len bytes.
-+ * If there is no buffer yet, or it is too small, a new size is chosen
-+ * with round_up_size() and the buffer is (re)allocated through dbsc.
-+ * dbsc == NULL means "growing is not allowed" and is reported as -ENOMEM.
-+ * Returns 0 if the buffer is big enough on exit, else -ENOMEM.
-+ */
-+static int qent_base_realloc(struct qent_base *const be, const size_t len, struct dmabufs_ctl * dbsc)
-+{
-+    /* Only ask for the size of a buffer that exists; "no buffer" counts as size 0 */
-+    const size_t cur_size = be->dh[0] ? dmabuf_size(be->dh[0]) : 0;
-+
-+    if (!be->dh[0] || len > cur_size) {
-+        size_t newsize = round_up_size(len);
-+        /* size_t values need %zu (%zd is for the signed counterpart) */
-+        request_log("%s: Overrun %zu > %zu; trying %zu\n", __func__, len, cur_size, newsize);
-+        if (!dbsc) {
-+            request_log("%s: No dmbabuf_ctrl for realloc\n", __func__);
-+            return -ENOMEM;
-+        }
-+        if ((be->dh[0] = dmabuf_realloc(dbsc, be->dh[0], newsize)) == NULL) {
-+            request_log("%s: Realloc %zu failed\n", __func__, newsize);
-+            return -ENOMEM;
-+        }
-+    }
-+    return 0;
-+}
-+
-+/*
-+ * Ensure the source entry has a plane-0 buffer of at least len bytes,
-+ * allocating through dbsc if need be.  Returns 0 or -ENOMEM.
-+ */
-+int qent_src_alloc(struct qent_src *const be_src, const size_t len, struct dmabufs_ctl * dbsc)
-+{
-+    return qent_base_realloc(&be_src->base, len, dbsc);
-+}
-+
-+
-+/*
-+ * Copy len bytes from src into the source entry's plane-0 buffer at
-+ * offset, growing the buffer first if that is permitted.
-+ * The buffer is only grown when offset is 0 and the entry is not marked
-+ * fixed_size, as a realloc does not keep the existing contents.
-+ * The dmabuf data length is set to len.
-+ * Returns 0 on success, a negative code from the realloc step, or -1 if
-+ * the buffer cannot be mapped.
-+ */
-+int qent_src_data_copy(struct qent_src *const be_src, const size_t offset, const void *const src, const size_t len, struct dmabufs_ctl * dbsc)
-+{
-+    void * dst;
-+    struct qent_base *const be = &be_src->base;
-+    int rv;
-+
-+    // Realloc doesn't copy so don't alloc if offset != 0
-+    if ((rv = qent_base_realloc(be, offset + len,
-+                                be_src->fixed_size || offset ? NULL : dbsc)) != 0)
-+        return rv;
-+
-+    dmabuf_write_start(be->dh[0]);
-+    dst = dmabuf_map(be->dh[0]);
-+    if (!dst) {
-+        // Close the write bracket opened above before bailing out.
-+        // NOTE(review): assumes write_start/write_end must be paired - confirm against the dmabufs helper
-+        dmabuf_write_end(be->dh[0]);
-+        return -1;
-+    }
-+    memcpy((char*)dst + offset, src, len);
-+    dmabuf_len_set(be->dh[0], len);
-+    dmabuf_write_end(be->dh[0]);
-+    return 0;
-+}
-+
-+/*
-+ * Fetch the dmabuf handle attached to one plane of a destination entry.
-+ * Returns NULL when plane is beyond the end of the handle array; otherwise
-+ * returns whatever is stored in that slot.
-+ */
-+const struct dmabuf_h * qent_dst_dmabuf(const struct qent_dst *const be_dst, unsigned int plane)
-+{
-+    if (plane >= sizeof(be_dst->base.dh)/sizeof(be_dst->base.dh[0]))
-+        return NULL;
-+
-+    return be_dst->base.dh[plane];
-+}
-+
-+/*
-+ * Duplicate the file descriptor behind one plane of a destination entry.
-+ *
-+ * Returns the new descriptor (the caller must close it), or -1 if the plane
-+ * index is out of range, the plane has no buffer attached, or dup() fails.
-+ */
-+int qent_dst_dup_fd(const struct qent_dst *const be_dst, unsigned int plane)
-+{
-+    const struct dmabuf_h *const dh = qent_dst_dmabuf(be_dst, plane);
-+
-+    /* qent_dst_dmabuf() yields NULL for a bad or empty plane - don't pass that on */
-+    if (!dh)
-+        return -1;
-+
-+    return dup(dmabuf_fd(dh));
-+}
-+
-+/*
-+ * Queue a source buffer (and optionally a destination buffer) to the V4L2
-+ * device and start the associated media request.
-+ *
-+ * mbc       buffer controller
-+ * pmreq     in/out: the media request; *pmreq is set to NULL on every path
-+ * psrc_be   in/out: the source entry; *psrc_be is set to NULL on every path
-+ * dst_be    destination entry to queue, or NULL to queue only the source
-+ * is_final  inverted and forwarded as the last argument of the source
-+ *           qe_v4l2_queue() call
-+ *
-+ * Returns MEDIABUFS_STATUS_SUCCESS, or MEDIABUFS_ERROR_OPERATION_FAILED if
-+ * anything could not be queued or the request could not be started.
-+ */
-+MediaBufsStatus mediabufs_start_request(struct mediabufs_ctl *const mbc,
-+                struct media_request **const pmreq,
-+                struct qent_src **const psrc_be,
-+                struct qent_dst *const dst_be,
-+                const bool is_final)
-+{
-+    struct media_request * mreq = *pmreq;
-+    struct qent_src *const src_be = *psrc_be;
-+
-+    // Req & src are always both "consumed"
-+    *pmreq = NULL;
-+    *psrc_be = NULL;
-+
-+    pthread_mutex_lock(&mbc->lock);
-+
-+    // Nothing to submit without a source buffer
-+    if (!src_be)
-+        goto fail1;
-+
-+    if (dst_be) {
-+        // A dst entry that is still marked as waiting cannot be queued again
-+        if (qe_dst_waiting(dst_be)) {
-+            request_info(mbc->dc, "Request buffer already waiting on start\n");
-+            goto fail1;
-+        }
-+        dst_be->base.timestamp = (struct timeval){0,0};
-+        if (qe_v4l2_queue(&dst_be->base, mbc->vfd, NULL, &mbc->dst_fmt, true, false))
-+            goto fail1;
-+
-+        // Take an extra reference for as long as the entry sits on the in-use Q
-+        qent_dst_ref(dst_be);
-+        queue_put_inuse(mbc->dst, &dst_be->base);
-+    }
-+
-+    if (qe_v4l2_queue(&src_be->base, mbc->vfd, mreq, &mbc->src_fmt, false, !is_final))
-+        goto fail1;
-+    queue_put_inuse(mbc->src, &src_be->base);
-+
-+    // Arm the poll task only if it is not already armed and there is work for it
-+    if (!mbc->polling && mediabufs_wants_poll(mbc)) {
-+        mbc->polling = true;
-+        pollqueue_add_task(mbc->pt, 2000);  // 2000: timeout, presumably ms - TODO confirm
-+    }
-+    pthread_mutex_unlock(&mbc->lock);
-+
-+    // NOTE(review): if this fails the buffers queued above are not unwound here
-+    if (media_request_start(mreq))
-+        return MEDIABUFS_ERROR_OPERATION_FAILED;
-+
-+    return MEDIABUFS_STATUS_SUCCESS;
-+
-+fail1:
-+    // Reached with mbc->lock held
-+    media_request_abort(&mreq);
-+    if (src_be)
-+        queue_put_free(mbc->src, &src_be->base);
-+
-+// *** TODO: If src Q fails this doesnt unwind properly - separate dst Q from src Q
-+    if (dst_be) {
-+        dst_be->base.status = QENT_ERROR;
-+        qe_dst_done(dst_be);
-+    }
-+    pthread_mutex_unlock(&mbc->lock);
-+    return MEDIABUFS_ERROR_OPERATION_FAILED;
-+}
-+
-+
-+/*
-+ * (Re)allocate the dmabufs of a queue entry to the sizes given by a V4L2
-+ * format: one buffer per plane for multi-planar types, a single buffer of
-+ * pix.sizeimage bytes otherwise.
-+ *
-+ * Returns 0 on success, -1 on allocation failure.  On a multi-planar
-+ * failure the planes before the failing one are freed and cleared.
-+ */
-+static int qe_alloc_from_fmt(struct qent_base *const be,
-+                   struct dmabufs_ctl *const dbsc,
-+                   const struct v4l2_format *const fmt)
-+{
-+    unsigned int plane;
-+
-+    /* Single-planar: exactly one buffer */
-+    if (!V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
-+        const size_t size = fmt->fmt.pix.sizeimage;
-+
-+        be->dh[0] = dmabuf_realloc(dbsc, be->dh[0], size);
-+        return be->dh[0] ? 0 : -1;
-+    }
-+
-+    for (plane = 0; plane != fmt->fmt.pix_mp.num_planes; ++plane) {
-+        be->dh[plane] = dmabuf_realloc(dbsc, be->dh[plane],
-+            fmt->fmt.pix_mp.plane_fmt[plane].sizeimage);
-+        if (be->dh[plane])
-+            continue;
-+
-+        /* Allocation failed: release every lower plane, then give up */
-+        while (plane-- != 0) {
-+            dmabuf_free(be->dh[plane]);
-+            be->dh[plane] = NULL;
-+        }
-+        return -1;
-+    }
-+    return 0;
-+}
-+
-+/*
-+ * Ask the driver (VIDIOC_S_FMT) for a format of the given type, pixel
-+ * format and size, and store the result in *fmt.
-+ *
-+ * bufsize, when non-zero on a multi-planar type, requests one plane of that
-+ * size; on a single-planar type it is always written to sizeimage.
-+ *
-+ * Returns MEDIABUFS_STATUS_SUCCESS if the driver accepted the pixel format
-+ * with at least the requested width and height,
-+ * MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE if it returned something smaller or
-+ * a different pixel format, MEDIABUFS_ERROR_OPERATION_FAILED if the ioctl
-+ * failed (EINTR is retried).
-+ */
-+static MediaBufsStatus fmt_set(struct v4l2_format *const fmt, const int fd,
-+            const enum v4l2_buf_type buftype,
-+            uint32_t pixfmt,
-+            const unsigned int width, const unsigned int height,
-+                               const size_t bufsize)
-+{
-+    const bool mplane = V4L2_TYPE_IS_MULTIPLANAR(buftype);
-+    uint32_t got_width;
-+    uint32_t got_height;
-+    uint32_t got_pixfmt;
-+
-+    *fmt = (struct v4l2_format){.type = buftype};
-+
-+    if (!mplane) {
-+        fmt->fmt.pix.width = width;
-+        fmt->fmt.pix.height = height;
-+        fmt->fmt.pix.pixelformat = pixfmt;
-+        fmt->fmt.pix.sizeimage = bufsize;
-+    }
-+    else {
-+        fmt->fmt.pix_mp.width = width;
-+        fmt->fmt.pix_mp.height = height;
-+        fmt->fmt.pix_mp.pixelformat = pixfmt;
-+        if (bufsize) {
-+            fmt->fmt.pix_mp.num_planes = 1;
-+            fmt->fmt.pix_mp.plane_fmt[0].sizeimage = bufsize;
-+        }
-+    }
-+
-+    /* Retry on EINTR, fail on anything else */
-+    while (ioctl(fd, VIDIOC_S_FMT, fmt) != 0) {
-+        if (errno != EINTR)
-+            return MEDIABUFS_ERROR_OPERATION_FAILED;
-+    }
-+
-+    if (mplane) {
-+        got_width = fmt->fmt.pix_mp.width;
-+        got_height = fmt->fmt.pix_mp.height;
-+        got_pixfmt = fmt->fmt.pix_mp.pixelformat;
-+    }
-+    else {
-+        got_width = fmt->fmt.pix.width;
-+        got_height = fmt->fmt.pix.height;
-+        got_pixfmt = fmt->fmt.pix.pixelformat;
-+    }
-+
-+    // Treat anything where we don't get at least what we asked for as a fail
-+    if (got_width < width || got_height < height || got_pixfmt != pixfmt)
-+        return MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE;
-+
-+    return MEDIABUFS_STATUS_SUCCESS;
-+}
-+
-+/*
-+ * Walk the driver's format list (VIDIOC_ENUM_FMT) for type_v4l2 and set the
-+ * first format that
-+ *   - has every bit of flags_must and no bit of flags_not set,
-+ *   - is accepted by accept_fn(accept_v, &desc), and
-+ *   - fmt_set() manages to apply at the requested width/height.
-+ *
-+ * Returns MEDIABUFS_STATUS_SUCCESS with *fmt filled in, or
-+ * MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE once enumeration fails with any
-+ * error other than EINTR (which includes reaching the end of the list).
-+ */
-+static MediaBufsStatus find_fmt_flags(struct v4l2_format *const fmt,
-+                   const int fd,
-+                   const unsigned int type_v4l2,
-+                   const uint32_t flags_must,
-+                   const uint32_t flags_not,
-+                   const unsigned int width,
-+                   const unsigned int height,
-+                   mediabufs_dst_fmt_accept_fn *const accept_fn,
-+                   void *const accept_v)
-+{
-+    unsigned int idx = 0;
-+
-+    for (;;) {
-+        struct v4l2_fmtdesc desc = {
-+            .index = idx++,
-+            .type = type_v4l2
-+        };
-+        bool flags_ok;
-+
-+        while (ioctl(fd, VIDIOC_ENUM_FMT, &desc) != 0) {
-+            if (errno != EINTR)
-+                return MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE;
-+        }
-+
-+        flags_ok = (desc.flags & flags_must) == flags_must &&
-+                   (desc.flags & flags_not) == 0;
-+        /* accept_fn is only consulted for entries that pass the flag filter */
-+        if (!flags_ok || !accept_fn(accept_v, &desc))
-+            continue;
-+
-+        if (fmt_set(fmt, fd, desc.type, desc.pixelformat,
-+                width, height, 0) == MEDIABUFS_STATUS_SUCCESS)
-+            return MEDIABUFS_STATUS_SUCCESS;
-+    }
-+}
-+
-+
-+/* Wait for qent done */
-+
-+/*
-+ * Block until the destination entry is no longer flagged as waiting (or the
-+ * condition wait itself reports an error), then translate its status:
-+ *   QENT_DONE  -> MEDIABUFS_STATUS_SUCCESS
-+ *   QENT_ERROR -> MEDIABUFS_ERROR_DECODING_ERROR
-+ *   otherwise  -> MEDIABUFS_ERROR_OPERATION_FAILED
-+ */
-+MediaBufsStatus qent_dst_wait(struct qent_dst *const be_dst)
-+{
-+    enum qent_status final_status;
-+
-+    pthread_mutex_lock(&be_dst->lock);
-+    while (be_dst->waiting) {
-+        if (pthread_cond_wait(&be_dst->cond, &be_dst->lock) != 0)
-+            break;
-+    }
-+    final_status = be_dst->base.status;
-+    pthread_mutex_unlock(&be_dst->lock);
-+
-+    if (final_status == QENT_DONE)
-+        return MEDIABUFS_STATUS_SUCCESS;
-+    if (final_status == QENT_ERROR)
-+        return MEDIABUFS_ERROR_DECODING_ERROR;
-+    return MEDIABUFS_ERROR_OPERATION_FAILED;
-+}
-+
-+/*
-+ * Map one buffer of a destination entry and return a pointer to its data.
-+ *
-+ * Returns NULL if buf_no is outside the entry's handle array; otherwise the
-+ * result of dmabuf_map() on that handle.
-+ */
-+const uint8_t * qent_dst_data(struct qent_dst *const be_dst, unsigned int buf_no)
-+{
-+    struct qent_base *const be = &be_dst->base;
-+
-+    /* Same range check as qent_dst_dmabuf(): never index past dh[] */
-+    if (buf_no >= sizeof(be->dh)/sizeof(be->dh[0]))
-+        return NULL;
-+
-+    return dmabuf_map(be->dh[buf_no]);
-+}
-+
-+/*
-+ * Call dmabuf_read_start() on every populated plane of a destination entry
-+ * (planes are scanned from 0 until the first empty slot).
-+ *
-+ * If any plane fails, the planes already started are ended again and
-+ * MEDIABUFS_ERROR_ALLOCATION_FAILED is returned.
-+ */
-+MediaBufsStatus qent_dst_read_start(struct qent_dst *const be_dst)
-+{
-+    struct qent_base *const base = &be_dst->base;
-+    unsigned int plane = 0;
-+
-+    while (plane != VIDEO_MAX_PLANES && base->dh[plane]) {
-+        if (dmabuf_read_start(base->dh[plane])) {
-+            /* Roll back the planes that did start */
-+            while (plane-- != 0)
-+                dmabuf_read_end(base->dh[plane]);
-+            return MEDIABUFS_ERROR_ALLOCATION_FAILED;
-+        }
-+        ++plane;
-+    }
-+    return MEDIABUFS_STATUS_SUCCESS;
-+}
-+
-+/*
-+ * Call dmabuf_read_end() on every populated plane of a destination entry.
-+ * All planes are processed even if one fails; the result is
-+ * MEDIABUFS_ERROR_OPERATION_FAILED if any of them reported an error.
-+ */
-+MediaBufsStatus qent_dst_read_stop(struct qent_dst *const be_dst)
-+{
-+    struct qent_base *const base = &be_dst->base;
-+    MediaBufsStatus result = MEDIABUFS_STATUS_SUCCESS;
-+    unsigned int plane = 0;
-+
-+    while (plane != VIDEO_MAX_PLANES && base->dh[plane]) {
-+        if (dmabuf_read_end(base->dh[plane]))
-+            result = MEDIABUFS_ERROR_OPERATION_FAILED;
-+        ++plane;
-+    }
-+    return result;
-+}
-+
-+/*
-+ * Take an additional reference on a destination entry.
-+ * NULL is accepted and passed straight through.  Returns its argument.
-+ */
-+struct qent_dst * qent_dst_ref(struct qent_dst * const be_dst)
-+{
-+    if (be_dst == NULL)
-+        return NULL;
-+
-+    atomic_fetch_add(&be_dst->base.ref_count, 1);
-+    return be_dst;
-+}
-+
-+/*
-+ * Drop one reference on *pbe_dst and clear the caller's pointer.
-+ *
-+ * The count is zero-based: the entry is released when the value before the
-+ * decrement was 0.  On release it goes back to its controller's free queue
-+ * if the controller can still be reached through the weak link, and is
-+ * freed outright otherwise.
-+ */
-+void qent_dst_unref(struct qent_dst ** const pbe_dst)
-+{
-+    struct qent_dst * const entry = *pbe_dst;
-+    struct mediabufs_ctl * owner;
-+
-+    if (entry == NULL)
-+        return;
-+    *pbe_dst = NULL;
-+
-+    /* Not the last reference - nothing more to do */
-+    if (atomic_fetch_sub(&entry->base.ref_count, 1) != 0)
-+        return;
-+
-+    owner = ff_weak_link_lock(&entry->mbc_wl);
-+    if (owner == NULL) {
-+        qe_dst_free(entry);
-+        return;
-+    }
-+
-+    queue_put_free(owner->dst, &entry->base);
-+    ff_weak_link_unlock(entry->mbc_wl);
-+}
-+
-+/*
-+ * Attach an externally supplied dmabuf fd to one plane of a destination
-+ * entry that was created for import (status QENT_IMPORT).
-+ *
-+ * Returns MEDIABUFS_STATUS_SUCCESS on success,
-+ * MEDIABUFS_ERROR_OPERATION_FAILED if plane is out of range, the entry is
-+ * not an import entry or the plane is already populated, and
-+ * MEDIABUFS_ERROR_ALLOCATION_FAILED if dmabuf_import() fails.
-+ */
-+MediaBufsStatus qent_dst_import_fd(struct qent_dst *const be_dst,
-+                unsigned int plane,
-+                int fd, size_t size)
-+{
-+    struct qent_base *const be = &be_dst->base;
-+    struct dmabuf_h * dh;
-+
-+    /* Reject a bad plane index before it is used to index dh[] */
-+    if (plane >= sizeof(be->dh)/sizeof(be->dh[0]))
-+        return MEDIABUFS_ERROR_OPERATION_FAILED;
-+
-+    if (be->status != QENT_IMPORT || be->dh[plane])
-+        return MEDIABUFS_ERROR_OPERATION_FAILED;
-+
-+    dh = dmabuf_import(fd, size);
-+    if (!dh)
-+        return MEDIABUFS_ERROR_ALLOCATION_FAILED;
-+
-+    be->dh[plane] = dh;
-+    return MEDIABUFS_STATUS_SUCCESS;
-+}
-+
-+/*
-+ * Create up to n V4L2 destination buffer slots (VIDIOC_CREATE_BUFS, DMABUF
-+ * memory, current dst format) and record the slot index of each created
-+ * buffer in the matching qes[] entry.
-+ *
-+ * mbc  buffer controller (supplies the fd and the dst format)
-+ * n    number of slots wanted; qes[] must hold at least n entries
-+ * qes  entries that receive the slot indices
-+ *
-+ * Returns the number of buffers the driver created (may be less than n),
-+ * or a negative errno value on failure.  EINTR is retried.
-+ */
-+static int create_dst_bufs(struct mediabufs_ctl *const mbc, unsigned int n, struct qent_dst * const qes[])
-+{
-+    unsigned int i;
-+
-+    struct v4l2_create_buffers cbuf = {
-+        .count = n,
-+        .memory = V4L2_MEMORY_DMABUF,
-+        .format = mbc->dst_fmt,
-+    };
-+
-+    while (ioctl(mbc->vfd, VIDIOC_CREATE_BUFS, &cbuf)) {
-+        /* Keep errno positive for the EINTR test; negate only on return */
-+        const int err = errno;
-+        if (err != EINTR) {
-+            request_err(mbc->dc, "%s: Failed to create V4L2 buffer\n", __func__);
-+            return -err;
-+        }
-+    }
-+
-+    if (cbuf.count != n)
-+        request_warn(mbc->dc, "%s: Created %d of %d V4L2 buffers requested\n", __func__, cbuf.count, n);
-+
-+    /* Never write past the n entries the caller supplied */
-+    for (i = 0; i != cbuf.count && i != n; ++i)
-+        qes[i]->base.index = cbuf.index + i;
-+
-+    return cbuf.count;
-+}
-+
-+/*
-+ * Obtain a destination entry and allocate its buffers to match the current
-+ * dst format.
-+ *
-+ * mbc   buffer controller, or NULL to get a bare entry intended for
-+ *       qent_dst_import_fd() (status QENT_IMPORT, no buffers allocated)
-+ * dbsc  dmabuf allocator used for the entry's buffers
-+ *
-+ * Returns the entry with status QENT_PENDING and ref_count reset to 0, or
-+ * NULL on failure.
-+ */
-+struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc, struct dmabufs_ctl *const dbsc)
-+{
-+    struct qent_dst * be_dst;
-+
-+    /* No controller: hand out an unattached import-only entry */
-+    if (mbc == NULL) {
-+        be_dst = qe_dst_new(NULL);
-+        if (be_dst)
-+            be_dst->base.status = QENT_IMPORT;
-+        return be_dst;
-+    }
-+
-+    if (mbc->dst_fixed) {
-+        /* Fixed pool: only entries from the existing free Q may be used */
-+        be_dst = base_to_dst(queue_get_free(mbc->dst));
-+        if (!be_dst)
-+            return NULL;
-+    }
-+    else {
-+        /* Growable pool: reuse a free entry if there is one, else make a new slot */
-+        be_dst = base_to_dst(queue_tryget_free(mbc->dst));
-+        if (!be_dst) {
-+            be_dst = qe_dst_new(mbc->this_wlm);
-+            if (!be_dst)
-+                return NULL;
-+
-+            if (create_dst_bufs(mbc, 1, &be_dst) != 1) {
-+                qe_dst_free(be_dst);
-+                return NULL;
-+            }
-+        }
-+    }
-+
-+    if (qe_alloc_from_fmt(&be_dst->base, dbsc, &mbc->dst_fmt)) {
-+        /* Given how create buf works we can't uncreate it on alloc failure
-+         * all we can do is put it on the free Q
-+        */
-+        queue_put_free(mbc->dst, &be_dst->base);
-+        return NULL;
-+    }
-+
-+    be_dst->base.status = QENT_PENDING;
-+    /* ref_count is zero-based: 0 here means "one owner" (see qent_dst_unref) */
-+    atomic_store(&be_dst->base.ref_count, 0);
-+    return be_dst;
-+}
-+
-+/* Read-only access to the controller's currently stored destination format. */
-+const struct v4l2_format *mediabufs_dst_fmt(struct mediabufs_ctl *const mbc)
-+{
-+    const struct v4l2_format *const dst_fmt = &mbc->dst_fmt;
-+
-+    return dst_fmt;
-+}
-+
-+/*
-+ * Negotiate the destination format.
-+ *
-+ * Formats without V4L2_FMT_FLAG_EMULATED are tried first, then formats
-+ * with it.  Within each pass the first format that accept_fn approves and
-+ * the driver can set at width x height wins (see find_fmt_flags()).
-+ *
-+ * Returns the status of the first pass that yields anything other than
-+ * MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE, or that error if every pass does.
-+ */
-+MediaBufsStatus mediabufs_dst_fmt_set(struct mediabufs_ctl *const mbc,
-+               const unsigned int width,
-+               const unsigned int height,
-+               mediabufs_dst_fmt_accept_fn *const accept_fn,
-+               void *const accept_v)
-+{
-+    static const struct {
-+        unsigned int flags_must;
-+        unsigned int flags_not;
-+    } trys[] = {
-+        {0, V4L2_FMT_FLAG_EMULATED},
-+        {V4L2_FMT_FLAG_EMULATED, 0},
-+    };
-+    const unsigned int n_trys = sizeof(trys)/sizeof(trys[0]);
-+    /* Latch the buffer type now: dst_fmt is overwritten during negotiation */
-+    const enum v4l2_buf_type buf_type = mbc->dst_fmt.type;
-+    MediaBufsStatus status = MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE;
-+    unsigned int pass;
-+
-+    for (pass = 0;
-+         pass != n_trys && status == MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE;
-+         ++pass) {
-+        status = find_fmt_flags(&mbc->dst_fmt, mbc->vfd,
-+                                buf_type,
-+                                trys[pass].flags_must,
-+                                trys[pass].flags_not,
-+                                width, height, accept_fn, accept_v);
-+    }
-+
-+    return status;
-+}
-+
-+/*
-+ * Create n destination slots (queue entries plus V4L2 buffer slots) without
-+ * allocating any buffer memory, and place them on the dst free queue.
-+ *
-+ * mbc    buffer controller
-+ * n      number of slots, at most 32
-+ * fixed  stored in mbc->dst_fixed on success
-+ *
-+ * Returns MEDIABUFS_STATUS_SUCCESS only if all n slots were created,
-+ * MEDIABUFS_ERROR_ALLOCATION_FAILED otherwise.
-+ *
-+ * ** This is a mess if we get partial alloc but without any way to remove
-+ *    individual V4L2 Q members we are somewhat stuffed: on a partial result
-+ *    the entries that did get a slot stay on the free queue and only the
-+ *    remainder is freed.
-+ */
-+MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed)
-+{
-+    unsigned int i;
-+    int rv;
-+    unsigned int created = 0;   /* entries handed to the free queue so far */
-+    unsigned int qc;
-+    struct qent_dst * qes[32];
-+
-+    if (n > 32)
-+        return MEDIABUFS_ERROR_ALLOCATION_FAILED;
-+
-+    // Create qents first as it is hard to get rid of the V4L2 buffers on error
-+    for (qc = 0; qc != n; ++qc)
-+    {
-+        if ((qes[qc] = qe_dst_new(mbc->this_wlm)) == NULL)
-+            goto fail;
-+    }
-+
-+    rv = create_dst_bufs(mbc, n, qes);
-+    /* Reject errors and any count beyond what was asked for: a larger value
-+     * would index past qes[] below and the cleanup loop would never stop */
-+    if (rv < 0 || (unsigned int)rv > n)
-+        goto fail;
-+    created = (unsigned int)rv;
-+
-+    for (i = 0; i != created; ++i)
-+        queue_put_free(mbc->dst, &qes[i]->base);
-+
-+    if (created != n)
-+        goto fail;
-+
-+    mbc->dst_fixed = fixed;
-+    return MEDIABUFS_STATUS_SUCCESS;
-+
-+fail:
-+    /* Free only the entries that were never put on the free queue */
-+    for (i = created; i != qc; ++i)
-+        qe_dst_free(qes[i]);
-+
-+    return MEDIABUFS_ERROR_ALLOCATION_FAILED;
-+}
-+
-+/*
-+ * Take a source entry from the src free queue and mark it QENT_PENDING.
-+ * Returns NULL if queue_get_free() does not deliver an entry.
-+ */
-+struct qent_src *mediabufs_src_qent_get(struct mediabufs_ctl *const mbc)
-+{
-+    struct qent_base *const buf = queue_get_free(mbc->src);
-+
-+    /* The dst path checks this result for NULL; do the same before writing to it */
-+    if (!buf)
-+        return NULL;
-+
-+    buf->status = QENT_PENDING;
-+    return base_to_src(buf);
-+}
-+
-+/*
-+ * Give an unused source entry back to the src free queue and clear the
-+ * caller's pointer.  A NULL *pqe_src is a no-op.
-+ */
-+void mediabufs_src_qent_abort(struct mediabufs_ctl *const mbc, struct qent_src **const pqe_src)
-+{
-+    struct qent_src *const entry = *pqe_src;
-+
-+    if (entry != NULL) {
-+        *pqe_src = NULL;
-+        queue_put_free(mbc->src, &entry->base);
-+    }
-+}
-+
-+/*
-+ * Create the pool of source buffers.
-+ * src format must have been set up before this
-+ *
-+ * Any entries currently on the src free queue are released, n V4L2 buffer
-+ * slots are requested (VIDIOC_REQBUFS, DMABUF memory), and one queue entry
-+ * with buffers sized from the src format is created per granted slot.
-+ *
-+ * mbc   buffer controller
-+ * dbsc  dmabuf allocator for the entries' buffers
-+ * n     number of buffers wanted; fewer may be created if the driver grants
-+ *       fewer (this is logged, not treated as an error)
-+ *
-+ * Returns MEDIABUFS_STATUS_SUCCESS or MEDIABUFS_ERROR_OPERATION_FAILED.
-+ */
-+MediaBufsStatus mediabufs_src_pool_create(struct mediabufs_ctl *const mbc,
-+                  struct dmabufs_ctl * const dbsc,
-+                  unsigned int n)
-+{
-+    unsigned int i;
-+    struct v4l2_requestbuffers req = {
-+        .count = n,
-+        .type = mbc->src_fmt.type,
-+        .memory = V4L2_MEMORY_DMABUF
-+    };
-+
-+    bq_free_all_free_src(mbc->src);
-+    /* Retry on EINTR, fail on anything else */
-+    while (ioctl(mbc->vfd, VIDIOC_REQBUFS, &req) == -1) {
-+        if (errno != EINTR) {
-+            request_err(mbc->dc, "%s: Failed to request src bufs\n", __func__);
-+            return MEDIABUFS_ERROR_OPERATION_FAILED;
-+        }
-+    }
-+
-+    /* The driver may grant fewer slots than requested */
-+    if (n > req.count) {
-+        request_info(mbc->dc, "Only allocated %d of %d src buffers requested\n", req.count, n);
-+        n = req.count;
-+    }
-+
-+    for (i = 0; i != n; ++i) {
-+        struct qent_src *const be_src = qe_src_new();
-+        if (!be_src) {
-+            request_err(mbc->dc, "Failed to create src be %d\n", i);
-+            goto fail;
-+        }
-+        if (qe_alloc_from_fmt(&be_src->base, dbsc, &mbc->src_fmt)) {
-+            qe_src_free(be_src);
-+            goto fail;
-+        }
-+        /* Entry i is bound to V4L2 slot i */
-+        be_src->base.index = i;
-+        be_src->fixed_size = !mediabufs_src_resizable(mbc);
-+
-+        queue_put_free(mbc->src, &be_src->base);
-+    }
-+
-+    return MEDIABUFS_STATUS_SUCCESS;
-+
-+fail:
-+    /* Drop the entries queued so far and ask the driver for zero slots */
-+    bq_free_all_free_src(mbc->src);
-+    req.count = 0;
-+    while (ioctl(mbc->vfd, VIDIOC_REQBUFS, &req) == -1 &&
-+           errno == EINTR)
-+        /* Loop */;
-+
-+    return MEDIABUFS_ERROR_OPERATION_FAILED;
-+}
-+
-+
-+
-+/*
-+ * Set stuff order:
-+ *  Set src fmt
-+ *  Set parameters (sps) on vfd
-+ *  Negotiate dst format (dst_fmt_set)
-+ *  Create src buffers
-+ *  Alloc a dst buffer or Create dst slots
-+*/
-+/*
-+ * Turn streaming on for the src queue and then the dst queue.
-+ * Does nothing if streaming is already on.  If the dst side fails, the src
-+ * side is switched off again before returning
-+ * MEDIABUFS_ERROR_OPERATION_FAILED.
-+ */
-+MediaBufsStatus mediabufs_stream_on(struct mediabufs_ctl *const mbc)
-+{
-+    if (!mbc->stream_on) {
-+        if (set_stream(mbc->vfd, mbc->src_fmt.type, true) < 0) {
-+            request_log("Failed to set stream on src type %d\n", mbc->src_fmt.type);
-+            return MEDIABUFS_ERROR_OPERATION_FAILED;
-+        }
-+
-+        if (set_stream(mbc->vfd, mbc->dst_fmt.type, true) < 0) {
-+            request_log("Failed to set stream on dst type %d\n", mbc->dst_fmt.type);
-+            /* Undo the half that did succeed */
-+            set_stream(mbc->vfd, mbc->src_fmt.type, false);
-+            return MEDIABUFS_ERROR_OPERATION_FAILED;
-+        }
-+
-+        mbc->stream_on = true;
-+    }
-+
-+    return MEDIABUFS_STATUS_SUCCESS;
-+}
-+
-+/*
-+ * Turn streaming off for the dst queue and then the src queue.
-+ * Does nothing if streaming is not on.  Both queues are always attempted
-+ * and the stream_on flag is always cleared; the result is
-+ * MEDIABUFS_ERROR_OPERATION_FAILED if either attempt failed.
-+ */
-+MediaBufsStatus mediabufs_stream_off(struct mediabufs_ctl *const mbc)
-+{
-+    MediaBufsStatus result = MEDIABUFS_STATUS_SUCCESS;
-+
-+    if (mbc->stream_on) {
-+        if (set_stream(mbc->vfd, mbc->dst_fmt.type, false) < 0) {
-+            request_log("Failed to set stream off dst type %d\n", mbc->dst_fmt.type);
-+            result = MEDIABUFS_ERROR_OPERATION_FAILED;
-+        }
-+
-+        if (set_stream(mbc->vfd, mbc->src_fmt.type, false) < 0) {
-+            request_log("Failed to set stream off src type %d\n", mbc->src_fmt.type);
-+            result = MEDIABUFS_ERROR_OPERATION_FAILED;
-+        }
-+
-+        mbc->stream_on = false;
-+    }
-+
-+    return result;
-+}
-+
-+/*
-+ * Apply an array of n extended controls with VIDIOC_S_EXT_CTRLS.
-+ * When mreq is non-NULL the controls are attached to that media request
-+ * (V4L2_CTRL_WHICH_REQUEST_VAL with the request's fd).
-+ *
-+ * Returns 0 on success or a negative errno value.  EINTR is retried.
-+ */
-+int mediabufs_ctl_set_ext_ctrls(struct mediabufs_ctl * mbc, struct media_request * const mreq, struct v4l2_ext_control control_array[], unsigned int n)
-+{
-+    struct v4l2_ext_controls ctrls = {
-+        .controls = control_array,
-+        .count = n
-+    };
-+
-+    if (mreq) {
-+        ctrls.which = V4L2_CTRL_WHICH_REQUEST_VAL;
-+        ctrls.request_fd = media_request_fd(mreq);
-+    }
-+
-+    for (;;) {
-+        int err;
-+
-+        if (ioctl(mbc->vfd, VIDIOC_S_EXT_CTRLS, &ctrls) == 0)
-+            return 0;
-+
-+        err = errno;
-+        if (err == EINTR)
-+            continue;
-+
-+        request_err(mbc->dc, "Unable to set controls: %s\n", strerror(err));
-+        return -err;
-+    }
-+}
-+
-+/*
-+ * Set a single pointer-type extended control (id, data, size), optionally
-+ * attached to a media request.  Convenience wrapper around
-+ * mediabufs_ctl_set_ext_ctrls() that maps its result to a MediaBufsStatus.
-+ */
-+MediaBufsStatus mediabufs_set_ext_ctrl(struct mediabufs_ctl *const mbc,
-+                struct media_request * const mreq,
-+                unsigned int id, void *data,
-+                unsigned int size)
-+{
-+    struct v4l2_ext_control one = {
-+        .id = id,
-+        .ptr = data,
-+        .size = size
-+    };
-+
-+    if (mediabufs_ctl_set_ext_ctrls(mbc, mreq, &one, 1) != 0)
-+        return MEDIABUFS_ERROR_OPERATION_FAILED;
-+
-+    return MEDIABUFS_STATUS_SUCCESS;
-+}
-+
-+/*
-+ * Set the source format on the device and remember it in mbc->src_fmt.
-+ * Logs an error on failure and returns the fmt_set() status either way.
-+ */
-+MediaBufsStatus mediabufs_src_fmt_set(struct mediabufs_ctl *const mbc,
-+                                      enum v4l2_buf_type buf_type,
-+                   const uint32_t pixfmt,
-+                   const uint32_t width, const uint32_t height,
-+                                      const size_t bufsize)
-+{
-+    const MediaBufsStatus status =
-+        fmt_set(&mbc->src_fmt, mbc->vfd, buf_type, pixfmt, width, height, bufsize);
-+
-+    if (status == MEDIABUFS_STATUS_SUCCESS)
-+        return status;
-+
-+    request_err(mbc->dc, "Failed to set src buftype %d, format %#x %dx%d\n", buf_type, pixfmt, width, height);
-+    return status;
-+}
-+
-+/*
-+ * Query n extended controls in place (VIDIOC_QUERY_EXT_CTRL on each array
-+ * element in turn).
-+ *
-+ * A control that cannot be queried gets its type set to 0 and the scan
-+ * continues with the next element.  Returns 0 if every query succeeded,
-+ * otherwise the negative errno of the last failure.  EINTR is retried.
-+ */
-+int mediabufs_ctl_query_ext_ctrls(struct mediabufs_ctl * mbc, struct v4l2_query_ext_ctrl ctrls[], unsigned int n)
-+{
-+    int rv = 0;
-+    unsigned int i;
-+
-+    for (i = 0; i != n; ++i) {
-+        struct v4l2_query_ext_ctrl *const qc = ctrls + i;
-+
-+        for (;;) {
-+            int err;
-+
-+            if (ioctl(mbc->vfd, VIDIOC_QUERY_EXT_CTRL, qc) == 0)
-+                break;
-+
-+            err = errno;
-+            if (err == EINTR)
-+                continue;
-+
-+            // Often used for probing - errors are to be expected
-+            request_debug(mbc->dc, "Failed to query ext id=%#x, err=%d\n", qc->id, err);
-+            qc->type = 0; // 0 is invalid
-+            rv = -err;
-+            break;
-+        }
-+    }
-+    return rv;
-+}
-+
-+/*
-+ * Report whether source buffers may be larger than the negotiated size.
-+ * Single planar OUTPUT can only take exact size buffers, multiplanar will
-+ * take larger than negotiated - so this is simply "is the src type
-+ * multi-planar".
-+ */
-+int mediabufs_src_resizable(const struct mediabufs_ctl *const mbc)
-+{
-+    const int is_mplane = V4L2_TYPE_IS_MULTIPLANAR(mbc->src_fmt.type);
-+
-+    return is_mplane;
-+}
-+
-+/*
-+ * Tear down a buffer controller and free it.  NULL is a no-op.
-+ * Called from mediabufs_ctl_unref() when the last reference goes away.
-+ */
-+static void mediabufs_ctl_delete(struct mediabufs_ctl *const mbc)
-+{
-+    if (!mbc)
-+        return;
-+
-+    // Break the weak link first
-+    // (after this qent_dst_unref() frees entries instead of re-queueing them here)
-+    ff_weak_link_break(&mbc->this_wlm);
-+
-+    polltask_delete(&mbc->pt);
-+
-+    mediabufs_stream_off(mbc);
-+
-+    // Empty v4l2 buffer stash
-+    // NOTE(review): buffers are created with V4L2_MEMORY_DMABUF elsewhere in
-+    // this file but released here as V4L2_MEMORY_MMAP - confirm this is intended
-+    request_buffers(mbc->vfd, mbc->src_fmt.type, V4L2_MEMORY_MMAP, 0);
-+    request_buffers(mbc->vfd, mbc->dst_fmt.type, V4L2_MEMORY_MMAP, 0);
-+
-+    bq_free_all_free_src(mbc->src);
-+    bq_free_all_inuse_src(mbc->src);
-+    bq_free_all_free_dst(mbc->dst);
-+
-+    {
-+        // Dst entries still in use are not freed here: each is marked as
-+        // failed and completed via qe_dst_done()
-+        struct qent_dst *dst_be;
-+        while ((dst_be = base_to_dst(bq_get_inuse(mbc->dst))) != NULL) {
-+            dst_be->base.timestamp = (struct timeval){0};
-+            dst_be->base.status = QENT_ERROR;
-+            qe_dst_done(dst_be);
-+        }
-+    }
-+
-+    queue_delete(mbc->dst);
-+    queue_delete(mbc->src);
-+    close(mbc->vfd);
-+    pthread_mutex_destroy(&mbc->lock);
-+
-+    free(mbc);
-+}
-+
-+/*
-+ * Take an additional reference on a buffer controller; returns its
-+ * argument.  Unlike qent_dst_ref() this does not accept NULL.
-+ */
-+struct mediabufs_ctl * mediabufs_ctl_ref(struct mediabufs_ctl *const mbc)
-+{
-+    (void)atomic_fetch_add(&mbc->ref_count, 1);
-+
-+    return mbc;
-+}
-+
-+/*
-+ * Drop one reference on *pmbc and clear the caller's pointer.
-+ * The count is zero-based: the controller is deleted when the value before
-+ * the decrement was 0.  A NULL *pmbc is a no-op.
-+ */
-+void mediabufs_ctl_unref(struct mediabufs_ctl **const pmbc)
-+{
-+    struct mediabufs_ctl *const ctl = *pmbc;
-+
-+    if (ctl == NULL)
-+        return;
-+    *pmbc = NULL;
-+
-+    if (atomic_fetch_sub(&ctl->ref_count, 1) == 0)
-+        mediabufs_ctl_delete(ctl);
-+}
-+
-+/*
-+ * Driver version as reported by VIDIOC_QUERYCAP when the controller was
-+ * set up (the capability struct is filled in by set_capabilities()).
-+ */
-+unsigned int mediabufs_ctl_driver_version(struct mediabufs_ctl *const mbc)
-+{
-+    const unsigned int version = mbc->capability.version;
-+
-+    return version;
-+}
-+
-+/*
-+ * Query the device capabilities and pick the src/dst buffer types from
-+ * them: the multi-planar M2M pair if available, else the single-planar M2M
-+ * pair.
-+ *
-+ * Returns 0 on success, a negative errno if VIDIOC_QUERYCAP fails, or
-+ * -EINVAL if the device offers no M2M capability.
-+ */
-+static int set_capabilities(struct mediabufs_ctl *const mbc)
-+{
-+    uint32_t caps;
-+
-+    if (ioctl(mbc->vfd, VIDIOC_QUERYCAP, &mbc->capability) != 0) {
-+        const int err = errno;
-+        request_err(mbc->dc, "Failed to get capabilities: %s\n", strerror(err));
-+        return -err;
-+    }
-+
-+    /* Prefer the per-device-node caps when the driver says they are valid */
-+    if ((mbc->capability.capabilities & V4L2_CAP_DEVICE_CAPS) != 0)
-+        caps = mbc->capability.device_caps;
-+    else
-+        caps = mbc->capability.capabilities;
-+
-+    if ((caps & V4L2_CAP_VIDEO_M2M_MPLANE) != 0) {
-+        mbc->src_fmt.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE;
-+        mbc->dst_fmt.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE;
-+        return 0;
-+    }
-+
-+    if ((caps & V4L2_CAP_VIDEO_M2M) != 0) {
-+        mbc->src_fmt.type = V4L2_BUF_TYPE_VIDEO_OUTPUT;
-+        mbc->dst_fmt.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
-+        return 0;
-+    }
-+
-+    request_err(mbc->dc, "No M2M capabilities (%#x)\n", caps);
-+    return -EINVAL;
-+}
-+
-+/*
-+ * Create a buffer controller for a V4L2 M2M device.
-+ * One of these per context.
-+ *
-+ * dc     opaque context pointer passed to the request_* logging calls
-+ * vpath  device node to open; NULL selects the built-in default
-+ * pq     poll queue on which the controller's poll task is created
-+ *
-+ * Returns the new controller, or NULL on failure with everything acquired
-+ * so far released again.
-+ */
-+struct mediabufs_ctl * mediabufs_ctl_new(void * const dc, const char * vpath, struct pollqueue *const pq)
-+{
-+    struct mediabufs_ctl *const mbc = calloc(1, sizeof(*mbc));
-+
-+    if (!mbc)
-+        return NULL;
-+
-+    mbc->dc = dc;
-+    mbc->pq = pq;
-+    pthread_mutex_init(&mbc->lock, NULL);
-+
-+    /* Pick a default  - could we scan for this? */
-+    if (vpath == NULL)
-+        vpath = "/dev/media0";
-+
-+    while ((mbc->vfd = open(vpath, O_RDWR)) == -1)
-+    {
-+        const int err = errno;
-+        if (err != EINTR) {
-+            request_err(dc, "Failed to open video dev '%s': %s\n", vpath, strerror(err));
-+            goto fail0;
-+        }
-+    }
-+
-+    /* Selects the src/dst buffer types from the device capabilities */
-+    if (set_capabilities(mbc)) {
-+        request_err(dc, "Bad capabilities for video dev '%s'\n", vpath);
-+        goto fail1;
-+    }
-+
-+    mbc->src = queue_new(mbc->vfd);
-+    if (!mbc->src)
-+        goto fail1;
-+    mbc->dst = queue_new(mbc->vfd);
-+    if (!mbc->dst)
-+        goto fail2;
-+    mbc->pt = polltask_new(pq, mbc->vfd, POLLIN | POLLOUT, mediabufs_poll_cb, mbc);
-+    if (!mbc->pt)
-+        goto fail3;
-+    mbc->this_wlm = ff_weak_link_new(mbc);
-+    if (!mbc->this_wlm)
-+        goto fail4;
-+
-+    /* Cannot add polltask now - polling with nothing pending
-+     * generates infinite error polls
-+    */
-+    return mbc;
-+
-+    /* Unwind in reverse order of acquisition */
-+fail4:
-+    polltask_delete(&mbc->pt);
-+fail3:
-+    queue_delete(mbc->dst);
-+fail2:
-+    queue_delete(mbc->src);
-+fail1:
-+    close(mbc->vfd);
-+fail0:
-+    /* The mutex was initialised before open(), so every failure path owns it */
-+    pthread_mutex_destroy(&mbc->lock);
-+    free(mbc);
-+    request_info(dc, "%s: FAILED\n", __func__);
-+    return NULL;
-+}
-+
-+
-+
---- /dev/null
-+++ b/libavcodec/v4l2_req_media.h
-@@ -0,0 +1,154 @@
-+/*
-+e.h
-+*
-+ * Permission is hereby granted, free of charge, to any person obtaining a
-+ * copy of this software and associated documentation files (the
-+ * "Software"), to deal in the Software without restriction, including
-+ * without limitation the rights to use, copy, modify, merge, publish,
-+ * distribute, sub license, and/or sell copies of the Software, and to
-+ * permit persons to whom the Software is furnished to do so, subject to
-+ * the following conditions:
-+ *
-+ * The above copyright notice and this permission notice (including the
-+ * next paragraph) shall be included in all copies or substantial portions
-+ * of the Software.
-+ *
-+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
-+ * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
-+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-+ */
-+
-+#ifndef _MEDIA_H_
-+#define _MEDIA_H_
-+
-+#include <stdbool.h>
-+#include <stdint.h>
-+
-+struct v4l2_format;
-+struct v4l2_fmtdesc;
-+struct v4l2_query_ext_ctrl;
-+
-+struct pollqueue;
-+struct media_request;
-+struct media_pool;
-+
-+/* Result codes returned by the mediabufs_* and qent_* functions. */
-+typedef enum media_buf_status {
-+    MEDIABUFS_STATUS_SUCCESS = 0,           /* operation completed */
-+    MEDIABUFS_ERROR_OPERATION_FAILED,       /* generic failure (ioctl error, bad state, ...) */
-+    MEDIABUFS_ERROR_DECODING_ERROR,         /* dst buffer completed with an error status */
-+    MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE, /* no acceptable format could be set on the device */
-+    MEDIABUFS_ERROR_UNSUPPORTED_RT_FORMAT,  /* NOTE(review): no user visible in this part of the code */
-+    MEDIABUFS_ERROR_ALLOCATION_FAILED,      /* buffer or slot allocation / import failed */
-+} MediaBufsStatus;
-+
-+struct media_pool * media_pool_new(const char * const media_path,
-+                   struct pollqueue * const pq,
-+                   const unsigned int n);
-+void media_pool_delete(struct media_pool ** pmp);
-+
-+// Obtain a media request
-+// Will block if none available - has a 2sec timeout
-+struct media_request * media_request_get(struct media_pool * const mp);
-+int media_request_fd(const struct media_request * const req);
-+
-+// Start this request
-+// Request structure is returned to pool once done
-+int media_request_start(struct media_request * const req);
-+
-+// Return an *unstarted* media_request to the pool
-+// May later be upgraded to allow for aborting a started req
-+int media_request_abort(struct media_request ** const preq);
-+
-+
-+struct mediabufs_ctl;
-+struct qent_src;
-+struct qent_dst;
-+struct dmabuf_h;
-+struct dmabufs_ctl;
-+
-+int qent_src_params_set(struct qent_src *const be, const struct timeval * timestamp);
-+struct timeval qent_dst_timestamp_get(const struct qent_dst *const be_dst);
-+
-+// prealloc
-+int qent_src_alloc(struct qent_src *const be_src, const size_t len, struct dmabufs_ctl * dbsc);
-+// dbsc may be NULL if realloc not required
-+int qent_src_data_copy(struct qent_src *const be_src, const size_t offset, const void *const src, const size_t len, struct dmabufs_ctl * dbsc);
-+const struct dmabuf_h * qent_dst_dmabuf(const struct qent_dst *const be, unsigned int plane);
-+int qent_dst_dup_fd(const struct qent_dst *const be, unsigned int plane);
-+MediaBufsStatus qent_dst_wait(struct qent_dst *const be);
-+void qent_dst_delete(struct qent_dst *const be);
-+// Returns a qent_dst to its mbc free Q or deletes it if the mbc is dead
-+void qent_dst_unref(struct qent_dst ** const pbe_dst);
-+struct qent_dst * qent_dst_ref(struct qent_dst * const be_dst);
-+
-+const uint8_t * qent_dst_data(struct qent_dst *const be, unsigned int buf_no);
-+MediaBufsStatus qent_dst_read_start(struct qent_dst *const be);
-+MediaBufsStatus qent_dst_read_stop(struct qent_dst *const be);
-+/* Import an fd unattached to any mediabuf */
-+MediaBufsStatus qent_dst_import_fd(struct qent_dst *const be_dst,
-+                unsigned int plane,
-+                int fd, size_t size);
-+
-+MediaBufsStatus mediabufs_start_request(struct mediabufs_ctl *const mbc,
-+                struct media_request **const pmreq,
-+                struct qent_src **const psrc_be,
-+                struct qent_dst *const dst_be,
-+                const bool is_final);
-+// Get / alloc a dst buffer & associate with a slot
-+// If the dst pool is empty then behaviour depends on the fixed flag passed to
-+// dst_slots_create.  Default is !fixed = unlimited alloc
-+struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc,
-+                           struct dmabufs_ctl *const dbsc);
-+// Create dst slots without alloc
-+// If fixed true then qent_alloc will only get slots from this pool and will
-+// block until a qent has been unrefed
-+MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed);
-+
-+MediaBufsStatus mediabufs_stream_on(struct mediabufs_ctl *const mbc);
-+MediaBufsStatus mediabufs_stream_off(struct mediabufs_ctl *const mbc);
-+const struct v4l2_format *mediabufs_dst_fmt(struct mediabufs_ctl *const mbc);
-+
-+typedef int mediabufs_dst_fmt_accept_fn(void * v, const struct v4l2_fmtdesc *fmtdesc);
-+
-+MediaBufsStatus mediabufs_dst_fmt_set(struct mediabufs_ctl *const mbc,
-+               const unsigned int width,
-+               const unsigned int height,
-+               mediabufs_dst_fmt_accept_fn *const accept_fn,
-+               void *const accept_v);
-+struct qent_src *mediabufs_src_qent_get(struct mediabufs_ctl *const mbc);
-+void mediabufs_src_qent_abort(struct mediabufs_ctl *const mbc, struct qent_src **const pqe_src);
-+
-+int mediabufs_ctl_set_ext_ctrls(struct mediabufs_ctl * mbc, struct media_request * const mreq,
-+                                struct v4l2_ext_control control_array[], unsigned int n);
-+MediaBufsStatus mediabufs_set_ext_ctrl(struct mediabufs_ctl *const mbc,
-+                struct media_request * const mreq,
-+                unsigned int id, void *data,
-+                unsigned int size);
-+int mediabufs_ctl_query_ext_ctrls(struct mediabufs_ctl * mbc, struct v4l2_query_ext_ctrl ctrls[], unsigned int n);
-+
-+int mediabufs_src_resizable(const struct mediabufs_ctl *const mbc);
-+
-+MediaBufsStatus mediabufs_src_fmt_set(struct mediabufs_ctl *const mbc,
-+                                      enum v4l2_buf_type buf_type,
-+                                      const uint32_t pixfmt,
-+                                      const uint32_t width, const uint32_t height,
-+                                      const size_t bufsize);
-+
-+MediaBufsStatus mediabufs_src_pool_create(struct mediabufs_ctl *const rw,
-+                  struct dmabufs_ctl * const dbsc,
-+                  unsigned int n);
-+
-+#define MEDIABUFS_DRIVER_VERSION(a, b, c) (((a) << 16) | ((b) << 8) | (c))
-+unsigned int mediabufs_ctl_driver_version(struct mediabufs_ctl *const mbc);
-+
-+struct mediabufs_ctl * mediabufs_ctl_new(void * const dc,
-+                     const char *vpath, struct pollqueue *const pq);
-+void mediabufs_ctl_unref(struct mediabufs_ctl **const pmbc);
-+struct mediabufs_ctl * mediabufs_ctl_ref(struct mediabufs_ctl *const mbc);
-+
-+
-+#endif
---- /dev/null
-+++ b/libavcodec/v4l2_req_pollqueue.c
-@@ -0,0 +1,361 @@
-+#include <errno.h>
-+#include <limits.h>
-+#include <poll.h>
-+#include <pthread.h>
-+#include <semaphore.h>
-+#include <stdatomic.h>
-+#include <stdbool.h>
-+#include <stdlib.h>
-+#include <stdint.h>
-+#include <stdio.h>
-+#include <string.h>
-+#include <unistd.h>
-+#include <sys/eventfd.h>
-+
-+#include "v4l2_req_pollqueue.h"
-+#include "v4l2_req_utils.h"
-+
-+
-+struct pollqueue;
-+
-+/*
-+ * Lifecycle state of a polltask.
-+ * NOTE(review): the per-state meanings below are inferred from the names;
-+ * the code that performs the transitions is not in this part of the file.
-+ */
-+enum polltask_state {
-+    POLLTASK_UNQUEUED = 0,  /* initial state: polltask_new() leaves .state zeroed */
-+    POLLTASK_QUEUED,        /* presumably: on the pollqueue list, waiting to be polled */
-+    POLLTASK_RUNNING,       /* presumably: callback currently executing */
-+    POLLTASK_Q_KILL,        /* presumably: delete requested while queued */
-+    POLLTASK_RUN_KILL,      /* presumably: delete requested while running */
-+};
-+
-+struct polltask {
-+    struct polltask *next;
-+    struct polltask *prev;
-+    struct pollqueue *q;
-+    enum polltask_state state;
-+
-+    int fd;
-+    short events;
-+
-+    void (*fn)(void *v, short revents);
-+    void * v;
-+
-+    uint64_t timeout; /* CLOCK_MONOTONIC time, 0 => never */
-+    sem_t kill_sem;
-+};
-+
-+struct pollqueue {
-+    atomic_int ref_count;
-+    pthread_mutex_t lock;
-+
-+    struct polltask *head;
-+    struct polltask *tail;
-+
-+    bool kill;
-+    bool no_prod;
-+    int prod_fd;
-+    struct polltask *prod_pt;
-+    pthread_t worker;
-+};
-+
-+struct polltask *polltask_new(struct pollqueue *const pq,
-+                              const int fd, const short events,
-+                  void (*const fn)(void *v, short revents),
-+                  void *const v)
-+{
-+    struct polltask *pt;
-+
-+    if (!events)
-+        return NULL;
-+
-+    pt = malloc(sizeof(*pt));
-+    if (!pt)
-+        return NULL;
-+
-+    *pt = (struct polltask){
-+        .next = NULL,
-+        .prev = NULL,
-+        .q = pollqueue_ref(pq),
-+        .fd = fd,
-+        .events = events,
-+        .fn = fn,
-+        .v = v
-+    };
-+
-+    sem_init(&pt->kill_sem, 0, 0);
-+
-+    return pt;
-+}
-+
-+static void pollqueue_rem_task(struct pollqueue *const pq, struct polltask *const pt)
-+{
-+    if (pt->prev)
-+        pt->prev->next = pt->next;
-+    else
-+        pq->head = pt->next;
-+    if (pt->next)
-+        pt->next->prev = pt->prev;
-+    else
-+        pq->tail = pt->prev;
-+    pt->next = NULL;
-+    pt->prev = NULL;
-+}
-+
-+static void polltask_free(struct polltask * const pt)
-+{
-+    sem_destroy(&pt->kill_sem);
-+    free(pt);
-+}
-+
-+static int pollqueue_prod(const struct pollqueue *const pq)
-+{
-+    static const uint64_t one = 1;
-+    return write(pq->prod_fd, &one, sizeof(one));
-+}
-+
-+void polltask_delete(struct polltask **const ppt)
-+{
-+    struct polltask *const pt = *ppt;
-+    struct pollqueue * pq;
-+    enum polltask_state state;
-+    bool prodme;
-+
-+    if (!pt)
-+        return;
-+
-+    pq = pt->q;
-+    pthread_mutex_lock(&pq->lock);
-+    state = pt->state;
-+    pt->state = (state == POLLTASK_RUNNING) ? POLLTASK_RUN_KILL : POLLTASK_Q_KILL;
-+    prodme = !pq->no_prod;
-+    pthread_mutex_unlock(&pq->lock);
-+
-+    if (state != POLLTASK_UNQUEUED) {
-+        if (prodme)
-+            pollqueue_prod(pq);
-+        while (sem_wait(&pt->kill_sem) && errno == EINTR)
-+            /* loop */;
-+    }
-+
-+    // Leave zapping the ref until we have DQed the PT as might well be
-+    // legitimately used in it
-+    *ppt = NULL;
-+    polltask_free(pt);
-+    pollqueue_unref(&pq);
-+}
-+
-+static uint64_t pollqueue_now(int timeout)
-+{
-+    struct timespec now;
-+    uint64_t now_ms;
-+
-+    if (clock_gettime(CLOCK_MONOTONIC, &now))
-+        return 0;
-+    now_ms = (now.tv_nsec / 1000000) + (uint64_t)now.tv_sec * 1000 + timeout;
-+    return now_ms ? now_ms : (uint64_t)1;
-+}
-+
-+void pollqueue_add_task(struct polltask *const pt, const int timeout)
-+{
-+    bool prodme = false;
-+    struct pollqueue * const pq = pt->q;
-+
-+    pthread_mutex_lock(&pq->lock);
-+    if (pt->state != POLLTASK_Q_KILL && pt->state != POLLTASK_RUN_KILL) {
-+        if (pq->tail)
-+            pq->tail->next = pt;
-+        else
-+            pq->head = pt;
-+        pt->prev = pq->tail;
-+        pt->next = NULL;
-+        pt->state = POLLTASK_QUEUED;
-+        pt->timeout = timeout < 0 ? 0 : pollqueue_now(timeout);
-+        pq->tail = pt;
-+        prodme = !pq->no_prod;
-+    }
-+    pthread_mutex_unlock(&pq->lock);
-+    if (prodme)
-+        pollqueue_prod(pq);
-+}
-+
-+static void *poll_thread(void *v)
-+{
-+    struct pollqueue *const pq = v;
-+    struct pollfd *a = NULL;
-+    size_t asize = 0;
-+
-+    pthread_mutex_lock(&pq->lock);
-+    do {
-+        unsigned int i;
-+        unsigned int n = 0;
-+        struct polltask *pt;
-+        struct polltask *pt_next;
-+        uint64_t now = pollqueue_now(0);
-+        int timeout = -1;
-+        int rv;
-+
-+        for (pt = pq->head; pt; pt = pt_next) {
-+            int64_t t;
-+
-+            pt_next = pt->next;
-+
-+            if (pt->state == POLLTASK_Q_KILL) {
-+                pollqueue_rem_task(pq, pt);
-+                sem_post(&pt->kill_sem);
-+                continue;
-+            }
-+
-+            if (n >= asize) {
-+                asize = asize ? asize * 2 : 4;
-+                a = realloc(a, asize * sizeof(*a));
-+                if (!a) {
-+                    request_log("Failed to realloc poll array to %zd\n", asize);
-+                    goto fail_locked;
-+                }
-+            }
-+
-+            a[n++] = (struct pollfd){
-+                .fd = pt->fd,
-+                .events = pt->events
-+            };
-+
-+            t = (int64_t)(pt->timeout - now);
-+            if (pt->timeout && t < INT_MAX &&
-+                (timeout < 0 || (int)t < timeout))
-+                timeout = (t < 0) ? 0 : (int)t;
-+        }
-+        pthread_mutex_unlock(&pq->lock);
-+
-+        if ((rv = poll(a, n, timeout)) == -1) {
-+            if (errno != EINTR) {
-+                request_log("Poll error: %s\n", strerror(errno));
-+                goto fail_unlocked;
-+            }
-+        }
-+
-+        pthread_mutex_lock(&pq->lock);
-+        now = pollqueue_now(0);
-+
-+        /* Prodding in this loop is pointless and might lead to
-+         * infinite looping
-+        */
-+        pq->no_prod = true;
-+        for (i = 0, pt = pq->head; i < n; ++i, pt = pt_next) {
-+            pt_next = pt->next;
-+
-+            /* Pending? */
-+            if (a[i].revents ||
-+                (pt->timeout && (int64_t)(now - pt->timeout) >= 0)) {
-+                pollqueue_rem_task(pq, pt);
-+                if (pt->state == POLLTASK_QUEUED)
-+                    pt->state = POLLTASK_RUNNING;
-+                if (pt->state == POLLTASK_Q_KILL)
-+                    pt->state = POLLTASK_RUN_KILL;
-+                pthread_mutex_unlock(&pq->lock);
-+
-+                /* This can add new entries to the Q but as
-+                 * those are added to the tail our existing
-+                 * chain remains intact
-+                */
-+                pt->fn(pt->v, a[i].revents);
-+
-+                pthread_mutex_lock(&pq->lock);
-+                if (pt->state == POLLTASK_RUNNING)
-+                    pt->state = POLLTASK_UNQUEUED;
-+                if (pt->state == POLLTASK_RUN_KILL)
-+                    sem_post(&pt->kill_sem);
-+            }
-+        }
-+        pq->no_prod = false;
-+
-+    } while (!pq->kill);
-+
-+fail_locked:
-+    pthread_mutex_unlock(&pq->lock);
-+fail_unlocked:
-+    free(a);
-+    return NULL;
-+}
-+
-+static void prod_fn(void *v, short revents)
-+{
-+    struct pollqueue *const pq = v;
-+    char buf[8];
-+    if (revents)
-+        read(pq->prod_fd, buf, 8);
-+    if (!pq->kill)
-+        pollqueue_add_task(pq->prod_pt, -1);
-+}
-+
-+struct pollqueue * pollqueue_new(void)
-+{
-+    struct pollqueue *pq = malloc(sizeof(*pq));
-+    if (!pq)
-+        return NULL;
-+    *pq = (struct pollqueue){
-+        .ref_count = ATOMIC_VAR_INIT(0),
-+        .lock = PTHREAD_MUTEX_INITIALIZER,
-+        .head = NULL,
-+        .tail = NULL,
-+        .kill = false,
-+        .prod_fd = -1
-+    };
-+
-+    pq->prod_fd = eventfd(0, EFD_NONBLOCK);
-+    if (pq->prod_fd == 1)
-+        goto fail1;
-+    pq->prod_pt = polltask_new(pq, pq->prod_fd, POLLIN, prod_fn, pq);
-+    if (!pq->prod_pt)
-+        goto fail2;
-+    pollqueue_add_task(pq->prod_pt, -1);
-+    if (pthread_create(&pq->worker, NULL, poll_thread, pq))
-+        goto fail3;
-+    // Reset ref count which will have been inced by the add_task
-+    atomic_store(&pq->ref_count, 0);
-+    return pq;
-+
-+fail3:
-+    polltask_free(pq->prod_pt);
-+fail2:
-+    close(pq->prod_fd);
-+fail1:
-+    free(pq);
-+    return NULL;
-+}
-+
-+static void pollqueue_free(struct pollqueue *const pq)
-+{
-+    void *rv;
-+
-+    pthread_mutex_lock(&pq->lock);
-+    pq->kill = true;
-+    pollqueue_prod(pq);
-+    pthread_mutex_unlock(&pq->lock);
-+
-+    pthread_join(pq->worker, &rv);
-+    polltask_free(pq->prod_pt);
-+    pthread_mutex_destroy(&pq->lock);
-+    close(pq->prod_fd);
-+    free(pq);
-+}
-+
-+struct pollqueue * pollqueue_ref(struct pollqueue *const pq)
-+{
-+    atomic_fetch_add(&pq->ref_count, 1);
-+    return pq;
-+}
-+
-+void pollqueue_unref(struct pollqueue **const ppq)
-+{
-+    struct pollqueue * const pq = *ppq;
-+
-+    if (!pq)
-+        return;
-+    *ppq = NULL;
-+
-+    if (atomic_fetch_sub(&pq->ref_count, 1) != 0)
-+        return;
-+
-+    pollqueue_free(pq);
-+}
-+
-+
-+
---- /dev/null
-+++ b/libavcodec/v4l2_req_pollqueue.h
-@@ -0,0 +1,18 @@
-+#ifndef POLLQUEUE_H_
-+#define POLLQUEUE_H_
-+
-+struct polltask;
-+struct pollqueue;
-+
-+struct polltask *polltask_new(struct pollqueue *const pq,
-+			      const int fd, const short events,
-+			      void (*const fn)(void *v, short revents),
-+			      void *const v);
-+void polltask_delete(struct polltask **const ppt);
-+
-+void pollqueue_add_task(struct polltask *const pt, const int timeout);
-+struct pollqueue * pollqueue_new(void);
-+void pollqueue_unref(struct pollqueue **const ppq);
-+struct pollqueue * pollqueue_ref(struct pollqueue *const pq);
-+
-+#endif /* POLLQUEUE_H_ */
---- /dev/null
-+++ b/libavcodec/v4l2_req_utils.h
-@@ -0,0 +1,22 @@
-+#include "libavutil/log.h"
-+
-+#define request_log(...) av_log(NULL, AV_LOG_INFO, __VA_ARGS__)
-+
-+#define request_err(_ctx, ...) av_log(_ctx, AV_LOG_ERROR, __VA_ARGS__)
-+#define request_warn(_ctx, ...) av_log(_ctx, AV_LOG_WARNING, __VA_ARGS__)
-+#define request_info(_ctx, ...) av_log(_ctx, AV_LOG_INFO, __VA_ARGS__)
-+#define request_debug(_ctx, ...) av_log(_ctx, AV_LOG_DEBUG, __VA_ARGS__)
-+
-+static inline char safechar(char c) {
-+    return c > 0x20 && c < 0x7f ? c : '.';
-+}
-+
-+static inline const char * strfourcc(char tbuf[5], uint32_t fcc) {
-+    tbuf[0] = safechar((fcc >>  0) & 0xff);
-+    tbuf[1] = safechar((fcc >>  8) & 0xff);
-+    tbuf[2] = safechar((fcc >> 16) & 0xff);
-+    tbuf[3] = safechar((fcc >> 24) & 0xff);
-+    tbuf[4] = '\0';
-+    return tbuf;
-+}
-+
---- /dev/null
-+++ b/libavcodec/v4l2_request_hevc.c
-@@ -0,0 +1,311 @@
-+/*
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+
-+
-+#include "decode.h"
-+#include "hevcdec.h"
-+#include "hwconfig.h"
-+
-+#include "v4l2_request_hevc.h"
-+
-+#include "libavutil/hwcontext_drm.h"
-+
-+#include "v4l2_req_devscan.h"
-+#include "v4l2_req_dmabufs.h"
-+#include "v4l2_req_pollqueue.h"
-+#include "v4l2_req_media.h"
-+#include "v4l2_req_utils.h"
-+
-+static size_t bit_buf_size(unsigned int w, unsigned int h, unsigned int bits_minus8)
-+{
-+    const size_t wxh = w * h;
-+    size_t bits_alloc;
-+
-+    /* Annex A gives a min compression of 2 @ lvl 3.1
-+     * (wxh <= 983040) and min 4 thereafter but avoid
-+     * the odity of 983041 having a lower limit than
-+     * 983040.
-+     * Multiply by 3/2 for 4:2:0
-+     */
-+    bits_alloc = wxh < 983040 ? wxh * 3 / 4 :
-+        wxh < 983040 * 2 ? 983040 * 3 / 4 :
-+        wxh * 3 / 8;
-+    /* Allow for bit depth */
-+    bits_alloc += (bits_alloc * bits_minus8) / 8;
-+    /* Add a few bytes (16k) for overhead */
-+    bits_alloc += 0x4000;
-+    return bits_alloc;
-+}
-+
-+static int v4l2_req_hevc_start_frame(AVCodecContext *avctx,
-+                                     av_unused const uint8_t *buffer,
-+                                     av_unused uint32_t size)
-+{
-+    const V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
-+    return ctx->fns->start_frame(avctx, buffer, size);
-+}
-+
-+static int v4l2_req_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size)
-+{
-+    V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
-+    return ctx->fns->decode_slice(avctx, buffer, size);
-+}
-+
-+static int v4l2_req_hevc_end_frame(AVCodecContext *avctx)
-+{
-+    V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data;
-+    return ctx->fns->end_frame(avctx);
-+}
-+
-+static void v4l2_req_hevc_abort_frame(AVCodecContext * const avctx)
-+{
-+    V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
-+    ctx->fns->abort_frame(avctx);
-+}
-+
-+static int v4l2_req_hevc_frame_params(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx)
-+{
-+    V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
-+    return ctx->fns->frame_params(avctx, hw_frames_ctx);
-+}
-+
-+static int v4l2_req_hevc_alloc_frame(AVCodecContext * avctx, AVFrame *frame)
-+{
-+    V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
-+    return ctx->fns->alloc_frame(avctx, frame);
-+}
-+
-+
-+static int v4l2_request_hevc_uninit(AVCodecContext *avctx)
-+{
-+    V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
-+
-+    av_log(avctx, AV_LOG_DEBUG, "<<< %s\n", __func__);
-+
-+    decode_q_wait(&ctx->decode_q, NULL);  // Wait for all other threads to be out of decode
-+
-+    mediabufs_ctl_unref(&ctx->mbufs);
-+    media_pool_delete(&ctx->mpool);
-+    pollqueue_unref(&ctx->pq);
-+    dmabufs_ctl_delete(&ctx->dbufs);
-+    devscan_delete(&ctx->devscan);
-+
-+    decode_q_uninit(&ctx->decode_q);
-+
-+//    if (avctx->hw_frames_ctx) {
-+//        AVHWFramesContext *hwfc = (AVHWFramesContext*)avctx->hw_frames_ctx->data;
-+//        av_buffer_pool_flush(hwfc->pool);
-+//    }
-+    return 0;
-+}
-+
-+static int dst_fmt_accept_cb(void * v, const struct v4l2_fmtdesc *fmtdesc)
-+{
-+    AVCodecContext *const avctx = v;
-+    const HEVCContext *const h = avctx->priv_data;
-+
-+    if (h->ps.sps->bit_depth == 8) {
-+        if (fmtdesc->pixelformat == V4L2_PIX_FMT_NV12_COL128 ||
-+            fmtdesc->pixelformat == V4L2_PIX_FMT_NV12) {
-+            return 1;
-+        }
-+    }
-+    else if (h->ps.sps->bit_depth == 10) {
-+        if (fmtdesc->pixelformat == V4L2_PIX_FMT_NV12_10_COL128) {
-+            return 1;
-+        }
-+    }
-+    return 0;
-+}
-+
-+static int v4l2_request_hevc_init(AVCodecContext *avctx)
-+{
-+    const HEVCContext *h = avctx->priv_data;
-+    V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
-+    const HEVCSPS * const sps = h->ps.sps;
-+    int ret;
-+    const struct decdev * decdev;
-+    const uint32_t src_pix_fmt = V2(ff_v4l2_req_hevc, 1).src_pix_fmt_v4l2;  // Assuming constant for all APIs but avoiding V4L2 includes
-+    size_t src_size;
-+
-+    av_log(avctx, AV_LOG_DEBUG, "<<< %s\n", __func__);
-+
-+    // Give up immediately if this is something that we have no code to deal with
-+    if (h->ps.sps->chroma_format_idc != 1) {
-+        av_log(avctx, AV_LOG_WARNING, "chroma_format_idc(%d) != 1: Not implemented\n", h->ps.sps->chroma_format_idc);
-+        return AVERROR_PATCHWELCOME;
-+    }
-+    if (!(h->ps.sps->bit_depth == 10 || h->ps.sps->bit_depth == 8) ||
-+        h->ps.sps->bit_depth != h->ps.sps->bit_depth_chroma) {
-+        av_log(avctx, AV_LOG_WARNING, "Bit depth Y:%d C:%d: Not implemented\n", h->ps.sps->bit_depth, h->ps.sps->bit_depth_chroma);
-+        return AVERROR_PATCHWELCOME;
-+    }
-+
-+    if ((ret = devscan_build(avctx, &ctx->devscan)) != 0) {
-+        av_log(avctx, AV_LOG_WARNING, "Failed to find any V4L2 devices\n");
-+        return (AVERROR(-ret));
-+    }
-+    ret = AVERROR(ENOMEM);  // Assume mem fail by default for these
-+
-+    if ((decdev = devscan_find(ctx->devscan, src_pix_fmt)) == NULL)
-+    {
-+        av_log(avctx, AV_LOG_WARNING, "Failed to find a V4L2 device for H265\n");
-+        ret = AVERROR(ENODEV);
-+        goto fail0;
-+    }
-+    av_log(avctx, AV_LOG_DEBUG, "Trying V4L2 devices: %s,%s\n",
-+           decdev_media_path(decdev), decdev_video_path(decdev));
-+
-+    if ((ctx->dbufs = dmabufs_ctl_new()) == NULL) {
-+        av_log(avctx, AV_LOG_ERROR, "Unable to open dmabufs\n");
-+        goto fail0;
-+    }
-+
-+    if ((ctx->pq = pollqueue_new()) == NULL) {
-+        av_log(avctx, AV_LOG_ERROR, "Unable to create pollqueue\n");
-+        goto fail1;
-+    }
-+
-+    if ((ctx->mpool = media_pool_new(decdev_media_path(decdev), ctx->pq, 4)) == NULL) {
-+        av_log(avctx, AV_LOG_ERROR, "Unable to create media pool\n");
-+        goto fail2;
-+    }
-+
-+    if ((ctx->mbufs = mediabufs_ctl_new(avctx, decdev_video_path(decdev), ctx->pq)) == NULL) {
-+        av_log(avctx, AV_LOG_ERROR, "Unable to create media controls\n");
-+        goto fail3;
-+    }
-+
-+    // Ask for an initial bitbuf size of max size / 4
-+    // We will realloc if we need more
-+    // Must use sps->h/w as avctx contains cropped size
-+    src_size = bit_buf_size(sps->width, sps->height, sps->bit_depth - 8);
-+    if (mediabufs_src_resizable(ctx->mbufs))
-+        src_size /= 4;
-+    // Kludge for conformance tests which break Annex A limits
-+    else if (src_size < 0x40000)
-+        src_size = 0x40000;
-+
-+    if (mediabufs_src_fmt_set(ctx->mbufs, decdev_src_type(decdev), src_pix_fmt,
-+                              sps->width, sps->height, src_size)) {
-+        char tbuf1[5];
-+        av_log(avctx, AV_LOG_ERROR, "Failed to set source format: %s %dx%d\n", strfourcc(tbuf1, src_pix_fmt), sps->width, sps->height);
-+        goto fail4;
-+    }
-+
-+    if (V2(ff_v4l2_req_hevc, 3).probe(avctx, ctx) == 0) {
-+        av_log(avctx, AV_LOG_DEBUG, "HEVC API version 3 probed successfully\n");
-+        ctx->fns = &V2(ff_v4l2_req_hevc, 3);
-+    }
-+    else if (V2(ff_v4l2_req_hevc, 2).probe(avctx, ctx) == 0) {
-+        av_log(avctx, AV_LOG_DEBUG, "HEVC API version 2 probed successfully\n");
-+        ctx->fns = &V2(ff_v4l2_req_hevc, 2);
-+    }
-+    else if (V2(ff_v4l2_req_hevc, 1).probe(avctx, ctx) == 0) {
-+        av_log(avctx, AV_LOG_DEBUG, "HEVC API version 1 probed successfully\n");
-+        ctx->fns = &V2(ff_v4l2_req_hevc, 1);
-+    }
-+    else {
-+        av_log(avctx, AV_LOG_ERROR, "No HEVC version probed successfully\n");
-+        ret = AVERROR(EINVAL);
-+        goto fail4;
-+    }
-+
-+    if (mediabufs_dst_fmt_set(ctx->mbufs, sps->width, sps->height, dst_fmt_accept_cb, avctx)) {
-+        char tbuf1[5];
-+        av_log(avctx, AV_LOG_ERROR, "Failed to set destination format: %s %dx%d\n", strfourcc(tbuf1, src_pix_fmt), sps->width, sps->height);
-+        goto fail4;
-+    }
-+
-+    if (mediabufs_src_pool_create(ctx->mbufs, ctx->dbufs, 6)) {
-+        av_log(avctx, AV_LOG_ERROR, "Failed to create source pool\n");
-+        goto fail4;
-+    }
-+
-+    {
-+        unsigned int dst_slots = sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering +
-+            avctx->thread_count + (avctx->extra_hw_frames > 0 ? avctx->extra_hw_frames : 6);
-+        av_log(avctx, AV_LOG_DEBUG, "Slots=%d: Reordering=%d, threads=%d, hw+=%d\n", dst_slots,
-+               sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering,
-+               avctx->thread_count, avctx->extra_hw_frames);
-+
-+        // extra_hw_frames is -1 if unset
-+        if (mediabufs_dst_slots_create(ctx->mbufs, dst_slots, (avctx->extra_hw_frames > 0))) {
-+            av_log(avctx, AV_LOG_ERROR, "Failed to create destination slots\n");
-+            goto fail4;
-+        }
-+    }
-+
-+    if (mediabufs_stream_on(ctx->mbufs)) {
-+        av_log(avctx, AV_LOG_ERROR, "Failed stream on\n");
-+        goto fail4;
-+    }
-+
-+    if ((ret = ff_decode_get_hw_frames_ctx(avctx, AV_HWDEVICE_TYPE_DRM)) != 0) {
-+        av_log(avctx, AV_LOG_ERROR, "Failed to create frame ctx\n");
-+        goto fail4;
-+    }
-+
-+    if ((ret = ctx->fns->set_controls(avctx, ctx)) != 0) {
-+        av_log(avctx, AV_LOG_ERROR, "Failed set controls\n");
-+        goto fail5;
-+    }
-+
-+    decode_q_init(&ctx->decode_q);
-+
-+    // Set our s/w format
-+    avctx->sw_pix_fmt = ((AVHWFramesContext *)avctx->hw_frames_ctx->data)->sw_format;
-+
-+    av_log(avctx, AV_LOG_INFO, "Hwaccel %s; devices: %s,%s\n",
-+           ctx->fns->name,
-+           decdev_media_path(decdev), decdev_video_path(decdev));
-+
-+    return 0;
-+
-+fail5:
-+    av_buffer_unref(&avctx->hw_frames_ctx);
-+fail4:
-+    mediabufs_ctl_unref(&ctx->mbufs);
-+fail3:
-+    media_pool_delete(&ctx->mpool);
-+fail2:
-+    pollqueue_unref(&ctx->pq);
-+fail1:
-+    dmabufs_ctl_delete(&ctx->dbufs);
-+fail0:
-+    devscan_delete(&ctx->devscan);
-+    return ret;
-+}
-+
-+const AVHWAccel ff_hevc_v4l2request_hwaccel = {
-+    .name           = "hevc_v4l2request",
-+    .type           = AVMEDIA_TYPE_VIDEO,
-+    .id             = AV_CODEC_ID_HEVC,
-+    .pix_fmt        = AV_PIX_FMT_DRM_PRIME,
-+    .alloc_frame    = v4l2_req_hevc_alloc_frame,
-+    .start_frame    = v4l2_req_hevc_start_frame,
-+    .decode_slice   = v4l2_req_hevc_decode_slice,
-+    .end_frame      = v4l2_req_hevc_end_frame,
-+    .abort_frame    = v4l2_req_hevc_abort_frame,
-+    .init           = v4l2_request_hevc_init,
-+    .uninit         = v4l2_request_hevc_uninit,
-+    .priv_data_size = sizeof(V4L2RequestContextHEVC),
-+    .frame_params   = v4l2_req_hevc_frame_params,
-+    .caps_internal  = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE,
-+};
---- /dev/null
-+++ b/libavcodec/v4l2_request_hevc.h
-@@ -0,0 +1,102 @@
-+#ifndef AVCODEC_V4L2_REQUEST_HEVC_H
-+#define AVCODEC_V4L2_REQUEST_HEVC_H
-+
-+#include <drm_fourcc.h>
-+#include "v4l2_req_decode_q.h"
-+
-+#ifndef DRM_FORMAT_NV15
-+#define DRM_FORMAT_NV15 fourcc_code('N', 'V', '1', '5')
-+#endif
-+
-+#ifndef DRM_FORMAT_NV20
-+#define DRM_FORMAT_NV20 fourcc_code('N', 'V', '2', '0')
-+#endif
-+
-+// P030 should be defined in drm_fourcc.h and hopefully will be sometime
-+// in the future but until then...
-+#ifndef DRM_FORMAT_P030
-+#define DRM_FORMAT_P030 fourcc_code('P', '0', '3', '0')
-+#endif
-+
-+#ifndef DRM_FORMAT_NV15
-+#define DRM_FORMAT_NV15 fourcc_code('N', 'V', '1', '5')
-+#endif
-+
-+#ifndef DRM_FORMAT_NV20
-+#define DRM_FORMAT_NV20 fourcc_code('N', 'V', '2', '0')
-+#endif
-+
-+#include <linux/videodev2.h>
-+#ifndef V4L2_CID_CODEC_BASE
-+#define V4L2_CID_CODEC_BASE V4L2_CID_MPEG_BASE
-+#endif
-+
-+// V4L2_PIX_FMT_NV12_10_COL128 and V4L2_PIX_FMT_NV12_COL128 should be defined
-+// in drm_fourcc.h hopefully will be sometime in the future but until then...
-+#ifndef V4L2_PIX_FMT_NV12_10_COL128
-+#define V4L2_PIX_FMT_NV12_10_COL128 v4l2_fourcc('N', 'C', '3', '0')
-+#endif
-+
-+#ifndef V4L2_PIX_FMT_NV12_COL128
-+#define V4L2_PIX_FMT_NV12_COL128 v4l2_fourcc('N', 'C', '1', '2') /* 12  Y/CbCr 4:2:0 128 pixel wide column */
-+#endif
-+
-+#ifndef V4L2_CTRL_FLAG_DYNAMIC_ARRAY
-+#define V4L2_CTRL_FLAG_DYNAMIC_ARRAY	0x0800
-+#endif
-+
-+#define MAX_SLICES 128
-+
-+#define VCAT(name, version) name##_v##version
-+#define V2(n,v) VCAT(n, v)
-+#define V(n) V2(n, HEVC_CTRLS_VERSION)
-+
-+#define S2(x) #x
-+#define STR(x) S2(x)
-+
-+// 1 per decoder
-+struct v4l2_req_decode_fns;
-+
-+typedef struct V4L2RequestContextHEVC {
-+//    V4L2RequestContext base;
-+    const struct v4l2_req_decode_fns * fns;
-+
-+    unsigned int timestamp;  // ?? maybe uint64_t
-+
-+    int multi_slice;
-+    int decode_mode;
-+    int start_code;
-+    int max_slices;
-+
-+    req_decode_q decode_q;
-+
-+    struct devscan *devscan;
-+    struct dmabufs_ctl *dbufs;
-+    struct pollqueue *pq;
-+    struct media_pool * mpool;
-+    struct mediabufs_ctl *mbufs;
-+} V4L2RequestContextHEVC;
-+
-+typedef struct v4l2_req_decode_fns {
-+    int src_pix_fmt_v4l2;
-+    const char * name;
-+
-+    // Init setup
-+    int (*probe)(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx);
-+    int (*set_controls)(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx);
-+
-+    // Passthrough of hwaccel fns
-+    int (*start_frame)(AVCodecContext *avctx, const uint8_t *buf, uint32_t buf_size);
-+    int (*decode_slice)(AVCodecContext *avctx, const uint8_t *buf, uint32_t buf_size);
-+    int (*end_frame)(AVCodecContext *avctx);
-+    void (*abort_frame)(AVCodecContext *avctx);
-+    int (*frame_params)(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx);
-+    int (*alloc_frame)(AVCodecContext * avctx, AVFrame *frame);
-+} v4l2_req_decode_fns;
-+
-+
-+extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 1);
-+extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 2);
-+extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 3);
-+
-+#endif
---- a/libavcodec/vc1dec.c
-+++ b/libavcodec/vc1dec.c
-@@ -486,7 +486,7 @@ static av_cold int vc1_decode_init(AVCod
-             size = next - start - 4;
-             if (size <= 0)
-                 continue;
--            buf2_size = vc1_unescape_buffer(start + 4, size, buf2);
-+            buf2_size = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2);
-             init_get_bits(&gb, buf2, buf2_size * 8);
-             switch (AV_RB32(start)) {
-             case VC1_CODE_SEQHDR:
-@@ -689,7 +689,7 @@ static int vc1_decode_frame(AVCodecConte
-                 case VC1_CODE_FRAME:
-                     if (avctx->hwaccel)
-                         buf_start = start;
--                    buf_size2 = vc1_unescape_buffer(start + 4, size, buf2);
-+                    buf_size2 = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2);
-                     break;
-                 case VC1_CODE_FIELD: {
-                     int buf_size3;
-@@ -706,8 +706,8 @@ static int vc1_decode_frame(AVCodecConte
-                         ret = AVERROR(ENOMEM);
-                         goto err;
-                     }
--                    buf_size3 = vc1_unescape_buffer(start + 4, size,
--                                                    slices[n_slices].buf);
-+                    buf_size3 = v->vc1dsp.vc1_unescape_buffer(start + 4, size,
-+                                                              slices[n_slices].buf);
-                     init_get_bits(&slices[n_slices].gb, slices[n_slices].buf,
-                                   buf_size3 << 3);
-                     slices[n_slices].mby_start = avctx->coded_height + 31 >> 5;
-@@ -718,7 +718,7 @@ static int vc1_decode_frame(AVCodecConte
-                     break;
-                 }
-                 case VC1_CODE_ENTRYPOINT: /* it should be before frame data */
--                    buf_size2 = vc1_unescape_buffer(start + 4, size, buf2);
-+                    buf_size2 = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2);
-                     init_get_bits(&s->gb, buf2, buf_size2 * 8);
-                     ff_vc1_decode_entry_point(avctx, v, &s->gb);
-                     break;
-@@ -735,8 +735,8 @@ static int vc1_decode_frame(AVCodecConte
-                         ret = AVERROR(ENOMEM);
-                         goto err;
-                     }
--                    buf_size3 = vc1_unescape_buffer(start + 4, size,
--                                                    slices[n_slices].buf);
-+                    buf_size3 = v->vc1dsp.vc1_unescape_buffer(start + 4, size,
-+                                                              slices[n_slices].buf);
-                     init_get_bits(&slices[n_slices].gb, slices[n_slices].buf,
-                                   buf_size3 << 3);
-                     slices[n_slices].mby_start = get_bits(&slices[n_slices].gb, 9);
-@@ -770,7 +770,7 @@ static int vc1_decode_frame(AVCodecConte
-                     ret = AVERROR(ENOMEM);
-                     goto err;
-                 }
--                buf_size3 = vc1_unescape_buffer(divider + 4, buf + buf_size - divider - 4, slices[n_slices].buf);
-+                buf_size3 = v->vc1dsp.vc1_unescape_buffer(divider + 4, buf + buf_size - divider - 4, slices[n_slices].buf);
-                 init_get_bits(&slices[n_slices].gb, slices[n_slices].buf,
-                               buf_size3 << 3);
-                 slices[n_slices].mby_start = s->mb_height + 1 >> 1;
-@@ -779,9 +779,9 @@ static int vc1_decode_frame(AVCodecConte
-                 n_slices1 = n_slices - 1;
-                 n_slices++;
-             }
--            buf_size2 = vc1_unescape_buffer(buf, divider - buf, buf2);
-+            buf_size2 = v->vc1dsp.vc1_unescape_buffer(buf, divider - buf, buf2);
-         } else {
--            buf_size2 = vc1_unescape_buffer(buf, buf_size, buf2);
-+            buf_size2 = v->vc1dsp.vc1_unescape_buffer(buf, buf_size, buf2);
-         }
-         init_get_bits(&s->gb, buf2, buf_size2*8);
-     } else
---- a/libavcodec/vc1dsp.c
-+++ b/libavcodec/vc1dsp.c
-@@ -32,6 +32,7 @@
- #include "rnd_avg.h"
- #include "vc1dsp.h"
- #include "startcode.h"
-+#include "vc1_common.h"
- 
- /* Apply overlap transform to horizontal edge */
- static void vc1_v_overlap_c(uint8_t *src, int stride)
-@@ -1028,6 +1029,7 @@ av_cold void ff_vc1dsp_init(VC1DSPContex
- #endif /* CONFIG_WMV3IMAGE_DECODER || CONFIG_VC1IMAGE_DECODER */
- 
-     dsp->startcode_find_candidate = ff_startcode_find_candidate_c;
-+    dsp->vc1_unescape_buffer      = vc1_unescape_buffer;
- 
-     if (ARCH_AARCH64)
-         ff_vc1dsp_init_aarch64(dsp);
---- a/libavcodec/vc1dsp.h
-+++ b/libavcodec/vc1dsp.h
-@@ -80,6 +80,9 @@ typedef struct VC1DSPContext {
-      * one or more further zero bytes and a one byte.
-      */
-     int (*startcode_find_candidate)(const uint8_t *buf, int size);
-+
-+    /* Copy a buffer, removing startcode emulation escape bytes as we go */
-+    int (*vc1_unescape_buffer)(const uint8_t *src, int size, uint8_t *dst);
- } VC1DSPContext;
- 
- void ff_vc1dsp_init(VC1DSPContext* c);
---- /dev/null
-+++ b/libavcodec/weak_link.c
-@@ -0,0 +1,102 @@
-+#include <stdlib.h>
-+#include <pthread.h>
-+#include <stdatomic.h>
-+#include "weak_link.h"
-+
-+struct ff_weak_link_master {
-+    atomic_int ref_count;    /* 0 is single ref for easier atomics */
-+    pthread_rwlock_t lock;
-+    void * ptr;
-+};
-+
-+static inline struct ff_weak_link_master * weak_link_x(struct ff_weak_link_client * c)
-+{
-+    return (struct ff_weak_link_master *)c;
-+}
-+
-+struct ff_weak_link_master * ff_weak_link_new(void * p)
-+{
-+    struct ff_weak_link_master * w = malloc(sizeof(*w));
-+    if (!w)
-+        return NULL;
-+    w->ptr = p;
-+    if (pthread_rwlock_init(&w->lock, NULL)) {
-+        free(w);
-+        return NULL;
-+    }
-+    return w;
-+}
-+
-+static void weak_link_do_unref(struct ff_weak_link_master * const w)
-+{
-+    int n = atomic_fetch_sub(&w->ref_count, 1);
-+    if (n)
-+        return;
-+
-+    pthread_rwlock_destroy(&w->lock);
-+    free(w);
-+}
-+
-+// Unref & break link
-+void ff_weak_link_break(struct ff_weak_link_master ** ppLink)
-+{
-+    struct ff_weak_link_master * const w = *ppLink;
-+    if (!w)
-+        return;
-+
-+    *ppLink = NULL;
-+    pthread_rwlock_wrlock(&w->lock);
-+    w->ptr = NULL;
-+    pthread_rwlock_unlock(&w->lock);
-+
-+    weak_link_do_unref(w);
-+}
-+
-+struct ff_weak_link_client* ff_weak_link_ref(struct ff_weak_link_master * w)
-+{
-+    if (!w)
-+        return NULL;
-+    atomic_fetch_add(&w->ref_count, 1);
-+    return (struct ff_weak_link_client*)w;
-+}
-+
-+/* Release a client reference and zero the caller's handle; NULL handles are ignored */
-+void ff_weak_link_unref(struct ff_weak_link_client ** ppLink)
-+{
-+    struct ff_weak_link_master * const master = weak_link_x(*ppLink);
-+
-+    if (master != NULL) {
-+        *ppLink = NULL;
-+        weak_link_do_unref(master);
-+    }
-+}
-+
-+/*
-+ * Try to obtain the target pointer through a client handle.
-+ * On success the target is returned with the read lock still held; the
-+ * caller releases it with ff_weak_link_unlock().
-+ * Returns NULL if *ppLink is NULL. Also returns NULL if the read lock cannot
-+ * be taken or the link has been broken - in those two cases *ppLink is
-+ * zeroed and the client reference is dropped.
-+ */
-+void * ff_weak_link_lock(struct ff_weak_link_client ** ppLink)
-+{
-+    struct ff_weak_link_master * const w = weak_link_x(*ppLink);
-+
-+    if (!w)
-+        return NULL;
-+
-+    if (pthread_rwlock_rdlock(&w->lock))
-+        goto broken;
-+
-+    /* Live link: return with the read lock held */
-+    if (w->ptr)
-+        return w->ptr;
-+
-+    /* Target already cleared: release the lock before giving up the reference */
-+    pthread_rwlock_unlock(&w->lock);
-+
-+broken:
-+    *ppLink = NULL;
-+    weak_link_do_unref(w);
-+    return NULL;
-+}
-+
-+// Ignores a NULL c (so can be on the return path of both broken & live links)
-+// Releases the lock on the link that c refers to
-+void ff_weak_link_unlock(struct ff_weak_link_client * c)
-+{
-+    struct ff_weak_link_master * const master = weak_link_x(c);
-+
-+    if (master == NULL)
-+        return;
-+
-+    pthread_rwlock_unlock(&master->lock);
-+}
-+
-+
---- /dev/null
-+++ b/libavcodec/weak_link.h
-@@ -0,0 +1,23 @@
-+struct ff_weak_link_master;
-+struct ff_weak_link_client;
-+
-+struct ff_weak_link_master * ff_weak_link_new(void * p);
-+void ff_weak_link_break(struct ff_weak_link_master ** ppLink);
-+
-+struct ff_weak_link_client* ff_weak_link_ref(struct ff_weak_link_master * w);
-+void ff_weak_link_unref(struct ff_weak_link_client ** ppLink);
-+
-+// Returns NULL if link broken - in this case it will also zap
-+//   *ppLink and unref the weak_link.
-+// Returns NULL if *ppLink is NULL (so a link once broken stays broken)
-+//
-+// The above does mean that there is a race if this is called simultaneously
-+// by two threads using the same weak_link_client (so don't do that)
-+void * ff_weak_link_lock(struct ff_weak_link_client ** ppLink);
-+void ff_weak_link_unlock(struct ff_weak_link_client * c);
-+
-+
-+
-+
-+
-+
---- a/libavdevice/Makefile
-+++ b/libavdevice/Makefile
-@@ -46,6 +46,9 @@ OBJS-$(CONFIG_SNDIO_OUTDEV)
- OBJS-$(CONFIG_V4L2_INDEV)                += v4l2.o v4l2-common.o timefilter.o
- OBJS-$(CONFIG_V4L2_OUTDEV)               += v4l2enc.o v4l2-common.o
- OBJS-$(CONFIG_VFWCAP_INDEV)              += vfwcap.o
-+OBJS-$(CONFIG_VOUT_DRM_OUTDEV)           += drm_vout.o
-+OBJS-$(CONFIG_VOUT_EGL_OUTDEV)           += egl_vout.o
-+OBJS-$(CONFIG_VOUT_RPI_OUTDEV)           += rpi_vout.o
- OBJS-$(CONFIG_XCBGRAB_INDEV)             += xcbgrab.o
- OBJS-$(CONFIG_XV_OUTDEV)                 += xv.o
- 
---- a/libavdevice/alldevices.c
-+++ b/libavdevice/alldevices.c
-@@ -52,6 +52,9 @@ extern AVOutputFormat ff_sndio_muxer;
- extern AVInputFormat  ff_v4l2_demuxer;
- extern AVOutputFormat ff_v4l2_muxer;
- extern AVInputFormat  ff_vfwcap_demuxer;
-+extern AVOutputFormat ff_vout_drm_muxer;
-+extern AVOutputFormat ff_vout_egl_muxer;
-+extern AVOutputFormat ff_vout_rpi_muxer;
- extern AVInputFormat  ff_xcbgrab_demuxer;
- extern AVOutputFormat ff_xv_muxer;
- 
---- /dev/null
-+++ b/libavdevice/drm_vout.c
-@@ -0,0 +1,643 @@
-+/*
-+ * Copyright (c) 2020 John Cox for Raspberry Pi Trading
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+
-+// *** This module is a work in progress and its utility is strictly
-+//     limited to testing.
-+
-+#include "libavutil/opt.h"
-+#include "libavutil/pixdesc.h"
-+#include "libavutil/hwcontext_drm.h"
-+#include "libavformat/internal.h"
-+#include "avdevice.h"
-+
-+#include "pthread.h"
-+#include <semaphore.h>
-+#include <unistd.h>
-+
-+#include <xf86drm.h>
-+#include <xf86drmMode.h>
-+
-+#define TRACE_ALL 0
-+
-+#define DRM_MODULE "vc4"
-+
-+#define ERRSTR strerror(errno)
-+
-+/* Output selection: which connector/CRTC/plane is driven and where on screen */
-+struct drm_setup {
-+   int conId;               /* connector id; 0 makes find_crtc() pick one */
-+   uint32_t crtcId;         /* CRTC id paired with the chosen connector */
-+   int crtcIdx;             /* position of crtcId in the resource CRTC list, -1 if not found */
-+   uint32_t planeId;        /* plane selected by find_plane() for out_fourcc */
-+   unsigned int out_fourcc; /* format planeId was selected for; 0 until the first frame */
-+   struct {
-+       int x, y, width, height;
-+   } compose;               /* destination rectangle, copied from the CRTC geometry */
-+};
-+
-+/* Per-slot state kept for a frame handed to DRM; released by da_uninit() */
-+typedef struct drm_aux_s {
-+    unsigned int fb_handle;                 /* framebuffer id from drmModeAddFB2WithModifiers, 0 if none */
-+    uint32_t bo_handles[AV_DRM_MAX_PLANES]; /* GEM handles imported from the frame's fds, 0 if unused */
-+    AVFrame * frame;                        /* frame owned by this slot until it is recycled */
-+} drm_aux_t;
-+
-+// Aux size should only need to be 2, but on a few streams (Hobbit) under FKMS
-+// we get initial flicker probably due to dodgy drm timing
-+#define AUX_SIZE 3
-+/* Private data of the vout_drm output device */
-+typedef struct drm_display_env_s
-+{
-+    AVClass *class;
-+
-+    int drm_fd;                 /* fd returned by drmOpen(), -1 when closed */
-+    uint32_t con_id;            /* connector id reported by find_crtc() */
-+    struct drm_setup setup;
-+    enum AVPixelFormat avfmt;
-+    int show_all;               /* option: block for the display thread instead of dropping frames */
-+
-+    unsigned int ano;           /* index of the aux slot the next frame will use */
-+    drm_aux_t aux[AUX_SIZE];
-+
-+    pthread_t q_thread;         /* display thread */
-+    sem_t q_sem_in;             /* posted when q_next holds a frame or q_terminate is set */
-+    sem_t q_sem_out;            /* posted by the display thread when it can take another frame */
-+    int q_terminate;            /* set by deinit to stop the display thread */
-+    AVFrame * q_next;           /* single-entry hand-over slot between writer and display thread */
-+
-+} drm_display_env_t;
-+
-+
-+/* write_trailer callback: nothing to flush, only traces when TRACE_ALL is set */
-+static int drm_vout_write_trailer(AVFormatContext *s)
-+{
-+#if TRACE_ALL
-+    av_log(s, AV_LOG_DEBUG, "%s\n", __func__);
-+#endif
-+
-+    return 0;
-+}
-+
-+/*
-+ * write_header callback: accept only a single video stream that carries
-+ * wrapped AVFrames, otherwise fail with AVERROR(EINVAL).
-+ */
-+static int drm_vout_write_header(AVFormatContext *s)
-+{
-+    const AVCodecParameters * const par = s->streams[0]->codecpar;
-+    int acceptable;
-+
-+#if TRACE_ALL
-+    av_log(s, AV_LOG_DEBUG, "%s\n", __func__);
-+#endif
-+    acceptable = s->nb_streams <= 1
-+        && par->codec_type == AVMEDIA_TYPE_VIDEO
-+        && par->codec_id   == AV_CODEC_ID_WRAPPED_AVFRAME;
-+
-+    if (acceptable)
-+        return 0;
-+
-+    av_log(s, AV_LOG_ERROR, "Only supports one wrapped avframe stream\n");
-+    return AVERROR(EINVAL);
-+}
-+
-+/*
-+ * Find a plane that can be used on the CRTC with index crtcidx and that
-+ * supports the fourcc 'format'.
-+ * On success stores the plane id in *pplane_id and returns 0. Returns -1 if
-+ * the plane resources cannot be read, a plane cannot be queried, or no plane
-+ * matches; *pplane_id is left untouched in those cases.
-+ */
-+static int find_plane(struct AVFormatContext * const avctx,
-+                      const int drmfd, const int crtcidx, const uint32_t format,
-+                      uint32_t * const pplane_id)
-+{
-+   drmModePlaneResPtr planes;
-+   drmModePlanePtr plane;
-+   unsigned int i;
-+   unsigned int j;
-+   int ret = -1;
-+
-+   planes = drmModeGetPlaneResources(drmfd);
-+   if (!planes)
-+   {
-+       av_log(avctx, AV_LOG_WARNING, "drmModeGetPlaneResources failed: %s\n", ERRSTR);
-+       return -1;
-+   }
-+
-+   for (i = 0; i < planes->count_planes; ++i) {
-+      plane = drmModeGetPlane(drmfd, planes->planes[i]);
-+      /* Test the plane just fetched, not the (already validated) resource list */
-+      if (!plane)
-+      {
-+          av_log(avctx, AV_LOG_WARNING, "drmModeGetPlane failed: %s\n", ERRSTR);
-+          break;
-+      }
-+
-+      /* possible_crtcs is a bitmask indexed by CRTC position */
-+      if (!(plane->possible_crtcs & (1U << crtcidx))) {
-+         drmModeFreePlane(plane);
-+         continue;
-+      }
-+
-+      for (j = 0; j < plane->count_formats; ++j) {
-+         if (plane->formats[j] == format)
-+            break;
-+      }
-+
-+      if (j == plane->count_formats) {
-+         drmModeFreePlane(plane);
-+         continue;
-+      }
-+
-+      *pplane_id = plane->plane_id;
-+      drmModeFreePlane(plane);
-+      /* Only an actual match counts as success; a failed query above does not */
-+      ret = 0;
-+      break;
-+   }
-+
-+   drmModeFreePlaneResources(planes);
-+   return ret;
-+}
-+
-+/*
-+ * Release everything an aux slot holds: the framebuffer, the imported GEM
-+ * handles and the frame reference. Cleared fields make a repeat call harmless.
-+ */
-+static void da_uninit(drm_display_env_t * const de, drm_aux_t * da)
-+{
-+    if (da->fb_handle != 0) {
-+        drmModeRmFB(de->drm_fd, da->fb_handle);
-+        da->fb_handle = 0;
-+    }
-+
-+    for (unsigned int i = 0; i != AV_DRM_MAX_PLANES; ++i) {
-+        if (da->bo_handles[i]) {
-+            struct drm_gem_close gem_close = {.handle = da->bo_handles[i]};
-+            drmIoctl(de->drm_fd, DRM_IOCTL_GEM_CLOSE, &gem_close);
-+            da->bo_handles[i] = 0;
-+        }
-+    }
-+    av_frame_free(&da->frame);
-+}
-+
-+/*
-+ * Put one DRM-PRIME frame on screen.
-+ * Takes ownership of 'frame': it is freed here if no plane supports its
-+ * format, otherwise it is stored in the current aux slot and released when
-+ * that slot is next recycled by da_uninit().
-+ * Returns 0 on success, -1 on failure, or the drmModeSetPlane() result.
-+ */
-+static int do_display(AVFormatContext * const s, drm_display_env_t * const de, AVFrame * frame)
-+{
-+    const AVDRMFrameDescriptor *desc = (AVDRMFrameDescriptor*)frame->data[0];
-+    drm_aux_t * da = de->aux + de->ano;
-+    const uint32_t format = desc->layers[0].format;
-+    int ret = 0;
-+
-+#if TRACE_ALL
-+    av_log(s, AV_LOG_DEBUG, "<<< %s: fd=%d\n", __func__, desc->objects[0].fd);
-+#endif
-+
-+    /* Re-select the plane only when the incoming format changes */
-+    if (de->setup.out_fourcc != format) {
-+        if (find_plane(s, de->drm_fd, de->setup.crtcIdx, format, &de->setup.planeId)) {
-+            av_frame_free(&frame);
-+            av_log(s, AV_LOG_WARNING, "No plane for format: %#x\n", format);
-+            return -1;
-+        }
-+        de->setup.out_fourcc = format;
-+    }
-+
-+    {
-+        drmVBlank vbl = {
-+            .request = {
-+                .type = DRM_VBLANK_RELATIVE,
-+                .sequence = 0
-+            }
-+        };
-+
-+        /* Retry only on EINTR; any other failure is ignored and we carry on */
-+        while (drmWaitVBlank(de->drm_fd, &vbl)) {
-+            if (errno != EINTR) {
-+//                av_log(s, AV_LOG_WARNING, "drmWaitVBlank failed: %s\n", ERRSTR);
-+                break;
-+            }
-+        }
-+    }
-+
-+    /* Free whatever this slot held from an earlier frame before reusing it */
-+    da_uninit(de, da);
-+
-+    {
-+        uint32_t pitches[4] = {0};
-+        uint32_t offsets[4] = {0};
-+        uint64_t modifiers[4] = {0};
-+        uint32_t bo_handles[4] = {0};
-+        int i, j, n;
-+
-+        /* From here on the slot owns the frame, including on the error returns below */
-+        da->frame = frame;
-+
-+        for (i = 0; i < desc->nb_objects; ++i) {
-+            if (drmPrimeFDToHandle(de->drm_fd, desc->objects[i].fd, da->bo_handles + i) != 0) {
-+                av_log(s, AV_LOG_WARNING, "drmPrimeFDToHandle[%d](%d) failed: %s\n", i, desc->objects[i].fd, ERRSTR);
-+                return -1;
-+            }
-+        }
-+
-+        /* Flatten the layer/plane hierarchy into the per-plane arrays AddFB2 expects */
-+        /* NOTE(review): n is not bounded against the 4-entry arrays - confirm that */
-+        /* descriptors never carry more than 4 planes in total */
-+        n = 0;
-+        for (i = 0; i < desc->nb_layers; ++i) {
-+            for (j = 0; j < desc->layers[i].nb_planes; ++j) {
-+                const AVDRMPlaneDescriptor * const p = desc->layers[i].planes + j;
-+                const AVDRMObjectDescriptor * const obj = desc->objects + p->object_index;
-+                pitches[n] = p->pitch;
-+                offsets[n] = p->offset;
-+                modifiers[n] = obj->format_modifier;
-+                bo_handles[n] = da->bo_handles[p->object_index];
-+                ++n;
-+            }
-+        }
-+
-+#if 1 && TRACE_ALL
-+        av_log(s, AV_LOG_DEBUG, "%dx%d, fmt: %x, boh=%d,%d,%d,%d, pitch=%d,%d,%d,%d,"
-+               " offset=%d,%d,%d,%d, mod=%llx,%llx,%llx,%llx\n",
-+               av_frame_cropped_width(frame),
-+               av_frame_cropped_height(frame),
-+               desc->layers[0].format,
-+               bo_handles[0],
-+               bo_handles[1],
-+               bo_handles[2],
-+               bo_handles[3],
-+               pitches[0],
-+               pitches[1],
-+               pitches[2],
-+               pitches[3],
-+               offsets[0],
-+               offsets[1],
-+               offsets[2],
-+               offsets[3],
-+               (long long)modifiers[0],
-+               (long long)modifiers[1],
-+               (long long)modifiers[2],
-+               (long long)modifiers[3]
-+               );
-+#endif
-+
-+        if (drmModeAddFB2WithModifiers(de->drm_fd,
-+                                         av_frame_cropped_width(frame),
-+                                         av_frame_cropped_height(frame),
-+                                         desc->layers[0].format, bo_handles,
-+                                         pitches, offsets, modifiers,
-+                                         &da->fb_handle, DRM_MODE_FB_MODIFIERS /** 0 if no mods */) != 0) {
-+            av_log(s, AV_LOG_WARNING, "drmModeAddFB2WithModifiers failed: %s\n", ERRSTR);
-+            return -1;
-+        }
-+    }
-+
-+    /* Source rectangle is passed in 16.16 fixed point, hence the << 16 */
-+    ret = drmModeSetPlane(de->drm_fd, de->setup.planeId, de->setup.crtcId,
-+                              da->fb_handle, 0,
-+                de->setup.compose.x, de->setup.compose.y,
-+                de->setup.compose.width,
-+                de->setup.compose.height,
-+                0, 0,
-+                av_frame_cropped_width(frame) << 16,
-+                av_frame_cropped_height(frame) << 16);
-+
-+    if (ret != 0) {
-+        av_log(s, AV_LOG_WARNING, "drmModeSetPlane failed: %s\n", ERRSTR);
-+    }
-+
-+    /* Advance to the next aux slot, wrapping at AUX_SIZE */
-+    de->ano = de->ano + 1 >= AUX_SIZE ? 0 : de->ano + 1;
-+
-+    return ret;
-+}
-+
-+/*
-+ * Wait on sem, retrying when interrupted by a signal.
-+ * With nowait set the semaphore is only polled (sem_trywait).
-+ * Returns 0 once the semaphore was taken, otherwise the negated errno.
-+ */
-+static int do_sem_wait(sem_t * const sem, const int nowait)
-+{
-+    for (;;) {
-+        const int rc = nowait ? sem_trywait(sem) : sem_wait(sem);
-+
-+        if (!rc)
-+            return 0;
-+        if (errno != EINTR)
-+            return -errno;
-+    }
-+}
-+
-+/*
-+ * Display thread body. Signals readiness on q_sem_out, then for every post
-+ * of q_sem_in takes the frame from q_next, re-arms q_sem_out so the writer
-+ * can queue another one, and displays it. Exits once q_terminate is seen
-+ * and releases all aux slots plus any frame still queued.
-+ */
-+static void * display_thread(void * v)
-+{
-+    AVFormatContext * const s = v;
-+    drm_display_env_t * const de = s->priv_data;
-+    int i;
-+
-+#if TRACE_ALL
-+    av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
-+#endif
-+
-+    /* Initial credit: lets the writer queue the first frame */
-+    sem_post(&de->q_sem_out);
-+
-+    for (;;) {
-+        AVFrame * frame;
-+
-+        do_sem_wait(&de->q_sem_in, 0);
-+
-+        if (de->q_terminate)
-+            break;
-+
-+        /* Empty the hand-over slot before signalling that it is free again */
-+        frame = de->q_next;
-+        de->q_next = NULL;
-+        sem_post(&de->q_sem_out);
-+
-+        do_display(s, de, frame);
-+    }
-+
-+#if TRACE_ALL
-+    av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__);
-+#endif
-+
-+    for (i = 0; i != AUX_SIZE; ++i)
-+        da_uninit(de, de->aux + i);
-+
-+    av_frame_free(&de->q_next);
-+
-+    return NULL;
-+}
-+
-+/*
-+ * write_packet callback: the packet payload is a wrapped AVFrame.
-+ * DRM-PRIME frames are referenced directly and VAAPI frames are mapped to
-+ * DRM-PRIME; any other format is rejected with AVERROR(EINVAL). Corrupt
-+ * frames are discarded. The resulting frame is handed to the display thread;
-+ * unless show_all is set the hand-over does not block, and the frame is
-+ * dropped if the display thread has not yet taken the previous one.
-+ * Returns 0, AVERROR(ENOMEM) if a frame cannot be allocated, the
-+ * av_frame_ref() error, or AVERROR(EINVAL).
-+ */
-+static int drm_vout_write_packet(AVFormatContext *s, AVPacket *pkt)
-+{
-+    const AVFrame * const src_frame = (AVFrame *)pkt->data;
-+    AVFrame * frame;
-+    drm_display_env_t * const de = s->priv_data;
-+    int ret;
-+
-+#if TRACE_ALL
-+    av_log(s, AV_LOG_DEBUG, "%s\n", __func__);
-+#endif
-+
-+    if ((src_frame->flags & AV_FRAME_FLAG_CORRUPT) != 0) {
-+        av_log(s, AV_LOG_WARNING, "Discard corrupt frame: fmt=%d, ts=%" PRId64 "\n", src_frame->format, src_frame->pts);
-+        return 0;
-+    }
-+
-+    if (src_frame->format == AV_PIX_FMT_DRM_PRIME) {
-+        frame = av_frame_alloc();
-+        if (!frame)
-+            return AVERROR(ENOMEM);
-+        /* A failed ref would leave an empty frame for the display thread to dereference */
-+        if ((ret = av_frame_ref(frame, src_frame)) < 0) {
-+            av_frame_free(&frame);
-+            return ret;
-+        }
-+    }
-+    else if (src_frame->format == AV_PIX_FMT_VAAPI) {
-+        frame = av_frame_alloc();
-+        if (!frame)
-+            return AVERROR(ENOMEM);
-+        frame->format = AV_PIX_FMT_DRM_PRIME;
-+        if (av_hwframe_map(frame, src_frame, 0) != 0)
-+        {
-+            av_log(s, AV_LOG_WARNING, "Failed to map frame (format=%d) to DRM_PRiME\n", src_frame->format);
-+            av_frame_free(&frame);
-+            return AVERROR(EINVAL);
-+        }
-+    }
-+    else {
-+        av_log(s, AV_LOG_WARNING, "Frame (format=%d) not DRM_PRiME\n", src_frame->format);
-+        return AVERROR(EINVAL);
-+    }
-+
-+    /* Non-blocking unless show_all: a busy display thread means this frame is dropped */
-+    ret = do_sem_wait(&de->q_sem_out, !de->show_all);
-+    if (ret) {
-+        av_frame_free(&frame);
-+    }
-+    else {
-+        de->q_next = frame;
-+        sem_post(&de->q_sem_in);
-+    }
-+
-+    return 0;
-+}
-+
-+/*
-+ * write_uncoded_frame callback: both the capability query and a real write
-+ * report success without touching the frame.
-+ */
-+static int drm_vout_write_frame(AVFormatContext *s, int stream_index, AVFrame **ppframe,
-+                          unsigned flags)
-+{
-+#if TRACE_ALL
-+    av_log(s, AV_LOG_DEBUG, "%s: idx=%d, flags=%#x\n", __func__, stream_index, flags);
-+#endif
-+
-+    /* drm_vout_write_header() should have accepted only supported formats */
-+    return 0;
-+}
-+
-+/*
-+ * control_message callback: a window repaint request is acknowledged,
-+ * every other message type is reported as unsupported (ENOSYS).
-+ */
-+static int drm_vout_control_message(AVFormatContext *s, int type, void *data, size_t data_size)
-+{
-+#if TRACE_ALL
-+    av_log(s, AV_LOG_DEBUG, "%s: %d\n", __func__, type);
-+#endif
-+    if (type == AV_APP_TO_DEV_WINDOW_REPAINT)
-+        return 0;
-+
-+    return AVERROR(ENOSYS);
-+}
-+
-+/*
-+ * Resolve the connector/CRTC pair to display on and record it in *s.
-+ * If s->conId is 0 the first connector that has an active CRTC is chosen.
-+ * Fills s->conId, s->crtcId, s->crtcIdx and s->compose (the CRTC geometry);
-+ * if pConId is non-NULL it receives the connector id.
-+ * Returns 0 on success, -1 on any failure. Every libdrm object obtained here
-+ * is released before returning.
-+ */
-+static int find_crtc(struct AVFormatContext * const avctx, int drmfd, struct drm_setup *s, uint32_t * const pConId)
-+{
-+   int ret = -1;
-+   int i;
-+   drmModeRes *res = drmModeGetResources(drmfd);
-+   drmModeConnector *c;
-+
-+   if(!res)
-+   {
-+      av_log(avctx, AV_LOG_ERROR, "drmModeGetResources failed: %s\n", ERRSTR);
-+      return -1;
-+   }
-+
-+   if (res->count_crtcs <= 0)
-+   {
-+      av_log(avctx, AV_LOG_ERROR, "drm: no crts\n");
-+      goto fail_res;
-+   }
-+
-+   if (!s->conId) {
-+      av_log(avctx, AV_LOG_INFO,
-+         "No connector ID specified.  Choosing default from list:\n");
-+
-+      for (i = 0; i < res->count_connectors; i++) {
-+         drmModeConnector *con =
-+            drmModeGetConnector(drmfd, res->connectors[i]);
-+         drmModeEncoder *enc = NULL;
-+         drmModeCrtc *crtc = NULL;
-+
-+         /* A connector that cannot be queried is simply skipped */
-+         if (!con)
-+            continue;
-+
-+         if (con->encoder_id) {
-+            enc = drmModeGetEncoder(drmfd, con->encoder_id);
-+            if (enc && enc->crtc_id) {
-+               crtc = drmModeGetCrtc(drmfd, enc->crtc_id);
-+            }
-+         }
-+
-+         if (!s->conId && crtc) {
-+            s->conId = con->connector_id;
-+            s->crtcId = crtc->crtc_id;
-+         }
-+
-+         av_log(avctx, AV_LOG_DEBUG, "Connector %d (crtc %d): type %d, %dx%d%s\n",
-+                con->connector_id,
-+                crtc ? crtc->crtc_id : 0,
-+                con->connector_type,
-+                crtc ? crtc->width : 0,
-+                crtc ? crtc->height : 0,
-+                (s->conId == (int)con->connector_id ?
-+            " (chosen)" : ""));
-+
-+         /* The ids have been copied out, so the per-connector objects can go */
-+         if (crtc)
-+            drmModeFreeCrtc(crtc);
-+         if (enc)
-+            drmModeFreeEncoder(enc);
-+         drmModeFreeConnector(con);
-+      }
-+
-+      if (!s->conId) {
-+         av_log(avctx, AV_LOG_ERROR,
-+            "No suitable enabled connector found.\n");
-+         goto fail_res;
-+      }
-+   }
-+
-+   s->crtcIdx = -1;
-+
-+   for (i = 0; i < res->count_crtcs; ++i) {
-+      if (s->crtcId == res->crtcs[i]) {
-+         s->crtcIdx = i;
-+         break;
-+      }
-+   }
-+
-+   if (s->crtcIdx == -1)
-+   {
-+       av_log(avctx, AV_LOG_WARNING, "drm: CRTC %u not found\n", s->crtcId);
-+       goto fail_res;
-+   }
-+
-+   if (res->count_connectors <= 0)
-+   {
-+       av_log(avctx, AV_LOG_WARNING, "drm: no connectors\n");
-+       goto fail_res;
-+   }
-+
-+   c = drmModeGetConnector(drmfd, s->conId);
-+   if (!c)
-+   {
-+       av_log(avctx, AV_LOG_WARNING, "drmModeGetConnector failed: %s\n", ERRSTR);
-+       goto fail_res;
-+   }
-+
-+   if (!c->count_modes)
-+   {
-+       av_log(avctx, AV_LOG_WARNING, "connector supports no mode\n");
-+       goto fail_conn;
-+   }
-+
-+   {
-+      drmModeCrtc *crtc = drmModeGetCrtc(drmfd, s->crtcId);
-+      if (!crtc)
-+      {
-+          av_log(avctx, AV_LOG_WARNING, "drmModeGetCrtc failed: %s\n", ERRSTR);
-+          goto fail_conn;
-+      }
-+      s->compose.x = crtc->x;
-+      s->compose.y = crtc->y;
-+      s->compose.width = crtc->width;
-+      s->compose.height = crtc->height;
-+      drmModeFreeCrtc(crtc);
-+   }
-+
-+   if (pConId)
-+      *pConId = c->connector_id;
-+   ret = 0;
-+
-+fail_conn:
-+   drmModeFreeConnector(c);
-+
-+fail_res:
-+   drmModeFreeResources(res);
-+
-+   return ret;
-+}
-+
-+// deinit is called if init fails so no need to clean up explicitly here
-+/*
-+ * init callback: open the DRM device, pick the connector/CRTC and start the
-+ * display thread. Returns 0 on success or a negative AVERROR code.
-+ */
-+static int drm_vout_init(struct AVFormatContext * s)
-+{
-+    drm_display_env_t * const de = s->priv_data;
-+    int rv;
-+    const char * drm_module = DRM_MODULE;
-+
-+    av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
-+
-+    de->drm_fd = -1;
-+    de->con_id = 0;
-+    de->setup = (struct drm_setup){0};
-+    de->q_terminate = 0;
-+
-+    if ((de->drm_fd = drmOpen(drm_module, NULL)) < 0)
-+    {
-+        rv = AVERROR(errno);
-+        av_log(s, AV_LOG_ERROR, "Failed to drmOpen %s: %s\n", drm_module, av_err2str(rv));
-+        return rv;
-+    }
-+
-+    if (find_crtc(s, de->drm_fd, &de->setup, &de->con_id) != 0)
-+    {
-+        av_log(s, AV_LOG_ERROR, "failed to find valid mode\n");
-+        rv = AVERROR(EINVAL);
-+        goto fail_close;
-+    }
-+
-+    sem_init(&de->q_sem_in, 0, 0);
-+    sem_init(&de->q_sem_out, 0, 0);
-+    /* pthread_create() reports failure through its return value, not errno */
-+    if ((rv = pthread_create(&de->q_thread, NULL, display_thread, s)) != 0) {
-+        rv = AVERROR(rv);
-+        av_log(s, AV_LOG_ERROR, "Failed to creatye display thread: %s\n", av_err2str(rv));
-+        goto fail_close;
-+    }
-+
-+    av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__);
-+
-+    return 0;
-+
-+fail_close:
-+    close(de->drm_fd);
-+    de->drm_fd = -1;
-+    av_log(s, AV_LOG_DEBUG, ">>> %s: FAIL\n", __func__);
-+
-+    return rv;
-+}
-+
-+/*
-+ * deinit callback: stop and join the display thread, destroy the
-+ * semaphores, release all aux slots and any queued frame, close the device.
-+ * NOTE(review): this unconditionally posts/joins/destroys; if it also runs
-+ * after drm_vout_init() failed before the semaphores and thread were
-+ * created, those calls act on uninitialised objects - confirm and guard.
-+ */
-+static void drm_vout_deinit(struct AVFormatContext * s)
-+{
-+    drm_display_env_t * const de = s->priv_data;
-+
-+    av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
-+
-+    /* Set the flag first, then wake the thread so that it observes it */
-+    de->q_terminate = 1;
-+    sem_post(&de->q_sem_in);
-+    pthread_join(de->q_thread, NULL);
-+    sem_destroy(&de->q_sem_in);
-+    sem_destroy(&de->q_sem_out);
-+
-+    for (unsigned int i = 0; i != AUX_SIZE; ++i)
-+        da_uninit(de, de->aux + i);
-+
-+    av_frame_free(&de->q_next);
-+
-+    if (de->drm_fd >= 0) {
-+        close(de->drm_fd);
-+        de->drm_fd = -1;
-+    }
-+
-+    av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__);
-+}
-+
-+
-+#define OFFSET(x) offsetof(drm_display_env_t, x)
-+/* User options; show_all defaults to 0 (frames may be dropped) */
-+static const AVOption options[] = {
-+    { "show_all", "show all frames", OFFSET(show_all), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM },
-+    { NULL }
-+};
-+
-+/* AVClass that exposes the options above for this device */
-+static const AVClass drm_vout_class = {
-+    .class_name = "drm vid outdev",
-+    .item_name  = av_default_item_name,
-+    .option     = options,
-+    .version    = LIBAVUTIL_VERSION_INT,
-+    .category   = AV_CLASS_CATEGORY_DEVICE_VIDEO_OUTPUT,
-+};
-+
-+/* Registration of the "vout_drm" output device and its callbacks */
-+AVOutputFormat ff_vout_drm_muxer = {
-+    .name           = "vout_drm",
-+    .long_name      = NULL_IF_CONFIG_SMALL("Drm video output device"),
-+    .priv_data_size = sizeof(drm_display_env_t),
-+    .audio_codec    = AV_CODEC_ID_NONE,
-+    .video_codec    = AV_CODEC_ID_WRAPPED_AVFRAME,
-+    .write_header   = drm_vout_write_header,
-+    .write_packet   = drm_vout_write_packet,
-+    .write_uncoded_frame = drm_vout_write_frame,
-+    .write_trailer  = drm_vout_write_trailer,
-+    .control_message = drm_vout_control_message,
-+    .flags          = AVFMT_NOFILE | AVFMT_VARIABLE_FPS | AVFMT_NOTIMESTAMPS,
-+    .priv_class     = &drm_vout_class,
-+    .init           = drm_vout_init,
-+    .deinit         = drm_vout_deinit,
-+};
-+
---- /dev/null
-+++ b/libavdevice/egl_vout.c
-@@ -0,0 +1,816 @@
-+/*
-+ * Copyright (c) 2020 John Cox for Raspberry Pi Trading
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+
-+// *** This module is a work in progress and its utility is strictly
-+//     limited to testing.
-+//     Amongst other issues it doesn't wait for the pic to be displayed before
-+//     returning the buffer so flikering does occur.
-+
-+#include <epoxy/gl.h>
-+#include <epoxy/egl.h>
-+
-+#include "libavutil/opt.h"
-+#include "libavutil/avassert.h"
-+#include "libavutil/pixdesc.h"
-+#include "libavutil/imgutils.h"
-+#include "libavutil/hwcontext_drm.h"
-+#include "libavformat/internal.h"
-+#include "avdevice.h"
-+
-+#include "pthread.h"
-+#include <semaphore.h>
-+#include <stdatomic.h>
-+#include <unistd.h>
-+
-+#include <X11/Xlib.h>
-+#include <X11/Xutil.h>
-+
-+#include "libavutil/rpi_sand_fns.h"
-+
-+#define TRACE_ALL 0
-+
-+/* X11 / EGL handles and output geometry for the EGL video output device */
-+struct egl_setup {
-+   int conId;
-+
-+   Display *dpy;        /* X11 display connection */
-+   EGLDisplay egl_dpy;  /* EGL display */
-+   EGLContext ctx;      /* presumably the context produced by make_window() - confirm */
-+   EGLSurface surf;     /* presumably the window surface produced by make_window() - confirm */
-+   Window win;          /* X11 window */
-+
-+   uint32_t crtcId;
-+   int crtcIdx;
-+   uint32_t planeId;
-+   struct {
-+       int x, y, width, height;
-+   } compose;
-+};
-+
-+/* Cache slot tying a frame buffer fd to the GL texture created for it */
-+typedef struct egl_aux_s {
-+    int fd;          /* fd of the frame's first DRM object; -1 marks a free slot */
-+    GLuint texture;  /* 0 until a texture has been created for this slot */
-+
-+} egl_aux_t;
-+
-+/* Private data of the EGL video output device */
-+typedef struct egl_display_env_s
-+{
-+    AVClass *class;
-+
-+    struct egl_setup setup;
-+    enum AVPixelFormat avfmt;
-+
-+    int show_all;
-+    int window_width, window_height;  /* 0 selects the 1280x720 default in make_window() */
-+    int window_x, window_y;           /* requested window position */
-+    int fullscreen;                   /* non-zero: borderless window covering the whole screen */
-+
-+    egl_aux_t aux[32];                /* fd -> texture cache, searched linearly */
-+
-+    pthread_t q_thread;
-+    pthread_mutex_t q_lock;
-+    sem_t display_start_sem;
-+    sem_t q_sem;
-+    int q_terminate;
-+    AVFrame * q_this;
-+    AVFrame * q_next;
-+
-+} egl_display_env_t;
-+
-+
-+/**
-+ * Remove window border/decorations.
-+ *
-+ * Sets the _MOTIF_WM_HINTS property on window w with the decorations
-+ * field cleared. Does nothing if the atom does not already exist on the
-+ * server.
-+ */
-+static void
-+no_border( Display *dpy, Window w)
-+{
-+   static const unsigned MWM_HINTS_DECORATIONS = (1 << 1);
-+   static const int PROP_MOTIF_WM_HINTS_ELEMENTS = 5;
-+
-+   typedef struct
-+   {
-+      unsigned long       flags;
-+      unsigned long       functions;
-+      unsigned long       decorations;
-+      long                inputMode;
-+      unsigned long       status;
-+   } PropMotifWmHints;
-+
-+   /* All five elements are sent to the server, so none may be left uninitialised */
-+   PropMotifWmHints motif_hints = {0};
-+   Atom prop, proptype;
-+   unsigned long flags = 0;
-+
-+   /* setup the property */
-+   motif_hints.flags = MWM_HINTS_DECORATIONS;
-+   motif_hints.decorations = flags;
-+
-+   /* get the atom for the property */
-+   prop = XInternAtom( dpy, "_MOTIF_WM_HINTS", True );
-+   if (!prop) {
-+      /* something went wrong! */
-+      return;
-+   }
-+
-+   /* not sure this is correct, seems to work, XA_WM_HINTS didn't work */
-+   proptype = prop;
-+
-+   XChangeProperty( dpy, w,                         /* display, window */
-+                    prop, proptype,                 /* property, type */
-+                    32,                             /* format: 32-bit datums */
-+                    PropModeReplace,                /* mode */
-+                    (unsigned char *) &motif_hints, /* data */
-+                    PROP_MOTIF_WM_HINTS_ELEMENTS    /* nelements */
-+                  );
-+}
-+
-+
-+/*
-+ * Create an RGB, double-buffered window.
-+ * Return the window and context handles.
-+ *
-+ * Geometry comes from de (window_x/y, window_width/height with a 1280x720
-+ * default) unless de->fullscreen is set, in which case the window covers the
-+ * default screen and its decorations are removed.
-+ * On success returns 0, stores the handles in *winRet, *ctxRet and *surfRet
-+ * and leaves the context current on the new surface. On failure returns -1,
-+ * leaves the outputs untouched and destroys the window, context and surface
-+ * created so far.
-+ */
-+static int
-+make_window(struct AVFormatContext * const s,
-+            egl_display_env_t * const de,
-+            Display *dpy, EGLDisplay egl_dpy, const char *name,
-+            Window *winRet, EGLContext *ctxRet, EGLSurface *surfRet)
-+{
-+   const int scrnum = DefaultScreen( dpy );
-+   XSetWindowAttributes attr;
-+   unsigned long mask;
-+   Window root = RootWindow( dpy, scrnum );
-+   Window win;
-+   EGLContext ctx;
-+   EGLSurface surf;
-+   const int fullscreen = de->fullscreen;
-+   EGLConfig config;
-+   int x = de->window_x;
-+   int y = de->window_y;
-+   int width = de->window_width ? de->window_width : 1280;
-+   int height = de->window_height ? de->window_height : 720;
-+
-+
-+   if (fullscreen) {
-+      x = 0; y = 0;
-+      width = DisplayWidth(dpy, scrnum);
-+      height = DisplayHeight(dpy, scrnum);
-+   }
-+
-+   {
-+      EGLint num_configs = 0;
-+      static const EGLint attribs[] = {
-+         EGL_RED_SIZE, 1,
-+         EGL_GREEN_SIZE, 1,
-+         EGL_BLUE_SIZE, 1,
-+         EGL_RENDERABLE_TYPE, EGL_OPENGL_ES2_BIT,
-+         EGL_NONE
-+      };
-+
-+      /* eglChooseConfig() can succeed yet match nothing, leaving config unset */
-+      if (!eglChooseConfig(egl_dpy, attribs, &config, 1, &num_configs) ||
-+          num_configs < 1) {
-+         av_log(s, AV_LOG_ERROR, "Error: couldn't get an EGL visual config\n");
-+         return -1;
-+      }
-+   }
-+
-+   {
-+      EGLint vid;
-+      if (!eglGetConfigAttrib(egl_dpy, config, EGL_NATIVE_VISUAL_ID, &vid)) {
-+         av_log(s, AV_LOG_ERROR, "Error: eglGetConfigAttrib() failed\n");
-+         return -1;
-+      }
-+
-+      {
-+         XVisualInfo visTemplate = {
-+            .visualid = vid,
-+         };
-+         int num_visuals;
-+         XVisualInfo *visinfo = XGetVisualInfo(dpy, VisualIDMask,
-+                                               &visTemplate, &num_visuals);
-+
-+         /* XGetVisualInfo() returns NULL when no visual matches */
-+         if (!visinfo) {
-+            av_log(s, AV_LOG_ERROR, "Error: couldn't get X visual\n");
-+            return -1;
-+         }
-+
-+         /* window attributes */
-+         attr.background_pixel = 0;
-+         attr.border_pixel = 0;
-+         attr.colormap = XCreateColormap( dpy, root, visinfo->visual, AllocNone);
-+         attr.event_mask = StructureNotifyMask | ExposureMask | KeyPressMask;
-+         /* XXX this is a bad way to get a borderless window! */
-+         mask = CWBackPixel | CWBorderPixel | CWColormap | CWEventMask;
-+
-+         win = XCreateWindow( dpy, root, x, y, width, height,
-+                              0, visinfo->depth, InputOutput,
-+                              visinfo->visual, mask, &attr );
-+         XFree(visinfo);
-+      }
-+   }
-+
-+   if (fullscreen)
-+      no_border(dpy, win);
-+
-+   /* set hints and properties */
-+   {
-+      XSizeHints sizehints;
-+      sizehints.x = x;
-+      sizehints.y = y;
-+      sizehints.width  = width;
-+      sizehints.height = height;
-+      sizehints.flags = USSize | USPosition;
-+      XSetNormalHints(dpy, win, &sizehints);
-+      XSetStandardProperties(dpy, win, name, name,
-+                              None, (char **)NULL, 0, &sizehints);
-+   }
-+
-+   eglBindAPI(EGL_OPENGL_ES_API);
-+
-+   {
-+      static const EGLint ctx_attribs[] = {
-+         EGL_CONTEXT_CLIENT_VERSION, 2,
-+         EGL_NONE
-+      };
-+      ctx = eglCreateContext(egl_dpy, config, EGL_NO_CONTEXT, ctx_attribs );
-+      if (ctx == EGL_NO_CONTEXT) {
-+         av_log(s, AV_LOG_ERROR, "Error: eglCreateContext failed\n");
-+         goto fail_win;
-+      }
-+   }
-+
-+
-+   XMapWindow(dpy, win);
-+
-+   surf = eglCreateWindowSurface(egl_dpy, config, (EGLNativeWindowType)win, NULL);
-+   if (surf == EGL_NO_SURFACE) {
-+      av_log(s, AV_LOG_ERROR, "Error: eglCreateWindowSurface failed\n");
-+      goto fail_ctx;
-+   }
-+
-+   if (!eglMakeCurrent(egl_dpy, surf, surf, ctx)) {
-+      /* Name the call that actually failed */
-+      av_log(s, AV_LOG_ERROR, "Error: eglMakeCurrent failed\n");
-+      goto fail_surf;
-+   }
-+
-+   *winRet = win;
-+   *ctxRet = ctx;
-+   *surfRet = surf;
-+
-+   return 0;
-+
-+   /* Nothing has been handed to the caller yet, so undo in reverse order */
-+fail_surf:
-+   eglDestroySurface(egl_dpy, surf);
-+fail_ctx:
-+   eglDestroyContext(egl_dpy, ctx);
-+fail_win:
-+   XDestroyWindow(dpy, win);
-+   return -1;
-+}
-+
-+/*
-+ * Compile a shader of type 'target' from 'source'.
-+ * Returns the shader name, or 0 on failure; on failure the compiler log is
-+ * written to the av_log of avctx and the shader object is deleted.
-+ */
-+static GLint
-+compile_shader(struct AVFormatContext * const avctx, GLenum target, const char *source)
-+{
-+   GLuint s = glCreateShader(target);
-+
-+   if (s == 0) {
-+      av_log(avctx, AV_LOG_ERROR, "Failed to create shader\n");
-+      return 0;
-+   }
-+
-+   glShaderSource(s, 1, (const GLchar **) &source, NULL);
-+   glCompileShader(s);
-+
-+   {
-+      GLint ok;
-+      glGetShaderiv(s, GL_COMPILE_STATUS, &ok);
-+
-+      if (!ok) {
-+         GLchar *info = NULL;
-+         GLint size = 0;
-+
-+         glGetShaderiv(s, GL_INFO_LOG_LENGTH, &size);
-+         /* Fetch the log only if there is one and the buffer could be allocated */
-+         if (size > 0 && (info = malloc(size)) != NULL)
-+            glGetShaderInfoLog(s, size, NULL, info);
-+
-+         av_log(avctx, AV_LOG_ERROR, "Failed to compile shader: %ssource:\n%s\n", info ? info : "", source);
-+
-+         /* Neither the log buffer nor the failed shader is of further use */
-+         free(info);
-+         glDeleteShader(s);
-+         return 0;
-+      }
-+   }
-+
-+   return s;
-+}
-+
-+/*
-+ * Link vertex shader vs and fragment shader fs into a new program.
-+ * Returns the program name, or 0 on failure; on failure the linker log is
-+ * written to the av_log of s and the program object is deleted.
-+ */
-+static GLuint link_program(struct AVFormatContext * const s, GLint vs, GLint fs)
-+{
-+   GLuint prog = glCreateProgram();
-+
-+   if (prog == 0) {
-+      av_log(s, AV_LOG_ERROR, "Failed to create program\n");
-+      return 0;
-+   }
-+
-+   glAttachShader(prog, vs);
-+   glAttachShader(prog, fs);
-+   glLinkProgram(prog);
-+
-+   {
-+      GLint ok;
-+      glGetProgramiv(prog, GL_LINK_STATUS, &ok);
-+      if (!ok) {
-+         /* Some drivers return a size of 1 for an empty log.  This is the size
-+          * of a log that contains only a terminating NUL character.
-+          */
-+         GLint size = 0;
-+         GLchar *info = NULL;
-+         glGetProgramiv(prog, GL_INFO_LOG_LENGTH, &size);
-+         if (size > 1 && (info = malloc(size)) != NULL)
-+            glGetProgramInfoLog(prog, size, NULL, info);
-+
-+         av_log(s, AV_LOG_ERROR, "Failed to link: %s\n",
-+                 (info != NULL) ? info : "<empty log>");
-+
-+         /* Neither the log buffer nor the unlinked program is of further use */
-+         free(info);
-+         glDeleteProgram(prog);
-+         return 0;
-+      }
-+   }
-+
-+   return prog;
-+}
-+
-+/*
-+ * Build and activate the GL program that draws the video quad: compile the
-+ * two shaders below, link them, select the program and enable a static
-+ * four-vertex position array. Returns 0 on success, -1 if compiling or
-+ * linking fails.
-+ */
-+static int
-+gl_setup(struct AVFormatContext * const s)
-+{
-+   /* Vertex shader: passes pos through and maps it from [-1,1] to [0,1] texcoords, flipping y */
-+   const char *vs =
-+      "attribute vec4 pos;\n"
-+      "varying vec2 texcoord;\n"
-+      "\n"
-+      "void main() {\n"
-+      "  gl_Position = pos;\n"
-+      "  texcoord.x = (pos.x + 1.0) / 2.0;\n"
-+      "  texcoord.y = (-pos.y + 1.0) / 2.0;\n"
-+      "}\n";
-+   /* Fragment shader: samples an external (EGLImage) texture */
-+   const char *fs =
-+      "#extension GL_OES_EGL_image_external : enable\n"
-+      "precision mediump float;\n"
-+      "uniform samplerExternalOES s;\n"
-+      "varying vec2 texcoord;\n"
-+      "void main() {\n"
-+      "  gl_FragColor = texture2D(s, texcoord);\n"
-+      "}\n";
-+
-+   GLuint vs_s;
-+   GLuint fs_s;
-+   GLuint prog;
-+
-+   if (!(vs_s = compile_shader(s, GL_VERTEX_SHADER, vs)) ||
-+       !(fs_s = compile_shader(s, GL_FRAGMENT_SHADER, fs)) ||
-+       !(prog = link_program(s, vs_s, fs_s)))
-+      return -1;
-+
-+   glUseProgram(prog);
-+
-+   {
-+      /* Corners of the full clip-space square, two floats per vertex */
-+      static const float verts[] = {
-+         -1, -1,
-+         1, -1,
-+         1, 1,
-+         -1, 1,
-+      };
-+      /* NOTE(review): assumes "pos" was assigned attribute location 0 - no */
-+      /* glBindAttribLocation/glGetAttribLocation call is made here; confirm */
-+      glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, 0, verts);
-+   }
-+
-+   glEnableVertexAttribArray(0);
-+   return 0;
-+}
-+
-+static int egl_vout_write_trailer(AVFormatContext *s)
-+{
-+#if TRACE_ALL
-+    av_log(s, AV_LOG_INFO, "%s\n", __func__);
-+#endif
-+
-+    return 0;
-+}
-+
-+static int egl_vout_write_header(AVFormatContext *s)
-+{
-+    const AVCodecParameters * const par = s->streams[0]->codecpar;
-+
-+#if TRACE_ALL
-+    av_log(s, AV_LOG_INFO, "%s\n", __func__);
-+#endif
-+    if (   s->nb_streams > 1
-+        || par->codec_type != AVMEDIA_TYPE_VIDEO
-+        || par->codec_id   != AV_CODEC_ID_WRAPPED_AVFRAME) {
-+        av_log(s, AV_LOG_ERROR, "Only supports one wrapped avframe stream\n");
-+        return AVERROR(EINVAL);
-+    }
-+
-+    return 0;
-+}
-+
-+
-+static int do_display(AVFormatContext * const s, egl_display_env_t * const de, AVFrame * const frame)
-+{
-+    const AVDRMFrameDescriptor *desc = (AVDRMFrameDescriptor*)frame->data[0];
-+    egl_aux_t * da = NULL;
-+    unsigned int i;
-+
-+#if TRACE_ALL
-+    av_log(s, AV_LOG_INFO, "<<< %s\n", __func__);
-+#endif
-+
-+    for (i = 0; i != 32; ++i) {
-+        if (de->aux[i].fd == -1 || de->aux[i].fd == desc->objects[0].fd) {
-+            da = de->aux + i;
-+            break;
-+        }
-+    }
-+
-+    if (da == NULL) {
-+        av_log(s, AV_LOG_INFO, "%s: Out of handles\n", __func__);
-+        return AVERROR(EINVAL);
-+    }
-+
-+    if (da->texture == 0) {
-+        EGLint attribs[50];
-+        EGLint * a = attribs;
-+        int i, j;
-+        static const EGLint anames[] = {
-+           EGL_DMA_BUF_PLANE0_FD_EXT,
-+           EGL_DMA_BUF_PLANE0_OFFSET_EXT,
-+           EGL_DMA_BUF_PLANE0_PITCH_EXT,
-+           EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT,
-+           EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT,
-+           EGL_DMA_BUF_PLANE1_FD_EXT,
-+           EGL_DMA_BUF_PLANE1_OFFSET_EXT,
-+           EGL_DMA_BUF_PLANE1_PITCH_EXT,
-+           EGL_DMA_BUF_PLANE1_MODIFIER_LO_EXT,
-+           EGL_DMA_BUF_PLANE1_MODIFIER_HI_EXT,
-+           EGL_DMA_BUF_PLANE2_FD_EXT,
-+           EGL_DMA_BUF_PLANE2_OFFSET_EXT,
-+           EGL_DMA_BUF_PLANE2_PITCH_EXT,
-+           EGL_DMA_BUF_PLANE2_MODIFIER_LO_EXT,
-+           EGL_DMA_BUF_PLANE2_MODIFIER_HI_EXT,
-+        };
-+        const EGLint * b = anames;
-+
-+        *a++ = EGL_WIDTH;
-+        *a++ = av_frame_cropped_width(frame);
-+        *a++ = EGL_HEIGHT;
-+        *a++ = av_frame_cropped_height(frame);
-+        *a++ = EGL_LINUX_DRM_FOURCC_EXT;
-+        *a++ = desc->layers[0].format;
-+
-+        for (i = 0; i < desc->nb_layers; ++i) {
-+            for (j = 0; j < desc->layers[i].nb_planes; ++j) {
-+                const AVDRMPlaneDescriptor * const p = desc->layers[i].planes + j;
-+                const AVDRMObjectDescriptor * const obj = desc->objects + p->object_index;
-+                *a++ = *b++;
-+                *a++ = obj->fd;
-+                *a++ = *b++;
-+                *a++ = p->offset;
-+                *a++ = *b++;
-+                *a++ = p->pitch;
-+                if (obj->format_modifier == 0) {
-+                   b += 2;
-+                }
-+                else {
-+                   *a++ = *b++;
-+                   *a++ = (EGLint)(obj->format_modifier & 0xFFFFFFFF);
-+                   *a++ = *b++;
-+                   *a++ = (EGLint)(obj->format_modifier >> 32);
-+                }
-+            }
-+        }
-+
-+        *a = EGL_NONE;
-+
-+#if TRACE_ALL
-+        for (a = attribs, i = 0; *a != EGL_NONE; a += 2, ++i) {
-+           av_log(s, AV_LOG_INFO, "[%2d] %4x: %d\n", i, a[0], a[1]);
-+        }
-+#endif
-+        {
-+           const EGLImage image = eglCreateImageKHR(de->setup.egl_dpy,
-+                                              EGL_NO_CONTEXT,
-+                                              EGL_LINUX_DMA_BUF_EXT,
-+                                              NULL, attribs);
-+           if (!image) {
-+              av_log(s, AV_LOG_ERROR, "Failed to import fd %d\n", desc->objects[0].fd);
-+              return -1;
-+           }
-+
-+           glGenTextures(1, &da->texture);
-+           glBindTexture(GL_TEXTURE_EXTERNAL_OES, da->texture);
-+           glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
-+           glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
-+           glEGLImageTargetTexture2DOES(GL_TEXTURE_EXTERNAL_OES, image);
-+
-+           eglDestroyImageKHR(de->setup.egl_dpy, image);
-+        }
-+
-+        da->fd = desc->objects[0].fd;
-+
-+#if 0
-+        av_log(s, AV_LOG_INFO, "%dx%d, fmt: %x, boh=%d,%d,%d,%d, pitch=%d,%d,%d,%d,"
-+               " offset=%d,%d,%d,%d, mod=%llx,%llx,%llx,%llx\n",
-+               av_frame_cropped_width(frame),
-+               av_frame_cropped_height(frame),
-+               desc->layers[0].format,
-+               bo_plane_handles[0],
-+               bo_plane_handles[1],
-+               bo_plane_handles[2],
-+               bo_plane_handles[3],
-+               pitches[0],
-+               pitches[1],
-+               pitches[2],
-+               pitches[3],
-+               offsets[0],
-+               offsets[1],
-+               offsets[2],
-+               offsets[3],
-+               (long long)modifiers[0],
-+               (long long)modifiers[1],
-+               (long long)modifiers[2],
-+               (long long)modifiers[3]
-+               );
-+#endif
-+    }
-+
-+    glClearColor(0.5, 0.5, 0.5, 0.5);
-+    glClear(GL_COLOR_BUFFER_BIT);
-+
-+    glBindTexture(GL_TEXTURE_EXTERNAL_OES, da->texture);
-+    glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
-+    eglSwapBuffers(de->setup.egl_dpy, de->setup.surf);
-+
-+    glDeleteTextures(1, &da->texture);
-+    da->texture = 0;
-+    da->fd = -1;
-+
-+    return 0;
-+}
-+
-+static void * display_thread(void * v)
-+{
-+    AVFormatContext * const s = v;
-+    egl_display_env_t * const de = s->priv_data;
-+
-+#if TRACE_ALL
-+    av_log(s, AV_LOG_INFO, "<<< %s\n", __func__);
-+#endif
-+    {
-+       EGLint egl_major, egl_minor;
-+
-+       de->setup.dpy = XOpenDisplay(NULL);
-+       if (!de->setup.dpy) {
-+          av_log(s, AV_LOG_ERROR, "Couldn't open X display\n");
-+          goto fail;
-+       }
-+
-+       de->setup.egl_dpy = eglGetDisplay(de->setup.dpy);
-+       if (!de->setup.egl_dpy) {
-+          av_log(s, AV_LOG_ERROR, "eglGetDisplay() failed\n");
-+          goto fail;
-+       }
-+
-+       if (!eglInitialize(de->setup.egl_dpy, &egl_major, &egl_minor)) {
-+           av_log(s, AV_LOG_ERROR, "Error: eglInitialize() failed\n");
-+           goto fail;
-+       }
-+
-+       av_log(s, AV_LOG_INFO, "EGL version %d.%d\n", egl_major, egl_minor);
-+
-+       if (!epoxy_has_egl_extension(de->setup.egl_dpy, "EGL_KHR_image_base")) {
-+          av_log(s, AV_LOG_ERROR, "Missing EGL KHR image extension\n");
-+          goto fail;
-+       }
-+    }
-+
-+    if (!de->window_width || !de->window_height) {
-+       de->window_width = 1280;
-+       de->window_height = 720;
-+    }
-+    if (make_window(s, de, de->setup.dpy, de->setup.egl_dpy, "ffmpeg-vout",
-+                    &de->setup.win, &de->setup.ctx, &de->setup.surf)) {
-+       av_log(s, AV_LOG_ERROR, "%s: make_window failed\n", __func__);
-+       goto fail;
-+    }
-+
-+    if (gl_setup(s)) {
-+       av_log(s, AV_LOG_ERROR, "%s: gl_setup failed\n", __func__);
-+       goto fail;
-+    }
-+
-+#if TRACE_ALL
-+    av_log(s, AV_LOG_INFO, "--- %s: Start done\n", __func__);
-+#endif
-+    sem_post(&de->display_start_sem);
-+
-+    for (;;) {
-+        AVFrame * frame;
-+
-+        while (sem_wait(&de->q_sem) != 0) {
-+            av_assert0(errno == EINTR);
-+        }
-+
-+        if (de->q_terminate)
-+            break;
-+
-+        pthread_mutex_lock(&de->q_lock);
-+        frame = de->q_next;
-+        de->q_next = NULL;
-+        pthread_mutex_unlock(&de->q_lock);
-+
-+        do_display(s, de, frame);
-+
-+        av_frame_free(&de->q_this);
-+        de->q_this = frame;
-+    }
-+
-+#if TRACE_ALL
-+    av_log(s, AV_LOG_INFO, ">>> %s\n", __func__);
-+#endif
-+
-+    return NULL;
-+
-+fail:
-+#if TRACE_ALL
-+    av_log(s, AV_LOG_INFO, ">>> %s: FAIL\n", __func__);
-+#endif
-+    de->q_terminate = 1;
-+    sem_post(&de->display_start_sem);
-+
-+    return NULL;
-+}
-+
-+static int egl_vout_write_packet(AVFormatContext *s, AVPacket *pkt)
-+{
-+    const AVFrame * const src_frame = (AVFrame *)pkt->data;
-+    AVFrame * frame;
-+    egl_display_env_t * const de = s->priv_data;
-+
-+#if TRACE_ALL
-+    av_log(s, AV_LOG_INFO, "%s\n", __func__);
-+#endif
-+
-+    if (src_frame->format == AV_PIX_FMT_DRM_PRIME) {
-+        frame = av_frame_alloc();
-+        av_frame_ref(frame, src_frame);
-+    }
-+    else if (src_frame->format == AV_PIX_FMT_VAAPI) {
-+        frame = av_frame_alloc();
-+        frame->format = AV_PIX_FMT_DRM_PRIME;
-+        if (av_hwframe_map(frame, src_frame, 0) != 0)
-+        {
-+            av_log(s, AV_LOG_WARNING, "Failed to map frame (format=%d) to DRM_PRiME\n", src_frame->format);
-+            av_frame_free(&frame);
-+            return AVERROR(EINVAL);
-+        }
-+    }
-+    else {
-+        av_log(s, AV_LOG_WARNING, "Frame (format=%d) not DRM_PRiME\n", src_frame->format);
-+        return AVERROR(EINVAL);
-+    }
-+
-+    // Really hacky sync
-+    while (de->show_all && de->q_next) {
-+       usleep(3000);
-+    }
-+
-+    pthread_mutex_lock(&de->q_lock);
-+    {
-+        AVFrame * const t = de->q_next;
-+        de->q_next = frame;
-+        frame = t;
-+    }
-+    pthread_mutex_unlock(&de->q_lock);
-+
-+    if (frame == NULL)
-+        sem_post(&de->q_sem);
-+    else
-+        av_frame_free(&frame);
-+
-+    return 0;
-+}
-+
-+static int egl_vout_write_frame(AVFormatContext *s, int stream_index, AVFrame **ppframe,
-+                          unsigned flags)
-+{
-+#if TRACE_ALL
-+    av_log(s, AV_LOG_INFO, "%s: idx=%d, flags=%#x\n", __func__, stream_index, flags);
-+#endif
-+
-+    /* egl_vout_write_header() should have accepted only supported formats */
-+    if ((flags & AV_WRITE_UNCODED_FRAME_QUERY))
-+        return 0;
-+
-+    return 0;
-+}
-+
-+static int egl_vout_control_message(AVFormatContext *s, int type, void *data, size_t data_size)
-+{
-+#if TRACE_ALL
-+    av_log(s, AV_LOG_INFO, "%s: %d\n", __func__, type);
-+#endif
-+    switch(type) {
-+    case AV_APP_TO_DEV_WINDOW_REPAINT:
-+        return 0;
-+    default:
-+        break;
-+    }
-+    return AVERROR(ENOSYS);
-+}
-+
-+// deinit is called if init fails so no need to clean up explicity here
-+static int egl_vout_init(struct AVFormatContext * s)
-+{
-+    egl_display_env_t * const de = s->priv_data;
-+    unsigned int i;
-+
-+    av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
-+
-+    de->setup = (struct egl_setup){0};
-+
-+    for (i = 0; i != 32; ++i) {
-+        de->aux[i].fd = -1;
-+    }
-+
-+    de->q_terminate = 0;
-+    pthread_mutex_init(&de->q_lock, NULL);
-+    sem_init(&de->q_sem, 0, 0);
-+    sem_init(&de->display_start_sem, 0, 0);
-+    av_assert0(pthread_create(&de->q_thread, NULL, display_thread, s) == 0);
-+
-+    sem_wait(&de->display_start_sem);
-+    if (de->q_terminate) {
-+       av_log(s, AV_LOG_ERROR, "%s: Display startup failure\n", __func__);
-+       return -1;
-+    }
-+
-+    av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__);
-+
-+    return 0;
-+}
-+
-+static void egl_vout_deinit(struct AVFormatContext * s)
-+{
-+    egl_display_env_t * const de = s->priv_data;
-+
-+    av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
-+
-+    de->q_terminate = 1;
-+    sem_post(&de->q_sem);
-+    pthread_join(de->q_thread, NULL);
-+    sem_destroy(&de->q_sem);
-+    pthread_mutex_destroy(&de->q_lock);
-+
-+    av_frame_free(&de->q_next);
-+    av_frame_free(&de->q_this);
-+
-+    av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__);
-+}
-+
-+#define OFFSET(x) offsetof(egl_display_env_t, x)
-+static const AVOption options[] = {
-+   { "show_all", "show all frames", OFFSET(show_all), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM },
-+   { "window_size",  "set window forced size", OFFSET(window_width), AV_OPT_TYPE_IMAGE_SIZE, {.str = NULL}, 0, 0, AV_OPT_FLAG_ENCODING_PARAM },
-+   { "window_x",     "set window x offset",    OFFSET(window_x),     AV_OPT_TYPE_INT,    {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM },
-+   { "window_y",     "set window y offset",    OFFSET(window_y),     AV_OPT_TYPE_INT,    {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM },
-+   { "fullscreen",   "set fullscreen display", OFFSET(fullscreen),   AV_OPT_TYPE_BOOL,   {.i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM },
-+    { NULL }
-+
-+};
-+
-+static const AVClass egl_vout_class = {
-+    .class_name = "egl vid outdev",
-+    .item_name  = av_default_item_name,
-+    .option     = options,
-+    .version    = LIBAVUTIL_VERSION_INT,
-+    .category   = AV_CLASS_CATEGORY_DEVICE_VIDEO_OUTPUT,
-+};
-+
-+AVOutputFormat ff_vout_egl_muxer = {
-+    .name           = "vout_egl",
-+    .long_name      = NULL_IF_CONFIG_SMALL("Egl video output device"),
-+    .priv_data_size = sizeof(egl_display_env_t),
-+    .audio_codec    = AV_CODEC_ID_NONE,
-+    .video_codec    = AV_CODEC_ID_WRAPPED_AVFRAME,
-+    .write_header   = egl_vout_write_header,
-+    .write_packet   = egl_vout_write_packet,
-+    .write_uncoded_frame = egl_vout_write_frame,
-+    .write_trailer  = egl_vout_write_trailer,
-+    .control_message = egl_vout_control_message,
-+    .flags          = AVFMT_NOFILE | AVFMT_VARIABLE_FPS | AVFMT_NOTIMESTAMPS,
-+    .priv_class     = &egl_vout_class,
-+    .init           = egl_vout_init,
-+    .deinit         = egl_vout_deinit,
-+};
-+
---- /dev/null
-+++ b/libavdevice/rpi_vout.c
-@@ -0,0 +1,534 @@
-+/*
-+ * Copyright (c) 2013 Jeff Moguillansky
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+/**
-+ * @file
-+ * XVideo output device
-+ *
-+ * TODO:
-+ * - add support to more formats
-+ */
-+
-+#include "libavutil/opt.h"
-+#include "libavutil/avassert.h"
-+#include "libavutil/pixdesc.h"
-+#include "libavutil/imgutils.h"
-+#include "libavformat/internal.h"
-+#include "avdevice.h"
-+
-+#include <stdatomic.h>
-+#include <unistd.h>
-+
-+#pragma GCC diagnostic push
-+// Many many redundant decls in the header files
-+#pragma GCC diagnostic ignored "-Wredundant-decls"
-+#include <bcm_host.h>
-+#include <interface/mmal/mmal.h>
-+#include <interface/mmal/mmal_parameters_camera.h>
-+#include <interface/mmal/mmal_buffer.h>
-+#include <interface/mmal/mmal_port.h>
-+#include <interface/mmal/util/mmal_util.h>
-+#include <interface/mmal/util/mmal_default_components.h>
-+#include <interface/mmal/util/mmal_connection.h>
-+#include <interface/mmal/util/mmal_util_params.h>
-+#pragma GCC diagnostic pop
-+#include "libavutil/rpi_sand_fns.h"
-+#include "libavcodec/rpi_zc.h"
-+
-+#define TRACE_ALL 0
-+
-+#define DISPLAY_PORT_DEPTH 4
-+
-+typedef struct rpi_display_env_s
-+{
-+    AVClass *class;
-+
-+    MMAL_COMPONENT_T* display;
-+    MMAL_COMPONENT_T* isp;
-+    MMAL_PORT_T * port_in;  // Input port of either isp or display depending on pipe setup
-+    MMAL_CONNECTION_T * conn;
-+
-+    MMAL_POOL_T *rpi_pool;
-+    volatile int rpi_display_count;
-+
-+    MMAL_FOURCC_T req_fmt;
-+    MMAL_VIDEO_FORMAT_T req_vfmt;
-+
-+    AVZcEnvPtr zc;
-+
-+    int window_width, window_height;
-+    int window_x, window_y;
-+    int layer, fullscreen;
-+    int show_all;
-+} rpi_display_env_t;
-+
-+
-+static void display_cb_input(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer) {
-+    mmal_buffer_header_release(buffer);
-+}
-+
-+static void display_cb_control(MMAL_PORT_T *port,MMAL_BUFFER_HEADER_T *buffer) {
-+    mmal_buffer_header_release(buffer);
-+}
-+
-+
-+static MMAL_FOURCC_T mmfmt_from_avfmt(const enum AVPixelFormat fmt)
-+{
-+    switch (fmt) {
-+    case AV_PIX_FMT_SAND128:
-+    case AV_PIX_FMT_RPI4_8:
-+        return MMAL_ENCODING_YUVUV128;
-+    case AV_PIX_FMT_RPI4_10:
-+        return MMAL_ENCODING_YUV10_COL;
-+    case AV_PIX_FMT_SAND64_10:
-+        return MMAL_ENCODING_YUVUV64_10;
-+    case AV_PIX_FMT_SAND64_16:
-+        return MMAL_ENCODING_YUVUV64_16;
-+    case AV_PIX_FMT_YUV420P:
-+        return MMAL_ENCODING_I420;
-+
-+    default:
-+        break;
-+    }
-+    return 0;
-+}
-+
-+
-+static void video_format_from_zc_frame(MMAL_ES_FORMAT_T* const es_fmt,
-+                                       const AVFrame * const frame, const AVRpiZcRefPtr fr_ref)
-+{
-+    MMAL_VIDEO_FORMAT_T *const vfmt = &es_fmt->es->video;
-+    const AVRpiZcFrameGeometry * geo = av_rpi_zc_geometry(fr_ref);
-+    if (av_rpi_is_sand_format(geo->format)) {
-+        // Sand formats are a bit "special"
-+        // stride1 implicit in format
-+        // width = stride2
-+        vfmt->width = geo->stripe_is_yc ?
-+            geo->height_y + geo->height_c : geo->height_y;
-+//        es->height = geo->video_height;  //*** When we get the FLAG this will change
-+        vfmt->height = geo->height_y;
-+        es_fmt->flags = MMAL_ES_FORMAT_FLAG_COL_FMTS_WIDTH_IS_COL_STRIDE;
-+    }
-+    else {
-+        vfmt->width = geo->stride_y / geo->bytes_per_pel;
-+        vfmt->height = geo->height_y;
-+        es_fmt->flags = 0;
-+    }
-+
-+    es_fmt->type = MMAL_ES_TYPE_VIDEO;
-+    es_fmt->encoding = mmfmt_from_avfmt(geo->format);
-+    es_fmt->encoding_variant = 0;
-+    es_fmt->bitrate = 0;
-+
-+    vfmt->crop.x = frame->crop_left;
-+    vfmt->crop.y = frame->crop_top;
-+    vfmt->crop.width = av_frame_cropped_width(frame);
-+    vfmt->crop.height = av_frame_cropped_height(frame);
-+
-+    vfmt->frame_rate.den = 0;  // Don't think I know it here
-+    vfmt->frame_rate.num = 0;
-+
-+    vfmt->par.den = frame->sample_aspect_ratio.den;
-+    vfmt->par.num = frame->sample_aspect_ratio.num;
-+
-+    vfmt->color_space = 0;  // Unknown currently
-+}
-+
-+static MMAL_BOOL_T buf_release_cb(MMAL_BUFFER_HEADER_T * buf, void *userdata)
-+{
-+    rpi_display_env_t * const de = userdata;
-+    if (buf->user_data != NULL) {
-+        av_rpi_zc_unref((AVRpiZcRefPtr)buf->user_data);
-+        buf->user_data = NULL;
-+    }
-+    atomic_fetch_add(&de->rpi_display_count, -1);
-+    return MMAL_FALSE;
-+}
-+
-+static inline int avfmt_needs_isp(const enum AVPixelFormat avfmt)
-+{
-+    return avfmt == AV_PIX_FMT_SAND64_10;
-+}
-+
-+static void isp_remove(AVFormatContext * const s, rpi_display_env_t * const de)
-+{
-+    if (de->isp != NULL)
-+    {
-+        if (de->isp->input[0]->is_enabled)
-+            mmal_port_disable(de->isp->input[0]);
-+        if (de->isp->control->is_enabled)
-+            mmal_port_disable(de->isp->control);
-+    }
-+    if (de->conn != NULL) {
-+        mmal_connection_destroy(de->conn);
-+        de->conn = NULL;
-+    }
-+    if (de->isp != NULL) {
-+        mmal_component_destroy(de->isp);
-+        de->isp = NULL;
-+    }
-+}
-+
-+static void display_frame(AVFormatContext * const s, rpi_display_env_t * const de, const AVFrame* const fr)
-+{
-+    MMAL_BUFFER_HEADER_T* buf = NULL;
-+    AVRpiZcRefPtr fr_buf = NULL;
-+
-+    if (de == NULL)
-+        return;
-+
-+    if (atomic_load(&de->rpi_display_count) >= DISPLAY_PORT_DEPTH - 1) {
-+        av_log(s, AV_LOG_VERBOSE, "Frame dropped\n");
-+        return;
-+    }
-+
-+    if ((fr_buf = av_rpi_zc_ref(s, de->zc, fr, fr->format, 1)) == NULL) {
-+        return;
-+    }
-+
-+    buf = mmal_queue_get(de->rpi_pool->queue);
-+    if (!buf) {
-+        // Running too fast so drop the frame (unexpected)
-+        goto fail;
-+    }
-+
-+    buf->cmd = 0;
-+    buf->offset = 0;
-+    buf->flags = 0;
-+    mmal_buffer_header_reset(buf);
-+
-+    atomic_fetch_add(&de->rpi_display_count, 1);  // Deced on release
-+    mmal_buffer_header_pre_release_cb_set(buf, buf_release_cb, de);
-+
-+    buf->user_data = fr_buf;
-+    buf->data = (uint8_t *)av_rpi_zc_vc_handle(fr_buf);  // Cast our handle to a pointer for mmal
-+    buf->offset = av_rpi_zc_offset(fr_buf);
-+    buf->length = av_rpi_zc_length(fr_buf);
-+    buf->alloc_size = av_rpi_zc_numbytes(fr_buf);
-+
-+    while (de->show_all && atomic_load(&de->rpi_display_count) >= DISPLAY_PORT_DEPTH - 1) {
-+        usleep(5000);
-+    }
-+
-+    {
-+        MMAL_ES_SPECIFIC_FORMAT_T new_ess = {.video = {0}};
-+        MMAL_ES_FORMAT_T new_es = {.es = &new_ess};
-+		MMAL_VIDEO_FORMAT_T * const new_vfmt = &new_ess.video;
-+
-+        video_format_from_zc_frame(&new_es, fr, fr_buf);
-+        if (de->req_fmt != new_es.encoding ||
-+            de->req_vfmt.width       != new_vfmt->width ||
-+            de->req_vfmt.height      != new_vfmt->height ||
-+            de->req_vfmt.crop.x      != new_vfmt->crop.x ||
-+            de->req_vfmt.crop.y      != new_vfmt->crop.y ||
-+            de->req_vfmt.crop.width  != new_vfmt->crop.width ||
-+            de->req_vfmt.crop.height != new_vfmt->crop.height) {
-+            // Something has changed
-+
-+            // If we have an ISP tear it down
-+            isp_remove(s, de);
-+            de->port_in = de->display->input[0];
-+
-+            // If we still need an ISP create it now
-+            if (avfmt_needs_isp(fr->format))
-+            {
-+                if (mmal_component_create("vc.ril.isp", &de->isp) != MMAL_SUCCESS)
-+                {
-+                    av_log(s, AV_LOG_ERROR, "ISP creation failed\n");
-+                    goto fail;
-+                }
-+                de->port_in = de->isp->input[0];
-+            }
-+
-+            mmal_format_copy(de->port_in->format, &new_es);
-+
-+            if (mmal_port_format_commit(de->port_in)) {
-+                av_log(s, AV_LOG_ERROR, "Failed to commit input format\n");
-+                goto fail;
-+            }
-+
-+            // If we have an ISP then we must want to use it
-+            if (de->isp != NULL) {
-+                MMAL_PORT_T * const port_out = de->isp->output[0];
-+                MMAL_VIDEO_FORMAT_T* vfmt_in = &de->port_in->format->es->video;
-+                MMAL_VIDEO_FORMAT_T* vfmt_out = &port_out->format->es->video;
-+
-+                port_out->format->type = MMAL_ES_TYPE_VIDEO;
-+                port_out->format->encoding  = MMAL_ENCODING_YUVUV128;
-+                port_out->format->encoding_variant = 0;
-+                port_out->format->bitrate = 0;
-+                port_out->format->flags = 0;
-+                port_out->format->extradata = NULL;
-+                port_out->format->extradata_size = 0;
-+
-+                vfmt_out->width       = (vfmt_in->crop.width + 31) & ~31;
-+                vfmt_out->height      = (vfmt_in->crop.height + 15) & ~15;
-+                vfmt_out->crop.x      = 0;
-+                vfmt_out->crop.y      = 0;
-+                vfmt_out->crop.width  = vfmt_in->crop.width;
-+                vfmt_out->crop.height = vfmt_in->crop.height;
-+                vfmt_out->frame_rate  = vfmt_in->frame_rate;
-+                vfmt_out->par         = vfmt_in->par;
-+                vfmt_out->color_space = vfmt_in->color_space;
-+
-+                if (mmal_port_format_commit(port_out)) {
-+                    av_log(s, AV_LOG_ERROR, "Failed to commit output format\n");
-+                    goto fail;
-+                }
-+
-+                if (mmal_connection_create(&de->conn, port_out, de->display->input[0], MMAL_CONNECTION_FLAG_TUNNELLING) != MMAL_SUCCESS) {
-+                    av_log(s, AV_LOG_ERROR, "Failed to create connection\n");
-+                    goto fail;
-+                }
-+                if (mmal_connection_enable(de->conn) != MMAL_SUCCESS) {
-+                    av_log(s, AV_LOG_ERROR, "Failed to enable connection\n");
-+                    goto fail;
-+                }
-+                mmal_port_enable(de->isp->control,display_cb_control);
-+                mmal_component_enable(de->isp);
-+            }
-+
-+            // Number of slots in my port Q
-+            de->port_in->buffer_num = DISPLAY_PORT_DEPTH;
-+            // Size to keep it happy - isn't used for anything other than error checking
-+            de->port_in->buffer_size = buf->alloc_size;
-+            if (!de->port_in->is_enabled)
-+            {
-+                mmal_port_parameter_set_boolean(de->port_in, MMAL_PARAMETER_ZERO_COPY, MMAL_TRUE); // Does this mark that the buffer contains a vc_handle?  Would have expected a vc_image?
-+                if (mmal_port_enable(de->port_in, display_cb_input) != MMAL_SUCCESS) {
-+                    av_log(s, AV_LOG_ERROR, "Failed to enable input port\n");
-+                    goto fail;
-+                }
-+            }
-+
-+            de->req_fmt  = new_es.encoding;
-+            de->req_vfmt = *new_vfmt;
-+        }
-+    }
-+
-+    if (mmal_port_send_buffer(de->port_in, buf) != MMAL_SUCCESS)
-+    {
-+        av_log(s, AV_LOG_ERROR, "mmal_port_send_buffer failed: depth=%d\n", de->rpi_display_count);
-+        goto fail;
-+    }
-+    return;
-+
-+fail:
-+    // If we have a buf then fr_buf is held by that
-+    if (buf != NULL)
-+        mmal_buffer_header_release(buf);
-+    else if (fr_buf != NULL)
-+        av_rpi_zc_unref(fr_buf);
-+}
-+
-+
-+static int xv_write_trailer(AVFormatContext *s)
-+{
-+    rpi_display_env_t * const de = s->priv_data;
-+#if TRACE_ALL
-+    av_log(s, AV_LOG_INFO, "%s\n", __func__);
-+#endif
-+    if (de->port_in != NULL && de->port_in->is_enabled) {
-+        mmal_port_disable(de->port_in);
-+    }
-+
-+    // The above disable should kick out all buffers - check that
-+    if (atomic_load(&de->rpi_display_count) != 0) {
-+        av_log(s, AV_LOG_WARNING, "Exiting with display count non-zero:%d\n", atomic_load(&de->rpi_display_count));
-+    }
-+
-+    isp_remove(s, de);
-+    if (de->rpi_pool != NULL) {
-+        mmal_pool_destroy(de->rpi_pool);
-+        de->rpi_pool = NULL;
-+    }
-+    if (de->display != NULL) {
-+        mmal_component_destroy(de->display);
-+        de->display = NULL;
-+    }
-+
-+    return 0;
-+}
-+
-+static int xv_write_header(AVFormatContext *s)
-+{
-+    rpi_display_env_t * const de = s->priv_data;
-+    const AVCodecParameters * const par = s->streams[0]->codecpar;
-+    const unsigned int w = de->window_width ? de->window_width : par->width;
-+    const unsigned int h = de->window_height ? de->window_height : par->height;
-+    const unsigned int x = de->window_x;
-+    const unsigned int y = de->window_y;
-+    const int layer = de->layer ? de->layer : 2;
-+    const MMAL_BOOL_T fullscreen = de->fullscreen;
-+
-+#if TRACE_ALL
-+    av_log(s, AV_LOG_INFO, "%s: %dx%d\n", __func__, w, h);
-+#endif
-+    if (   s->nb_streams > 1
-+        || par->codec_type != AVMEDIA_TYPE_VIDEO
-+        || par->codec_id   != AV_CODEC_ID_WRAPPED_AVFRAME) {
-+        av_log(s, AV_LOG_ERROR, "Only supports one wrapped avframe stream\n");
-+        return AVERROR(EINVAL);
-+    }
-+
-+    {
-+        MMAL_DISPLAYREGION_T region =
-+        {
-+            .hdr = {MMAL_PARAMETER_DISPLAYREGION, sizeof(region)},
-+            .set = MMAL_DISPLAY_SET_LAYER | MMAL_DISPLAY_SET_FULLSCREEN |
-+                MMAL_DISPLAY_SET_DEST_RECT | MMAL_DISPLAY_SET_ALPHA,
-+            .layer = layer,
-+            .fullscreen = fullscreen,
-+            .dest_rect = {x, y, w, h},
-+            .alpha = !fullscreen ? 0xff : 0xff | MMAL_DISPLAY_ALPHA_FLAGS_DISCARD_LOWER_LAYERS,
-+        };
-+
-+        bcm_host_init();  // Needs to be done by someone...
-+
-+        if (mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &de->display) != MMAL_SUCCESS)
-+        {
-+            av_log(s, AV_LOG_ERROR, "Failed to create display component\n");
-+            goto fail;
-+        }
-+        de->port_in = de->display->input[0];
-+
-+        mmal_port_parameter_set(de->display->input[0], &region.hdr);
-+
-+        if (mmal_component_enable(de->display) != MMAL_SUCCESS)
-+        {
-+            av_log(s, AV_LOG_ERROR, "Failed to enable display component\n");
-+            goto fail;
-+        }
-+        if (mmal_port_enable(de->display->control,display_cb_control) != MMAL_SUCCESS)
-+        {
-+            av_log(s, AV_LOG_ERROR, "Failed to enable display control port\n");
-+            goto fail;
-+        }
-+
-+        if ((de->rpi_pool = mmal_pool_create(DISPLAY_PORT_DEPTH, 0)) == NULL)
-+        {
-+            av_log(s, AV_LOG_ERROR, "Failed to create pool\n");
-+            goto fail;
-+        }
-+    }
-+
-+    return 0;
-+
-+fail:
-+    xv_write_trailer(s);
-+    return AVERROR_UNKNOWN;
-+}
-+
-+static int xv_write_packet(AVFormatContext *s, AVPacket *pkt)
-+{
-+    AVFrame * const frame = (AVFrame *)pkt->data;
-+#if TRACE_ALL
-+    av_log(s, AV_LOG_INFO, "%s\n", __func__);
-+#endif
-+    display_frame(s, s->priv_data, frame);
-+    return 0;
-+}
-+
-+static int xv_write_frame(AVFormatContext *s, int stream_index, AVFrame **ppframe,
-+                          unsigned flags)
-+{
-+#if TRACE_ALL
-+    av_log(s, AV_LOG_INFO, "%s: idx=%d, flags=%#x\n", __func__, stream_index, flags);
-+#endif
-+
-+    /* xv_write_header() should have accepted only supported formats */
-+    if ((flags & AV_WRITE_UNCODED_FRAME_QUERY))
-+        return 0;
-+//    return write_picture(s, (*frame)->data, (*frame)->linesize);
-+
-+    display_frame(s, s->priv_data, *ppframe);
-+    return 0;
-+}
-+
-+static int xv_control_message(AVFormatContext *s, int type, void *data, size_t data_size)
-+{
-+#if TRACE_ALL
-+    av_log(s, AV_LOG_INFO, "%s: %d\n", __func__, type);
-+#endif
-+    switch(type) {
-+    case AV_APP_TO_DEV_WINDOW_REPAINT:
-+        return 0;
-+    default:
-+        break;
-+    }
-+    return AVERROR(ENOSYS);
-+}
-+
-+// deinit is called if init fails so no need to clean up explicity here
-+static int rpi_vout_init(struct AVFormatContext * s)
-+{
-+    rpi_display_env_t * const de = s->priv_data;
-+
-+    // Get a ZC context in case we need one - has little overhead if unused
-+    if ((de->zc = av_rpi_zc_int_env_alloc(s)) == NULL)
-+        return 1;
-+
-+    return 0;
-+}
-+
-+static void rpi_vout_deinit(struct AVFormatContext * s)
-+{
-+    rpi_display_env_t * const de = s->priv_data;
-+
-+    av_rpi_zc_int_env_freep(&de->zc);
-+}
-+
-+
-+#define OFFSET(x) offsetof(rpi_display_env_t, x)
-+static const AVOption options[] = {
-+    { "show_all",     "show all frames",        OFFSET(show_all),     AV_OPT_TYPE_BOOL,   {.i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM },
-+    { "window_size",  "set window forced size", OFFSET(window_width), AV_OPT_TYPE_IMAGE_SIZE, {.str = NULL}, 0, 0, AV_OPT_FLAG_ENCODING_PARAM },
-+    { "window_x",     "set window x offset",    OFFSET(window_x),     AV_OPT_TYPE_INT,    {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM },
-+    { "window_y",     "set window y offset",    OFFSET(window_y),     AV_OPT_TYPE_INT,    {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM },
-+    { "display_layer","set display layer",      OFFSET(layer),        AV_OPT_TYPE_INT,    {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM },
-+    { "fullscreen",   "set fullscreen display", OFFSET(fullscreen),   AV_OPT_TYPE_BOOL,   {.i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM },
-+    { NULL }
-+
-+};
-+
-+static const AVClass xv_class = {
-+    .class_name = "rpi vid outdev",
-+    .item_name  = av_default_item_name,
-+    .option     = options,
-+    .version    = LIBAVUTIL_VERSION_INT,
-+    .category   = AV_CLASS_CATEGORY_DEVICE_VIDEO_OUTPUT,
-+};
-+
-+AVOutputFormat ff_vout_rpi_muxer = {
-+    .name           = "vout_rpi",
-+    .long_name      = NULL_IF_CONFIG_SMALL("Rpi (mmal) video output device"),
-+    .priv_data_size = sizeof(rpi_display_env_t),
-+    .audio_codec    = AV_CODEC_ID_NONE,
-+    .video_codec    = AV_CODEC_ID_WRAPPED_AVFRAME,
-+    .write_header   = xv_write_header,
-+    .write_packet   = xv_write_packet,
-+    .write_uncoded_frame = xv_write_frame,
-+    .write_trailer  = xv_write_trailer,
-+    .control_message = xv_control_message,
-+    .flags          = AVFMT_NOFILE | AVFMT_VARIABLE_FPS | AVFMT_NOTIMESTAMPS,
-+    .priv_class     = &xv_class,
-+    .init           = rpi_vout_init,
-+    .deinit         = rpi_vout_deinit,
-+};
---- a/libavfilter/Makefile
-+++ b/libavfilter/Makefile
-@@ -218,6 +218,7 @@ OBJS-$(CONFIG_DEFLATE_FILTER)
- OBJS-$(CONFIG_DEFLICKER_FILTER)              += vf_deflicker.o
- OBJS-$(CONFIG_DEINTERLACE_QSV_FILTER)        += vf_deinterlace_qsv.o
- OBJS-$(CONFIG_DEINTERLACE_VAAPI_FILTER)      += vf_deinterlace_vaapi.o vaapi_vpp.o
-+OBJS-$(CONFIG_DEINTERLACE_V4L2M2M_FILTER)    += vf_deinterlace_v4l2m2m.o
- OBJS-$(CONFIG_DEJUDDER_FILTER)               += vf_dejudder.o
- OBJS-$(CONFIG_DELOGO_FILTER)                 += vf_delogo.o
- OBJS-$(CONFIG_DENOISE_VAAPI_FILTER)          += vf_misc_vaapi.o vaapi_vpp.o
-@@ -434,6 +435,7 @@ OBJS-$(CONFIG_TRANSPOSE_OPENCL_FILTER)
- OBJS-$(CONFIG_TRANSPOSE_VAAPI_FILTER)        += vf_transpose_vaapi.o vaapi_vpp.o
- OBJS-$(CONFIG_TRIM_FILTER)                   += trim.o
- OBJS-$(CONFIG_UNPREMULTIPLY_FILTER)          += vf_premultiply.o framesync.o
-+OBJS-$(CONFIG_UNSAND_FILTER)                 += vf_unsand.o
- OBJS-$(CONFIG_UNSHARP_FILTER)                += vf_unsharp.o
- OBJS-$(CONFIG_UNSHARP_OPENCL_FILTER)         += vf_unsharp_opencl.o opencl.o \
-                                                 opencl/unsharp.o
---- a/libavfilter/allfilters.c
-+++ b/libavfilter/allfilters.c
-@@ -204,6 +204,7 @@ extern AVFilter ff_vf_dedot;
- extern AVFilter ff_vf_deflate;
- extern AVFilter ff_vf_deflicker;
- extern AVFilter ff_vf_deinterlace_qsv;
-+extern AVFilter ff_vf_deinterlace_v4l2m2m;
- extern AVFilter ff_vf_deinterlace_vaapi;
- extern AVFilter ff_vf_dejudder;
- extern AVFilter ff_vf_delogo;
-@@ -414,6 +415,7 @@ extern AVFilter ff_vf_transpose_opencl;
- extern AVFilter ff_vf_transpose_vaapi;
- extern AVFilter ff_vf_trim;
- extern AVFilter ff_vf_unpremultiply;
-+extern AVFilter ff_vf_unsand;
- extern AVFilter ff_vf_unsharp;
- extern AVFilter ff_vf_unsharp_opencl;
- extern AVFilter ff_vf_untile;
---- a/libavfilter/avfiltergraph.c
-+++ b/libavfilter/avfiltergraph.c
-@@ -32,6 +32,9 @@
- #include "libavutil/internal.h"
- #include "libavutil/opt.h"
- #include "libavutil/pixdesc.h"
-+#if CONFIG_UNSAND_FILTER
-+#include "libavutil/rpi_sand_fns.h"
-+#endif
- 
- #define FF_INTERNAL_FIELDS 1
- #include "framequeue.h"
-@@ -427,6 +430,19 @@ static int can_merge_formats(AVFilterFor
-     }
- }
- 
-+#if CONFIG_UNSAND_FILTER
-+static int has_sand_format(const AVFilterFormats * const ff)
-+{
-+    int i;
-+    for (i = 0; i != ff->nb_formats; ++i) {
-+        if (av_rpi_is_sand_format(ff->formats[i])) {
-+            return 1;
-+        }
-+    }
-+    return 0;
-+}
-+#endif
-+
- /**
-  * Perform one round of query_formats() and merging formats lists on the
-  * filter graph.
-@@ -467,6 +483,7 @@ static int query_formats(AVFilterGraph *
-         for (j = 0; j < filter->nb_inputs; j++) {
-             AVFilterLink *link = filter->inputs[j];
-             int convert_needed = 0;
-+            unsigned int extra_convert_tried = 0;
- 
-             if (!link)
-                 continue;
-@@ -514,11 +531,14 @@ static int query_formats(AVFilterGraph *
-             )
- #undef MERGE_DISPATCH
- 
--            if (convert_needed) {
-+            while (convert_needed) {
-                 AVFilterContext *convert;
-                 const AVFilter *filter;
-                 AVFilterLink *inlink, *outlink;
-                 char inst_name[30];
-+                int can_retry = 0;
-+
-+                convert_needed = 0;
- 
-                 if (graph->disable_auto_convert) {
-                     av_log(log_ctx, AV_LOG_ERROR,
-@@ -531,19 +551,45 @@ static int query_formats(AVFilterGraph *
-                 /* couldn't merge format lists. auto-insert conversion filter */
-                 switch (link->type) {
-                 case AVMEDIA_TYPE_VIDEO:
--                    if (!(filter = avfilter_get_by_name("scale"))) {
--                        av_log(log_ctx, AV_LOG_ERROR, "'scale' filter "
--                               "not present, cannot convert pixel formats.\n");
--                        return AVERROR(EINVAL);
--                    }
--
--                    snprintf(inst_name, sizeof(inst_name), "auto_scaler_%d",
--                             scaler_count++);
-+#if CONFIG_UNSAND_FILTER
-+                    // Only try each extra conversion once
-+                    // The unsand output pad should never trigger has_sand_format
-+                    // but it is better to be safe
-+                    if ((extra_convert_tried & 1) == 0 && has_sand_format(link->in_formats)) {
-+                        if (!(filter = avfilter_get_by_name("unsand"))) {
-+                            av_log(log_ctx, AV_LOG_ERROR, "'unsand' filter "
-+                                   "not present, cannot convert pixel formats.\n");
-+                            return AVERROR(EINVAL);
-+                        }
-+
-+                        snprintf(inst_name, sizeof(inst_name), "auto_unsand_%d",
-+                                 scaler_count++);
-+
-+                        if ((ret = avfilter_graph_create_filter(&convert, filter,
-+                                                                inst_name, "", NULL,
-+                                                                graph)) < 0)
-+                            return ret;
- 
--                    if ((ret = avfilter_graph_create_filter(&convert, filter,
--                                                            inst_name, graph->scale_sws_opts, NULL,
--                                                            graph)) < 0)
--                        return ret;
-+                        extra_convert_tried |= 1;
-+                        can_retry = 1;
-+                    }
-+                    else
-+#endif
-+                    {
-+                        if (!(filter = avfilter_get_by_name("scale"))) {
-+                            av_log(log_ctx, AV_LOG_ERROR, "'scale' filter "
-+                                   "not present, cannot convert pixel formats.\n");
-+                            return AVERROR(EINVAL);
-+                        }
-+
-+                        snprintf(inst_name, sizeof(inst_name), "auto_scaler_%d",
-+                                 scaler_count++);
-+
-+                        if ((ret = avfilter_graph_create_filter(&convert, filter,
-+                                                                inst_name, graph->scale_sws_opts, NULL,
-+                                                                graph)) < 0)
-+                            return ret;
-+                    }
-                     break;
-                 case AVMEDIA_TYPE_AUDIO:
-                     if (!(filter = avfilter_get_by_name("aresample"))) {
-@@ -585,9 +631,19 @@ static int query_formats(AVFilterGraph *
-                     av_assert0(outlink-> in_channel_layouts->refcount > 0);
-                     av_assert0(outlink->out_channel_layouts->refcount > 0);
-                 }
--                if (!ff_merge_formats( inlink->in_formats,  inlink->out_formats,  inlink->type) ||
--                    !ff_merge_formats(outlink->in_formats, outlink->out_formats, outlink->type))
-+                // If we have added an extra filter we must merge the input
-+                // side but we can have another go at the output
-+                if (!ff_merge_formats( inlink->in_formats,  inlink->out_formats,  inlink->type))
-                     ret = AVERROR(ENOSYS);
-+                else if (!ff_merge_formats(outlink->in_formats, outlink->out_formats, outlink->type))
-+                {
-+                    if (can_retry) {
-+                        link = outlink;
-+                        convert_needed = 1;
-+                        continue;
-+                    }
-+                    ret = AVERROR(ENOSYS);
-+                }
-                 if (inlink->type == AVMEDIA_TYPE_AUDIO &&
-                     (!ff_merge_samplerates(inlink->in_samplerates,
-                                            inlink->out_samplerates) ||
---- a/libavfilter/buffersrc.c
-+++ b/libavfilter/buffersrc.c
-@@ -210,7 +210,7 @@ static int av_buffersrc_add_frame_intern
- 
-         switch (ctx->outputs[0]->type) {
-         case AVMEDIA_TYPE_VIDEO:
--            CHECK_VIDEO_PARAM_CHANGE(ctx, s, frame->width, frame->height,
-+            CHECK_VIDEO_PARAM_CHANGE(ctx, s, av_frame_cropped_width(frame), av_frame_cropped_height(frame),
-                                      frame->format, frame->pts);
-             break;
-         case AVMEDIA_TYPE_AUDIO:
---- /dev/null
-+++ b/libavfilter/vf_deinterlace_v4l2m2m.c
-@@ -0,0 +1,1282 @@
-+/*
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+/**
-+ * @file
-+ * deinterlace video filter - V4L2 M2M
-+ */
-+
-+#include <drm_fourcc.h>
-+
-+#include <linux/videodev2.h>
-+
-+#include <dirent.h>
-+#include <fcntl.h>
-+#include <poll.h>
-+#include <stdatomic.h>
-+#include <stdio.h>
-+#include <string.h>
-+#include <sys/ioctl.h>
-+#include <sys/mman.h>
-+#include <unistd.h>
-+
-+#include "libavutil/avassert.h"
-+#include "libavutil/avstring.h"
-+#include "libavutil/common.h"
-+#include "libavutil/hwcontext.h"
-+#include "libavutil/hwcontext_drm.h"
-+#include "libavutil/internal.h"
-+#include "libavutil/mathematics.h"
-+#include "libavutil/opt.h"
-+#include "libavutil/pixdesc.h"
-+#include "libavutil/time.h"
-+
-+#define FF_INTERNAL_FIELDS 1
-+#include "framequeue.h"
-+#include "filters.h"
-+#include "avfilter.h"
-+#include "formats.h"
-+#include "internal.h"
-+#include "video.h"
-+
-+typedef struct V4L2Queue V4L2Queue;
-+typedef struct DeintV4L2M2MContextShared DeintV4L2M2MContextShared;
-+
-+typedef struct V4L2PlaneInfo {
-+    int bytesperline;
-+    size_t length;
-+} V4L2PlaneInfo;
-+
-+typedef struct V4L2Buffer {
-+    int enqueued;
-+    int reenqueue;
-+    int fd;
-+    struct v4l2_buffer buffer;
-+    AVFrame frame;
-+    struct v4l2_plane planes[VIDEO_MAX_PLANES];
-+    int num_planes;
-+    V4L2PlaneInfo plane_info[VIDEO_MAX_PLANES];
-+    AVDRMFrameDescriptor drm_frame;
-+    V4L2Queue *q;
-+} V4L2Buffer;
-+
-+typedef struct V4L2Queue {
-+    struct v4l2_format format;
-+    int num_buffers;
-+    V4L2Buffer *buffers;
-+    DeintV4L2M2MContextShared *ctx;
-+} V4L2Queue;
-+
-+typedef struct pts_stats_s
-+{
-+    void * logctx;
-+    const char * name;  // For debug
-+    unsigned int last_count;
-+    unsigned int last_interval;
-+    int64_t last_pts;
-+} pts_stats_t;
-+
-+#define PTS_TRACK_SIZE 32
-+typedef struct pts_track_el_s
-+{
-+    uint32_t n;
-+    unsigned int interval;
-+    AVFrame * props;
-+} pts_track_el_t;
-+
-+typedef struct pts_track_s
-+{
-+    uint32_t n;
-+    uint32_t last_n;
-+    int got_2;
-+    void * logctx;
-+    pts_stats_t stats;
-+    pts_track_el_t a[PTS_TRACK_SIZE];
-+} pts_track_t;
-+
-+typedef struct DeintV4L2M2MContextShared {
-+    void * logctx;  // For logging - will be NULL when done
-+
-+    int fd;
-+    int done;
-+    int width;
-+    int height;
-+    int orig_width;
-+    int orig_height;
-+    atomic_uint refcount;
-+
-+    AVBufferRef *hw_frames_ctx;
-+
-+    unsigned int field_order;
-+
-+    pts_track_t track;
-+
-+    V4L2Queue output;
-+    V4L2Queue capture;
-+} DeintV4L2M2MContextShared;
-+
-+typedef struct DeintV4L2M2MContext {
-+    const AVClass *class;
-+
-+    DeintV4L2M2MContextShared *shared;
-+} DeintV4L2M2MContext;
-+
-+static unsigned int pts_stats_interval(const pts_stats_t * const stats)
-+{
-+    return stats->last_interval;
-+}
-+
-+// Pick 64 for max last count - that is >1sec at 60fps
-+#define STATS_LAST_COUNT_MAX 64
-+#define STATS_INTERVAL_MAX (1 << 30)
-+static void pts_stats_add(pts_stats_t * const stats, int64_t pts)
-+{
-+    if (pts == AV_NOPTS_VALUE || pts == stats->last_pts) {
-+        if (stats->last_count < STATS_LAST_COUNT_MAX)
-+            ++stats->last_count;
-+        return;
-+    }
-+
-+    if (stats->last_pts != AV_NOPTS_VALUE) {
-+        const int64_t interval = pts - stats->last_pts;
-+
-+        if (interval < 0 || interval >= STATS_INTERVAL_MAX ||
-+            stats->last_count >= STATS_LAST_COUNT_MAX) {
-+            if (stats->last_interval != 0)
-+                av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: Bad interval: %" PRId64 "/%d\n",
-+                       __func__, stats->name, interval, stats->last_count);
-+            stats->last_interval = 0;
-+        }
-+        else {
-+            const int64_t frame_time = interval / (int64_t)stats->last_count;
-+
-+            if (frame_time != stats->last_interval)
-+                av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: New interval: %u->%" PRId64 "/%d=%" PRId64 "\n",
-+                       __func__, stats->name, stats->last_interval, interval, stats->last_count, frame_time);
-+            stats->last_interval = frame_time;
-+        }
-+    }
-+
-+    stats->last_pts = pts;
-+    stats->last_count = 1;
-+}
-+
-+static void pts_stats_init(pts_stats_t * const stats, void * logctx, const char * name)
-+{
-+    *stats = (pts_stats_t){
-+        .logctx = logctx,
-+        .name = name,
-+        .last_count = 1,
-+        .last_interval = 0,
-+        .last_pts = AV_NOPTS_VALUE
-+    };
-+}
-+
-+static inline uint32_t pts_track_next_n(pts_track_t * const trk)
-+{
-+    if (++trk->n == 0)
-+        trk->n = 1;
-+    return trk->n;
-+}
-+
-+static int pts_track_get_frame(pts_track_t * const trk, const struct timeval tv, AVFrame * const dst)
-+{
-+    uint32_t n = (uint32_t)(tv.tv_usec / 2 + tv.tv_sec * 500000);
-+    pts_track_el_t * t;
-+
-+    // As a first guess assume that n==0 means last frame
-+    if (n == 0) {
-+        n = trk->last_n;
-+        if (n == 0)
-+            goto fail;
-+    }
-+
-+    t = trk->a + (n & (PTS_TRACK_SIZE - 1));
-+
-+    if (t->n != n) {
-+        av_log(trk->logctx, AV_LOG_ERROR, "%s: track failure: got %u, expected %u\n", __func__, n, trk->n);
-+        goto fail;
-+    }
-+
-+    // 1st frame is simple - just believe it
-+    if (n != trk->last_n) {
-+        trk->last_n = n;
-+        trk->got_2 = 0;
-+        return av_frame_copy_props(dst, t->props);
-+    }
-+
-+    // Only believe in a single interpolated frame
-+    if (trk->got_2)
-+        goto fail;
-+    trk->got_2 = 1;
-+
-+    av_frame_copy_props(dst, t->props);
-+
-+
-+    // If we can't guess - don't
-+    if (t->interval == 0) {
-+        dst->best_effort_timestamp = AV_NOPTS_VALUE;
-+        dst->pts = AV_NOPTS_VALUE;
-+        dst->pkt_dts = AV_NOPTS_VALUE;
-+    }
-+    else {
-+        if (dst->best_effort_timestamp != AV_NOPTS_VALUE)
-+            dst->best_effort_timestamp += t->interval / 2;
-+        if (dst->pts != AV_NOPTS_VALUE)
-+            dst->pts += t->interval / 2;
-+        if (dst->pkt_dts != AV_NOPTS_VALUE)
-+            dst->pkt_dts += t->interval / 2;
-+    }
-+
-+    return 0;
-+
-+fail:
-+    trk->last_n = 0;
-+    trk->got_2 = 0;
-+    dst->pts = AV_NOPTS_VALUE;
-+    dst->pkt_dts = AV_NOPTS_VALUE;
-+    return 0;
-+}
-+
-+static struct timeval pts_track_add_frame(pts_track_t * const trk, const AVFrame * const src)
-+{
-+    const uint32_t n = pts_track_next_n(trk);
-+    pts_track_el_t * const t = trk->a + (n & (PTS_TRACK_SIZE - 1));
-+
-+    pts_stats_add(&trk->stats, src->pts);
-+
-+    t->n = n;
-+    t->interval = pts_stats_interval(&trk->stats); // guess that next interval is the same as the last
-+    av_frame_unref(t->props);
-+    av_frame_copy_props(t->props, src);
-+
-+    // We now know what the previous interval was, rather than having to guess,
-+    // so set it.  There is a better than decent chance that this is before
-+    // we use it.
-+    if (t->interval != 0) {
-+        pts_track_el_t * const prev_t = trk->a + ((n - 1) & (PTS_TRACK_SIZE - 1));
-+        prev_t->interval = t->interval;
-+    }
-+
-+    // In case deinterlace interpolates frames use every other usec
-+    return (struct timeval){.tv_sec = n / 500000, .tv_usec = (n % 500000) * 2};
-+}
-+
-+static void pts_track_uninit(pts_track_t * const trk)
-+{
-+    unsigned int i;
-+    for (i = 0; i != PTS_TRACK_SIZE; ++i) {
-+        trk->a[i].n = 0;
-+        av_frame_free(&trk->a[i].props);
-+    }
-+}
-+
-+static int pts_track_init(pts_track_t * const trk, void *logctx)
-+{
-+    unsigned int i;
-+    trk->n = 1;
-+    pts_stats_init(&trk->stats, logctx, "track");
-+    for (i = 0; i != PTS_TRACK_SIZE; ++i) {
-+        trk->a[i].n = 0;
-+        if ((trk->a[i].props = av_frame_alloc()) == NULL) {
-+            pts_track_uninit(trk);
-+            return AVERROR(ENOMEM);
-+        }
-+    }
-+    return 0;
-+}
-+
-+static int deint_v4l2m2m_prepare_context(DeintV4L2M2MContextShared *ctx)
-+{
-+    struct v4l2_capability cap;
-+    int ret;
-+
-+    memset(&cap, 0, sizeof(cap));
-+    ret = ioctl(ctx->fd, VIDIOC_QUERYCAP, &cap);
-+    if (ret < 0)
-+        return ret;
-+
-+    if (!(cap.capabilities & V4L2_CAP_STREAMING))
-+        return AVERROR(EINVAL);
-+
-+    if (cap.capabilities & V4L2_CAP_VIDEO_M2M) {
-+        ctx->capture.format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
-+        ctx->output.format.type = V4L2_BUF_TYPE_VIDEO_OUTPUT;
-+
-+        return 0;
-+    }
-+
-+    if (cap.capabilities & V4L2_CAP_VIDEO_M2M_MPLANE) {
-+        ctx->capture.format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE;
-+        ctx->output.format.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE;
-+
-+        return 0;
-+    }
-+
-+    return AVERROR(EINVAL);
-+}
-+
-+static int deint_v4l2m2m_try_format(V4L2Queue *queue)
-+{
-+    struct v4l2_format *fmt        = &queue->format;
-+    DeintV4L2M2MContextShared *ctx = queue->ctx;
-+    int ret, field;
-+
-+    ret = ioctl(ctx->fd, VIDIOC_G_FMT, fmt);
-+    if (ret)
-+        av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_G_FMT failed: %d\n", ret);
-+
-+    if (V4L2_TYPE_IS_OUTPUT(fmt->type))
-+        field = V4L2_FIELD_INTERLACED_TB;
-+    else
-+        field = V4L2_FIELD_NONE;
-+
-+    if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
-+        fmt->fmt.pix_mp.pixelformat = V4L2_PIX_FMT_YUV420;
-+        fmt->fmt.pix_mp.field = field;
-+        fmt->fmt.pix_mp.width = ctx->width;
-+        fmt->fmt.pix_mp.height = ctx->height;
-+    } else {
-+        fmt->fmt.pix.pixelformat = V4L2_PIX_FMT_YUV420;
-+        fmt->fmt.pix.field = field;
-+        fmt->fmt.pix.width = ctx->width;
-+        fmt->fmt.pix.height = ctx->height;
-+    }
-+
-+    av_log(ctx->logctx, AV_LOG_DEBUG, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u pre\n", __func__,
-+		 fmt->type, fmt->fmt.pix_mp.width, fmt->fmt.pix_mp.height,
-+		 fmt->fmt.pix_mp.pixelformat,
-+		 fmt->fmt.pix_mp.plane_fmt[0].sizeimage, fmt->fmt.pix_mp.plane_fmt[0].bytesperline);
-+
-+    ret = ioctl(ctx->fd, VIDIOC_TRY_FMT, fmt);
-+    if (ret)
-+        return AVERROR(EINVAL);
-+
-+    av_log(ctx->logctx, AV_LOG_DEBUG, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u post\n", __func__,
-+		 fmt->type, fmt->fmt.pix_mp.width, fmt->fmt.pix_mp.height,
-+		 fmt->fmt.pix_mp.pixelformat,
-+		 fmt->fmt.pix_mp.plane_fmt[0].sizeimage, fmt->fmt.pix_mp.plane_fmt[0].bytesperline);
-+
-+    if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
-+        if (fmt->fmt.pix_mp.pixelformat != V4L2_PIX_FMT_YUV420 ||
-+            fmt->fmt.pix_mp.field != field) {
-+            av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt->type);
-+
-+            return AVERROR(EINVAL);
-+        }
-+    } else {
-+        if (fmt->fmt.pix.pixelformat != V4L2_PIX_FMT_YUV420 ||
-+            fmt->fmt.pix.field != field) {
-+            av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt->type);
-+
-+            return AVERROR(EINVAL);
-+        }
-+    }
-+
-+    return 0;
-+}
-+
-+static int deint_v4l2m2m_set_format(V4L2Queue *queue, uint32_t field, int width, int height, int pitch, int ysize)
-+{
-+    struct v4l2_format *fmt        = &queue->format;
-+    DeintV4L2M2MContextShared *ctx = queue->ctx;
-+    int ret;
-+
-+    struct v4l2_selection sel = {
-+        .type = fmt->type,
-+        .target = V4L2_TYPE_IS_OUTPUT(fmt->type) ? V4L2_SEL_TGT_CROP_BOUNDS : V4L2_SEL_TGT_COMPOSE_BOUNDS,
-+    };
-+
-+    if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
-+        fmt->fmt.pix_mp.field = field;
-+        fmt->fmt.pix_mp.width = width;
-+        fmt->fmt.pix_mp.height = ysize / pitch;
-+        fmt->fmt.pix_mp.plane_fmt[0].bytesperline = pitch;
-+        fmt->fmt.pix_mp.plane_fmt[0].sizeimage = ysize + (ysize >> 1);
-+    } else {
-+        fmt->fmt.pix.field = field;
-+        fmt->fmt.pix.width = width;
-+        fmt->fmt.pix.height = height;
-+        fmt->fmt.pix.sizeimage = 0;
-+        fmt->fmt.pix.bytesperline = 0;
-+    }
-+
-+    ret = ioctl(ctx->fd, VIDIOC_S_FMT, fmt);
-+    if (ret)
-+        av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_S_FMT failed: %d\n", ret);
-+
-+    ret = ioctl(ctx->fd, VIDIOC_G_SELECTION, &sel);
-+    if (ret)
-+        av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_G_SELECTION failed: %d\n", ret);
-+
-+    sel.r.width = width;
-+    sel.r.height = height;
-+    sel.r.left = 0;
-+    sel.r.top = 0;
-+    sel.target = V4L2_TYPE_IS_OUTPUT(fmt->type) ? V4L2_SEL_TGT_CROP : V4L2_SEL_TGT_COMPOSE,
-+    sel.flags = V4L2_SEL_FLAG_LE;
-+
-+    ret = ioctl(ctx->fd, VIDIOC_S_SELECTION, &sel);
-+    if (ret)
-+        av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_S_SELECTION failed: %d\n", ret);
-+
-+    return ret;
-+}
-+
-+static int deint_v4l2m2m_probe_device(DeintV4L2M2MContextShared *ctx, char *node)
-+{
-+    int ret;
-+
-+    ctx->fd = open(node, O_RDWR | O_NONBLOCK, 0);
-+    if (ctx->fd < 0)
-+        return AVERROR(errno);
-+
-+    ret = deint_v4l2m2m_prepare_context(ctx);
-+    if (ret)
-+        goto fail;
-+
-+    ret = deint_v4l2m2m_try_format(&ctx->capture);
-+    if (ret)
-+        goto fail;
-+
-+    ret = deint_v4l2m2m_try_format(&ctx->output);
-+    if (ret)
-+        goto fail;
-+
-+    return 0;
-+
-+fail:
-+    close(ctx->fd);
-+    ctx->fd = -1;
-+
-+    return ret;
-+}
-+
-+static int deint_v4l2m2m_find_device(DeintV4L2M2MContextShared *ctx)
-+{
-+    int ret = AVERROR(EINVAL);
-+    struct dirent *entry;
-+    char node[PATH_MAX];
-+    DIR *dirp;
-+
-+    dirp = opendir("/dev");
-+    if (!dirp)
-+        return AVERROR(errno);
-+
-+    for (entry = readdir(dirp); entry; entry = readdir(dirp)) {
-+
-+        if (strncmp(entry->d_name, "video", 5))
-+            continue;
-+
-+        snprintf(node, sizeof(node), "/dev/%s", entry->d_name);
-+        av_log(ctx->logctx, AV_LOG_DEBUG, "probing device %s\n", node);
-+        ret = deint_v4l2m2m_probe_device(ctx, node);
-+        if (!ret)
-+            break;
-+    }
-+
-+    closedir(dirp);
-+
-+    if (ret) {
-+        av_log(ctx->logctx, AV_LOG_ERROR, "Could not find a valid device\n");
-+        ctx->fd = -1;
-+
-+        return ret;
-+    }
-+
-+    av_log(ctx->logctx, AV_LOG_INFO, "Using device %s\n", node);
-+
-+    return 0;
-+}
-+
-+static int deint_v4l2m2m_enqueue_buffer(V4L2Buffer *buf)
-+{
-+    int ret;
-+
-+    ret = ioctl(buf->q->ctx->fd, VIDIOC_QBUF, &buf->buffer);
-+    if (ret < 0)
-+        return AVERROR(errno);
-+
-+    buf->enqueued = 1;
-+
-+    return 0;
-+}
-+
-+static int v4l2_buffer_export_drm(V4L2Buffer* avbuf)
-+{
-+    struct v4l2_exportbuffer expbuf;
-+    int i, ret;
-+
-+    for (i = 0; i < avbuf->num_planes; i++) {
-+        memset(&expbuf, 0, sizeof(expbuf));
-+
-+        expbuf.index = avbuf->buffer.index;
-+        expbuf.type = avbuf->buffer.type;
-+        expbuf.plane = i;
-+
-+        ret = ioctl(avbuf->q->ctx->fd, VIDIOC_EXPBUF, &expbuf);
-+        if (ret < 0)
-+            return AVERROR(errno);
-+
-+        avbuf->fd = expbuf.fd;
-+
-+        if (V4L2_TYPE_IS_MULTIPLANAR(avbuf->buffer.type)) {
-+            /* drm frame */
-+            avbuf->drm_frame.objects[i].size = avbuf->buffer.m.planes[i].length;
-+            avbuf->drm_frame.objects[i].fd = expbuf.fd;
-+            avbuf->drm_frame.objects[i].format_modifier = DRM_FORMAT_MOD_LINEAR;
-+        } else {
-+            /* drm frame */
-+            avbuf->drm_frame.objects[0].size = avbuf->buffer.length;
-+            avbuf->drm_frame.objects[0].fd = expbuf.fd;
-+            avbuf->drm_frame.objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR;
-+        }
-+    }
-+
-+    return 0;
-+}
-+
-+static int deint_v4l2m2m_allocate_buffers(V4L2Queue *queue)
-+{
-+    struct v4l2_format *fmt = &queue->format;
-+    DeintV4L2M2MContextShared *ctx = queue->ctx;
-+    struct v4l2_requestbuffers req;
-+    int ret, i, j, multiplanar;
-+    uint32_t memory;
-+
-+    memory = V4L2_TYPE_IS_OUTPUT(fmt->type) ?
-+        V4L2_MEMORY_DMABUF : V4L2_MEMORY_MMAP;
-+
-+    multiplanar = V4L2_TYPE_IS_MULTIPLANAR(fmt->type);
-+
-+    memset(&req, 0, sizeof(req));
-+    req.count = queue->num_buffers;
-+    req.memory = memory;
-+    req.type = fmt->type;
-+
-+    ret = ioctl(ctx->fd, VIDIOC_REQBUFS, &req);
-+    if (ret < 0) {
-+        av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_REQBUFS failed: %s\n", strerror(errno));
-+
-+        return AVERROR(errno);
-+    }
-+
-+    queue->num_buffers = req.count;
-+    queue->buffers = av_mallocz(queue->num_buffers * sizeof(V4L2Buffer));
-+    if (!queue->buffers) {
-+        av_log(ctx->logctx, AV_LOG_ERROR, "malloc enomem\n");
-+
-+        return AVERROR(ENOMEM);
-+    }
-+
-+    for (i = 0; i < queue->num_buffers; i++) {
-+        V4L2Buffer *buf = &queue->buffers[i];
-+
-+        buf->enqueued = 0;
-+        buf->fd = -1;
-+        buf->q = queue;
-+
-+        buf->buffer.type = fmt->type;
-+        buf->buffer.memory = memory;
-+        buf->buffer.index = i;
-+
-+        if (multiplanar) {
-+            buf->buffer.length = VIDEO_MAX_PLANES;
-+            buf->buffer.m.planes = buf->planes;
-+        }
-+
-+        ret = ioctl(ctx->fd, VIDIOC_QUERYBUF, &buf->buffer);
-+        if (ret < 0) {
-+            ret = AVERROR(errno);
-+
-+            goto fail;
-+        }
-+
-+        if (multiplanar)
-+            buf->num_planes = buf->buffer.length;
-+        else
-+            buf->num_planes = 1;
-+
-+        for (j = 0; j < buf->num_planes; j++) {
-+            V4L2PlaneInfo *info = &buf->plane_info[j];
-+
-+            if (multiplanar) {
-+                info->bytesperline = fmt->fmt.pix_mp.plane_fmt[j].bytesperline;
-+                info->length = buf->buffer.m.planes[j].length;
-+            } else {
-+                info->bytesperline = fmt->fmt.pix.bytesperline;
-+                info->length = buf->buffer.length;
-+            }
-+        }
-+
-+        if (!V4L2_TYPE_IS_OUTPUT(fmt->type)) {
-+            ret = deint_v4l2m2m_enqueue_buffer(buf);
-+            if (ret)
-+                goto fail;
-+
-+            ret = v4l2_buffer_export_drm(buf);
-+            if (ret)
-+                goto fail;
-+        }
-+    }
-+
-+    return 0;
-+
-+fail:
-+    for (i = 0; i < queue->num_buffers; i++)
-+        if (queue->buffers[i].fd >= 0)
-+            close(queue->buffers[i].fd);
-+    av_free(queue->buffers);
-+    queue->buffers = NULL;
-+
-+    return ret;
-+}
-+
-+static int deint_v4l2m2m_streamon(V4L2Queue *queue)
-+{
-+    DeintV4L2M2MContextShared * const ctx = queue->ctx;
-+    int type = queue->format.type;
-+    int ret;
-+
-+    ret = ioctl(ctx->fd, VIDIOC_STREAMON, &type);
-+    av_log(ctx->logctx, AV_LOG_DEBUG, "%s: type:%d ret:%d errno:%d\n", __func__, type, ret, AVERROR(errno));
-+    if (ret < 0)
-+        return AVERROR(errno);
-+
-+    return 0;
-+}
-+
-+static int deint_v4l2m2m_streamoff(V4L2Queue *queue)
-+{
-+    DeintV4L2M2MContextShared * const ctx = queue->ctx;
-+    int type = queue->format.type;
-+    int ret;
-+
-+    ret = ioctl(ctx->fd, VIDIOC_STREAMOFF, &type);
-+    av_log(ctx->logctx, AV_LOG_DEBUG, "%s: type:%d ret:%d errno:%d\n", __func__, type, ret, AVERROR(errno));
-+    if (ret < 0)
-+        return AVERROR(errno);
-+
-+    return 0;
-+}
-+
-+// timeout in ms
-+static V4L2Buffer* deint_v4l2m2m_dequeue_buffer(V4L2Queue *queue, int timeout)
-+{
-+    struct v4l2_plane planes[VIDEO_MAX_PLANES];
-+    DeintV4L2M2MContextShared *ctx = queue->ctx;
-+    struct v4l2_buffer buf = { 0 };
-+    V4L2Buffer* avbuf = NULL;
-+    struct pollfd pfd;
-+    short events;
-+    int ret;
-+
-+    if (V4L2_TYPE_IS_OUTPUT(queue->format.type))
-+        events =  POLLOUT | POLLWRNORM;
-+    else
-+        events = POLLIN | POLLRDNORM;
-+
-+    pfd.events = events;
-+    pfd.fd = ctx->fd;
-+
-+    for (;;) {
-+        ret = poll(&pfd, 1, timeout);
-+        if (ret > 0)
-+            break;
-+        if (errno == EINTR)
-+            continue;
-+        return NULL;
-+    }
-+
-+    if (pfd.revents & POLLERR)
-+        return NULL;
-+
-+    if (pfd.revents & events) {
-+        memset(&buf, 0, sizeof(buf));
-+        buf.memory = V4L2_MEMORY_MMAP;
-+        buf.type = queue->format.type;
-+        if (V4L2_TYPE_IS_MULTIPLANAR(queue->format.type)) {
-+            memset(planes, 0, sizeof(planes));
-+            buf.length = VIDEO_MAX_PLANES;
-+            buf.m.planes = planes;
-+        }
-+
-+        ret = ioctl(ctx->fd, VIDIOC_DQBUF, &buf);
-+        if (ret) {
-+            if (errno != EAGAIN)
-+                av_log(ctx->logctx, AV_LOG_DEBUG, "VIDIOC_DQBUF, errno (%s)\n",
-+                       av_err2str(AVERROR(errno)));
-+            return NULL;
-+        }
-+
-+        avbuf = &queue->buffers[buf.index];
-+        avbuf->enqueued = 0;
-+        avbuf->buffer = buf;
-+        if (V4L2_TYPE_IS_MULTIPLANAR(queue->format.type)) {
-+            memcpy(avbuf->planes, planes, sizeof(planes));
-+            avbuf->buffer.m.planes = avbuf->planes;
-+        }
-+        return avbuf;
-+    }
-+
-+    return NULL;
-+}
-+
-+static V4L2Buffer *deint_v4l2m2m_find_free_buf(V4L2Queue *queue)
-+{
-+    int i;
-+    V4L2Buffer *buf = NULL;
-+
-+    for (i = 0; i < queue->num_buffers; i++)
-+        if (!queue->buffers[i].enqueued) {
-+            buf = &queue->buffers[i];
-+            break;
-+        }
-+    return buf;
-+}
-+
-+static void deint_v4l2m2m_unref_queued(V4L2Queue *queue)
-+{
-+    int i;
-+    V4L2Buffer *buf = NULL;
-+
-+    if (!queue || !queue->buffers)
-+        return;
-+    for (i = 0; i < queue->num_buffers; i++) {
-+        buf = &queue->buffers[i];
-+        if (queue->buffers[i].enqueued)
-+            av_frame_unref(&buf->frame);
-+    }
-+}
-+
-+static void recycle_q(V4L2Queue * const queue)
-+{
-+    V4L2Buffer* avbuf;
-+    while (avbuf = deint_v4l2m2m_dequeue_buffer(queue, 0), avbuf) {
-+        av_frame_unref(&avbuf->frame);
-+    }
-+}
-+
-+static int count_enqueued(V4L2Queue *queue)
-+{
-+    int i;
-+    int n = 0;
-+
-+    if (queue->buffers == NULL)
-+        return 0;
-+
-+    for (i = 0; i < queue->num_buffers; i++)
-+        if (queue->buffers[i].enqueued)
-+            ++n;
-+    return n;
-+}
-+
-+static int deint_v4l2m2m_enqueue_frame(V4L2Queue * const queue, AVFrame * const frame)
-+{
-+    DeintV4L2M2MContextShared *const ctx = queue->ctx;
-+    AVDRMFrameDescriptor *drm_desc = (AVDRMFrameDescriptor *)frame->data[0];
-+    V4L2Buffer *buf;
-+    int i;
-+
-+    if (V4L2_TYPE_IS_OUTPUT(queue->format.type))
-+        recycle_q(queue);
-+
-+    buf = deint_v4l2m2m_find_free_buf(queue);
-+    if (!buf) {
-+        av_log(ctx->logctx, AV_LOG_ERROR, "%s: error %d finding free buf\n", __func__, 0);
-+        return AVERROR(EAGAIN);
-+    }
-+    if (V4L2_TYPE_IS_MULTIPLANAR(buf->buffer.type))
-+        for (i = 0; i < drm_desc->nb_objects; i++)
-+            buf->buffer.m.planes[i].m.fd = drm_desc->objects[i].fd;
-+    else
-+        buf->buffer.m.fd = drm_desc->objects[0].fd;
-+
-+    buf->buffer.field = !frame->interlaced_frame ? V4L2_FIELD_NONE :
-+        frame->top_field_first ? V4L2_FIELD_INTERLACED_TB :
-+            V4L2_FIELD_INTERLACED_BT;
-+
-+    if (ctx->field_order != buf->buffer.field) {
-+        av_log(ctx->logctx, AV_LOG_DEBUG, "%s: Field changed: %d->%d\n", __func__, ctx->field_order, buf->buffer.field);
-+        ctx->field_order = buf->buffer.field;
-+    }
-+
-+    buf->buffer.timestamp = pts_track_add_frame(&ctx->track, frame);
-+
-+    buf->drm_frame.objects[0].fd = drm_desc->objects[0].fd;
-+
-+    av_frame_move_ref(&buf->frame, frame);
-+
-+    return deint_v4l2m2m_enqueue_buffer(buf);
-+}
-+
-+static void deint_v4l2m2m_destroy_context(DeintV4L2M2MContextShared *ctx)
-+{
-+    if (atomic_fetch_sub(&ctx->refcount, 1) == 1) {
-+        V4L2Queue *capture = &ctx->capture;
-+        V4L2Queue *output  = &ctx->output;
-+        int i;
-+
-+        av_log(NULL, AV_LOG_DEBUG, "%s - destroying context\n", __func__);
-+
-+        if (ctx->fd >= 0) {
-+            deint_v4l2m2m_streamoff(capture);
-+            deint_v4l2m2m_streamoff(output);
-+        }
-+
-+        if (capture->buffers)
-+            for (i = 0; i < capture->num_buffers; i++) {
-+                capture->buffers[i].q = NULL;
-+                if (capture->buffers[i].fd >= 0)
-+                    close(capture->buffers[i].fd);
-+            }
-+
-+        deint_v4l2m2m_unref_queued(output);
-+
-+        av_buffer_unref(&ctx->hw_frames_ctx);
-+
-+        if (capture->buffers)
-+            av_free(capture->buffers);
-+
-+        if (output->buffers)
-+            av_free(output->buffers);
-+
-+        if (ctx->fd >= 0) {
-+            close(ctx->fd);
-+            ctx->fd = -1;
-+        }
-+
-+        av_free(ctx);
-+    }
-+}
-+
-+static void v4l2_free_buffer(void *opaque, uint8_t *unused)
-+{
-+    V4L2Buffer *buf                = opaque;
-+    DeintV4L2M2MContextShared *ctx = buf->q->ctx;
-+
-+    if (!ctx->done)
-+        deint_v4l2m2m_enqueue_buffer(buf);
-+
-+    deint_v4l2m2m_destroy_context(ctx);
-+}
-+
-+static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf, int height)
-+{
-+    int av_pix_fmt = AV_PIX_FMT_YUV420P;
-+    AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame;
-+    AVDRMLayerDescriptor *layer;
-+
-+    /* fill the DRM frame descriptor */
-+    drm_desc->nb_objects = avbuf->num_planes;
-+    drm_desc->nb_layers = 1;
-+
-+    layer = &drm_desc->layers[0];
-+    layer->nb_planes = avbuf->num_planes;
-+
-+    for (int i = 0; i < avbuf->num_planes; i++) {
-+        layer->planes[i].object_index = i;
-+        layer->planes[i].offset = 0;
-+        layer->planes[i].pitch = avbuf->plane_info[i].bytesperline;
-+    }
-+
-+    switch (av_pix_fmt) {
-+    case AV_PIX_FMT_YUYV422:
-+
-+        layer->format = DRM_FORMAT_YUYV;
-+        layer->nb_planes = 1;
-+
-+        break;
-+
-+    case AV_PIX_FMT_NV12:
-+    case AV_PIX_FMT_NV21:
-+
-+        layer->format = av_pix_fmt == AV_PIX_FMT_NV12 ?
-+            DRM_FORMAT_NV12 : DRM_FORMAT_NV21;
-+
-+        if (avbuf->num_planes > 1)
-+            break;
-+
-+        layer->nb_planes = 2;
-+
-+        layer->planes[1].object_index = 0;
-+        layer->planes[1].offset = avbuf->plane_info[0].bytesperline *
-+            height;
-+        layer->planes[1].pitch = avbuf->plane_info[0].bytesperline;
-+        break;
-+
-+    case AV_PIX_FMT_YUV420P:
-+
-+        layer->format = DRM_FORMAT_YUV420;
-+
-+        if (avbuf->num_planes > 1)
-+            break;
-+
-+        layer->nb_planes = 3;
-+
-+        layer->planes[1].object_index = 0;
-+        layer->planes[1].offset = avbuf->plane_info[0].bytesperline *
-+            height;
-+        layer->planes[1].pitch = avbuf->plane_info[0].bytesperline >> 1;
-+
-+        layer->planes[2].object_index = 0;
-+        layer->planes[2].offset = layer->planes[1].offset +
-+            ((avbuf->plane_info[0].bytesperline *
-+              height) >> 2);
-+        layer->planes[2].pitch = avbuf->plane_info[0].bytesperline >> 1;
-+        break;
-+
-+    default:
-+        drm_desc->nb_layers = 0;
-+        break;
-+    }
-+
-+    return (uint8_t *) drm_desc;
-+}
-+
-+// timeout in ms
-+static int deint_v4l2m2m_dequeue_frame(V4L2Queue *queue, AVFrame* frame, int timeout)
-+{
-+    DeintV4L2M2MContextShared *ctx = queue->ctx;
-+    V4L2Buffer* avbuf;
-+
-+    av_log(ctx->logctx, AV_LOG_TRACE, "<<< %s\n", __func__);
-+
-+    avbuf = deint_v4l2m2m_dequeue_buffer(queue, timeout);
-+    if (!avbuf) {
-+        av_log(ctx->logctx, AV_LOG_DEBUG, "%s: No buffer to dequeue (timeout=%d)\n", __func__, timeout);
-+        return AVERROR(EAGAIN);
-+    }
-+
-+    // Fill in PTS and ancillary info from src frame
-+    // we will want to overwrite some fields as only the pts/dts
-+    // fields are updated with new timing in this fn
-+    pts_track_get_frame(&ctx->track, avbuf->buffer.timestamp, frame);
-+
-+    frame->buf[0] = av_buffer_create((uint8_t *) &avbuf->drm_frame,
-+                            sizeof(avbuf->drm_frame), v4l2_free_buffer,
-+                            avbuf, AV_BUFFER_FLAG_READONLY);
-+    if (!frame->buf[0]) {
-+        av_log(ctx->logctx, AV_LOG_ERROR, "%s: error %d creating buffer\n", __func__, 0);
-+        return AVERROR(ENOMEM);
-+    }
-+
-+    atomic_fetch_add(&ctx->refcount, 1);
-+
-+    frame->data[0] = (uint8_t *)v4l2_get_drm_frame(avbuf, ctx->orig_height);
-+    frame->format = AV_PIX_FMT_DRM_PRIME;
-+    if (ctx->hw_frames_ctx)
-+        frame->hw_frames_ctx = av_buffer_ref(ctx->hw_frames_ctx);
-+    frame->height = ctx->height;
-+    frame->width = ctx->width;
-+
-+    // Not interlaced now
-+    frame->interlaced_frame = 0;
-+    frame->top_field_first = 0;
-+    // Pkt duration halved
-+    frame->pkt_duration /= 2;
-+
-+    if (avbuf->buffer.flags & V4L2_BUF_FLAG_ERROR) {
-+        av_log(ctx->logctx, AV_LOG_ERROR, "driver decode error\n");
-+        frame->decode_error_flags |= FF_DECODE_ERROR_INVALID_BITSTREAM;
-+    }
-+
-+    av_log(ctx->logctx, AV_LOG_TRACE, ">>> %s: PTS=%"PRId64"\n", __func__, frame->pts);
-+    return 0;
-+}
-+
-+static int deint_v4l2m2m_config_props(AVFilterLink *outlink)
-+{
-+    AVFilterLink *inlink           = outlink->src->inputs[0];
-+    AVFilterContext *avctx         = outlink->src;
-+    DeintV4L2M2MContext *priv      = avctx->priv;
-+    DeintV4L2M2MContextShared *ctx = priv->shared;
-+    int ret;
-+
-+    ctx->height = avctx->inputs[0]->h;
-+    ctx->width = avctx->inputs[0]->w;
-+
-+    av_log(priv, AV_LOG_DEBUG, "%s: %dx%d\n", __func__, ctx->width, ctx->height);
-+
-+    outlink->time_base           = inlink->time_base;
-+    outlink->w                   = inlink->w;
-+    outlink->h                   = inlink->h;
-+    outlink->sample_aspect_ratio = inlink->sample_aspect_ratio;
-+    outlink->format              = inlink->format;
-+    outlink->frame_rate = (AVRational) {1, 0};  // Deny knowledge of frame rate
-+
-+    ret = deint_v4l2m2m_find_device(ctx);
-+    if (ret)
-+        return ret;
-+
-+    if (inlink->hw_frames_ctx) {
-+        ctx->hw_frames_ctx = av_buffer_ref(inlink->hw_frames_ctx);
-+        if (!ctx->hw_frames_ctx)
-+            return AVERROR(ENOMEM);
-+    }
-+    return 0;
-+}
-+
-+static int deint_v4l2m2m_query_formats(AVFilterContext *avctx)
-+{
-+    static const enum AVPixelFormat pixel_formats[] = {
-+        AV_PIX_FMT_DRM_PRIME,
-+        AV_PIX_FMT_YUV420P,
-+        AV_PIX_FMT_NONE,
-+    };
-+
-+    return ff_set_common_formats(avctx, ff_make_format_list(pixel_formats));
-+}
-+
-+static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in)
-+{
-+    AVFilterContext *avctx         = link->dst;
-+    DeintV4L2M2MContext *priv      = avctx->priv;
-+    DeintV4L2M2MContextShared *ctx = priv->shared;
-+    V4L2Queue *capture             = &ctx->capture;
-+    V4L2Queue *output              = &ctx->output;
-+    int ret;
-+
-+    av_log(priv, AV_LOG_DEBUG, "<<< %s: input pts: %"PRId64" (%"PRId64") field :%d interlaced: %d aspect:%d/%d\n",
-+          __func__, in->pts, AV_NOPTS_VALUE, in->top_field_first, in->interlaced_frame, in->sample_aspect_ratio.num, in->sample_aspect_ratio.den);
-+    av_log(priv, AV_LOG_DEBUG, "--- %s: in status in %d/ot %d; out status in %d/out %d\n", __func__,
-+           avctx->inputs[0]->status_in, avctx->inputs[0]->status_out, avctx->outputs[0]->status_in, avctx->outputs[0]->status_out);
-+
-+    if (ctx->field_order == V4L2_FIELD_ANY) {
-+        AVDRMFrameDescriptor *drm_desc = (AVDRMFrameDescriptor *)in->data[0];
-+        ctx->orig_width = drm_desc->layers[0].planes[0].pitch;
-+        ctx->orig_height = drm_desc->layers[0].planes[1].offset / ctx->orig_width;
-+
-+        av_log(priv, AV_LOG_DEBUG, "%s: %dx%d (%d,%d)\n", __func__, ctx->width, ctx->height,
-+           drm_desc->layers[0].planes[0].pitch, drm_desc->layers[0].planes[1].offset);
-+
-+        if (in->top_field_first)
-+            ctx->field_order = V4L2_FIELD_INTERLACED_TB;
-+        else
-+            ctx->field_order = V4L2_FIELD_INTERLACED_BT;
-+
-+        ret = deint_v4l2m2m_set_format(output, ctx->field_order, ctx->width, ctx->height, ctx->orig_width, drm_desc->layers[0].planes[1].offset);
-+        if (ret)
-+            return ret;
-+
-+        ret = deint_v4l2m2m_set_format(capture, V4L2_FIELD_NONE, ctx->width, ctx->height, ctx->orig_width, drm_desc->layers[0].planes[1].offset);
-+        if (ret)
-+            return ret;
-+
-+        ret = deint_v4l2m2m_allocate_buffers(capture);
-+        if (ret)
-+            return ret;
-+
-+        ret = deint_v4l2m2m_streamon(capture);
-+        if (ret)
-+            return ret;
-+
-+        ret = deint_v4l2m2m_allocate_buffers(output);
-+        if (ret)
-+            return ret;
-+
-+        ret = deint_v4l2m2m_streamon(output);
-+        if (ret)
-+            return ret;
-+    }
-+
-+    ret = deint_v4l2m2m_enqueue_frame(output, in);
-+
-+    av_log(priv, AV_LOG_TRACE, ">>> %s: %s\n", __func__, av_err2str(ret));
-+    return ret;
-+}
-+
-+static int deint_v4l2m2m_activate(AVFilterContext *avctx)
-+{
-+    DeintV4L2M2MContext * const priv = avctx->priv;
-+    DeintV4L2M2MContextShared *const s = priv->shared;
-+    AVFilterLink * const outlink = avctx->outputs[0];
-+    AVFilterLink * const inlink = avctx->inputs[0];
-+    int n = 0;
-+    int cn = 99;
-+    int instatus = 0;
-+    int64_t inpts = 0;
-+    int did_something = 0;
-+
-+    av_log(priv, AV_LOG_TRACE, "<<< %s\n", __func__);
-+
-+    FF_FILTER_FORWARD_STATUS_BACK_ALL(outlink, avctx);
-+
-+    ff_inlink_acknowledge_status(inlink, &instatus, &inpts);
-+
-+    if (!ff_outlink_frame_wanted(outlink)) {
-+        av_log(priv, AV_LOG_TRACE, "%s: Not wanted out\n", __func__);
-+    }
-+    else if (s->field_order != V4L2_FIELD_ANY)  // Can't DQ if no setup!
-+    {
-+        AVFrame * frame = av_frame_alloc();
-+        int rv;
-+
-+again:
-+        recycle_q(&s->output);
-+        n = count_enqueued(&s->output);
-+
-+        if (frame == NULL) {
-+            av_log(priv, AV_LOG_ERROR, "%s: error allocating frame\n", __func__);
-+            return AVERROR(ENOMEM);
-+        }
-+
-+        rv = deint_v4l2m2m_dequeue_frame(&s->capture, frame, n > 4 ? 300 : 0);
-+        if (rv != 0) {
-+            av_frame_free(&frame);
-+            if (rv != AVERROR(EAGAIN)) {
-+                av_log(priv, AV_LOG_ERROR, ">>> %s: DQ fail: %s\n", __func__, av_err2str(rv));
-+                return rv;
-+            }
-+        }
-+        else {
-+            frame->interlaced_frame = 0;
-+            // frame is always consumed by filter_frame - even on error despite
-+            // a somewhat confusing comment in the header
-+            rv = ff_filter_frame(outlink, frame);
-+
-+            if (instatus != 0) {
-+                av_log(priv, AV_LOG_TRACE, "%s: eof loop\n", __func__);
-+                goto again;
-+            }
-+
-+            av_log(priv, AV_LOG_TRACE, "%s: Filtered: %s\n", __func__, av_err2str(rv));
-+            did_something = 1;
-+        }
-+
-+        cn = count_enqueued(&s->capture);
-+    }
-+
-+    if (instatus != 0) {
-+        ff_outlink_set_status(outlink, instatus, inpts);
-+        av_log(priv, AV_LOG_TRACE, ">>> %s: Status done: %s\n", __func__, av_err2str(instatus));
-+        return 0;
-+    }
-+
-+    {
-+        AVFrame * frame;
-+        int rv;
-+
-+        recycle_q(&s->output);
-+        n = count_enqueued(&s->output);
-+
-+        while (n < 6) {
-+            if ((rv = ff_inlink_consume_frame(inlink, &frame)) < 0) {
-+                av_log(priv, AV_LOG_ERROR, "%s: consume in failed: %s\n", __func__, av_err2str(rv));
-+                return rv;
-+            }
-+
-+            if (frame == NULL) {
-+                av_log(priv, AV_LOG_TRACE, "%s: No frame\n", __func__);
-+                break;
-+            }
-+
-+            deint_v4l2m2m_filter_frame(inlink, frame);
-+            av_log(priv, AV_LOG_TRACE, "%s: Q frame\n", __func__);
-+            ++n;
-+        }
-+    }
-+
-+    if (n < 6) {
-+        ff_inlink_request_frame(inlink);
-+        did_something = 1;
-+        av_log(priv, AV_LOG_TRACE, "%s: req frame\n", __func__);
-+    }
-+
-+    if (n > 4 && ff_outlink_frame_wanted(outlink)) {
-+        ff_filter_set_ready(avctx, 1);
-+        did_something = 1;
-+        av_log(priv, AV_LOG_TRACE, "%s: ready\n", __func__);
-+    }
-+
-+    av_log(priv, AV_LOG_TRACE, ">>> %s: OK (n=%d, cn=%d)\n", __func__, n, cn);
-+    return did_something ? 0 : FFERROR_NOT_READY;
-+}
-+
-+static av_cold int deint_v4l2m2m_init(AVFilterContext *avctx)
-+{
-+    DeintV4L2M2MContext * const priv = avctx->priv;
-+    DeintV4L2M2MContextShared * const ctx = av_mallocz(sizeof(DeintV4L2M2MContextShared));
-+
-+    if (!ctx) {
-+        av_log(priv, AV_LOG_ERROR, "%s: error %d allocating context\n", __func__, 0);
-+        return AVERROR(ENOMEM);
-+    }
-+    priv->shared = ctx;
-+    ctx->logctx = priv;
-+    ctx->fd = -1;
-+    ctx->output.ctx = ctx;
-+    ctx->output.num_buffers = 8;
-+    ctx->capture.ctx = ctx;
-+    ctx->capture.num_buffers = 12;
-+    ctx->done = 0;
-+    ctx->field_order = V4L2_FIELD_ANY;
-+
-+    pts_track_init(&ctx->track, priv);
-+
-+    atomic_init(&ctx->refcount, 1);
-+
-+    return 0;
-+}
-+
-+static void deint_v4l2m2m_uninit(AVFilterContext *avctx)
-+{
-+    DeintV4L2M2MContext *priv = avctx->priv;
-+    DeintV4L2M2MContextShared *ctx = priv->shared;
-+
-+    ctx->done = 1;
-+    ctx->logctx = NULL;  // Log to NULL works, log to missing crashes
-+    pts_track_uninit(&ctx->track);
-+    deint_v4l2m2m_destroy_context(ctx);
-+}
-+
-+static const AVOption deinterlace_v4l2m2m_options[] = {
-+    { NULL },
-+};
-+
-+AVFILTER_DEFINE_CLASS(deinterlace_v4l2m2m);
-+
-+static const AVFilterPad deint_v4l2m2m_inputs[] = {
-+    {
-+        .name         = "default",
-+        .type         = AVMEDIA_TYPE_VIDEO,
-+    },
-+    { NULL }
-+};
-+
-+static const AVFilterPad deint_v4l2m2m_outputs[] = {
-+    {
-+        .name          = "default",
-+        .type          = AVMEDIA_TYPE_VIDEO,
-+        .config_props  = deint_v4l2m2m_config_props,
-+    },
-+    { NULL }
-+};
-+
-+AVFilter ff_vf_deinterlace_v4l2m2m = {
-+    .name           = "deinterlace_v4l2m2m",
-+    .description    = NULL_IF_CONFIG_SMALL("V4L2 M2M deinterlacer"),
-+    .priv_size      = sizeof(DeintV4L2M2MContext),
-+    .init           = &deint_v4l2m2m_init,
-+    .uninit         = &deint_v4l2m2m_uninit,
-+    .query_formats  = &deint_v4l2m2m_query_formats,
-+    .inputs         = deint_v4l2m2m_inputs,
-+    .outputs        = deint_v4l2m2m_outputs,
-+    .priv_class     = &deinterlace_v4l2m2m_class,
-+    .activate       = deint_v4l2m2m_activate,
-+};
---- /dev/null
-+++ b/libavfilter/vf_unsand.c
-@@ -0,0 +1,234 @@
-+/*
-+ * Copyright (c) 2007 Bobby Bingham
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+/**
-+ * @file
-+ * unsand video filter: convert sand pixel formats to planar YUV
-+ */
-+
-+#include <string.h>
-+
-+#include "libavutil/internal.h"
-+#include "libavutil/mem.h"
-+#include "libavutil/pixdesc.h"
-+#include "libavutil/opt.h"
-+#include "libavutil/rpi_sand_fns.h"
-+
-+#include "avfilter.h"
-+#include "formats.h"
-+#include "internal.h"
-+#include "video.h"
-+
-+typedef struct UnsandContext {
-+    const AVClass *class;
-+} UnsandContext;
-+
-+static av_cold void uninit(AVFilterContext *ctx)
-+{
-+//    UnsandContext *s = ctx->priv;
-+}
-+
-+static av_cold int init(AVFilterContext *ctx)
-+{
-+//    UnsandContext *s = ctx->priv;
-+
-+    return 0;
-+}
-+
-+
-+static int filter_frame(AVFilterLink *link, AVFrame *in)
-+{
-+    AVFilterLink * const outlink = link->dst->outputs[0];
-+    AVFrame *out = NULL;
-+    int rv = 0;
-+
-+    if (outlink->format == in->format) {
-+        // If nothing to do then do nothing
-+        out = in;
-+    }
-+    else
-+    {
-+        if ((out = ff_get_video_buffer(outlink, av_frame_cropped_width(in), av_frame_cropped_height(in))) == NULL)
-+        {
-+            rv = AVERROR(ENOMEM);
-+            goto fail;
-+        }
-+        if (av_rpi_sand_to_planar_frame(out, in) != 0)
-+        {
-+            rv = -1;
-+            goto fail;
-+        }
-+
-+        av_frame_free(&in);
-+    }
-+
-+    return ff_filter_frame(outlink, out);
-+
-+fail:
-+    av_frame_free(&out);
-+    av_frame_free(&in);
-+    return rv;
-+}
-+
-+#if 0
-+static void dump_fmts(const AVFilterFormats * fmts)
-+{
-+    int i;
-+    if (fmts== NULL) {
-+        printf("NULL\n");
-+        return;
-+    }
-+    for (i = 0; i < fmts->nb_formats; ++i) {
-+        printf(" %d", fmts->formats[i]);
-+    }
-+    printf("\n");
-+}
-+#endif
-+
-+static int query_formats(AVFilterContext *ctx)
-+{
-+//    UnsandContext *s = ctx->priv;
-+    int ret;
-+
-+    // If we aren't connected at both ends then just do nothing
-+    if (ctx->inputs[0] == NULL || ctx->outputs[0] == NULL)
-+        return 0;
-+
-+//    printf("Unsand: %s in: ", __func__);
-+//    dump_fmts(ctx->inputs[0]->in_formats);
-+//    printf("Unsand: %s out: ", __func__);
-+//    dump_fmts(ctx->outputs[0]->out_formats);
-+
-+    // Our output formats depend on our input formats and we can't/don't
-+    // want to convert between bit depths so we need to wait for the source
-+    // to have an opinion before we do
-+    if (ctx->inputs[0]->in_formats == NULL)
-+        return AVERROR(EAGAIN);
-+
-+    // Accept anything
-+    if (ctx->inputs[0]->out_formats == NULL &&
-+        (ret = ff_formats_ref(ctx->inputs[0]->in_formats, &ctx->inputs[0]->out_formats)) < 0)
-+        return ret;
-+
-+    // Filter out sand formats
-+
-+    // Generate a container if we don't already have one
-+    if (ctx->outputs[0]->in_formats == NULL)
-+    {
-+        // Somewhat rubbish way of ensuring we have a good structure
-+        const static enum AVPixelFormat out_fmts[] = {AV_PIX_FMT_YUV420P10, AV_PIX_FMT_YUV420P, AV_PIX_FMT_NONE};
-+        AVFilterFormats *formats = ff_make_format_list(out_fmts);
-+
-+        if (formats == NULL)
-+            return AVERROR(ENOMEM);
-+        if ((ret = ff_formats_ref(formats, &ctx->outputs[0]->in_formats)) < 0)
-+            return ret;
-+    }
-+
-+    // Replace old format list with new filtered list derived from what our
-+    // input says it can do
-+    {
-+        const AVFilterFormats * const src_ff = ctx->inputs[0]->out_formats;
-+        AVFilterFormats * const dst_ff = ctx->outputs[0]->in_formats;
-+        enum AVPixelFormat *dst_fmts = av_malloc(sizeof(enum AVPixelFormat) * src_ff->nb_formats);
-+        int i;
-+        int n = 0;
-+        int seen_420p = 0;
-+        int seen_420p10 = 0;
-+
-+        for (i = 0; i < src_ff->nb_formats; ++i) {
-+            const enum AVPixelFormat f = src_ff->formats[i];
-+
-+            switch (f){
-+                case AV_PIX_FMT_YUV420P:
-+                case AV_PIX_FMT_SAND128:
-+                case AV_PIX_FMT_RPI4_8:
-+                    if (!seen_420p) {
-+                        seen_420p = 1;
-+                        dst_fmts[n++] = AV_PIX_FMT_YUV420P;
-+                    }
-+                    break;
-+                case AV_PIX_FMT_SAND64_10:
-+                case AV_PIX_FMT_YUV420P10:
-+                case AV_PIX_FMT_RPI4_10:
-+                    if (!seen_420p10) {
-+                        seen_420p10 = 1;
-+                        dst_fmts[n++] = AV_PIX_FMT_YUV420P10;
-+                    }
-+                    break;
-+                default:
-+                    dst_fmts[n++] = f;
-+                    break;
-+            }
-+        }
-+
-+        av_freep(&dst_ff->formats);
-+        dst_ff->formats = dst_fmts;
-+        dst_ff->nb_formats = n;
-+    }
-+
-+//    printf("Unsand: %s calc: ", __func__);
-+//    dump_fmts(ctx->outputs[0]->in_formats);
-+
-+    return 0;
-+}
-+
-+
-+#define OFFSET(x) offsetof(UnsandContext, x)
-+static const AVOption unsand_options[] = {
-+    { NULL }
-+};
-+
-+
-+AVFILTER_DEFINE_CLASS(unsand);
-+
-+static const AVFilterPad avfilter_vf_unsand_inputs[] = {
-+    {
-+        .name             = "default",
-+        .type             = AVMEDIA_TYPE_VIDEO,
-+        .filter_frame = filter_frame,
-+    },
-+    { NULL }
-+};
-+
-+static const AVFilterPad avfilter_vf_unsand_outputs[] = {
-+    {
-+        .name = "default",
-+        .type = AVMEDIA_TYPE_VIDEO
-+    },
-+    { NULL }
-+};
-+
-+AVFilter ff_vf_unsand = {
-+    .name          = "unsand",
-+    .description   = NULL_IF_CONFIG_SMALL("Convert sand pix fmt to yuv"),
-+
-+    .init          = init,
-+    .uninit        = uninit,
-+
-+    .query_formats = query_formats,
-+
-+    .priv_size     = sizeof(UnsandContext),
-+    .priv_class    = &unsand_class,
-+
-+    .inputs        = avfilter_vf_unsand_inputs,
-+    .outputs       = avfilter_vf_unsand_outputs,
-+};
-+
---- a/libavformat/utils.c
-+++ b/libavformat/utils.c
-@@ -3051,6 +3051,40 @@ static int has_codec_parameters(AVStream
-     return 1;
- }
- 
-+#if CONFIG_HEVC_RPI_DECODER && CONFIG_HEVC_DECODER
-+// This should be quite general purpose but avoid possible conflicts
-+// by limiting usage to cases where we know it works.
-+static int try_fallback_decoder(AVCodecContext * const avctx, const AVCodec *const old_codec, AVDictionary ** const opts)
-+{
-+    // Only try fallback if we know it is supported (HEVC only)
-+    const AVCodec *const new_codec = old_codec->id != AV_CODEC_ID_HEVC ? NULL :
-+        avcodec_find_decoder_by_id_and_fmt(old_codec->id, AV_PIX_FMT_NONE);
-+    int err;
-+
-+    // Failed to find fallback or we are already at the fallback
-+    if (new_codec == NULL || new_codec == old_codec)
-+    {
-+        return AVERROR_DECODER_NOT_FOUND;
-+    }
-+
-+    // * This may be dodgy - header says to not use this fn,
-+    //   especially if we are going to reopen the context...
-+    //   (but it does seem to work for our cases)
-+    if (avcodec_is_open(avctx)) {
-+        avcodec_close(avctx);
-+    }
-+
-+    if ((err = avcodec_open2(avctx, new_codec, opts)) < 0)
-+    {
-+        return err;
-+    }
-+
-+    return 0;
-+}
-+#else
-+#define try_fallback_decoder(avctx, old_codec, opts) (AVERROR_DECODER_NOT_FOUND)
-+#endif
-+
- /* returns 1 or 0 if or if not decoded data was returned, or a negative error */
- static int try_decode_frame(AVFormatContext *s, AVStream *st,
-                             const AVPacket *avpkt, AVDictionary **options)
-@@ -3085,7 +3119,11 @@ static int try_decode_frame(AVFormatCont
-         av_dict_set(options ? options : &thread_opt, "threads", "1", 0);
-         if (s->codec_whitelist)
-             av_dict_set(options ? options : &thread_opt, "codec_whitelist", s->codec_whitelist, 0);
--        ret = avcodec_open2(avctx, codec, options ? options : &thread_opt);
-+        if ((ret = avcodec_open2(avctx, codec, options ? options : &thread_opt)) == AVERROR_DECODER_NOT_FOUND)
-+        {
-+            // Try fallback if it looks worth a try
-+            ret = try_fallback_decoder(avctx, codec, options ? options : &thread_opt);
-+        }
-         if (!options)
-             av_dict_free(&thread_opt);
-         if (ret < 0) {
-@@ -3116,6 +3154,14 @@ static int try_decode_frame(AVFormatCont
-         if (avctx->codec_type == AVMEDIA_TYPE_VIDEO ||
-             avctx->codec_type == AVMEDIA_TYPE_AUDIO) {
-             ret = avcodec_send_packet(avctx, &pkt);
-+
-+            // If we are going to want to fall back we should know here
-+            if (ret == AVERROR_DECODER_NOT_FOUND) {
-+                if ((ret = try_fallback_decoder(avctx, avctx->codec, options)) < 0)
-+                    break;
-+                continue;
-+            }
-+
-             if (ret < 0 && ret != AVERROR(EAGAIN) && ret != AVERROR_EOF)
-                 break;
-             if (ret >= 0)
-@@ -3726,9 +3772,20 @@ FF_ENABLE_DEPRECATION_WARNINGS
-         // Try to just open decoders, in case this is enough to get parameters.
-         if (!has_codec_parameters(st, NULL) && st->request_probe <= 0) {
-             if (codec && !avctx->codec)
--                if (avcodec_open2(avctx, codec, options ? &options[i] : &thread_opt) < 0)
--                    av_log(ic, AV_LOG_WARNING,
--                           "Failed to open codec in %s\n",__FUNCTION__);
-+            {
-+                int err;
-+
-+                if ((err = avcodec_open2(avctx, codec, options ? &options[i] : &thread_opt)) < 0)
-+                {
-+                    if (err == AVERROR_DECODER_NOT_FOUND) {
-+                        err = try_fallback_decoder(avctx, codec, options ? &options[i] : &thread_opt);
-+                    }
-+                    if (err < 0) {
-+                        av_log(ic, AV_LOG_WARNING,
-+                               "Failed to open codec in %s\n",__FUNCTION__);
-+                    }
-+                }
-+            }
-         }
-         if (!options)
-             av_dict_free(&thread_opt);
---- a/libavutil/Makefile
-+++ b/libavutil/Makefile
-@@ -68,6 +68,7 @@ HEADERS = adler32.h
-           rational.h                                                    \
-           replaygain.h                                                  \
-           ripemd.h                                                      \
-+	  rpi_sand_fns.h                                                \
-           samplefmt.h                                                   \
-           sha.h                                                         \
-           sha512.h                                                      \
-@@ -86,6 +87,7 @@ HEADERS = adler32.h
-           tx.h                                                          \
- 
- HEADERS-$(CONFIG_LZO)                   += lzo.h
-+HEADERS-$(CONFIG_RPI)                   += rpi_sand_fn_pw.h
- 
- ARCH_HEADERS = bswap.h                                                  \
-                intmath.h                                                \
-@@ -180,6 +182,7 @@ OBJS-$(CONFIG_LZO)
- OBJS-$(CONFIG_MEDIACODEC)               += hwcontext_mediacodec.o
- OBJS-$(CONFIG_OPENCL)                   += hwcontext_opencl.o
- OBJS-$(CONFIG_QSV)                      += hwcontext_qsv.o
-+OBJS-$(CONFIG_SAND)                     += rpi_sand_fns.o
- OBJS-$(CONFIG_VAAPI)                    += hwcontext_vaapi.o
- OBJS-$(CONFIG_VIDEOTOOLBOX)             += hwcontext_videotoolbox.o
- OBJS-$(CONFIG_VDPAU)                    += hwcontext_vdpau.o
---- a/libavutil/aarch64/Makefile
-+++ b/libavutil/aarch64/Makefile
-@@ -1,4 +1,6 @@
- OBJS += aarch64/cpu.o                                                 \
-         aarch64/float_dsp_init.o                                      \
- 
--NEON-OBJS += aarch64/float_dsp_neon.o
-+NEON-OBJS += aarch64/float_dsp_neon.o                                 \
-+             aarch64/rpi_sand_neon.o                                  \
-+
---- /dev/null
-+++ b/libavutil/aarch64/rpi_sand_neon.S
-@@ -0,0 +1,781 @@
-+/*
-+Copyright (c) 2021 Michael Eiler
-+
-+Redistribution and use in source and binary forms, with or without
-+modification, are permitted provided that the following conditions are met:
-+    * Redistributions of source code must retain the above copyright
-+      notice, this list of conditions and the following disclaimer.
-+    * Redistributions in binary form must reproduce the above copyright
-+      notice, this list of conditions and the following disclaimer in the
-+      documentation and/or other materials provided with the distribution.
-+    * Neither the name of the copyright holder nor the
-+      names of its contributors may be used to endorse or promote products
-+      derived from this software without specific prior written permission.
-+
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
-+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+
-+Authors: Michael Eiler <eiler.mike@gmail.com>
-+*/
-+
-+#include "asm.S"
-+
-+// void ff_rpi_sand8_lines_to_planar_y8(
-+//   uint8_t * dest,            : x0
-+//   unsigned int dst_stride,   : w1
-+//   const uint8_t * src,       : x2
-+//   unsigned int src_stride1,  : w3, always 128 (never read)
-+//   unsigned int src_stride2,  : w4
-+//   unsigned int _x,           : w5, never read (register reused as scratch)
-+//   unsigned int y,            : w6, never read (register reused as scratch)
-+//   unsigned int _w,           : w7
-+//   unsigned int h);           : [sp, #0]
-+// Copies h rows of _w bytes out of 128-byte-wide stripes into dest (row pitch dst_stride).
-+function ff_rpi_sand8_lines_to_planar_y8, export=1
-+    // w15 contains the number of rows we need to process
-+    ldr w15, [sp, #0]
-+
-+    // w8 will contain the number of full blocks per row
-+    // w8 = floor(_w/stride1); derived from _w (w7), not from dst_stride (w1)
-+    // stride1 is assumed to always be 128
-+    mov w8, w7
-+    lsr w8, w8, #7
-+
-+    // in case the width of the image is not a multiple of 128, there will
-+    // be an incomplete block at the end of every row
-+    // w9 contains the number of pixels stored within this block
-+    // w9 = _w - w8 * 128 (always in the range 0..127)
-+    lsl w9, w8, #7
-+    sub w9, w7, w9
-+
-+    // this is the value we have to add to the src pointer after reading a complete block
-+    // it will move the address to the start of the next block
-+    // w10 = stride2 * stride1 - stride1
-+    mov w10, w4
-+    lsl w10, w10, #7
-+    sub w10, w10, #128
-+
-+    // w11 is the row offset, meaning the start offset of the first block of every column
-+    // this will be increased with stride1 within every iteration of the row_loop
-+    eor w11, w11, w11
-+
-+    // w12 = 0, processed row count
-+    eor w12, w12, w12
-+row_loop:
-+    // start of the first block within the current row
-+    // x13 = row offset + src
-+    mov x13, x2
-+    add x13, x13, x11
-+
-+    // w14 = 0, processed block count
-+    eor w14, w14, w14
-+
-+    cmp w8, #0
-+    beq no_main_y8
-+
-+block_loop:
-+    // copy 128 bytes (a full block) into the vector registers v0-v7 and increase the src address by 128
-+    // fortunately these aren't callee saved ones, meaning we don't need to backup them
-+    ld1 { v0.16b,  v1.16b,  v2.16b,  v3.16b}, [x13], #64
-+    ld1 { v4.16b,  v5.16b,  v6.16b,  v7.16b}, [x13], #64
-+
-+    // write these registers back to the destination vector and increase the dst address by 128
-+    st1 { v0.16b,  v1.16b,  v2.16b,  v3.16b }, [x0], #64
-+    st1 { v4.16b,  v5.16b,  v6.16b,  v7.16b }, [x0], #64
-+
-+    // move the source register to the beginning of the next block (x13 = src + block offset)
-+    add x13, x13, x10
-+    // increase the block counter
-+    add w14, w14, #1
-+
-+    // continue with the block_loop if we haven't copied all full blocks yet
-+    cmp w8, w14
-+    bgt block_loop
-+
-+    // handle the last block at the end of each row
-+    // at most 127 byte values copied from src to dst
-+no_main_y8:
-+    eor w5, w5, w5 // i = 0
-+incomplete_block_loop_y8:
-+    cmp w5, w9
-+    bge incomplete_block_loop_end_y8
-+
-+    ldrb w6, [x13]
-+    strb w6, [x0]
-+    add x13, x13, #1
-+    add x0, x0, #1
-+
-+    add w5, w5, #1
-+    b incomplete_block_loop_y8
-+incomplete_block_loop_end_y8:
-+    sub w6, w1, w7          // w6 (scratch here) = dst_stride - _w = bytes of row padding
-+    add x0, x0, w6, sxtw    // skip the padding so the next row starts at dest + dst_stride
-+    // increase the row offset by 128 (stride1)
-+    add w11, w11, #128
-+    // increment the row counter
-+    add w12, w12, #1
-+
-+    // process the next row if we haven't finished yet (bottom-tested: one row is copied even if h == 0)
-+    cmp w15, w12
-+    bgt row_loop
-+
-+    ret
-+endfunc
-+
-+
-+
-+// void ff_rpi_sand8_lines_to_planar_c8(
-+//   uint8_t * dst_u,           : x0
-+//   unsigned int dst_stride_u, : w1, row pitch applied to BOTH dst_u and dst_v
-+//   uint8_t * dst_v,           : x2
-+//   unsigned int dst_stride_v, : w3, never read (dst_stride_u is used for dst_v too)
-+//   const uint8_t * src,       : x4
-+//   unsigned int stride1,      : w5, never read (128 assumed); register reused as scratch
-+//   unsigned int stride2,      : w6
-+//   unsigned int _x,           : w7, never read; register is reloaded with _w
-+//   unsigned int y,            : [sp, #0], never read
-+//   unsigned int _w,           : [sp, #8]
-+//   unsigned int h);           : [sp, #16]
-+// Splits interleaved byte pairs from 128-byte-wide stripes: even bytes go to dst_u, odd bytes to dst_v.
-+function ff_rpi_sand8_lines_to_planar_c8, export=1
-+    // w7 = width (number of byte pairs per row)
-+    ldr w7, [sp, #8]
-+
-+    // w15 contains the number of rows we need to process
-+    // counts down
-+    ldr w15, [sp, #16]
-+
-+    // number of full blocks, w8 = _w / (stride1 >> 1) == _w / 64 == _w >> 6
-+    mov w8, w7
-+    lsr w8, w8, #6
-+
-+    // number of pixels in block at the end of every row
-+    // w9 = _w - (w8 * 64)
-+    lsl w9, w8, #6
-+    sub w9, w7, w9
-+
-+    // Skip at the end of the line to account for stride (w12 = dst_stride_u - _w)
-+    sub w12, w1, w7
-+
-+    // address delta to the beginning of the next block
-+    // w10 = (stride2 * stride1 - stride1) = stride2 * 128 - 128
-+    lsl w10, w6, #7
-+    sub w10, w10, #128
-+
-+    // w11 = row address start offset = 0
-+    eor w11, w11, w11
-+
-+row_loop_c8:
-+    // start of the first block within the current row
-+    // x13 = row offset + src
-+    mov x13, x4
-+    add x13, x13, x11
-+
-+    // w14 = 0, processed block count
-+    eor w14, w14, w14
-+
-+    cmp w8, #0
-+    beq no_main_c8
-+
-+block_loop_c8:
-+    // load the full block -> 128 bytes, the block contains 64 interleaved U and V values
-+    ld2 { v0.16b,  v1.16b }, [x13], #32
-+    ld2 { v2.16b,  v3.16b }, [x13], #32
-+    ld2 { v4.16b,  v5.16b }, [x13], #32
-+    ld2 { v6.16b,  v7.16b }, [x13], #32
-+
-+    // swap registers so that v0-v3 hold the even (U) bytes and v4-v7 the odd (V) bytes
-+    mov v16.16b, v1.16b
-+    mov v17.16b, v3.16b
-+    mov v18.16b, v5.16b
-+    mov v1.16b, v2.16b
-+    mov v2.16b, v4.16b
-+    mov v3.16b, v6.16b
-+    mov v4.16b, v16.16b
-+    mov v5.16b, v17.16b
-+    mov v6.16b, v18.16b
-+
-+    st1 { v0.16b,  v1.16b,  v2.16b,  v3.16b }, [x0], #64
-+    st1 { v4.16b,  v5.16b,  v6.16b,  v7.16b }, [x2], #64
-+
-+    // increment block counter and move src to the beginning of the next block
-+    add w14, w14, #1
-+    add x13, x13, x10
-+
-+    // jump to block_loop_c8 iff the block count is smaller than the number of full blocks
-+    cmp w8, w14
-+    bgt block_loop_c8
-+
-+no_main_c8:
-+    // handle incomplete block at the end of every row
-+    eor w5, w5, w5 // w5 = index of the byte pair within the incomplete block
-+incomplete_block_loop_c8:
-+    cmp w5, w9
-+    bge incomplete_block_loop_end_c8
-+
-+    ldrb w1, [x13]      // w1 is free to clobber: the stride was already folded into w12
-+    strb w1, [x0]
-+    add x13, x13, #1
-+
-+    ldrb w1, [x13]
-+    strb w1, [x2]
-+    add x13, x13, #1
-+
-+    add x0, x0, #1
-+    add x2, x2, #1
-+
-+    add w5, w5, #1
-+    b incomplete_block_loop_c8
-+incomplete_block_loop_end_c8:
-+
-+    // increase row_offset by stride1 and skip the destination padding of both planes
-+    add w11, w11, #128
-+    add x0, x0, w12, sxtw
-+    add x2, x2, w12, sxtw
-+
-+    // jump to row_loop_c8 iff rows remain (w15 counts down; one row is processed even if h == 0)
-+    subs w15, w15, #1
-+    bgt row_loop_c8
-+
-+    ret
-+endfunc
-+
-+//void ff_rpi_sand30_lines_to_planar_c16(
-+//  uint8_t * dst_u,            // [x0]
-+//  unsigned int dst_stride_u,  // [w1] == _w*2
-+//  uint8_t * dst_v,            // [x2]
-+//  unsigned int dst_stride_v,  // [w3] == _w*2
-+//  const uint8_t * src,        // [x4]
-+//  unsigned int stride1,       // [w5] == 128
-+//  unsigned int stride2,       // [w6] 
-+//  unsigned int _x,            // [w7] == 0
-+//  unsigned int y,             // [sp, #0] == 0
-+//  unsigned int _w,            // [sp, #8] -> w3
-+//  unsigned int h);            // [sp, #16] -> w7
-+
-+.macro rpi_sand30_lines_to_planar_c16_block_half
-+    ld1 { v0.4s,  v1.4s, v2.4s, v3.4s }, [x13], #64   // read 16 32-bit words, x13 += 64
-+    // Unpack 3 fields per word (bits 0-9, 10-19, 20-29) to 16 bits each and spill 96 bytes at [sp]
-+    xtn v4.4h, v0.4s                // low 16 bits of each word of v0
-+    ushr v0.4s, v0.4s, #10
-+    xtn v5.4h, v0.4s                // bits 10-25
-+    ushr v0.4s, v0.4s, #10
-+    xtn v6.4h, v0.4s                // bits 20-31
-+    xtn2 v4.8h, v1.4s               // same for v1 into the upper halves
-+    ushr v1.4s, v1.4s, #10
-+    xtn2 v5.8h, v1.4s
-+    ushr v1.4s, v1.4s, #10
-+    xtn2 v6.8h, v1.4s
-+    and v4.16b, v4.16b, v16.16b     // v16 is supplied by the caller (0x03ff per halfword in ..._c16)
-+    and v5.16b, v5.16b, v16.16b
-+    and v6.16b, v6.16b, v16.16b
-+    st3 { v4.8h, v5.8h, v6.8h }, [sp], #48   // interleave back into source order; sp temporarily += 48
-+    // second half: identical unpacking for v2/v3
-+    xtn v4.4h, v2.4s
-+    ushr v2.4s, v2.4s, #10
-+    xtn v5.4h, v2.4s
-+    ushr v2.4s, v2.4s, #10
-+    xtn v6.4h, v2.4s
-+    xtn2 v4.8h, v3.4s
-+    ushr v3.4s, v3.4s, #10
-+    xtn2 v5.8h, v3.4s
-+    ushr v3.4s, v3.4s, #10
-+    xtn2 v6.8h, v3.4s
-+    and v4.16b, v4.16b, v16.16b
-+    and v5.16b, v5.16b, v16.16b
-+    and v6.16b, v6.16b, v16.16b
-+    st3 { v4.8h, v5.8h, v6.8h }, [sp]        // lands at original sp + 48 .. sp + 96
-+    sub sp, sp, #48                          // restore sp to its value on entry to the macro
-+.endm
-+
-+function ff_rpi_sand30_lines_to_planar_c16, export=1
-+    stp x19, x20, [sp, #-48]!
-+    stp x21, x22, [sp, #16]
-+    stp x23, x24, [sp, #32]
-+    // Unpacks 10-bit samples (3 per 32-bit word) to 16 bits; alternating samples go to dst_u (x0) / dst_v (x2)
-+    ldr w3, [sp, #48+8]    // w3 = width
-+    ldr w7, [sp, #48+16]   // w7 = height
-+
-+    // reserve space on the stack for intermediate results
-+    sub sp, sp, #256
-+
-+    // number of 128byte blocks per row, w8 = width / 48
-+    mov w9, #48
-+    udiv w8, w3, w9
-+
-+    // remaining pixels (rem_pix) per row, w9 = width - w8 * 48
-+    mul w9, w8, w9
-+    sub w9, w3, w9
-+
-+    // row offset, the beginning of the next row to process
-+    eor w10, w10, w10
-+
-+    // offset to the beginning of the next block, w11 = stride2 * 128 - 128
-+    lsl w11, w6, #7
-+    sub w11, w11, #128
-+
-+    // decrease the height by one and in case of remaining pixels increase the block count by one
-+    sub w7, w7, #1
-+    cmp w9, #0
-+    cset w19, ne    // w19 == 1 iff remaining pixels != 0
-+    add w8, w8, w19
-+
-+    // bytes we have to move dst by at the end of every row, w21 = dst_stride_u - 96 * w8
-+    mov w21, #48*2
-+    mul w21, w21, w8
-+    sub w21, w1, w21
-+
-+    mov w20, #0     // w20 = flag, last row processed
-+
-+    mov x12, #0x03ff03ff03ff03ff    // 10-bit mask per halfword, used by the block_half macro via v16
-+    dup v16.2d, x12
-+
-+    // iterate through rows, row counter = w12 = 0
-+    eor w12, w12, w12
-+row_loop_c16:
-+    cmp w12, w7
-+    bge row_loop_c16_fin
-+
-+    // address of row data = src + row_offset
-+    mov x13, x4
-+    add x13, x13, x10
-+
-+    eor w14, w14, w14   // w14 = processed block count
-+block_loop_c16:
-+    cmp w14, w8
-+    bge block_loop_c16_fin
-+
-+    rpi_sand30_lines_to_planar_c16_block_half
-+    // re-read the 48 spilled samples de-interleaved: even -> v0/v2/v4, odd -> v1/v3/v5
-+    ld2 { v0.8h, v1.8h }, [sp], #32
-+    ld2 { v2.8h, v3.8h }, [sp], #32
-+    ld2 { v4.8h, v5.8h }, [sp]
-+    sub sp, sp, #64
-+
-+    st1 { v0.8h }, [x0], #16
-+    st1 { v2.8h }, [x0], #16
-+    st1 { v4.8h }, [x0], #16
-+    st1 { v1.8h }, [x2], #16
-+    st1 { v3.8h }, [x2], #16
-+    st1 { v5.8h }, [x2], #16
-+
-+    rpi_sand30_lines_to_planar_c16_block_half
-+    // second 64 bytes of the block, same de-interleave
-+    ld2 { v0.8h, v1.8h }, [sp], #32
-+    ld2 { v2.8h, v3.8h }, [sp], #32
-+    ld2 { v4.8h, v5.8h }, [sp]
-+    sub sp, sp, #64
-+
-+    st1 { v0.8h }, [x0], #16
-+    st1 { v2.8h }, [x0], #16
-+    st1 { v4.8h }, [x0], #16
-+    st1 { v1.8h }, [x2], #16
-+    st1 { v3.8h }, [x2], #16
-+    st1 { v5.8h }, [x2], #16
-+
-+    add x13, x13, x11 // offset to next block
-+    add w14, w14, #1
-+    b block_loop_c16
-+block_loop_c16_fin:
-+
-+    add w10, w10, #128
-+    add w12, w12, #1
-+    add x0, x0, w21, sxtw  // advance dst pointers by w21 to the start of the next row
-+    add x2, x2, w21, sxtw
-+    b row_loop_c16
-+row_loop_c16_fin:
-+    // first time here: h-1 rows are done; run the last row with full blocks only
-+    cmp w20, #1
-+    beq row_loop_c16_fin2
-+    mov w20, #1
-+    sub w8, w8, w19 // decrease block count by w19
-+    add w7, w7, #1 // increase height
-+    b row_loop_c16
-+
-+row_loop_c16_fin2:
-+    sub x0, x0, w21, sxtw // undo the end-of-row adjustment for the last row
-+    sub x2, x2, w21, sxtw // so that we can write out the few remaining pixels
-+
-+    // last incomplete block to be finished
-+    // read operations are fine, stride2 is more than large enough even if rem_pix is 0
-+    rpi_sand30_lines_to_planar_c16_block_half
-+    ld2 { v0.8h, v1.8h }, [sp], #32   // these loads only serve to step sp past the first 96 bytes
-+    ld2 { v2.8h, v3.8h }, [sp], #32
-+    ld2 { v4.8h, v5.8h }, [sp], #32
-+    rpi_sand30_lines_to_planar_c16_block_half
-+    ld2 { v0.8h, v1.8h }, [sp], #32
-+    ld2 { v2.8h, v3.8h }, [sp], #32
-+    ld2 { v4.8h, v5.8h }, [sp]
-+    sub sp, sp, #160                  // back to the start of the 192 spilled bytes
-+
-+    mov x4, sp
-+    eor w20, w20, w20
-+rem_pix_c16_loop:
-+    cmp w20, w9
-+    bge rem_pix_c16_fin
-+
-+    ldr w22, [x4], #4        // low half = dst_u sample, high half = dst_v sample
-+    strh w22, [x0], #2       // halfword store: a 32-bit str would write 2 bytes past the sample
-+    lsr w22, w22, #16
-+    strh w22, [x2], #2
-+
-+    add w20, w20, #1
-+    b rem_pix_c16_loop
-+rem_pix_c16_fin:
-+
-+    add sp, sp, #256
-+
-+    ldp x23, x24, [sp, #32]
-+    ldp x21, x22, [sp, #16]
-+    ldp x19, x20, [sp], #48
-+    ret
-+endfunc
-+
-+
-+
-+//void ff_rpi_sand30_lines_to_planar_p010(
-+//  uint8_t * dest,
-+//  unsigned int dst_stride,
-+//  const uint8_t * src,
-+//  unsigned int src_stride1,
-+//  unsigned int src_stride2,
-+//  unsigned int _x,
-+//  unsigned int y,
-+//  unsigned int _w,
-+//  unsigned int h);
-+
-+// void ff_rpi_sand30_lines_to_planar_y16(
-+//   uint8_t * dest,            : x0
-+//   unsigned int dst_stride,   : w1
-+//   const uint8_t * src,       : x2
-+//   unsigned int src_stride1,  : w3, always 128 (never read)
-+//   unsigned int src_stride2,  : w4
-+//   unsigned int _x,           : w5, never read (register reused as pixel countdown)
-+//   unsigned int y,            : w6
-+//   unsigned int _w,           : w7
-+//   unsigned int h);           : [sp, #0]
-+//
-+// Assumes that we are starting on a stripe boundary and that overreading
-+// within the stripe is OK. However it does respect the dest size for writing.
-+// Each 32-bit source word is unpacked into three 10-bit samples, stored as 16-bit values.
-+function ff_rpi_sand30_lines_to_planar_y16, export=1
-+                lsl             w4,  w4,  #7                    // w4 = src_stride2 * 128
-+                sub             w4,  w4,  #64                   // less the 64 bytes the first ld1 advances
-+                sub             w1,  w1,  w7, lsl #1            // w1 = dst_stride - 2 * _w (row padding in bytes)
-+                uxtw            x6,  w6
-+                add             x8,  x2,  x6, lsl #7            // x8 = src + y * 128
-+                ldr             w6,  [sp, #0]                   // w6 = h, counts down
-+
-+10:
-+                mov             x2,  x8
-+                mov             w5,  w7                         // w5 = pixels still to write in this row
-+1:
-+                ld1             {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], #64
-+                ld1             {v4.4s, v5.4s, v6.4s, v7.4s}, [x2], x4
-+
-+                subs            w5,  w5,  #96                   // 32 words * 3 pixels; flags are consumed by blt/bne below
-+
-+                // v0, v1 -> v16 (bits 0-9), v17 (bits 10-19), v18 (bits 20-29)
-+
-+                shrn            v18.4h,  v0.4s,   #14
-+                xtn             v16.4h,  v0.4s
-+                shrn            v17.4h,  v0.4s,   #10
-+
-+                shrn2           v18.8h,  v1.4s,   #14
-+                xtn2            v16.8h,  v1.4s
-+                shrn2           v17.8h,  v1.4s,   #10
-+
-+                ushr            v18.8h,  v18.8h,  #6
-+                bic             v16.8h,  #0xfc,   lsl #8
-+                bic             v17.8h,  #0xfc,   lsl #8
-+
-+                // v2, v3 -> v19, v20, v21
-+
-+                shrn            v21.4h,  v2.4s,   #14
-+                xtn             v19.4h,  v2.4s
-+                shrn            v20.4h,  v2.4s,   #10
-+
-+                shrn2           v21.8h,  v3.4s,   #14
-+                xtn2            v19.8h,  v3.4s
-+                shrn2           v20.8h,  v3.4s,   #10
-+
-+                ushr            v21.8h,  v21.8h,  #6
-+                bic             v19.8h,  #0xfc,   lsl #8
-+                bic             v20.8h,  #0xfc,   lsl #8
-+
-+                // v4, v5 -> v22, v23, v24
-+
-+                shrn            v24.4h,  v4.4s,   #14
-+                xtn             v22.4h,  v4.4s
-+                shrn            v23.4h,  v4.4s,   #10
-+
-+                shrn2           v24.8h,  v5.4s,   #14
-+                xtn2            v22.8h,  v5.4s
-+                shrn2           v23.8h,  v5.4s,   #10
-+
-+                ushr            v24.8h,  v24.8h,  #6
-+                bic             v22.8h,  #0xfc,   lsl #8
-+                bic             v23.8h,  #0xfc,   lsl #8
-+
-+                // v6, v7 -> v25, v26, v27
-+
-+                shrn            v27.4h,  v6.4s,   #14
-+                xtn             v25.4h,  v6.4s
-+                shrn            v26.4h,  v6.4s,   #10
-+
-+                shrn2           v27.8h,  v7.4s,   #14
-+                xtn2            v25.8h,  v7.4s
-+                shrn2           v26.8h,  v7.4s,   #10
-+
-+                ushr            v27.8h,  v27.8h,  #6
-+                bic             v25.8h,  #0xfc,   lsl #8
-+                bic             v26.8h,  #0xfc,   lsl #8
-+
-+                blt             2f                              // fewer than 96 pixels were left: partial write
-+
-+                st3             {v16.8h, v17.8h, v18.8h}, [x0], #48
-+                st3             {v19.8h, v20.8h, v21.8h}, [x0], #48
-+                st3             {v22.8h, v23.8h, v24.8h}, [x0], #48
-+                st3             {v25.8h, v26.8h, v27.8h}, [x0], #48
-+
-+                bne             1b                              // w5 != 0: more pixels in this row
-+
-+11:
-+                subs            w6,  w6,  #1
-+                add             x0,  x0,  w1,  uxtw             // skip the destination row padding
-+                add             x8,  x8,  #128                  // next source row within the stripe
-+                bne             10b
-+
-+                ret
-+
-+// Partial final write (w5 = pixels left - 96, i.e. negative on entry)
-+2:
-+                cmp             w5,  #48-96                     // at least 48 pixels left?
-+                blt             1f
-+                st3             {v16.8h, v17.8h, v18.8h}, [x0], #48
-+                st3             {v19.8h, v20.8h, v21.8h}, [x0], #48
-+                beq             11b                             // exactly 48: row done
-+                mov             v16.16b, v22.16b                // shift the unwritten second half down
-+                mov             v17.16b, v23.16b
-+                sub             w5,  w5,  #48
-+                mov             v18.16b, v24.16b
-+                mov             v19.16b, v25.16b
-+                mov             v20.16b, v26.16b
-+                mov             v21.16b, v27.16b
-+1:
-+                cmp             w5,  #24-96                     // at least 24 pixels left?
-+                blt             1f
-+                st3             {v16.8h, v17.8h, v18.8h}, [x0], #48
-+                beq             11b
-+                mov             v16.16b, v19.16b
-+                mov             v17.16b, v20.16b
-+                sub             w5,  w5,  #24
-+                mov             v18.16b, v21.16b
-+1:
-+                cmp             w5,  #12-96                     // at least 12 pixels left?
-+                blt             1f
-+                st3             {v16.4h, v17.4h, v18.4h}, [x0], #24
-+                beq             11b
-+                mov             v16.2d[0], v16.2d[1]            // move the upper halves down
-+                sub             w5,  w5,  #12
-+                mov             v17.2d[0], v17.2d[1]
-+                mov             v18.2d[0], v18.2d[1]
-+1:
-+                cmp             w5,  #6-96                      // at least 6 pixels left?
-+                blt             1f
-+                st3             {v16.h, v17.h, v18.h}[0], [x0], #6
-+                st3             {v16.h, v17.h, v18.h}[1], [x0], #6
-+                beq             11b
-+                mov             v16.2s[0], v16.2s[1]
-+                sub             w5,  w5,  #6
-+                mov             v17.2s[0], v17.2s[1]
-+                mov             v18.2s[0], v18.2s[1]
-+1:
-+                cmp             w5,  #3-96                      // at least 3 pixels left?
-+                blt             1f
-+                st3             {v16.h, v17.h, v18.h}[0], [x0], #6
-+                beq             11b
-+                mov             v16.4h[0], v16.4h[1]
-+                sub             w5,  w5,  #3
-+                mov             v17.4h[0], v17.4h[1]
-+1:
-+                cmp             w5,  #2-96                      // 2 pixels left, otherwise 1
-+                blt             1f
-+                st2             {v16.h, v17.h}[0], [x0], #4
-+                b               11b
-+1:
-+                st1             {v16.h}[0], [x0], #2
-+                b               11b
-+
-+endfunc
-+
-+// void ff_rpi_sand30_lines_to_planar_y8(
-+//   uint8_t * dest,            : x0
-+//   unsigned int dst_stride,   : w1
-+//   const uint8_t * src,       : x2
-+//   unsigned int src_stride1,  : w3, always 128 (never read)
-+//   unsigned int src_stride2,  : w4
-+//   unsigned int _x,           : w5, never read (register reused as pixel countdown)
-+//   unsigned int y,            : w6
-+//   unsigned int _w,           : w7
-+//   unsigned int h);           : [sp, #0]
-+//
-+// Assumes that we are starting on a stripe boundary and that overreading
-+// within the stripe is OK. However it does respect the dest size for writing.
-+// Each 32-bit source word yields three bytes: the top 8 bits of each of its 10-bit samples.
-+function ff_rpi_sand30_lines_to_planar_y8, export=1
-+                lsl             w4,  w4,  #7                    // w4 = src_stride2 * 128
-+                sub             w4,  w4,  #64                   // less the 64 bytes the first ld1 advances
-+                sub             w1,  w1,  w7                    // w1 = dst_stride - _w (row padding in bytes)
-+                uxtw            x6,  w6
-+                add             x8,  x2,  x6, lsl #7            // x8 = src + y * 128
-+                ldr             w6,  [sp, #0]                   // w6 = h, counts down
-+
-+10:
-+                mov             x2,  x8
-+                mov             w5,  w7                         // w5 = pixels still to write in this row
-+1:
-+                ld1             {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], #64
-+                ld1             {v4.4s, v5.4s, v6.4s, v7.4s}, [x2], x4
-+
-+                subs            w5,  w5,  #96                   // 32 words * 3 pixels; flags are consumed by blt/bne below
-+
-+                // v0, v1 -> low halves of v16 (bits 2-9), v17 (bits 12-19), v18 (bits 22-29)
-+
-+                shrn            v18.4h,  v0.4s,   #16
-+                xtn             v16.4h,  v0.4s
-+                shrn            v17.4h,  v0.4s,   #12
-+
-+                shrn2           v18.8h,  v1.4s,   #16
-+                xtn2            v16.8h,  v1.4s
-+                shrn2           v17.8h,  v1.4s,   #12
-+
-+                shrn            v18.8b,  v18.8h,  #6
-+                shrn            v16.8b,  v16.8h,  #2
-+                xtn             v17.8b,  v17.8h
-+
-+                // v2, v3 -> high halves of v16, v17, v18 (v19-v21 are 16-bit temporaries here)
-+
-+                shrn            v21.4h,  v2.4s,   #16
-+                xtn             v19.4h,  v2.4s
-+                shrn            v20.4h,  v2.4s,   #12
-+
-+                shrn2           v21.8h,  v3.4s,   #16
-+                xtn2            v19.8h,  v3.4s
-+                shrn2           v20.8h,  v3.4s,   #12
-+
-+                shrn2           v18.16b, v21.8h,  #6
-+                shrn2           v16.16b, v19.8h,  #2
-+                xtn2            v17.16b, v20.8h
-+
-+                // v4, v5 -> low halves of v19, v20, v21 (v22-v24 are 16-bit temporaries)
-+
-+                shrn            v24.4h,  v4.4s,   #16
-+                xtn             v22.4h,  v4.4s
-+                shrn            v23.4h,  v4.4s,   #12
-+
-+                shrn2           v24.8h,  v5.4s,   #16
-+                xtn2            v22.8h,  v5.4s
-+                shrn2           v23.8h,  v5.4s,   #12
-+
-+                shrn            v21.8b,  v24.8h,  #6
-+                shrn            v19.8b,  v22.8h,  #2
-+                xtn             v20.8b,  v23.8h
-+
-+                // v6, v7 -> high halves of v19, v20, v21 (v25-v27 are 16-bit temporaries)
-+
-+                shrn            v27.4h,  v6.4s,   #16
-+                xtn             v25.4h,  v6.4s
-+                shrn            v26.4h,  v6.4s,   #12
-+
-+                shrn2           v27.8h,  v7.4s,   #16
-+                xtn2            v25.8h,  v7.4s
-+                shrn2           v26.8h,  v7.4s,   #12
-+
-+                shrn2           v21.16b, v27.8h,  #6
-+                shrn2           v19.16b, v25.8h,  #2
-+                xtn2            v20.16b, v26.8h
-+
-+                blt             2f                              // fewer than 96 pixels were left: partial write
-+
-+                st3             {v16.16b, v17.16b, v18.16b}, [x0], #48
-+                st3             {v19.16b, v20.16b, v21.16b}, [x0], #48
-+
-+                bne             1b                              // w5 != 0: more pixels in this row
-+
-+11:
-+                subs            w6,  w6,  #1
-+                add             x0,  x0,  w1,  uxtw             // skip the destination row padding
-+                add             x8,  x8,  #128                  // next source row within the stripe
-+                bne             10b
-+
-+                ret
-+
-+// Partial final write (w5 = pixels left - 96, i.e. negative on entry)
-+2:
-+                cmp             w5,  #48-96                     // at least 48 pixels left?
-+                blt             1f
-+                st3             {v16.16b, v17.16b, v18.16b}, [x0], #48
-+                beq             11b                             // exactly 48: row done
-+                mov             v16.16b, v19.16b                // pixels 48-95 are in v19-v21, as in the full-width store
-+                mov             v17.16b, v20.16b                // (v22-v24 only hold 16-bit intermediates)
-+                sub             w5,  w5,  #48
-+                mov             v18.16b, v21.16b
-+1:
-+                cmp             w5,  #24-96                     // at least 24 pixels left?
-+                blt             1f
-+                st3             {v16.8b, v17.8b, v18.8b}, [x0], #24
-+                beq             11b
-+                mov             v16.2d[0], v16.2d[1]            // move the upper halves down
-+                sub             w5,  w5,  #24
-+                mov             v17.2d[0], v17.2d[1]
-+                mov             v18.2d[0], v18.2d[1]
-+1:
-+                cmp             w5,  #12-96                     // at least 12 pixels left?
-+                blt             1f
-+                st3             {v16.b, v17.b, v18.b}[0], [x0], #3
-+                st3             {v16.b, v17.b, v18.b}[1], [x0], #3
-+                st3             {v16.b, v17.b, v18.b}[2], [x0], #3
-+                st3             {v16.b, v17.b, v18.b}[3], [x0], #3
-+                beq             11b
-+                mov             v16.2s[0], v16.2s[1]
-+                sub             w5,  w5,  #12
-+                mov             v17.2s[0], v17.2s[1]
-+                mov             v18.2s[0], v18.2s[1]
-+1:
-+                cmp             w5,  #6-96                      // at least 6 pixels left?
-+                blt             1f
-+                st3             {v16.b, v17.b, v18.b}[0], [x0], #3
-+                st3             {v16.b, v17.b, v18.b}[1], [x0], #3
-+                beq             11b
-+                mov             v16.4h[0], v16.4h[1]
-+                sub             w5,  w5,  #6
-+                mov             v17.4h[0], v17.4h[1]
-+                mov             v18.4h[0], v18.4h[1]
-+1:
-+                cmp             w5,  #3-96                      // at least 3 pixels left?
-+                blt             1f
-+                st3             {v16.b, v17.b, v18.b}[0], [x0], #3
-+                beq             11b
-+                mov             v16.8b[0], v16.8b[1]
-+                sub             w5,  w5,  #3
-+                mov             v17.8b[0], v17.8b[1]
-+1:
-+                cmp             w5,  #2-96                      // 2 pixels left, otherwise 1
-+                blt             1f
-+                st2             {v16.b, v17.b}[0], [x0], #2
-+                b               11b
-+1:
-+                st1             {v16.b}[0], [x0], #1
-+                b               11b
-+
-+endfunc
-+
---- /dev/null
-+++ b/libavutil/aarch64/rpi_sand_neon.h
-@@ -0,0 +1,59 @@
-+/*
-+Copyright (c) 2021 Michael Eiler
-+
-+Redistribution and use in source and binary forms, with or without
-+modification, are permitted provided that the following conditions are met:
-+    * Redistributions of source code must retain the above copyright
-+      notice, this list of conditions and the following disclaimer.
-+    * Redistributions in binary form must reproduce the above copyright
-+      notice, this list of conditions and the following disclaimer in the
-+      documentation and/or other materials provided with the distribution.
-+    * Neither the name of the copyright holder nor the
-+      names of its contributors may be used to endorse or promote products
-+      derived from this software without specific prior written permission.
-+
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
-+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+
-+Authors: Michael Eiler <eiler.mike@gmail.com>
-+*/
-+
-+#pragma once
-+#include <stdint.h> /* uint8_t: keeps this header self-contained */
-+#ifdef __cplusplus
-+extern "C" {
-+#endif
-+/* sand8 -> planar: copies rows of _w bytes from 128-byte-wide stripes into one plane */
-+void ff_rpi_sand8_lines_to_planar_y8(uint8_t * dest, unsigned int dst_stride,
-+  const uint8_t * src, unsigned int src_stride1, unsigned int src_stride2,
-+  unsigned int _x, unsigned int y, unsigned int _w, unsigned int h);
-+/* sand8 -> planar: de-interleaves byte pairs into dst_u (even bytes) and dst_v (odd bytes) */
-+void ff_rpi_sand8_lines_to_planar_c8(uint8_t * dst_u, unsigned int dst_stride_u,
-+  uint8_t * dst_v, unsigned int dst_stride_v, const uint8_t * src,
-+  unsigned int stride1, unsigned int stride2, unsigned int _x, unsigned int y,
-+  unsigned int _w, unsigned int h);
-+/* sand30 -> planar: unpacks three 10-bit samples per 32-bit word into 16-bit values */
-+void ff_rpi_sand30_lines_to_planar_y16(uint8_t * dest, unsigned int dst_stride,
-+  const uint8_t * src, unsigned int src_stride1, unsigned int src_stride2,
-+  unsigned int _x, unsigned int y, unsigned int _w, unsigned int h);
-+/* sand30 -> planar: as y16, but alternating samples are split between dst_u and dst_v */
-+void ff_rpi_sand30_lines_to_planar_c16(uint8_t * dst_u, unsigned int dst_stride_u,
-+  uint8_t * dst_v, unsigned int dst_stride_v, const uint8_t * src, unsigned int stride1,
-+  unsigned int stride2, unsigned int _x, unsigned int y, unsigned int _w, unsigned int h);
-+/* sand30 -> planar: keeps only the top 8 bits of each 10-bit sample */
-+void ff_rpi_sand30_lines_to_planar_y8(uint8_t * dest, unsigned int dst_stride,
-+  const uint8_t * src, unsigned int src_stride1, unsigned int src_stride2,
-+  unsigned int _x, unsigned int y, unsigned int _w, unsigned int h);
-+
-+#ifdef __cplusplus
-+}
-+#endif
-+
---- a/libavutil/arm/Makefile
-+++ b/libavutil/arm/Makefile
-@@ -6,3 +6,4 @@ VFP-OBJS += arm/float_dsp_init_vfp.o
- 
- NEON-OBJS += arm/float_dsp_init_neon.o                                  \
-              arm/float_dsp_neon.o                                       \
-+             arm/rpi_sand_neon.o                                        \
---- /dev/null
-+++ b/libavutil/arm/rpi_sand_neon.S
-@@ -0,0 +1,925 @@
-+/*
-+Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
-+All rights reserved.
-+
-+Redistribution and use in source and binary forms, with or without
-+modification, are permitted provided that the following conditions are met:
-+    * Redistributions of source code must retain the above copyright
-+      notice, this list of conditions and the following disclaimer.
-+    * Redistributions in binary form must reproduce the above copyright
-+      notice, this list of conditions and the following disclaimer in the
-+      documentation and/or other materials provided with the distribution.
-+    * Neither the name of the copyright holder nor the
-+      names of its contributors may be used to endorse or promote products
-+      derived from this software without specific prior written permission.
-+
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
-+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+
-+Authors: John Cox
-+*/
-+
-+#include "libavutil/arm/asm.S"
-+
-+
-+@ General notes:
-+@ Having done some timing on this in sand8->y8 (Pi4)
-+@  vst1 (680fps) is a bit faster than vstm (660fps)
-+@  vldm (680fps) is noticeably faster than vld1 (480fps)
-+@  (or it might be that a mix is what is required)
-+@
-+@ At least on a Pi4 it is no more expensive to have a single auto-inc register
-+@ for dest address than it is to have 2 used alternately (On Pi3 Ben asserted
-+@ the latter was better)
-+@
-+@ vstm will bus error on unaligned access (so will vldm), vst1 is safe unless
-+@ the memory is uncached.
-+@ As these are Sand -> planar we can assume that src is going to be aligned but
-+@ it is possible that dest isn't (converting to .yuv or other packed format).
-+@ Luckily vst1 is faster than vstm :-) so all is well
-+@ vst1 has alignment requirements of el size so maybe splitting vst1.32 into 4
-+@ .8 stores would let us do non-word aligned stores into uncached but it
-+@ probably isn't worth it.
-+
-+
-+
-+
-+@ void ff_rpi_sand128b_stripe_to_8_10(
-+@   uint8_t * dest,             // [r0]  128 bytes written per line
-+@   const uint8_t * src1,       // [r1]  128 bytes read per line
-+@   const uint8_t * src2,       // [r2]  128 bytes read per line
-+@   unsigned int lines);        // [r3]  decremented before the test, so 0 is not handled
-+@ stripe2_to_8: narrows u16 samples from src1 then src2 to u8 with a rounding, saturating shift right by (bit_depth - 8)
-+.macro  stripe2_to_8, bit_depth
-+        vpush    {q4-q7}                        @ q4-q7 (d8-d15) are callee-saved
-+1:
-+        vldm     r1!, {q0-q7}                   @ 128 bytes from src1
-+        subs     r3, #1                         @ sets the flags tested by the bne at the end of the loop
-+        vldm     r2!, {q8-q15}                  @ 128 bytes from src2
-+        vqrshrn.u16 d0,  q0,  #\bit_depth - 8   @ each dN is written only after its old contents were consumed
-+        vqrshrn.u16 d1,  q1,  #\bit_depth - 8
-+        vqrshrn.u16 d2,  q2,  #\bit_depth - 8
-+        vqrshrn.u16 d3,  q3,  #\bit_depth - 8
-+        vqrshrn.u16 d4,  q4,  #\bit_depth - 8
-+        vqrshrn.u16 d5,  q5,  #\bit_depth - 8
-+        vqrshrn.u16 d6,  q6,  #\bit_depth - 8
-+        vqrshrn.u16 d7,  q7,  #\bit_depth - 8
-+        vqrshrn.u16 d8,  q8,  #\bit_depth - 8
-+        vqrshrn.u16 d9,  q9,  #\bit_depth - 8
-+        vqrshrn.u16 d10, q10, #\bit_depth - 8
-+        vqrshrn.u16 d11, q11, #\bit_depth - 8
-+        vqrshrn.u16 d12, q12, #\bit_depth - 8
-+        vqrshrn.u16 d13, q13, #\bit_depth - 8
-+        vqrshrn.u16 d14, q14, #\bit_depth - 8
-+        vqrshrn.u16 d15, q15, #\bit_depth - 8
-+        vstm     r0!, {q0-q7}                   @ d0-d7 came from src1, d8-d15 from src2
-+        bne      1b
-+        vpop     {q4-q7}
-+        bx       lr
-+.endm
-+@ Instantiates stripe2_to_8 with bit_depth = 10, i.e. a shift right by 2
-+function ff_rpi_sand128b_stripe_to_8_10, export=1
-+        stripe2_to_8     10
-+endfunc
-+
-+@ void ff_rpi_sand8_lines_to_planar_y8(
-+@   uint8_t * dest,             // [r0]
-+@   unsigned int dst_stride,    // [r1]
-+@   const uint8_t * src,        // [r2]
-+@   unsigned int src_stride1,   // [r3]      Ignored - assumed 128
-+@   unsigned int src_stride2,   // [sp, #0]  -> r3
-+@   unsigned int _x,            // [sp, #4]  Ignored - 0
-+@   unsigned int y,             // [sp, #8]  (r7 in prefix)
-+@   unsigned int _w,            // [sp, #12] -> r6 (cur r5)
-+@   unsigned int h);            // [sp, #16] -> r7
-+@
-+@ Assumes that we are starting on a stripe boundary and that overreading
-+@ within the stripe is OK. However it does respect the dest size for writing
-+
-+function ff_rpi_sand8_lines_to_planar_y8, export=1  @ copies h lines of _w bytes out of 128-byte-wide stripes into a linear plane
-+                push            {r4-r8, lr}     @ +24: the stack arguments now start at [sp, #24]
-+                ldr             r3,  [sp, #24]  @ r3 = src_stride2
-+                ldr             r6,  [sp, #36]  @ r6 = _w
-+                ldr             r7,  [sp, #32]  @ y
-+                lsl             r3,  #7         @ r3 = src_stride2 * 128, the step added to r2 after each 128-byte load
-+                sub             r1,  r6         @ r1 = dst_stride - _w, the gap added to r0 at the end of each line
-+                add             r8,  r2,  r7,  lsl #7  @ r8 = src + y * 128, start of the first line
-+                ldr             r7,  [sp, #40]  @ r7 = h, the line counter
-+
-+10:                                             @ per-line loop
-+                mov             r2,  r8         @ r2 = read pointer for this line
-+                add             r4,  r0,  #24   @ NOTE(review): r4 is never read in this function
-+                mov             r5,  r6         @ r5 = bytes still to write on this line
-+                mov             lr,  #0         @ NOTE(review): lr is never read in this function
-+1:
-+                vldm            r2,  {q8-q15}   @ always loads a full 128 bytes, even when fewer are needed
-+                add             r2,  r3
-+                subs            r5,  #128
-+                blt             2f              @ fewer than 128 bytes were left: go to the partial write
-+                vst1.8          {d16, d17, d18, d19}, [r0]!
-+                vst1.8          {d20, d21, d22, d23}, [r0]!
-+                vst1.8          {d24, d25, d26, d27}, [r0]!
-+                vst1.8          {d28, d29, d30, d31}, [r0]!
-+                bne             1b              @ flags are still those of the subs above; zero means the line is complete
-+11:                                             @ end of line
-+                subs            r7,  #1
-+                add             r0,  r1         @ step over the rest of the destination stride
-+                add             r8,  #128       @ the next line starts 128 bytes further on
-+                bne             10b
-+
-+                pop             {r4-r8, pc}
-+
-+@ Partial final write: r5 holds (bytes left - 128); each stage tests for one power-of-two chunk
-+2:
-+                cmp             r5,  #64-128    @ at least 64 bytes left?
-+                blt             1f
-+                vst1.8          {d16, d17, d18, d19}, [r0]!
-+                vst1.8          {d20, d21, d22, d23}, [r0]!
-+                beq             11b             @ exactly 64 were left: line done
-+                vmov            q8,  q12        @ slide the unwritten data down to q8-q11
-+                vmov            q9,  q13
-+                sub             r5,  #64
-+                vmov            q10, q14
-+                vmov            q11, q15
-+1:
-+                cmp             r5,  #32-128    @ at least 32 bytes left?
-+                blt             1f
-+                vst1.8          {d16, d17, d18, d19}, [r0]!
-+                beq             11b
-+                vmov            q8,  q10
-+                sub             r5,  #32
-+                vmov            q9,  q11
-+1:
-+                cmp             r5,  #16-128    @ at least 16 bytes left?
-+                blt             1f
-+                vst1.8          {d16, d17}, [r0]!
-+                beq             11b
-+                sub             r5,  #16
-+                vmov            q8,  q9
-+1:
-+                cmp             r5,  #8-128     @ at least 8 bytes left?
-+                blt             1f
-+                vst1.8          {d16}, [r0]!
-+                beq             11b
-+                sub             r5,  #8
-+                vmov            d16, d17
-+1:
-+                cmp             r5,  #4-128     @ at least 4 bytes left?
-+                blt             1f
-+                vst1.32         {d16[0]}, [r0]!
-+                beq             11b
-+                sub             r5,  #4
-+                vshr.u64        d16, #32        @ bring the upper 4 bytes of d16 down to lane 0
-+1:
-+                cmp             r5,  #2-128     @ 2 or 3 bytes left?
-+                blt             1f
-+                vst1.16         {d16[0]}, [r0]!
-+                beq             11b             @ exactly 2 were left
-+                vst1.8          {d16[2]}, [r0]! @ third and last byte
-+                b               11b
-+1:
-+                vst1.8          {d16[0]}, [r0]! @ single remaining byte
-+                b               11b
-+endfunc
-+
-+@ void ff_rpi_sand8_lines_to_planar_c8(
-+@   uint8_t * dst_u,            // [r0]
-+@   unsigned int dst_stride_u,  // [r1]
-+@   uint8_t * dst_v,            // [r2]
-+@   unsigned int dst_stride_v,  // [r3]
-+@   const uint8_t * src,        // [sp, #0]  -> r4, r5
-+@   unsigned int stride1,       // [sp, #4]  128
-+@   unsigned int stride2,       // [sp, #8]  -> r8
-+@   unsigned int _x,            // [sp, #12] 0
-+@   unsigned int y,             // [sp, #16] (r7 in prefix)
-+@   unsigned int _w,            // [sp, #20] -> r12, r6
-+@   unsigned int h);            // [sp, #24] -> r7
-+@
-+@ Assumes that we are starting on a stripe boundary and that overreading
-+@ within the stripe is OK. However it does respect the dest size for writing
-+
-+function ff_rpi_sand8_lines_to_planar_c8, export=1  @ de-interleaves byte pairs from 128-byte-wide stripes into two planes (dst_u, dst_v)
-+                push            {r4-r8, lr}     @ +24
-+
-+                ldr             r5,  [sp, #24]  @ r5 = src
-+                ldr             r8,  [sp, #32]  @ r8 = stride2
-+                ldr             r7,  [sp, #40]  @ r7 = y
-+                ldr             r6,  [sp, #44]  @ r6 = _w
-+                lsl             r8,  #7         @ r8 = stride2 * 128, the step between successive 128-byte loads
-+                add             r5,  r5,  r7,  lsl #7   @ r5 = src + y * 128, start of the first line
-+                sub             r1,  r1,  r6    @ r1 = dst_stride_u - _w
-+                sub             r3,  r3,  r6    @ r3 = dst_stride_v - _w
-+                ldr             r7,  [sp, #48]  @ r7 = h, the line counter
-+                vpush           {q4-q7}         @ pushed only after the stack arguments are read, so the offsets above hold
-+
-+10:                                             @ per-line loop
-+                mov             r4,  r5         @ r4 = read pointer for this line
-+                mov             r12, r6         @ r12 = bytes still to write per plane on this line
-+1:
-+                subs            r12, #64
-+                vldm            r4,  {q0-q7}    @ first 128 bytes of this iteration
-+                add             r4,  r8
-+                it              gt
-+                vldmgt          r4,  {q8-q15}   @ second 128 bytes, only if more than 64 bytes per plane were left
-+                add             r4,  r8
-+
-+                vuzp.8          q0,  q1         @ even bytes -> first register (later stored via r0), odd bytes -> second (via r2)
-+                vuzp.8          q2,  q3
-+                vuzp.8          q4,  q5
-+                vuzp.8          q6,  q7
-+
-+                vuzp.8          q8,  q9         @ runs even when the second load was skipped; that data is then never stored
-+                vuzp.8          q10, q11
-+                vuzp.8          q12, q13
-+                vuzp.8          q14, q15
-+                subs            r12, #64        @ r12 is now (bytes left - 128); flags feed the blt/bne below
-+
-+                @ Rearrange regs so we can use vst1 with 4 regs
-+                vswp            q1,  q2         @ afterwards q0,q1 hold the even bytes and q2,q3 the odd bytes
-+                vswp            q5,  q6
-+                vswp            q9,  q10
-+                vswp            q13, q14
-+                blt             2f              @ fewer than 128 bytes per plane were left: partial write
-+
-+                vst1.8          {d0,  d1,  d2,  d3 }, [r0]!
-+                vst1.8          {d8,  d9,  d10, d11}, [r0]!
-+                vst1.8          {d16, d17, d18, d19}, [r0]!
-+                vst1.8          {d24, d25, d26, d27}, [r0]!
-+
-+                vst1.8          {d4,  d5,  d6,  d7 }, [r2]!
-+                vst1.8          {d12, d13, d14, d15}, [r2]!
-+                vst1.8          {d20, d21, d22, d23}, [r2]!
-+                vst1.8          {d28, d29, d30, d31}, [r2]!
-+                bne             1b              @ zero from the second subs means the line is complete
-+11:                                             @ end of line
-+                subs            r7,  #1
-+                add             r5,  #128       @ the next line starts 128 bytes further on
-+                add             r0,  r1         @ step over the rest of each destination stride
-+                add             r2,  r3
-+                bne             10b
-+                vpop            {q4-q7}
-+                pop             {r4-r8,pc}
-+
-+2:                                              @ partial final write: r12 holds (bytes left - 128)
-+                cmp             r12, #64-128    @ at least 64 bytes per plane left?
-+                blt             1f
-+                vst1.8          {d0,  d1,  d2,  d3 }, [r0]!
-+                vst1.8          {d8,  d9,  d10, d11}, [r0]!
-+                vst1.8          {d4,  d5,  d6,  d7 }, [r2]!
-+                vst1.8          {d12, d13, d14, d15}, [r2]!
-+                beq             11b             @ exactly 64 were left: line done
-+                sub             r12, #64
-+                vmov            q0,  q8         @ slide the data from the second load down to q0-q7
-+                vmov            q1,  q9
-+                vmov            q2,  q10
-+                vmov            q3,  q11
-+                vmov            q4,  q12
-+                vmov            q5,  q13
-+                vmov            q6,  q14
-+                vmov            q7,  q15
-+1:
-+                cmp             r12, #32-128    @ at least 32 bytes per plane left?
-+                blt             1f
-+                vst1.8          {d0,  d1,  d2,  d3 }, [r0]!
-+                vst1.8          {d4,  d5,  d6,  d7 }, [r2]!
-+                beq             11b
-+                sub             r12, #32
-+                vmov            q0,  q4
-+                vmov            q1,  q5
-+                vmov            q2,  q6
-+                vmov            q3,  q7
-+1:
-+                cmp             r12, #16-128    @ at least 16 bytes per plane left?
-+                blt             1f
-+                vst1.8          {d0,  d1 }, [r0]!
-+                vst1.8          {d4,  d5 }, [r2]!
-+                beq             11b
-+                sub             r12, #16
-+                vmov            q0,  q1
-+                vmov            q2,  q3
-+1:
-+                cmp             r12, #8-128     @ at least 8 bytes per plane left?
-+                blt             1f
-+                vst1.8          {d0}, [r0]!
-+                vst1.8          {d4}, [r2]!
-+                beq             11b
-+                sub             r12, #8
-+                vmov            d0,  d1
-+                vmov            d4,  d5
-+1:
-+                cmp             r12, #4-128     @ at least 4 bytes per plane left?
-+                blt             1f
-+                vst1.32         {d0[0]}, [r0]!
-+                vst1.32         {d4[0]}, [r2]!
-+                beq             11b
-+                sub             r12, #4
-+                vmov            s0,  s1         @ upper word of d0 -> lower word
-+                vmov            s8,  s9         @ upper word of d4 -> lower word
-+1:
-+                cmp             r12, #2-128     @ 2 or 3 bytes per plane left?
-+                blt             1f
-+                vst1.16         {d0[0]}, [r0]!
-+                vst1.16         {d4[0]}, [r2]!
-+                beq             11b             @ exactly 2 were left
-+                vst1.8          {d0[2]}, [r0]!  @ third and last byte of each plane
-+                vst1.8          {d4[2]}, [r2]!
-+                b               11b
-+1:
-+                vst1.8          {d0[0]}, [r0]!  @ single remaining byte of each plane
-+                vst1.8          {d4[0]}, [r2]!
-+                b               11b
-+endfunc
-+
-+
-+
-+@ void ff_rpi_sand30_lines_to_planar_y16(
-+@   uint8_t * dest,             // [r0]
-+@   unsigned int dst_stride,    // [r1]
-+@   const uint8_t * src,        // [r2]
-+@   unsigned int src_stride1,   // [r3]      Ignored - assumed 128
-+@   unsigned int src_stride2,   // [sp, #0]  -> r3
-+@   unsigned int _x,            // [sp, #4]  Ignored - 0
-+@   unsigned int y,             // [sp, #8]  (r7 in prefix)
-+@   unsigned int _w,            // [sp, #12] -> r6 (cur r5)
-+@   unsigned int h);            // [sp, #16] -> r7
-+@
-+@ Assumes that we are starting on a stripe boundary and that overreading
-+@ within the stripe is OK. However it does respect the dest size for writing
-+
-+function ff_rpi_sand30_lines_to_planar_y16, export=1  @ unpacks three 10-bit samples per 32-bit word into 16-bit samples (value in the low 10 bits)
-+                push            {r4-r8, lr}     @ +24
-+                ldr             r3,  [sp, #24]  @ r3 = src_stride2
-+                ldr             r6,  [sp, #36]  @ r6 = _w
-+                ldr             r7,  [sp, #32]  @ y
-+                mov             r12, #48        @ post-increment used by the paired vst3.16 stores below
-+                sub             r3,  #1
-+                lsl             r3,  #7         @ r3 = (src_stride2 - 1) * 128; the loads themselves advance r2 by the other 128
-+                sub             r1,  r1,  r6,  lsl #1  @ r1 = dst_stride - 2 * _w
-+                add             r8,  r2,  r7,  lsl #7  @ r8 = src + y * 128, start of the first line
-+                ldr             r7,  [sp, #40]  @ r7 = h, the line counter
-+
-+10:                                             @ per-line loop
-+                mov             r2,  r8         @ r2 = read pointer for this line
-+                add             r4,  r0,  #24   @ r4 = second store pointer, 24 bytes ahead of r0
-+                mov             r5,  r6         @ r5 = samples still to write on this line
-+                mov             lr,  #0         @ lr = bytes consumed, modulo 128
-+1:
-+                vldm            r2!, {q10-q13}  @ 64 bytes = 16 words = 48 samples
-+                add             lr,  #64
-+
-+                vshrn.u32       d4 , q10, #14    @ Cannot vshrn.u32 #20!
-+                ands            lr,  #127       @ zero after every second load, i.e. after each full 128 bytes
-+                vshrn.u32       d2,  q10, #10   @ bits 10-25 of each word: second sample plus excess high bits
-+                vmovn.u32       d0,  q10        @ bits 0-15 of each word: first sample plus excess high bits
-+
-+                vshrn.u32       d5,  q11, #14
-+                it              eq
-+                addeq           r2,  r3         @ after each full 128 bytes add the remaining (src_stride2 - 1) * 128
-+                vshrn.u32       d3,  q11, #10
-+                vmovn.u32       d1,  q11
-+
-+                subs            r5,  #48        @ flags from here feed the blt/bne below
-+                vshr.u16        q2,  #6         @ finishes the shift by 20 for the third sample
-+                vbic.u16        q0,  #0xfc00    @ keep only the low 10 bits
-+                vbic.u16        q1,  #0xfc00
-+
-+                vshrn.u32       d20, q12, #14   @ same unpack for the second 32 bytes, into q8/q9/q10
-+                vshrn.u32       d18, q12, #10
-+                vmovn.u32       d16, q12
-+
-+                vshrn.u32       d21, q13, #14
-+                vshrn.u32       d19, q13, #10
-+                vmovn.u32       d17, q13
-+
-+                vshr.u16        q10, #6
-+                vbic.u16        q8,  #0xfc00
-+                vbic.u16        q9 , #0xfc00
-+                blt             2f              @ fewer than 48 samples were left: partial write
-+
-+                vst3.16         {d0,  d2,  d4},  [r0], r12      @ vst3 re-interleaves first/second/third samples into source order
-+                vst3.16         {d1,  d3,  d5},  [r4], r12
-+                vst3.16         {d16, d18, d20}, [r0], r12
-+                vst3.16         {d17, d19, d21}, [r4], r12
-+
-+                bne             1b              @ zero from the subs means the line is complete
-+
-+11:                                             @ end of line
-+                subs            r7,  #1
-+                add             r0,  r1         @ step over the rest of the destination stride
-+                add             r8,  #128       @ the next line starts 128 bytes further on
-+                bne             10b
-+
-+                pop             {r4-r8, pc}
-+
-+@ Partial final write: r5 holds (samples left - 48)
-+2:
-+                cmp             r5,  #24-48     @ at least 24 samples left?
-+                blt             1f
-+                vst3.16         {d0,  d2,  d4},  [r0], r12
-+                vst3.16         {d1,  d3,  d5},  [r4]
-+                beq             11b
-+                vmov            q0,  q8         @ slide the second half of the unpacked data down
-+                sub             r5,  #24
-+                vmov            q1,  q9
-+                vmov            q2,  q10
-+1:
-+                cmp             r5,  #12-48     @ at least 12 samples left?
-+                blt             1f
-+                vst3.16         {d0,  d2,  d4},  [r0]!
-+                beq             11b
-+                vmov            d0, d1
-+                sub             r5, #12
-+                vmov            d2, d3
-+                vmov            d4, d5
-+1:
-+                cmp             r5,  #6-48      @ at least 6 samples left?
-+                add             r4,  r0,  #6    @ avoid [r0]! on sequential instructions
-+                blt             1f
-+                vst3.16         {d0[0], d2[0], d4[0]}, [r0]
-+                vst3.16         {d0[1], d2[1], d4[1]}, [r4]
-+                add             r0,  #12        @ neither store above used writeback: 6 samples * 2 bytes
-+                beq             11b
-+                vmov            s0,  s1         @ upper word of each d register -> lower word
-+                sub             r5,  #6
-+                vmov            s4,  s5
-+                vmov            s8,  s9
-+1:
-+                cmp             r5, #3-48       @ at least 3 samples left?
-+                blt             1f
-+                vst3.16         {d0[0], d2[0], d4[0]}, [r0]!
-+                beq             11b
-+                sub             r5, #3
-+                vshr.u32        d0, #16         @ bring lane 1 down to lane 0
-+                vshr.u32        d2, #16
-+1:
-+                cmp             r5, #2-48       @ 2 samples left, else 1
-+                blt             1f
-+                vst2.16         {d0[0], d2[0]}, [r0]!
-+                b               11b
-+1:
-+                vst1.16         {d0[0]}, [r0]!
-+                b               11b
-+
-+endfunc
-+
-+
-+@ void ff_rpi_sand30_lines_to_planar_c16(
-+@   uint8_t * dst_u,            // [r0]
-+@   unsigned int dst_stride_u,  // [r1]
-+@   uint8_t * dst_v,            // [r2]
-+@   unsigned int dst_stride_v,  // [r3]
-+@   const uint8_t * src,        // [sp, #0]  -> r4, r5
-+@   unsigned int stride1,       // [sp, #4]  128
-+@   unsigned int stride2,       // [sp, #8]  -> r8
-+@   unsigned int _x,            // [sp, #12] 0
-+@   unsigned int y,             // [sp, #16] (r7 in prefix)
-+@   unsigned int _w,            // [sp, #20] -> r6, r9
-+@   unsigned int h);            // [sp, #24] -> r7
-+@
-+@ Assumes that we are starting on a stripe boundary and that overreading
-+@ within the stripe is OK. However it does respect the dest size for writing
-+
-+function ff_rpi_sand30_lines_to_planar_c16, export=1  @ unpacks interleaved 10-bit U/V samples (three per 32-bit word) into two 16-bit planes
-+                push            {r4-r10, lr}    @ +32
-+                ldr             r5,  [sp, #32]  @ r5 = src
-+                ldr             r8,  [sp, #40]  @ r8 = stride2
-+                ldr             r7,  [sp, #48]  @ r7 = y
-+                ldr             r9,  [sp, #52]  @ r9 = _w
-+                mov             r12, #48        @ post-increment used by the vst3.16 stores in the main loop
-+                sub             r8,  #1
-+                lsl             r8,  #7         @ r8 = (stride2 - 1) * 128; the loads themselves advance r4 by the other 128
-+                add             r5,  r5,  r7,  lsl #7   @ r5 = src + y * 128, start of the first line
-+                sub             r1,  r1,  r9,  lsl #1   @ r1 = dst_stride_u - 2 * _w
-+                sub             r3,  r3,  r9,  lsl #1   @ r3 = dst_stride_v - 2 * _w
-+                ldr             r7,  [sp, #56]  @ r7 = h, the line counter
-+10:                                             @ per-line loop
-+                mov             lr,  #0         @ lr = bytes consumed, modulo 128
-+                mov             r4,  r5         @ r4 = read pointer for this line
-+                mov             r6,  r9         @ r6 = samples still to write per plane on this line
-+1:
-+                vldm            r4!, {q0-q3}    @ 64 bytes = 16 words = 48 samples (24 per plane)
-+                add             lr,  #64
-+
-+                @ N.B. unpack [0,1,2] -> (reg order) 1, 0, 2
-+                vshrn.u32       d20, q0,  #14
-+                vmovn.u32       d18, q0
-+                vshrn.u32       d0,  q0,  #10   @ overwrites half of q0 last, after the two reads above
-+                ands            lr,  #127       @ zero after every second load, i.e. after each full 128 bytes
-+
-+                vshrn.u32       d21, q1,  #14
-+                vmovn.u32       d19, q1
-+                vshrn.u32       d1,  q1,  #10
-+
-+                vshrn.u32       d22, q2,  #10
-+                vmovn.u32       d2,  q2
-+                vshrn.u32       d4,  q2,  #14   @ overwrites half of q2 last, after the two reads above
-+
-+                add             r10, r0,  #24   @ r10 = second store pointer for the first plane
-+                vshrn.u32       d23, q3,  #10
-+                vmovn.u32       d3,  q3
-+                vshrn.u32       d5,  q3,  #14
-+
-+                it              eq
-+                addeq           r4,  r8         @ after each full 128 bytes add the remaining (stride2 - 1) * 128
-+                vuzp.16         q0,  q11        @ separate the samples taken from even words and from odd words
-+                vuzp.16         q9,  q1
-+                vuzp.16         q10, q2
-+
-+                @ q0   V0, V3,..
-+                @ q9   U0, U3...
-+                @ q10  U1, U4...
-+                @ q11  U2, U5,..
-+                @ q1   V1, V4,
-+                @ q2   V2, V5,..
-+
-+                subs            r6,  #24        @ flags from here feed the blt/bne below
-+                vbic.u16        q11, #0xfc00    @ keep only the low 10 bits
-+                vbic.u16        q9,  #0xfc00
-+                vshr.u16        q10, #6         @ finishes the shift by 20 for samples taken with #14
-+                vshr.u16        q2,  #6
-+                vbic.u16        q0,  #0xfc00
-+                vbic.u16        q1,  #0xfc00
-+
-+                blt             2f              @ fewer than 24 samples per plane were left: partial write
-+
-+                vst3.16         {d18, d20, d22}, [r0],  r12
-+                vst3.16         {d19, d21, d23}, [r10]
-+                add             r10, r2,  #24   @ r10 = second store pointer for the second plane
-+                vst3.16         {d0,  d2,  d4},  [r2],  r12
-+                vst3.16         {d1,  d3,  d5},  [r10]
-+
-+                bne             1b              @ zero from the subs means the line is complete
-+
-+11:                                             @ end of line
-+                subs            r7,  #1
-+                add             r5,  #128       @ the next line starts 128 bytes further on
-+                add             r0,  r1         @ step over the rest of each destination stride
-+                add             r2,  r3
-+                bne             10b
-+
-+                pop             {r4-r10, pc}
-+
-+@ Partial final write: r6 holds (samples left per plane - 24)
-+2:
-+                cmp             r6,  #-12       @ at least 12 samples per plane left?
-+                blt             1f
-+                vst3.16         {d18, d20, d22}, [r0]!
-+                vst3.16         {d0,  d2,  d4},  [r2]!
-+                beq             11b
-+                vmov            d18, d19        @ slide the upper halves down
-+                vmov            d20, d21
-+                vmov            d22, d23
-+                sub             r6,  #12
-+                vmov            d0,  d1
-+                vmov            d2,  d3
-+                vmov            d4,  d5
-+1:
-+                cmp             r6,  #-18       @ at least 6 samples per plane left? (the vzips below leave the flags alone)
-+                @ Rezip here as it makes the remaining tail handling easier
-+                vzip.16         d0,  d18        @ afterwards even lanes hold the r2-plane sample, odd lanes the r0-plane sample
-+                vzip.16         d2,  d20
-+                vzip.16         d4,  d22
-+                blt             1f
-+                vst3.16         {d0[1],  d2[1],  d4[1]},  [r0]!
-+                vst3.16         {d0[0],  d2[0],  d4[0]},  [r2]!
-+                vst3.16         {d0[3],  d2[3],  d4[3]},  [r0]!
-+                vst3.16         {d0[2],  d2[2],  d4[2]},  [r2]!
-+                beq             11b
-+                vmov            d0,  d18        @ second half of the zipped data
-+                vmov            d2,  d20
-+                sub             r6,  #6
-+                vmov            d4,  d22
-+1:
-+                cmp             r6,  #-21       @ at least 3 samples per plane left?
-+                blt             1f
-+                vst3.16         {d0[1], d2[1], d4[1]}, [r0]!
-+                vst3.16         {d0[0], d2[0], d4[0]}, [r2]!
-+                beq             11b
-+                vmov            s4,  s5         @ upper word of d2 -> lower word
-+                sub             r6,  #3
-+                vmov            s0,  s1         @ upper word of d0 -> lower word
-+1:
-+                cmp             r6,  #-22       @ 2 samples per plane left, else 1
-+                blt             1f
-+                vst2.16         {d0[1], d2[1]}, [r0]!
-+                vst2.16         {d0[0], d2[0]}, [r2]!
-+                b               11b
-+1:
-+                vst1.16         {d0[1]}, [r0]!
-+                vst1.16         {d0[0]}, [r2]!
-+                b               11b
-+
-+endfunc
-+
-+@ void ff_rpi_sand30_lines_to_planar_p010(
-+@   uint8_t * dest,             // [r0]
-+@   unsigned int dst_stride,    // [r1]
-+@   const uint8_t * src,        // [r2]
-+@   unsigned int src_stride1,   // [r3]      Ignored - assumed 128
-+@   unsigned int src_stride2,   // [sp, #0]  -> r3
-+@   unsigned int _x,            // [sp, #4]  Ignored - 0
-+@   unsigned int y,             // [sp, #8]  (r7 in prefix)
-+@   unsigned int _w,            // [sp, #12] -> r6 (cur r5)
-+@   unsigned int h);            // [sp, #16] -> r7
-+@
-+@ Assumes that we are starting on a stripe boundary and that overreading
-+@ within the stripe is OK. However it does respect the dest size for writing
-+
-+function ff_rpi_sand30_lines_to_planar_p010, export=1  @ unpacks three 10-bit samples per 32-bit word into 16-bit samples (value in the high 10 bits)
-+                push            {r4-r8, lr}     @ +24
-+                ldr             r3,  [sp, #24]  @ r3 = src_stride2
-+                ldr             r6,  [sp, #36]  @ r6 = _w
-+                ldr             r7,  [sp, #32]  @ y
-+                mov             r12, #48        @ post-increment used by the paired vst3.16 stores below
-+                vmov.u16        q15, #0xffc0    @ mask keeping bits 6-15 of every 16-bit lane
-+                sub             r3,  #1
-+                lsl             r3,  #7         @ r3 = (src_stride2 - 1) * 128; the loads themselves advance r2 by the other 128
-+                sub             r1,  r1,  r6,  lsl #1  @ r1 = dst_stride - 2 * _w
-+                add             r8,  r2,  r7,  lsl #7  @ r8 = src + y * 128, start of the first line
-+                ldr             r7,  [sp, #40]  @ r7 = h, the line counter
-+
-+10:                                             @ per-line loop
-+                mov             r2,  r8         @ r2 = read pointer for this line
-+                add             r4,  r0,  #24   @ r4 = second store pointer, 24 bytes ahead of r0
-+                mov             r5,  r6         @ r5 = samples still to write on this line
-+                mov             lr,  #0         @ lr = bytes consumed, modulo 128
-+1:
-+                vldm            r2!, {q10-q13}  @ 64 bytes = 16 words = 48 samples
-+                add             lr,  #64
-+
-+                vshl.u32        q14, q10, #6    @ moves the first sample up to bits 6-15
-+                ands            lr,  #127       @ zero after every second load, i.e. after each full 128 bytes
-+                vshrn.u32       d4,  q10, #14   @ bits 14-29: third sample lands in bits 6-15
-+                vshrn.u32       d2,  q10, #4    @ bits 4-19: second sample lands in bits 6-15
-+                vmovn.u32       d0,  q14
-+
-+                vshl.u32        q14, q11, #6
-+                it              eq
-+                addeq           r2,  r3         @ after each full 128 bytes add the remaining (src_stride2 - 1) * 128
-+                vshrn.u32       d5,  q11, #14
-+                vshrn.u32       d3,  q11, #4
-+                vmovn.u32       d1,  q14
-+
-+                subs            r5,  #48        @ flags from here feed the blt/bne below
-+                vand            q2,  q15        @ clear the six low bits left over from neighbouring samples
-+                vand            q1,  q15
-+                vand            q0,  q15
-+
-+                vshl.u32        q14, q12, #6    @ same unpack for the second 32 bytes, into q8/q9/q10
-+                vshrn.u32       d20, q12, #14
-+                vshrn.u32       d18, q12, #4
-+                vmovn.u32       d16, q14
-+
-+                vshl.u32        q14, q13, #6
-+                vshrn.u32       d21, q13, #14
-+                vshrn.u32       d19, q13, #4
-+                vmovn.u32       d17, q14
-+
-+                vand            q10, q15
-+                vand            q9,  q15
-+                vand            q8,  q15
-+                blt             2f              @ fewer than 48 samples were left: partial write
-+
-+                vst3.16         {d0,  d2,  d4},  [r0], r12      @ vst3 re-interleaves first/second/third samples into source order
-+                vst3.16         {d1,  d3,  d5},  [r4], r12
-+                vst3.16         {d16, d18, d20}, [r0], r12
-+                vst3.16         {d17, d19, d21}, [r4], r12
-+
-+                bne             1b              @ zero from the subs means the line is complete
-+
-+11:                                             @ end of line
-+                subs            r7,  #1
-+                add             r0,  r1         @ step over the rest of the destination stride
-+                add             r8,  #128       @ the next line starts 128 bytes further on
-+                bne             10b
-+
-+                pop             {r4-r8, pc}
-+
-+@ Partial final write: r5 holds (samples left - 48)
-+2:
-+                cmp             r5,  #24-48     @ at least 24 samples left?
-+                blt             1f
-+                vst3.16         {d0,  d2,  d4},  [r0], r12
-+                vst3.16         {d1,  d3,  d5},  [r4]
-+                beq             11b
-+                vmov            q0,  q8         @ slide the second half of the unpacked data down
-+                sub             r5,  #24
-+                vmov            q1,  q9
-+                vmov            q2,  q10
-+1:
-+                cmp             r5,  #12-48     @ at least 12 samples left?
-+                blt             1f
-+                vst3.16         {d0,  d2,  d4},  [r0]!
-+                beq             11b
-+                vmov            d0, d1
-+                sub             r5, #12
-+                vmov            d2, d3
-+                vmov            d4, d5
-+1:
-+                cmp             r5,  #6-48      @ at least 6 samples left?
-+                add             r4,  r0,  #6    @ avoid [r0]! on sequential instructions
-+                blt             1f
-+                vst3.16         {d0[0], d2[0], d4[0]}, [r0]
-+                vst3.16         {d0[1], d2[1], d4[1]}, [r4]
-+                add             r0,  #12        @ neither store above used writeback: 6 samples * 2 bytes
-+                beq             11b
-+                vmov            s0,  s1         @ upper word of each d register -> lower word
-+                sub             r5,  #6
-+                vmov            s4,  s5
-+                vmov            s8,  s9
-+1:
-+                cmp             r5, #3-48       @ at least 3 samples left?
-+                blt             1f
-+                vst3.16         {d0[0], d2[0], d4[0]}, [r0]!
-+                beq             11b
-+                sub             r5, #3
-+                vshr.u32        d0, #16         @ bring lane 1 down to lane 0
-+                vshr.u32        d2, #16
-+1:
-+                cmp             r5, #2-48       @ 2 samples left, else 1
-+                blt             1f
-+                vst2.16         {d0[0], d2[0]}, [r0]!
-+                b               11b
-+1:
-+                vst1.16         {d0[0]}, [r0]!
-+                b               11b
-+
-+endfunc
-+
-+
-+@ void ff_rpi_sand30_lines_to_planar_y8(
-+@   uint8_t * dest,             // [r0]
-+@   unsigned int dst_stride,    // [r1]
-+@   const uint8_t * src,        // [r2]
-+@   unsigned int src_stride1,   // [r3]      Ignored - assumed 128
-+@   unsigned int src_stride2,   // [sp, #0]  -> r3
-+@   unsigned int _x,            // [sp, #4]  Ignored - 0
-+@   unsigned int y,             // [sp, #8]  (r7 in prefix)
-+@   unsigned int _w,            // [sp, #12] -> r6 (cur r5)
-+@   unsigned int h);            // [sp, #16] -> r7
-+@
-+@ Assumes that we are starting on a stripe boundary and that overreading
-+@ within the stripe is OK. However it does respect the dest size for writing
-+
-+function ff_rpi_sand30_lines_to_planar_y8, export=1  @ unpacks three 10-bit samples per 32-bit word and keeps the top 8 bits of each as one byte
-+                push            {r4-r8, lr}     @ +24
-+                ldr             r3,  [sp, #24]  @ r3 = src_stride2
-+                ldr             r6,  [sp, #36]  @ r6 = _w
-+                ldr             r7,  [sp, #32]  @ y
-+                mov             r12, #48        @ post-increment used by the paired vst3.8 stores below
-+                lsl             r3,  #7         @ r3 = src_stride2 * 128, the step added to r2 after each 128-byte load
-+                sub             r1,  r1,  r6    @ r1 = dst_stride - _w (one byte per sample)
-+                add             r8,  r2,  r7,  lsl #7  @ r8 = src + y * 128, start of the first line
-+                ldr             r7,  [sp, #40]  @ r7 = h, the line counter
-+
-+10:                                             @ per-line loop
-+                mov             r2,  r8         @ r2 = read pointer for this line
-+                add             r4,  r0,  #24   @ r4 = second store pointer, 24 bytes ahead of r0
-+                mov             r5,  r6         @ r5 = samples still to write on this line
-+1:
-+                vldm            r2,  {q8-q15}   @ 128 bytes = 32 words = 96 samples
-+
-+                subs            r5,  #96        @ flags from here feed the blt/bne below; nothing in between sets flags
-+
-+                vmovn.u32       d0,  q8         @ bits 0-15 of each word (first sample in bits 0-9)
-+                vshrn.u32       d2,  q8,  #12   @ bits 12-27 (top 8 bits of the second sample in bits 0-7)
-+                vshrn.u32       d4,  q8,  #16    @ Cannot vshrn.u32 #20!
-+
-+                add             r2,  r3
-+
-+                vmovn.u32       d1,  q9
-+                vshrn.u32       d3,  q9,  #12
-+                vshrn.u32       d5,  q9,  #16
-+
-+                pld             [r2, #0]        @ prefetch hint for the next 128-byte load
-+
-+                vshrn.u16       d0,  q0,  #2    @ first samples: drop the two low bits
-+                vmovn.u16       d1,  q1         @ second samples: low byte is already the top 8 bits
-+                vshrn.u16       d2,  q2,  #6    @ third samples: 16 + 6 = 22, the top 8 bits of bits 20-29
-+
-+                vmovn.u32       d16, q10        @ q8/q9 are free now and are reused as scratch
-+                vshrn.u32       d18, q10, #12
-+                vshrn.u32       d20, q10, #16
-+
-+                vmovn.u32       d17, q11
-+                vshrn.u32       d19, q11, #12
-+                vshrn.u32       d21, q11, #16
-+
-+                pld             [r2, #64]
-+
-+                vshrn.u16       d4,  q8,  #2    @ second group of 24 bytes ends up in d4, d5, d6
-+                vmovn.u16       d5,  q9
-+                vshrn.u16       d6,  q10, #6
-+
-+                vmovn.u32       d16, q12
-+                vshrn.u32       d18, q12, #12
-+                vshrn.u32       d20, q12, #16
-+
-+                vmovn.u32       d17, q13
-+                vshrn.u32       d19, q13, #12
-+                vshrn.u32       d21, q13, #16
-+
-+                vshrn.u16       d16, q8,  #2    @ third group of 24 bytes ends up in d16, d17, d18
-+                vmovn.u16       d17, q9
-+                vshrn.u16       d18, q10, #6
-+
-+                vmovn.u32       d20, q14
-+                vshrn.u32       d22, q14, #12
-+                vshrn.u32       d24, q14, #16
-+
-+                vmovn.u32       d21, q15
-+                vshrn.u32       d23, q15, #12
-+                vshrn.u32       d25, q15, #16
-+
-+                vshrn.u16       d20, q10, #2    @ fourth group of 24 bytes ends up in d20, d21, d22
-+                vmovn.u16       d21, q11
-+                vshrn.u16       d22, q12, #6
-+
-+                blt             2f              @ fewer than 96 samples were left: partial write
-+
-+                vst3.8          {d0,  d1,  d2},  [r0], r12      @ vst3 re-interleaves first/second/third samples into source order
-+                vst3.8          {d4,  d5,  d6},  [r4], r12
-+                vst3.8          {d16, d17, d18}, [r0], r12
-+                vst3.8          {d20, d21, d22}, [r4], r12
-+
-+                bne             1b              @ zero from the subs means the line is complete
-+
-+11:                                             @ end of line
-+                subs            r7,  #1
-+                add             r0,  r1         @ step over the rest of the destination stride
-+                add             r8,  #128       @ the next line starts 128 bytes further on
-+                bne             10b
-+
-+                pop             {r4-r8, pc}
-+
-+@ Partial final write: r5 holds (samples left - 96)
-+2:
-+                cmp             r5,  #48-96     @ at least 48 samples left?
-+                blt             1f
-+                vst3.8          {d0,  d1,  d2},  [r0], r12
-+                vst3.8          {d4,  d5,  d6},  [r4], r12
-+                beq             11b
-+                vmov            q0,  q8         @ slide groups three and four down to d0-d2 and d4-d6
-+                vmov            q2,  q10
-+                sub             r5,  #48
-+                vmov            d2,  d18
-+                vmov            d6,  d22
-+1:
-+                cmp             r5,  #24-96     @ at least 24 samples left?
-+                blt             1f
-+                vst3.8          {d0,  d1,  d2},  [r0]!
-+                beq             11b
-+                vmov            q0,  q2
-+                sub             r5,  #24
-+                vmov            d2,  d6
-+1:
-+                cmp             r5,  #12-96     @ at least 12 samples left?
-+                blt             1f
-+                vst3.8          {d0[0], d1[0], d2[0]}, [r0]!
-+                vst3.8          {d0[1], d1[1], d2[1]}, [r0]!
-+                vst3.8          {d0[2], d1[2], d2[2]}, [r0]!
-+                vst3.8          {d0[3], d1[3], d2[3]}, [r0]!
-+                beq             11b
-+                vmov            s0,  s1         @ upper word of each d register -> lower word
-+                sub             r5,  #12
-+                vmov            s2,  s3
-+                vmov            s4,  s5
-+1:
-+                cmp             r5,  #6-96      @ at least 6 samples left?
-+                blt             1f
-+                vst3.8          {d0[0], d1[0], d2[0]}, [r0]!
-+                vst3.8          {d0[1], d1[1], d2[1]}, [r0]!
-+                @ no extra advance here: the two [r0]! stores above already moved r0 past the 6 bytes written
-+                beq             11b
-+                vshr.u32        d0,  #16        @ bring lanes 2 and 3 down to lanes 0 and 1
-+                sub             r5,  #6
-+                vshr.u32        d1,  #16
-+                vshr.u32        d2,  #16
-+1:
-+                cmp             r5, #3-96       @ at least 3 samples left?
-+                blt             1f
-+                vst3.8          {d0[0], d1[0], d2[0]}, [r0]!
-+                beq             11b
-+                sub             r5, #3
-+                vshr.u32        d0, #8          @ bring lane 1 down to lane 0
-+                vshr.u32        d1, #8
-+1:
-+                cmp             r5, #2-96       @ 2 samples left, else 1
-+                blt             1f
-+                vst2.8          {d0[0], d1[0]}, [r0]!
-+                b               11b
-+1:
-+                vst1.8          {d0[0]}, [r0]!
-+                b               11b
-+
-+endfunc
-+
-+
---- /dev/null
-+++ b/libavutil/arm/rpi_sand_neon.h
-@@ -0,0 +1,110 @@
-+/*
-+Copyright (c) 2020 Raspberry Pi (Trading) Ltd.
-+All rights reserved.
-+
-+Redistribution and use in source and binary forms, with or without
-+modification, are permitted provided that the following conditions are met:
-+    * Redistributions of source code must retain the above copyright
-+      notice, this list of conditions and the following disclaimer.
-+    * Redistributions in binary form must reproduce the above copyright
-+      notice, this list of conditions and the following disclaimer in the
-+      documentation and/or other materials provided with the distribution.
-+    * Neither the name of the copyright holder nor the
-+      names of its contributors may be used to endorse or promote products
-+      derived from this software without specific prior written permission.
-+
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
-+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+
-+Authors: John Cox
-+*/
-+
-+#ifndef AVUTIL_ARM_SAND_NEON_H
-+#define AVUTIL_ARM_SAND_NEON_H
-+
-+void ff_rpi_sand128b_stripe_to_8_10(
-+  uint8_t * dest,             // [r0]
-+  const uint8_t * src1,       // [r1]
-+  const uint8_t * src2,       // [r2]
-+  unsigned int lines);        // [r3]
-+
-+void ff_rpi_sand8_lines_to_planar_y8(
-+  uint8_t * dest,             // [r0]
-+  unsigned int dst_stride,    // [r1]
-+  const uint8_t * src,        // [r2]
-+  unsigned int src_stride1,   // [r3]      Ignored - assumed 128
-+  unsigned int src_stride2,   // [sp, #0]  -> r3
-+  unsigned int _x,            // [sp, #4]  Ignored - 0
-+  unsigned int y,             // [sp, #8]  (r7 in prefix)
-+  unsigned int _w,            // [sp, #12] -> r6 (cur r5)
-+  unsigned int h);            // [sp, #16] -> r7
-+
-+void ff_rpi_sand8_lines_to_planar_c8(
-+  uint8_t * dst_u,            // [r0]
-+  unsigned int dst_stride_u,  // [r1]
-+  uint8_t * dst_v,            // [r2]
-+  unsigned int dst_stride_v,  // [r3]
-+  const uint8_t * src,        // [sp, #0]  -> r4, r5
-+  unsigned int stride1,       // [sp, #4]  128
-+  unsigned int stride2,       // [sp, #8]  -> r8
-+  unsigned int _x,            // [sp, #12] 0
-+  unsigned int y,             // [sp, #16] (r7 in prefix)
-+  unsigned int _w,            // [sp, #20] -> r12, r6
-+  unsigned int h);            // [sp, #24] -> r7
-+
-+void ff_rpi_sand30_lines_to_planar_y16(
-+  uint8_t * dest,             // [r0]
-+  unsigned int dst_stride,    // [r1]
-+  const uint8_t * src,        // [r2]
-+  unsigned int src_stride1,   // [r3]      Ignored - assumed 128
-+  unsigned int src_stride2,   // [sp, #0]  -> r3
-+  unsigned int _x,            // [sp, #4]  Ignored - 0
-+  unsigned int y,             // [sp, #8]  (r7 in prefix)
-+  unsigned int _w,            // [sp, #12] -> r6 (cur r5)
-+  unsigned int h);            // [sp, #16] -> r7
-+
-+void ff_rpi_sand30_lines_to_planar_c16(
-+  uint8_t * dst_u,            // [r0]
-+  unsigned int dst_stride_u,  // [r1]
-+  uint8_t * dst_v,            // [r2]
-+  unsigned int dst_stride_v,  // [r3]
-+  const uint8_t * src,        // [sp, #0]  -> r4, r5
-+  unsigned int stride1,       // [sp, #4]  128
-+  unsigned int stride2,       // [sp, #8]  -> r8
-+  unsigned int _x,            // [sp, #12] 0
-+  unsigned int y,             // [sp, #16] (r7 in prefix)
-+  unsigned int _w,            // [sp, #20] -> r6, r9
-+  unsigned int h);            // [sp, #24] -> r7
-+
-+void ff_rpi_sand30_lines_to_planar_p010(
-+  uint8_t * dest,             // [r0]
-+  unsigned int dst_stride,    // [r1]
-+  const uint8_t * src,        // [r2]
-+  unsigned int src_stride1,   // [r3]      Ignored - assumed 128
-+  unsigned int src_stride2,   // [sp, #0]  -> r3
-+  unsigned int _x,            // [sp, #4]  Ignored - 0
-+  unsigned int y,             // [sp, #8]  (r7 in prefix)
-+  unsigned int _w,            // [sp, #12] -> r6 (cur r5)
-+  unsigned int h);            // [sp, #16] -> r7
-+
-+void ff_rpi_sand30_lines_to_planar_y8(
-+  uint8_t * dest,             // [r0]
-+  unsigned int dst_stride,    // [r1]
-+  const uint8_t * src,        // [r2]
-+  unsigned int src_stride1,   // [r3]      Ignored - assumed 128
-+  unsigned int src_stride2,   // [sp, #0]  -> r3
-+  unsigned int _x,            // [sp, #4]  Ignored - 0
-+  unsigned int y,             // [sp, #8]  (r7 in prefix)
-+  unsigned int _w,            // [sp, #12] -> r6 (cur r5)
-+  unsigned int h);            // [sp, #16] -> r7
-+
-+#endif // AVUTIL_ARM_SAND_NEON_H
-+
---- a/libavutil/frame.c
-+++ b/libavutil/frame.c
-@@ -16,6 +16,8 @@
-  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-  */
- 
-+#include "config.h"
-+
- #include "channel_layout.h"
- #include "avassert.h"
- #include "buffer.h"
-@@ -26,6 +28,9 @@
- #include "mem.h"
- #include "samplefmt.h"
- #include "hwcontext.h"
-+#if CONFIG_SAND
-+#include "rpi_sand_fns.h"
-+#endif
- 
- #if FF_API_FRAME_GET_SET
- MAKE_ACCESSORS(AVFrame, frame, int64_t, best_effort_timestamp)
-@@ -902,6 +907,12 @@ int av_frame_apply_cropping(AVFrame *fra
-         (frame->crop_top + frame->crop_bottom) >= frame->height)
-         return AVERROR(ERANGE);
- 
-+#if CONFIG_SAND
-+    // Sand cannot be cropped - do not try
-+    if (av_rpi_is_sand_format(frame->format))
-+        return 0;
-+#endif
-+
-     desc = av_pix_fmt_desc_get(frame->format);
-     if (!desc)
-         return AVERROR_BUG;
---- a/libavutil/frame.h
-+++ b/libavutil/frame.h
-@@ -968,6 +968,16 @@ int av_frame_apply_cropping(AVFrame *fra
-  */
- const char *av_frame_side_data_name(enum AVFrameSideDataType type);
- 
-+
-+static inline int av_frame_cropped_width(const AVFrame * const frame)
-+{
-+    return frame->width - (frame->crop_left + frame->crop_right);
-+}
-+static inline int av_frame_cropped_height(const AVFrame * const frame)
-+{
-+    return frame->height - (frame->crop_top + frame->crop_bottom);
-+}
-+
- /**
-  * @}
-  */
---- a/libavutil/hwcontext_drm.c
-+++ b/libavutil/hwcontext_drm.c
-@@ -19,8 +19,10 @@
- #include <fcntl.h>
- #include <sys/mman.h>
- #include <unistd.h>
-+#include <sys/ioctl.h>
- 
- #include <drm.h>
-+#include <libdrm/drm_fourcc.h>
- #include <xf86drm.h>
- 
- #include "avassert.h"
-@@ -28,6 +30,11 @@
- #include "hwcontext_drm.h"
- #include "hwcontext_internal.h"
- #include "imgutils.h"
-+#include "libavutil/rpi_sand_fns.h"
-+
-+#include <linux/mman.h>
-+#include <linux/dma-buf.h>
-+#include <linux/dma-heap.h>
- 
- 
- static void drm_device_free(AVHWDeviceContext *hwdev)
-@@ -43,6 +50,11 @@ static int drm_device_create(AVHWDeviceC
-     AVDRMDeviceContext *hwctx = hwdev->hwctx;
-     drmVersionPtr version;
- 
-+    if (device == NULL) {
-+      hwctx->fd = -1;
-+      return 0;
-+    }
-+
-     hwctx->fd = open(device, O_RDWR);
-     if (hwctx->fd < 0)
-         return AVERROR(errno);
-@@ -85,18 +97,37 @@ static int drm_get_buffer(AVHWFramesCont
- typedef struct DRMMapping {
-     // Address and length of each mmap()ed region.
-     int nb_regions;
-+    unsigned int dmaflags;
-     void *address[AV_DRM_MAX_PLANES];
-     size_t length[AV_DRM_MAX_PLANES];
-+    int fds[AV_DRM_MAX_PLANES];
- } DRMMapping;
- 
-+static int dmasync(const int fd, const unsigned int flags)
-+{
-+    struct dma_buf_sync sync = {
-+        .flags = flags
-+    };
-+    while (ioctl(fd, DMA_BUF_IOCTL_SYNC, &sync) == -1) {
-+        const int err = errno;
-+        if (errno == EINTR)
-+            continue;
-+        av_log(NULL, AV_LOG_WARNING, "%s: ioctl failed: flags=%#x\n", __func__, flags);
-+        return -err;
-+    }
-+    return 0;
-+}
-+
- static void drm_unmap_frame(AVHWFramesContext *hwfc,
-                             HWMapDescriptor *hwmap)
- {
-     DRMMapping *map = hwmap->priv;
-     int i;
- 
--    for (i = 0; i < map->nb_regions; i++)
-+    for (i = 0; i < map->nb_regions; i++) {
-         munmap(map->address[i], map->length[i]);
-+        dmasync(map->fds[i], DMA_BUF_SYNC_END | map->dmaflags);
-+    }
- 
-     av_free(map);
- }
-@@ -114,15 +145,28 @@ static int drm_map_frame(AVHWFramesConte
-     if (!map)
-         return AVERROR(ENOMEM);
- 
-+    for (i = 0; i < AV_DRM_MAX_PLANES; i++)
-+        map->fds[i] = -1;
-+
-     mmap_prot = 0;
--    if (flags & AV_HWFRAME_MAP_READ)
-+    if (flags & AV_HWFRAME_MAP_READ) {
-+        map->dmaflags |= DMA_BUF_SYNC_READ;
-         mmap_prot |= PROT_READ;
--    if (flags & AV_HWFRAME_MAP_WRITE)
-+    }
-+    if (flags & AV_HWFRAME_MAP_WRITE) {
-+        map->dmaflags |= DMA_BUF_SYNC_WRITE;
-         mmap_prot |= PROT_WRITE;
-+    }
-+
-+    if (dst->format == AV_PIX_FMT_NONE)
-+        dst->format = hwfc->sw_format;
- 
-     av_assert0(desc->nb_objects <= AV_DRM_MAX_PLANES);
-     for (i = 0; i < desc->nb_objects; i++) {
--        addr = mmap(NULL, desc->objects[i].size, mmap_prot, MAP_SHARED,
-+        dmasync(desc->objects[i].fd, DMA_BUF_SYNC_START | map->dmaflags);
-+        map->fds[i] = desc->objects[i].fd;
-+
-+        addr = mmap(NULL, desc->objects[i].size, mmap_prot, MAP_SHARED | MAP_POPULATE,
-                     desc->objects[i].fd, 0);
-         if (addr == MAP_FAILED) {
-             err = AVERROR(errno);
-@@ -151,6 +195,23 @@ static int drm_map_frame(AVHWFramesConte
- 
-     dst->width  = src->width;
-     dst->height = src->height;
-+    dst->crop_top    = src->crop_top;
-+    dst->crop_bottom = src->crop_bottom;
-+    dst->crop_left   = src->crop_left;
-+    dst->crop_right  = src->crop_right;
-+
-+#if CONFIG_SAND
-+    // Rework for sand frames
-+    if (av_rpi_is_sand_frame(dst)) {
-+        // As it stands the sand formats hold stride2 in linesize[3]
-+        // linesize[0] & [1] contain stride1 which is always 128 for everything we do
-+        // * Arguably this should be reworked s.t. stride2 is in linesize[0] & [1]
-+        dst->linesize[3] = fourcc_mod_broadcom_param(desc->objects[0].format_modifier);
-+        dst->linesize[0] = 128;
-+        dst->linesize[1] = 128;
-+        // *** Are we sure src->height is actually what we want ???
-+    }
-+#endif
- 
-     err = ff_hwframe_map_create(src->hw_frames_ctx, dst, src,
-                                 &drm_unmap_frame, map);
-@@ -160,7 +221,9 @@ static int drm_map_frame(AVHWFramesConte
-     return 0;
- 
- fail:
--    for (i = 0; i < desc->nb_objects; i++) {
-+    for (i = 0; i < AV_DRM_MAX_PLANES; i++) {
-+        if (map->fds[i] != -1)
-+            dmasync(map->fds[i], DMA_BUF_SYNC_END | map->dmaflags);
-         if (map->address[i])
-             munmap(map->address[i], map->length[i]);
-     }
-@@ -172,16 +235,29 @@ static int drm_transfer_get_formats(AVHW
-                                     enum AVHWFrameTransferDirection dir,
-                                     enum AVPixelFormat **formats)
- {
--    enum AVPixelFormat *pix_fmts;
-+    enum AVPixelFormat *p;
- 
--    pix_fmts = av_malloc_array(2, sizeof(*pix_fmts));
--    if (!pix_fmts)
-+    p = *formats = av_malloc_array(3, sizeof(*p));
-+    if (!p)
-         return AVERROR(ENOMEM);
- 
--    pix_fmts[0] = ctx->sw_format;
--    pix_fmts[1] = AV_PIX_FMT_NONE;
-+    // **** Offer native sand too ????
-+    *p++ =
-+#if CONFIG_SAND
-+        ctx->sw_format == AV_PIX_FMT_RPI4_8 || ctx->sw_format == AV_PIX_FMT_SAND128 ?
-+            AV_PIX_FMT_YUV420P :
-+        ctx->sw_format == AV_PIX_FMT_RPI4_10 ?
-+            AV_PIX_FMT_YUV420P10LE :
-+#endif
-+            ctx->sw_format;
-+
-+#if CONFIG_SAND
-+    if (ctx->sw_format == AV_PIX_FMT_RPI4_10 ||
-+        ctx->sw_format == AV_PIX_FMT_RPI4_8 || ctx->sw_format == AV_PIX_FMT_SAND128)
-+        *p++ = AV_PIX_FMT_NV12;
-+#endif
- 
--    *formats = pix_fmts;
-+    *p = AV_PIX_FMT_NONE;
-     return 0;
- }
- 
-@@ -197,18 +273,63 @@ static int drm_transfer_data_from(AVHWFr
-     map = av_frame_alloc();
-     if (!map)
-         return AVERROR(ENOMEM);
--    map->format = dst->format;
- 
-+    // Map to default
-+    map->format = AV_PIX_FMT_NONE;
-     err = drm_map_frame(hwfc, map, src, AV_HWFRAME_MAP_READ);
-     if (err)
-         goto fail;
- 
--    map->width  = dst->width;
--    map->height = dst->height;
-+#if 0
-+    av_log(hwfc, AV_LOG_INFO, "%s: src fmt=%d (%d), dst fmt=%d (%d) s=%dx%d l=%d/%d/%d/%d, d=%dx%d l=%d/%d/%d\n", __func__,
-+           map->hwfc_format, AV_PIX_FMT_RPI4_8, dst->format, AV_PIX_FMT_YUV420P10LE,
-+           map->width, map->height,
-+           map->linesize[0],
-+           map->linesize[1],
-+           map->linesize[2],
-+           map->linesize[3],
-+           dst->width, dst->height,
-+           dst->linesize[0],
-+           dst->linesize[1],
-+           dst->linesize[2]);
-+#endif
-+#if CONFIG_SAND
-+    if (av_rpi_is_sand_frame(map)) {
-+        // Preserve crop - later ffmpeg code assumes that we have in that it
-+        // overwrites any crop that we create with the old values
-+        unsigned int stride2 = map->linesize[3];
-+        const unsigned int w = FFMIN(dst->width, map->width);
-+        const unsigned int h = FFMIN(dst->height, map->height);
-+
-+        map->crop_top = 0;
-+        map->crop_bottom = 0;
-+        map->crop_left = 0;
-+        map->crop_right = 0;
-+
-+        if (av_rpi_sand_to_planar_frame(dst, map) != 0)
-+        {
-+            av_log(hwfc, AV_LOG_ERROR, "%s: Incompatible output pixfmt for sand\n", __func__);
-+            err = AVERROR(EINVAL);
-+            goto fail;
-+        }
-+
-+        dst->width = w;
-+        dst->height = h;
-+    }
-+    else
-+#endif
-+    {
-+        // Kludge mapped h/w s.t. frame_copy works
-+        map->width  = dst->width;
-+        map->height = dst->height;
-+        err = av_frame_copy(dst, map);
-+    }
- 
--    err = av_frame_copy(dst, map);
-     if (err)
-+    {
-+        av_log(hwfc, AV_LOG_ERROR, "%s: Copy fail\n", __func__);
-         goto fail;
-+    }
- 
-     err = 0;
- fail:
-@@ -223,7 +344,10 @@ static int drm_transfer_data_to(AVHWFram
-     int err;
- 
-     if (src->width > hwfc->width || src->height > hwfc->height)
-+    {
-+        av_log(hwfc, AV_LOG_ERROR, "%s: H/w mismatch: %d/%d, %d/%d\n", __func__, dst->width, hwfc->width, dst->height, hwfc->height);
-         return AVERROR(EINVAL);
-+    }
- 
-     map = av_frame_alloc();
-     if (!map)
---- a/libavutil/pixdesc.c
-+++ b/libavutil/pixdesc.c
-@@ -2371,6 +2371,38 @@ static const AVPixFmtDescriptor av_pix_f
-         .name = "vulkan",
-         .flags = AV_PIX_FMT_FLAG_HWACCEL,
-     },
-+    [AV_PIX_FMT_SAND128] = {
-+        .name = "sand128",
-+        .nb_components = 3,
-+        .log2_chroma_w = 1,
-+        .log2_chroma_h = 1,
-+        .comp = {
-+            { 0, 1, 0, 0, 8, 0, 7, 1 },        /* Y */
-+            { 1, 2, 0, 0, 8, 1, 7, 1 },        /* U */
-+            { 1, 2, 1, 0, 8, 1, 7, 2 },        /* V */
-+        },
-+        .flags = 0,
-+    },
-+    [AV_PIX_FMT_SAND64_10] = {
-+        .name = "sand64_10",
-+        .nb_components = 3,
-+        .log2_chroma_w = 1,
-+        .log2_chroma_h = 1,
-+        .comp = {
-+            { 0, 2, 0, 0, 10, 0, 9, 1 },        /* Y */
-+            { 1, 4, 0, 0, 10, 3, 9, 1 },        /* U */
-+            { 1, 4, 2, 0, 10, 3, 9, 3 },        /* V */
-+        },
-+        .flags = 0,
-+    },
-+    [AV_PIX_FMT_RPI4_8] = {
-+        .name = "rpi",
-+        .flags = AV_PIX_FMT_FLAG_HWACCEL,
-+    },
-+    [AV_PIX_FMT_RPI4_10] = {
-+        .name = "rpi",
-+        .flags = AV_PIX_FMT_FLAG_HWACCEL,
-+    },
- };
- #if FF_API_PLUS1_MINUS1
- FF_ENABLE_DEPRECATION_WARNINGS
---- a/libavutil/pixfmt.h
-+++ b/libavutil/pixfmt.h
-@@ -357,6 +357,12 @@ enum AVPixelFormat {
- 
-     AV_PIX_FMT_Y210BE,    ///< packed YUV 4:2:2 like YUYV422, 20bpp, data in the high bits, big-endian
-     AV_PIX_FMT_Y210LE,    ///< packed YUV 4:2:2 like YUYV422, 20bpp, data in the high bits, little-endian
-+// RPI - not on ifdef so can be got at by calling progs
-+    AV_PIX_FMT_SAND128,    ///< 4:2:0  8-bit 128x*Y stripe, 64x*UV stripe, then next x stripe, mysterious padding
-+    AV_PIX_FMT_SAND64_10,  ///< 4:2:0 10-bit  64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding
-+    AV_PIX_FMT_SAND64_16,  ///< 4:2:0 16-bit  64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding
-+    AV_PIX_FMT_RPI4_8,
-+    AV_PIX_FMT_RPI4_10,
- 
-     AV_PIX_FMT_NB         ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions
- };
---- /dev/null
-+++ b/libavutil/rpi_sand_fn_pw.h
-@@ -0,0 +1,227 @@
-+/*
-+Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
-+All rights reserved.
-+
-+Redistribution and use in source and binary forms, with or without
-+modification, are permitted provided that the following conditions are met:
-+    * Redistributions of source code must retain the above copyright
-+      notice, this list of conditions and the following disclaimer.
-+    * Redistributions in binary form must reproduce the above copyright
-+      notice, this list of conditions and the following disclaimer in the
-+      documentation and/or other materials provided with the distribution.
-+    * Neither the name of the copyright holder nor the
-+      names of its contributors may be used to endorse or promote products
-+      derived from this software without specific prior written permission.
-+
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
-+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+
-+Authors: John Cox
-+*/
-+
-+// * Included twice from rpi_sand_fn with different PW
-+
-+#define STRCAT(x,y) x##y
-+
-+#if PW == 1
-+#define pixel uint8_t
-+#define FUNC(f) STRCAT(f, 8)
-+#elif PW == 2
-+#define pixel uint16_t
-+#define FUNC(f) STRCAT(f, 16)
-+#else
-+#error Unexpected PW
-+#endif
-+
-+// Fetches a single patch - offscreen fixup not done here
-+// w <= stride1
-+// unclipped
-+void FUNC(av_rpi_sand_to_planar_y)(uint8_t * dst, const unsigned int dst_stride,
-+                             const uint8_t * src,
-+                             unsigned int stride1, unsigned int stride2,
-+                             unsigned int _x, unsigned int y,
-+                             unsigned int _w, unsigned int h)
-+{
-+    const unsigned int x = _x;
-+    const unsigned int w = _w;
-+    const unsigned int mask = stride1 - 1;
-+
-+#if PW == 1 && HAVE_SAND_ASM
-+    if (_x == 0) {
-+        ff_rpi_sand8_lines_to_planar_y8(dst, dst_stride,
-+                                     src, stride1, stride2, _x, y, _w, h);
-+        return;
-+    }
-+#endif
-+
-+    if ((x & ~mask) == ((x + w) & ~mask)) {
-+        // All in one sand stripe
-+        const uint8_t * p = src + (x & mask) + y * stride1 + (x & ~mask) * stride2;
-+        for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p += stride1) {
-+            memcpy(dst, p, w);
-+        }
-+    }
-+    else
-+    {
-+        // Two+ stripe
-+        const unsigned int sstride = stride1 * stride2;
-+        const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2;
-+        const uint8_t * p2 = p1 + sstride - (x & mask);
-+        const unsigned int w1 = stride1 - (x & mask);
-+        const unsigned int w3 = (x + w) & mask;
-+        const unsigned int w2 = w - (w1 + w3);
-+
-+        for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p1 += stride1, p2 += stride1) {
-+            unsigned int j;
-+            const uint8_t * p = p2;
-+            uint8_t * d = dst;
-+            memcpy(d, p1, w1);
-+            d += w1;
-+            for (j = 0; j < w2; j += stride1, d += stride1, p += sstride) {
-+                memcpy(d, p, stride1);
-+            }
-+            memcpy(d, p, w3);
-+        }
-+    }
-+}
-+
-+// x & w in bytes but not of interleave (i.e. offset = x*2 for U&V)
-+
-+void FUNC(av_rpi_sand_to_planar_c)(uint8_t * dst_u, const unsigned int dst_stride_u,
-+                             uint8_t * dst_v, const unsigned int dst_stride_v,
-+                             const uint8_t * src,
-+                             unsigned int stride1, unsigned int stride2,
-+                             unsigned int _x, unsigned int y,
-+                             unsigned int _w, unsigned int h)
-+{
-+    const unsigned int x = _x * 2;
-+    const unsigned int w = _w * 2;
-+    const unsigned int mask = stride1 - 1;
-+
-+#if PW == 1 && HAVE_SAND_ASM
-+    if (_x == 0) {
-+        ff_rpi_sand8_lines_to_planar_c8(dst_u, dst_stride_u, dst_v, dst_stride_v,
-+                                     src, stride1, stride2, _x, y, _w, h);
-+        return;
-+    }
-+#endif
-+
-+    if ((x & ~mask) == ((x + w) & ~mask)) {
-+        // All in one sand stripe
-+        const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2;
-+        for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1) {
-+            pixel * du = (pixel *)dst_u;
-+            pixel * dv = (pixel *)dst_v;
-+            const pixel * p = (const pixel *)p1;
-+            for (unsigned int k = 0; k < w; k += 2 * PW) {
-+                *du++ = *p++;
-+                *dv++ = *p++;
-+            }
-+        }
-+    }
-+    else
-+    {
-+        // Two+ stripe
-+        const unsigned int sstride = stride1 * stride2;
-+        const unsigned int sstride_p = (sstride - stride1) / PW;
-+
-+        const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2;
-+        const uint8_t * p2 = p1 + sstride - (x & mask);
-+        const unsigned int w1 = stride1 - (x & mask);
-+        const unsigned int w3 = (x + w) & mask;
-+        const unsigned int w2 = w - (w1 + w3);
-+
-+        for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1, p2 += stride1) {
-+            unsigned int j;
-+            const pixel * p = (const pixel *)p1;
-+            pixel * du = (pixel *)dst_u;
-+            pixel * dv = (pixel *)dst_v;
-+            for (unsigned int k = 0; k < w1; k += 2 * PW) {
-+                *du++ = *p++;
-+                *dv++ = *p++;
-+            }
-+            for (j = 0, p = (const pixel *)p2; j < w2; j += stride1, p += sstride_p) {
-+                for (unsigned int k = 0; k < stride1; k += 2 * PW) {
-+                    *du++ = *p++;
-+                    *dv++ = *p++;
-+                }
-+            }
-+            for (unsigned int k = 0; k < w3; k += 2 * PW) {
-+                *du++ = *p++;
-+                *dv++ = *p++;
-+            }
-+        }
-+    }
-+}
-+
-+void FUNC(av_rpi_planar_to_sand_c)(uint8_t * dst_c,
-+                             unsigned int stride1, unsigned int stride2,
-+                             const uint8_t * src_u, const unsigned int src_stride_u,
-+                             const uint8_t * src_v, const unsigned int src_stride_v,
-+                             unsigned int _x, unsigned int y,
-+                             unsigned int _w, unsigned int h)
-+{
-+    const unsigned int x = _x * 2;
-+    const unsigned int w = _w * 2;
-+    const unsigned int mask = stride1 - 1;
-+    if ((x & ~mask) == ((x + w) & ~mask)) {
-+        // All in one sand stripe
-+        uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2;
-+        for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1) {
-+            const pixel * su = (const pixel *)src_u;
-+            const pixel * sv = (const pixel *)src_v;
-+            pixel * p = (pixel *)p1;
-+            for (unsigned int k = 0; k < w; k += 2 * PW) {
-+                *p++ = *su++;
-+                *p++ = *sv++;
-+            }
-+        }
-+    }
-+    else
-+    {
-+        // Two+ stripe
-+        const unsigned int sstride = stride1 * stride2;
-+        const unsigned int sstride_p = (sstride - stride1) / PW;
-+
-+        const uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2;
-+        const uint8_t * p2 = p1 + sstride - (x & mask);
-+        const unsigned int w1 = stride1 - (x & mask);
-+        const unsigned int w3 = (x + w) & mask;
-+        const unsigned int w2 = w - (w1 + w3);
-+
-+        for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1, p2 += stride1) {
-+            unsigned int j;
-+            const pixel * su = (const pixel *)src_u;
-+            const pixel * sv = (const pixel *)src_v;
-+            pixel * p = (pixel *)p1;
-+            for (unsigned int k = 0; k < w1; k += 2 * PW) {
-+                *p++ = *su++;
-+                *p++ = *sv++;
-+            }
-+            for (j = 0, p = (pixel *)p2; j < w2; j += stride1, p += sstride_p) {
-+                for (unsigned int k = 0; k < stride1; k += 2 * PW) {
-+                    *p++ = *su++;
-+                    *p++ = *sv++;
-+                }
-+            }
-+            for (unsigned int k = 0; k < w3; k += 2 * PW) {
-+                *p++ = *su++;
-+                *p++ = *sv++;
-+            }
-+        }
-+    }
-+}
-+
-+
-+#undef pixel
-+#undef STRCAT
-+#undef FUNC
-+
---- /dev/null
-+++ b/libavutil/rpi_sand_fns.c
-@@ -0,0 +1,445 @@
-+/*
-+Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
-+All rights reserved.
-+
-+Redistribution and use in source and binary forms, with or without
-+modification, are permitted provided that the following conditions are met:
-+    * Redistributions of source code must retain the above copyright
-+      notice, this list of conditions and the following disclaimer.
-+    * Redistributions in binary form must reproduce the above copyright
-+      notice, this list of conditions and the following disclaimer in the
-+      documentation and/or other materials provided with the distribution.
-+    * Neither the name of the copyright holder nor the
-+      names of its contributors may be used to endorse or promote products
-+      derived from this software without specific prior written permission.
-+
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
-+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+
-+Authors: John Cox
-+*/
-+
-+#include "config.h"
-+#include <stdint.h>
-+#include <string.h>
-+#include "rpi_sand_fns.h"
-+#include "avassert.h"
-+#include "frame.h"
-+
-+#if ARCH_ARM && HAVE_NEON
-+#include "arm/rpi_sand_neon.h"
-+#define HAVE_SAND_ASM 1
-+#elif ARCH_AARCH64 && HAVE_NEON
-+#include "aarch64/rpi_sand_neon.h"
-+#define HAVE_SAND_ASM 1
-+#else
-+#define HAVE_SAND_ASM 0
-+#endif
-+
-+#define PW 1
-+#include "rpi_sand_fn_pw.h"
-+#undef PW
-+
-+#define PW 2
-+#include "rpi_sand_fn_pw.h"
-+#undef PW
-+
-+#if 1
-+// Simple round
-+static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr)
-+{
-+    const unsigned int rnd = (1 << shr) >> 1;
-+    const uint16_t * src = (const uint16_t *)_src;
-+
-+    for (; n != 0; --n) {
-+        *dst++ = (*src++ + rnd) >> shr;
-+    }
-+}
-+#else
-+// Dithered variation
-+static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr)
-+{
-+    unsigned int rnd = (1 << shr) >> 1;
-+    const unsigned int mask = ((1 << shr) - 1);
-+    const uint16_t * src = (const uint16_t *)_src;
-+
-+    for (; n != 0; --n) {
-+        rnd = *src++ + (rnd & mask);
-+        *dst++ = rnd >> shr;
-+    }
-+}
-+#endif
-+
-+// Fetches a single patch - offscreen fixup not done here
-+// w <= stride1
-+// unclipped
-+// _x & _w in pixels, strides in bytes
-+void av_rpi_sand30_to_planar_y16(uint8_t * dst, const unsigned int dst_stride,
-+                             const uint8_t * src,
-+                             unsigned int stride1, unsigned int stride2,
-+                             unsigned int _x, unsigned int y,
-+                             unsigned int _w, unsigned int h)
-+{
-+    const unsigned int x0 = (_x / 3) * 4; // Byte offset of the word
-+    const unsigned int xskip0 = _x - (x0 >> 2) * 3;
-+    const unsigned int x1 = ((_x + _w) / 3) * 4;
-+    const unsigned int xrem1 = _x + _w - (x1 >> 2) * 3;
-+    const unsigned int mask = stride1 - 1;
-+    const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2;
-+    const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2;  // RHS of a stripe to LHS of next in words
-+
-+#if HAVE_SAND_ASM
-+    if (_x == 0) {
-+        ff_rpi_sand30_lines_to_planar_y16(dst, dst_stride, src, stride1, stride2, _x, y, _w, h);
-+        return;
-+    }
-+#endif
-+
-+    if (x0 == x1) {
-+        // *******************
-+        // Partial single word xfer
-+        return;
-+    }
-+
-+    for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p0 += stride1)
-+    {
-+        unsigned int x = x0;
-+        const uint32_t * p = (const uint32_t *)p0;
-+        uint16_t * d = (uint16_t *)dst;
-+
-+        if (xskip0 != 0) {
-+            const uint32_t p3 = *p++;
-+
-+            if (xskip0 == 1)
-+                *d++ = (p3 >> 10) & 0x3ff;
-+            *d++ = (p3 >> 20) & 0x3ff;
-+
-+            if (((x += 4) & mask) == 0)
-+                p += slice_inc;
-+        }
-+
-+        while (x != x1) {
-+            const uint32_t p3 = *p++;
-+            *d++ = p3 & 0x3ff;
-+            *d++ = (p3 >> 10) & 0x3ff;
-+            *d++ = (p3 >> 20) & 0x3ff;
-+
-+            if (((x += 4) & mask) == 0)
-+                p += slice_inc;
-+        }
-+
-+        if (xrem1 != 0) {
-+            const uint32_t p3 = *p;
-+
-+            *d++ = p3 & 0x3ff;
-+            if (xrem1 == 2)
-+                *d++ = (p3 >> 10) & 0x3ff;
-+        }
-+    }
-+}
-+
-+
-+// Unpack a rectangle of sand30 chroma into separate 16-bit U and V planes.
-+// Each pair of 32-bit source words carries three chroma pels as six 10-bit
-+// fields, lowest bits first: U0 V0 U1 | V1 U2 V2.
-+// _x & _w in chroma pels, y & h in lines, strides in bytes
-+void av_rpi_sand30_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u,
-+                             uint8_t * dst_v, const unsigned int dst_stride_v,
-+                             const uint8_t * src,
-+                             unsigned int stride1, unsigned int stride2,
-+                             unsigned int _x, unsigned int y,
-+                             unsigned int _w, unsigned int h)
-+{
-+    const unsigned int x0 = (_x / 3) * 8; // Byte offset of the first word pair
-+    const unsigned int xskip0 = _x - (x0 >> 3) * 3;     // Pels to skip in the first pair
-+    const unsigned int x1 = ((_x + _w) / 3) * 8;        // Byte offset of the pair holding the end
-+    const unsigned int xrem1 = _x + _w - (x1 >> 3) * 3; // Pels wanted from the last pair
-+    const unsigned int mask = stride1 - 1;              // Only meaningful if stride1 is a power of 2
-+    const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2;
-+    const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2;  // RHS of a stripe to LHS of next in words
-+
-+#if HAVE_SAND_ASM
-+    if (_x == 0) {
-+        ff_rpi_sand30_lines_to_planar_c16(dst_u, dst_stride_u, dst_v, dst_stride_v,
-+                                       src, stride1, stride2, _x, y, _w, h);
-+        return;
-+    }
-+#endif
-+
-+    if (x0 == x1) {
-+        // The whole request lies inside one word pair: copy pels
-+        // [xskip0, xrem1) of that pair on every line.
-+        // Nothing is read or written when the width is zero.
-+        if (xskip0 < xrem1) {
-+            for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p0 += stride1)
-+            {
-+                const uint32_t * p = (const uint32_t *)p0;
-+                // Join the two 30-bit payloads so that field n sits at bit 10 * n
-+                const uint64_t p6 = (uint64_t)(p[0] & 0x3fffffff) |
-+                                    ((uint64_t)(p[1] & 0x3fffffff) << 30);
-+                uint16_t * du = (uint16_t *)dst_u;
-+                uint16_t * dv = (uint16_t *)dst_v;
-+
-+                for (unsigned int k = xskip0; k != xrem1; ++k) {
-+                    *du++ = (p6 >> (20 * k)) & 0x3ff;
-+                    *dv++ = (p6 >> (20 * k + 10)) & 0x3ff;
-+                }
-+            }
-+        }
-+        return;
-+    }
-+
-+    for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p0 += stride1)
-+    {
-+        unsigned int x = x0;
-+        const uint32_t * p = (const uint32_t *)p0;
-+        uint16_t * du = (uint16_t *)dst_u;
-+        uint16_t * dv = (uint16_t *)dst_v;
-+
-+        // Leading partial pair: drop the first xskip0 pels
-+        if (xskip0 != 0) {
-+            const uint32_t p3a = *p++;
-+            const uint32_t p3b = *p++;
-+
-+            if (xskip0 == 1)
-+            {
-+                *du++ = (p3a >> 20) & 0x3ff;
-+                *dv++ = (p3b >>  0) & 0x3ff;
-+            }
-+            *du++ = (p3b >> 10) & 0x3ff;
-+            *dv++ = (p3b >> 20) & 0x3ff;
-+
-+            // Crossing a stripe boundary: jump to the next stripe
-+            if (((x += 8) & mask) == 0)
-+                p += slice_inc;
-+        }
-+
-+        // Whole pairs: three U/V pels each
-+        while (x != x1) {
-+            const uint32_t p3a = *p++;
-+            const uint32_t p3b = *p++;
-+
-+            *du++ = p3a & 0x3ff;
-+            *dv++ = (p3a >> 10) & 0x3ff;
-+            *du++ = (p3a >> 20) & 0x3ff;
-+            *dv++ = p3b & 0x3ff;
-+            *du++ = (p3b >> 10) & 0x3ff;
-+            *dv++ = (p3b >> 20) & 0x3ff;
-+
-+            if (((x += 8) & mask) == 0)
-+                p += slice_inc;
-+        }
-+
-+        // Trailing partial pair: keep the first xrem1 pels
-+        if (xrem1 != 0) {
-+            const uint32_t p3a = *p++;
-+            const uint32_t p3b = *p++;
-+
-+            *du++ = p3a & 0x3ff;
-+            *dv++ = (p3a >> 10) & 0x3ff;
-+            if (xrem1 == 2)
-+            {
-+                *du++ = (p3a >> 20) & 0x3ff;
-+                *dv++ = p3b & 0x3ff;
-+            }
-+        }
-+    }
-+}
-+
-+// Fetches a single patch - offscreen fixup not done here
-+// w <= stride1
-+// Each 32-bit source word holds three 10-bit samples, lowest bits first;
-+// samples are reduced to 8 bits by dropping their bottom 2 bits
-+// _x & _w in pixels, strides in bytes
-+void av_rpi_sand30_to_planar_y8(uint8_t * dst, const unsigned int dst_stride,
-+                             const uint8_t * src,
-+                             unsigned int stride1, unsigned int stride2,
-+                             unsigned int _x, unsigned int y,
-+                             unsigned int _w, unsigned int h)
-+{
-+    const unsigned int x0 = (_x / 3) * 4; // Byte offset of the word
-+    const unsigned int xskip0 = _x - (x0 >> 2) * 3;     // Samples to skip in the first word
-+    const unsigned int x1 = ((_x + _w) / 3) * 4;        // Byte offset of the word holding the end
-+    const unsigned int xrem1 = _x + _w - (x1 >> 2) * 3; // Samples wanted from the last word
-+    const unsigned int mask = stride1 - 1;              // Only meaningful if stride1 is a power of 2
-+    const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2;
-+    const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2;  // RHS of a stripe to LHS of next in words
-+
-+#if HAVE_SAND_ASM
-+    if (_x == 0) {
-+        ff_rpi_sand30_lines_to_planar_y8(dst, dst_stride, src, stride1, stride2, _x, y, _w, h);
-+        return;
-+    }
-+#endif
-+
-+    if (x0 == x1) {
-+        // The whole request lies inside one word: copy samples
-+        // [xskip0, xrem1) of that word on every line.
-+        // Nothing is read or written when the width is zero.
-+        if (xskip0 < xrem1) {
-+            for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p0 += stride1)
-+            {
-+                const uint32_t p3 = *(const uint32_t *)p0;
-+                uint8_t * d = dst;
-+
-+                for (unsigned int k = xskip0; k != xrem1; ++k)
-+                    *d++ = (p3 >> (10 * k + 2)) & 0xff;
-+            }
-+        }
-+        return;
-+    }
-+
-+    for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p0 += stride1)
-+    {
-+        unsigned int x = x0;
-+        const uint32_t * p = (const uint32_t *)p0;
-+        uint8_t * d = dst;
-+
-+        // Leading partial word: drop the first xskip0 samples
-+        if (xskip0 != 0) {
-+            const uint32_t p3 = *p++;
-+
-+            if (xskip0 == 1)
-+                *d++ = (p3 >> 12) & 0xff;
-+            *d++ = (p3 >> 22) & 0xff;
-+
-+            // Crossing a stripe boundary: jump to the next stripe
-+            if (((x += 4) & mask) == 0)
-+                p += slice_inc;
-+        }
-+
-+        // Whole words: three samples each
-+        while (x != x1) {
-+            const uint32_t p3 = *p++;
-+            *d++ = (p3 >> 2) & 0xff;
-+            *d++ = (p3 >> 12) & 0xff;
-+            *d++ = (p3 >> 22) & 0xff;
-+
-+            if (((x += 4) & mask) == 0)
-+                p += slice_inc;
-+        }
-+
-+        // Trailing partial word: keep the first xrem1 samples
-+        if (xrem1 != 0) {
-+            const uint32_t p3 = *p;
-+
-+            *d++ = (p3 >> 2) & 0xff;
-+            if (xrem1 == 2)
-+                *d++ = (p3 >> 12) & 0xff;
-+        }
-+    }
-+}
-+
-+
-+
-+// Narrow a sand image with 16-bit samples to one with 8-bit samples; every
-+// sample goes through cpy16_to_8() (or the NEON stripe routine) using shr.
-+// w/h in pixels
-+void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2,
-+                         const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2,
-+                         unsigned int w, unsigned int h, const unsigned int shr)
-+{
-+    // Half a destination stripe, i.e. what one source stripe narrows to
-+    const unsigned int half = dst_stride1 / 2;
-+    unsigned int col = 0;
-+
-+    // Holds for the layouts currently in use
-+    av_assert0(dst_stride1 == src_stride1);
-+
-+    // Source and destination share stride1 and the source is the wider of
-+    // the two, so walking the source stripes always yields contiguous
-+    // destination writes.  The width is not copied exactly: it is rounded
-+    // up to the next source stripe, for which the destination always has
-+    // storage.
-+
-+#if ARCH_ARM && HAVE_NEON
-+    if (shr == 3 && src_stride1 == 128) {
-+        for (; col + half < w; col += dst_stride1) {
-+            uint8_t * const out = dst + col * dst_stride2;
-+            const uint8_t * const in_lo = src + col * 2 * src_stride2;
-+            const uint8_t * const in_hi = in_lo + src_stride1 * src_stride2;
-+
-+            ff_rpi_sand128b_stripe_to_8_10(out, in_lo, in_hi, h);
-+        }
-+    }
-+    else
-+#endif
-+    {
-+        for (; col + half < w; col += dst_stride1) {
-+            uint8_t * out = dst + col * dst_stride2;
-+            const uint8_t * in_lo = src + col * 2 * src_stride2;
-+            const uint8_t * in_hi = in_lo + src_stride1 * src_stride2;
-+            unsigned int rows = h;
-+
-+            // Two source stripes fill the two halves of one dest stripe
-+            while (rows-- != 0) {
-+                cpy16_to_8(out, in_lo, half, shr);
-+                cpy16_to_8(out + half, in_hi, half, shr);
-+                in_lo += src_stride1;
-+                in_hi += src_stride1;
-+                out += dst_stride1;
-+            }
-+        }
-+    }
-+
-+    // A final destination stripe that is only half populated
-+    if (col < w) {
-+        uint8_t * out = dst + col * dst_stride2;
-+        const uint8_t * in_lo = src + col * 2 * src_stride2;
-+        unsigned int rows = h;
-+
-+        while (rows-- != 0) {
-+            cpy16_to_8(out, in_lo, half, shr);
-+            in_lo += src_stride1;
-+            out += dst_stride1;
-+        }
-+    }
-+}
-+
-+// Convert the sand-format frame src into the already-allocated frame dst.
-+// dst->format selects the output layout.  The crop rectangle of src is
-+// applied during the copy and the crop fields of dst are cleared (this
-+// happens before the format check, so also on failure).
-+// Returns -1 for an unsupported src/dst format pair, otherwise the result
-+// of av_frame_copy_props().
-+// NOTE(review): the NV12 chroma calls pass x/2 together with an un-halved w;
-+// if the y8 helpers take x as an offset into the interleaved UV plane this
-+// looks wrong for a non-zero crop_left - confirm against the helpers.
-+int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src)
-+{
-+    const unsigned int stride1 = av_rpi_sand_frame_stride1(src);
-+    const unsigned int stride2 = av_rpi_sand_frame_stride2(src);
-+    const int w = av_frame_cropped_width(src);
-+    const int h = av_frame_cropped_height(src);
-+    const int x = src->crop_left;
-+    const int y = src->crop_top;
-+    const int in_fmt = src->format;
-+    const int out_fmt = dst->format;
-+
-+    // Cropping is done as part of the conversion
-+    dst->crop_top = 0;
-+    dst->crop_left = 0;
-+    dst->crop_bottom = 0;
-+    dst->crop_right = 0;
-+
-+    if (in_fmt == AV_PIX_FMT_SAND128 || in_fmt == AV_PIX_FMT_RPI4_8) {
-+        if (out_fmt == AV_PIX_FMT_YUV420P) {
-+            av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0],
-+                                     src->data[0], stride1, stride2,
-+                                     x, y, w, h);
-+            av_rpi_sand_to_planar_c8(dst->data[1], dst->linesize[1],
-+                                     dst->data[2], dst->linesize[2],
-+                                     src->data[1], stride1, stride2,
-+                                     x/2, y/2,  w/2, h/2);
-+        }
-+        else if (out_fmt == AV_PIX_FMT_NV12) {
-+            av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0],
-+                                     src->data[0], stride1, stride2,
-+                                     x, y, w, h);
-+            av_rpi_sand_to_planar_y8(dst->data[1], dst->linesize[1],
-+                                     src->data[1], stride1, stride2,
-+                                     x/2, y/2, w, h/2);
-+        }
-+        else {
-+            return -1;
-+        }
-+    }
-+    else if (in_fmt == AV_PIX_FMT_SAND64_10) {
-+        if (out_fmt != AV_PIX_FMT_YUV420P10)
-+            return -1;
-+
-+        av_rpi_sand_to_planar_y16(dst->data[0], dst->linesize[0],
-+                                 src->data[0], stride1, stride2,
-+                                 x*2, y, w*2, h);
-+        av_rpi_sand_to_planar_c16(dst->data[1], dst->linesize[1],
-+                                 dst->data[2], dst->linesize[2],
-+                                 src->data[1], stride1, stride2,
-+                                 x, y/2,  w, h/2);
-+    }
-+    else if (in_fmt == AV_PIX_FMT_RPI4_10) {
-+        if (out_fmt == AV_PIX_FMT_YUV420P10) {
-+            av_rpi_sand30_to_planar_y16(dst->data[0], dst->linesize[0],
-+                                     src->data[0], stride1, stride2,
-+                                     x, y, w, h);
-+            av_rpi_sand30_to_planar_c16(dst->data[1], dst->linesize[1],
-+                                     dst->data[2], dst->linesize[2],
-+                                     src->data[1], stride1, stride2,
-+                                     x/2, y/2, w/2, h/2);
-+        }
-+        else if (out_fmt == AV_PIX_FMT_NV12) {
-+            av_rpi_sand30_to_planar_y8(dst->data[0], dst->linesize[0],
-+                                     src->data[0], stride1, stride2,
-+                                     x, y, w, h);
-+            av_rpi_sand30_to_planar_y8(dst->data[1], dst->linesize[1],
-+                                     src->data[1], stride1, stride2,
-+                                     x/2, y/2, w, h/2);
-+        }
-+        else {
-+            return -1;
-+        }
-+    }
-+    else {
-+        return -1;
-+    }
-+
-+    return av_frame_copy_props(dst, src);
-+}
---- /dev/null
-+++ b/libavutil/rpi_sand_fns.h
-@@ -0,0 +1,188 @@
-+/*
-+Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
-+All rights reserved.
-+
-+Redistribution and use in source and binary forms, with or without
-+modification, are permitted provided that the following conditions are met:
-+    * Redistributions of source code must retain the above copyright
-+      notice, this list of conditions and the following disclaimer.
-+    * Redistributions in binary form must reproduce the above copyright
-+      notice, this list of conditions and the following disclaimer in the
-+      documentation and/or other materials provided with the distribution.
-+    * Neither the name of the copyright holder nor the
-+      names of its contributors may be used to endorse or promote products
-+      derived from this software without specific prior written permission.
-+
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
-+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+
-+Authors: John Cox
-+*/
-+
-+#ifndef AVUTIL_RPI_SAND_FNS
-+#define AVUTIL_RPI_SAND_FNS
-+
-+#include "libavutil/frame.h"
-+
-+// For all these fns _x & _w are measured as coord * PW
-+// For the C fns coords are in chroma pels (so luma / 2)
-+// Strides are in bytes
-+// NOTE(review): PW is not defined in this header - presumably the sample
-+// width in bytes; confirm against the implementations.
-+
-+// Sand -> planar, single destination plane ("y" variants)
-+void av_rpi_sand_to_planar_y8(uint8_t * dst, const unsigned int dst_stride,
-+                             const uint8_t * src,
-+                             unsigned int stride1, unsigned int stride2,
-+                             unsigned int _x, unsigned int y,
-+                             unsigned int _w, unsigned int h);
-+void av_rpi_sand_to_planar_y16(uint8_t * dst, const unsigned int dst_stride,
-+                             const uint8_t * src,
-+                             unsigned int stride1, unsigned int stride2,
-+                             unsigned int _x, unsigned int y,
-+                             unsigned int _w, unsigned int h);
-+
-+// Sand -> planar, separate U and V destination planes ("c" variants)
-+void av_rpi_sand_to_planar_c8(uint8_t * dst_u, const unsigned int dst_stride_u,
-+                             uint8_t * dst_v, const unsigned int dst_stride_v,
-+                             const uint8_t * src,
-+                             unsigned int stride1, unsigned int stride2,
-+                             unsigned int _x, unsigned int y,
-+                             unsigned int _w, unsigned int h);
-+void av_rpi_sand_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u,
-+                             uint8_t * dst_v, const unsigned int dst_stride_v,
-+                             const uint8_t * src,
-+                             unsigned int stride1, unsigned int stride2,
-+                             unsigned int _x, unsigned int y,
-+                             unsigned int _w, unsigned int h);
-+
-+// Planar -> sand: separate U and V source planes into one sand plane (dst_c)
-+void av_rpi_planar_to_sand_c8(uint8_t * dst_c,
-+                             unsigned int stride1, unsigned int stride2,
-+                             const uint8_t * src_u, const unsigned int src_stride_u,
-+                             const uint8_t * src_v, const unsigned int src_stride_v,
-+                             unsigned int _x, unsigned int y,
-+                             unsigned int _w, unsigned int h);
-+void av_rpi_planar_to_sand_c16(uint8_t * dst_c,
-+                             unsigned int stride1, unsigned int stride2,
-+                             const uint8_t * src_u, const unsigned int src_stride_u,
-+                             const uint8_t * src_v, const unsigned int src_stride_v,
-+                             unsigned int _x, unsigned int y,
-+                             unsigned int _w, unsigned int h);
-+
-+// Sand30 -> planar with 16-bit destination samples
-+void av_rpi_sand30_to_planar_y16(uint8_t * dst, const unsigned int dst_stride,
-+                             const uint8_t * src,
-+                             unsigned int stride1, unsigned int stride2,
-+                             unsigned int _x, unsigned int y,
-+                             unsigned int _w, unsigned int h);
-+void av_rpi_sand30_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u,
-+                             uint8_t * dst_v, const unsigned int dst_stride_v,
-+                             const uint8_t * src,
-+                             unsigned int stride1, unsigned int stride2,
-+                             unsigned int _x, unsigned int y,
-+                             unsigned int _w, unsigned int h);
-+
-+// Sand30 -> planar with 8-bit destination samples
-+void av_rpi_sand30_to_planar_y8(uint8_t * dst, const unsigned int dst_stride,
-+                             const uint8_t * src,
-+                             unsigned int stride1, unsigned int stride2,
-+                             unsigned int _x, unsigned int y,
-+                             unsigned int _w, unsigned int h);
-+
-+// Sand with 16-bit samples -> sand with 8-bit samples; shr is the shift
-+// handed to the per-sample conversion
-+// w/h in pixels
-+void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2,
-+                         const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2,
-+                         unsigned int w, unsigned int h, const unsigned int shr);
-+
-+
-+// dst must contain required pixel format & allocated data buffers
-+// Cropping on the src buffer will be honoured and dst crop will be set to zero
-+int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src);
-+
-+
-+// First sand stride of a frame: linesize[0], or the constant 128 when the
-+// build is restricted to 128-byte sand formats.
-+static inline unsigned int av_rpi_sand_frame_stride1(const AVFrame * const frame)
-+{
-+#ifndef RPI_ZC_SAND128_ONLY
-+    return frame->linesize[0];
-+#else
-+    // Only 128-byte sand formats are supported in this configuration, so a
-+    // constant replaces the variable and should allow better optimisation
-+    return 128;
-+#endif
-+}
-+
-+// Second sand stride of a frame, which is kept in linesize[3].
-+static inline unsigned int av_rpi_sand_frame_stride2(const AVFrame * const frame)
-+{
-+    const unsigned int stride2 = frame->linesize[3];
-+    return stride2;
-+}
-+
-+
-+// Non-zero when format lies in the range AV_PIX_FMT_SAND128 ..
-+// AV_PIX_FMT_RPI4_10 (inclusive).
-+static inline int av_rpi_is_sand_format(const int format)
-+{
-+    if (format < AV_PIX_FMT_SAND128)
-+        return 0;
-+    return format <= AV_PIX_FMT_RPI4_10;
-+}
-+
-+// Non-zero when the frame's pixel format is one of the sand formats.
-+static inline int av_rpi_is_sand_frame(const AVFrame * const frame)
-+{
-+    const int fmt = frame->format;
-+    return av_rpi_is_sand_format(fmt);
-+}
-+
-+// Non-zero for the formats AV_PIX_FMT_SAND128 and AV_PIX_FMT_RPI4_8.
-+static inline int av_rpi_is_sand8_frame(const AVFrame * const frame)
-+{
-+    switch (frame->format) {
-+        case AV_PIX_FMT_SAND128:
-+        case AV_PIX_FMT_RPI4_8:
-+            return 1;
-+        default:
-+            return 0;
-+    }
-+}
-+
-+// Non-zero when the frame format lies in the range AV_PIX_FMT_SAND64_10 ..
-+// AV_PIX_FMT_SAND64_16 (inclusive).
-+static inline int av_rpi_is_sand16_frame(const AVFrame * const frame)
-+{
-+    const int fmt = frame->format;
-+
-+    if (fmt < AV_PIX_FMT_SAND64_10)
-+        return 0;
-+    return fmt <= AV_PIX_FMT_SAND64_16;
-+}
-+
-+// Non-zero only for the format AV_PIX_FMT_RPI4_10.
-+static inline int av_rpi_is_sand30_frame(const AVFrame * const frame)
-+{
-+    const int fmt = frame->format;
-+    return fmt == AV_PIX_FMT_RPI4_10;
-+}
-+
-+// Left shift applied to an x coordinate by the offset helpers:
-+// 0 for sand8 frames, 1 for every other frame.
-+static inline int av_rpi_sand_frame_xshl(const AVFrame * const frame)
-+{
-+    if (av_rpi_is_sand8_frame(frame))
-+        return 0;
-+    return 1;
-+}
-+
-+// Offset from the start of the luma plane to pel (x_y, y).
-+// With x in bytes rather than pixels the same sum would serve sand64_16 as
-+// well as sand128; here the byte position is derived from the frame format.
-+
-+static inline unsigned int av_rpi_sand_frame_off_y(const AVFrame * const frame, const unsigned int x_y, const unsigned int y)
-+{
-+    const unsigned int s1 = av_rpi_sand_frame_stride1(frame);
-+    const unsigned int s2 = av_rpi_sand_frame_stride2(frame);
-+    const unsigned int xbytes = x_y << av_rpi_sand_frame_xshl(frame);
-+    const unsigned int in_stripe = xbytes & (s1 - 1);   // part below stride1
-+    const unsigned int stripe_base = xbytes & ~(s1 - 1); // remaining high bits
-+
-+    return s2 * stripe_base + s1 * y + in_stripe;
-+}
-+
-+// Offset from the start of the chroma plane to chroma pel (x_c, y_c).
-+// x is shifted one bit further than for luma.
-+static inline unsigned int av_rpi_sand_frame_off_c(const AVFrame * const frame, const unsigned int x_c, const unsigned int y_c)
-+{
-+    const unsigned int s1 = av_rpi_sand_frame_stride1(frame);
-+    const unsigned int s2 = av_rpi_sand_frame_stride2(frame);
-+    const unsigned int xbytes = x_c << (av_rpi_sand_frame_xshl(frame) + 1);
-+    const unsigned int in_stripe = xbytes & (s1 - 1);   // part below stride1
-+    const unsigned int stripe_base = xbytes & ~(s1 - 1); // remaining high bits
-+
-+    return s2 * stripe_base + s1 * y_c + in_stripe;
-+}
-+
-+// Address of luma pel (x, y): data[0] plus the luma offset.
-+static inline uint8_t * av_rpi_sand_frame_pos_y(const AVFrame * const frame, const unsigned int x, const unsigned int y)
-+{
-+    const unsigned int off = av_rpi_sand_frame_off_y(frame, x, y);
-+    return frame->data[0] + off;
-+}
-+
-+// Address of chroma pel (x, y): data[1] plus the chroma offset.
-+static inline uint8_t * av_rpi_sand_frame_pos_c(const AVFrame * const frame, const unsigned int x, const unsigned int y)
-+{
-+    const unsigned int off = av_rpi_sand_frame_off_c(frame, x, y);
-+    return frame->data[1] + off;
-+}
-+
-+#endif
-+
---- /dev/null
-+++ b/pi-util/BUILD.txt
-@@ -0,0 +1,59 @@
-+Building Pi FFmpeg
-+==================
-+
-+Currently only building on a Pi is supported.
-+This builds ffmpeg the way I've tested it
-+
-+Get all dependencies - the current package dependencies are good enough
-+
-+$ sudo apt-get build-dep ffmpeg
-+
-+Configure using the pi-util/conf_native.sh script
-+-------------------------------------------------
-+
-+This sets the normal release options and creates an output dir to build into.
-+The directory name will depend on system and options but will be under out/
-+
-+There are a few choices here
-+ --mmal  build including the legacy mmal-based decoders and zero-copy code
-+         this requires appropriate libraries which currently will exist for
-+         armv7 but not arm64
-+ --noshared
-+         Build a static image rather than a shared library one.  Static is
-+         easier for testing as there is no need to worry about library
-+         paths being confused and therefore running the wrong code.  Shared
-+         is what is needed, in most cases, when building for use by other
-+         programs.
-+
-+So for a static build
-+---------------------
-+
-+$ pi-util/conf_native.sh --noshared
-+
-+$ make -j8 -C out/<wherever the script said it was building to>
-+
-+You can now run ffmpeg directly from where it was built
-+
-+For a shared build
-+------------------
-+
-+$ pi-util/conf_native.sh
-+
-+You will normally want an install target if shared. Note that the script has
-+set this up to be generated in out/<builddir>/install, you don't have to worry
-+about overwriting your system libs.
-+
-+$ make -j8 -C out/<builddir> install
-+
-+You can now set LD_LIBRARY_PATH appropriately and run ffmpeg from where it was
-+built or install the image on the system - you have to be careful to get rid
-+of all other ffmpeg libs or confusion may result.  There is a little script
-+that wipes all other versions - obviously use with care!
-+
-+$ sudo pi-util/clean_usr_libs.sh
-+
-+Then simply copying from the install to /usr works
-+
-+$ sudo cp -r out/<builddir>/install/* /usr
-+
-+
---- /dev/null
-+++ b/pi-util/NOTES.txt
-@@ -0,0 +1,69 @@
-+Notes on the hevc_rpi decoder & associated support code
-+-------------------------------------------------------
-+
-+There are 3 main parts to the existing code:
-+
-+1) The decoder - this is all in libavcodec as rpi_hevc*.
-+
-+2) A few filters to deal with Sand frames and a small patch to
-+automatically select the sand->i420 converter when required.
-+
-+3) A kludge in ffmpeg.c to display the decoded video. This could & should
-+be converted into a proper ffmpeg display module.
-+
-+
-+Decoder
-+-------
-+
-+The decoder is a modified version of the existing ffmpeg hevc decoder.
-+Generally it is ~100% faster than the existing ffmpeg hevc s/w decoder.
-+More complex bitstreams can be up to ~200% faster but particularly easy
-+streams can cut its advantage down to ~50%.  This means that a Pi3+ can
-+display nearly all 8-bit 1080p30 streams and with some overclocking it can
-+display most lower bitrate 10-bit 1080p30 streams - this latter case is
-+not helped by the requirement to downsample to 8-bit before display on a
-+Pi.
-+
-+It has had co-processor offload added for inter-pred and large block
-+residual transform.  Various parts have had optimized ARM NEON assembler
-+added and the existing ARM asm sections have been profiled and
-+re-optimized for A53. The main C code has been substantially reworked at
-+its lower levels in an attempt to optimize it and minimize memory
-+bandwidth. To some extent code paths that deal with frame types that it
-+doesn't support have been pruned.
-+
-+It outputs frames in Broadcom Sand format. This is a somewhat annoying
-+layout that doesn't fit into ffmpeg's standard frame descriptions. It has
-+vertical stripes of 128 horizontal pixels (64 in 10 bit forms) with Y for
-+the stripe followed by interleaved U & V, that is then followed by the Y
-+for the next stripe, etc. The final stripe is always padded to
-+stripe-width. This is used in an attempt to help with cache locality and
-+cut down on the number of dram bank switches. It is annoying to use for
-+inter-pred with conventional processing but the way the Pi QPU (which is
-+used for inter-pred) works means that it has negligible downsides here and
-+the improved memory performance exceeds the overhead of the increased
-+complexity in the rest of the code.
-+
-+Frames must be allocated out of GPU memory (as otherwise they can't be
-+accessed by the co-processors). Utility functions (in rpi_zc.c) have been
-+written to make this easier. As the frames are already in GPU memory they
-+can be displayed by the Pi h/w without any further copying.
-+
-+
-+Known non-features
-+------------------
-+
-+Frame allocation should probably be done in some other way in order to fit
-+into the standard framework better.
-+
-+Sand frames are currently declared as software frames, there is an
-+argument that they should be hardware frames but they aren't really.
-+
-+There must be a better way of auto-selecting the hevc_rpi decoder over the
-+normal s/w hevc decoder, but I became confused by the existing h/w
-+acceleration framework and what I wanted to do didn't seem to fit in
-+neatly.
-+
-+Display should be a proper device rather than a kludge in ffmpeg.c
-+
-+
---- /dev/null
-+++ b/pi-util/TESTMESA.txt
-@@ -0,0 +1,82 @@
-+# Setup & Build instructions for testing Argon30 mesa support (on Pi4)
-+
-+# These assume that the drm_mmal test for Sand8 has been built on this Pi
-+# as build relies on many of the same files
-+
-+# 1st get everything required to build ffmpeg
-+# If sources aren't already enabled on your Pi then enable them
-+sudo su
-+sed "s/#deb-src/deb-src/" /etc/apt/sources.list > /tmp/sources.list
-+sed "s/#deb-src/deb-src/" /etc/apt/sources.list.d/raspi.list > /tmp/raspi.list
-+mv /tmp/sources.list /etc/apt/
-+mv /tmp/raspi.list /etc/apt/sources.list.d/
-+apt update
-+
-+# Get dependencies
-+sudo apt build-dep ffmpeg
-+
-+sudo apt install meson libepoxy-dev libxcb-dri3-dev libxcb1-dev libx11-dev libx11-xcb-dev libdrm-dev
-+
-+# Enable H265 V4L2 request decoder
-+sudo su
-+echo dtoverlay=rpivid-v4l2 >> /boot/config.txt
-+# You may also want to add more CMA if you are going to try 4k videos
-+# Change the dtoverlay=vc4-fkms-v3d line in config.txt to read
-+# dtoverlay=vc4-fkms-v3d,cma-512
-+reboot
-+# Check it has turned up
-+ls -la /dev/video*
-+# This should include video19
-+# crw-rw----+ 1 root video 81, 7 Aug  4 17:25 /dev/video19
-+
-+# Currently on the Pi the linux headers from the debian distro don't match
-+# the kernel that we ship and we need to update them - hopefully this step
-+# will be unneeded in the future
-+sudo apt install git bc bison flex libssl-dev make
-+git clone --depth=1 https://github.com/raspberrypi/linux --branch rpi-5.10.y
-+cd linux
-+KERNEL=kernel7l
-+make bcm2711_defconfig
-+make headers_install
-+sudo cp -r usr/include/linux /usr/include
-+cd ..
-+
-+# Config - this builds a statically linked ffmpeg which is easier for testing
-+pi-util/conf_native.sh --noshared
-+
-+# Build (this is a bit dull)
-+# If you want to poke the source the libavdevice/egl_vout.c contains the
-+# output code -
-+cd out/armv7-static-rel
-+
-+# Check that you have actually configured V4L2 request
-+grep HEVC_V4L2REQUEST config.h
-+# You are hoping for
-+# #define CONFIG_HEVC_V4L2REQUEST_HWACCEL 1
-+# if you get 0 then the config has failed
-+
-+make -j6
-+
-+# Grab test streams
-+wget http://www.jell.yfish.us/media/jellyfish-3-mbps-hd-h264.mkv
-+wget http://www.jell.yfish.us/media/jellyfish-3-mbps-hd-hevc.mkv
-+wget http://www.jell.yfish.us/media/jellyfish-3-mbps-hd-hevc-10bit.mkv
-+
-+# Test i420 output (works currently)
-+./ffmpeg -no_cvt_hw -vcodec h264_v4l2m2m -i jellyfish-3-mbps-hd-h264.mkv -f vout_egl -
-+
-+# Test Sand8 output - doesn't currently work but should once you have
-+# Sand8 working in drm_mmal. I can't guarantee that this will work as
-+# I can't test this path with a known working format, but the debug looks
-+# good.  If this doesn't work & drm_mmal does with sand8 then come back to me
-+# The "show_all 1" forces vout to display every frame otherwise it drops any
-+# frame that would cause it to block
-+./ffmpeg -no_cvt_hw -hwaccel drm -vcodec hevc -i jellyfish-3-mbps-hd-hevc.mkv -show_all 1 -f vout_egl -
-+
-+# Test Sand30 - doesn't currently work
-+# (Beware that when FFmpeg errors out it often leaves your terminal window
-+# in a state where you need to reset it)
-+./ffmpeg -no_cvt_hw -hwaccel drm -vcodec hevc -i jellyfish-3-mbps-hd-hevc-10bit.mkv -f vout_egl -
-+
-+
-+
---- /dev/null
-+++ b/pi-util/clean_usr_libs.sh
-@@ -0,0 +1,26 @@
-+#!/bin/sh
-+# Remove every installed copy of the FFmpeg shared/static libraries from the
-+# system library directories listed below, so that a freshly built set can
-+# be copied into /usr without stale versions being picked up.
-+# Destructive: meant to be run as root, and only deliberately.
-+set -e
-+
-+for libdir in \
-+    /usr/lib/arm-linux-gnueabihf \
-+    /usr/lib/arm-linux-gnueabihf/neon/vfp \
-+    /usr/lib/aarch64-linux-gnu
-+do
-+    for lib in libavcodec libavdevice libavfilter libavformat libavutil libswresample libswscale
-+    do
-+        # An unmatched glob is passed literally; rm -f ignores the missing file
-+        rm -f -- "$libdir/$lib".*
-+    done
-+done
-+
---- /dev/null
-+++ b/pi-util/conf_arm64_native.sh
-@@ -0,0 +1,45 @@
-+#!/bin/sh
-+# Configure an out-of-tree native ARM64 build.
-+#
-+# Usage: pi-util/conf_arm64_native.sh [--noshared]
-+#   --noshared  build static libraries instead of shared ones
-+#
-+# Must be run from the top of the source tree: the build directory is created
-+# under out/ and ../../configure is run from inside it, with the install
-+# prefix set to <builddir>/install.
-+echo "Configure for ARM64 native build"
-+
-+#RPI_KEEPS="-save-temps=obj"
-+
-+SHARED_LIBS="--enable-shared"
-+# "=" is the only string-equality operator POSIX test defines; "==" is
-+# rejected by dash, which made --noshared a no-op when run under sh
-+if [ "${1:-}" = "--noshared" ]; then
-+  SHARED_LIBS="--disable-shared"
-+  echo Static libs
-+  OUT=out/arm64-static-rel
-+else
-+  echo Shared libs
-+  OUT=out/arm64-shared-rel
-+fi
-+
-+# Stop here rather than run configure from the wrong directory
-+mkdir -p "$OUT" || exit 1
-+cd "$OUT" || exit 1
-+
-+A=aarch64-linux-gnu
-+USR_PREFIX=$(pwd)/install
-+LIB_PREFIX=$USR_PREFIX/lib/$A
-+INC_PREFIX=$USR_PREFIX/include/$A
-+
-+../../configure \
-+ --prefix="$USR_PREFIX" \
-+ --libdir="$LIB_PREFIX" \
-+ --incdir="$INC_PREFIX" \
-+ --disable-stripping \
-+ --disable-thumb \
-+ --disable-mmal \
-+ --enable-sand \
-+ --enable-v4l2-request \
-+ --enable-libdrm \
-+ --enable-epoxy \
-+ --enable-libudev \
-+ --enable-vout-drm \
-+ --enable-vout-egl \
-+ "$SHARED_LIBS" \
-+ --extra-cflags="-ggdb"
-+
-+# --enable-decoder=hevc_rpi\
-+# --enable-extra-warnings\
-+# --arch=armv71\
-+
-+# gcc option for getting asm listing
-+# -Wa,-ahls
---- /dev/null
-+++ b/pi-util/conf_h265.2016.csv
-@@ -0,0 +1,195 @@
-+1,HEVC_v1/AMP_A_Samsung_7,AMP_A_Samsung_7.bin,AMP_A_Samsung_7.md5,8
-+1,HEVC_v1/AMP_B_Samsung_7,AMP_B_Samsung_7.bin,AMP_B_Samsung_7.md5,8
-+1,HEVC_v1/AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5,8
-+1,HEVC_v1/AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5,8
-+1,HEVC_v1/AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5,8
-+1,HEVC_v1/AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5,8
-+1,HEVC_v1/AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5,8
-+1,HEVC_v1/AMVP_C_Samsung_7,AMVP_C_Samsung_7.bin,AMVP_C_Samsung_7.md5,8
-+1,HEVC_v1/BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5,8
-+1,HEVC_v1/CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5,8
-+1,HEVC_v1/CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5,8
-+1,HEVC_v1/CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5,8
-+1,HEVC_v1/CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5,8
-+1,HEVC_v1/CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5,8
-+1,HEVC_v1/CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5,8
-+1,HEVC_v1/CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5,8
-+1,HEVC_v1/CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5,8
-+1,HEVC_v1/CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5,8
-+1,HEVC_v1/cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5,8
-+1,HEVC_v1/CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5,8
-+1,HEVC_v1/CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5,8
-+1,HEVC_v1/DBLK_A_MAIN10_VIXS_4,DBLK_A_MAIN10_VIXS_4.bit,DBLK_A_MAIN10_VIXS_4.md5,10
-+1,HEVC_v1/DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5,8
-+1,HEVC_v1/DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5,8
-+1,HEVC_v1/DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5,8
-+1,HEVC_v1/DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5,8
-+1,HEVC_v1/DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5,8
-+1,HEVC_v1/DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5,8
-+1,HEVC_v1/DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5,8
-+1,HEVC_v1/DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5,8
-+1,HEVC_v1/DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5,8
-+1,HEVC_v1/DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5,8
-+1,HEVC_v1/DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5,8
-+1,HEVC_v1/DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5,8
-+1,HEVC_v1/DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5,8
-+1,HEVC_v1/ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5,8
-+1,HEVC_v1/ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5,8
-+1,HEVC_v1/ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5,8
-+1,HEVC_v1/EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5,8
-+1,HEVC_v1/FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5,8
-+1,HEVC_v1/HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5,8
-+1,HEVC_v1/INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5,8
-+1,HEVC_v1/INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5,10
-+1,HEVC_v1/ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5,8
-+1,HEVC_v1/ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5,8
-+1,HEVC_v1/ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5,8
-+1,HEVC_v1/ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5,8
-+1,HEVC_v1/ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5,8
-+1,HEVC_v1/IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5,8
-+1,HEVC_v1/IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5,8
-+1,HEVC_v1/IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5,8
-+1,HEVC_v1/LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5,8
-+1,HEVC_v1/LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5,8
-+1,HEVC_v1/LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5,8
-+1,HEVC_v1/MAXBINS_A_TI_5,MAXBINS_A_TI_5.bit,MAXBINS_A_TI_5_yuv.md5,8
-+1,HEVC_v1/MAXBINS_B_TI_5,MAXBINS_B_TI_5.bit,MAXBINS_B_TI_5_yuv.md5,8
-+1,HEVC_v1/MAXBINS_C_TI_5,MAXBINS_C_TI_5.bit,MAXBINS_C_TI_5_yuv.md5,8
-+1,HEVC_v1/MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5,8
-+1,HEVC_v1/MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5,8
-+1,HEVC_v1/MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5,8
-+1,HEVC_v1/MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5,8
-+1,HEVC_v1/MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5,8
-+1,HEVC_v1/MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5,8
-+1,HEVC_v1/MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5,8
-+1,HEVC_v1/MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5,8
-+1,HEVC_v1/MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5,8
-+1,HEVC_v1/MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5,8
-+1,HEVC_v1/NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5,8
-+1,HEVC_v1/NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5,8
-+1,HEVC_v1/NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5,8
-+1,HEVC_v1/OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5,8
-+1,HEVC_v1/OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5,8
-+1,HEVC_v1/OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5,8
-+1,HEVC_v1/PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5,8
-+1,HEVC_v1/PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5,8
-+1,HEVC_v1/PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5,8
-+1,HEVC_v1/PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5,8
-+1,HEVC_v1/PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5,8
-+1,HEVC_v1/PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5,8
-+1,HEVC_v1/PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5,8
-+1,HEVC_v1/PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5,8
-+1,HEVC_v1/PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5,8
-+1,HEVC_v1/POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5,8
-+1,HEVC_v1/PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5,8
-+1,HEVC_v1/PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5,8
-+1,HEVC_v1/RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5,8
-+1,HEVC_v1/RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5,8
-+1,HEVC_v1/RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5,8
-+1,HEVC_v1/RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5,8
-+1,HEVC_v1/RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5,8
-+1,HEVC_v1/RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5,8
-+1,HEVC_v1/RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5,8
-+1,HEVC_v1/RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5,8
-+1,HEVC_v1/RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5,8
-+1,HEVC_v1/RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5,8
-+1,HEVC_v1/RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5,8
-+1,HEVC_v1/RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5,8
-+1,HEVC_v1/RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5,8
-+1,HEVC_v1/RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5,8
-+1,HEVC_v1/RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5,8
-+1,HEVC_v1/RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5,8
-+1,HEVC_v1/RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5,8
-+1,HEVC_v1/SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5,8
-+1,HEVC_v1/SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5,8
-+1,HEVC_v1/SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5,8
-+1,HEVC_v1/SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5,8
-+1,HEVC_v1/SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5,8
-+1,HEVC_v1/SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5,8
-+1,HEVC_v1/SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5,8
-+1,HEVC_v1/SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5,8
-+1,HEVC_v1/SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt,8
-+1,HEVC_v1/SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt,8
-+1,HEVC_v1/SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5,8
-+1,HEVC_v1/SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5,8
-+1,HEVC_v1/SLIST_A_Sony_5,SLIST_A_Sony_5.bin,SLIST_A_Sony_5_yuv.md5,8
-+1,HEVC_v1/SLIST_B_Sony_9,SLIST_B_Sony_9.bin,SLIST_B_Sony_9_yuv.md5,8
-+1,HEVC_v1/SLIST_C_Sony_4,SLIST_C_Sony_4.bin,SLIST_C_Sony_4_yuv.md5,8
-+1,HEVC_v1/SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5,8
-+1,HEVC_v1/SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5,8
-+1,HEVC_v1/STRUCT_A_Samsung_7,STRUCT_A_Samsung_7.bin,STRUCT_A_Samsung_7.md5,8
-+1,HEVC_v1/STRUCT_B_Samsung_7,STRUCT_B_Samsung_7.bin,STRUCT_B_Samsung_7.md5,8
-+1,HEVC_v1/TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5,8
-+1,HEVC_v1/TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5,8
-+1,HEVC_v1/TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5,8
-+1,HEVC_v1/TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5,8
-+1,HEVC_v1/TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5,8
-+1,HEVC_v1/TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5,8
-+3,HEVC_v1/TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # unequal bit depth,10
-+1,HEVC_v1/TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5,8
-+1,HEVC_v1/VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5,8
-+3,HEVC_v1/VPSSPSPPS_A_MainConcept_1,VPSSPSPPS_A_MainConcept_1.bin,VPSSPSPPS_A_MainConcept_1_md5.txt, # ???,8
-+1,HEVC_v1/WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5,10
-+1,HEVC_v1/WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5,8
-+1,HEVC_v1/WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5,8
-+1,HEVC_v1/WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5,10
-+1,HEVC_v1/WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5,10
-+1,HEVC_v1/WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5,8
-+1,HEVC_v1/WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5,10
-+1,HEVC_v1/WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5,8
-+1,HEVC_v1/WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5,10
-+1,HEVC_v1/WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5,8
-+1,HEVC_v1/WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5,10
-+1,HEVC_v1/WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5,8
-+1,HEVC_v1/WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5,10
-+1,HEVC_v1/WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5,8
-+1,HEVC_v1/WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5,10
-+1,HEVC_v1/WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5,8
-+1,RExt/ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_2,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_2.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_yuv_2.md5,0
-+0,RExt/Bitdepth_A_RExt_Sony_1,Bitdepth_A_RExt_Sony_1.bin,md5sum.txt,8
-+0,RExt/Bitdepth_B_RExt_Sony_1,Bitdepth_B_RExt_Sony_1.bin,md5sum.txt,8
-+0,RExt/CCP_10bit_RExt_QCOM,CCP_10bit_RExt_QCOM.bin,CCP_10bit_RExt_QCOM_md5sum.txt,10
-+0,RExt/CCP_12bit_RExt_QCOM,CCP_12bit_RExt_QCOM.bin,CCP_12bit_RExt_QCOM_md5sum.txt,8
-+0,RExt/CCP_8bit_RExt_QCOM,CCP_8bit_RExt_QCOM.bin,CCP_8bit_RExt_QCOM_md5sum.txt,8
-+1,RExt/ExplicitRdpcm_A_BBC_1,ExplicitRdpcm_A_BBC_1.bit,md5sum.txt,0
-+0,RExt/ExplicitRdpcm_B_BBC_2,ExplicitRdpcm_B_BBC_1.bit,md5sum.txt,8
-+0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1.md5,10
-+0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1.md5,8
-+0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1.md5,8
-+0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1.md5,8
-+0,RExt/EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1.md5,10
-+0,RExt/EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1.md5,8
-+0,RExt/EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1.md5,8
-+0,RExt/EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1.md5,8
-+1,RExt/GENERAL_10b_420_RExt_Sony_1,GENERAL_10b_420_RExt_Sony_1.bit,GENERAL_10b_420_RExt_Sony_1.md5,10
-+1,RExt/GENERAL_10b_422_RExt_Sony_1,GENERAL_10b_422_RExt_Sony_1.bit,GENERAL_10b_422_RExt_Sony_1.md5,0
-+1,RExt/GENERAL_10b_444_RExt_Sony_2,GENERAL_10b_444_RExt_Sony_2.bit,GENERAL_10b_444_RExt_Sony_2.md5,0
-+1,RExt/GENERAL_12b_400_RExt_Sony_1,GENERAL_12b_400_RExt_Sony_1.bit,GENERAL_12b_400_RExt_Sony_1.md5,0
-+1,RExt/GENERAL_12b_420_RExt_Sony_1,GENERAL_12b_420_RExt_Sony_1.bit,GENERAL_12b_420_RExt_Sony_1.md5,0
-+1,RExt/GENERAL_12b_422_RExt_Sony_1,GENERAL_12b_422_RExt_Sony_1.bit,GENERAL_12b_422_RExt_Sony_1.md5,0
-+1,RExt/GENERAL_12b_444_RExt_Sony_2,GENERAL_12b_444_RExt_Sony_2.bit,GENERAL_12b_444_RExt_Sony_2.md5,0
-+0,RExt/GENERAL_16b_400_RExt_Sony_1,GENERAL_16b_400_RExt_Sony_1.bit,GENERAL_16b_400_RExt_Sony_1.md5,0
-+0,RExt/GENERAL_16b_444_highThroughput_RExt_Sony_2,GENERAL_16b_444_highThroughput_RExt_Sony_2.bit,GENERAL_16b_444_highThroughput_RExt_Sony_2.md5,8
-+0,RExt/GENERAL_16b_444_RExt_Sony_2,GENERAL_16b_444_RExt_Sony_2.bit,GENERAL_16b_444_RExt_Sony_2.md5,8
-+1,RExt/GENERAL_8b_400_RExt_Sony_1,GENERAL_8b_400_RExt_Sony_1.bit,GENERAL_8b_400_RExt_Sony_1.md5,0
-+1,RExt/GENERAL_8b_420_RExt_Sony_1,GENERAL_8b_420_RExt_Sony_1.bit,GENERAL_8b_420_RExt_Sony_1.md5,8
-+1,RExt/GENERAL_8b_444_RExt_Sony_2,GENERAL_8b_444_RExt_Sony_2.bit,GENERAL_8b_444_RExt_Sony_2.md5,0
-+1,RExt/IPCM_A_RExt_NEC_2,IPCM_A_RExt_NEC_2.bit,IPCM_A_RExt_NEC_2_yuv.md5,0
-+1,RExt/IPCM_B_RExt_NEC,IPCM_B_RExt_NEC.bit,IPCM_B_RExt_NEC_yuv.md5,0
-+1,RExt/Main_422_10_A_RExt_Sony_2,Main_422_10_A_RExt_Sony_2.bin,md5sum.txt,0
-+1,RExt/Main_422_10_B_RExt_Sony_2,Main_422_10_B_RExt_Sony_2.bin,md5sum.txt,0
-+1,RExt/PERSIST_RPARAM_A_RExt_Sony_3,PERSIST_RPARAM_A_RExt_Sony_3.bit,PERSIST_RPARAM_A_RExt_Sony_3.md5,0
-+1,RExt/QMATRIX_A_RExt_Sony_1,QMATRIX_A_RExt_Sony_1.bit,QMATRIX_A_RExt_Sony_1.md5,0
-+0,RExt/SAO_A_RExt_MediaTek_1,SAO_A_RExt_MediaTek_1.bit,SAO_A_RExt_MediaTek_1.md5, # Runs out of memory - could be fixed,8
-+0,RExt/TSCTX_10bit_I_RExt_SHARP_1,TSCTX_10bit_I_RExt_SHARP_1.bin,TSCTX_10bit_I_RExt_SHARP_1.md5,10
-+0,RExt/TSCTX_10bit_RExt_SHARP_1,TSCTX_10bit_RExt_SHARP_1.bin,TSCTX_10bit_RExt_SHARP_1.md5,10
-+0,RExt/TSCTX_12bit_I_RExt_SHARP_1,TSCTX_12bit_I_RExt_SHARP_1.bin,TSCTX_12bit_I_RExt_SHARP_1.md5,8
-+0,RExt/TSCTX_12bit_RExt_SHARP_1,TSCTX_12bit_RExt_SHARP_1.bin,TSCTX_12bit_RExt_SHARP_1.md5,8
-+0,RExt/TSCTX_8bit_I_RExt_SHARP_1,TSCTX_8bit_I_RExt_SHARP_1.bin,TSCTX_8bit_I_RExt_SHARP_1.md5,8
-+0,RExt/TSCTX_8bit_RExt_SHARP_1,TSCTX_8bit_RExt_SHARP_1.bin,TSCTX_8bit_RExt_SHARP_1.md5,8
-+0,RExt/WAVETILES_RExt_Sony_2,WAVETILES_RExt_Sony_2.bit,WAVETILES_RExt_Sony_2.md5,8
-+1,local/sao_cu16_mobile_344x280,sao_cu16_mobile_344x280.265,sao_cu16_mobile_344x280.md5,8
-+1,local/dblk_cu16_mobile_344x280,dblk_cu16_mobile_344x280.265,dblk_cu16_mobile_344x280.md5,8
-+1,local/dblksao_cu16_mobile_344x280,dblksao_cu16_mobile_344x280.265,dblksao_cu16_mobile_344x280.md5,8
-+1,local/dblk_pu32_horses_832x448,dblk_pu32_horses_832x448.265,dblk_pu32_horses_832x448.md5,8
-+1,local/intra_pred_21_laps,intra_pred_21_laps.265,intra_pred_21_laps.md5,8
---- /dev/null
-+++ b/pi-util/conf_h265.2016_HEVC_v1.csv
-@@ -0,0 +1,147 @@
-+1,AMP_A_Samsung_7,AMP_A_Samsung_7.bin,AMP_A_Samsung_7.md5
-+1,AMP_B_Samsung_7,AMP_B_Samsung_7.bin,AMP_B_Samsung_7.md5
-+1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5
-+1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5
-+1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5
-+1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5
-+1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5
-+1,AMVP_C_Samsung_7,AMVP_C_Samsung_7.bin,AMVP_C_Samsung_7.md5
-+1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5
-+1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5
-+1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5
-+1,CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5
-+1,CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5
-+1,CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5
-+1,CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5
-+1,CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5
-+1,CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5
-+1,CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5
-+1,cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5
-+1,CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5
-+1,CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5
-+1,DBLK_A_MAIN10_VIXS_4,DBLK_A_MAIN10_VIXS_4.bit,DBLK_A_MAIN10_VIXS_4.md5
-+1,DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5
-+1,DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5
-+1,DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5
-+1,DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5
-+1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5
-+1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5
-+1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5
-+1,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5
-+1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5
-+1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5
-+1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5
-+1,DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5
-+1,DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5
-+1,ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5
-+1,ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5
-+1,ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5
-+1,EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5
-+1,FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5
-+1,HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5
-+1,INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5
-+1,INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5
-+1,ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5
-+1,ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5
-+1,ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5
-+1,ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5
-+1,ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5
-+1,IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5
-+1,IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5
-+1,IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5
-+1,LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5
-+1,LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5
-+1,LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5
-+1,MAXBINS_A_TI_5,MAXBINS_A_TI_5.bit,MAXBINS_A_TI_5_yuv.md5
-+1,MAXBINS_B_TI_5,MAXBINS_B_TI_5.bit,MAXBINS_B_TI_5_yuv.md5
-+1,MAXBINS_C_TI_5,MAXBINS_C_TI_5.bit,MAXBINS_C_TI_5_yuv.md5
-+1,MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5
-+1,MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5
-+1,MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5
-+1,MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5
-+1,MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5
-+1,MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5
-+1,MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5
-+1,MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5
-+1,MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5
-+1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5
-+1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5
-+1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5
-+1,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5
-+1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5
-+1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5
-+1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5
-+1,PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5
-+1,PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5
-+1,PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5
-+1,PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5
-+1,PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5
-+1,PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5
-+1,PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5
-+1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5
-+1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5
-+1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5
-+1,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5
-+1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5
-+1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5
-+1,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5
-+1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5
-+1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5
-+1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5
-+1,RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5
-+1,RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5
-+1,RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5
-+1,RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5
-+1,RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5
-+1,RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5
-+1,RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5
-+1,RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5
-+1,RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5
-+1,RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5
-+1,RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5
-+1,RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5
-+1,SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5
-+1,SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5
-+1,SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5
-+1,SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5
-+1,SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5
-+1,SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5
-+1,SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5
-+1,SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5
-+2,SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt
-+2,SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt
-+1,SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5
-+1,SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5
-+1,SLIST_A_Sony_5,SLIST_A_Sony_5.bin,SLIST_A_Sony_5_yuv.md5
-+1,SLIST_B_Sony_9,SLIST_B_Sony_9.bin,SLIST_B_Sony_9_yuv.md5
-+1,SLIST_C_Sony_4,SLIST_C_Sony_4.bin,SLIST_C_Sony_4_yuv.md5
-+1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5
-+1,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5
-+1,STRUCT_A_Samsung_7,STRUCT_A_Samsung_7.bin,STRUCT_A_Samsung_7.md5
-+1,STRUCT_B_Samsung_7,STRUCT_B_Samsung_7.bin,STRUCT_B_Samsung_7.md5
-+1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5
-+1,TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5
-+1,TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5
-+1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5
-+1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5
-+1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5
-+3,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # unequal bit depth
-+1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5
-+1,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5
-+3,VPSSPSPPS_A_MainConcept_1,VPSSPSPPS_A_MainConcept_1.bin,VPSSPSPPS_A_MainConcept_1_md5.txt, # ???
-+1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5
-+1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5
-+1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5
-+1,WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5
-+1,WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5
-+1,WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5
-+1,WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5
-+1,WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5
-+1,WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5
-+1,WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5
-+1,WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5
-+1,WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5
-+1,WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5
-+1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5
-+1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5
-+1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5
---- /dev/null
-+++ b/pi-util/conf_h265.csv
-@@ -0,0 +1,144 @@
-+1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.md5
-+1,AMP_A_Samsung_6,AMP_A_Samsung_6.bin,AMP_A_Samsung_6.md5
-+1,AMP_B_Samsung_6,AMP_B_Samsung_6.bin,AMP_B_Samsung_6.md5
-+1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5
-+1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5
-+1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5
-+1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5
-+1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5
-+1,AMVP_C_Samsung_6,AMVP_C_Samsung_6.bin,AMVP_C_Samsung_6.md5
-+1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5
-+1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5
-+1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5
-+1,CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5
-+1,CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5
-+1,CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5
-+1,CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5
-+1,CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5
-+1,CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5
-+1,CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5
-+1,cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5
-+1,CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5
-+1,CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5
-+1,DBLK_A_MAIN10_VIXS_3,DBLK_A_MAIN10_VIXS_3.bit,DBLK_A_MAIN10_VIXS_3.md5
-+1,DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5
-+1,DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5
-+1,DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5
-+1,DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5
-+1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5
-+1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5
-+1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5
-+1,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5
-+1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5
-+1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5
-+1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5
-+1,DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5
-+1,DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5
-+1,ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5
-+1,ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5
-+1,ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5
-+1,EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5
-+1,FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5
-+1,HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5
-+1,INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5
-+1,INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5
-+1,ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5
-+1,ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5
-+1,ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5
-+1,ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5
-+1,ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5
-+1,IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5
-+1,IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5
-+1,IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5
-+1,LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5
-+1,LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5
-+1,LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5
-+1,MAXBINS_A_TI_4,MAXBINS_A_TI_4.bit,MAXBINS_A_TI_4.md5
-+1,MAXBINS_B_TI_4,MAXBINS_B_TI_4.bit,MAXBINS_B_TI_4.md5
-+1,MAXBINS_C_TI_4,MAXBINS_C_TI_4.bit,MAXBINS_C_TI_4.md5
-+1,MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5
-+1,MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5
-+1,MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5
-+1,MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5
-+1,MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5
-+1,MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5
-+1,MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5
-+1,MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5
-+1,MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5
-+1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5
-+1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5
-+1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5
-+1,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5
-+1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5
-+1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5
-+1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5
-+1,PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5
-+1,PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5
-+1,PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5
-+1,PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5
-+1,PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5
-+1,PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5
-+1,PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5
-+1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5
-+1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5
-+1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5
-+1,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5
-+1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5
-+1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5
-+1,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5
-+1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5
-+1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5
-+1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5
-+1,RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5
-+1,RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5
-+1,RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5
-+1,RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5
-+1,RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5
-+1,RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5
-+1,RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5
-+1,RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5
-+1,RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5
-+1,RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5
-+1,RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5
-+1,RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5
-+1,SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5
-+1,SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5
-+1,SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5
-+1,SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5
-+1,SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5
-+1,SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5
-+1,SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5
-+1,SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5
-+1,SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5
-+1,SLIST_A_Sony_4,str.bin,SLIST_A_Sony_4_yuv.md5
-+1,SLIST_B_Sony_8,str.bin,SLIST_B_Sony_8_yuv.md5
-+1,SLIST_C_Sony_3,str.bin,SLIST_C_Sony_3_yuv.md5
-+1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5
-+1,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5
-+1,STRUCT_A_Samsung_6,STRUCT_A_Samsung_6.bin,STRUCT_A_Samsung_6.md5
-+1,STRUCT_B_Samsung_6,STRUCT_B_Samsung_6.bin,STRUCT_B_Samsung_6.md5
-+1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5
-+1,TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5
-+1,TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5
-+1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5
-+1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5
-+1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5
-+0,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # Y/C bit depth unmatched
-+1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5
-+1,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5
-+1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5
-+1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5
-+1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5
-+1,WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5
-+1,WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5
-+1,WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5
-+1,WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5
-+1,WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5
-+1,WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5
-+1,WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5
-+1,WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5
-+1,WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5
-+1,WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5
-+1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5
-+1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5
-+1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5
---- /dev/null
-+++ b/pi-util/conf_native.sh
-@@ -0,0 +1,106 @@
-+echo "Configure for native build"
-+
-+FFSRC=`pwd`
-+MC=`dpkg --print-architecture`
-+BUILDBASE=$FFSRC/out
-+
-+#RPI_KEEPS="-save-temps=obj"
-+RPI_KEEPS=""
-+
-+NOSHARED=
-+MMAL=
-+
-+while [ "$1" != "" ] ; do
-+    case $1 in
-+	--noshared)
-+	    NOSHARED=1
-+	    ;;
-+	--mmal)
-+	    MMAL=1
-+	    ;;
-+	*)
-+	    echo "Usage $0: [--noshared] [--mmal]"
-+	    exit 1
-+	    ;;
-+    esac
-+    shift
-+done
-+
-+
-+MCOPTS=
-+RPI_INCLUDES=
-+RPI_LIBDIRS=
-+RPI_DEFINES=
-+RPI_EXTRALIBS=
-+
-+if [ "$MC" == "arm64" ]; then
-+  echo "M/C aarch64"
-+  A=aarch64-linux-gnu
-+  B=arm64
-+elif [ "$MC" == "armhf" ]; then
-+  echo "M/C armv7"
-+  A=arm-linux-gnueabihf
-+  B=armv7
-+  MCOPTS="--arch=armv6t2 --cpu=cortex-a7"
-+  RPI_DEFINES=-mfpu=neon-vfpv4
-+else
-+  echo Unexpected architecture $MC
-+  exit 1
-+fi
-+
-+if [ $MMAL ]; then
-+  RPI_OPT_VC=/opt/vc
-+  RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux"
-+  RPI_LIBDIRS="-L$RPI_OPT_VC/lib"
-+  RPI_DEFINES="$RPI_DEFINES -D__VCCOREVER__=0x4000000"
-+  RPI_EXTRALIBS="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm -Wl,--end-group"
-+  RPIOPTS="--enable-mmal --enable-rpi"
-+else
-+  RPIOPTS="--disable-mmal --enable-sand"
-+fi
-+
-+C=`lsb_release -sc`
-+V=`cat RELEASE`
-+
-+SHARED_LIBS="--enable-shared"
-+if [ $NOSHARED ]; then
-+  SHARED_LIBS="--disable-shared"
-+  OUT=$BUILDBASE/$B-$C-$V-static-rel
-+  echo Static libs
-+else
-+  echo Shared libs
-+  OUT=$BUILDBASE/$B-$C-$V-shared-rel
-+fi
-+
-+USR_PREFIX=$OUT/install
-+LIB_PREFIX=$USR_PREFIX/lib/$A
-+INC_PREFIX=$USR_PREFIX/include/$A
-+
-+echo Destination directory: $OUT
-+mkdir -p $OUT
-+# Nothing under here need worry git - including this .gitignore!
-+echo "**" > $BUILDBASE/.gitignore
-+cd $OUT
-+
-+$FFSRC/configure \
-+ --prefix=$USR_PREFIX\
-+ --libdir=$LIB_PREFIX\
-+ --incdir=$INC_PREFIX\
-+ $MCOPTS\
-+ --disable-stripping\
-+ --disable-thumb\
-+ --enable-v4l2-request\
-+ --enable-libdrm\
-+ --enable-vout-egl\
-+ --enable-vout-drm\
-+ $SHARED_LIBS\
-+ $RPIOPTS\
-+ --extra-cflags="-ggdb $RPI_KEEPS $RPI_DEFINES $RPI_INCLUDES"\
-+ --extra-cxxflags="$RPI_DEFINES $RPI_INCLUDES"\
-+ --extra-ldflags="$RPI_LIBDIRS"\
-+ --extra-libs="$RPI_EXTRALIBS"\
-+ --extra-version="rpi"
-+
-+
-+# gcc option for getting asm listing
-+# -Wa,-ahls
---- /dev/null
-+++ b/pi-util/ffconf.py
-@@ -0,0 +1,215 @@
-+#!/usr/bin/env python3
-+
-+import string
-+import os
-+import subprocess
-+import re
-+import argparse
-+import sys
-+import csv
-+from stat import *
-+
-+CODEC_HEVC_RPI  = 1
-+HWACCEL_RPI     = 2
-+HWACCEL_DRM     = 3
-+HWACCEL_VAAPI   = 4
-+
-+def testone(fileroot, srcname, es_file, md5_file, pix, dectype, vcodec, ffmpeg_exec):
-+    hwaccel = ""
-+    if dectype == HWACCEL_RPI:
-+        hwaccel = "rpi"
-+    elif dectype == HWACCEL_DRM:
-+        hwaccel = "drm"
-+    elif dectype == HWACCEL_VAAPI:
-+        hwaccel = "vaapi"
-+
-+    pix_fmt = []
-+    if pix == "8":
-+        pix_fmt = ["-pix_fmt", "yuv420p"]
-+    elif pix == "10":
-+        pix_fmt = ["-pix_fmt", "yuv420p10le"]
-+    elif pix == "12":
-+        pix_fmt = ["-pix_fmt", "yuv420p12le"]
-+
-+    tmp_root = "/tmp"
-+
-+    names = srcname.split('/')
-+    while len(names) > 1:
-+        tmp_root = os.path.join(tmp_root, names[0])
-+        del names[0]
-+    name = names[0]
-+
-+    if not os.path.exists(tmp_root):
-+        os.makedirs(tmp_root)
-+
-+    dec_file = os.path.join(tmp_root, name + ".dec.md5")
-+    try:
-+        os.remove(dec_file)
-+    except:
-+        pass
-+
-+    flog = open(os.path.join(tmp_root, name + ".log"), "wt")
-+
-+    ffargs = [ffmpeg_exec, "-flags", "unaligned", "-hwaccel", hwaccel, "-vcodec", "hevc", "-i", os.path.join(fileroot, es_file)] + pix_fmt + ["-f", "md5", dec_file]
-+
-+    # Unaligned needed for cropping conformance
-+    if hwaccel:
-+        rstr = subprocess.call(ffargs, stdout=flog, stderr=subprocess.STDOUT)
-+    else:
-+        rstr = subprocess.call(
-+            [ffmpeg_exec, "-flags", "unaligned", "-vcodec", vcodec, "-i", os.path.join(fileroot, es_file), "-f", "md5", dec_file],
-+            stdout=flog, stderr=subprocess.STDOUT)
-+
-+    try:
-+        m1 = None
-+        m2 = None
-+        with open(os.path.join(fileroot, md5_file)) as f:
-+            for line in f:
-+                m1 = re.search("[0-9a-f]{32}", line.lower())
-+                if m1:
-+                    break
-+
-+        with open(dec_file) as f:
-+            m2 = re.search("[0-9a-f]{32}", f.readline())
-+    except:
-+        pass
-+
-+    if  m1 and m2 and m1.group() == m2.group():
-+        print("Match: " + m1.group(), file=flog)
-+        rv = 0
-+    elif not m1:
-+        print("****** Cannot find m1", file=flog)
-+        rv = 3
-+    elif not m2:
-+        print("****** Cannot find m2", file=flog)
-+        rv = 2
-+    else:
-+        print("****** Mismatch: " + m1.group() + " != " + m2.group(), file=flog)
-+        rv = 1
-+    flog.close()
-+    return rv
-+
-+def scandir(root):
-+    aconf = []
-+    ents = os.listdir(root)
-+    ents.sort(key=str.lower)
-+    for name in ents:
-+        test_path = os.path.join(root, name)
-+        if S_ISDIR(os.stat(test_path).st_mode):
-+            files = os.listdir(test_path)
-+            es_file = "?"
-+            md5_file = "?"
-+            for f in files:
-+                (base, ext) = os.path.splitext(f)
-+                if base[0] == '.':
-+                    pass
-+                elif ext == ".bit" or ext == ".bin":
-+                    es_file = f
-+                elif ext == ".md5" or (ext == ".txt" and (base[-4:] == "_md5" or base[-6:] == "md5sum")):
-+                    if md5_file == "?":
-+                        md5_file = f
-+                    elif base[-3:] == "yuv":
-+                        md5_file = f
-+            aconf.append((1, name, es_file, md5_file))
-+    return aconf
-+
-+def runtest(name, tests):
-+    if not tests:
-+        return True
-+    for t in tests:
-+        if name[0:len(t)] == t or name.find("/" + t) != -1:
-+            return True
-+    return False
-+
-+def doconf(csva, tests, test_root, vcodec, dectype, ffmpeg_exec):
-+    unx_failures = []
-+    unx_success = []
-+    failures = 0
-+    successes = 0
-+    for a in csva:
-+        exp_test = int(a[0])
-+        if (exp_test and runtest(a[1], tests)):
-+            name = a[1]
-+            print ("==== ", name, end="")
-+            sys.stdout.flush()
-+
-+            rv = testone(os.path.join(test_root, name), name, a[2], a[3], a[4], dectype=dectype, vcodec=vcodec, ffmpeg_exec=ffmpeg_exec)
-+            if (rv == 0):
-+                successes += 1
-+            else:
-+                failures += 1
-+
-+            if (rv == 0):
-+                if exp_test == 2:
-+                    print(": * OK *")
-+                    unx_success.append(name)
-+                else:
-+                    print(": ok")
-+            elif exp_test == 2 and rv == 1:
-+                print(": fail")
-+            elif exp_test == 3 and rv == 2:
-+                # Call an expected "crash" an abort
-+                print(": abort")
-+            else:
-+                unx_failures.append(name)
-+                if rv == 1:
-+                    print(": * FAIL *")
-+                elif (rv == 2) :
-+                    print(": * CRASH *")
-+                elif (rv == 3) :
-+                    print(": * MD5 MISSING *")
-+                else :
-+                    print(": * BANG *")
-+
-+    if unx_failures or unx_success:
-+        print("Unexpected Failures:", unx_failures)
-+        print("Unexpected Success: ", unx_success)
-+    else:
-+        print("All tests normal:", successes, "ok,", failures, "failed")
-+
-+
-+class ConfCSVDialect(csv.Dialect):
-+    delimiter = ','
-+    doublequote = True
-+    lineterminator = '\n'
-+    quotechar='"'
-+    quoting = csv.QUOTE_MINIMAL
-+    skipinitialspace = True
-+    strict = True
-+
-+if __name__ == '__main__':
-+
-+    argp = argparse.ArgumentParser(description="FFmpeg h265 conformance tester")
-+    argp.add_argument("tests", nargs='*')
-+    argp.add_argument("--pi4", action='store_true', help="Force pi4 cmd line")
-+    argp.add_argument("--drm", action='store_true', help="Force v4l2 drm cmd line")
-+    argp.add_argument("--vaapi", action='store_true', help="Force vaapi cmd line")
-+    argp.add_argument("--test_root", default="/opt/conform/h265.2016", help="Root dir for test")
-+    argp.add_argument("--csvgen", action='store_true', help="Generate CSV file for dir")
-+    argp.add_argument("--csv", default="pi-util/conf_h265.2016.csv", help="CSV filename")
-+    argp.add_argument("--vcodec", default="hevc_rpi", help="vcodec name to use")
-+    argp.add_argument("--ffmpeg", default="./ffmpeg", help="ffmpeg exec name")
-+    args = argp.parse_args()
-+
-+    if args.csvgen:
-+        csv.writer(sys.stdout).writerows(scandir(args.test_root))
-+        exit(0)
-+
-+    with open(args.csv, 'rt') as csvfile:
-+        csva = [a for a in csv.reader(csvfile, ConfCSVDialect())]
-+
-+    dectype = CODEC_HEVC_RPI
-+    if os.path.exists("/dev/rpivid-hevcmem"):
-+        dectype = HWACCEL_RPI
-+    if args.drm or os.path.exists("/sys/module/rpivid_hevc"):
-+        dectype = HWACCEL_DRM
-+
-+    if args.pi4:
-+        dectype = HWACCEL_RPI
-+    elif args.drm:
-+        dectype = HWACCEL_DRM
-+    elif args.vaapi:
-+        dectype = HWACCEL_VAAPI
-+
-+    doconf(csva, args.tests, args.test_root, args.vcodec, dectype, args.ffmpeg)
-+
---- /dev/null
-+++ b/pi-util/ffperf.py
-@@ -0,0 +1,128 @@
-+#!/usr/bin/env python3
-+
-+import time
-+import string
-+import os
-+import tempfile
-+import subprocess
-+import re
-+import argparse
-+import sys
-+import csv
-+from stat import *
-+
-+class tstats:
-+    close_threshold = 0.01
-+
-+    def __init__(self, stats_dict=None):
-+        if stats_dict != None:
-+            self.name = stats_dict["name"]
-+            self.elapsed = float(stats_dict["elapsed"])
-+            self.user = float(stats_dict["user"])
-+            self.sys = float(stats_dict["sys"])
-+
-+    def times_str(self):
-+        ctime = self.sys + self.user
-+        return "time=%6.2f, cpu=%6.2f (%4.2f%%)" % (self.elapsed, ctime, (ctime * 100.0) / self.elapsed)
-+
-+    def dict(self):
-+        return {"name":self.name, "elapsed":self.elapsed, "user":self.user, "sys":self.sys}
-+
-+    def is_close(self, other):
-+        return abs(self.elapsed - other.elapsed) / self.elapsed < self.close_threshold
-+
-+    def __lt__(self, other):
-+        return self.elapsed < other.elapsed
-+    def __gt__(self, other):
-+        return self.elapsed > other.elapsed
-+
-+    def time_file(name, prefix, ffmpeg="./ffmpeg"):
-+        stats = tstats()
-+        stats.name = name
-+        start_time = time.clock_gettime(time.CLOCK_MONOTONIC);
-+        cproc = subprocess.Popen([ffmpeg, "-no_cvt_hw",
-+                                  "-vcodec", "hevc_rpi",
-+                                  "-t", "30", "-i", prefix + name,
-+                                  "-f", "vout_rpi", os.devnull], bufsize=-1, stdout=flog, stderr=flog);
-+        pinfo = os.wait4(cproc.pid, 0)
-+        end_time = time.clock_gettime(time.CLOCK_MONOTONIC);
-+        stats.elapsed = end_time - start_time
-+        stats.user = pinfo[2].ru_utime
-+        stats.sys = pinfo[2].ru_stime
-+        return stats
-+
-+
-+def common_prefix(s1, s2):
-+    for i in range(min(len(s1),len(s2))):
-+        if s1[i] != s2[i]:
-+            return s1[:i]
-+    return s1[:i+1]
-+
-+def main():
-+    global flog
-+
-+    argp = argparse.ArgumentParser(description="FFmpeg performance tester", epilog="""
-+To blank the screen before starting use "xdg-screensaver activate"
-+(For some reason this doesn't seem to work from within python).
-+""")
-+
-+    argp.add_argument("streams", nargs='*')
-+    argp.add_argument("--csv_out", default="ffperf_out.csv", help="CSV output filename")
-+    argp.add_argument("--csv_in", help="CSV input filename")
-+    argp.add_argument("--prefix", help="Filename prefix (include terminal '/' if a directory).")
-+    argp.add_argument("--repeat", default=3, type=int, help="Run repeat count")
-+    argp.add_argument("--ffmpeg", default="./ffmpeg", help="FFmpeg executable")
-+
-+    args = argp.parse_args()
-+
-+    csv_out = csv.DictWriter(open(args.csv_out, 'w', newline=''), ["name", "elapsed", "user", "sys"])
-+    csv_out.writeheader()
-+
-+    stats_in = {}
-+    if args.csv_in != None:
-+        with open(args.csv_in, 'r', newline='') as f_in:
-+            stats_in = {x["name"]:tstats(x) for x in csv.DictReader(f_in)}
-+
-+    flog = open(os.path.join(tempfile.gettempdir(), "ffperf.log"), "wt")
-+
-+    streams = args.streams
-+    if not streams:
-+        if not stats_in:
-+            print ("No source streams specified")
-+            return 1
-+        prefix = "" if args.prefix == None else args.prefix
-+        streams = [k for k in stats_in]
-+    elif args.prefix != None:
-+        prefix = args.prefix
-+    else:
-+        prefix = streams[0]
-+        for f in streams[1:]:
-+            prefix = common_prefix(prefix, f)
-+        pp = prefix.rpartition(os.sep)
-+        prefix = pp[0] + pp[1]
-+        streams = [s[len(prefix):] for s in streams]
-+
-+    for f in sorted(streams, key=lambda x : "~" * x.count(os.sep) + x.lower()):
-+        print ("====", f)
-+
-+        t0 = tstats({"name":f, "elapsed":999, "user":999, "sys":999})
-+        for i in range(args.repeat):
-+            t = tstats.time_file(f, prefix, args.ffmpeg)
-+            print ("...", t.times_str())
-+            if t0 > t:
-+                t0 = t
-+
-+        if t0.name in stats_in:
-+            pstat = stats_in[t0.name]
-+            print("---" if pstat.is_close(t0) else "<<<" if t0 < pstat else ">>>", pstat.times_str())
-+
-+        csv_out.writerow(t0.dict())
-+
-+        print ()
-+
-+    return 0
-+
-+
-+if __name__ == '__main__':
-+    exit(main())
-+
---- /dev/null
-+++ b/pi-util/genpatch.sh
-@@ -0,0 +1,35 @@
-+set -e
-+
-+NOPATCH=
-+if [ "$1" == "--notag" ]; then
-+  shift
-+  NOPATCH=1
-+fi
-+
-+if [ "$1" == "" ]; then
-+  echo Usage: $0 [--notag] \<patch_tag\>
-+  echo e.g.: $0 mmal_4
-+  exit 1
-+fi
-+
-+VERSION=`cat RELEASE`
-+if [ "$VERSION" == "" ]; then
-+  echo Can\'t find version RELEASE
-+  exit 1
-+fi
-+
-+PATCHFILE=../ffmpeg-$VERSION-$1.patch
-+
-+if [ $NOPATCH ]; then
-+  echo Not tagged
-+else
-+  # Only continue if we are all comitted
-+  git diff --name-status --exit-code
-+
-+  PATCHTAG=pi/$VERSION/$1
-+  echo Tagging: $PATCHTAG
-+
-+  git tag $PATCHTAG
-+fi
-+echo Generating patch: $PATCHFILE
-+git diff n$VERSION -- > $PATCHFILE
---- /dev/null
-+++ b/pi-util/make_array.py
-@@ -0,0 +1,23 @@
-+#!/usr/bin/env python
-+
-+# Usage
-+#   make_array file.bin
-+#   Produces file.h with array of bytes.
-+#
-+import sys
-+for file in sys.argv[1:]:
-+  prefix,suffix = file.split('.')
-+  assert suffix=='bin'
-+  name=prefix.split('/')[-1]
-+  print 'Converting',file
-+  with open(prefix+'.h','wb') as out:
-+    print >>out, 'static const unsigned char',name,'[] = {'
-+    with open(file,'rb') as fd:
-+      i = 0
-+      for byte in fd.read():
-+        print >>out, '0x%02x, ' % ord(byte),
-+        i = i + 1
-+        if i % 8 == 0:
-+          print >>out, ' // %04x' % (i - 8)
-+    print >>out,'};'
-+
---- /dev/null
-+++ b/pi-util/mkinst.sh
-@@ -0,0 +1,5 @@
-+set -e
-+
-+make install
-+
-+cp -r install/* ../vlc/sysroot/raspian_stretch_pi1-sysroot/usr
---- /dev/null
-+++ b/pi-util/patkodi.sh
-@@ -0,0 +1,9 @@
-+set -e
-+KODIBASE=/home/jc/rpi/kodi/xbmc
-+JOBS=-j20
-+make $JOBS
-+git diff xbmc/release/4.3-kodi > $KODIBASE/tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch
-+make -C $KODIBASE/tools/depends/target/ffmpeg $JOBS
-+make -C $KODIBASE/build install
-+
-+
---- /dev/null
-+++ b/pi-util/perfcmp.py
-@@ -0,0 +1,101 @@
-+#!/usr/bin/env python3
-+
-+import time
-+import string
-+import os
-+import tempfile
-+import subprocess
-+import re
-+import argparse
-+import sys
-+import csv
-+from stat import *
-+
-+class tstats:
-+    close_threshold = 0.01
-+
-+    def __init__(self, stats_dict=None):
-+        if stats_dict != None:
-+            self.name = stats_dict["name"]
-+            self.elapsed = float(stats_dict["elapsed"])
-+            self.user = float(stats_dict["user"])
-+            self.sys = float(stats_dict["sys"])
-+
-+    def times_str(self):
-+        ctime = self.sys + self.user
-+        return "time=%6.2f, cpu=%6.2f (%4.2f%%)" % (self.elapsed, ctime, (ctime * 100.0) / self.elapsed)
-+
-+    def dict(self):
-+        return {"name":self.name, "elapsed":self.elapsed, "user":self.user, "sys":self.sys}
-+
-+    def is_close(self, other):
-+        return abs(self.elapsed - other.elapsed) / self.elapsed < self.close_threshold
-+
-+    def __lt__(self, other):
-+        return self.elapsed < other.elapsed
-+    def __gt__(self, other):
-+        return self.elapsed > other.elapsed
-+
-+    def time_file(name, prefix):
-+        stats = tstats()
-+        stats.name = name
-+        start_time = time.clock_gettime(time.CLOCK_MONOTONIC);
-+        cproc = subprocess.Popen(["./ffmpeg", "-t", "30", "-i", prefix + name,
-+                                  "-f", "null", os.devnull], bufsize=-1, stdout=flog, stderr=flog);
-+        pinfo = os.wait4(cproc.pid, 0)
-+        end_time = time.clock_gettime(time.CLOCK_MONOTONIC);
-+        stats.elapsed = end_time - start_time
-+        stats.user = pinfo[2].ru_utime
-+        stats.sys = pinfo[2].ru_stime
-+        return stats
-+
-+
-+def common_prefix(s1, s2):
-+    for i in range(min(len(s1),len(s2))):
-+        if s1[i] != s2[i]:
-+            return s1[:i]
-+    return s1[:i+1]
-+
-+def main():
-+    argp = argparse.ArgumentParser(description="FFmpeg performance compare")
-+
-+    argp.add_argument("stream0", help="CSV to compare")
-+    argp.add_argument("stream1", nargs='?', default="ffperf_out.csv", help="CSV to compare")
-+
-+    args = argp.parse_args()
-+
-+    with open(args.stream0, 'r', newline='') as f_in:
-+        stats0 = {x["name"]:tstats(x) for x in csv.DictReader(f_in)}
-+    with open(args.stream1, 'r', newline='') as f_in:
-+        stats1 = {x["name"]:tstats(x) for x in csv.DictReader(f_in)}
-+
-+    print (args.stream0, "<<-->>", args.stream1)
-+    print ()
-+
-+    for f in sorted(stats0.keys() | stats1.keys(), key=lambda x : "~" * x.count(os.sep) + x.lower()):
-+       if not (f in stats0) :
-+           print ("           XX               :", f)
-+           continue
-+       if not (f in stats1) :
-+           print ("       XX                   :", f)
-+           continue
-+
-+       s0 = stats0[f]
-+       s1 = stats1[f]
-+
-+       pcent = ((s0.elapsed - s1.elapsed) / s0.elapsed) * 100.0
-+       thresh = 0.3
-+       tc = 6
-+
-+       nchar = min(tc - 1, int(abs(pcent) / thresh))
-+       cc = "  --  " if nchar == 0 else "<" * nchar + " " * (tc - nchar) if pcent < 0 else " " * (tc - nchar) + ">" * nchar
-+
-+       print ("%6.2f %s%6.2f (%+5.2f) : %s" %
-+           (s0.elapsed, cc, s1.elapsed, pcent, f))
-+
-+    return 0
-+
-+
-+if __name__ == '__main__':
-+    exit(main())
-+
---- /dev/null
-+++ b/pi-util/qem.sh
-@@ -0,0 +1,9 @@
-+TARGET_DIR=../src/eupton_vc4dev_2012a/software/vc4/DEV/applications/tutorials/user_shader_example_tex
-+QASM=python\ ../local/bin/qasm.py
-+SRC_FILE=libavcodec/rpi_hevc_shader.qasm
-+DST_BASE=shader
-+
-+cp libavcodec/rpi_hevc_shader_cmd.h $TARGET_DIR
-+$QASM -mc_c:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.c
-+$QASM -mc_h:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.h
-+
---- /dev/null
-+++ b/pi-util/v3dusage.py
-@@ -0,0 +1,128 @@
-+#!/usr/bin/env python
-+
-+import sys
-+import argparse
-+import re
-+
-+def do_logparse(logname):
-+
-+    rmatch = re.compile(r'^([0-9]+\.[0-9]{3}): (done )?((vpu0)|(vpu1)|(qpu1)) ([A-Z_]+) cb:([0-9a-f]+) ')
-+    rqcycle = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: QPU Total clock cycles for all QPUs doing vertex/coordinate shading +([0-9]+)$')
-+    rqtscycle = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: QPU Total clock cycles for all QPUs stalled waiting for TMUs +([0-9]+)$')
-+    rl2hits = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: L2C Total Level 2 cache ([a-z]+) +([0-9]+)$')
-+
-+    ttotal = {'idle':0.0}
-+    tstart = {}
-+    qctotal = {}
-+    qtstotal = {}
-+    l2hits = {}
-+    l2total = {}
-+    time0 = None
-+    idle_start = None
-+    qpu_op_no = 0
-+    op_count = 0
-+
-+    with open(logname, "rt") as infile:
-+        for line in infile:
-+            match = rmatch.match(line)
-+            if match:
-+#                print match.group(1), ":", match.group(2), ":", match.group(3), ":", match.group(7), ":"
-+                time = float(match.group(1))
-+                unit = match.group(3)
-+                opstart = not match.group(2)
-+                optype = match.group(7)
-+                hascb = match.group(8) != "0"
-+
-+                if unit == 'qpu1':
-+                    unit = unit + "." + str(qpu_op_no)
-+                    if not opstart:
-+                        if hascb or optype == 'EXECUTE_SYNC':
-+                            qpu_op_no = 0
-+                        else:
-+                            qpu_op_no += 1
-+
-+                # Ignore sync type
-+                if optype == 'EXECUTE_SYNC':
-+                    continue
-+
-+                if not time0:
-+                    time0 = time
-+
-+                if opstart:
-+                    tstart[unit] = time;
-+                elif unit in tstart:
-+                    op_count += 1
-+                    if not unit in ttotal:
-+                        ttotal[unit] = 0.0
-+                    ttotal[unit] += time - tstart[unit]
-+                    del tstart[unit]
-+
-+                if not idle_start and not tstart:
-+                    idle_start = time
-+                elif idle_start and tstart:
-+                    ttotal['idle'] += time - idle_start
-+                    idle_start = None
-+
-+            match = rqcycle.match(line)
-+            if match:
-+                unit = "qpu1." + str(qpu_op_no)
-+                if not unit in qctotal:
-+                    qctotal[unit] = 0
-+                qctotal[unit] += int(match.group(2))
-+
-+            match = rqtscycle.match(line)
-+            if match:
-+                unit = "qpu1." + str(qpu_op_no)
-+                if not unit in qtstotal:
-+                    qtstotal[unit] = 0
-+                qtstotal[unit] += int(match.group(2))
-+
-+            match = rl2hits.match(line)
-+            if match:
-+                unit = "qpu1." + str(qpu_op_no)
-+                if not unit in l2total:
-+                    l2total[unit] = 0
-+                    l2hits[unit] = 0
-+                l2total[unit] += int(match.group(3))
-+                if match.group(2) == "hits":
-+                    l2hits[unit] += int(match.group(3))
-+
-+
-+    if not time0:
-+        print "No v3d profile records found"
-+    else:
-+        tlogged = time - time0
-+
-+        print "Logged time:", tlogged, "  Op count:", op_count
-+        for unit in sorted(ttotal):
-+            print b'%6s: %10.3f    %7.3f%%' % (unit, ttotal[unit], ttotal[unit] * 100.0 / tlogged)
-+        print
-+        for unit in sorted(qctotal):
-+            if not unit in qtstotal:
-+                qtstotal[unit] = 0;
-+            print b'%6s: Qcycles: %10d, TMU stall: %10d (%7.3f%%)' % (unit, qctotal[unit], qtstotal[unit], (qtstotal[unit] * 100.0)/qctotal[unit])
-+            if unit in l2total:
-+                print b'        L2Total: %10d, hits:      %10d (%7.3f%%)' % (l2total[unit], l2hits[unit], (l2hits[unit] * 100.0)/l2total[unit])
-+
-+
-+
-+if __name__ == '__main__':
-+    argp = argparse.ArgumentParser(
-+        formatter_class=argparse.RawDescriptionHelpFormatter,
-+        description="QPU/VPU perf summary from VC logging",
-+        epilog = """
-+Will also summarise TMU stalls if logging requests set in qpu noflush param
-+in the profiled code.
-+
-+Example use:
-+  vcgencmd set_logging level=0xc0
-+  <command to profile>
-+  sudo vcdbg log msg >& t.log
-+  v3dusage.py t.log
-+""")
-+
-+    argp.add_argument("logfile")
-+    args = argp.parse_args()
-+
-+    do_logparse(args.logfile)
-+
---- a/tests/checkasm/Makefile
-+++ b/tests/checkasm/Makefile
-@@ -9,8 +9,10 @@ AVCODECOBJS-$(CONFIG_G722DSP)
- AVCODECOBJS-$(CONFIG_H264DSP)           += h264dsp.o
- AVCODECOBJS-$(CONFIG_H264PRED)          += h264pred.o
- AVCODECOBJS-$(CONFIG_H264QPEL)          += h264qpel.o
-+AVCODECOBJS-$(CONFIG_IDCTDSP)           += idctdsp.o
- AVCODECOBJS-$(CONFIG_LLVIDDSP)          += llviddsp.o
- AVCODECOBJS-$(CONFIG_LLVIDENCDSP)       += llviddspenc.o
-+AVCODECOBJS-$(CONFIG_VC1DSP)            += vc1dsp.o
- AVCODECOBJS-$(CONFIG_VP8DSP)            += vp8dsp.o
- AVCODECOBJS-$(CONFIG_VIDEODSP)          += videodsp.o
- 
---- a/tests/checkasm/checkasm.c
-+++ b/tests/checkasm/checkasm.c
-@@ -121,6 +121,9 @@ static const struct {
-     #if CONFIG_HUFFYUV_DECODER
-         { "huffyuvdsp", checkasm_check_huffyuvdsp },
-     #endif
-+    #if CONFIG_IDCTDSP
-+        { "idctdsp", checkasm_check_idctdsp },
-+    #endif
-     #if CONFIG_JPEG2000_DECODER
-         { "jpeg2000dsp", checkasm_check_jpeg2000dsp },
-     #endif
-@@ -145,6 +148,9 @@ static const struct {
-     #if CONFIG_V210_ENCODER
-         { "v210enc", checkasm_check_v210enc },
-     #endif
-+    #if CONFIG_VC1DSP
-+        { "vc1dsp", checkasm_check_vc1dsp },
-+    #endif
-     #if CONFIG_VP8DSP
-         { "vp8dsp", checkasm_check_vp8dsp },
-     #endif
---- a/tests/checkasm/checkasm.h
-+++ b/tests/checkasm/checkasm.h
-@@ -60,6 +60,7 @@ void checkasm_check_hevc_add_res(void);
- void checkasm_check_hevc_idct(void);
- void checkasm_check_hevc_sao(void);
- void checkasm_check_huffyuvdsp(void);
-+void checkasm_check_idctdsp(void);
- void checkasm_check_jpeg2000dsp(void);
- void checkasm_check_llviddsp(void);
- void checkasm_check_llviddspenc(void);
-@@ -73,6 +74,7 @@ void checkasm_check_sw_scale(void);
- void checkasm_check_utvideodsp(void);
- void checkasm_check_v210dec(void);
- void checkasm_check_v210enc(void);
-+void checkasm_check_vc1dsp(void);
- void checkasm_check_vf_eq(void);
- void checkasm_check_vf_gblur(void);
- void checkasm_check_vf_hflip(void);
---- /dev/null
-+++ b/tests/checkasm/idctdsp.c
-@@ -0,0 +1,98 @@
-+/*
-+ * Copyright (c) 2022 Ben Avison
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or modify
-+ * it under the terms of the GNU General Public License as published by
-+ * the Free Software Foundation; either version 2 of the License, or
-+ * (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-+ * GNU General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU General Public License along
-+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
-+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-+ */
-+
-+#include <string.h>
-+
-+#include "checkasm.h"
-+
-+#include "libavcodec/idctdsp.h"
-+
-+#include "libavutil/common.h"
-+#include "libavutil/internal.h"
-+#include "libavutil/intreadwrite.h"
-+#include "libavutil/mem_internal.h"
-+
-+#define IDCTDSP_TEST(func) { #func, offsetof(IDCTDSPContext, func) },
-+
-+typedef struct {
-+    const char *name;
-+    size_t offset;
-+} test;
-+
-+#define RANDOMIZE_BUFFER16(name, size)          \
-+    do {                                        \
-+        int i;                                  \
-+        for (i = 0; i < size; ++i) {            \
-+            uint16_t r = rnd() % 0x201 - 0x100; \
-+            AV_WN16A(name##0 + i, r);           \
-+            AV_WN16A(name##1 + i, r);           \
-+        }                                       \
-+    } while (0)
-+
-+#define RANDOMIZE_BUFFER8(name, size)         \
-+    do {                                      \
-+        int i;                                \
-+        for (i = 0; i < size; ++i) {          \
-+            uint8_t r = rnd();                \
-+            name##0[i] = r;                   \
-+            name##1[i] = r;                   \
-+        }                                     \
-+    } while (0)
-+
-+static void check_add_put_clamped(void)
-+{
-+    /* Source buffers are only as big as needed, since any over-read won't affect results */
-+    LOCAL_ALIGNED_16(int16_t, src0, [64]);
-+    LOCAL_ALIGNED_16(int16_t, src1, [64]);
-+    /* Destination buffers have borders of one row above/below and 8 columns left/right to catch overflows */
-+    LOCAL_ALIGNED_8(uint8_t, dst0, [10 * 24]);
-+    LOCAL_ALIGNED_8(uint8_t, dst1, [10 * 24]);
-+
-+    AVCodecContext avctx = { 0 };
-+    IDCTDSPContext h;
-+
-+    const test tests[] = {
-+        IDCTDSP_TEST(add_pixels_clamped)
-+        IDCTDSP_TEST(put_pixels_clamped)
-+        IDCTDSP_TEST(put_signed_pixels_clamped)
-+    };
-+
-+    ff_idctdsp_init(&h, &avctx);
-+
-+    for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) {
-+        void (*func)(const int16_t *, uint8_t * ptrdiff_t) = *(void **)((intptr_t) &h + tests[t].offset);
-+        if (check_func(func, "idctdsp.%s", tests[t].name)) {
-+            declare_func_emms(AV_CPU_FLAG_MMX, void, const int16_t *, uint8_t *, ptrdiff_t);
-+            RANDOMIZE_BUFFER16(src, 64);
-+            RANDOMIZE_BUFFER8(dst, 10 * 24);
-+            call_ref(src0, dst0 + 24 + 8, 24);
-+            call_new(src1, dst1 + 24 + 8, 24);
-+            if (memcmp(dst0, dst1, 10 * 24))
-+                fail();
-+            bench_new(src1, dst1 + 24 + 8, 24);
-+        }
-+    }
-+}
-+
-+void checkasm_check_idctdsp(void)
-+{
-+    check_add_put_clamped();
-+    report("idctdsp");
-+}
---- /dev/null
-+++ b/tests/checkasm/vc1dsp.c
-@@ -0,0 +1,452 @@
-+/*
-+ * Copyright (c) 2022 Ben Avison
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or modify
-+ * it under the terms of the GNU General Public License as published by
-+ * the Free Software Foundation; either version 2 of the License, or
-+ * (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-+ * GNU General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU General Public License along
-+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
-+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-+ */
-+
-+#include <string.h>
-+
-+#include "checkasm.h"
-+
-+#include "libavcodec/vc1dsp.h"
-+
-+#include "libavutil/common.h"
-+#include "libavutil/internal.h"
-+#include "libavutil/intreadwrite.h"
-+#include "libavutil/mem_internal.h"
-+
-+#define VC1DSP_TEST(func) { #func, offsetof(VC1DSPContext, func) },
-+#define VC1DSP_SIZED_TEST(func, width, height) { #func, offsetof(VC1DSPContext, func), width, height },
-+
-+typedef struct {
-+    const char *name;
-+    size_t offset;
-+    int width;
-+    int height;
-+} test;
-+
-+typedef struct matrix {
-+    size_t width;
-+    size_t height;
-+    float d[];
-+} matrix;
-+
-+static const matrix T8 = { 8, 8, {
-+        12,  12,  12,  12,  12,  12,  12,  12,
-+        16,  15,   9,   4,  -4,  -9, -15, -16,
-+        16,   6,  -6, -16, -16,  -6,   6,  16,
-+        15,  -4, -16,  -9,   9,  16,   4, -15,
-+        12, -12, -12,  12,  12, -12, -12,  12,
-+         9, -16,   4,  15, -15,  -4,  16,  -9,
-+         6, -16,  16,  -6,  -6,  16, -16,   6,
-+         4,  -9,  15, -16,  16, -15,   9,  -4
-+} };
-+
-+static const matrix T4 = { 4, 4, {
-+        17,  17,  17,  17,
-+        22,  10, -10, -22,
-+        17, -17, -17,  17,
-+        10, -22,  22, -10
-+} };
-+
-+static const matrix T8t = { 8, 8, {
-+        12,  16,  16,  15,  12,   9,   6,   4,
-+        12,  15,   6,  -4, -12, -16, -16,  -9,
-+        12,   9,  -6, -16, -12,   4,  16,  15,
-+        12,   4, -16,  -9,  12,  15,  -6, -16,
-+        12,  -4, -16,   9,  12, -15,  -6,  16,
-+        12,  -9,  -6,  16, -12,  -4,  16, -15,
-+        12, -15,   6,   4, -12,  16, -16,   9,
-+        12, -16,  16, -15,  12,  -9,   6,  -4
-+} };
-+
-+static const matrix T4t = { 4, 4, {
-+        17,  22,  17,  10,
-+        17,  10, -17, -22,
-+        17, -10, -17,  22,
-+        17, -22,  17, -10
-+} };
-+
-+static matrix *new_matrix(size_t width, size_t height)
-+{
-+    matrix *out = av_mallocz(sizeof (matrix) + height * width * sizeof (float));
-+    if (out == NULL) {
-+        fprintf(stderr, "Memory allocation failure\n");
-+        exit(EXIT_FAILURE);
-+    }
-+    out->width = width;
-+    out->height = height;
-+    return out;
-+}
-+
-+static matrix *multiply(const matrix *a, const matrix *b)
-+{
-+    matrix *out;
-+    if (a->width != b->height) {
-+        fprintf(stderr, "Incompatible multiplication\n");
-+        exit(EXIT_FAILURE);
-+    }
-+    out = new_matrix(b->width, a->height);
-+    for (int j = 0; j < out->height; ++j)
-+        for (int i = 0; i < out->width; ++i) {
-+            float sum = 0;
-+            for (int k = 0; k < a->width; ++k)
-+                sum += a->d[j * a->width + k] * b->d[k * b->width + i];
-+            out->d[j * out->width + i] = sum;
-+        }
-+    return out;
-+}
-+
-+static void normalise(matrix *a)
-+{
-+    for (int j = 0; j < a->height; ++j)
-+        for (int i = 0; i < a->width; ++i) {
-+            float *p = a->d + j * a->width + i;
-+            *p *= 64;
-+            if (a->height == 4)
-+                *p /= (const unsigned[]) { 289, 292, 289, 292 } [j];
-+            else
-+                *p /= (const unsigned[]) { 288, 289, 292, 289, 288, 289, 292, 289 } [j];
-+            if (a->width == 4)
-+                *p /= (const unsigned[]) { 289, 292, 289, 292 } [i];
-+            else
-+                *p /= (const unsigned[]) { 288, 289, 292, 289, 288, 289, 292, 289 } [i];
-+        }
-+}
-+
-+static void divide_and_round_nearest(matrix *a, float by)
-+{
-+    for (int j = 0; j < a->height; ++j)
-+        for (int i = 0; i < a->width; ++i) {
-+            float *p = a->d + j * a->width + i;
-+            *p = rintf(*p / by);
-+        }
-+}
-+
-+static void tweak(matrix *a)
-+{
-+    for (int j = 4; j < a->height; ++j)
-+        for (int i = 0; i < a->width; ++i) {
-+            float *p = a->d + j * a->width + i;
-+            *p += 1;
-+        }
-+}
-+
-+/* The VC-1 spec places restrictions on the values permitted at three
-+ * different stages:
-+ * - D: the input coefficients in frequency domain
-+ * - E: the intermediate coefficients, inverse-transformed only horizontally
-+ * - R: the fully inverse-transformed coefficients
-+ *
-+ * To fully cater for the ranges specified requires various intermediate
-+ * values to be held to 17-bit precision; yet these conditions do not appear
-+ * to be utilised in real-world streams. At least some assembly
-+ * implementations have chosen to restrict these values to 16-bit precision,
-+ * to accelerate the decoding of real-world streams at the cost of strict
-+ * adherence to the spec. To avoid our test marking these as failures,
-+ * reduce our random inputs.
-+ */
-+#define ATTENUATION 4
-+
-+static matrix *generate_inverse_quantized_transform_coefficients(size_t width, size_t height)
-+{
-+    matrix *raw, *tmp, *D, *E, *R;
-+    raw = new_matrix(width, height);
-+    for (int i = 0; i < width * height; ++i)
-+        raw->d[i] = (int) (rnd() % (1024/ATTENUATION)) - 512/ATTENUATION;
-+    tmp = multiply(height == 8 ? &T8 : &T4, raw);
-+    D = multiply(tmp, width == 8 ? &T8t : &T4t);
-+    normalise(D);
-+    divide_and_round_nearest(D, 1);
-+    for (int i = 0; i < width * height; ++i) {
-+        if (D->d[i] < -2048/ATTENUATION || D->d[i] > 2048/ATTENUATION-1) {
-+            /* Rare, so simply try again */
-+            av_free(raw);
-+            av_free(tmp);
-+            av_free(D);
-+            return generate_inverse_quantized_transform_coefficients(width, height);
-+        }
-+    }
-+    E = multiply(D, width == 8 ? &T8 : &T4);
-+    divide_and_round_nearest(E, 8);
-+    for (int i = 0; i < width * height; ++i)
-+        if (E->d[i] < -4096/ATTENUATION || E->d[i] > 4096/ATTENUATION-1) {
-+            /* Rare, so simply try again */
-+            av_free(raw);
-+            av_free(tmp);
-+            av_free(D);
-+            av_free(E);
-+            return generate_inverse_quantized_transform_coefficients(width, height);
-+        }
-+    R = multiply(height == 8 ? &T8t : &T4t, E);
-+    tweak(R);
-+    divide_and_round_nearest(R, 128);
-+    for (int i = 0; i < width * height; ++i)
-+        if (R->d[i] < -512/ATTENUATION || R->d[i] > 512/ATTENUATION-1) {
-+            /* Rare, so simply try again */
-+            av_free(raw);
-+            av_free(tmp);
-+            av_free(D);
-+            av_free(E);
-+            av_free(R);
-+            return generate_inverse_quantized_transform_coefficients(width, height);
-+        }
-+    av_free(raw);
-+    av_free(tmp);
-+    av_free(E);
-+    av_free(R);
-+    return D;
-+}
-+
-+#define RANDOMIZE_BUFFER16(name, size)        \
-+    do {                                      \
-+        int i;                                \
-+        for (i = 0; i < size; ++i) {          \
-+            uint16_t r = rnd();               \
-+            AV_WN16A(name##0 + i, r);         \
-+            AV_WN16A(name##1 + i, r);         \
-+        }                                     \
-+    } while (0)
-+
-+#define RANDOMIZE_BUFFER8(name, size)         \
-+    do {                                      \
-+        int i;                                \
-+        for (i = 0; i < size; ++i) {          \
-+            uint8_t r = rnd();                \
-+            name##0[i] = r;                   \
-+            name##1[i] = r;                   \
-+        }                                     \
-+    } while (0)
-+
-+#define RANDOMIZE_BUFFER8_MID_WEIGHTED(name, size)  \
-+    do {                                            \
-+        uint8_t *p##0 = name##0, *p##1 = name##1;   \
-+        int i = (size);                             \
-+        while (i-- > 0) {                           \
-+            int x = 0x80 | (rnd() & 0x7F);          \
-+            x >>= rnd() % 9;                        \
-+            if (rnd() & 1)                          \
-+                x = -x;                             \
-+            *p##1++ = *p##0++ = 0x80 + x;           \
-+        }                                           \
-+    } while (0)
-+
-+static void check_inv_trans_inplace(void)
-+{
-+    /* Inverse transform input coefficients are stored in a 16-bit buffer
-+     * with row stride of 8 coefficients irrespective of transform size.
-+     * vc1_inv_trans_8x8 differs from the others in two ways: coefficients
-+     * are stored in column-major order, and the outputs are written back
-+     * to the input buffer, so we oversize it slightly to catch overruns. */
-+    LOCAL_ALIGNED_16(int16_t, inv_trans_in0, [10 * 8]);
-+    LOCAL_ALIGNED_16(int16_t, inv_trans_in1, [10 * 8]);
-+
-+    VC1DSPContext h;
-+
-+    ff_vc1dsp_init(&h);
-+
-+    if (check_func(h.vc1_inv_trans_8x8, "vc1dsp.vc1_inv_trans_8x8")) {
-+        matrix *coeffs;
-+        declare_func_emms(AV_CPU_FLAG_MMX, void, int16_t *);
-+        RANDOMIZE_BUFFER16(inv_trans_in, 10 * 8);
-+        coeffs = generate_inverse_quantized_transform_coefficients(8, 8);
-+        for (int j = 0; j < 8; ++j)
-+            for (int i = 0; i < 8; ++i) {
-+                int idx = 8 + i * 8 + j;
-+                inv_trans_in1[idx] = inv_trans_in0[idx] = coeffs->d[j * 8 + i];
-+            }
-+        call_ref(inv_trans_in0 + 8);
-+        call_new(inv_trans_in1 + 8);
-+        if (memcmp(inv_trans_in0,  inv_trans_in1,  10 * 8 * sizeof (int16_t)))
-+            fail();
-+        bench_new(inv_trans_in1 + 8);
-+        av_free(coeffs);
-+    }
-+}
-+
-+static void check_inv_trans_adding(void)
-+{
-+    /* Inverse transform input coefficients are stored in a 16-bit buffer
-+     * with row stride of 8 coefficients irrespective of transform size. */
-+    LOCAL_ALIGNED_16(int16_t, inv_trans_in0, [8 * 8]);
-+    LOCAL_ALIGNED_16(int16_t, inv_trans_in1, [8 * 8]);
-+
-+    /* For all but vc1_inv_trans_8x8, the inverse transform is narrowed and
-+     * added with saturation to an array of unsigned 8-bit values. Oversize
-+     * this by 8 samples left and right and one row above and below. */
-+    LOCAL_ALIGNED_8(uint8_t, inv_trans_out0, [10 * 24]);
-+    LOCAL_ALIGNED_8(uint8_t, inv_trans_out1, [10 * 24]);
-+
-+    VC1DSPContext h;
-+
-+    const test tests[] = {
-+        VC1DSP_SIZED_TEST(vc1_inv_trans_8x4, 8, 4)
-+        VC1DSP_SIZED_TEST(vc1_inv_trans_4x8, 4, 8)
-+        VC1DSP_SIZED_TEST(vc1_inv_trans_4x4, 4, 4)
-+        VC1DSP_SIZED_TEST(vc1_inv_trans_8x8_dc, 8, 8)
-+        VC1DSP_SIZED_TEST(vc1_inv_trans_8x4_dc, 8, 4)
-+        VC1DSP_SIZED_TEST(vc1_inv_trans_4x8_dc, 4, 8)
-+        VC1DSP_SIZED_TEST(vc1_inv_trans_4x4_dc, 4, 4)
-+    };
-+
-+    ff_vc1dsp_init(&h);
-+
-+    for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) {
-+        void (*func)(uint8_t *, ptrdiff_t, int16_t *) = *(void **)((intptr_t) &h + tests[t].offset);
-+        if (check_func(func, "vc1dsp.%s", tests[t].name)) {
-+            matrix *coeffs;
-+            declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, ptrdiff_t, int16_t *);
-+            RANDOMIZE_BUFFER16(inv_trans_in, 8 * 8);
-+            RANDOMIZE_BUFFER8(inv_trans_out, 10 * 24);
-+            coeffs = generate_inverse_quantized_transform_coefficients(tests[t].width, tests[t].height);
-+            for (int j = 0; j < tests[t].height; ++j)
-+                for (int i = 0; i < tests[t].width; ++i) {
-+                    int idx = j * 8 + i;
-+                    inv_trans_in1[idx] = inv_trans_in0[idx] = coeffs->d[j * tests[t].width + i];
-+                }
-+            call_ref(inv_trans_out0 + 24 + 8, 24, inv_trans_in0);
-+            call_new(inv_trans_out1 + 24 + 8, 24, inv_trans_in1);
-+            if (memcmp(inv_trans_out0, inv_trans_out1, 10 * 24))
-+                fail();
-+            bench_new(inv_trans_out1 + 24 + 8, 24, inv_trans_in1 + 8);
-+            av_free(coeffs);
-+        }
-+    }
-+}
-+
-+static void check_loop_filter(void)
-+{
-+    /* Deblocking filter buffers are big enough to hold a 16x16 block,
-+     * plus 16 columns left and 4 rows above to hold filter inputs
-+     * (depending on whether v or h neighbouring block edge, oversized
-+     * horizontally to maintain 16-byte alignment) plus 16 columns and
-+     * 4 rows below to catch write overflows */
-+    LOCAL_ALIGNED_16(uint8_t, filter_buf0, [24 * 48]);
-+    LOCAL_ALIGNED_16(uint8_t, filter_buf1, [24 * 48]);
-+
-+    VC1DSPContext h;
-+
-+    const test tests[] = {
-+        VC1DSP_TEST(vc1_v_loop_filter4)
-+        VC1DSP_TEST(vc1_h_loop_filter4)
-+        VC1DSP_TEST(vc1_v_loop_filter8)
-+        VC1DSP_TEST(vc1_h_loop_filter8)
-+        VC1DSP_TEST(vc1_v_loop_filter16)
-+        VC1DSP_TEST(vc1_h_loop_filter16)
-+    };
-+
-+    ff_vc1dsp_init(&h);
-+
-+    for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) {
-+        void (*func)(uint8_t *, ptrdiff_t, int) = *(void **)((intptr_t) &h + tests[t].offset);
-+        declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, ptrdiff_t, int);
-+        if (check_func(func, "vc1dsp.%s", tests[t].name)) {
-+            for (int count = 1000; count > 0; --count) {
-+                int pq = rnd() % 31 + 1;
-+                RANDOMIZE_BUFFER8_MID_WEIGHTED(filter_buf, 24 * 48);
-+                call_ref(filter_buf0 + 4 * 48 + 16, 48, pq);
-+                call_new(filter_buf1 + 4 * 48 + 16, 48, pq);
-+                if (memcmp(filter_buf0, filter_buf1, 24 * 48))
-+                    fail();
-+            }
-+        }
-+        for (int j = 0; j < 24; ++j)
-+            for (int i = 0; i < 48; ++i)
-+                filter_buf1[j * 48 + i] = 0x60 + 0x40 * (i >= 16 && j >= 4);
-+        if (check_func(func, "vc1dsp.%s_bestcase", tests[t].name))
-+            bench_new(filter_buf1 + 4 * 48 + 16, 48, 1);
-+        if (check_func(func, "vc1dsp.%s_worstcase", tests[t].name))
-+            bench_new(filter_buf1 + 4 * 48 + 16, 48, 31);
-+    }
-+}
-+
-+#define TEST_UNESCAPE                                                                               \
-+    do {                                                                                            \
-+        for (int count = 100; count > 0; --count) {                                                 \
-+            escaped_offset = rnd() & 7;                                                             \
-+            unescaped_offset = rnd() & 7;                                                           \
-+            escaped_len = (1u << (rnd() % 8) + 3) - (rnd() & 7);                                    \
-+            RANDOMIZE_BUFFER8(unescaped, UNESCAPE_BUF_SIZE);                                        \
-+            len0 = call_ref(escaped0 + escaped_offset, escaped_len, unescaped0 + unescaped_offset); \
-+            len1 = call_new(escaped1 + escaped_offset, escaped_len, unescaped1 + unescaped_offset); \
-+            if (len0 != len1 || memcmp(unescaped0, unescaped1, UNESCAPE_BUF_SIZE))                  \
-+                fail();                                                                             \
-+        }                                                                                           \
-+    } while (0)
-+
-+static void check_unescape(void)
-+{
-+    /* This appears to be a typical length of buffer in use */
-+#define LOG2_UNESCAPE_BUF_SIZE 17
-+#define UNESCAPE_BUF_SIZE (1u<<LOG2_UNESCAPE_BUF_SIZE)
-+    LOCAL_ALIGNED_8(uint8_t, escaped0, [UNESCAPE_BUF_SIZE]);
-+    LOCAL_ALIGNED_8(uint8_t, escaped1, [UNESCAPE_BUF_SIZE]);
-+    LOCAL_ALIGNED_8(uint8_t, unescaped0, [UNESCAPE_BUF_SIZE]);
-+    LOCAL_ALIGNED_8(uint8_t, unescaped1, [UNESCAPE_BUF_SIZE]);
-+
-+    VC1DSPContext h;
-+
-+    ff_vc1dsp_init(&h);
-+
-+    if (check_func(h.vc1_unescape_buffer, "vc1dsp.vc1_unescape_buffer")) {
-+        int len0, len1, escaped_offset, unescaped_offset, escaped_len;
-+        declare_func_emms(AV_CPU_FLAG_MMX, int, const uint8_t *, int, uint8_t *);
-+
-+        /* Test data which consists of escapes sequences packed as tightly as possible */
-+        for (int x = 0; x < UNESCAPE_BUF_SIZE; ++x)
-+            escaped1[x] = escaped0[x] = 3 * (x % 3 == 0);
-+        TEST_UNESCAPE;
-+
-+        /* Test random data */
-+        RANDOMIZE_BUFFER8(escaped, UNESCAPE_BUF_SIZE);
-+        TEST_UNESCAPE;
-+
-+        /* Test data with escape sequences at random intervals */
-+        for (int x = 0; x <= UNESCAPE_BUF_SIZE - 4;) {
-+            int gap, gap_msb;
-+            escaped1[x+0] = escaped0[x+0] = 0;
-+            escaped1[x+1] = escaped0[x+1] = 0;
-+            escaped1[x+2] = escaped0[x+2] = 3;
-+            escaped1[x+3] = escaped0[x+3] = rnd() & 3;
-+            gap_msb = 2u << (rnd() % 8);
-+            gap = (rnd() &~ -gap_msb) | gap_msb;
-+            x += gap;
-+        }
-+        TEST_UNESCAPE;
-+
-+        /* Test data which is known to contain no escape sequences */
-+        memset(escaped0, 0xFF, UNESCAPE_BUF_SIZE);
-+        memset(escaped1, 0xFF, UNESCAPE_BUF_SIZE);
-+        TEST_UNESCAPE;
-+
-+        /* Benchmark the no-escape-sequences case */
-+        bench_new(escaped1, UNESCAPE_BUF_SIZE, unescaped1);
-+    }
-+}
-+
-+void checkasm_check_vc1dsp(void)
-+{
-+    check_inv_trans_inplace();
-+    check_inv_trans_adding();
-+    report("inv_trans");
-+
-+    check_loop_filter();
-+    report("loop_filter");
-+
-+    check_unescape();
-+    report("unescape_buffer");
-+}
---- a/tests/fate/checkasm.mak
-+++ b/tests/fate/checkasm.mak
-@@ -16,6 +16,7 @@ FATE_CHECKASM = fate-checkasm-aacpsdsp
-                 fate-checkasm-hevc_add_res                              \
-                 fate-checkasm-hevc_idct                                 \
-                 fate-checkasm-hevc_sao                                  \
-+                fate-checkasm-idctdsp                                   \
-                 fate-checkasm-jpeg2000dsp                               \
-                 fate-checkasm-llviddsp                                  \
-                 fate-checkasm-llviddspenc                               \
-@@ -27,6 +28,7 @@ FATE_CHECKASM = fate-checkasm-aacpsdsp
-                 fate-checkasm-sw_scale                                  \
-                 fate-checkasm-v210dec                                   \
-                 fate-checkasm-v210enc                                   \
-+                fate-checkasm-vc1dsp                                    \
-                 fate-checkasm-vf_blend                                  \
-                 fate-checkasm-vf_colorspace                             \
-                 fate-checkasm-vf_eq                                     \
diff --git a/.github/scripts/Linux/arm/ffmpeg-arm-patches/0005-fix_flags.patch b/.github/scripts/Linux/arm/ffmpeg-arm-patches/0005-fix_flags.patch
deleted file mode 100644
index 6175d5be61..0000000000
--- a/.github/scripts/Linux/arm/ffmpeg-arm-patches/0005-fix_flags.patch
+++ /dev/null
@@ -1,17 +0,0 @@
---- a/configure
-+++ b/configure
-@@ -6471,11 +6471,9 @@ enabled mbedtls           && { check_pkg
-                                die "ERROR: mbedTLS not found"; }
- enabled mediacodec        && { enabled jni || die "ERROR: mediacodec requires --enable-jni"; }
- ( enabled rpi ||
--  enabled mmal )          && { check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host ||
--                               { ! enabled cross_compile &&
--                                 add_cflags -isystem/opt/vc/include/ -isystem/opt/vc/include/interface/vmcs_host/linux -isystem/opt/vc/include/interface/vcos/pthreads -fgnu89-inline &&
--                                 add_ldflags -L/opt/vc/lib/ &&
--                                 check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host -lvcos -lvcsm -lvchostif -lvchiq_arm; } ||
-+  enabled mmal )          && { { add_cflags -isystem/opt/vc/include/ -isystem/opt/vc/include/interface/vmcs_host/linux -isystem/opt/vc/include/interface/vcos/pthreads -fgnu89-inline &&
-+                               add_ldflags -L/opt/vc/lib/ &&
-+                               check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host -lvcsm -lvchostif -lvchiq_arm -lvcos; } ||
-                                die "ERROR: mmal not found" &&
-                                check_func_headers interface/mmal/mmal.h "MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS"; }
- enabled openal            && { { for al_extralibs in "${OPENAL_LIBS}" "-lopenal" "-lOpenAL32"; do