fn dav1d_sgr_filter2_neon: Deduplicate w/ generics (#343)

memorysafety · Aug 3, 2023 · 04ee7fd · 04ee7fd
1 parent 0099481
commit 04ee7fd
Show file tree

Hide file tree

Showing 3 changed files with 159 additions and 193 deletions.
diff --git a/src/looprestoration.rs b/src/looprestoration.rs
@@ -163,6 +163,21 @@ extern "C" {
         strength: libc::c_int,
         bitdepth_max: libc::c_int,
     );
+    fn dav1d_sgr_box5_v_neon( // no bpc suffix: the generic `dav1d_sgr_filter2_neon` calls it for every `BD`
+        sumsq: *mut int32_t,
+        sum: *mut int16_t,
+        w: libc::c_int,
+        h: libc::c_int,
+        edges: LrEdgeFlags,
+    );
+    fn dav1d_sgr_calc_ab2_neon( // no bpc suffix: the generic `dav1d_sgr_filter2_neon` calls it for every `BD`
+        a: *mut int32_t,
+        b: *mut int16_t,
+        w: libc::c_int,
+        h: libc::c_int,
+        strength: libc::c_int,
+        bitdepth_max: libc::c_int, // the caller passes `bd.bitdepth_max().as_()`
+    );
 }
 
 // 256 * 1.5 + 3 + 3 = 390
@@ -974,7 +989,7 @@ unsafe fn sgr_mix_rust<BD: BitDepth>(
     }
 }
 
-type fn_dav1d_sgr_box3_h_neon<BD> = unsafe extern "C" fn(
+type fn_dav1d_sgr_box_h_neon<BD> = unsafe extern "C" fn(
     sumsq: *mut int32_t,
     sum: *mut int16_t,
     left: *const [<BD as BitDepth>::Pixel; 4],
@@ -985,7 +1000,7 @@ type fn_dav1d_sgr_box3_h_neon<BD> = unsafe extern "C" fn(
     edges: LrEdgeFlags,
 );
 
-type fn_dav1d_sgr_finish_filter1_neon<BD> = unsafe extern "C" fn(
+type fn_dav1d_sgr_finish_filter_neon<BD> = unsafe extern "C" fn(
     tmp: *mut int16_t,
     src: *const <BD as BitDepth>::Pixel,
     stride: ptrdiff_t,
@@ -998,13 +1013,15 @@ type fn_dav1d_sgr_finish_filter1_neon<BD> = unsafe extern "C" fn(
 // TODO(randomPoison): Temporarily pub until all usages can be made private.
 #[cfg(all(feature = "asm", any(target_arch = "arm", target_arch = "aarch64")))]
 pub(crate) trait BitDepthLooprestorationArm: BitDepth { // per-bit-depth table of NEON entry points
-    const dav1d_sgr_box3_h_neon: fn_dav1d_sgr_box3_h_neon<Self>;
-    const dav1d_sgr_finish_filter1_neon: fn_dav1d_sgr_finish_filter1_neon<Self>;
+    const dav1d_sgr_box3_h_neon: fn_dav1d_sgr_box_h_neon<Self>;
+    const dav1d_sgr_box5_h_neon: fn_dav1d_sgr_box_h_neon<Self>; // same pointer type as box3_h
+    const dav1d_sgr_finish_filter1_neon: fn_dav1d_sgr_finish_filter_neon<Self>;
+    const dav1d_sgr_finish_filter2_neon: fn_dav1d_sgr_finish_filter_neon<Self>; // same pointer type as finish_filter1
 }
 
 #[cfg(all(feature = "asm", any(target_arch = "arm", target_arch = "aarch64")))]
 impl BitDepthLooprestorationArm for BitDepth8 {
-    const dav1d_sgr_box3_h_neon: fn_dav1d_sgr_box3_h_neon<Self> = {
+    const dav1d_sgr_box3_h_neon: fn_dav1d_sgr_box_h_neon<Self> = {
         extern "C" {
             fn dav1d_sgr_box3_h_8bpc_neon(
                 sumsq: *mut int32_t,
@@ -1021,7 +1038,24 @@ impl BitDepthLooprestorationArm for BitDepth8 {
         dav1d_sgr_box3_h_8bpc_neon
     };
 
-    const dav1d_sgr_finish_filter1_neon: fn_dav1d_sgr_finish_filter1_neon<Self> = {
+    const dav1d_sgr_box5_h_neon: fn_dav1d_sgr_box_h_neon<Self> = { // BitDepth8's box5_h entry point
+        extern "C" {
+            fn dav1d_sgr_box5_h_8bpc_neon(
+                sumsq: *mut int32_t,
+                sum: *mut int16_t,
+                left: *const [<BitDepth8 as BitDepth>::Pixel; 4],
+                src: *const <BitDepth8 as BitDepth>::Pixel,
+                stride: ptrdiff_t,
+                w: libc::c_int,
+                h: libc::c_int,
+                edges: LrEdgeFlags,
+            );
+        }
+
+        dav1d_sgr_box5_h_8bpc_neon // the block's value: the extern fn declared just above
+    };
+
+    const dav1d_sgr_finish_filter1_neon: fn_dav1d_sgr_finish_filter_neon<Self> = {
         extern "C" {
             fn dav1d_sgr_finish_filter1_8bpc_neon(
                 tmp: *mut int16_t,
@@ -1036,11 +1070,27 @@ impl BitDepthLooprestorationArm for BitDepth8 {
 
         dav1d_sgr_finish_filter1_8bpc_neon
     };
+
+    const dav1d_sgr_finish_filter2_neon: fn_dav1d_sgr_finish_filter_neon<Self> = { // BitDepth8's finish_filter2 entry point
+        extern "C" {
+            fn dav1d_sgr_finish_filter2_8bpc_neon(
+                tmp: *mut int16_t,
+                src: *const <BitDepth8 as BitDepth>::Pixel,
+                stride: ptrdiff_t,
+                a: *const int32_t,
+                b: *const int16_t,
+                w: libc::c_int,
+                h: libc::c_int,
+            );
+        }
+
+        dav1d_sgr_finish_filter2_8bpc_neon // the block's value: the extern fn declared just above
+    };
 }
 
 #[cfg(all(feature = "asm", any(target_arch = "arm", target_arch = "aarch64")))]
 impl BitDepthLooprestorationArm for BitDepth16 {
-    const dav1d_sgr_box3_h_neon: fn_dav1d_sgr_box3_h_neon<Self> = {
+    const dav1d_sgr_box3_h_neon: fn_dav1d_sgr_box_h_neon<Self> = {
         extern "C" {
             fn dav1d_sgr_box3_h_16bpc_neon(
                 sumsq: *mut int32_t,
@@ -1057,7 +1107,24 @@ impl BitDepthLooprestorationArm for BitDepth16 {
         dav1d_sgr_box3_h_16bpc_neon
     };
 
-    const dav1d_sgr_finish_filter1_neon: fn_dav1d_sgr_finish_filter1_neon<Self> = {
+    const dav1d_sgr_box5_h_neon: fn_dav1d_sgr_box_h_neon<Self> = { // BitDepth16's box5_h entry point
+        extern "C" {
+            fn dav1d_sgr_box5_h_16bpc_neon(
+                sumsq: *mut int32_t,
+                sum: *mut int16_t,
+                left: *const [<BitDepth16 as BitDepth>::Pixel; 4],
+                src: *const <BitDepth16 as BitDepth>::Pixel,
+                stride: ptrdiff_t,
+                w: libc::c_int,
+                h: libc::c_int,
+                edges: LrEdgeFlags,
+            );
+        }
+
+        dav1d_sgr_box5_h_16bpc_neon // the block's value: the extern fn declared just above
+    };
+
+    const dav1d_sgr_finish_filter1_neon: fn_dav1d_sgr_finish_filter_neon<Self> = {
         extern "C" {
             fn dav1d_sgr_finish_filter1_16bpc_neon(
                 tmp: *mut int16_t,
@@ -1072,6 +1139,22 @@ impl BitDepthLooprestorationArm for BitDepth16 {
 
         dav1d_sgr_finish_filter1_16bpc_neon
     };
+
+    const dav1d_sgr_finish_filter2_neon: fn_dav1d_sgr_finish_filter_neon<Self> = { // BitDepth16's finish_filter2 entry point
+        extern "C" {
+            fn dav1d_sgr_finish_filter2_16bpc_neon(
+                tmp: *mut int16_t,
+                src: *const <BitDepth16 as BitDepth>::Pixel,
+                stride: ptrdiff_t,
+                a: *const int32_t,
+                b: *const int16_t,
+                w: libc::c_int,
+                h: libc::c_int,
+            );
+        }
+
+        dav1d_sgr_finish_filter2_16bpc_neon // the block's value: the extern fn declared just above
+    };
 }
 
 // TODO(randomPoison): Temporarily pub until callers are deduplicated.
@@ -1129,3 +1212,59 @@ pub(crate) unsafe fn dav1d_sgr_filter1_neon<BD: BitDepthLooprestorationArm>(
     dav1d_sgr_calc_ab1_neon(a, b, w, h, strength, bd.bitdepth_max().as_());
     BD::dav1d_sgr_finish_filter1_neon(tmp, src, stride, a, b, w, h);
 }
+
+// TODO(randomPoison): Temporarily pub until callers are deduplicated.
+#[cfg(all(feature = "asm", any(target_arch = "arm", target_arch = "aarch64")))]
+pub(crate) unsafe fn dav1d_sgr_filter2_neon<BD: BitDepthLooprestorationArm>( // bit-depth-generic driver: box5_h -> box5_v -> calc_ab2 -> finish_filter2
+    mut tmp: *mut int16_t, // forwarded to `finish_filter2` as its first argument
+    mut src: *const BD::Pixel, // source for the main box5_h pass and for `finish_filter2`
+    stride: ptrdiff_t, // NOTE(review): looks like a byte stride (converted via `BD::pxstride` below) — confirm
+    mut left: *const [BD::Pixel; 4], // only the main box5_h pass receives this
+    mut lpf: *const BD::Pixel, // source for the LR_HAVE_TOP / LR_HAVE_BOTTOM passes
+    w: libc::c_int,
+    h: libc::c_int, // row count for the main pass; also positions the bottom edge output
+    strength: libc::c_int, // forwarded to `calc_ab2`
+    edges: LrEdgeFlags, // tested for LR_HAVE_TOP / LR_HAVE_BOTTOM and forwarded to the asm
+    bd: BD, // only used for `bitdepth_max()`
+) {
+    let mut sumsq_mem: Align16<[int32_t; 27208]> = Align16([0; 27208]); // zero-filled scratch
+    let sumsq: *mut int32_t = &mut *sumsq_mem
+        .0
+        .as_mut_ptr()
+        .offset(((384 + 16) * 2 + 8) as isize) as *mut int32_t; // 808 elements in, so the -800 offset below stays inside the array
+    let a: *mut int32_t = sumsq; // `a` aliases `sumsq`
+    let mut sum_mem: Align16<[int16_t; 27216]> = Align16([0; 27216]); // zero-filled scratch
+    let sum: *mut int16_t = &mut *sum_mem
+        .0
+        .as_mut_ptr()
+        .offset(((384 + 16) * 2 + 16) as isize) as *mut int16_t; // 816 elements in, so the -800 offset below stays inside the array
+    let b: *mut int16_t = sum; // `b` aliases `sum`
+    BD::dav1d_sgr_box5_h_neon(sumsq, sum, left, src, stride, w, h, edges); // main pass: `h` rows from `src`
+    if edges as libc::c_uint & LR_HAVE_TOP as libc::c_int as libc::c_uint != 0 {
+        BD::dav1d_sgr_box5_h_neon(
+            &mut *sumsq.offset((-(2 as libc::c_int) * (384 + 16)) as isize), // 800 elements before `sumsq`
+            &mut *sum.offset((-(2 as libc::c_int) * (384 + 16)) as isize), // 800 elements before `sum`
+            0 as *const [BD::Pixel; 4], // null `left` for this pass
+            lpf,
+            stride,
+            w,
+            2 as libc::c_int, // passed as `h`: two rows
+            edges,
+        );
+    }
+    if edges as libc::c_uint & LR_HAVE_BOTTOM as libc::c_int as libc::c_uint != 0 {
+        BD::dav1d_sgr_box5_h_neon(
+            &mut *sumsq.offset((h * (384 + 16)) as isize), // presumably right after the `h` main-pass rows (400 elements per row) — TODO confirm
+            &mut *sum.offset((h * (384 + 16)) as isize),
+            0 as *const [BD::Pixel; 4], // null `left` for this pass
+            lpf.offset((6 * BD::pxstride(stride as usize)) as isize), // NOTE(review): `stride as usize` assumes stride >= 0 — confirm
+            stride,
+            w,
+            2 as libc::c_int, // passed as `h`: two rows
+            edges,
+        );
+    }
+    dav1d_sgr_box5_v_neon(sumsq, sum, w, h, edges);
+    dav1d_sgr_calc_ab2_neon(a, b, w, h, strength, bd.bitdepth_max().as_()); // same buffers, passed under their `a`/`b` aliases
+    BD::dav1d_sgr_finish_filter2_neon(tmp, src, stride, a, b, w, h);
+}
diff --git a/src/looprestoration_tmpl_16.rs b/src/looprestoration_tmpl_16.rs
@@ -9,30 +9,6 @@ use cfg_if::cfg_if;
 
 #[cfg(all(feature = "asm", any(target_arch = "arm", target_arch = "aarch64")))]
 extern "C" {
-    fn dav1d_sgr_box5_v_neon(
-        sumsq: *mut int32_t,
-        sum: *mut int16_t,
-        w: libc::c_int,
-        h: libc::c_int,
-        edges: LrEdgeFlags,
-    );
-    fn dav1d_sgr_calc_ab2_neon(
-        a: *mut int32_t,
-        b: *mut int16_t,
-        w: libc::c_int,
-        h: libc::c_int,
-        strength: libc::c_int,
-        bitdepth_max: libc::c_int,
-    );
-    fn dav1d_sgr_finish_filter2_16bpc_neon(
-        tmp: *mut int16_t,
-        src: *const pixel,
-        stride: ptrdiff_t,
-        a: *const int32_t,
-        b: *const int16_t,
-        w: libc::c_int,
-        h: libc::c_int,
-    );
     fn dav1d_sgr_weighted2_16bpc_neon(
         dst: *mut pixel,
         dst_stride: ptrdiff_t,
@@ -56,16 +32,6 @@ extern "C" {
         wt: libc::c_int,
         bitdepth_max: libc::c_int,
     );
-    fn dav1d_sgr_box5_h_16bpc_neon(
-        sumsq: *mut int32_t,
-        sum: *mut int16_t,
-        left: *const [pixel; 4],
-        src: *const pixel,
-        stride: ptrdiff_t,
-        w: libc::c_int,
-        h: libc::c_int,
-        edges: LrEdgeFlags,
-    );
 }
 #[cfg(all(feature = "asm", target_arch = "arm"))]
 extern "C" {
@@ -378,64 +344,6 @@ unsafe extern "C" fn sgr_filter_3x3_neon(
     );
 }
 
-#[cfg(all(feature = "asm", any(target_arch = "arm", target_arch = "aarch64")))]
-unsafe extern "C" fn dav1d_sgr_filter2_neon(
-    mut tmp: *mut int16_t,
-    mut src: *const pixel,
-    stride: ptrdiff_t,
-    mut left: *const [pixel; 4],
-    mut lpf: *const pixel,
-    w: libc::c_int,
-    h: libc::c_int,
-    strength: libc::c_int,
-    edges: LrEdgeFlags,
-    bitdepth_max: libc::c_int,
-) {
-    use crate::src::looprestoration::LR_HAVE_BOTTOM;
-    use crate::src::looprestoration::LR_HAVE_TOP;
-
-    let mut sumsq_mem: Align16<[int32_t; 27208]> = Align16([0; 27208]);
-    let sumsq: *mut int32_t = &mut *sumsq_mem
-        .0
-        .as_mut_ptr()
-        .offset(((384 + 16) * 2 + 8) as isize) as *mut int32_t;
-    let a: *mut int32_t = sumsq;
-    let mut sum_mem: Align16<[int16_t; 27216]> = Align16([0; 27216]);
-    let sum: *mut int16_t = &mut *sum_mem
-        .0
-        .as_mut_ptr()
-        .offset(((384 + 16) * 2 + 16) as isize) as *mut int16_t;
-    let b: *mut int16_t = sum;
-    dav1d_sgr_box5_h_16bpc_neon(sumsq, sum, left, src, stride, w, h, edges);
-    if edges as libc::c_uint & LR_HAVE_TOP as libc::c_int as libc::c_uint != 0 {
-        dav1d_sgr_box5_h_16bpc_neon(
-            &mut *sumsq.offset((-(2 as libc::c_int) * (384 + 16)) as isize),
-            &mut *sum.offset((-(2 as libc::c_int) * (384 + 16)) as isize),
-            0 as *const [pixel; 4],
-            lpf,
-            stride,
-            w,
-            2 as libc::c_int,
-            edges,
-        );
-    }
-    if edges as libc::c_uint & LR_HAVE_BOTTOM as libc::c_int as libc::c_uint != 0 {
-        dav1d_sgr_box5_h_16bpc_neon(
-            &mut *sumsq.offset((h * (384 + 16)) as isize),
-            &mut *sum.offset((h * (384 + 16)) as isize),
-            0 as *const [pixel; 4],
-            lpf.offset((6 * PXSTRIDE(stride)) as isize),
-            stride,
-            w,
-            2 as libc::c_int,
-            edges,
-        );
-    }
-    dav1d_sgr_box5_v_neon(sumsq, sum, w, h, edges);
-    dav1d_sgr_calc_ab2_neon(a, b, w, h, strength, bitdepth_max);
-    dav1d_sgr_finish_filter2_16bpc_neon(tmp, src, stride, a, b, w, h);
-}
-
 #[cfg(all(feature = "asm", any(target_arch = "arm", target_arch = "aarch64")))]
 unsafe extern "C" fn sgr_filter_5x5_neon_erased(
     p: *mut libc::c_void,
@@ -473,6 +381,9 @@ unsafe extern "C" fn sgr_filter_5x5_neon(
     edges: LrEdgeFlags,
     bitdepth_max: libc::c_int,
 ) {
+    use crate::include::common::bitdepth::BitDepth;
+    use crate::src::looprestoration::dav1d_sgr_filter2_neon;
+
     let mut tmp: Align16<[int16_t; 24576]> = Align16([0; 24576]);
     dav1d_sgr_filter2_neon(
         tmp.0.as_mut_ptr(),
@@ -484,7 +395,7 @@ unsafe extern "C" fn sgr_filter_5x5_neon(
         h,
         (*params).sgr.s0 as libc::c_int,
         edges,
-        bitdepth_max,
+        BitDepth16::from_c(bitdepth_max),
     );
     dav1d_sgr_weighted1_16bpc_neon(
         dst,
@@ -538,6 +449,7 @@ unsafe extern "C" fn sgr_filter_mix_neon(
 ) {
     use crate::include::common::bitdepth::BitDepth;
     use crate::src::looprestoration::dav1d_sgr_filter1_neon;
+    use crate::src::looprestoration::dav1d_sgr_filter2_neon;
 
     let mut tmp1: Align16<[int16_t; 24576]> = Align16([0; 24576]);
     let mut tmp2: Align16<[int16_t; 24576]> = Align16([0; 24576]);
@@ -551,7 +463,7 @@ unsafe extern "C" fn sgr_filter_mix_neon(
         h,
         (*params).sgr.s0 as libc::c_int,
         edges,
-        bitdepth_max,
+        BitDepth16::from_c(bitdepth_max),
     );
     dav1d_sgr_filter1_neon(
         tmp2.0.as_mut_ptr(),