From 04ee7fd2c42d3b3a57feda6869d429ca83d380c6 Mon Sep 17 00:00:00 2001 From: Nicole L Date: Wed, 2 Aug 2023 19:42:40 -0700 Subject: [PATCH] `fn dav1d_sgr_filter2_neon`: Deduplicate w/ generics (#343) --- src/looprestoration.rs | 155 +++++++++++++++++++++++++++++++-- src/looprestoration_tmpl_16.rs | 100 ++------------------- src/looprestoration_tmpl_8.rs | 97 ++------------------- 3 files changed, 159 insertions(+), 193 deletions(-) diff --git a/src/looprestoration.rs b/src/looprestoration.rs index 60d46fcfc..8461dab22 100644 --- a/src/looprestoration.rs +++ b/src/looprestoration.rs @@ -163,6 +163,21 @@ extern "C" { strength: libc::c_int, bitdepth_max: libc::c_int, ); + fn dav1d_sgr_box5_v_neon( + sumsq: *mut int32_t, + sum: *mut int16_t, + w: libc::c_int, + h: libc::c_int, + edges: LrEdgeFlags, + ); + fn dav1d_sgr_calc_ab2_neon( + a: *mut int32_t, + b: *mut int16_t, + w: libc::c_int, + h: libc::c_int, + strength: libc::c_int, + bitdepth_max: libc::c_int, + ); } // 256 * 1.5 + 3 + 3 = 390 @@ -974,7 +989,7 @@ unsafe fn sgr_mix_rust( } } -type fn_dav1d_sgr_box3_h_neon = unsafe extern "C" fn( +type fn_dav1d_sgr_box_h_neon = unsafe extern "C" fn( sumsq: *mut int32_t, sum: *mut int16_t, left: *const [::Pixel; 4], @@ -985,7 +1000,7 @@ type fn_dav1d_sgr_box3_h_neon = unsafe extern "C" fn( edges: LrEdgeFlags, ); -type fn_dav1d_sgr_finish_filter1_neon = unsafe extern "C" fn( +type fn_dav1d_sgr_finish_filter_neon = unsafe extern "C" fn( tmp: *mut int16_t, src: *const ::Pixel, stride: ptrdiff_t, @@ -998,13 +1013,15 @@ type fn_dav1d_sgr_finish_filter1_neon = unsafe extern "C" fn( // TODO(randomPoison): Temporarily pub until all usages can be made private. #[cfg(all(feature = "asm", any(target_arch = "arm", target_arch = "aarch64")))] pub(crate) trait BitDepthLooprestorationArm: BitDepth { - const dav1d_sgr_box3_h_neon: fn_dav1d_sgr_box3_h_neon; - const dav1d_sgr_finish_filter1_neon: fn_dav1d_sgr_finish_filter1_neon; + const dav1d_sgr_box3_h_neon: fn_dav1d_sgr_box_h_neon; + const dav1d_sgr_box5_h_neon: fn_dav1d_sgr_box_h_neon; + const dav1d_sgr_finish_filter1_neon: fn_dav1d_sgr_finish_filter_neon; + const dav1d_sgr_finish_filter2_neon: fn_dav1d_sgr_finish_filter_neon; } #[cfg(all(feature = "asm", any(target_arch = "arm", target_arch = "aarch64")))] impl BitDepthLooprestorationArm for BitDepth8 { - const dav1d_sgr_box3_h_neon: fn_dav1d_sgr_box3_h_neon = { + const dav1d_sgr_box3_h_neon: fn_dav1d_sgr_box_h_neon = { extern "C" { fn dav1d_sgr_box3_h_8bpc_neon( sumsq: *mut int32_t, @@ -1021,7 +1038,24 @@ impl BitDepthLooprestorationArm for BitDepth8 { dav1d_sgr_box3_h_8bpc_neon }; - const dav1d_sgr_finish_filter1_neon: fn_dav1d_sgr_finish_filter1_neon = { + const dav1d_sgr_box5_h_neon: fn_dav1d_sgr_box_h_neon = { + extern "C" { + fn dav1d_sgr_box5_h_8bpc_neon( + sumsq: *mut int32_t, + sum: *mut int16_t, + left: *const [::Pixel; 4], + src: *const ::Pixel, + stride: ptrdiff_t, + w: libc::c_int, + h: libc::c_int, + edges: LrEdgeFlags, + ); + } + + dav1d_sgr_box5_h_8bpc_neon + }; + + const dav1d_sgr_finish_filter1_neon: fn_dav1d_sgr_finish_filter_neon = { extern "C" { fn dav1d_sgr_finish_filter1_8bpc_neon( tmp: *mut int16_t, @@ -1036,11 +1070,27 @@ impl BitDepthLooprestorationArm for BitDepth8 { dav1d_sgr_finish_filter1_8bpc_neon }; + + const dav1d_sgr_finish_filter2_neon: fn_dav1d_sgr_finish_filter_neon = { + extern "C" { + fn dav1d_sgr_finish_filter2_8bpc_neon( + tmp: *mut int16_t, + src: *const ::Pixel, + stride: ptrdiff_t, + a: *const int32_t, + b: *const int16_t, + w: libc::c_int, + h: libc::c_int, + ); + } + + dav1d_sgr_finish_filter2_8bpc_neon + }; } #[cfg(all(feature = "asm", any(target_arch = "arm", target_arch = "aarch64")))] impl BitDepthLooprestorationArm for BitDepth16 { - const dav1d_sgr_box3_h_neon: fn_dav1d_sgr_box3_h_neon = { + const dav1d_sgr_box3_h_neon: fn_dav1d_sgr_box_h_neon = { extern "C" { fn dav1d_sgr_box3_h_16bpc_neon( sumsq: *mut int32_t, @@ -1057,7 +1107,24 @@ impl BitDepthLooprestorationArm for BitDepth16 { dav1d_sgr_box3_h_16bpc_neon }; - const dav1d_sgr_finish_filter1_neon: fn_dav1d_sgr_finish_filter1_neon = { + const dav1d_sgr_box5_h_neon: fn_dav1d_sgr_box_h_neon = { + extern "C" { + fn dav1d_sgr_box5_h_16bpc_neon( + sumsq: *mut int32_t, + sum: *mut int16_t, + left: *const [::Pixel; 4], + src: *const ::Pixel, + stride: ptrdiff_t, + w: libc::c_int, + h: libc::c_int, + edges: LrEdgeFlags, + ); + } + + dav1d_sgr_box5_h_16bpc_neon + }; + + const dav1d_sgr_finish_filter1_neon: fn_dav1d_sgr_finish_filter_neon = { extern "C" { fn dav1d_sgr_finish_filter1_16bpc_neon( tmp: *mut int16_t, @@ -1072,6 +1139,22 @@ impl BitDepthLooprestorationArm for BitDepth16 { dav1d_sgr_finish_filter1_16bpc_neon }; + + const dav1d_sgr_finish_filter2_neon: fn_dav1d_sgr_finish_filter_neon = { + extern "C" { + fn dav1d_sgr_finish_filter2_16bpc_neon( + tmp: *mut int16_t, + src: *const ::Pixel, + stride: ptrdiff_t, + a: *const int32_t, + b: *const int16_t, + w: libc::c_int, + h: libc::c_int, + ); + } + + dav1d_sgr_finish_filter2_16bpc_neon + }; } // TODO(randomPoison): Temporarily pub until callers are deduplicated. @@ -1129,3 +1212,59 @@ pub(crate) unsafe fn dav1d_sgr_filter1_neon( dav1d_sgr_calc_ab1_neon(a, b, w, h, strength, bd.bitdepth_max().as_()); BD::dav1d_sgr_finish_filter1_neon(tmp, src, stride, a, b, w, h); } + +// TODO(randomPoison): Temporarily pub until callers are deduplicated. +#[cfg(all(feature = "asm", any(target_arch = "arm", target_arch = "aarch64")))] +pub(crate) unsafe fn dav1d_sgr_filter2_neon( + mut tmp: *mut int16_t, + mut src: *const BD::Pixel, + stride: ptrdiff_t, + mut left: *const [BD::Pixel; 4], + mut lpf: *const BD::Pixel, + w: libc::c_int, + h: libc::c_int, + strength: libc::c_int, + edges: LrEdgeFlags, + bd: BD, +) { + let mut sumsq_mem: Align16<[int32_t; 27208]> = Align16([0; 27208]); + let sumsq: *mut int32_t = &mut *sumsq_mem + .0 + .as_mut_ptr() + .offset(((384 + 16) * 2 + 8) as isize) as *mut int32_t; + let a: *mut int32_t = sumsq; + let mut sum_mem: Align16<[int16_t; 27216]> = Align16([0; 27216]); + let sum: *mut int16_t = &mut *sum_mem + .0 + .as_mut_ptr() + .offset(((384 + 16) * 2 + 16) as isize) as *mut int16_t; + let b: *mut int16_t = sum; + BD::dav1d_sgr_box5_h_neon(sumsq, sum, left, src, stride, w, h, edges); + if edges as libc::c_uint & LR_HAVE_TOP as libc::c_int as libc::c_uint != 0 { + BD::dav1d_sgr_box5_h_neon( + &mut *sumsq.offset((-(2 as libc::c_int) * (384 + 16)) as isize), + &mut *sum.offset((-(2 as libc::c_int) * (384 + 16)) as isize), + 0 as *const [BD::Pixel; 4], + lpf, + stride, + w, + 2 as libc::c_int, + edges, + ); + } + if edges as libc::c_uint & LR_HAVE_BOTTOM as libc::c_int as libc::c_uint != 0 { + BD::dav1d_sgr_box5_h_neon( + &mut *sumsq.offset((h * (384 + 16)) as isize), + &mut *sum.offset((h * (384 + 16)) as isize), + 0 as *const [BD::Pixel; 4], + lpf.offset((6 * BD::pxstride(stride as usize)) as isize), + stride, + w, + 2 as libc::c_int, + edges, + ); + } + dav1d_sgr_box5_v_neon(sumsq, sum, w, h, edges); + dav1d_sgr_calc_ab2_neon(a, b, w, h, strength, bd.bitdepth_max().as_()); + BD::dav1d_sgr_finish_filter2_neon(tmp, src, stride, a, b, w, h); +} diff --git a/src/looprestoration_tmpl_16.rs b/src/looprestoration_tmpl_16.rs index a105c3a55..ff8b20b9e 100644 --- a/src/looprestoration_tmpl_16.rs +++ b/src/looprestoration_tmpl_16.rs @@ -9,30 +9,6 @@ use cfg_if::cfg_if; #[cfg(all(feature = "asm", any(target_arch = "arm", target_arch = "aarch64")))] extern "C" { - fn dav1d_sgr_box5_v_neon( - sumsq: *mut int32_t, - sum: *mut int16_t, - w: libc::c_int, - h: libc::c_int, - edges: LrEdgeFlags, - ); - fn dav1d_sgr_calc_ab2_neon( - a: *mut int32_t, - b: *mut int16_t, - w: libc::c_int, - h: libc::c_int, - strength: libc::c_int, - bitdepth_max: libc::c_int, - ); - fn dav1d_sgr_finish_filter2_16bpc_neon( - tmp: *mut int16_t, - src: *const pixel, - stride: ptrdiff_t, - a: *const int32_t, - b: *const int16_t, - w: libc::c_int, - h: libc::c_int, - ); fn dav1d_sgr_weighted2_16bpc_neon( dst: *mut pixel, dst_stride: ptrdiff_t, @@ -56,16 +32,6 @@ extern "C" { wt: libc::c_int, bitdepth_max: libc::c_int, ); - fn dav1d_sgr_box5_h_16bpc_neon( - sumsq: *mut int32_t, - sum: *mut int16_t, - left: *const [pixel; 4], - src: *const pixel, - stride: ptrdiff_t, - w: libc::c_int, - h: libc::c_int, - edges: LrEdgeFlags, - ); } #[cfg(all(feature = "asm", target_arch = "arm"))] extern "C" { @@ -378,64 +344,6 @@ unsafe extern "C" fn sgr_filter_3x3_neon( ); } -#[cfg(all(feature = "asm", any(target_arch = "arm", target_arch = "aarch64")))] -unsafe extern "C" fn dav1d_sgr_filter2_neon( - mut tmp: *mut int16_t, - mut src: *const pixel, - stride: ptrdiff_t, - mut left: *const [pixel; 4], - mut lpf: *const pixel, - w: libc::c_int, - h: libc::c_int, - strength: libc::c_int, - edges: LrEdgeFlags, - bitdepth_max: libc::c_int, -) { - use crate::src::looprestoration::LR_HAVE_BOTTOM; - use crate::src::looprestoration::LR_HAVE_TOP; - - let mut sumsq_mem: Align16<[int32_t; 27208]> = Align16([0; 27208]); - let sumsq: *mut int32_t = &mut *sumsq_mem - .0 - .as_mut_ptr() - .offset(((384 + 16) * 2 + 8) as isize) as *mut int32_t; - let a: *mut int32_t = sumsq; - let mut sum_mem: Align16<[int16_t; 27216]> = Align16([0; 27216]); - let sum: *mut int16_t = &mut *sum_mem - .0 - .as_mut_ptr() - .offset(((384 + 16) * 2 + 16) as isize) as *mut int16_t; - let b: *mut int16_t = sum; - dav1d_sgr_box5_h_16bpc_neon(sumsq, sum, left, src, stride, w, h, edges); - if edges as libc::c_uint & LR_HAVE_TOP as libc::c_int as libc::c_uint != 0 { - dav1d_sgr_box5_h_16bpc_neon( - &mut *sumsq.offset((-(2 as libc::c_int) * (384 + 16)) as isize), - &mut *sum.offset((-(2 as libc::c_int) * (384 + 16)) as isize), - 0 as *const [pixel; 4], - lpf, - stride, - w, - 2 as libc::c_int, - edges, - ); - } - if edges as libc::c_uint & LR_HAVE_BOTTOM as libc::c_int as libc::c_uint != 0 { - dav1d_sgr_box5_h_16bpc_neon( - &mut *sumsq.offset((h * (384 + 16)) as isize), - &mut *sum.offset((h * (384 + 16)) as isize), - 0 as *const [pixel; 4], - lpf.offset((6 * PXSTRIDE(stride)) as isize), - stride, - w, - 2 as libc::c_int, - edges, - ); - } - dav1d_sgr_box5_v_neon(sumsq, sum, w, h, edges); - dav1d_sgr_calc_ab2_neon(a, b, w, h, strength, bitdepth_max); - dav1d_sgr_finish_filter2_16bpc_neon(tmp, src, stride, a, b, w, h); -} - #[cfg(all(feature = "asm", any(target_arch = "arm", target_arch = "aarch64")))] unsafe extern "C" fn sgr_filter_5x5_neon_erased( p: *mut libc::c_void, @@ -473,6 +381,9 @@ unsafe extern "C" fn sgr_filter_5x5_neon( edges: LrEdgeFlags, bitdepth_max: libc::c_int, ) { + use crate::include::common::bitdepth::BitDepth; + use crate::src::looprestoration::dav1d_sgr_filter2_neon; + let mut tmp: Align16<[int16_t; 24576]> = Align16([0; 24576]); dav1d_sgr_filter2_neon( tmp.0.as_mut_ptr(), @@ -484,7 +395,7 @@ unsafe extern "C" fn sgr_filter_5x5_neon( h, (*params).sgr.s0 as libc::c_int, edges, - bitdepth_max, + BitDepth16::from_c(bitdepth_max), ); dav1d_sgr_weighted1_16bpc_neon( dst, @@ -538,6 +449,7 @@ unsafe extern "C" fn sgr_filter_mix_neon( ) { use crate::include::common::bitdepth::BitDepth; use crate::src::looprestoration::dav1d_sgr_filter1_neon; + use crate::src::looprestoration::dav1d_sgr_filter2_neon; let mut tmp1: Align16<[int16_t; 24576]> = Align16([0; 24576]); let mut tmp2: Align16<[int16_t; 24576]> = Align16([0; 24576]); @@ -551,7 +463,7 @@ unsafe extern "C" fn sgr_filter_mix_neon( h, (*params).sgr.s0 as libc::c_int, edges, - bitdepth_max, + BitDepth16::from_c(bitdepth_max), ); dav1d_sgr_filter1_neon( tmp2.0.as_mut_ptr(), diff --git a/src/looprestoration_tmpl_8.rs b/src/looprestoration_tmpl_8.rs index a31af2d05..a7ee7bb8a 100644 --- a/src/looprestoration_tmpl_8.rs +++ b/src/looprestoration_tmpl_8.rs @@ -8,40 +8,6 @@ use cfg_if::cfg_if; #[cfg(all(feature = "asm", any(target_arch = "arm", target_arch = "aarch64")))] extern "C" { - fn dav1d_sgr_box5_h_8bpc_neon( - sumsq: *mut int32_t, - sum: *mut int16_t, - left: *const [pixel; 4], - src: *const pixel, - stride: ptrdiff_t, - w: libc::c_int, - h: libc::c_int, - edges: LrEdgeFlags, - ); - fn dav1d_sgr_box5_v_neon( - sumsq: *mut int32_t, - sum: *mut int16_t, - w: libc::c_int, - h: libc::c_int, - edges: LrEdgeFlags, - ); - fn dav1d_sgr_calc_ab2_neon( - a: *mut int32_t, - b: *mut int16_t, - w: libc::c_int, - h: libc::c_int, - strength: libc::c_int, - bitdepth_max: libc::c_int, - ); - fn dav1d_sgr_finish_filter2_8bpc_neon( - tmp: *mut int16_t, - src: *const pixel, - stride: ptrdiff_t, - a: *const int32_t, - b: *const int16_t, - w: libc::c_int, - h: libc::c_int, - ); fn dav1d_sgr_weighted1_8bpc_neon( dst: *mut pixel, dst_stride: ptrdiff_t, @@ -288,63 +254,6 @@ unsafe extern "C" fn loop_restoration_dsp_init_arm( (*c).sgr[2] = sgr_filter_mix_neon_erased; } -#[cfg(all(feature = "asm", any(target_arch = "arm", target_arch = "aarch64")))] -unsafe extern "C" fn dav1d_sgr_filter2_neon( - mut tmp: *mut int16_t, - mut src: *const pixel, - stride: ptrdiff_t, - mut left: *const [pixel; 4], - mut lpf: *const pixel, - w: libc::c_int, - h: libc::c_int, - strength: libc::c_int, - edges: LrEdgeFlags, -) { - use crate::src::looprestoration::LR_HAVE_BOTTOM; - use crate::src::looprestoration::LR_HAVE_TOP; - - let mut sumsq_mem: Align16<[int32_t; 27208]> = Align16([0; 27208]); - let sumsq: *mut int32_t = &mut *sumsq_mem - .0 - .as_mut_ptr() - .offset(((384 + 16) * 2 + 8) as isize) as *mut int32_t; - let a: *mut int32_t = sumsq; - let mut sum_mem: Align16<[int16_t; 27216]> = Align16([0; 27216]); - let sum: *mut int16_t = &mut *sum_mem - .0 - .as_mut_ptr() - .offset(((384 + 16) * 2 + 16) as isize) as *mut int16_t; - let b: *mut int16_t = sum; - dav1d_sgr_box5_h_8bpc_neon(sumsq, sum, left, src, stride, w, h, edges); - if edges as libc::c_uint & LR_HAVE_TOP as libc::c_int as libc::c_uint != 0 { - dav1d_sgr_box5_h_8bpc_neon( - &mut *sumsq.offset((-(2 as libc::c_int) * (384 + 16)) as isize), - &mut *sum.offset((-(2 as libc::c_int) * (384 + 16)) as isize), - 0 as *const [pixel; 4], - lpf, - stride, - w, - 2 as libc::c_int, - edges, - ); - } - if edges as libc::c_uint & LR_HAVE_BOTTOM as libc::c_int as libc::c_uint != 0 { - dav1d_sgr_box5_h_8bpc_neon( - &mut *sumsq.offset((h * (384 + 16)) as isize), - &mut *sum.offset((h * (384 + 16)) as isize), - 0 as *const [pixel; 4], - lpf.offset((6 * stride) as isize), - stride, - w, - 2 as libc::c_int, - edges, - ); - } - dav1d_sgr_box5_v_neon(sumsq, sum, w, h, edges); - dav1d_sgr_calc_ab2_neon(a, b, w, h, strength, 0xff as libc::c_int); - dav1d_sgr_finish_filter2_8bpc_neon(tmp, src, stride, a, b, w, h); -} - #[cfg(all(feature = "asm", any(target_arch = "arm", target_arch = "aarch64")))] unsafe extern "C" fn sgr_filter_5x5_neon_erased( p: *mut libc::c_void, @@ -380,6 +289,9 @@ unsafe extern "C" fn sgr_filter_5x5_neon( params: *const LooprestorationParams, edges: LrEdgeFlags, ) { + use crate::include::common::bitdepth::BitDepth; + use crate::src::looprestoration::dav1d_sgr_filter2_neon; + let mut tmp: Align16<[int16_t; 24576]> = Align16([0; 24576]); dav1d_sgr_filter2_neon( tmp.0.as_mut_ptr(), @@ -391,6 +303,7 @@ unsafe extern "C" fn sgr_filter_5x5_neon( h, (*params).sgr.s0 as libc::c_int, edges, + BitDepth8::new(()), ); dav1d_sgr_weighted1_8bpc_neon( dst, @@ -504,6 +417,7 @@ unsafe extern "C" fn sgr_filter_mix_neon( ) { use crate::include::common::bitdepth::BitDepth; use crate::src::looprestoration::dav1d_sgr_filter1_neon; + use crate::src::looprestoration::dav1d_sgr_filter2_neon; let mut tmp1: Align16<[int16_t; 24576]> = Align16([0; 24576]); let mut tmp2: Align16<[int16_t; 24576]> = Align16([0; 24576]); @@ -517,6 +431,7 @@ unsafe extern "C" fn sgr_filter_mix_neon( h, (*params).sgr.s0 as libc::c_int, edges, + BitDepth8::new(()), ); dav1d_sgr_filter1_neon( tmp2.0.as_mut_ptr(),