diff --git a/src/looprestoration_tmpl_16.rs b/src/looprestoration_tmpl_16.rs index 5d143512a..4073720a2 100644 --- a/src/looprestoration_tmpl_16.rs +++ b/src/looprestoration_tmpl_16.rs @@ -294,6 +294,31 @@ extern "C" { edges: LrEdgeFlags, ); } +#[cfg(all(feature = "asm", target_arch = "arm"))] +extern "C" { + fn dav1d_wiener_filter_h_16bpc_neon( + dst: *mut int16_t, + left: *const [pixel; 4], + src: *const pixel, + stride: ptrdiff_t, + fh: *const int16_t, + w: intptr_t, + h: libc::c_int, + edges: LrEdgeFlags, + bitdepth_max: libc::c_int, + ); + fn dav1d_wiener_filter_v_16bpc_neon( + dst: *mut pixel, + stride: ptrdiff_t, + mid: *const int16_t, + w: libc::c_int, + h: libc::c_int, + fv: *const int16_t, + edges: LrEdgeFlags, + mid_stride: ptrdiff_t, + bitdepth_max: libc::c_int, + ); +} use crate::src::tables::dav1d_sgr_x_by_x; @@ -1072,7 +1097,76 @@ unsafe extern "C" fn loop_restoration_dsp_init_x86( } } } - +#[cfg(all(feature = "asm", target_arch = "arm"))] +unsafe extern "C" fn wiener_filter_neon( + dst: *mut pixel, + stride: ptrdiff_t, + left: *const [pixel; 4], + mut lpf: *const pixel, + w: libc::c_int, + h: libc::c_int, + params: *const LooprestorationParams, + edges: LrEdgeFlags, + bitdepth_max: libc::c_int, +) { + let filter: *const [int16_t; 8] = (*params).filter.0.as_ptr(); + let mut mid: Align16<[int16_t; 68 * 384]> = Align16([0; 68 * 384]); + let mut mid_stride: libc::c_int = w + 7 & !7; + dav1d_wiener_filter_h_16bpc_neon( + &mut *mid.0.as_mut_ptr().offset((2 * mid_stride) as isize), + left, + dst, + stride, + (*filter.offset(0)).as_ptr(), + w as intptr_t, + h, + edges, + bitdepth_max, + ); + if edges & LR_HAVE_TOP != 0 { + dav1d_wiener_filter_h_16bpc_neon( + mid.0.as_mut_ptr(), + core::ptr::null(), + lpf, + stride, + (*filter.offset(0)).as_ptr(), + w as intptr_t, + 2, + edges, + bitdepth_max, + ); + } + if edges & LR_HAVE_BOTTOM != 0 { + dav1d_wiener_filter_h_16bpc_neon( + &mut *mid + .0 + .as_mut_ptr() + .offset(((2 as libc::c_int + h) * mid_stride) as isize), + core::ptr::null(), + lpf.offset((6 * PXSTRIDE(stride)) as isize), + stride, + (*filter.offset(0)).as_ptr(), + w as intptr_t, + 2, + edges, + bitdepth_max, + ); + } + dav1d_wiener_filter_v_16bpc_neon( + dst, + stride, + &mut *mid + .0 + .as_mut_ptr() + .offset((2 as libc::c_int * mid_stride) as isize), + w, + h, + (*filter.offset(1)).as_ptr(), + edges, + (mid_stride as usize * ::core::mem::size_of::()) as ptrdiff_t, + bitdepth_max, + ); +} #[cfg(feature = "asm")] use crate::src::cpu::dav1d_get_cpu_flags; @@ -1095,9 +1189,8 @@ unsafe extern "C" fn loop_restoration_dsp_init_arm( (*c).wiener[0] = Some(dav1d_wiener_filter7_16bpc_neon); (*c).wiener[1] = Some(dav1d_wiener_filter5_16bpc_neon); } else { - // TODO(perl): enable assembly routines here - // (*c).wiener[0] = Some(dav1d_wiener_filter_neon); - // (*c).wiener[1] = Some(dav1d_wiener_filter_neon); + (*c).wiener[0] = Some(wiener_filter_neon); + (*c).wiener[1] = Some(wiener_filter_neon); } } diff --git a/src/looprestoration_tmpl_8.rs b/src/looprestoration_tmpl_8.rs index 1bf2c5aee..13e80ae0d 100644 --- a/src/looprestoration_tmpl_8.rs +++ b/src/looprestoration_tmpl_8.rs @@ -288,6 +288,30 @@ extern "C" { ); } +#[cfg(all(feature = "asm", target_arch = "arm"))] +extern "C" { + fn dav1d_wiener_filter_h_8bpc_neon( + dst: *mut int16_t, + left: *const [pixel; 4], + src: *const pixel, + stride: ptrdiff_t, + fh: *const int16_t, + w: intptr_t, + h: libc::c_int, + edges: LrEdgeFlags, + ); + fn dav1d_wiener_filter_v_8bpc_neon( + dst: *mut pixel, + stride: ptrdiff_t, + mid: *const int16_t, + w: libc::c_int, + h: libc::c_int, + fv: *const int16_t, + edges: LrEdgeFlags, + mid_stride: ptrdiff_t, + ); +} + use crate::src::tables::dav1d_sgr_x_by_x; pub type pixel = uint8_t; @@ -1037,7 +1061,70 @@ unsafe extern "C" fn loop_restoration_dsp_init_x86( (*c).sgr[2] = Some(dav1d_sgr_filter_mix_8bpc_avx512icl); } } - +#[cfg(all(feature = "asm", target_arch = "arm"))] +unsafe extern "C" fn wiener_filter_neon( + dst: *mut pixel, + stride: ptrdiff_t, + left: *const [pixel; 4], + mut lpf: *const pixel, + w: libc::c_int, + h: libc::c_int, + params: *const LooprestorationParams, + edges: LrEdgeFlags, +) { + let filter: *const [int16_t; 8] = (*params).filter.0.as_ptr(); + let mut mid: Align16<[int16_t; 68 * 384]> = Align16([0; 68 * 384]); + let mut mid_stride: libc::c_int = w + 7 & !7; + // Horizontal filter + dav1d_wiener_filter_h_8bpc_neon( + &mut *mid.0.as_mut_ptr().offset((2 * mid_stride) as isize), + left, + dst, + stride, + (*filter.offset(0)).as_ptr(), + w as isize, + h, + edges, + ); + if edges & LR_HAVE_TOP != 0 { + dav1d_wiener_filter_h_8bpc_neon( + mid.0.as_mut_ptr(), + core::ptr::null(), + lpf, + stride, + (*filter.offset(0)).as_ptr(), + w as isize, + 2, + edges, + ); + } + if edges & LR_HAVE_BOTTOM != 0 { + dav1d_wiener_filter_h_8bpc_neon( + &mut *mid + .0 + .as_mut_ptr() + .offset(((2 as libc::c_int + h) * mid_stride) as isize), + core::ptr::null(), + lpf.offset((6 * stride) as isize), + stride, + (*filter.offset(0)).as_ptr(), + w as isize, + 2, + edges, + ); + } + // Vertical filter + dav1d_wiener_filter_v_8bpc_neon( + dst, + stride, + &mut *mid.0.as_mut_ptr().offset((2 * mid_stride) as isize), + w, + h, + (*filter.offset(1)).as_ptr(), + edges, + (mid_stride as usize * ::core::mem::size_of::()) as ptrdiff_t, + ); +} #[cfg(feature = "asm")] use crate::src::cpu::dav1d_get_cpu_flags; @@ -1060,9 +1147,8 @@ unsafe extern "C" fn loop_restoration_dsp_init_arm( (*c).wiener[0] = Some(dav1d_wiener_filter7_8bpc_neon); (*c).wiener[1] = Some(dav1d_wiener_filter5_8bpc_neon); } else { - // TODO(perl): enable assembly routines here - // (*c).wiener[0] = Some(dav1d_wiener_filter_neon); - // (*c).wiener[1] = Some(dav1d_wiener_filter_neon); + (*c).wiener[0] = Some(wiener_filter_neon); + (*c).wiener[1] = Some(wiener_filter_neon); } }