From 8965ead84b3740aaadf316277ea958b22e0bf38a Mon Sep 17 00:00:00 2001 From: Khyber Sen Date: Tue, 22 Aug 2023 16:36:10 -0700 Subject: [PATCH 1/2] `src/looprestoration.rs`: Use the bitdepth-specific `fn`s directly rather than creating bitdepth-dispatching shim `fn`s. This eliminates the extra call. --- src/looprestoration.rs | 220 ++++++++++++++++------------------------- 1 file changed, 86 insertions(+), 134 deletions(-) diff --git a/src/looprestoration.rs b/src/looprestoration.rs index 14bd86918..a6b3cfde9 100644 --- a/src/looprestoration.rs +++ b/src/looprestoration.rs @@ -59,13 +59,14 @@ pub struct Dav1dLoopRestorationDSPContext { pub sgr: [looprestorationfilter_fn; 3], } -/// Generates a generic wrapper function that delegates to the appropriate -/// bitdepth-specific extern function. -macro_rules! decl_looprestorationfilter_bd_fn { - ($name:ident, $suffix:ident) => { - paste::paste! { - #[cfg(feature = "asm")] - unsafe extern "C" fn [<$name $suffix>]( +#[cfg(all( + feature = "asm", + any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64") +))] +macro_rules! decl_looprestorationfilter_fn { + (fn $name:ident) => {{ + extern "C" { + fn $name( dst: *mut pixel, dst_stride: ptrdiff_t, left: const_left_pixel_row, @@ -75,111 +76,11 @@ macro_rules! 
decl_looprestorationfilter_bd_fn { params: *const LooprestorationParams, edges: LrEdgeFlags, bitdepth_max: libc::c_int, - ) { - (match BD::BPC { - BPC::BPC8 => { - extern "C" { - fn [<$name _8bpc $suffix>]( - dst: *mut pixel, - dst_stride: ptrdiff_t, - left: const_left_pixel_row, - lpf: *const pixel, - w: libc::c_int, - h: libc::c_int, - params: *const LooprestorationParams, - edges: LrEdgeFlags, - bitdepth_max: libc::c_int, - ); - } - [<$name _8bpc $suffix>] - } - - BPC::BPC16 => { - extern "C" { - fn [<$name _16bpc $suffix>]( - dst: *mut pixel, - dst_stride: ptrdiff_t, - left: const_left_pixel_row, - lpf: *const pixel, - w: libc::c_int, - h: libc::c_int, - params: *const LooprestorationParams, - edges: LrEdgeFlags, - bitdepth_max: libc::c_int, - ); - } - [<$name _16bpc $suffix>] - } - })(dst, dst_stride, left, lpf, w, h, params, edges, bitdepth_max) - } + ); } - }; -} -decl_looprestorationfilter_bd_fn!(dav1d_wiener_filter7, _ssse3); -decl_looprestorationfilter_bd_fn!(dav1d_wiener_filter5, _ssse3); -decl_looprestorationfilter_bd_fn!(dav1d_wiener_filter5, _avx2); -decl_looprestorationfilter_bd_fn!(dav1d_wiener_filter7, _avx2); -decl_looprestorationfilter_bd_fn!(dav1d_wiener_filter7, _avx512icl); -decl_looprestorationfilter_bd_fn!(dav1d_sgr_filter_mix, _avx512icl); -decl_looprestorationfilter_bd_fn!(dav1d_sgr_filter_3x3, _avx512icl); -decl_looprestorationfilter_bd_fn!(dav1d_sgr_filter_5x5, _avx512icl); -decl_looprestorationfilter_bd_fn!(dav1d_sgr_filter_mix, _avx2); -decl_looprestorationfilter_bd_fn!(dav1d_sgr_filter_3x3, _avx2); -decl_looprestorationfilter_bd_fn!(dav1d_sgr_filter_5x5, _avx2); -decl_looprestorationfilter_bd_fn!(dav1d_sgr_filter_mix, _ssse3); -decl_looprestorationfilter_bd_fn!(dav1d_sgr_filter_3x3, _ssse3); -decl_looprestorationfilter_bd_fn!(dav1d_sgr_filter_5x5, _ssse3); -decl_looprestorationfilter_bd_fn!(dav1d_wiener_filter7, _neon); -decl_looprestorationfilter_bd_fn!(dav1d_wiener_filter5, _neon); - -#[cfg(all( - feature = "bitdepth_8", - feature = 
"asm", - any(target_arch = "x86", target_arch = "x86_64"), -))] -extern "C" { - fn dav1d_wiener_filter7_8bpc_sse2( - dst: *mut pixel, - dst_stride: ptrdiff_t, - left: const_left_pixel_row, - lpf: *const pixel, - w: libc::c_int, - h: libc::c_int, - params: *const LooprestorationParams, - edges: LrEdgeFlags, - bitdepth_max: libc::c_int, - ); - fn dav1d_wiener_filter5_8bpc_sse2( - dst: *mut pixel, - dst_stride: ptrdiff_t, - left: const_left_pixel_row, - lpf: *const pixel, - w: libc::c_int, - h: libc::c_int, - params: *const LooprestorationParams, - edges: LrEdgeFlags, - bitdepth_max: libc::c_int, - ); -} - -#[cfg(all( - feature = "bitdepth_16", - feature = "asm", - any(target_arch = "x86", target_arch = "x86_64"), -))] -extern "C" { - fn dav1d_wiener_filter5_16bpc_avx512icl( - dst: *mut pixel, - dst_stride: ptrdiff_t, - left: const_left_pixel_row, - lpf: *const pixel, - w: libc::c_int, - h: libc::c_int, - params: *const LooprestorationParams, - edges: LrEdgeFlags, - bitdepth_max: libc::c_int, - ); + $name + }}; } #[cfg(all(feature = "asm", any(target_arch = "arm", target_arch = "aarch64")))] @@ -200,6 +101,7 @@ extern "C" { strength: libc::c_int, bitdepth_max: libc::c_int, ); + fn dav1d_sgr_box5_v_neon( sumsq: *mut int32_t, sum: *mut int16_t, @@ -207,6 +109,7 @@ extern "C" { h: libc::c_int, edges: LrEdgeFlags, ); + fn dav1d_sgr_calc_ab2_neon( a: *mut int32_t, b: *mut int16_t, @@ -1803,21 +1706,36 @@ fn loop_restoration_dsp_init_x86( #[cfg(feature = "bitdepth_8")] if BD::BPC == BPC::BPC8 { - c.wiener[0] = dav1d_wiener_filter7_8bpc_sse2; - c.wiener[1] = dav1d_wiener_filter5_8bpc_sse2; + c.wiener[0] = decl_looprestorationfilter_fn!(fn dav1d_wiener_filter7_8bpc_sse2); + c.wiener[1] = decl_looprestorationfilter_fn!(fn dav1d_wiener_filter5_8bpc_sse2); } if flags & DAV1D_X86_CPU_FLAG_SSSE3 == 0 { return; } - c.wiener[0] = dav1d_wiener_filter7_ssse3::; - c.wiener[1] = dav1d_wiener_filter5_ssse3::; + c.wiener[0] = match BD::BPC { + BPC::BPC8 => 
decl_looprestorationfilter_fn!(fn dav1d_wiener_filter7_8bpc_ssse3), + BPC::BPC16 => decl_looprestorationfilter_fn!(fn dav1d_wiener_filter7_16bpc_ssse3), + }; + c.wiener[1] = match BD::BPC { + BPC::BPC8 => decl_looprestorationfilter_fn!(fn dav1d_wiener_filter5_8bpc_ssse3), + BPC::BPC16 => decl_looprestorationfilter_fn!(fn dav1d_wiener_filter5_16bpc_ssse3), + }; if BD::BPC == BPC::BPC8 || bpc == 10 { - c.sgr[0] = dav1d_sgr_filter_5x5_ssse3::; - c.sgr[1] = dav1d_sgr_filter_3x3_ssse3::; - c.sgr[2] = dav1d_sgr_filter_mix_ssse3::; + c.sgr[0] = match BD::BPC { + BPC::BPC8 => decl_looprestorationfilter_fn!(fn dav1d_sgr_filter_5x5_8bpc_ssse3), + BPC::BPC16 => decl_looprestorationfilter_fn!(fn dav1d_sgr_filter_5x5_16bpc_ssse3), + }; + c.sgr[1] = match BD::BPC { + BPC::BPC8 => decl_looprestorationfilter_fn!(fn dav1d_sgr_filter_3x3_8bpc_ssse3), + BPC::BPC16 => decl_looprestorationfilter_fn!(fn dav1d_sgr_filter_3x3_16bpc_ssse3), + }; + c.sgr[2] = match BD::BPC { + BPC::BPC8 => decl_looprestorationfilter_fn!(fn dav1d_sgr_filter_mix_8bpc_ssse3), + BPC::BPC16 => decl_looprestorationfilter_fn!(fn dav1d_sgr_filter_mix_16bpc_ssse3), + }; } #[cfg(target_arch = "x86_64")] @@ -1826,35 +1744,63 @@ fn loop_restoration_dsp_init_x86( return; } - c.wiener[0] = dav1d_wiener_filter7_avx2::; - c.wiener[1] = dav1d_wiener_filter5_avx2::; + c.wiener[0] = match BD::BPC { + BPC::BPC8 => decl_looprestorationfilter_fn!(fn dav1d_wiener_filter7_8bpc_avx2), + BPC::BPC16 => decl_looprestorationfilter_fn!(fn dav1d_wiener_filter7_16bpc_avx2), + }; + c.wiener[1] = match BD::BPC { + BPC::BPC8 => decl_looprestorationfilter_fn!(fn dav1d_wiener_filter5_8bpc_avx2), + BPC::BPC16 => decl_looprestorationfilter_fn!(fn dav1d_wiener_filter5_16bpc_avx2), + }; if BD::BPC == BPC::BPC8 || bpc == 10 { - c.sgr[0] = dav1d_sgr_filter_5x5_avx2::; - c.sgr[1] = dav1d_sgr_filter_3x3_avx2::; - c.sgr[2] = dav1d_sgr_filter_mix_avx2::; + c.sgr[0] = match BD::BPC { + BPC::BPC8 => decl_looprestorationfilter_fn!(fn 
dav1d_sgr_filter_5x5_8bpc_avx2), + BPC::BPC16 => decl_looprestorationfilter_fn!(fn dav1d_sgr_filter_5x5_16bpc_avx2), + }; + c.sgr[1] = match BD::BPC { + BPC::BPC8 => decl_looprestorationfilter_fn!(fn dav1d_sgr_filter_3x3_8bpc_avx2), + BPC::BPC16 => decl_looprestorationfilter_fn!(fn dav1d_sgr_filter_3x3_16bpc_avx2), + }; + c.sgr[2] = match BD::BPC { + BPC::BPC8 => decl_looprestorationfilter_fn!(fn dav1d_sgr_filter_mix_8bpc_avx2), + BPC::BPC16 => decl_looprestorationfilter_fn!(fn dav1d_sgr_filter_mix_16bpc_avx2), + }; } if flags & DAV1D_X86_CPU_FLAG_AVX512ICL == 0 { return; } - c.wiener[0] = dav1d_wiener_filter7_avx512icl::; + c.wiener[0] = match BD::BPC { + BPC::BPC8 => decl_looprestorationfilter_fn!(fn dav1d_wiener_filter7_8bpc_avx512icl), + BPC::BPC16 => decl_looprestorationfilter_fn!(fn dav1d_wiener_filter7_16bpc_avx512icl), + }; c.wiener[1] = match BD::BPC { // With VNNI we don't need a 5-tap version. BPC::BPC8 => c.wiener[0], - - #[cfg(feature = "bitdepth_16")] - BPC::BPC16 => dav1d_wiener_filter5_16bpc_avx512icl, - - #[cfg(not(feature = "bitdepth_16"))] - BPC::BPC16 => unreachable!(), + BPC::BPC16 => decl_looprestorationfilter_fn!(fn dav1d_wiener_filter5_16bpc_avx512icl), }; if BD::BPC == BPC::BPC8 || bpc == 10 { - c.sgr[0] = dav1d_sgr_filter_5x5_avx512icl::; - c.sgr[1] = dav1d_sgr_filter_3x3_avx512icl::; - c.sgr[2] = dav1d_sgr_filter_mix_avx512icl::; + c.sgr[0] = match BD::BPC { + BPC::BPC8 => decl_looprestorationfilter_fn!(fn dav1d_sgr_filter_5x5_8bpc_avx512icl), + BPC::BPC16 => { + decl_looprestorationfilter_fn!(fn dav1d_sgr_filter_5x5_16bpc_avx512icl) + } + }; + c.sgr[1] = match BD::BPC { + BPC::BPC8 => decl_looprestorationfilter_fn!(fn dav1d_sgr_filter_3x3_8bpc_avx512icl), + BPC::BPC16 => { + decl_looprestorationfilter_fn!(fn dav1d_sgr_filter_3x3_16bpc_avx512icl) + } + }; + c.sgr[2] = match BD::BPC { + BPC::BPC8 => decl_looprestorationfilter_fn!(fn dav1d_sgr_filter_mix_8bpc_avx512icl), + BPC::BPC16 => { + decl_looprestorationfilter_fn!(fn 
dav1d_sgr_filter_mix_16bpc_avx512icl) + } + }; } } } @@ -1878,8 +1824,14 @@ fn loop_restoration_dsp_init_arm( cfg_if::cfg_if! { if #[cfg(target_arch = "aarch64")] { - c.wiener[0] = dav1d_wiener_filter7_neon::; - c.wiener[1] = dav1d_wiener_filter5_neon::; + c.wiener[0] = match BD::BPC { + BPC::BPC8 => decl_looprestorationfilter_fn!(fn dav1d_wiener_filter7_8bpc_neon), + BPC::BPC16 => decl_looprestorationfilter_fn!(fn dav1d_wiener_filter7_16bpc_neon), + }; + c.wiener[1] = match BD::BPC { + BPC::BPC8 => decl_looprestorationfilter_fn!(fn dav1d_wiener_filter5_8bpc_neon), + BPC::BPC16 => decl_looprestorationfilter_fn!(fn dav1d_wiener_filter5_16bpc_neon), + }; } else { c.wiener[0] = wiener_filter_neon_erased::; c.wiener[1] = wiener_filter_neon_erased::; From eb647328954144d00d6362ed7f5a2c4af6b40dd4 Mon Sep 17 00:00:00 2001 From: Khyber Sen Date: Tue, 22 Aug 2023 17:51:46 -0700 Subject: [PATCH 2/2] `fn loop_restoration_dsp_init_x86`: Remove an unneeded `#[cfg(feature = "bitdepth_8")]`. --- src/looprestoration.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/looprestoration.rs b/src/looprestoration.rs index a6b3cfde9..2ce035d87 100644 --- a/src/looprestoration.rs +++ b/src/looprestoration.rs @@ -1704,7 +1704,6 @@ fn loop_restoration_dsp_init_x86( return; } - #[cfg(feature = "bitdepth_8")] if BD::BPC == BPC::BPC8 { c.wiener[0] = decl_looprestorationfilter_fn!(fn dav1d_wiener_filter7_8bpc_sse2); c.wiener[1] = decl_looprestorationfilter_fn!(fn dav1d_wiener_filter5_8bpc_sse2);