From bb4d595854c37e1f059912338a1dd58c60a7e17b Mon Sep 17 00:00:00 2001 From: Khyber Sen Date: Thu, 19 Oct 2023 18:28:55 -0700 Subject: [PATCH 1/2] `fn {generate_grain_uv,fguv_32x32xn}_c_erased`: Use const generics to replace the `4{20,22,44}` variants. --- src/filmgrain.rs | 142 ++++++++--------------------------------------- 1 file changed, 22 insertions(+), 120 deletions(-) diff --git a/src/filmgrain.rs b/src/filmgrain.rs index d6b8ffda6..a1bee386b 100644 --- a/src/filmgrain.rs +++ b/src/filmgrain.rs @@ -376,7 +376,12 @@ unsafe fn generate_grain_uv_c( } } -unsafe extern "C" fn generate_grain_uv_420_c_erased( +unsafe extern "C" fn generate_grain_uv_c_erased< + BD: BitDepth, + const NM: usize, + const IS_SUBX: bool, + const IS_SUBY: bool, +>( buf: *mut [DynEntry; GRAIN_WIDTH], buf_y: *const [DynEntry; GRAIN_WIDTH], data: *const Rav1dFilmGrainData, @@ -388,44 +393,8 @@ unsafe extern "C" fn generate_grain_uv_420_c_erased( buf_y.cast(), data, uv, - true, - true, - BD::from_c(bitdepth_max), - ); -} - -unsafe extern "C" fn generate_grain_uv_422_c_erased( - buf: *mut [DynEntry; GRAIN_WIDTH], - buf_y: *const [DynEntry; GRAIN_WIDTH], - data: *const Rav1dFilmGrainData, - uv: intptr_t, - bitdepth_max: c_int, -) { - generate_grain_uv_c::( - buf.cast(), - buf_y.cast(), - data, - uv, - true, - false, - BD::from_c(bitdepth_max), - ); -} - -unsafe extern "C" fn generate_grain_uv_444_c_erased( - buf: *mut [DynEntry; GRAIN_WIDTH], - buf_y: *const [DynEntry; GRAIN_WIDTH], - data: *const Rav1dFilmGrainData, - uv: intptr_t, - bitdepth_max: c_int, -) { - generate_grain_uv_c::( - buf.cast(), - buf_y.cast(), - data, - uv, - false, - false, + IS_SUBX, + IS_SUBY, BD::from_c(bitdepth_max), ); } @@ -1120,79 +1089,12 @@ unsafe fn fguv_32x32xn_c( } } -unsafe extern "C" fn fguv_32x32xn_420_c_erased( - dst_row: *mut DynPixel, - src_row: *const DynPixel, - stride: ptrdiff_t, - data: *const Rav1dFilmGrainData, - pw: usize, - scaling: *const u8, - grain_lut: *const [DynEntry; GRAIN_WIDTH], - bh: c_int, - row_num: c_int, - luma_row: *const DynPixel, - luma_stride: ptrdiff_t, - uv_pl: c_int, - is_id: c_int, - bitdepth_max: c_int, -) { - fguv_32x32xn_c::( - dst_row.cast(), - src_row.cast(), - stride, - data, - pw, - scaling, - grain_lut.cast(), - bh, - row_num, - luma_row.cast(), - luma_stride, - uv_pl, - is_id, - true, - true, - BD::from_c(bitdepth_max), - ); -} - -unsafe extern "C" fn fguv_32x32xn_422_c_erased( - dst_row: *mut DynPixel, - src_row: *const DynPixel, - stride: ptrdiff_t, - data: *const Rav1dFilmGrainData, - pw: usize, - scaling: *const u8, - grain_lut: *const [DynEntry; GRAIN_WIDTH], - bh: c_int, - row_num: c_int, - luma_row: *const DynPixel, - luma_stride: ptrdiff_t, - uv_pl: c_int, - is_id: c_int, - bitdepth_max: c_int, -) { - fguv_32x32xn_c::( - dst_row.cast(), - src_row.cast(), - stride, - data, - pw, - scaling, - grain_lut.cast(), - bh, - row_num, - luma_row.cast(), - luma_stride, - uv_pl, - is_id, - true, - false, - BD::from_c(bitdepth_max), - ); -} - -unsafe extern "C" fn fguv_32x32xn_444_c_erased( +unsafe extern "C" fn fguv_32x32xn_c_erased< + BD: BitDepth, + const NM: usize, + const IS_SX: bool, + const IS_SY: bool, +>( dst_row: *mut DynPixel, src_row: *const DynPixel, stride: ptrdiff_t, @@ -1222,8 +1124,8 @@ unsafe extern "C" fn fguv_32x32xn_444_c_erased( luma_stride, uv_pl, is_id, - false, - false, + IS_SX, + IS_SY, BD::from_c(bitdepth_max), ); } @@ -1784,19 +1686,19 @@ unsafe fn film_grain_dsp_init_arm(c: *mut Rav1dFilmGrainDSPContext pub unsafe fn rav1d_film_grain_dsp_init(c: *mut Rav1dFilmGrainDSPContext) { (*c).generate_grain_y = Some(generate_grain_y_c_erased::); (*c).generate_grain_uv[(RAV1D_PIXEL_LAYOUT_I420 - 1) as usize] = - Some(generate_grain_uv_420_c_erased::); + Some(generate_grain_uv_c_erased::); (*c).generate_grain_uv[(RAV1D_PIXEL_LAYOUT_I422 - 1) as usize] = - Some(generate_grain_uv_422_c_erased::); + Some(generate_grain_uv_c_erased::); (*c).generate_grain_uv[(RAV1D_PIXEL_LAYOUT_I444 - 1) as usize] = - Some(generate_grain_uv_444_c_erased::); + Some(generate_grain_uv_c_erased::); (*c).fgy_32x32xn = Some(fgy_32x32xn_c_erased::); (*c).fguv_32x32xn[(RAV1D_PIXEL_LAYOUT_I420 - 1) as usize] = - Some(fguv_32x32xn_420_c_erased::); + Some(fguv_32x32xn_c_erased::); (*c).fguv_32x32xn[(RAV1D_PIXEL_LAYOUT_I422 - 1) as usize] = - Some(fguv_32x32xn_422_c_erased::); + Some(fguv_32x32xn_c_erased::); (*c).fguv_32x32xn[(RAV1D_PIXEL_LAYOUT_I444 - 1) as usize] = - Some(fguv_32x32xn_444_c_erased::); + Some(fguv_32x32xn_c_erased::); #[cfg(feature = "asm")] cfg_if! { From 8b668e2c81b11868e1ef17b6cc1ad92d403b5c96 Mon Sep 17 00:00:00 2001 From: Khyber Sen Date: Thu, 19 Oct 2023 19:07:21 -0700 Subject: [PATCH 2/2] `fn fguv_32x32xn_neon{,_erased}`: Use const generics to replace the `4{20,22,44}` variants. --- src/filmgrain.rs | 242 ++++------------------------------------------- 1 file changed, 20 insertions(+), 222 deletions(-) diff --git a/src/filmgrain.rs b/src/filmgrain.rs index a1bee386b..612b507c9 100644 --- a/src/filmgrain.rs +++ b/src/filmgrain.rs @@ -1324,114 +1324,12 @@ unsafe fn fgy_32x32xn_neon( } #[cfg(all(feature = "asm", any(target_arch = "arm", target_arch = "aarch64")))] -unsafe extern "C" fn fguv_32x32xn_420_neon_erased( - dst_row: *mut DynPixel, - src_row: *const DynPixel, - stride: ptrdiff_t, - data: *const Rav1dFilmGrainData, - pw: usize, - scaling: *const u8, - grain_lut: *const [DynEntry; GRAIN_WIDTH], - bh: c_int, - row_num: c_int, - luma_row: *const DynPixel, - luma_stride: ptrdiff_t, - uv: c_int, - is_id: c_int, - bitdepth_max: c_int, -) { - fguv_32x32xn_420_neon::( - dst_row.cast(), - src_row.cast(), - stride, - data, - pw, - scaling, - grain_lut.cast(), - bh, - row_num, - luma_row.cast(), - luma_stride, - uv, - is_id, - BD::from_c(bitdepth_max), - ); -} - -#[cfg(all(feature = "asm", any(target_arch = "arm", target_arch = "aarch64")))] -unsafe fn fguv_32x32xn_420_neon( - dst_row: *mut BD::Pixel, - src_row: *const BD::Pixel, - stride: ptrdiff_t, - data: *const Rav1dFilmGrainData, - pw: usize, - scaling: *const u8, - grain_lut: *const [BD::Entry; GRAIN_WIDTH], - bh: c_int, - row_num: c_int, - luma_row: *const BD::Pixel, - luma_stride: ptrdiff_t, - uv: c_int, - is_id: c_int, - bd: BD, -) { - let rows = 1 + ((*data).overlap_flag && row_num > 0) as c_int; - let mut seed: [c_uint; 2] = [0; 2]; - let mut i = 0; - while i < rows { - seed[i as usize] = (*data).seed; - seed[i as usize] ^= (((row_num - i) * 37 + 178 & 0xff as c_int) << 8) as c_uint; - seed[i as usize] ^= ((row_num - i) * 173 + 105 & 0xff as c_int) as c_uint; - i += 1; - } - let mut offsets: [[c_int; 2]; 2] = [[0; 2]; 2]; - let mut bx: c_uint = 0 as c_int as c_uint; - while (bx as usize) < pw { - if (*data).overlap_flag && bx != 0 { - let mut i = 0; - while i < rows { - offsets[1][i as usize] = offsets[0][i as usize]; - i += 1; - } - } - let mut i = 0; - while i < rows { - offsets[0][i as usize] = - get_random_number(8 as c_int, &mut *seed.as_mut_ptr().offset(i as isize)); - i += 1; - } - let mut r#type = 0; - if (*data).overlap_flag && row_num != 0 { - r#type |= 1 as c_int; - } - if (*data).overlap_flag && bx != 0 { - r#type |= 2 as c_int; - } - if (*data).chroma_scaling_from_luma { - r#type |= 4 as c_int; - } - bd_fn!(decl_fguv_32x32xn_fn, BD, fguv_32x32_420, neon)( - dst_row.offset(bx as isize).cast(), - src_row.offset(bx as isize).cast(), - stride, - scaling, - data, - grain_lut.cast(), - luma_row.offset((bx << 1) as isize).cast(), - luma_stride, - offsets.as_mut_ptr() as *const [c_int; 2], - bh as ptrdiff_t, - uv as ptrdiff_t, - is_id as ptrdiff_t, - r#type as ptrdiff_t, - bd.into_c(), - ); - bx = bx.wrapping_add((32 >> 1) as c_uint); - } -} - -#[cfg(all(feature = "asm", any(target_arch = "arm", target_arch = "aarch64")))] -unsafe extern "C" fn fguv_32x32xn_422_neon_erased( +unsafe extern "C" fn fguv_32x32xn_neon_erased< + BD: BitDepth, + const NM: usize, + const IS_SX: bool, + const IS_SY: bool, +>( dst_row: *mut DynPixel, src_row: *const DynPixel, stride: ptrdiff_t, @@ -1447,7 +1345,7 @@ unsafe extern "C" fn fguv_32x32xn_422_neon_erased( is_id: c_int, bitdepth_max: c_int, ) { - fguv_32x32xn_422_neon::( + fguv_32x32xn_neon::( dst_row.cast(), src_row.cast(), stride, @@ -1466,7 +1364,7 @@ unsafe extern "C" fn fguv_32x32xn_422_neon_erased( } #[cfg(all(feature = "asm", any(target_arch = "arm", target_arch = "aarch64")))] -unsafe fn fguv_32x32xn_422_neon( +unsafe fn fguv_32x32xn_neon( dst_row: *mut BD::Pixel, src_row: *const BD::Pixel, stride: ptrdiff_t, @@ -1482,113 +1380,8 @@ unsafe fn fguv_32x32xn_422_neon( is_id: c_int, bd: BD, ) { - let rows = 1 + ((*data).overlap_flag && row_num > 0) as c_int; - let mut seed: [c_uint; 2] = [0; 2]; - let mut i = 0; - while i < rows { - seed[i as usize] = (*data).seed; - seed[i as usize] ^= (((row_num - i) * 37 + 178 & 0xff as c_int) << 8) as c_uint; - seed[i as usize] ^= ((row_num - i) * 173 + 105 & 0xff as c_int) as c_uint; - i += 1; - } - let mut offsets: [[c_int; 2]; 2] = [[0; 2]; 2]; - let mut bx: c_uint = 0 as c_int as c_uint; - while (bx as usize) < pw { - if (*data).overlap_flag && bx != 0 { - let mut i = 0; - while i < rows { - offsets[1][i as usize] = offsets[0][i as usize]; - i += 1; - } - } - let mut i = 0; - while i < rows { - offsets[0][i as usize] = - get_random_number(8 as c_int, &mut *seed.as_mut_ptr().offset(i as isize)); - i += 1; - } - let mut r#type = 0; - if (*data).overlap_flag && row_num != 0 { - r#type |= 1 as c_int; - } - if (*data).overlap_flag && bx != 0 { - r#type |= 2 as c_int; - } - if (*data).chroma_scaling_from_luma { - r#type |= 4 as c_int; - } - bd_fn!(decl_fguv_32x32xn_fn, BD, fguv_32x32_422, neon)( - dst_row.offset(bx as isize).cast(), - src_row.offset(bx as isize).cast(), - stride, - scaling, - data, - grain_lut.cast(), - luma_row.offset((bx << 1) as isize).cast(), - luma_stride, - offsets.as_mut_ptr() as *const [c_int; 2], - bh as ptrdiff_t, - uv as ptrdiff_t, - is_id as ptrdiff_t, - r#type as ptrdiff_t, - bd.into_c(), - ); - bx = bx.wrapping_add((32 >> 1) as c_uint); - } -} + let [sx, _sy] = [IS_SX, IS_SY].map(|it| it as c_int); -#[cfg(all(feature = "asm", any(target_arch = "arm", target_arch = "aarch64")))] -unsafe extern "C" fn fguv_32x32xn_444_neon_erased( - dst_row: *mut DynPixel, - src_row: *const DynPixel, - stride: ptrdiff_t, - data: *const Rav1dFilmGrainData, - pw: usize, - scaling: *const u8, - grain_lut: *const [DynEntry; GRAIN_WIDTH], - bh: c_int, - row_num: c_int, - luma_row: *const DynPixel, - luma_stride: ptrdiff_t, - uv: c_int, - is_id: c_int, - bitdepth_max: c_int, -) { - fguv_32x32xn_444_neon::( - dst_row.cast(), - src_row.cast(), - stride, - data, - pw, - scaling, - grain_lut.cast(), - bh, - row_num, - luma_row.cast(), - luma_stride, - uv, - is_id, - BD::from_c(bitdepth_max), - ); -} - -#[cfg(all(feature = "asm", any(target_arch = "arm", target_arch = "aarch64")))] -unsafe fn fguv_32x32xn_444_neon( - dst_row: *mut BD::Pixel, - src_row: *const BD::Pixel, - stride: ptrdiff_t, - data: *const Rav1dFilmGrainData, - pw: usize, - scaling: *const u8, - grain_lut: *const [BD::Entry; GRAIN_WIDTH], - bh: c_int, - row_num: c_int, - luma_row: *const BD::Pixel, - luma_stride: ptrdiff_t, - uv: c_int, - is_id: c_int, - bd: BD, -) { let rows = 1 + ((*data).overlap_flag && row_num > 0) as c_int; let mut seed: [c_uint; 2] = [0; 2]; let mut i = 0; @@ -1624,14 +1417,19 @@ unsafe fn fguv_32x32xn_444_neon( if (*data).chroma_scaling_from_luma { r#type |= 4 as c_int; } - bd_fn!(decl_fguv_32x32xn_fn, BD, fguv_32x32_444, neon)( + (match NM { + 420 => bd_fn!(decl_fguv_32x32xn_fn, BD, fguv_32x32_420, neon), + 422 => bd_fn!(decl_fguv_32x32xn_fn, BD, fguv_32x32_422, neon), + 444 => bd_fn!(decl_fguv_32x32xn_fn, BD, fguv_32x32_444, neon), + _ => unreachable!(), + })( dst_row.offset(bx as isize).cast(), src_row.offset(bx as isize).cast(), stride, scaling, data, grain_lut.cast(), - luma_row.offset((bx << 0) as isize).cast(), + luma_row.offset((bx << sx) as isize).cast(), luma_stride, offsets.as_mut_ptr() as *const [c_int; 2], bh as ptrdiff_t, @@ -1640,7 +1438,7 @@ unsafe fn fguv_32x32xn_444_neon( r#type as ptrdiff_t, bd.into_c(), ); - bx = bx.wrapping_add((32 >> 0) as c_uint); + bx = bx.wrapping_add((32 >> sx) as c_uint); } } @@ -1675,11 +1473,11 @@ unsafe fn film_grain_dsp_init_arm(c: *mut Rav1dFilmGrainDSPContext (*c).fgy_32x32xn = Some(fgy_32x32xn_neon_erased::); (*c).fguv_32x32xn[(RAV1D_PIXEL_LAYOUT_I420 - 1) as usize] = - Some(fguv_32x32xn_420_neon_erased::); + Some(fguv_32x32xn_neon_erased::); (*c).fguv_32x32xn[(RAV1D_PIXEL_LAYOUT_I422 - 1) as usize] = - Some(fguv_32x32xn_422_neon_erased::); + Some(fguv_32x32xn_neon_erased::); (*c).fguv_32x32xn[(RAV1D_PIXEL_LAYOUT_I444 - 1) as usize] = - Some(fguv_32x32xn_444_neon_erased::); + Some(fguv_32x32xn_neon_erased::); } #[cold]