Skip to content

Commit

Permalink
fn decode_b: Performance improvements (#1320)
Browse files Browse the repository at this point in the history
Inlines `backup2x8`, which is also inlined in C; doing so improves
performance slightly.

The `f.a[t.a]` block context reference is constant
throughout `decode_b`, but the function appears to be
too complex for the optimizer to avoid recomputing
this reference. Storing it in a local variable
improves performance measurably (~1% on a Ryzen
7700X for 8-bit Chimera).
  • Loading branch information
rinon authored Jul 16, 2024
2 parents f392366 + f63c2ab commit 31aa266
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 63 deletions.
7 changes: 4 additions & 3 deletions src/cdef_apply.rs
Original file line number Diff line number Diff line change
Expand Up @@ -80,9 +80,10 @@ fn backup2lines<BD: BitDepth>(
}
}

#[inline(always)]
fn backup2x8<BD: BitDepth>(
dst: &mut [[[BD::Pixel; 2]; 8]; 3],
src: [Rav1dPictureDataComponentOffset; 3],
src: &[Rav1dPictureDataComponentOffset; 3],
x_off: c_int,
layout: Rav1dPixelLayout,
flag: Backup2x8Flags,
Expand Down Expand Up @@ -252,11 +253,11 @@ pub(crate) fn rav1d_cdef_brow<BD: BitDepth>(
if !do_left.is_empty() && edges.contains(CdefEdgeFlags::HAVE_LEFT) {
// we didn't backup the prefilter data because it wasn't
// there, so do it here instead
backup2x8::<BD>(&mut lr_bak[bit as usize], bptrs, 0, layout, do_left);
backup2x8::<BD>(&mut lr_bak[bit as usize], &bptrs, 0, layout, do_left);
}
if edges.contains(CdefEdgeFlags::HAVE_RIGHT) {
// backup pre-filter data for next iteration
backup2x8::<BD>(&mut lr_bak[!bit as usize], bptrs, 8, layout, flag);
backup2x8::<BD>(&mut lr_bak[!bit as usize], &bptrs, 8, layout, flag);
}

let mut variance = 0;
Expand Down
Loading

0 comments on commit 31aa266

Please sign in to comment.