From bbf7d6c8126ad56bbcf05af2ed73f82804fc6069 Mon Sep 17 00:00:00 2001
From: Khyber Sen <kkysen@gmail.com>
Date: Fri, 10 Nov 2023 19:56:54 -0800
Subject: [PATCH] `fn generate_grain_{y,uv}_rust`: Replace indexing `ar_lag`
 loops with iterators to elide bounds checks.

This also explicitly masks `ar_lag` to 2 bits (so `ar_lag <= ar_pad`).
It is already limited to 2 bits where it is set,
but the compiler can't see that (a `u2` type would be appropriate).
`ar_lag <= ar_pad` proves that all of the accesses are in-bounds,
but the compiler only sees this when slicing and iterators are used.

There are still some bounds checks in the luma code,
but they only happen once per inner loop, so they are less costly.

This increases perf by ~4-5% on `donna`
and removes 4 bounds checks per `fn`.
---
 src/filmgrain.rs | 61 +++++++++++++++++++++++++-----------------------
 1 file changed, 32 insertions(+), 29 deletions(-)

diff --git a/src/filmgrain.rs b/src/filmgrain.rs
index 622f59cf1..603e9bbe2 100644
--- a/src/filmgrain.rs
+++ b/src/filmgrain.rs
@@ -387,26 +387,30 @@ unsafe fn generate_grain_y_rust<BD: BitDepth>(
     }
 
     let ar_pad = 3;
-    let ar_lag = data.ar_coeff_lag as isize;
+    // `ar_lag` is 2 bits; this tells the compiler it definitely is.
+    // That also means `ar_lag <= ar_pad`.
+    let ar_lag = data.ar_coeff_lag as usize & ((1 << 2) - 1);
 
-    for y in ar_pad..GRAIN_HEIGHT {
-        for x in ar_pad..GRAIN_WIDTH - ar_pad {
+    for y in 0..GRAIN_HEIGHT - ar_pad {
+        for x in 0..GRAIN_WIDTH - 2 * ar_pad {
             let mut coeff = (data.ar_coeffs_y).as_ptr();
             let mut sum = 0;
-            for dy in -ar_lag..=0 {
-                for dx in -ar_lag..=ar_lag {
-                    if dx == 0 && dy == 0 {
+            for (dy, buf_row) in buf[y..][ar_pad - ar_lag..=ar_pad].iter().enumerate() {
+                for (dx, &buf_val) in buf_row[x..][ar_pad - ar_lag..=ar_pad + ar_lag]
+                    .iter()
+                    .enumerate()
+                {
+                    if dx == ar_lag && dy == ar_lag {
                         break;
                     }
-                    sum += *coeff as c_int
-                        * buf[(y as isize + dy) as usize][(x as isize + dx) as usize]
-                            .as_::<c_int>();
+                    sum += *coeff as c_int * buf_val.as_::<c_int>();
                     coeff = coeff.offset(1);
                 }
             }
 
-            let grain = buf[y][x].as_::<c_int>() + round2(sum, data.ar_coeff_shift);
-            buf[y][x] = iclip(grain, grain_min, grain_max).as_::<BD::Entry>();
+            let buf_yx = &mut buf[y + ar_pad][x + ar_pad];
+            let grain = (*buf_yx).as_::<c_int>() + round2(sum, data.ar_coeff_shift);
+            (*buf_yx) = iclip(grain, grain_min, grain_max).as_::<BD::Entry>();
         }
     }
 }
@@ -449,23 +453,23 @@ unsafe fn generate_grain_uv_rust<BD: BitDepth>(
     }
 
     let ar_pad = 3;
-    let ar_lag = data.ar_coeff_lag as isize;
+    // `ar_lag` is 2 bits; this tells the compiler it definitely is.
+    // That also means `ar_lag <= ar_pad`.
+    let ar_lag = data.ar_coeff_lag as usize & ((1 << 2) - 1);
 
-    for y in ar_pad..chromaH {
-        for x in ar_pad..chromaW - ar_pad {
+    for y in 0..chromaH - ar_pad {
+        for x in 0..chromaW - 2 * ar_pad {
             let mut coeff = (data.ar_coeffs_uv[uv]).as_ptr();
             let mut sum = 0;
-            for dy in -ar_lag..=0 {
-                for dx in -ar_lag..=ar_lag {
-                    // For the final (current) pixel, we need to add in the
-                    // contribution from the luma grain texture.
-                    if dx == 0 && dy == 0 {
-                        if data.num_y_points == 0 {
-                            break;
-                        }
+            for (dy, buf_row) in buf[y..][ar_pad - ar_lag..=ar_pad].iter().enumerate() {
+                for (dx, &buf_val) in buf_row[x..][ar_pad - ar_lag..=ar_pad + ar_lag]
+                    .iter()
+                    .enumerate()
+                {
+                    if dx == ar_lag && dy == ar_lag {
                         let mut luma = 0;
-                        let lumaX = ((x - ar_pad) << subx) + ar_pad;
-                        let lumaY = ((y - ar_pad) << suby) + ar_pad;
+                        let lumaX = (x << subx) + ar_pad;
+                        let lumaY = (y << suby) + ar_pad;
                         for i in 0..=suby {
                             for j in 0..=subx {
                                 luma +=
@@ -477,15 +481,14 @@ unsafe fn generate_grain_uv_rust<BD: BitDepth>(
                         sum += luma * *coeff as c_int;
                         break;
                     }
-                    sum += *coeff as c_int
-                        * buf[(y as isize + dy) as usize][(x as isize + dx) as usize]
-                            .as_::<c_int>();
+                    sum += *coeff as c_int * buf_val.as_::<c_int>();
                     coeff = coeff.offset(1);
                 }
             }
 
-            let grain = buf[y][x].as_::<c_int>() + round2(sum, data.ar_coeff_shift);
-            buf[y][x] = iclip(grain, grain_min, grain_max).as_::<BD::Entry>();
+            let buf_yx = &mut buf[y + ar_pad][x + ar_pad];
+            let grain = (*buf_yx).as_::<c_int>() + round2(sum, data.ar_coeff_shift);
+            (*buf_yx) = iclip(grain, grain_min, grain_max).as_::<BD::Entry>();
         }
     }
 }