diff --git a/lib.rs b/lib.rs
index f399af9b6..633a41608 100644
--- a/lib.rs
+++ b/lib.rs
@@ -30,6 +30,7 @@ pub mod include {
 } // mod include
 pub mod src {
     pub mod align;
+    mod assume;
     mod cdef;
     #[cfg_attr(not(feature = "bitdepth_16"), allow(dead_code))]
     mod cdef_apply_tmpl_16;
diff --git a/src/assume.rs b/src/assume.rs
new file mode 100644
index 000000000..12447957e
--- /dev/null
+++ b/src/assume.rs
@@ -0,0 +1,9 @@
+use std::hint::unreachable_unchecked;
+
+/// A stable version of [`core::intrinsics::assume`].
+#[inline(always)]
+pub unsafe fn assume(condition: bool) {
+    if !condition {
+        unreachable_unchecked();
+    }
+}
diff --git a/src/filmgrain.rs b/src/filmgrain.rs
index 603e9bbe2..1d3e88bf1 100644
--- a/src/filmgrain.rs
+++ b/src/filmgrain.rs
@@ -10,6 +10,7 @@ use crate::include::common::intops::iclip;
 use crate::include::dav1d::headers::Dav1dFilmGrainData;
 use crate::include::dav1d::headers::Rav1dFilmGrainData;
 use crate::include::dav1d::headers::Rav1dPixelLayoutSubSampled;
+use crate::src::assume::assume;
 use crate::src::enum_map::enum_map;
 use crate::src::enum_map::DefaultValue;
 use crate::src::enum_map::EnumMap;
@@ -367,6 +368,8 @@ unsafe extern "C" fn generate_grain_y_c_erased<BD: BitDepth>(
     generate_grain_y_rust(buf, data, bd)
 }
 
+const AR_PAD: usize = 3;
+
 unsafe fn generate_grain_y_rust<BD: BitDepth>(
     buf: &mut GrainLut<BD::Entry>,
     data: &Rav1dFilmGrainData,
@@ -386,17 +389,16 @@ unsafe fn generate_grain_y_rust<BD: BitDepth>(
         });
     }
 
-    let ar_pad = 3;
     // `ar_lag` is 2 bits; this tells the compiler it definitely is.
     // That also means `ar_lag <= ar_pad`.
     let ar_lag = data.ar_coeff_lag as usize & ((1 << 2) - 1);
 
-    for y in 0..GRAIN_HEIGHT - ar_pad {
-        for x in 0..GRAIN_WIDTH - 2 * ar_pad {
+    for y in 0..GRAIN_HEIGHT - AR_PAD {
+        for x in 0..GRAIN_WIDTH - 2 * AR_PAD {
             let mut coeff = (data.ar_coeffs_y).as_ptr();
             let mut sum = 0;
-            for (dy, buf_row) in buf[y..][ar_pad - ar_lag..=ar_pad].iter().enumerate() {
-                for (dx, &buf_val) in buf_row[x..][ar_pad - ar_lag..=ar_pad + ar_lag]
+            for (dy, buf_row) in buf[y..][AR_PAD - ar_lag..=AR_PAD].iter().enumerate() {
+                for (dx, &buf_val) in buf_row[x..][AR_PAD - ar_lag..=AR_PAD + ar_lag]
                     .iter()
                     .enumerate()
                 {
@@ -408,7 +410,7 @@ unsafe fn generate_grain_y_rust<BD: BitDepth>(
                 }
             }
 
-            let buf_yx = &mut buf[y + ar_pad][x + ar_pad];
+            let buf_yx = &mut buf[y + AR_PAD][x + AR_PAD];
             let grain = (*buf_yx).as_::<c_int>() + round2(sum, data.ar_coeff_shift);
             (*buf_yx) = iclip(grain, grain_min, grain_max).as_::<BD::Entry>();
         }
@@ -425,7 +427,67 @@ unsafe fn generate_grain_uv_rust<BD: BitDepth>(
     bd: BD,
 ) {
     let uv = is_uv as usize;
-    let [subx, suby] = [is_subx, is_suby].map(|it| it as u8);
+
+    struct IsSub {
+        y: bool,
+        x: bool,
+    }
+
+    impl IsSub {
+        const fn chroma(&self) -> (usize, usize) {
+            let h = if self.y {
+                SUB_GRAIN_HEIGHT
+            } else {
+                GRAIN_HEIGHT
+            };
+            let w = if self.x { SUB_GRAIN_WIDTH } else { GRAIN_WIDTH };
+            (h, w)
+        }
+
+        const fn len(&self) -> (usize, usize) {
+            let (h, w) = self.chroma();
+            (h - AR_PAD, w - 2 * AR_PAD)
+        }
+
+        const fn luma(&self, (y, x): (usize, usize)) -> (usize, usize) {
+            (
+                (y << self.y as usize) + AR_PAD,
+                (x << self.x as usize) + AR_PAD,
+            )
+        }
+
+        const fn buf_index(&self, (y, x): (usize, usize)) -> (usize, usize) {
+            let (y, x) = self.luma((y, x));
+            (y + self.y as usize, x + self.x as usize)
+        }
+
+        const fn max_buf_index(&self) -> (usize, usize) {
+            let (y, x) = self.len();
+            self.buf_index((y - 1, x - 1))
+        }
+
+        const fn check_buf_index<T, const Y: usize, const X: usize>(
+            &self,
+            _: &Option<[[T; X]; Y]>,
+        ) {
+            let (y, x) = self.max_buf_index();
+            assert!(y < Y);
+            assert!(x < X);
+        }
+
+        #[allow(dead_code)] // False positive; used in a `const`.
+        const fn check_buf_index_all<T, const Y: usize, const X: usize>(buf: &Option<[[T; X]; Y]>) {
+            Self { y: true, x: true }.check_buf_index(buf);
+            Self { y: true, x: false }.check_buf_index(buf);
+            Self { y: false, x: true }.check_buf_index(buf);
+            Self { y: false, x: false }.check_buf_index(buf);
+        }
+    }
+
+    let is_sub = IsSub {
+        y: is_suby,
+        x: is_subx,
+    };
 
     let bitdepth_min_8 = bd.bitdepth() - 8;
     let mut seed = data.seed ^ if is_uv { 0x49d8 } else { 0xb524 };
@@ -434,49 +496,42 @@ unsafe fn generate_grain_uv_rust<BD: BitDepth>(
     let grain_min = -grain_ctr;
     let grain_max = grain_ctr - 1;
 
-    let chromaW = if is_subx {
-        SUB_GRAIN_WIDTH
-    } else {
-        GRAIN_WIDTH
-    };
-    let chromaH = if is_suby {
-        SUB_GRAIN_HEIGHT
-    } else {
-        GRAIN_HEIGHT
-    };
-
-    for row in &mut buf[..chromaH] {
-        row[..chromaW].fill_with(|| {
+    for row in &mut buf[..is_sub.chroma().0] {
+        row[..is_sub.chroma().1].fill_with(|| {
             let value = get_random_number(11, &mut seed);
             round2(dav1d_gaussian_sequence[value as usize], shift).as_::<BD::Entry>()
         });
     }
 
-    let ar_pad = 3;
     // `ar_lag` is 2 bits; this tells the compiler it definitely is.
     // That also means `ar_lag <= ar_pad`.
     let ar_lag = data.ar_coeff_lag as usize & ((1 << 2) - 1);
 
-    for y in 0..chromaH - ar_pad {
-        for x in 0..chromaW - 2 * ar_pad {
+    for y in 0..is_sub.len().0 {
+        for x in 0..is_sub.len().1 {
             let mut coeff = (data.ar_coeffs_uv[uv]).as_ptr();
             let mut sum = 0;
-            for (dy, buf_row) in buf[y..][ar_pad - ar_lag..=ar_pad].iter().enumerate() {
-                for (dx, &buf_val) in buf_row[x..][ar_pad - ar_lag..=ar_pad + ar_lag]
+            for (dy, buf_row) in buf[y..][AR_PAD - ar_lag..=AR_PAD].iter().enumerate() {
+                for (dx, &buf_val) in buf_row[x..][AR_PAD - ar_lag..=AR_PAD + ar_lag]
                     .iter()
                     .enumerate()
                 {
                     if dx == ar_lag && dy == ar_lag {
                         let mut luma = 0;
-                        let lumaX = (x << subx) + ar_pad;
-                        let lumaY = (y << suby) + ar_pad;
-                        for i in 0..=suby {
-                            for j in 0..=subx {
-                                luma +=
-                                    buf_y[lumaY + i as usize][lumaX + j as usize].as_::<c_int>();
+                        let (luma_y, luma_x) = is_sub.luma((y, x));
+                        const _: () = IsSub::check_buf_index_all(&None::<GrainLut<()>>);
+                        // The optimizer is not smart enough to deduce this on its own.
+                        // Safety: The above static check checks all maximum index possibilities.
+                        unsafe {
+                            assume(luma_y < GRAIN_HEIGHT + 1 - 1);
+                            assume(luma_x < GRAIN_WIDTH - 1);
+                        }
+                        for i in 0..1 + is_sub.y as usize {
+                            for j in 0..1 + is_sub.x as usize {
+                                luma += buf_y[luma_y + i][luma_x + j].as_::<c_int>();
                             }
                         }
-                        luma = round2(luma, subx + suby);
+                        luma = round2(luma, is_sub.y as u8 + is_sub.x as u8);
 
                         sum += luma * *coeff as c_int;
                         break;
@@ -486,7 +541,7 @@ unsafe fn generate_grain_uv_rust<BD: BitDepth>(
                 }
             }
 
-            let buf_yx = &mut buf[y + ar_pad][x + ar_pad];
+            let buf_yx = &mut buf[y + AR_PAD][x + AR_PAD];
             let grain = (*buf_yx).as_::<c_int>() + round2(sum, data.ar_coeff_shift);
             (*buf_yx) = iclip(grain, grain_min, grain_max).as_::<BD::Entry>();
         }