From 2f36411d177dfba741e37db6d03bea6b59889192 Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Mon, 9 Oct 2023 10:11:20 -0700 Subject: [PATCH 1/2] Add multisampled antialiasing This is ported from the multi branch. Configuration of antialiasing mode is currently set statically, but could become more dynamic. In addition, the mask LUT is computed and uploaded every frame, rather than being persistent. --- .vscode/settings.json | 2 +- shader/fine.wgsl | 324 ++++++++++++++++++++++++++++++++++++++++-- src/lib.rs | 14 ++ src/mask.rs | 98 +++++++++++++ src/render.rs | 63 ++++++-- src/shaders.rs | 60 ++++++-- 6 files changed, 522 insertions(+), 39 deletions(-) create mode 100644 src/mask.rs diff --git a/.vscode/settings.json b/.vscode/settings.json index a75948f72..57a124c70 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -16,6 +16,6 @@ }, "wgsl-analyzer.diagnostics.nagaVersion": "main", "wgsl-analyzer.preprocessor.shaderDefs": [ - "full" + "full", "msaa16", "msaa" ] } diff --git a/shader/fine.wgsl b/shader/fine.wgsl index 108c88cba..9bd7da3a3 100644 --- a/shader/fine.wgsl +++ b/shader/fine.wgsl @@ -2,8 +2,10 @@ // Fine rasterizer. This can run in simple (just path rendering) and full // modes, controllable by #define. +// +// To enable multisampled rendering, turn on both the msaa ifdef and one of msaa8 +// or msaa16. -// This is a cut'n'paste w/ backdrop. struct Tile { backdrop: i32, segments: u32, @@ -18,8 +20,6 @@ var config: Config; @group(0) @binding(1) var segments: array; -#ifdef full - #import blend #import ptcl @@ -40,6 +40,309 @@ var gradients: texture_2d; @group(0) @binding(6) var image_atlas: texture_2d; +#ifdef msaa8 +let MASK_WIDTH = 32u; +let MASK_HEIGHT = 32u; +let SH_SAMPLES_SIZE = 256u; +let SAMPLE_WORDS_PER_PIXEL = 1u; +// This might be better in uniform, but that has 16 byte alignment +@group(0) @binding(7) +var mask_lut: array; +#endif + +#ifdef msaa16 +let MASK_WIDTH = 64u; +let MASK_HEIGHT = 64u; +let SH_SAMPLES_SIZE = 512u; +let SAMPLE_WORDS_PER_PIXEL = 2u; +@group(0) @binding(7) +var mask_lut: array; +#endif + +#ifdef msaa +let WG_SIZE = 64u; +var sh_count: array; + +// This is 8 winding numbers packed to a u32, 4 bits per sample +var sh_winding: array, 32u>; +// Same packing, one group of 8 per pixel +var sh_samples: array, SH_SAMPLES_SIZE>; +// Same packing, accumulating winding numbers for vertical edge crossings +var sh_winding_y: array, 2u>; + +// number of integer cells spanned by interval defined by a, b +fn span(a: f32, b: f32) -> u32 { + return u32(max(ceil(max(a, b)) - floor(min(a, b)), 1.0)); +} + +let SEG_SIZE = 5u; + +// New multisampled algorithm. 
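+//
+// Overview: segments are processed in workgroup-sized batches. Each thread
+// first counts how many pixels its segment touches; a workgroup prefix sum
+// over those counts then lets threads claim (segment, pixel) work items by
+// binary search, so work stays balanced regardless of segment length.
+// Winding numbers accumulate in shared memory as packed 4-bit lanes biased
+// by 8 (hence the 0x88888888u initializations), and per-sample coverage
+// comes from the half-plane mask LUT declared above.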
+fn fill_path_ms(fill: CmdFill, wg_id: vec2<u32>, local_id: vec2<u32>) -> array<f32, PIXELS_PER_THREAD> {
+    let n_segs = fill.size_and_rule >> 1u;
+    let even_odd = (fill.size_and_rule & 1u) != 0u;
+    let tile_origin = vec2(f32(wg_id.x) * f32(TILE_HEIGHT), f32(wg_id.y) * f32(TILE_WIDTH));
+    let th_ix = local_id.y * (TILE_WIDTH / PIXELS_PER_THREAD) + local_id.x;
+    if th_ix < 32u {
+        if th_ix < 2u {
+            atomicStore(&sh_winding_y[th_ix], 0x88888888u);
+        }
+        atomicStore(&sh_winding[th_ix], 0x88888888u);
+    }
+    let sample_count = PIXELS_PER_THREAD * SAMPLE_WORDS_PER_PIXEL;
+    for (var i = 0u; i < sample_count; i++) {
+        atomicStore(&sh_samples[th_ix * sample_count + i], 0x88888888u);
+    }
+    workgroupBarrier();
+    let n_batch = (n_segs + (WG_SIZE - 1u)) / WG_SIZE;
+    for (var batch = 0u; batch < n_batch; batch++) {
+        let seg_ix = batch * WG_SIZE + th_ix;
+        let seg_off = fill.seg_data + seg_ix;
+        var count = 0u;
+        let slice_size = min(n_segs - batch * WG_SIZE, WG_SIZE);
+        // TODO: might save a register rewriting this in terms of limit
+        if th_ix < slice_size {
+            let segment = segments[seg_off];
+            // Note: coords relative to tile origin probably a good idea in coarse path,
+            // especially as f16 would work. But keeping existing scheme for compatibility.
+            let xy0 = segment.origin - tile_origin;
+            let xy1 = xy0 + segment.delta;
+            var y_edge_f = f32(TILE_HEIGHT);
+            var delta = select(-1, 1, xy1.x <= xy0.x);
+            if xy0.x == 0.0 && xy1.x == 0.0 {
+                if xy0.y == 0.0 {
+                    y_edge_f = 0.0;
+                } else if xy1.y == 0.0 {
+                    y_edge_f = 0.0;
+                    delta = -delta;
+                }
+            } else {
+                if xy0.x == 0.0 {
+                    if xy0.y != 0.0 {
+                        y_edge_f = xy0.y;
+                    }
+                } else if xy1.x == 0.0 && xy1.y != 0.0 {
+                    y_edge_f = xy1.y;
+                }
+                // discard horizontal lines aligned to pixel grid
+                if !(xy0.y == xy1.y && xy0.y == floor(xy0.y)) {
+                    count = span(xy0.x, xy1.x) + span(xy0.y, xy1.y) - 1u;
+                }
+            }
+            let y_edge = u32(ceil(y_edge_f));
+            if y_edge < TILE_HEIGHT {
+                atomicAdd(&sh_winding_y[y_edge >> 3u], u32(delta) << ((y_edge & 7u) << 2u));
+            }
+        }
+        // workgroup prefix sum of counts
+        sh_count[th_ix] = count;
+        let lg_n = firstLeadingBit(slice_size * 2u - 1u);
+        for (var i = 0u; i < lg_n; i++) {
+            workgroupBarrier();
+            if th_ix >= 1u << i {
+                count += sh_count[th_ix - (1u << i)];
+            }
+            workgroupBarrier();
+            sh_count[th_ix] = count;
+        }
+#ifdef have_uniform
+        let total = workgroupUniformLoad(&sh_count[slice_size - 1u]);
+#else
+        workgroupBarrier();
+        let total = sh_count[slice_size - 1u];
+#endif
+        for (var i = th_ix; i < total; i += WG_SIZE) {
+            // binary search to find pixel
+            var lo = 0u;
+            var hi = slice_size;
+            let goal = i;
+            while hi > lo + 1u {
+                let mid = (lo + hi) >> 1u;
+                if goal >= sh_count[mid - 1u] {
+                    lo = mid;
+                } else {
+                    hi = mid;
+                }
+            }
+            let el_ix = lo;
+            let last_pixel = i + 1u == sh_count[el_ix];
+            let sub_ix = i - select(0u, sh_count[el_ix - 1u], el_ix > 0u);
+            let seg_off = fill.seg_data + batch * WG_SIZE + el_ix;
+            let segment = segments[seg_off];
+            let xy0_in = segment.origin - tile_origin;
+            let xy1_in = xy0_in + segment.delta;
+            let is_down = xy1_in.y >= xy0_in.y;
+            let xy0 = select(xy1_in, xy0_in, is_down);
+            let xy1 = select(xy0_in, xy1_in, is_down);
+
+            // Set up data for line rasterization
+            // Note: this is duplicated work if total count exceeds a workgroup.
+            // One alternative is to compute it in a separate dispatch.
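+            // The code below parameterizes the segment by its grid-line
+            // crossings: a = dx / (dx + dy) is the fraction of crossings
+            // that hit vertical edges, so z = floor(a * sub_ix + b) gives
+            // the number of x steps taken after sub_ix crossings, and
+            // sub_ix - z the number of y steps.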
+            let dx = abs(xy1.x - xy0.x);
+            let dy = xy1.y - xy0.y;
+            let dy_dxdy = dy / (dx + dy);
+            let a = dx / (dx + dy);
+            let is_positive_slope = xy1.x >= xy0.x;
+            let sign = select(-1.0, 1.0, is_positive_slope);
+            let xt0 = floor(xy0.x * sign);
+            let c = xy0.x * sign - xt0;
+            // This has a special case in the JS code, but we should just not render
+            let y0i = floor(xy0.y);
+            let ytop = select(y0i + 1.0, ceil(xy0.y), xy0.y == xy1.y);
+            let b = dy_dxdy * c + a * (ytop - xy0.y);
+            let x0i = i32(xt0 * sign + 0.5 * (sign - 1.0));
+            // Use line equation to plot pixel coordinates
+            let zf = a * f32(sub_ix) + b;
+            let z = floor(zf);
+            let x = x0i + i32(sign * z);
+            let y = i32(y0i) + i32(sub_ix) - i32(z);
+            var is_delta: bool;
+            // We need to adjust winding number if slope is positive and there
+            // is a crossing at the left edge of the pixel.
+            var is_bump = false;
+            let zp = floor(a * f32(sub_ix - 1u) + b);
+            if sub_ix == 0u {
+                is_delta = y0i == xy0.y && y0i != xy1.y;
+                is_bump = xy0.x == 0.0;
+            } else {
+                is_delta = z == zp;
+                is_bump = is_positive_slope && !is_delta;
+            }
+            let pix_ix = u32(y) * TILE_WIDTH + u32(x);
+            if u32(x) < TILE_WIDTH - 1u && u32(y) < TILE_HEIGHT {
+                let delta_pix = pix_ix + 1u;
+                if is_delta {
+                    let delta = select(u32(-1), 1u, is_down) << ((delta_pix & 7u) << 2u);
+                    atomicAdd(&sh_winding[delta_pix >> 3u], delta);
+                }
+            }
+            // Apply sample mask
+            let mask_block = u32(is_positive_slope) * (MASK_WIDTH * MASK_HEIGHT / 2u);
+            let half_height = f32(MASK_HEIGHT / 2u);
+            let mask_row = floor(min(a * half_height, half_height - 1.0)) * f32(MASK_WIDTH);
+            let mask_col = floor((zf - z) * f32(MASK_WIDTH));
+            let mask_ix = mask_block + u32(mask_row + mask_col);
+#ifdef msaa8
+            var mask = mask_lut[mask_ix / 4u] >> ((mask_ix % 4u) * 8u);
+            mask &= 0xffu;
+            // Intersect with y half-plane masks
+            if sub_ix == 0u && !is_bump {
+                let mask_shift = u32(round(8.0 * (xy0.y - f32(y))));
+                mask &= 0xffu << mask_shift;
+            }
+            if last_pixel && xy1.x != 0.0 {
+                let mask_shift = u32(round(8.0 * (xy1.y - f32(y))));
+                mask &= ~(0xffu << mask_shift);
+            }
+            let mask_a = mask | (mask << 6u);
+            let mask_b = mask_a | (mask_a << 12u);
+            let mask_exp = (mask_b & 0x1010101u) | ((mask_b << 3u) & 0x10101010u);
+            var mask_signed = select(mask_exp, u32(-i32(mask_exp)), is_down);
+            if is_bump {
+                mask_signed += select(u32(-0x11111111), 0x11111111u, is_down);
+            }
+            atomicAdd(&sh_samples[pix_ix], mask_signed);
+#endif
+#ifdef msaa16
+            var mask = mask_lut[mask_ix / 2u] >> ((mask_ix % 2u) * 16u);
+            mask &= 0xffffu;
+            // Intersect with y half-plane masks
+            if sub_ix == 0u && !is_bump {
+                let mask_shift = u32(round(16.0 * (xy0.y - f32(y))));
+                mask &= 0xffffu << mask_shift;
+            }
+            if last_pixel && xy1.x != 0.0 {
+                let mask_shift = u32(round(16.0 * (xy1.y - f32(y))));
+                mask &= ~(0xffffu << mask_shift);
+            }
+            let mask0 = mask & 0xffu;
+            let mask0_a = mask0 | (mask0 << 6u);
+            let mask0_b = mask0_a | (mask0_a << 12u);
+            let mask0_exp = (mask0_b & 0x1010101u) | ((mask0_b << 3u) & 0x10101010u);
+            var mask0_signed = select(mask0_exp, u32(-i32(mask0_exp)), is_down);
+            let mask1 = (mask >> 8u) & 0xffu;
+            let mask1_a = mask1 | (mask1 << 6u);
+            let mask1_b = mask1_a | (mask1_a << 12u);
+            let mask1_exp = (mask1_b & 0x1010101u) | ((mask1_b << 3u) & 0x10101010u);
+            var mask1_signed = select(mask1_exp, u32(-i32(mask1_exp)), is_down);
+            if is_bump {
+                let bump_delta = select(u32(-0x11111111), 0x11111111u, is_down);
+                mask0_signed += bump_delta;
+                mask1_signed += bump_delta;
+            }
+            atomicAdd(&sh_samples[pix_ix * 2u], mask0_signed);
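+            // Samples 8..15 (mask1) accumulate into the second word.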
atomicAdd(&sh_samples[pix_ix * 2u + 1u], mask1_signed); +#endif + } + workgroupBarrier(); + } + var area: array; + let major = (th_ix * PIXELS_PER_THREAD) >> 3u; + var packed_w = atomicLoad(&sh_winding[major]); + // Prefix sum of packed 4 bit values within u32 + packed_w += (packed_w - 0x8888888u) << 4u; + packed_w += (packed_w - 0x888888u) << 8u; + packed_w += (packed_w - 0x8888u) << 16u; + // Note: could probably do bias in one go, but it would be inscrutable + if (major & 1u) != 0u { + // We could use shmem to communicate the value from another thread; + // if we had subgroups that would almost certainly be the most + // efficient way. But we just calculate again for simplicity. + var last_packed = atomicLoad(&sh_winding[major - 1u]); + last_packed += (last_packed - 0x8888888u) << 4u; + last_packed += (last_packed - 0x888888u) << 8u; + last_packed += (last_packed - 0x8888u) << 16u; + let bump = ((last_packed >> 28u) - 8u) * 0x11111111u; + packed_w += bump; + } + var packed_y = atomicLoad(&sh_winding_y[local_id.y >> 3u]); + packed_y += (packed_y - 0x8888888u) << 4u; + packed_y += (packed_y - 0x888888u) << 8u; + packed_y += (packed_y - 0x8888u) << 16u; + if th_ix == 0u { + atomicStore(&sh_winding_y[0], packed_y); + } + workgroupBarrier(); + var wind_y = (packed_y >> ((local_id.y & 7u) << 2u)) - 8u; + if local_id.y >= 8u { + wind_y += (atomicLoad(&sh_winding_y[0]) >> 28u) - 8u; + } + + for (var i = 0u; i < PIXELS_PER_THREAD; i++) { + let pix_ix = th_ix * PIXELS_PER_THREAD + i; + let minor = pix_ix & 7u; + //let nonzero = ((packed_w >> (minor << 2u)) & 0xfu) != u32(8 + backdrop); + // TODO: math might be off here + let expected_zero = (((packed_w >> (minor * 4u)) + wind_y) & 0xfu) - u32(fill.backdrop); + if expected_zero >= 16u { + area[i] = 1.0; + } else { +#ifdef msaa8 + let samples = atomicLoad(&sh_samples[pix_ix]); + let xored = (expected_zero * 0x11111111u) ^ samples; + // Each 4-bit nibble in xored is 0 for winding = 0, nonzero otherwise + let xored2 = xored | (xored * 2u); + let xored4 = xored2 | (xored2 * 4u); + area[i] = f32(countOneBits(xored4 & 0x88888888u)) * 0.125; +#endif +#ifdef msaa16 + let samples0 = atomicLoad(&sh_samples[pix_ix * 2u]); + let samples1 = atomicLoad(&sh_samples[pix_ix * 2u + 1u]); + let xored0 = (expected_zero * 0x11111111u) ^ samples0; + let xored0_2 = xored0 | (xored0 * 2u); + let xored1 = (expected_zero * 0x11111111u) ^ samples1; + let xored1_2 = xored1 | (xored1 >> 1u); + let xored2 = (xored0_2 & 0xAAAAAAAAu) | (xored1_2 & 0x55555555u); + let xored4 = xored2 | (xored2 * 4u); + area[i] = f32(countOneBits(xored4 & 0xCCCCCCCCu)) * 0.0625; +#endif + } + } + return area; +} +#endif + fn read_fill(cmd_ix: u32) -> CmdFill { let size_and_rule = ptcl[cmd_ix + 1u]; let seg_data = ptcl[cmd_ix + 2u]; @@ -126,15 +429,12 @@ fn extend_mode(t: f32, mode: u32) -> f32 { } } -#else - -@group(0) @binding(3) -var output: texture_storage_2d; - -#endif - let PIXELS_PER_THREAD = 4u; +// Analytic area antialiasing. +// +// This is currently dead code if msaa is enabled, but it would be fairly straightforward +// to wire this so it's a dynamic choice (even per-path). 
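+// Unlike fill_path_ms above, which counts covered samples against the
+// expected winding number, this computes a fractional coverage area per
+// pixel directly from the segment geometry.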
fn fill_path(fill: CmdFill, xy: vec2) -> array { let n_segs = fill.size_and_rule >> 1u; let even_odd = (fill.size_and_rule & 1u) != 0u; @@ -220,7 +520,11 @@ fn main( // CMD_FILL case 1u: { let fill = read_fill(cmd_ix); +#ifdef msaa + area = fill_path_ms(fill, wg_id.xy, local_id.xy); +#else area = fill_path(fill, xy); +#endif cmd_ix += 4u; } // CMD_STROKE diff --git a/src/lib.rs b/src/lib.rs index 28e5bf7ff..f383fd0fd 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -17,6 +17,7 @@ mod cpu_dispatch; mod cpu_shader; mod engine; +mod mask; mod render; mod scene; mod shaders; @@ -61,6 +62,19 @@ pub type Error = Box; /// Specialization of `Result` for our catch-all error type. pub type Result = std::result::Result; +/// Possible configurations for antialiasing. +#[derive(PartialEq, Eq)] +#[allow(unused)] +enum AaConfig { + Area, + Msaa8, + Msaa16, +} + +/// Configuration of antialiasing. Currently this is static, but could be switched to +/// a launch option or even finer-grained. +const ANTIALIASING: AaConfig = AaConfig::Msaa16; + /// Renders a scene into a texture or surface. #[cfg(feature = "wgpu")] pub struct Renderer { diff --git a/src/mask.rs b/src/mask.rs new file mode 100644 index 000000000..61cacf0bc --- /dev/null +++ b/src/mask.rs @@ -0,0 +1,98 @@ +// Copyright 2022 The Vello authors +// SPDX-License-Identifier: Apache-2.0 OR MIT + +//! Create a lookup table of half-plane sample masks. + +// Width is number of discrete translations +const MASK_WIDTH: usize = 32; +// Height is the number of discrete slopes +const MASK_HEIGHT: usize = 32; + +const PATTERN: [u8; 8] = [0, 5, 3, 7, 1, 4, 6, 2]; + +fn one_mask(slope: f64, mut translation: f64, is_pos: bool) -> u8 { + if is_pos { + translation = 1. - translation; + } + let mut result = 0; + for (i, item) in PATTERN.iter().enumerate() { + let mut y = (i as f64 + 0.5) * 0.125; + let x = (*item as f64 + 0.5) * 0.125; + if !is_pos { + y = 1. - y; + } + if (x - (1.0 - translation)) * (1. - slope) - (y - translation) * slope >= 0. { + result |= 1 << i; + } + } + result +} + +/// Make a lookup table of half-plane masks. +/// +/// The table is organized into two blocks each with MASK_HEIGHT/2 slopes. +/// The first block is negative slopes (x decreases as y increates), +/// the second as positive. +pub fn make_mask_lut() -> Vec { + (0..MASK_WIDTH * MASK_HEIGHT) + .map(|i| { + const HALF_HEIGHT: usize = MASK_HEIGHT / 2; + let u = i % MASK_WIDTH; + let v = i / MASK_WIDTH; + let is_pos = v >= HALF_HEIGHT; + let y = ((v % HALF_HEIGHT) as f64 + 0.5) * (1.0 / HALF_HEIGHT as f64); + let x = (u as f64 + 0.5) * (1.0 / MASK_WIDTH as f64); + one_mask(y, x, is_pos) + }) + .collect() +} + +// Width is number of discrete translations +const MASK16_WIDTH: usize = 64; +// Height is the number of discrete slopes +const MASK16_HEIGHT: usize = 64; + +// This is based on the [D3D11 standard sample pattern]. +// +// [D3D11 standard sample pattern]: https://learn.microsoft.com/en-us/windows/win32/api/d3d11/ne-d3d11-d3d11_standard_multisample_quality_levels +const PATTERN_16: [u8; 16] = [1, 8, 4, 11, 15, 7, 3, 12, 0, 9, 5, 13, 2, 10, 6, 14]; + +fn one_mask_16(slope: f64, mut translation: f64, is_pos: bool) -> u16 { + if is_pos { + translation = 1. - translation; + } + let mut result = 0; + for (i, item) in PATTERN_16.iter().enumerate() { + let mut y = (i as f64 + 0.5) * 0.0625; + let x = (*item as f64 + 0.5) * 0.0625; + if !is_pos { + y = 1. - y; + } + if (x - (1.0 - translation)) * (1. - slope) - (y - translation) * slope >= 0. 
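        // Cross-product sign test: decides on which side of the half-plane
        // boundary (selected by slope and translation) this sample center lies.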
{ + result |= 1 << i; + } + } + result +} + +/// Make a lookup table of half-plane masks. +/// +/// The table is organized into two blocks each with MASK16_HEIGHT/2 slopes. +/// The first block is negative slopes (x decreases as y increates), +/// the second as positive. +pub fn make_mask_lut_16() -> Vec { + let v16 = (0..MASK16_WIDTH * MASK16_HEIGHT) + .map(|i| { + const HALF_HEIGHT: usize = MASK16_HEIGHT / 2; + let u = i % MASK16_WIDTH; + let v = i / MASK16_WIDTH; + let is_pos = v >= HALF_HEIGHT; + let y = ((v % HALF_HEIGHT) as f64 + 0.5) * (1.0 / HALF_HEIGHT as f64); + let x = (u as f64 + 0.5) * (1.0 / MASK16_WIDTH as f64); + one_mask_16(y, x, is_pos) + }) + .collect::>(); + // This annoyingly makes another copy. We can avoid that by pushing two + // bytes per iteration of the above loop. + bytemuck::cast_slice(&v16).into() +} diff --git a/src/render.rs b/src/render.rs index 268007faa..0bb657954 100644 --- a/src/render.rs +++ b/src/render.rs @@ -3,7 +3,7 @@ use crate::{ engine::{BufProxy, ImageFormat, ImageProxy, Recording, ResourceProxy}, shaders::FullShaders, - RenderParams, Scene, + AaConfig, RenderParams, Scene, ANTIALIASING, }; use vello_encoding::{Encoding, WorkgroupSize}; @@ -11,6 +11,7 @@ use vello_encoding::{Encoding, WorkgroupSize}; pub struct Render { fine_wg_count: Option, fine_resources: Option, + mask_buf: Option, } /// Resources produced by pipeline, needed for fine rasterization. @@ -62,6 +63,7 @@ impl Render { Render { fine_wg_count: None, fine_resources: None, + mask_buf: None, } } @@ -412,19 +414,48 @@ impl Render { pub fn record_fine(&mut self, shaders: &FullShaders, recording: &mut Recording) { let fine_wg_count = self.fine_wg_count.take().unwrap(); let fine = self.fine_resources.take().unwrap(); - recording.dispatch( - shaders.fine, - fine_wg_count, - [ - fine.config_buf, - fine.segments_buf, - fine.ptcl_buf, - fine.info_bin_data_buf, - ResourceProxy::Image(fine.out_image), - fine.gradient_image, - fine.image_atlas, - ], - ); + match ANTIALIASING { + AaConfig::Area => { + recording.dispatch( + shaders.fine, + fine_wg_count, + [ + fine.config_buf, + fine.segments_buf, + fine.ptcl_buf, + fine.info_bin_data_buf, + ResourceProxy::Image(fine.out_image), + fine.gradient_image, + fine.image_atlas, + ], + ); + } + _ => { + if self.mask_buf.is_none() { + let mask_lut = match ANTIALIASING { + AaConfig::Msaa16 => crate::mask::make_mask_lut_16(), + AaConfig::Msaa8 => crate::mask::make_mask_lut(), + _ => unreachable!(), + }; + let buf = recording.upload("mask lut", mask_lut); + self.mask_buf = Some(buf.into()); + } + recording.dispatch( + shaders.fine, + fine_wg_count, + [ + fine.config_buf, + fine.segments_buf, + fine.ptcl_buf, + fine.info_bin_data_buf, + ResourceProxy::Image(fine.out_image), + fine.gradient_image, + fine.image_atlas, + self.mask_buf.unwrap(), + ], + ); + } + } recording.free_resource(fine.config_buf); recording.free_resource(fine.tile_buf); recording.free_resource(fine.segments_buf); @@ -432,6 +463,10 @@ impl Render { recording.free_resource(fine.gradient_image); recording.free_resource(fine.image_atlas); recording.free_resource(fine.info_bin_data_buf); + // TODO: make mask buf persistent + if let Some(mask_buf) = self.mask_buf.take() { + recording.free_resource(mask_buf); + } } /// Get the output image. 
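A note on how the LUT bytes produced above get consumed: each u8 from
make_mask_lut is a per-sample coverage bitmask, and fine.wgsl expands it so
that sample bit i lands at bit 4 * i, turning eight one-bit samples into
eight 4-bit accumulator lanes. A standalone Rust check of that expansion
(illustrative only, not part of the patch; expand_mask is a hypothetical name):

fn expand_mask(mask: u32) -> u32 {
    debug_assert!(mask <= 0xff);
    // Two smearing steps place copies of the low byte at offsets 0, 6, 12, 18...
    let a = mask | (mask << 6);
    let b = a | (a << 12);
    // ...and the two masks keep exactly one copy of bit i, at bit 4 * i.
    (b & 0x0101_0101) | ((b << 3) & 0x1010_1010)
}

fn main() {
    for bit in 0..8 {
        assert_eq!(expand_mask(1 << bit), 1 << (4 * bit));
    }
    assert_eq!(expand_mask(0xff), 0x1111_1111);
}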
diff --git a/src/shaders.rs b/src/shaders.rs index 86e6ed7bd..668dafac4 100644 --- a/src/shaders.rs +++ b/src/shaders.rs @@ -86,6 +86,8 @@ pub struct FullShaders { #[cfg(feature = "wgpu")] pub fn full_shaders(device: &Device, engine: &mut WgpuEngine) -> Result { + use crate::ANTIALIASING; + let imports = SHARED_SHADERS .iter() .copied() @@ -93,6 +95,17 @@ pub fn full_shaders(device: &Device, engine: &mut WgpuEngine) -> Result { + full_config.insert("msaa".into()); + full_config.insert("msaa16".into()); + } + crate::AaConfig::Msaa8 => { + full_config.insert("msaa".into()); + full_config.insert("msaa8".into()); + } + crate::AaConfig::Area => (), + } let mut small_config = HashSet::new(); small_config.insert("full".into()); small_config.insert("small".into()); @@ -292,20 +305,39 @@ pub fn full_shaders(device: &Device, engine: &mut WgpuEngine) -> Result engine.add_shader( + device, + "fine", + preprocess::preprocess(shader!("fine"), &full_config, &imports).into(), + &[ + BindType::Uniform, + BindType::BufReadOnly, + BindType::BufReadOnly, + BindType::BufReadOnly, + BindType::Image(ImageFormat::Rgba8), + BindType::ImageRead(ImageFormat::Rgba8), + BindType::ImageRead(ImageFormat::Rgba8), + ], + )?, + _ => { + engine.add_shader( + device, + "fine", + preprocess::preprocess(shader!("fine"), &full_config, &imports).into(), + &[ + BindType::Uniform, + BindType::BufReadOnly, + BindType::BufReadOnly, + BindType::BufReadOnly, + BindType::Image(ImageFormat::Rgba8), + BindType::ImageRead(ImageFormat::Rgba8), + BindType::ImageRead(ImageFormat::Rgba8), + BindType::BufReadOnly, // mask buffer + ], + )? + } + }; Ok(FullShaders { pathtag_reduce, pathtag_reduce2, From 2c0ef60588349886135461111da6edd163e450aa Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Wed, 11 Oct 2023 18:28:30 -0700 Subject: [PATCH 2/2] Address review feedback Minor cleanup of the multisampling logic, and switch default to Area. --- shader/fine.wgsl | 9 ++------- src/lib.rs | 2 +- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/shader/fine.wgsl b/shader/fine.wgsl index 9bd7da3a3..f41747d82 100644 --- a/shader/fine.wgsl +++ b/shader/fine.wgsl @@ -145,12 +145,7 @@ fn fill_path_ms(fill: CmdFill, wg_id: vec2, local_id: vec2) -> array, local_id: vec2) -> array= xy0.x; let sign = select(-1.0, 1.0, is_positive_slope); let xt0 = floor(xy0.x * sign); let c = xy0.x * sign - xt0; - // This has a special case in the JS code, but we should just not render let y0i = floor(xy0.y); - let ytop = select(y0i + 1.0, ceil(xy0.y), xy0.y == xy1.y); + let ytop = y0i + 1.0; let b = dy_dxdy * c + a * (ytop - xy0.y); let x0i = i32(xt0 * sign + 0.5 * (sign - 1.0)); // Use line equation to plot pixel coordinates diff --git a/src/lib.rs b/src/lib.rs index f383fd0fd..006accd3c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -73,7 +73,7 @@ enum AaConfig { /// Configuration of antialiasing. Currently this is static, but could be switched to /// a launch option or even finer-grained. -const ANTIALIASING: AaConfig = AaConfig::Msaa16; +const ANTIALIASING: AaConfig = AaConfig::Area; /// Renders a scene into a texture or surface. #[cfg(feature = "wgpu")]
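End note on the packed prefix sum in fill_path_ms: the
`packed_w += (packed_w - 0x8888888u) << 4u;` sequence is a log-step scan over
eight 4-bit lanes, where each shifted add first subtracts the bias carried by
the lanes it duplicates, so every lane stays biased by exactly 8. A small
Rust model (illustrative only, not part of the patch):

fn prefix_sum_packed(mut w: u32) -> u32 {
    // Eight 4-bit winding lanes, each biased by 8; the true values must stay
    // small enough that no lane leaves 0..=15, or carries cross lanes.
    w = w.wrapping_add(w.wrapping_sub(0x0888_8888) << 4);
    w = w.wrapping_add(w.wrapping_sub(0x0088_8888) << 8);
    w = w.wrapping_add(w.wrapping_sub(0x0000_8888) << 16);
    w
}

fn main() {
    // A single +1 crossing in lane 0 propagates into all eight prefix sums.
    assert_eq!(prefix_sum_packed(0x8888_8889), 0x9999_9999);
}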