diff --git a/.vscode/settings.json b/.vscode/settings.json
index a75948f7..57a124c7 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -16,6 +16,6 @@
   },
   "wgsl-analyzer.diagnostics.nagaVersion": "main",
   "wgsl-analyzer.preprocessor.shaderDefs": [
-    "full"
+    "full", "msaa16", "msaa"
   ]
 }
diff --git a/shader/fine.wgsl b/shader/fine.wgsl
index 108c88cb..f41747d8 100644
--- a/shader/fine.wgsl
+++ b/shader/fine.wgsl
@@ -2,8 +2,10 @@
 
 // Fine rasterizer. This can run in simple (just path rendering) and full
 // modes, controllable by #define.
+//
+// To enable multisampled rendering, turn on both the msaa ifdef and one of msaa8
+// or msaa16.
 
-// This is a cut'n'paste w/ backdrop.
 struct Tile {
     backdrop: i32,
     segments: u32,
@@ -18,8 +20,6 @@ var<uniform> config: Config;
 @group(0) @binding(1)
 var<storage> segments: array<Segment>;
 
-#ifdef full
-
 #import blend
 #import ptcl
 
@@ -40,6 +40,304 @@ var gradients: texture_2d<f32>;
 @group(0) @binding(6)
 var image_atlas: texture_2d<f32>;
 
+#ifdef msaa8
+let MASK_WIDTH = 32u;
+let MASK_HEIGHT = 32u;
+let SH_SAMPLES_SIZE = 256u;
+let SAMPLE_WORDS_PER_PIXEL = 1u;
+// This might be better in uniform, but that has 16 byte alignment
+@group(0) @binding(7)
+var<storage> mask_lut: array<u32, 256u>;
+#endif
+
+#ifdef msaa16
+let MASK_WIDTH = 64u;
+let MASK_HEIGHT = 64u;
+let SH_SAMPLES_SIZE = 512u;
+let SAMPLE_WORDS_PER_PIXEL = 2u;
+@group(0) @binding(7)
+var<storage> mask_lut: array<u32, 2048u>;
+#endif
+
+#ifdef msaa
+let WG_SIZE = 64u;
+var<workgroup> sh_count: array<u32, WG_SIZE>;
+
+// Winding numbers for 8 pixels packed into each u32: 4 bits per pixel, biased by 8
+var<workgroup> sh_winding: array<atomic<u32>, 32u>;
+// Same packing, one group of 8 per pixel
+var<workgroup> sh_samples: array<atomic<u32>, SH_SAMPLES_SIZE>;
+// Same packing, accumulating winding numbers for vertical edge crossings
+var<workgroup> sh_winding_y: array<atomic<u32>, 2u>;
+
+// Number of integer grid cells touched by the interval between a and b (either order); never less than 1.
+fn span(a: f32, b: f32) -> u32 {
+    return u32(max(ceil(max(a, b)) - floor(min(a, b)), 1.0));
+}
+
+let SEG_SIZE = 5u;
+
+// Multisampled fill: returns per-pixel coverage for this thread's PIXELS_PER_THREAD pixels.
+fn fill_path_ms(fill: CmdFill, wg_id: vec2<u32>, local_id: vec2<u32>) -> array<f32, PIXELS_PER_THREAD> {
+    let n_segs = fill.size_and_rule >> 1u;
+    let even_odd = (fill.size_and_rule & 1u) != 0u; // NOTE(review): not used below
+    let tile_origin = vec2(f32(wg_id.x) * f32(TILE_WIDTH), f32(wg_id.y) * f32(TILE_HEIGHT));
+    let th_ix = local_id.y * (TILE_WIDTH / PIXELS_PER_THREAD) + local_id.x;
+    if th_ix < 32u {
+        if th_ix < 2u {
+            atomicStore(&sh_winding_y[th_ix], 0x88888888u);
+        }
+        atomicStore(&sh_winding[th_ix], 0x88888888u);
+    }
+    let sample_count = PIXELS_PER_THREAD * SAMPLE_WORDS_PER_PIXEL;
+    for (var i = 0u; i < sample_count; i++) {
+        atomicStore(&sh_samples[th_ix * sample_count + i], 0x88888888u);
+    }
+    workgroupBarrier();
+    let n_batch = (n_segs + (WG_SIZE - 1u)) / WG_SIZE;
+    for (var batch = 0u; batch < n_batch; batch++) {
+        let seg_ix = batch * WG_SIZE + th_ix;
+        let seg_off = fill.seg_data + seg_ix;
+        var count = 0u;
+        let slice_size = min(n_segs - batch * WG_SIZE, WG_SIZE);
+        // TODO: might save a register rewriting this in terms of limit
+        if th_ix < slice_size {
+            let segment = segments[seg_off];
+            // Note: coords relative to tile origin probably a good idea in coarse path,
+            // especially as f16 would work. But keeping existing scheme for compatibility.
+            let xy0 = segment.origin - tile_origin;
+            let xy1 = xy0 + segment.delta;
+            var y_edge_f = f32(TILE_HEIGHT);
+            var delta = select(-1, 1, xy1.x <= xy0.x);
+            if xy0.x == 0.0 && xy1.x == 0.0 {
+                if xy0.y == 0.0 {
+                    y_edge_f = 0.0;
+                } else if xy1.y == 0.0 {
+                    y_edge_f = 0.0;
+                    delta = -delta;
+                }
+            } else {
+                if xy0.x == 0.0 {
+                    if xy0.y != 0.0 {
+                        y_edge_f = xy0.y;
+                    }
+                } else if xy1.x == 0.0 && xy1.y != 0.0 {
+                    y_edge_f = xy1.y;
+                }
+                // discard horizontal lines aligned to pixel grid
+                if !(xy0.y == xy1.y && xy0.y == floor(xy0.y)) {
+                    count = span(xy0.x, xy1.x) + span(xy0.y, xy1.y) - 1u;
+                }
+            }
+            let y_edge = u32(ceil(y_edge_f));
+            if y_edge < TILE_HEIGHT {
+                atomicAdd(&sh_winding_y[y_edge >> 3u], u32(delta) << ((y_edge & 7u) << 2u));
+            }
+        }
+        // workgroup prefix sum of counts
+        sh_count[th_ix] = count;
+        let lg_n = firstLeadingBit(slice_size * 2u - 1u);
+        for (var i = 0u; i < lg_n; i++) {
+            workgroupBarrier();
+            if th_ix >= 1u << i {
+                count += sh_count[th_ix - (1u << i)];
+            }
+            workgroupBarrier();
+            sh_count[th_ix] = count;
+        }
+        let total = workgroupUniformLoad(&sh_count[slice_size - 1u]);
+        for (var i = th_ix; i < total; i += WG_SIZE) {
+            // binary search to find pixel
+            var lo = 0u;
+            var hi = slice_size;
+            let goal = i;
+            while hi > lo + 1u {
+                let mid = (lo + hi) >> 1u;
+                if goal >= sh_count[mid - 1u] {
+                    lo = mid;
+                } else {
+                    hi = mid;
+                }
+            }
+            let el_ix = lo;
+            let last_pixel = i + 1u == sh_count[el_ix];
+            let sub_ix = i - select(0u, sh_count[el_ix - 1u], el_ix > 0u);
+            let seg_off = fill.seg_data + batch * WG_SIZE + el_ix;
+            let segment = segments[seg_off];
+            let xy0_in = segment.origin - tile_origin;
+            let xy1_in = xy0_in + segment.delta;
+            let is_down = xy1_in.y >= xy0_in.y;
+            let xy0 = select(xy1_in, xy0_in, is_down);
+            let xy1 = select(xy0_in, xy1_in, is_down);
+
+            // Set up data for line rasterization
+            // Note: this is duplicated work if total count exceeds a workgroup.
+            // One alternative is to compute it in a separate dispatch.
+            let dx = abs(xy1.x - xy0.x);
+            let dy = xy1.y - xy0.y;
+            // TODO: apply numerical robustness and optimization
+            let dy_dxdy = dy / (dx + dy);
+            let a = dx / (dx + dy);
+            let is_positive_slope = xy1.x >= xy0.x;
+            let sign = select(-1.0, 1.0, is_positive_slope);
+            let xt0 = floor(xy0.x * sign);
+            let c = xy0.x * sign - xt0;
+            let y0i = floor(xy0.y);
+            let ytop = y0i + 1.0;
+            let b = dy_dxdy * c + a * (ytop - xy0.y);
+            let x0i = i32(xt0 * sign + 0.5 * (sign - 1.0));
+            // Use line equation to plot pixel coordinates
+
+            let zf = a * f32(sub_ix) + b;
+            let z = floor(zf);
+            let x = x0i + i32(sign * z);
+            let y = i32(y0i) + i32(sub_ix) - i32(z);
+            var is_delta: bool;
+            // We need to adjust winding number if slope is positive and there
+            // is a crossing at the left edge of the pixel.
+            var is_bump = false;
+            let zp = floor(a * f32(sub_ix - 1u) + b);
+            if sub_ix == 0u {
+                is_delta = y0i == xy0.y && y0i != xy1.y;
+                is_bump = xy0.x == 0.0;
+            } else {
+                is_delta = z == zp;
+                is_bump = is_positive_slope && !is_delta;
+            }
+            let pix_ix = u32(y) * TILE_WIDTH + u32(x);
+            if u32(x) < TILE_WIDTH - 1u && u32(y) < TILE_HEIGHT {
+                let delta_pix = pix_ix + 1u;
+                if is_delta {
+                    let delta = select(u32(-1), 1u, is_down) << ((delta_pix & 7u) << 2u);
+                    atomicAdd(&sh_winding[delta_pix >> 3u], delta);
+                }
+            }
+            // Apply sample mask
+            let mask_block = u32(is_positive_slope) * (MASK_WIDTH * MASK_HEIGHT / 2u);
+            let half_height = f32(MASK_HEIGHT / 2u);
+            let mask_row = floor(min(a * half_height, half_height - 1.0)) * f32(MASK_WIDTH);
+            let mask_col = floor((zf - z) * f32(MASK_WIDTH));
+            let mask_ix = mask_block + u32(mask_row + mask_col);
+#ifdef msaa8
+            var mask = mask_lut[mask_ix / 4u] >> ((mask_ix % 4u) * 8u);
+            mask &= 0xffu;
+            // Intersect with y half-plane masks
+            if sub_ix == 0u && !is_bump {
+                let mask_shift = u32(round(8.0 * (xy0.y - f32(y))));
+                mask &= 0xffu << mask_shift;
+            }
+            if last_pixel && xy1.x != 0.0 {
+                let mask_shift = u32(round(8.0 * (xy1.y - f32(y))));
+                mask &= ~(0xffu << mask_shift);
+            }
+            let mask_a = mask | (mask << 6u);
+            let mask_b = mask_a | (mask_a << 12u);
+            let mask_exp = (mask_b & 0x1010101u) | ((mask_b << 3u) & 0x10101010u);
+            var mask_signed = select(mask_exp, u32(-i32(mask_exp)), is_down);
+            if is_bump { // one winding step on all 8 sample nibbles, in either direction
+                mask_signed += select(u32(-0x11111111), 0x11111111u, is_down);
+            }
+            atomicAdd(&sh_samples[pix_ix], mask_signed);
+#endif
+#ifdef msaa16
+            var mask = mask_lut[mask_ix / 2u] >> ((mask_ix % 2u) * 16u);
+            mask &= 0xffffu;
+            // Intersect with y half-plane masks
+            if sub_ix == 0u && !is_bump {
+                let mask_shift = u32(round(16.0 * (xy0.y - f32(y))));
+                mask &= 0xffffu << mask_shift;
+            }
+            if last_pixel && xy1.x != 0.0 {
+                let mask_shift = u32(round(16.0 * (xy1.y - f32(y))));
+                mask &= ~(0xffffu << mask_shift);
+            }
+            let mask0 = mask & 0xffu;
+            let mask0_a = mask0 | (mask0 << 6u);
+            let mask0_b = mask0_a | (mask0_a << 12u);
+            let mask0_exp = (mask0_b & 0x1010101u) | ((mask0_b << 3u) & 0x10101010u);
+            var mask0_signed = select(mask0_exp, u32(-i32(mask0_exp)), is_down);
+            let mask1 = (mask >> 8u) & 0xffu;
+            let mask1_a = mask1 | (mask1 << 6u);
+            let mask1_b = mask1_a | (mask1_a << 12u);
+            let mask1_exp = (mask1_b & 0x1010101u) | ((mask1_b << 3u) & 0x10101010u);
+            var mask1_signed = select(mask1_exp, u32(-i32(mask1_exp)), is_down);
+            if is_bump { // one winding step on all 8 nibbles of both sample words
+                let bump_delta = select(u32(-0x11111111), 0x11111111u, is_down);
+                mask0_signed += bump_delta;
+                mask1_signed += bump_delta;
+            }
+            atomicAdd(&sh_samples[pix_ix * 2u], mask0_signed);
+            atomicAdd(&sh_samples[pix_ix * 2u + 1u], mask1_signed);
+#endif
+        }
+        workgroupBarrier();
+    }
+    var area: array<f32, PIXELS_PER_THREAD>;
+    let major = (th_ix * PIXELS_PER_THREAD) >> 3u;
+    var packed_w = atomicLoad(&sh_winding[major]);
+    // Prefix sum of packed 4 bit values within u32
+    packed_w += (packed_w - 0x8888888u) << 4u;
+    packed_w += (packed_w - 0x888888u) << 8u;
+    packed_w += (packed_w - 0x8888u) << 16u;
+    // Note: could probably do bias in one go, but it would be inscrutable
+    if (major & 1u) != 0u {
+        // We could use shmem to communicate the value from another thread;
+        // if we had subgroups that would almost certainly be the most
+        // efficient way. But we just calculate again for simplicity.
+        var last_packed = atomicLoad(&sh_winding[major - 1u]);
+        last_packed += (last_packed - 0x8888888u) << 4u;
+        last_packed += (last_packed - 0x888888u) << 8u;
+        last_packed += (last_packed - 0x8888u) << 16u;
+        let bump = ((last_packed >> 28u) - 8u) * 0x11111111u;
+        packed_w += bump;
+    }
+    var packed_y = atomicLoad(&sh_winding_y[local_id.y >> 3u]);
+    packed_y += (packed_y - 0x8888888u) << 4u;
+    packed_y += (packed_y - 0x888888u) << 8u;
+    packed_y += (packed_y - 0x8888u) << 16u;
+    if th_ix == 0u {
+        atomicStore(&sh_winding_y[0], packed_y);
+    }
+    workgroupBarrier();
+    var wind_y = (packed_y >> ((local_id.y & 7u) << 2u)) - 8u;
+    if local_id.y >= 8u {
+        wind_y += (atomicLoad(&sh_winding_y[0]) >> 28u) - 8u;
+    }
+
+    for (var i = 0u; i < PIXELS_PER_THREAD; i++) {
+        let pix_ix = th_ix * PIXELS_PER_THREAD + i;
+        let minor = pix_ix & 7u;
+        //let nonzero = ((packed_w >> (minor << 2u)) & 0xfu) != u32(8 + backdrop);
+        // TODO: math might be off here
+        let expected_zero = (((packed_w >> (minor * 4u)) + wind_y) & 0xfu) - u32(fill.backdrop);
+        if expected_zero >= 16u {
+            area[i] = 1.0;
+        } else {
+#ifdef msaa8
+            let samples = atomicLoad(&sh_samples[pix_ix]);
+            let xored = (expected_zero * 0x11111111u) ^ samples;
+            // Each 4-bit nibble in xored is 0 for winding = 0, nonzero otherwise
+            let xored2 = xored | (xored * 2u);
+            let xored4 = xored2 | (xored2 * 4u);
+            area[i] = f32(countOneBits(xored4 & 0x88888888u)) * 0.125;
+#endif
+#ifdef msaa16
+            let samples0 = atomicLoad(&sh_samples[pix_ix * 2u]);
+            let samples1 = atomicLoad(&sh_samples[pix_ix * 2u + 1u]);
+            let xored0 = (expected_zero * 0x11111111u) ^ samples0;
+            let xored0_2 = xored0 | (xored0 * 2u);
+            let xored1 = (expected_zero * 0x11111111u) ^ samples1;
+            let xored1_2 = xored1 | (xored1 >> 1u);
+            let xored2 = (xored0_2 & 0xAAAAAAAAu) | (xored1_2 & 0x55555555u);
+            let xored4 = xored2 | (xored2 * 4u);
+            area[i] = f32(countOneBits(xored4 & 0xCCCCCCCCu)) * 0.0625;
+#endif
+        }
+    }
+    return area;
+}
+#endif
+
 fn read_fill(cmd_ix: u32) -> CmdFill {
     let size_and_rule = ptcl[cmd_ix + 1u];
     let seg_data = ptcl[cmd_ix + 2u];
@@ -126,15 +424,12 @@ fn extend_mode(t: f32, mode: u32) -> f32 {
     }
 }
 
-#else
-
-@group(0) @binding(3)
-var output: texture_storage_2d<r8, write>;
-
-#endif
-
 let PIXELS_PER_THREAD = 4u;
 
+// Analytic area antialiasing.
+//
+// This is currently dead code if msaa is enabled, but it would be fairly straightforward
+// to wire this so it's a dynamic choice (even per-path).
 fn fill_path(fill: CmdFill, xy: vec2<f32>) -> array<f32, PIXELS_PER_THREAD> {
     let n_segs = fill.size_and_rule >> 1u;
     let even_odd = (fill.size_and_rule & 1u) != 0u;
@@ -220,7 +515,11 @@ fn main(
             // CMD_FILL
             case 1u: {
                 let fill = read_fill(cmd_ix);
+#ifdef msaa
+                area = fill_path_ms(fill, wg_id.xy, local_id.xy);
+#else
                 area = fill_path(fill, xy);
+#endif
                 cmd_ix += 4u;
             }
             // CMD_STROKE
diff --git a/src/lib.rs b/src/lib.rs
index 28e5bf7f..006accd3 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -17,6 +17,7 @@
 mod cpu_dispatch;
 mod cpu_shader;
 mod engine;
+mod mask;
 mod render;
 mod scene;
 mod shaders;
@@ -61,6 +62,19 @@ pub type Error = Box<dyn std::error::Error>;
 /// Specialization of `Result` for our catch-all error type.
 pub type Result<T> = std::result::Result<T, Error>;
 
+/// Antialiasing strategies the fine rasterizer can be built with.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+#[allow(unused)] // presumably: variants not selected by `ANTIALIASING` are never constructed
+enum AaConfig {
+    Area,   // analytic area coverage; no mask lookup table is bound
+    Msaa8,  // multisampling with the `msaa` + `msaa8` shader defines
+    Msaa16, // multisampling with the `msaa` + `msaa16` shader defines
+}
+
+/// Antialiasing mode used by the renderer. For now this is a compile-time constant, but it
+/// could be switched to a launch option or made even finer-grained.
+const ANTIALIASING: AaConfig = AaConfig::Area;
+
 /// Renders a scene into a texture or surface.
 #[cfg(feature = "wgpu")]
 pub struct Renderer {
diff --git a/src/mask.rs b/src/mask.rs
new file mode 100644
index 00000000..61cacf0b
--- /dev/null
+++ b/src/mask.rs
@@ -0,0 +1,98 @@
+// Copyright 2022 The Vello authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+//! Create a lookup table of half-plane sample masks.
+
+// Width is number of discrete translations
+const MASK_WIDTH: usize = 32;
+// Height is the number of discrete slopes
+const MASK_HEIGHT: usize = 32;
+
+const PATTERN: [u8; 8] = [0, 5, 3, 7, 1, 4, 6, 2];
+
+fn one_mask(slope: f64, translation: f64, is_pos: bool) -> u8 {
+    // Positive slopes are evaluated against the mirrored translation.
+    let t = if is_pos { 1. - translation } else { translation };
+    PATTERN
+        .iter()
+        .enumerate()
+        .filter(|&(row, &col)| {
+            // Sample centre in the unit pixel: PATTERN gives the x slot of each row.
+            let x = (col as f64 + 0.5) * 0.125;
+            let y_down = (row as f64 + 0.5) * 0.125;
+            // Negative slopes are handled by flipping the sample vertically.
+            let y = if is_pos { y_down } else { 1. - y_down };
+            // Select samples on the non-negative side of the half-plane edge.
+            (x - (1.0 - t)) * (1. - slope) - (y - t) * slope >= 0.
+        })
+        .fold(0u8, |mask, (row, _)| mask | (1u8 << row))
+}
+
+/// Build the 8-sample lookup table of half-plane masks.
+///
+/// Entries are laid out row-major: each row holds MASK_WIDTH translations
+/// for one slope. Rows 0..MASK_HEIGHT/2 are negative slopes (x decreases
+/// as y increases); the remaining rows are the positive slopes.
+pub fn make_mask_lut() -> Vec<u8> {
+    const HALF_HEIGHT: usize = MASK_HEIGHT / 2;
+    let mut lut = Vec::with_capacity(MASK_WIDTH * MASK_HEIGHT);
+    for row in 0..MASK_HEIGHT {
+        // Slope is sampled at the centre of its bucket within the block.
+        let slope = ((row % HALF_HEIGHT) as f64 + 0.5) * (1.0 / HALF_HEIGHT as f64);
+        for col in 0..MASK_WIDTH {
+            let translation = (col as f64 + 0.5) * (1.0 / MASK_WIDTH as f64);
+            lut.push(one_mask(slope, translation, row >= HALF_HEIGHT));
+        }
+    }
+    lut
+}
+
+// Width is number of discrete translations
+const MASK16_WIDTH: usize = 64;
+// Height is the number of discrete slopes
+const MASK16_HEIGHT: usize = 64;
+
+// This is based on the [D3D11 standard sample pattern].
+//
+// [D3D11 standard sample pattern]: https://learn.microsoft.com/en-us/windows/win32/api/d3d11/ne-d3d11-d3d11_standard_multisample_quality_levels
+const PATTERN_16: [u8; 16] = [1, 8, 4, 11, 15, 7, 3, 12, 0, 9, 5, 13, 2, 10, 6, 14];
+
+fn one_mask_16(slope: f64, translation: f64, is_pos: bool) -> u16 {
+    // Positive slopes are evaluated against the mirrored translation.
+    let t = if is_pos { 1. - translation } else { translation };
+    PATTERN_16
+        .iter()
+        .enumerate()
+        .filter(|&(row, &col)| {
+            // Sample centre in the unit pixel: PATTERN_16 gives the x slot of each row.
+            let x = (col as f64 + 0.5) * 0.0625;
+            let y_down = (row as f64 + 0.5) * 0.0625;
+            // Negative slopes are handled by flipping the sample vertically.
+            let y = if is_pos { y_down } else { 1. - y_down };
+            // Select samples on the non-negative side of the half-plane edge.
+            (x - (1.0 - t)) * (1. - slope) - (y - t) * slope >= 0.
+        })
+        .fold(0u16, |mask, (row, _)| mask | (1u16 << row))
+}
+
+/// Build the 16-sample lookup table of half-plane masks as raw bytes.
+///
+/// Rows 0..MASK16_HEIGHT/2 are negative slopes (x decreases as y increases),
+/// the remaining rows positive slopes; each row has MASK16_WIDTH translations.
+/// Every 16-bit mask is emitted little-endian, low byte first.
+pub fn make_mask_lut_16() -> Vec<u8> {
+    (0..MASK16_WIDTH * MASK16_HEIGHT)
+        .flat_map(|i| {
+            const HALF_HEIGHT: usize = MASK16_HEIGHT / 2;
+            let u = i % MASK16_WIDTH;
+            let v = i / MASK16_WIDTH;
+            let is_pos = v >= HALF_HEIGHT;
+            let y = ((v % HALF_HEIGHT) as f64 + 0.5) * (1.0 / HALF_HEIGHT as f64);
+            let x = (u as f64 + 0.5) * (1.0 / MASK16_WIDTH as f64);
+            // Emit explicit little-endian bytes: the result no longer depends on
+            // host endianness, and the bytes are pushed directly so there is no
+            // intermediate Vec<u16> that has to be copied a second time.
+            one_mask_16(y, x, is_pos).to_le_bytes()
+        })
+        .collect()
+}
diff --git a/src/render.rs b/src/render.rs
index 268007fa..0bb65795 100644
--- a/src/render.rs
+++ b/src/render.rs
@@ -3,7 +3,7 @@
 use crate::{
     engine::{BufProxy, ImageFormat, ImageProxy, Recording, ResourceProxy},
     shaders::FullShaders,
-    RenderParams, Scene,
+    AaConfig, RenderParams, Scene, ANTIALIASING,
 };
 use vello_encoding::{Encoding, WorkgroupSize};
 
@@ -11,6 +11,7 @@ use vello_encoding::{Encoding, WorkgroupSize};
 pub struct Render {
     fine_wg_count: Option<WorkgroupSize>,
     fine_resources: Option<FineResources>,
+    mask_buf: Option<ResourceProxy>,
 }
 
 /// Resources produced by pipeline, needed for fine rasterization.
@@ -62,6 +63,7 @@ impl Render {
         Render {
             fine_wg_count: None,
             fine_resources: None,
+            mask_buf: None,
         }
     }
 
@@ -412,19 +414,48 @@ impl Render {
     pub fn record_fine(&mut self, shaders: &FullShaders, recording: &mut Recording) {
         let fine_wg_count = self.fine_wg_count.take().unwrap();
         let fine = self.fine_resources.take().unwrap();
-        recording.dispatch(
-            shaders.fine,
-            fine_wg_count,
-            [
-                fine.config_buf,
-                fine.segments_buf,
-                fine.ptcl_buf,
-                fine.info_bin_data_buf,
-                ResourceProxy::Image(fine.out_image),
-                fine.gradient_image,
-                fine.image_atlas,
-            ],
-        );
+        match ANTIALIASING {
+            AaConfig::Area => {
+                recording.dispatch(
+                    shaders.fine,
+                    fine_wg_count,
+                    [
+                        fine.config_buf,
+                        fine.segments_buf,
+                        fine.ptcl_buf,
+                        fine.info_bin_data_buf,
+                        ResourceProxy::Image(fine.out_image),
+                        fine.gradient_image,
+                        fine.image_atlas,
+                    ],
+                );
+            }
+            _ => {
+                if self.mask_buf.is_none() {
+                    let mask_lut = match ANTIALIASING {
+                        AaConfig::Msaa16 => crate::mask::make_mask_lut_16(),
+                        AaConfig::Msaa8 => crate::mask::make_mask_lut(),
+                        _ => unreachable!(),
+                    };
+                    let buf = recording.upload("mask lut", mask_lut);
+                    self.mask_buf = Some(buf.into());
+                }
+                recording.dispatch(
+                    shaders.fine,
+                    fine_wg_count,
+                    [
+                        fine.config_buf,
+                        fine.segments_buf,
+                        fine.ptcl_buf,
+                        fine.info_bin_data_buf,
+                        ResourceProxy::Image(fine.out_image),
+                        fine.gradient_image,
+                        fine.image_atlas,
+                        self.mask_buf.unwrap(),
+                    ],
+                );
+            }
+        }
         recording.free_resource(fine.config_buf);
         recording.free_resource(fine.tile_buf);
         recording.free_resource(fine.segments_buf);
@@ -432,6 +463,10 @@ impl Render {
         recording.free_resource(fine.gradient_image);
         recording.free_resource(fine.image_atlas);
         recording.free_resource(fine.info_bin_data_buf);
+        // TODO: make mask buf persistent
+        if let Some(mask_buf) = self.mask_buf.take() {
+            recording.free_resource(mask_buf);
+        }
     }
 
     /// Get the output image.
diff --git a/src/shaders.rs b/src/shaders.rs
index 86e6ed7b..668dafac 100644
--- a/src/shaders.rs
+++ b/src/shaders.rs
@@ -86,6 +86,8 @@ pub struct FullShaders {
 
 #[cfg(feature = "wgpu")]
 pub fn full_shaders(device: &Device, engine: &mut WgpuEngine) -> Result<FullShaders, Error> {
+    use crate::ANTIALIASING;
+
     let imports = SHARED_SHADERS
         .iter()
         .copied()
@@ -93,6 +95,17 @@ pub fn full_shaders(device: &Device, engine: &mut WgpuEngine) -> Result<FullShad
     let empty = HashSet::new();
     let mut full_config = HashSet::new();
     full_config.insert("full".into());
+    match crate::ANTIALIASING {
+        crate::AaConfig::Msaa16 => {
+            full_config.insert("msaa".into());
+            full_config.insert("msaa16".into());
+        }
+        crate::AaConfig::Msaa8 => {
+            full_config.insert("msaa".into());
+            full_config.insert("msaa8".into());
+        }
+        crate::AaConfig::Area => (),
+    }
     let mut small_config = HashSet::new();
     small_config.insert("full".into());
     small_config.insert("small".into());
@@ -292,20 +305,39 @@ pub fn full_shaders(device: &Device, engine: &mut WgpuEngine) -> Result<FullShad
             BindType::Buffer,
         ],
     )?;
-    let fine = engine.add_shader(
-        device,
-        "fine",
-        preprocess::preprocess(shader!("fine"), &full_config, &imports).into(),
-        &[
-            BindType::Uniform,
-            BindType::BufReadOnly,
-            BindType::BufReadOnly,
-            BindType::BufReadOnly,
-            BindType::Image(ImageFormat::Rgba8),
-            BindType::ImageRead(ImageFormat::Rgba8),
-            BindType::ImageRead(ImageFormat::Rgba8),
-        ],
-    )?;
+    let fine = match ANTIALIASING {
+        crate::AaConfig::Area => engine.add_shader(
+            device,
+            "fine",
+            preprocess::preprocess(shader!("fine"), &full_config, &imports).into(),
+            &[
+                BindType::Uniform,
+                BindType::BufReadOnly,
+                BindType::BufReadOnly,
+                BindType::BufReadOnly,
+                BindType::Image(ImageFormat::Rgba8),
+                BindType::ImageRead(ImageFormat::Rgba8),
+                BindType::ImageRead(ImageFormat::Rgba8),
+            ],
+        )?,
+        _ => {
+            engine.add_shader(
+                device,
+                "fine",
+                preprocess::preprocess(shader!("fine"), &full_config, &imports).into(),
+                &[
+                    BindType::Uniform,
+                    BindType::BufReadOnly,
+                    BindType::BufReadOnly,
+                    BindType::BufReadOnly,
+                    BindType::Image(ImageFormat::Rgba8),
+                    BindType::ImageRead(ImageFormat::Rgba8),
+                    BindType::ImageRead(ImageFormat::Rgba8),
+                    BindType::BufReadOnly, // mask buffer
+                ],
+            )?
+        }
+    };
     Ok(FullShaders {
         pathtag_reduce,
         pathtag_reduce2,