diff --git a/crates/encoding/src/clip.rs b/crates/encoding/src/clip.rs
index 5a4e8f07..1a68fbab 100644
--- a/crates/encoding/src/clip.rs
+++ b/crates/encoding/src/clip.rs
@@ -4,10 +4,18 @@
 use bytemuck::{Pod, Zeroable};
 
 /// Clip stack element.
+///
+/// This is the bicyclic semigroup, a monoid useful for representing
+/// stack depth. There is considerably more detail in the draft paper
+///  [Fast GPU bounding boxes on tree-structured scenes].
+///
+/// [Fast GPU bounding boxes on tree-structured scenes]: https://arxiv.org/abs/2205.11659
 #[derive(Copy, Clone, Pod, Zeroable, Debug, Default)]
 #[repr(C)]
 pub struct ClipBic {
+    /// When interpreted as a stack operation, the number of pop operations.
     pub a: u32,
+    /// When interpreted as a stack operation, the number of push operations.
     pub b: u32,
 }
 
@@ -41,3 +49,20 @@ pub struct Clip {
 pub struct ClipBbox {
     pub bbox: [f32; 4],
 }
+
+impl ClipBic {
+    pub fn new(a: u32, b: u32) -> Self {
+        ClipBic { a, b }
+    }
+
+    /// The bicyclic semigroup operation.
+    ///
+    /// This operation is associative. When interpreted as a stack
+    /// operation, it represents doing the pops of `self`, the pushes of
+    /// `self`, the pops of `other`, and the pushes of `other`. The middle
+    /// two can cancel each other out.
+    pub fn combine(self, other: ClipBic) -> Self {
+        let m = self.b.min(other.a);
+        ClipBic::new(self.a + other.a - m, self.b + other.b - m)
+    }
+}
diff --git a/crates/encoding/src/config.rs b/crates/encoding/src/config.rs
index 93ffe80c..6f24925b 100644
--- a/crates/encoding/src/config.rs
+++ b/crates/encoding/src/config.rs
@@ -1,9 +1,11 @@
 // Copyright 2023 The Vello authors
 // SPDX-License-Identifier: Apache-2.0 OR MIT
 
+use crate::SegmentCount;
+
 use super::{
-    BinHeader, Clip, ClipBbox, ClipBic, ClipElement, Cubic, DrawBbox, DrawMonoid, Layout, Path,
-    PathBbox, PathMonoid, PathSegment, Tile,
+    BinHeader, Clip, ClipBbox, ClipBic, ClipElement, Cubic, DrawBbox, DrawMonoid, Layout, LineSoup,
+    Path, PathBbox, PathMonoid, PathSegment, Tile,
 };
 use bytemuck::{Pod, Zeroable};
 use std::mem;
@@ -14,7 +16,7 @@ const TILE_HEIGHT: u32 = 16;
 // TODO: Obtain these from the vello_shaders crate
 pub(crate) const PATH_REDUCE_WG: u32 = 256;
 const PATH_BBOX_WG: u32 = 256;
-const PATH_COARSE_WG: u32 = 256;
+const FLATTEN_WG: u32 = 256;
 const CLIP_REDUCE_WG: u32 = 256;
 
 /// Counters for tracking dynamic allocation on the GPU.
@@ -29,8 +31,24 @@ pub struct BumpAllocators {
     pub binning: u32,
     pub ptcl: u32,
     pub tile: u32,
+    pub seg_counts: u32,
     pub segments: u32,
     pub blend: u32,
+    pub lines: u32,
+}
+
+/// Storage of indirect dispatch size values.
+///
+/// The original plan was to reuse BumpAllocators, but the WebGPU compatible
+/// usage list rules forbid that being used as indirect counts while also
+/// bound as writable.
+#[derive(Clone, Copy, Debug, Default, Zeroable, Pod)]
+#[repr(C)]
+pub struct IndirectCount {
+    pub count_x: u32,
+    pub count_y: u32,
+    pub count_z: u32,
+    pub pad0: u32,
 }
 
 /// Uniform render configuration data used by all GPU stages.
@@ -114,7 +132,7 @@ pub struct WorkgroupCounts {
     pub path_scan1: WorkgroupSize,
     pub path_scan: WorkgroupSize,
     pub bbox_clear: WorkgroupSize,
-    pub path_seg: WorkgroupSize,
+    pub flatten: WorkgroupSize,
     pub draw_reduce: WorkgroupSize,
     pub draw_leaf: WorkgroupSize,
     pub clip_reduce: WorkgroupSize,
@@ -146,7 +164,7 @@ impl WorkgroupCounts {
             path_tag_wgs
         };
         let draw_object_wgs = (n_draw_objects + PATH_BBOX_WG - 1) / PATH_BBOX_WG;
-        let path_coarse_wgs = (n_path_tags + PATH_COARSE_WG - 1) / PATH_COARSE_WG;
+        let flatten_wgs = (n_path_tags + FLATTEN_WG - 1) / FLATTEN_WG;
         let clip_reduce_wgs = n_clips.saturating_sub(1) / CLIP_REDUCE_WG;
         let clip_wgs = (n_clips + CLIP_REDUCE_WG - 1) / CLIP_REDUCE_WG;
         let path_wgs = (n_paths + PATH_BBOX_WG - 1) / PATH_BBOX_WG;
@@ -159,14 +177,14 @@ impl WorkgroupCounts {
             path_scan1: (reduced_size / PATH_REDUCE_WG, 1, 1),
             path_scan: (path_tag_wgs, 1, 1),
             bbox_clear: (draw_object_wgs, 1, 1),
-            path_seg: (path_coarse_wgs, 1, 1),
+            flatten: (flatten_wgs, 1, 1),
             draw_reduce: (draw_object_wgs, 1, 1),
             draw_leaf: (draw_object_wgs, 1, 1),
             clip_reduce: (clip_reduce_wgs, 1, 1),
             clip_leaf: (clip_wgs, 1, 1),
             binning: (draw_object_wgs, 1, 1),
             tile_alloc: (path_wgs, 1, 1),
-            path_coarse: (path_coarse_wgs, 1, 1),
+            path_coarse: (flatten_wgs, 1, 1),
             backdrop: (path_wgs, 1, 1),
             coarse: (width_in_bins, height_in_bins, 1),
             fine: (width_in_tiles, height_in_tiles, 1),
@@ -248,11 +266,14 @@ pub struct BufferSizes {
     pub clip_bboxes: BufferSize<ClipBbox>,
     pub draw_bboxes: BufferSize<DrawBbox>,
     pub bump_alloc: BufferSize<BumpAllocators>,
+    pub indirect_count: BufferSize<IndirectCount>,
     pub bin_headers: BufferSize<BinHeader>,
     pub paths: BufferSize<Path>,
     // Bump allocated buffers
+    pub lines: BufferSize<LineSoup>,
     pub bin_data: BufferSize<u32>,
     pub tiles: BufferSize<Tile>,
+    pub seg_counts: BufferSize<SegmentCount>,
     pub segments: BufferSize<PathSegment>,
     pub ptcl: BufferSize<u32>,
 }
@@ -284,6 +305,7 @@ impl BufferSizes {
         let clip_bboxes = BufferSize::new(n_clips);
         let draw_bboxes = BufferSize::new(n_paths);
         let bump_alloc = BufferSize::new(1);
+        let indirect_count = BufferSize::new(1);
         let bin_headers = BufferSize::new(draw_object_wgs * 256);
         let n_paths_aligned = align_up(n_paths, 256);
         let paths = BufferSize::new(n_paths_aligned);
@@ -293,6 +315,8 @@ impl BufferSizes {
         // reasonable heuristics.
         let bin_data = BufferSize::new(1 << 18);
         let tiles = BufferSize::new(1 << 21);
+        let lines = BufferSize::new(1 << 21);
+        let seg_counts = BufferSize::new(1 << 21);
         let segments = BufferSize::new(1 << 21);
         let ptcl = BufferSize::new(1 << 23);
         Self {
@@ -311,10 +335,13 @@ impl BufferSizes {
             clip_bboxes,
             draw_bboxes,
             bump_alloc,
+            indirect_count,
+            lines,
             bin_headers,
             paths,
             bin_data,
             tiles,
+            seg_counts,
             segments,
             ptcl,
         }
diff --git a/crates/encoding/src/lib.rs b/crates/encoding/src/lib.rs
index 62a7d6c8..3d24c192 100644
--- a/crates/encoding/src/lib.rs
+++ b/crates/encoding/src/lib.rs
@@ -24,8 +24,8 @@ mod resolve;
 pub use binning::BinHeader;
 pub use clip::{Clip, ClipBbox, ClipBic, ClipElement};
 pub use config::{
-    BufferSize, BufferSizes, BumpAllocators, ConfigUniform, RenderConfig, WorkgroupCounts,
-    WorkgroupSize,
+    BufferSize, BufferSizes, BumpAllocators, ConfigUniform, IndirectCount, RenderConfig,
+    WorkgroupCounts, WorkgroupSize,
 };
 pub use draw::{
     DrawBbox, DrawBeginClip, DrawColor, DrawImage, DrawLinearGradient, DrawMonoid,
@@ -35,7 +35,8 @@ pub use encoding::{Encoding, StreamOffsets};
 pub use math::Transform;
 pub use monoid::Monoid;
 pub use path::{
-    Cubic, Path, PathBbox, PathEncoder, PathMonoid, PathSegment, PathSegmentType, PathTag, Tile,
+    Cubic, LineSoup, Path, PathBbox, PathEncoder, PathMonoid, PathSegment, PathSegmentType,
+    PathTag, SegmentCount, Tile,
 };
 pub use resolve::{resolve_solid_paths_only, Layout};
 
diff --git a/crates/encoding/src/path.rs b/crates/encoding/src/path.rs
index 10e7499d..414ce23e 100644
--- a/crates/encoding/src/path.rs
+++ b/crates/encoding/src/path.rs
@@ -6,6 +6,28 @@ use peniko::kurbo::Shape;
 
 use super::Monoid;
 
+/// Line segment (after flattening, before tiling).
+#[derive(Clone, Copy, Debug, Zeroable, Pod, Default)]
+#[repr(C)]
+pub struct LineSoup {
+    pub path_ix: u32,
+    pub _padding: u32,
+    pub p0: [f32; 2],
+    pub p1: [f32; 2],
+}
+
+/// Line segment (after flattening, before tiling).
+#[derive(Clone, Copy, Debug, Zeroable, Pod, Default)]
+#[repr(C)]
+pub struct SegmentCount {
+    pub line_ix: u32,
+    // This could more accurately be modeled as:
+    //     segment_within_line: u16,
+    //     segment_within_slice: u16,
+    // However, here we mirror the way it's written in WGSL
+    pub counts: u32,
+}
+
 /// Path segment.
 #[derive(Clone, Copy, Debug, Zeroable, Pod, Default)]
 #[repr(C)]
@@ -13,7 +35,7 @@ pub struct PathSegment {
     pub origin: [f32; 2],
     pub delta: [f32; 2],
     pub y_edge: f32,
-    pub next: u32,
+    pub _padding: u32,
 }
 
 /// Path segment type.
@@ -193,7 +215,7 @@ pub struct PathBbox {
 #[repr(C)]
 pub struct Path {
     /// Bounding box in tiles.
-    pub bbox: [f32; 4],
+    pub bbox: [u32; 4],
     /// Offset (in u32s) to tile rectangle.
     pub tiles: u32,
     _padding: [u32; 3],
diff --git a/shader/coarse.wgsl b/shader/coarse.wgsl
index e497c85b..0635443f 100644
--- a/shader/coarse.wgsl
+++ b/shader/coarse.wgsl
@@ -33,7 +33,7 @@ var<storage> info_bin_data: array<u32>;
 var<storage> paths: array<Path>;
 
 @group(0) @binding(6)
-var<storage> tiles: array<Tile>;
+var<storage, read_write> tiles: array<Tile>;
 
 @group(0) @binding(7)
 var<storage, read_write> bump: BumpAllocators;
@@ -82,31 +82,30 @@ fn alloc_cmd(size: u32) {
     }
 }
 
-fn write_path(tile: Tile, linewidth: f32) -> bool {
-    // TODO: take flags
-    alloc_cmd(3u);
-    if linewidth < 0.0 {
-        let even_odd = linewidth < -1.0;
-        if tile.segments != 0u {
-            let fill = CmdFill(tile.segments, tile.backdrop);
-            ptcl[cmd_offset] = CMD_FILL;
-            let segments_and_rule = select(fill.tile << 1u, (fill.tile << 1u) | 1u, even_odd);
-            ptcl[cmd_offset + 1u] = segments_and_rule;
-            ptcl[cmd_offset + 2u] = u32(fill.backdrop);
-            cmd_offset += 3u;
-        } else {
-            if even_odd && (abs(tile.backdrop) & 1) == 0 {
-                return false;
-            }
-            ptcl[cmd_offset] = CMD_SOLID;
-            cmd_offset += 1u;
-        }
+fn write_path(tile: Tile, tile_ix: u32, linewidth: f32) -> bool {
+    let even_odd = linewidth < -1.0;
+    // We overload the "segments" field to store both count (written by
+    // path_count stage) and segment allocation (used by path_tiling and
+    // fine).
+    let n_segs = tile.segment_count_or_ix;
+    if n_segs != 0u {
+        var seg_ix = atomicAdd(&bump.segments, n_segs);
+        tiles[tile_ix].segment_count_or_ix = ~seg_ix;
+        alloc_cmd(4u);
+        ptcl[cmd_offset] = CMD_FILL;
+        let size_and_rule = (n_segs << 1u) | u32(even_odd);
+        let fill = CmdFill(size_and_rule, seg_ix, tile.backdrop);
+        ptcl[cmd_offset + 1u] = fill.size_and_rule;
+        ptcl[cmd_offset + 2u] = fill.seg_data;
+        ptcl[cmd_offset + 3u] = u32(fill.backdrop);
+        cmd_offset += 4u;
     } else {
-        let stroke = CmdStroke(tile.segments, 0.5 * linewidth);
-        ptcl[cmd_offset] = CMD_STROKE;
-        ptcl[cmd_offset + 1u] = stroke.tile;
-        ptcl[cmd_offset + 2u] = bitcast<u32>(stroke.half_width);
-        cmd_offset += 3u;
+        if even_odd && (abs(tile.backdrop) & 1) == 0 {
+            return false;
+        }
+        alloc_cmd(1u);
+        ptcl[cmd_offset] = CMD_SOLID;
+        cmd_offset += 1u;
     }
     return true;
 }
@@ -311,7 +310,7 @@ fn main(
                 let blend = scene[dd];
                 is_blend = blend != BLEND_CLIP;
             }
-            let include_tile = tile.segments != 0u || (tile.backdrop == 0) == is_clip || is_blend;
+            let include_tile = tile.segment_count_or_ix != 0u || (tile.backdrop == 0) == is_clip || is_blend;
             if include_tile {
                 let el_slice = el_ix / 32u;
                 let el_mask = 1u << (el_ix & 31u);
@@ -352,7 +351,7 @@ fn main(
                     // DRAWTAG_FILL_COLOR
                     case 0x44u: {
                         let linewidth = bitcast<f32>(info_bin_data[di]);
-                        if write_path(tile, linewidth) {
+                        if write_path(tile, tile_ix, linewidth) {
                             let rgba_color = scene[dd];
                             write_color(CmdColor(rgba_color));
                         }
@@ -360,7 +359,7 @@ fn main(
                     // DRAWTAG_FILL_LIN_GRADIENT
                     case 0x114u: {
                         let linewidth = bitcast<f32>(info_bin_data[di]);
-                        if write_path(tile, linewidth) {
+                        if write_path(tile, tile_ix, linewidth) {
                             let index = scene[dd];
                             let info_offset = di + 1u;
                             write_grad(CMD_LIN_GRAD, index, info_offset);
@@ -369,7 +368,7 @@ fn main(
                     // DRAWTAG_FILL_RAD_GRADIENT
                     case 0x29cu: {
                         let linewidth = bitcast<f32>(info_bin_data[di]);
-                        if write_path(tile, linewidth) {
+                        if write_path(tile, tile_ix, linewidth) {
                             let index = scene[dd];
                             let info_offset = di + 1u;
                             write_grad(CMD_RAD_GRAD, index, info_offset);
@@ -378,13 +377,13 @@ fn main(
                     // DRAWTAG_FILL_IMAGE
                     case 0x248u: {
                         let linewidth = bitcast<f32>(info_bin_data[di]);
-                        if write_path(tile, linewidth) {                            
+                        if write_path(tile, tile_ix, linewidth) {
                             write_image(di + 1u);
                         }
                     }
                     // DRAWTAG_BEGIN_CLIP
                     case 0x9u: {
-                        if tile.segments == 0u && tile.backdrop == 0 {
+                        if tile.segment_count_or_ix == 0u && tile.backdrop == 0 {
                             clip_zero_depth = clip_depth + 1u;
                         } else {
                             write_begin_clip();
@@ -396,7 +395,7 @@ fn main(
                     // DRAWTAG_END_CLIP
                     case 0x21u: {
                         clip_depth -= 1u;
-                        write_path(tile, -1.0);
+                        write_path(tile, tile_ix, -1.0);
                         let blend = scene[dd];
                         let alpha = bitcast<f32>(scene[dd + 1u]);
                         write_end_clip(CmdEndClip(blend, alpha));
diff --git a/shader/fine.wgsl b/shader/fine.wgsl
index f64f0159..108c88cb 100644
--- a/shader/fine.wgsl
+++ b/shader/fine.wgsl
@@ -41,15 +41,10 @@ var gradients: texture_2d<f32>;
 var image_atlas: texture_2d<f32>;
 
 fn read_fill(cmd_ix: u32) -> CmdFill {
-    let tile = ptcl[cmd_ix + 1u];
-    let backdrop = i32(ptcl[cmd_ix + 2u]);
-    return CmdFill(tile, backdrop);
-}
-
-fn read_stroke(cmd_ix: u32) -> CmdStroke {
-    let tile = ptcl[cmd_ix + 1u];
-    let half_width = bitcast<f32>(ptcl[cmd_ix + 2u]);
-    return CmdStroke(tile, half_width);
+    let size_and_rule = ptcl[cmd_ix + 1u];
+    let seg_data = ptcl[cmd_ix + 2u];
+    let backdrop = i32(ptcl[cmd_ix + 3u]);
+    return CmdFill(size_and_rule, seg_data, backdrop);
 }
 
 fn read_color(cmd_ix: u32) -> CmdColor {
@@ -140,15 +135,17 @@ var output: texture_storage_2d<r8, write>;
 
 let PIXELS_PER_THREAD = 4u;
 
-fn fill_path(tile: Tile, xy: vec2<f32>, even_odd: bool) -> array<f32, PIXELS_PER_THREAD> {
+fn fill_path(fill: CmdFill, xy: vec2<f32>) -> array<f32, PIXELS_PER_THREAD> {
+    let n_segs = fill.size_and_rule >> 1u;
+    let even_odd = (fill.size_and_rule & 1u) != 0u;
     var area: array<f32, PIXELS_PER_THREAD>;
-    let backdrop_f = f32(tile.backdrop);
+    let backdrop_f = f32(fill.backdrop);
     for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
         area[i] = backdrop_f;
     }
-    var segment_ix = tile.segments;
-    while segment_ix != 0u {
-        let segment = segments[segment_ix];
+    for (var i = 0u; i < n_segs; i++) {
+        let seg_off = fill.seg_data + i;
+        let segment = segments[seg_off];
         let y = segment.origin.y - xy.y;
         let y0 = clamp(y, 0.0, 1.0);
         let y1 = clamp(y + segment.delta.y, 0.0, 1.0);
@@ -177,7 +174,6 @@ fn fill_path(tile: Tile, xy: vec2<f32>, even_odd: bool) -> array<f32, PIXELS_PER
         for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
             area[i] += y_edge;
         }
-        segment_ix = segment.next;
     }
     if even_odd {
         // even-odd winding rule
@@ -194,32 +190,6 @@ fn fill_path(tile: Tile, xy: vec2<f32>, even_odd: bool) -> array<f32, PIXELS_PER
     return area;
 }
 
-fn stroke_path(seg: u32, half_width: f32, xy: vec2<f32>) -> array<f32, PIXELS_PER_THREAD> {
-    var df: array<f32, PIXELS_PER_THREAD>;
-    for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
-        df[i] = 1e9;
-    }
-    var segment_ix = seg;
-    while segment_ix != 0u {
-        let segment = segments[segment_ix];
-        let delta = segment.delta;
-        let dpos0 = xy + vec2(0.5, 0.5) - segment.origin;
-        let scale = 1.0 / dot(delta, delta);
-        for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
-            let dpos = vec2(dpos0.x + f32(i), dpos0.y);
-            let t = clamp(dot(dpos, delta) * scale, 0.0, 1.0);
-            // performance idea: hoist sqrt out of loop
-            df[i] = min(df[i], length(delta * t - dpos));
-        }
-        segment_ix = segment.next;
-    }
-    for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
-        // reuse array; return alpha rather than distance
-        df[i] = clamp(half_width + 0.5 - df[i], 0.0, 1.0);
-    }
-    return df;
-}
-
 // The X size should be 16 / PIXELS_PER_THREAD
 @compute @workgroup_size(4, 16)
 fn main(
@@ -250,16 +220,17 @@ fn main(
             // CMD_FILL
             case 1u: {
                 let fill = read_fill(cmd_ix);
-                let segments = fill.tile >> 1u;
-                let even_odd = (fill.tile & 1u) != 0u;
-                let tile = Tile(fill.backdrop, segments);
-                area = fill_path(tile, xy, even_odd);
-                cmd_ix += 3u;
+                area = fill_path(fill, xy);
+                cmd_ix += 4u;
             }
             // CMD_STROKE
             case 2u: {
-                let stroke = read_stroke(cmd_ix);
-                area = stroke_path(stroke.tile, stroke.half_width, xy);
+                // Stroking in fine rasterization is disabled, as strokes will be expanded
+                // to fills earlier in the pipeline. This implementation is a stub, just to
+                // keep the shader from crashing.
+                for (var i = 0u; i < PIXELS_PER_THREAD; i++) {
+                    area[i] = 0.0;
+                }
                 cmd_ix += 3u;
             }
             // CMD_SOLID
diff --git a/shader/pathseg.wgsl b/shader/flatten.wgsl
similarity index 51%
rename from shader/pathseg.wgsl
rename to shader/flatten.wgsl
index bf02419b..e27bb6ed 100644
--- a/shader/pathseg.wgsl
+++ b/shader/flatten.wgsl
@@ -1,20 +1,12 @@
 // SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
 
-// Path segment decoding for the full case.
-
-// In the simple case, path segments are decoded as part of the coarse
-// path rendering stage. In the full case, they are separated, as the
-// decoding process also generates bounding boxes, and those in turn are
-// used for tile allocation and clipping; actual coarse path rasterization
-// can't proceed until those are complete.
-
-// There's some duplication of the decoding code but we won't worry about
-// that just now. Perhaps it could be factored more nicely later.
+// Flatten curves to lines
 
 #import config
 #import pathtag
+#import segment
 #import cubic
-#import transform
+#import bump
 
 @group(0) @binding(0)
 var<uniform> config: Config;
@@ -38,37 +30,133 @@ struct AtomicPathBbox {
 var<storage, read_write> path_bboxes: array<AtomicPathBbox>;
 
 @group(0) @binding(4)
-var<storage, read_write> cubics: array<Cubic>;
-
-// Monoid is yagni, for future optimization
-
-// struct BboxMonoid {
-//     bbox: vec4<f32>,
-//     flags: u32,
-// }
-
-// let FLAG_RESET_BBOX = 1u;
-// let FLAG_SET_BBOX = 2u;
-
-// fn combine_bbox_monoid(a: BboxMonoid, b: BboxMonoid) -> BboxMonoid {
-//     var c: BboxMonoid;
-//     c.bbox = b.bbox;
-//     // TODO: previous-me thought this should be gated on b & SET_BBOX == false also
-//     if (a.flags & FLAG_RESET_BBOX) == 0u && b.bbox.z <= b.bbox.x && b.bbox.w <= b.bbox.y {
-//         c.bbox = a.bbox;
-//     } else if (a.flags & FLAG_RESET_BBOX) == 0u && (b.flags & FLAG_SET_BBOX) == 0u ||
-//         (a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y)
-//     {
-//         c.bbox = vec4<f32>(min(a.bbox.xy, c.bbox.xy), max(a.bbox.xw, c.bbox.zw));
-//     }
-//     c.flags = (a.flags & FLAG_SET_BBOX) | b.flags;
-//     c.flags |= (a.flags & FLAG_RESET_BBOX) << 1u;
-//     return c;
-// }
-
-// fn bbox_monoid_identity() -> BboxMonoid {
-//     return BboxMonoid();
-// }
+var<storage, read_write> bump: BumpAllocators;
+
+@group(0) @binding(5)
+var<storage, read_write> lines: array<LineSoup>;
+
+struct SubdivResult {
+    val: f32,
+    a0: f32,
+    a2: f32,
+}
+
+let D = 0.67;
+fn approx_parabola_integral(x: f32) -> f32 {
+    return x * inverseSqrt(sqrt(1.0 - D + (D * D * D * D + 0.25 * x * x)));
+}
+
+let B = 0.39;
+fn approx_parabola_inv_integral(x: f32) -> f32 {
+    return x * sqrt(1.0 - B + (B * B + 0.5 * x * x));
+}
+
+fn estimate_subdiv(p0: vec2<f32>, p1: vec2<f32>, p2: vec2<f32>, sqrt_tol: f32) -> SubdivResult {
+    let d01 = p1 - p0;
+    let d12 = p2 - p1;
+    let dd = d01 - d12;
+    let cross = (p2.x - p0.x) * dd.y - (p2.y - p0.y) * dd.x;
+    let cross_inv = select(1.0 / cross, 1.0e9, abs(cross) < 1.0e-9);
+    let x0 = dot(d01, dd) * cross_inv;
+    let x2 = dot(d12, dd) * cross_inv;
+    let scale = abs(cross / (length(dd) * (x2 - x0)));
+
+    let a0 = approx_parabola_integral(x0);
+    let a2 = approx_parabola_integral(x2);
+    var val = 0.0;
+    if scale < 1e9 {
+        let da = abs(a2 - a0);
+        let sqrt_scale = sqrt(scale);
+        if sign(x0) == sign(x2) {
+            val = sqrt_scale;
+        } else {
+            let xmin = sqrt_tol / sqrt_scale;
+            val = sqrt_tol / approx_parabola_integral(xmin);
+        }
+        val *= da;
+    }
+    return SubdivResult(val, a0, a2);
+}
+
+fn eval_quad(p0: vec2<f32>, p1: vec2<f32>, p2: vec2<f32>, t: f32) -> vec2<f32> {
+    let mt = 1.0 - t;
+    return p0 * (mt * mt) + (p1 * (mt * 2.0) + p2 * t) * t;
+}
+
+fn eval_cubic(p0: vec2<f32>, p1: vec2<f32>, p2: vec2<f32>, p3: vec2<f32>, t: f32) -> vec2<f32> {
+    let mt = 1.0 - t;
+    return p0 * (mt * mt * mt) + (p1 * (mt * mt * 3.0) + (p2 * (mt * 3.0) + p3 * t) * t) * t;
+}
+
+let MAX_QUADS = 16u;
+
+fn flatten_cubic(cubic: Cubic) {
+    let p0 = cubic.p0;
+    let p1 = cubic.p1;
+    let p2 = cubic.p2;
+    let p3 = cubic.p3;
+    let err_v = 3.0 * (p2 - p1) + p0 - p3;
+    let err = dot(err_v, err_v);
+    let ACCURACY = 0.25;
+    let Q_ACCURACY = ACCURACY * 0.1;
+    let REM_ACCURACY = ACCURACY - Q_ACCURACY;
+    let MAX_HYPOT2 = 432.0 * Q_ACCURACY * Q_ACCURACY;
+    var n_quads = max(u32(ceil(pow(err * (1.0 / MAX_HYPOT2), 1.0 / 6.0))), 1u);
+    n_quads = min(n_quads, MAX_QUADS);
+    var keep_params: array<SubdivResult, MAX_QUADS>;
+    var val = 0.0;
+    var qp0 = p0;
+    let step = 1.0 / f32(n_quads);
+    for (var i = 0u; i < n_quads; i += 1u) {
+        let t = f32(i + 1u) * step;
+        let qp2 = eval_cubic(p0, p1, p2, p3, t);
+        var qp1 = eval_cubic(p0, p1, p2, p3, t - 0.5 * step);
+        qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2);
+        let params = estimate_subdiv(qp0, qp1, qp2, sqrt(REM_ACCURACY));
+        keep_params[i] = params;
+        val += params.val;
+        qp0 = qp2;
+    }
+    let n = max(u32(ceil(val * (0.5 / sqrt(REM_ACCURACY)))), 1u);
+    var lp0 = p0;
+    qp0 = p0;
+    let v_step = val / f32(n);
+    var n_out = 1u;
+    var val_sum = 0.0;
+    for (var i = 0u; i < n_quads; i += 1u) {
+        let t = f32(i + 1u) * step;
+        let qp2 = eval_cubic(p0, p1, p2, p3, t);
+        var qp1 = eval_cubic(p0, p1, p2, p3, t - 0.5 * step);
+        qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2);
+        let params = keep_params[i];
+        let u0 = approx_parabola_inv_integral(params.a0);
+        let u2 = approx_parabola_inv_integral(params.a2);
+        let uscale = 1.0 / (u2 - u0);
+        var val_target = f32(n_out) * v_step;
+        while n_out == n || val_target < val_sum + params.val {
+            var lp1: vec2<f32>;
+            if n_out == n {
+                lp1 = p3;
+            } else {
+                let u = (val_target - val_sum) / params.val;
+                let a = mix(params.a0, params.a2, u);
+                let au = approx_parabola_inv_integral(a);
+                let t = (au - u0) * uscale;
+                lp1 = eval_quad(qp0, qp1, qp2, t);
+            }
+
+            // Output line segment lp0..lp1
+            let line_ix = atomicAdd(&bump.lines, 1u);
+            // TODO: check failure
+            lines[line_ix] = LineSoup(cubic.path_ix, lp0, lp1);
+            n_out += 1u;
+            val_target += v_step;
+            lp0 = lp1;
+        }
+        val_sum += params.val;
+        qp0 = qp2;
+    }
+}
 
 var<private> pathdata_base: u32;
 
@@ -85,6 +173,11 @@ fn read_i16_point(ix: u32) -> vec2<f32> {
     return vec2(x, y);
 }
 
+struct Transform {
+    mat: vec4<f32>,
+    translate: vec2<f32>,
+}
+
 fn read_transform(transform_base: u32, ix: u32) -> Transform {
     let base = transform_base + ix * 6u;
     let c0 = bitcast<f32>(scene[base]);
@@ -93,9 +186,13 @@ fn read_transform(transform_base: u32, ix: u32) -> Transform {
     let c3 = bitcast<f32>(scene[base + 3u]);
     let c4 = bitcast<f32>(scene[base + 4u]);
     let c5 = bitcast<f32>(scene[base + 5u]);
-    let matrx = vec4(c0, c1, c2, c3);
+    let mat = vec4(c0, c1, c2, c3);
     let translate = vec2(c4, c5);
-    return Transform(matrx, translate);
+    return Transform(mat, translate);
+}
+
+fn transform_apply(transform: Transform, p: vec2<f32>) -> vec2<f32> {
+    return transform.mat.xy * p.x + transform.mat.zw * p.y + transform.translate;
 }
 
 fn round_down(x: f32) -> i32 {
@@ -116,6 +213,7 @@ fn main(
     pathdata_base = config.pathdata_base;
     let shift = (ix & 3u) * 8u;
     var tm = reduce_tag(tag_word & ((1u << shift) - 1u));
+    // TODO: this can be a read buf overflow. Conditionalize by tag byte?
     tm = combine_tag_monoid(tag_monoids[ix >> 2u], tm);
     var tag_byte = (tag_word >> shift) & 0xffu;
 
@@ -177,11 +275,11 @@ fn main(
             // See https://www.iquilezles.org/www/articles/ellipses/ellipses.htm
             // This is the correct bounding box, but we're not handling rendering
             // in the isotropic case, so it may mismatch.
-            stroke = 0.5 * linewidth * vec2(length(transform.matrx.xz), length(transform.matrx.yw));
+            stroke = 0.5 * linewidth * vec2(length(transform.mat.xz), length(transform.mat.yw));
             bbox += vec4(-stroke, stroke);
         }
         let flags = u32(linewidth >= 0.0);
-        cubics[global_id.x] = Cubic(p0, p1, p2, p3, stroke, tm.path_ix, flags);
+        flatten_cubic(Cubic(p0, p1, p2, p3, stroke, tm.path_ix, flags));
         // Update bounding box using atomics only. Computing a monoid is a
         // potential future optimization.
         if bbox.z > bbox.x || bbox.w > bbox.y {
diff --git a/shader/path_coarse.wgsl b/shader/path_coarse.wgsl
deleted file mode 100644
index 5f168a63..00000000
--- a/shader/path_coarse.wgsl
+++ /dev/null
@@ -1,309 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
-
-#import config
-#import pathtag
-
-@group(0) @binding(0)
-var<uniform> config: Config;
-
-@group(0) @binding(1)
-var<storage> scene: array<u32>;
-
-@group(0) @binding(2)
-var<storage> tag_monoids: array<TagMonoid>;
-
-#ifdef cubics_out
-@group(0) @binding(3)
-var<storage, read_write> output: array<vec2<f32>>;
-#else
-// We don't get this from import as it's the atomic version
-struct AtomicTile {
-    backdrop: atomic<i32>,
-    segments: atomic<u32>,
-}
-
-#import segment
-
-@group(0) @binding(3)
-var<storage, read_write> tiles: array<AtomicTile>;
-
-@group(0) @binding(4)
-var<storage, read_write> segments: array<Segment>;
-#endif
-
-var<private> pathdata_base: u32;
-
-fn read_f32_point(ix: u32) -> vec2<f32> {
-    let x = bitcast<f32>(scene[pathdata_base + ix]);
-    let y = bitcast<f32>(scene[pathdata_base + ix + 1u]);
-    return vec2(x, y);
-}
-
-fn read_i16_point(ix: u32) -> vec2<f32> {
-    let raw = scene[pathdata_base + ix];
-    let x = f32(i32(raw << 16u) >> 16u);
-    let y = f32(i32(raw) >> 16u);
-    return vec2(x, y);
-}
-
-#ifndef cubics_out
-struct SubdivResult {
-    val: f32,
-    a0: f32,
-    a2: f32,
-}
-
-let D = 0.67;
-fn approx_parabola_integral(x: f32) -> f32 {
-    return x * inverseSqrt(sqrt(1.0 - D + (D * D * D * D + 0.25 * x * x)));
-}
-
-let B = 0.39;
-fn approx_parabola_inv_integral(x: f32) -> f32 {
-    return x * sqrt(1.0 - B + (B * B + 0.5 * x * x));
-}
-
-fn estimate_subdiv(p0: vec2<f32>, p1: vec2<f32>, p2: vec2<f32>, sqrt_tol: f32) -> SubdivResult {
-    let d01 = p1 - p0;
-    let d12 = p2 - p1;
-    let dd = d01 - d12;
-    let cross = (p2.x - p0.x) * dd.y - (p2.y - p0.y) * dd.x;
-    let cross_inv = 1.0 / cross;
-    let x0 = dot(d01, dd) * cross_inv;
-    let x2 = dot(d12, dd) * cross_inv;
-    let scale = abs(cross / (length(dd) * (x2 - x0)));
-
-    let a0 = approx_parabola_integral(x0);
-    let a2 = approx_parabola_integral(x2);
-    var val = 0.0;
-    if scale < 1e9 {
-        let da = abs(a2 - a0);
-        let sqrt_scale = sqrt(scale);
-        if sign(x0) == sign(x2) {
-            val = sqrt_scale;
-        } else {
-            let xmin = sqrt_tol / sqrt_scale;
-            val = sqrt_tol / approx_parabola_integral(xmin);
-        }
-        val *= da;
-    }
-    return SubdivResult(val, a0, a2);
-}
-
-fn eval_quad(p0: vec2<f32>, p1: vec2<f32>, p2: vec2<f32>, t: f32) -> vec2<f32> {
-    let mt = 1.0 - t;
-    return p0 * (mt * mt) + (p1 * (mt * 2.0) + p2 * t) * t;
-}
-
-fn eval_cubic(p0: vec2<f32>, p1: vec2<f32>, p2: vec2<f32>, p3: vec2<f32>, t: f32) -> vec2<f32> {
-    let mt = 1.0 - t;
-    return p0 * (mt * mt * mt) + (p1 * (mt * mt * 3.0) + (p2 * (mt * 3.0) + p3 * t) * t) * t;
-}
-
-fn alloc_segment() -> u32 {
-    // Use 0-index segment (address is sentinel) as counter
-    // TODO: separate small buffer binding for this?
-    return atomicAdd(&tiles[4096].segments, 1u) + 1u;
-}
-#endif
-
-let MAX_QUADS = 16u;
-
-@compute @workgroup_size(256)
-fn main(
-    @builtin(global_invocation_id) global_id: vec3<u32>,
-    @builtin(local_invocation_id) local_id: vec3<u32>,
-) {
-    // Obtain exclusive prefix sum of tag monoid
-    let ix = global_id.x;
-    let tag_word = scene[config.pathtag_base + (ix >> 2u)];
-    pathdata_base = config.pathdata_base;
-    let shift = (ix & 3u) * 8u;
-    var tm = reduce_tag(tag_word & ((1u << shift) - 1u));
-    tm = combine_tag_monoid(tag_monoids[ix >> 2u], tm);
-    var tag_byte = (tag_word >> shift) & 0xffu;
-    // should be extractBits(tag_word, shift, 8)?
-
-    // Decode path data
-    let seg_type = tag_byte & PATH_TAG_SEG_TYPE;
-    if seg_type != 0u {
-        var p0: vec2<f32>;
-        var p1: vec2<f32>;
-        var p2: vec2<f32>;
-        var p3: vec2<f32>;
-        if (tag_byte & PATH_TAG_F32) != 0u {
-            p0 = read_f32_point(tm.pathseg_offset);
-            p1 = read_f32_point(tm.pathseg_offset + 2u);
-            if seg_type >= PATH_TAG_QUADTO {
-                p2 = read_f32_point(tm.pathseg_offset + 4u);
-                if seg_type == PATH_TAG_CUBICTO {
-                    p3 = read_f32_point(tm.pathseg_offset + 6u);
-                }
-            }
-        } else {
-            p0 = read_i16_point(tm.pathseg_offset);
-            p1 = read_i16_point(tm.pathseg_offset + 1u);
-            if seg_type >= PATH_TAG_QUADTO {
-                p2 = read_i16_point(tm.pathseg_offset + 2u);
-                if seg_type == PATH_TAG_CUBICTO {
-                    p3 = read_i16_point(tm.pathseg_offset + 3u);
-                }
-            }
-        }
-        // TODO: transform goes here
-        // Degree-raise
-        if seg_type == PATH_TAG_LINETO {
-            p3 = p1;
-            p2 = mix(p3, p0, 1.0 / 3.0);
-            p1 = mix(p0, p3, 1.0 / 3.0);
-        } else if seg_type == PATH_TAG_QUADTO {
-            p3 = p2;
-            p2 = mix(p1, p2, 1.0 / 3.0);
-            p1 = mix(p1, p0, 1.0 / 3.0);
-        }
-#ifdef cubics_out
-        let out_ix = ix * 4u;
-        output[out_ix] = p0;
-        output[out_ix + 1u] = p1;
-        output[out_ix + 2u] = p2;
-        output[out_ix + 3u] = p3;
-#else
-        let err_v = 3.0 * (p2 - p1) + p0 - p3;
-        let err = dot(err_v, err_v);
-        let ACCURACY = 0.25;
-        let Q_ACCURACY = ACCURACY * 0.1;
-        let REM_ACCURACY = (ACCURACY - Q_ACCURACY);
-        let MAX_HYPOT2 = 432.0 * Q_ACCURACY * Q_ACCURACY;
-        var n_quads = max(u32(ceil(pow(err * (1.0 / MAX_HYPOT2), 1.0 / 6.0))), 1u);
-        n_quads = min(n_quads, MAX_QUADS);
-        var keep_params: array<SubdivResult, MAX_QUADS>;
-        var val = 0.0;
-        var qp0 = p0;
-        let step = 1.0 / f32(n_quads);
-        for (var i = 0u; i < n_quads; i += 1u) {
-            let t = f32(i + 1u) * step;
-            let qp2 = eval_cubic(p0, p1, p2, p3, t);
-            var qp1 = eval_cubic(p0, p1, p2, p3, t - 0.5 * step);
-            qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2);
-            let params = estimate_subdiv(qp0, qp1, qp2, sqrt(REM_ACCURACY));
-            keep_params[i] = params;
-            val += params.val;
-            qp0 = qp2;
-        }
-        let n = max(u32(ceil(val * (0.5 / sqrt(REM_ACCURACY)))), 1u);
-        var lp0 = p0;
-        qp0 = p0;
-        let v_step = val / f32(n);
-        var n_out = 1u;
-        var val_sum = 0.0;
-        for (var i = 0u; i < n_quads; i += 1u) {
-            let t = f32(i + 1u) * step;
-            let qp2 = eval_cubic(p0, p1, p2, p3, t);
-            var qp1 = eval_cubic(p0, p1, p2, p3, t - 0.5 * step);
-            qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2);
-            let params = keep_params[i];
-            let u0 = approx_parabola_inv_integral(params.a0);
-            let u2 = approx_parabola_inv_integral(params.a2);
-            let uscale = 1.0 / (u2 - u0);
-            var val_target = f32(n_out) * v_step;
-            while n_out == n || val_target < val_sum + params.val {
-                var lp1: vec2<f32>;
-                if n_out == n {
-                    lp1 = p3;
-                } else {
-                    let u = (val_target - val_sum) / params.val;
-                    let a = mix(params.a0, params.a2, u);
-                    let au = approx_parabola_inv_integral(a);
-                    let t = (au - u0) * uscale;
-                    lp1 = eval_quad(qp0, qp1, qp2, t);
-                }
-
-                // Output line segment lp0..lp1
-                let xymin = min(lp0, lp1);
-                let xymax = max(lp0, lp1);
-                let dp = lp1 - lp0;
-                let recip_dx = 1.0 / dp.x;
-                let invslope = select(dp.x / dp.y, 1.0e9, abs(dp.y) < 1.0e-9);
-                let c = 0.5 * abs(invslope);
-                let b = invslope;
-                let SX = 1.0 / f32(TILE_WIDTH);
-                let SY = 1.0 / f32(TILE_HEIGHT);
-                let a = (lp0.x - (lp0.y - 0.5 * f32(TILE_HEIGHT)) * b) * SX;
-                var x0 = i32(floor(xymin.x * SX));
-                var x1 = i32(floor(xymax.x * SX) + 1.0);
-                var y0 = i32(floor(xymin.y * SY));
-                var y1 = i32(floor(xymax.y * SY) + 1.0);
-                x0 = clamp(x0, 0, i32(config.width_in_tiles));
-                x1 = clamp(x1, 0, i32(config.width_in_tiles));
-                y0 = clamp(y0, 0, i32(config.height_in_tiles));
-                y1 = clamp(y1, 0, i32(config.height_in_tiles));
-                var xc = a + b * f32(y0);
-                var xray = i32(floor(lp0.x * SX));
-                var last_xray = i32(floor(lp1.x * SX));
-                if dp.y < 0.0 {
-                    let tmp = xray;
-                    xray = last_xray;
-                    last_xray = tmp;
-                }
-                for (var y = y0; y < y1; y += 1) {
-                    let tile_y0 = f32(y) * f32(TILE_HEIGHT);
-                    let xbackdrop = max(xray + 1, 0);
-                    if xymin.y < tile_y0 && xbackdrop < i32(config.width_in_tiles) {
-                        let backdrop = select(-1, 1, dp.y < 0.0);
-                        let tile_ix = y * i32(config.width_in_tiles) + xbackdrop;
-                        atomicAdd(&tiles[tile_ix].backdrop, backdrop);
-                    }
-                    var next_xray = last_xray;
-                    if y + 1 < y1 {
-                        let tile_y1 = f32(y + 1) * f32(TILE_HEIGHT);
-                        let x_edge = lp0.x + (tile_y1 - lp0.y) * invslope;
-                        next_xray = i32(floor(x_edge * SX));
-                    }
-                    let min_xray = min(xray, next_xray);
-                    let max_xray = max(xray, next_xray);
-                    var xx0 = min(i32(floor(xc - c)), min_xray);
-                    var xx1 = max(i32(ceil(xc + c)), max_xray + 1);
-                    xx0 = clamp(xx0, x0, x1);
-                    xx1 = clamp(xx1, x0, x1);
-                    var tile_seg: Segment;
-                    for (var x = xx0; x < xx1; x += 1) {
-                        let tile_x0 = f32(x) * f32(TILE_WIDTH);
-                        let tile_ix = y * i32(config.width_in_tiles) + x;
-                        // allocate segment, insert linked list
-                        let seg_ix = alloc_segment();
-                        let old = atomicExchange(&tiles[tile_ix].segments, seg_ix);
-                        tile_seg.origin = lp0;
-                        tile_seg.delta = dp;
-                        var y_edge = mix(lp0.y, lp1.y, (tile_x0 - lp0.x) * recip_dx);
-                        if xymin.x < tile_x0 {
-                            let p = vec2(tile_x0, y_edge);
-                            if dp.x < 0.0 {
-                                tile_seg.delta = p - lp0;
-                            } else {
-                                tile_seg.origin = p;
-                                tile_seg.delta = lp1 - p;
-                            }
-                            if tile_seg.delta.x == 0.0 {
-                                tile_seg.delta.x = sign(dp.x) * 1e-9;
-                            }
-                        }
-                        if x <= min_xray || max_xray < x {
-                            y_edge = 1e9;
-                        }
-                        tile_seg.y_edge = y_edge;
-                        tile_seg.next = old;
-                        segments[seg_ix] = tile_seg;
-                    }
-                    xc += b;
-                    xray = next_xray;
-                }
-                n_out += 1u;
-                val_target += v_step;
-                lp0 = lp1;
-            }
-            val_sum += params.val;
-            qp0 = qp2;
-        }
-#endif
-    }
-}
diff --git a/shader/path_coarse_full.wgsl b/shader/path_coarse_full.wgsl
deleted file mode 100644
index cf03301b..00000000
--- a/shader/path_coarse_full.wgsl
+++ /dev/null
@@ -1,273 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
-
-// Path coarse rasterization for the full implementation.
-
-#import config
-#import pathtag
-#import tile
-#import segment
-#import cubic
-#import bump
-
-@group(0) @binding(0)
-var<uniform> config: Config;
-
-@group(0) @binding(1)
-var<storage> scene: array<u32>;
-
-@group(0) @binding(2)
-var<storage> cubics: array<Cubic>;
-
-@group(0) @binding(3)
-var<storage> paths: array<Path>;
-
-// We don't get this from import as it's the atomic version
-struct AtomicTile {
-    backdrop: atomic<i32>,
-    segments: atomic<u32>,
-}
-
-@group(0) @binding(4)
-var<storage, read_write> bump: BumpAllocators;
-
-@group(0) @binding(5)
-var<storage, read_write> tiles: array<AtomicTile>;
-
-@group(0) @binding(6)
-var<storage, read_write> segments: array<Segment>;
-
-struct SubdivResult {
-    val: f32,
-    a0: f32,
-    a2: f32,
-}
-
-let D = 0.67;
-fn approx_parabola_integral(x: f32) -> f32 {
-    return x * inverseSqrt(sqrt(1.0 - D + (D * D * D * D + 0.25 * x * x)));
-}
-
-let B = 0.39;
-fn approx_parabola_inv_integral(x: f32) -> f32 {
-    return x * sqrt(1.0 - B + (B * B + 0.5 * x * x));
-}
-
-fn estimate_subdiv(p0: vec2<f32>, p1: vec2<f32>, p2: vec2<f32>, sqrt_tol: f32) -> SubdivResult {
-    let d01 = p1 - p0;
-    let d12 = p2 - p1;
-    let dd = d01 - d12;
-    let cross = (p2.x - p0.x) * dd.y - (p2.y - p0.y) * dd.x;
-    let cross_inv = select(1.0 / cross, 1.0e9, abs(cross) < 1.0e-9);
-    let x0 = dot(d01, dd) * cross_inv;
-    let x2 = dot(d12, dd) * cross_inv;
-    let scale = abs(cross / (length(dd) * (x2 - x0)));
-
-    let a0 = approx_parabola_integral(x0);
-    let a2 = approx_parabola_integral(x2);
-    var val = 0.0;
-    if scale < 1e9 {
-        let da = abs(a2 - a0);
-        let sqrt_scale = sqrt(scale);
-        if sign(x0) == sign(x2) {
-            val = sqrt_scale;
-        } else {
-            let xmin = sqrt_tol / sqrt_scale;
-            val = sqrt_tol / approx_parabola_integral(xmin);
-        }
-        val *= da;
-    }
-    return SubdivResult(val, a0, a2);
-}
-
-fn eval_quad(p0: vec2<f32>, p1: vec2<f32>, p2: vec2<f32>, t: f32) -> vec2<f32> {
-    let mt = 1.0 - t;
-    return p0 * (mt * mt) + (p1 * (mt * 2.0) + p2 * t) * t;
-}
-
-fn eval_cubic(p0: vec2<f32>, p1: vec2<f32>, p2: vec2<f32>, p3: vec2<f32>, t: f32) -> vec2<f32> {
-    let mt = 1.0 - t;
-    return p0 * (mt * mt * mt) + (p1 * (mt * mt * 3.0) + (p2 * (mt * 3.0) + p3 * t) * t) * t;
-}
-
-fn alloc_segment() -> u32 {
-    var offset = atomicAdd(&bump.segments, 1u) + 1u;
-    if offset + 1u > config.segments_size {
-        offset = 0u;
-        atomicOr(&bump.failed, STAGE_PATH_COARSE);
-    }
-    return offset;
-}
-
-let MAX_QUADS = 16u;
-
-@compute @workgroup_size(256)
-fn main(
-    @builtin(global_invocation_id) global_id: vec3<u32>,
-) {
-    // Exit early if prior stages failed, as we can't run this stage.
-    // We need to check only prior stages, as if this stage has failed in another workgroup, 
-    // we still want to know this workgroup's memory requirement.   
-    if (atomicLoad(&bump.failed) & (STAGE_BINNING | STAGE_TILE_ALLOC)) != 0u {
-        return;
-    }
-    let ix = global_id.x;
-    let tag_word = scene[config.pathtag_base + (ix >> 2u)];
-    let shift = (ix & 3u) * 8u;
-    var tag_byte = (tag_word >> shift) & 0xffu;
-
-    if (tag_byte & PATH_TAG_SEG_TYPE) != 0u {
-        // Discussion question: it might actually be cheaper to do the path segment
-        // decoding & transform again rather than store the result in a buffer;
-        // classic memory vs ALU tradeoff.
-        let cubic = cubics[global_id.x];
-        let path = paths[cubic.path_ix];
-        let is_stroke = (cubic.flags & CUBIC_IS_STROKE) != 0u;
-        let bbox = vec4<i32>(path.bbox);
-        let p0 = cubic.p0;
-        let p1 = cubic.p1;
-        let p2 = cubic.p2;
-        let p3 = cubic.p3;
-        let err_v = 3.0 * (p2 - p1) + p0 - p3;
-        let err = dot(err_v, err_v);
-        let ACCURACY = 0.25;
-        let Q_ACCURACY = ACCURACY * 0.1;
-        let REM_ACCURACY = (ACCURACY - Q_ACCURACY);
-        let MAX_HYPOT2 = 432.0 * Q_ACCURACY * Q_ACCURACY;
-        var n_quads = max(u32(ceil(pow(err * (1.0 / MAX_HYPOT2), 1.0 / 6.0))), 1u);
-        n_quads = min(n_quads, MAX_QUADS);
-        var keep_params: array<SubdivResult, MAX_QUADS>;
-        var val = 0.0;
-        var qp0 = p0;
-        let step = 1.0 / f32(n_quads);
-        for (var i = 0u; i < n_quads; i += 1u) {
-            let t = f32(i + 1u) * step;
-            let qp2 = eval_cubic(p0, p1, p2, p3, t);
-            var qp1 = eval_cubic(p0, p1, p2, p3, t - 0.5 * step);
-            qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2);
-            let params = estimate_subdiv(qp0, qp1, qp2, sqrt(REM_ACCURACY));
-            keep_params[i] = params;
-            val += params.val;
-            qp0 = qp2;
-        }
-        let n = max(u32(ceil(val * (0.5 / sqrt(REM_ACCURACY)))), 1u);
-        var lp0 = p0;
-        qp0 = p0;
-        let v_step = val / f32(n);
-        var n_out = 1u;
-        var val_sum = 0.0;
-        for (var i = 0u; i < n_quads; i += 1u) {
-            let t = f32(i + 1u) * step;
-            let qp2 = eval_cubic(p0, p1, p2, p3, t);
-            var qp1 = eval_cubic(p0, p1, p2, p3, t - 0.5 * step);
-            qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2);
-            let params = keep_params[i];
-            let u0 = approx_parabola_inv_integral(params.a0);
-            let u2 = approx_parabola_inv_integral(params.a2);
-            let uscale = 1.0 / (u2 - u0);
-            var val_target = f32(n_out) * v_step;
-            while n_out == n || val_target < val_sum + params.val {
-                var lp1: vec2<f32>;
-                if n_out == n {
-                    lp1 = p3;
-                } else {
-                    let u = (val_target - val_sum) / params.val;
-                    let a = mix(params.a0, params.a2, u);
-                    let au = approx_parabola_inv_integral(a);
-                    let t = (au - u0) * uscale;
-                    lp1 = eval_quad(qp0, qp1, qp2, t);
-                }
-
-                // Output line segment lp0..lp1
-                let xymin = min(lp0, lp1) - cubic.stroke;
-                let xymax = max(lp0, lp1) + cubic.stroke;
-                let dp = lp1 - lp0;
-                let recip_dx = 1.0 / dp.x;
-                let invslope = select(dp.x / dp.y, 1.0e9, abs(dp.y) < 1.0e-9);
-                let SX = 1.0 / f32(TILE_WIDTH);
-                let SY = 1.0 / f32(TILE_HEIGHT);
-                let c = (cubic.stroke.x + abs(invslope) * (0.5 * f32(TILE_HEIGHT) + cubic.stroke.y)) * SX;
-                let b = invslope;
-                let a = (lp0.x - (lp0.y - 0.5 * f32(TILE_HEIGHT)) * b) * SX;
-                var x0 = i32(floor(xymin.x * SX));
-                var x1 = i32(floor(xymax.x * SX) + 1.0);
-                var y0 = i32(floor(xymin.y * SY));
-                var y1 = i32(floor(xymax.y * SY) + 1.0);
-                x0 = clamp(x0, bbox.x, bbox.z);
-                x1 = clamp(x1, bbox.x, bbox.z);
-                y0 = clamp(y0, bbox.y, bbox.w);
-                y1 = clamp(y1, bbox.y, bbox.w);
-                var xc = a + b * f32(y0);
-                let stride = bbox.z - bbox.x;
-                var base = i32(path.tiles) + (y0 - bbox.y) * stride - bbox.x;
-                var xray = i32(floor(lp0.x * SX));
-                var last_xray = i32(floor(lp1.x * SX));
-                if dp.y < 0.0 {
-                    let tmp = xray;
-                    xray = last_xray;
-                    last_xray = tmp;
-                }
-                for (var y = y0; y < y1; y += 1) {
-                    let tile_y0 = f32(y) * f32(TILE_HEIGHT);
-                    let xbackdrop = max(xray + 1, bbox.x);
-                    if !is_stroke && xymin.y < tile_y0 && xbackdrop < bbox.z {
-                        let backdrop = select(-1, 1, dp.y < 0.0);
-                        let tile_ix = base + xbackdrop;
-                        atomicAdd(&tiles[tile_ix].backdrop, backdrop);
-                    }
-                    var next_xray = last_xray;
-                    if y + 1 < y1 {
-                        let tile_y1 = f32(y + 1) * f32(TILE_HEIGHT);
-                        let x_edge = lp0.x + (tile_y1 - lp0.y) * invslope;
-                        next_xray = i32(floor(x_edge * SX));
-                    }
-                    let min_xray = min(xray, next_xray);
-                    let max_xray = max(xray, next_xray);
-                    var xx0 = min(i32(floor(xc - c)), min_xray);
-                    var xx1 = max(i32(ceil(xc + c)), max_xray + 1);
-                    xx0 = clamp(xx0, x0, x1);
-                    xx1 = clamp(xx1, x0, x1);
-                    var tile_seg: Segment;
-                    for (var x = xx0; x < xx1; x += 1) {
-                        let tile_x0 = f32(x) * f32(TILE_WIDTH);
-                        let tile_ix = base + x;
-                        // allocate segment, insert linked list
-                        let seg_ix = alloc_segment();
-                        let old = atomicExchange(&tiles[tile_ix].segments, seg_ix);
-                        tile_seg.origin = lp0;
-                        tile_seg.delta = dp;
-                        var y_edge = 0.0;
-                        if !is_stroke {
-                            y_edge = mix(lp0.y, lp1.y, (tile_x0 - lp0.x) * recip_dx);
-                            if xymin.x < tile_x0 {
-                                let p = vec2(tile_x0, y_edge);
-                                if dp.x < 0.0 {
-                                    tile_seg.delta = p - lp0;
-                                } else {
-                                    tile_seg.origin = p;
-                                    tile_seg.delta = lp1 - p;
-                                }
-                                if tile_seg.delta.x == 0.0 {
-                                    tile_seg.delta.x = sign(dp.x) * 1e-9;
-                                }
-                            }
-                            if x <= min_xray || max_xray < x {
-                                y_edge = 1e9;
-                            }
-                        }
-                        tile_seg.y_edge = y_edge;
-                        tile_seg.next = old;
-                        segments[seg_ix] = tile_seg;
-                    }
-                    xc += b;
-                    base += stride;
-                    xray = next_xray;
-                }
-                n_out += 1u;
-                val_target += v_step;
-                lp0 = lp1;
-            }
-            val_sum += params.val;
-            qp0 = qp2;
-        }
-    }
-}
diff --git a/shader/path_count.wgsl b/shader/path_count.wgsl
new file mode 100644
index 00000000..e34ff7a9
--- /dev/null
+++ b/shader/path_count.wgsl
@@ -0,0 +1,189 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+// Stage to compute counts of number of segments in each tile
+
+#import bump
+#import config
+#import segment
+#import tile
+
+@group(0) @binding(0)
+var<uniform> config: Config;
+
+// TODO: this is cut'n'pasted from path_coarse.
+struct AtomicTile {
+    backdrop: atomic<i32>,
+    segment_count_or_ix: atomic<u32>,
+}
+
+@group(0) @binding(1)
+var<storage, read_write> bump: BumpAllocators;
+
+@group(0) @binding(2)
+var<storage> lines: array<LineSoup>;
+
+@group(0) @binding(3)
+var<storage> paths: array<Path>;
+
+@group(0) @binding(4)
+var<storage, read_write> tile: array<AtomicTile>;
+
+@group(0) @binding(5)
+var<storage, read_write> seg_counts: array<SegmentCount>;
+
+// number of integer cells spanned by interval defined by a, b
+fn span(a: f32, b: f32) -> u32 {
+    return u32(max(ceil(max(a, b)) - floor(min(a, b)), 1.0));
+}
+
+// Note regarding clipping to bounding box:
+//
+// We have to do the backdrop bumps for all tiles to the left of the bbox.
+// This should probably be a separate loop. This also causes a numerical
+// robustness issue.
+
+// This shader is dispatched with one thread for each line.
+@compute @workgroup_size(256)
+fn main(
+    @builtin(global_invocation_id) global_id: vec3<u32>,
+) {
+    let n_lines = atomicLoad(&bump.lines);
+    var count = 0u;
+    if global_id.x < n_lines {
+        let line = lines[global_id.x];
+        // coarse rasterization logic to count number of tiles touched by line
+        let is_down = line.p1.y >= line.p0.y;
+        let xy0 = select(line.p1, line.p0, is_down);
+        let xy1 = select(line.p0, line.p1, is_down);
+        let s0 = xy0 * TILE_SCALE;
+        let s1 = xy1 * TILE_SCALE;
+        count = span(s0.x, s1.x) + span(s0.y, s1.y) - 1u;
+        let line_ix = global_id.x;
+
+        let dx = abs(s1.x - s0.x);
+        let dy = s1.y - s0.y;
+        if dx + dy == 0.0 {
+            // Zero-length segment, drop it. Note, this could be culled in the
+            // flattening stage, but eliding the test here would be fragile, as
+            // it would be pretty bad to let it slip through.
+            return;
+        }
+        if dy == 0.0 && floor(s0.y) == s0.y {
+            return;
+        }
+        let idxdy = 1.0 / (dx + dy);
+        let a = dx * idxdy;
+        let is_positive_slope = s1.x >= s0.x;
+        let sign = select(-1.0, 1.0, is_positive_slope);
+        let xt0 = floor(s0.x * sign);
+        let c = s0.x * sign - xt0;
+        let y0 = floor(s0.y);
+        let ytop = select(y0 + 1.0, ceil(s0.y), s0.y == s1.y);
+        let b = (dy * c + dx * (ytop - s0.y)) * idxdy;
+        let x0 = xt0 * sign + select(-1.0, 0.0, is_positive_slope);
+
+        let path = paths[line.path_ix];
+        let bbox = vec4<i32>(path.bbox);
+        let xmin = min(s0.x, s1.x);
+        // If line is to left of bbox, we may still need to do backdrop
+        let stride = bbox.z - bbox.x;
+        if s0.y >= f32(bbox.w) || s1.y <= f32(bbox.y) || xmin >= f32(bbox.z) || stride == 0 {
+            return;
+        }
+        // Clip line to bounding box. Clipping is done in "i" space.
+        var imin = 0u;
+        if s0.y < f32(bbox.y) {
+            var iminf = round((f32(bbox.y) - y0 + b - a) / (1.0 - a)) - 1.0;
+            // Numerical robustness: goal is to find the first i value for which
+            // the following predicate is false. Above formula is designed to
+            // undershoot by 0.5.
+            if y0 + iminf - floor(a * iminf + b) < f32(bbox.y) {
+                iminf += 1.0;
+            }
+            imin = u32(iminf);
+        }
+        var imax = count;
+        if s1.y > f32(bbox.w) {
+            var imaxf = round((f32(bbox.w) - y0 + b - a) / (1.0 - a)) - 1.0;
+            if y0 + imaxf - floor(a * imaxf + b) < f32(bbox.w) {
+                imaxf += 1.0;
+            }
+            imax = u32(imaxf);
+        }
+        let delta = select(1, -1, is_down);
+        var ymin = 0;
+        var ymax = 0;
+        if max(s0.x, s1.x) <= f32(bbox.x) {
+            ymin = i32(ceil(s0.y));
+            ymax = i32(ceil(s1.y));
+            imax = imin;
+        } else {
+            let fudge = select(1.0, 0.0, is_positive_slope);
+            if xmin < f32(bbox.x) {
+                var f = round((sign * (f32(bbox.x) - x0) - b + fudge) / a);
+                if (x0 + sign * floor(a * f + b) < f32(bbox.x)) == is_positive_slope {
+                    f += 1.0;
+                }
+                let ynext = i32(y0 + f - floor(a * f + b) + 1.0);
+                if is_positive_slope {
+                    if u32(f) > imin {
+                        ymin = i32(y0 + select(1.0, 0.0, y0 == s0.y));
+                        ymax = ynext;
+                        imin = u32(f);
+                    }
+                } else {
+                    if u32(f) < imax {
+                        ymin = ynext;
+                        ymax = i32(ceil(s1.y));
+                        imax = u32(f);
+                    }
+                }
+            }
+            if max(s0.x, s1.x) > f32(bbox.z) {
+                var f = round((sign * (f32(bbox.z) - x0) - b + fudge) / a);
+                if (x0 + sign * floor(a * f + b) < f32(bbox.z)) == is_positive_slope {
+                    f += 1.0;
+                }
+                if is_positive_slope {
+                    imax = min(imax, u32(f));
+                } else {
+                    imin = max(imin, u32(f));
+                }
+            }
+        }
+        imax = max(imin, imax);
+        // Apply backdrop for part of line left of bbox
+        ymin = max(ymin, bbox.y);
+        ymax = min(ymax, bbox.w);
+        for (var y = ymin; y < ymax; y++) {
+            let base = i32(path.tiles) + (y - bbox.y) * stride;
+            atomicAdd(&tile[base].backdrop, delta);
+        }
+        var last_z = floor(a * (f32(imin) - 1.0) + b);
+        let seg_base = atomicAdd(&bump.seg_counts, imax - imin);
+        for (var i = imin; i < imax; i++) {
+            let subix = i;
+            // coarse rasterization logic
+            // Note: we hope fast-math doesn't strength reduce this.
+            let zf = a * f32(subix) + b;
+            let z = floor(zf);
+            // x, y are tile coordinates relative to render target
+            let y = i32(y0 + f32(subix) - z);
+            let x = i32(x0 + sign * z);
+            let base = i32(path.tiles) + (y - bbox.y) * stride - bbox.x;
+            let top_edge = select(last_z == z, y0 == s0.y, subix == 0u);
+            if top_edge && x + 1 < bbox.z {
+                let x_bump = max(x + 1, bbox.x);
+                atomicAdd(&tile[base + x_bump].backdrop, delta);
+            }
+            let seg_within_slice = atomicAdd(&tile[base + x].segment_count_or_ix, 1u);
+            // Pack two count values into a single u32
+            let counts = (seg_within_slice << 16u) | subix;
+            let seg_count = SegmentCount(line_ix, counts);
+            seg_counts[seg_base + i - imin] = seg_count;
+            // Note: since we're iterating, we have a reliable value for
+            // last_z.
+            last_z = z;
+        }
+    }
+}
diff --git a/shader/path_count_setup.wgsl b/shader/path_count_setup.wgsl
new file mode 100644
index 00000000..95af29c3
--- /dev/null
+++ b/shader/path_count_setup.wgsl
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+// Set up dispatch size for path count stage.
+
+#import bump
+
+@group(0) @binding(0)
+var<storage> bump: BumpAllocators;
+
+@group(0) @binding(1)
+var<storage, read_write> indirect: IndirectCount;
+
+// Partition size for path count stage
+let WG_SIZE = 256u;
+
+@compute @workgroup_size(1)
+fn main() {
+    let lines = atomicLoad(&bump.lines);
+    indirect.count_x = (lines + (WG_SIZE - 1u)) / WG_SIZE;
+    indirect.count_y = 1u;
+    indirect.count_z = 1u;
+}
diff --git a/shader/path_tiling.wgsl b/shader/path_tiling.wgsl
new file mode 100644
index 00000000..ace01ab9
--- /dev/null
+++ b/shader/path_tiling.wgsl
@@ -0,0 +1,129 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+// Write path segments
+
+#import bump
+#import config
+#import segment
+#import tile
+
+@group(0) @binding(0)
+var<storage> bump: BumpAllocators;
+
+@group(0) @binding(1)
+var<storage> seg_counts: array<SegmentCount>;
+
+@group(0) @binding(2)
+var<storage> lines: array<LineSoup>;
+
+@group(0) @binding(3)
+var<storage> paths: array<Path>;
+
+@group(0) @binding(4)
+var<storage> tiles: array<Tile>;
+
+@group(0) @binding(5)
+var<storage, read_write> segments: array<Segment>;
+
+fn span(a: f32, b: f32) -> u32 {
+    return u32(max(ceil(max(a, b)) - floor(min(a, b)), 1.0));
+}
+
+// One invocation for each tile that is to be written.
+// Total number of invocations = bump.seg_counts
+@compute @workgroup_size(256)
+fn main(
+    @builtin(global_invocation_id) global_id: vec3<u32>,
+) {
+    let n_segments = atomicLoad(&bump.seg_counts);
+    if global_id.x < n_segments {
+        let seg_count = seg_counts[global_id.x];
+        let line = lines[seg_count.line_ix];
+        let counts = seg_count.counts;
+        let seg_within_slice = counts >> 16u;
+        let seg_within_line = counts & 0xffffu;
+
+        // coarse rasterization logic
+        let is_down = line.p1.y >= line.p0.y;
+        var xy0 = select(line.p1, line.p0, is_down);
+        var xy1 = select(line.p0, line.p1, is_down);
+        let s0 = xy0 * TILE_SCALE;
+        let s1 = xy1 * TILE_SCALE;
+        let count = span(s0.x, s1.x) + span(s0.y, s1.y) - 1u;
+        let dx = abs(s1.x - s0.x);
+        let dy = s1.y - s0.y;
+        // Division by zero can't happen because zero-length lines
+        // have already been discarded in the path_count stage.
+        let idxdy = 1.0 / (dx + dy);
+        let a = dx * idxdy;
+        let is_positive_slope = s1.x >= s0.x;
+        let sign = select(-1.0, 1.0, is_positive_slope);
+        let xt0 = floor(s0.x * sign);
+        let c = s0.x * sign - xt0;
+        let y0i = floor(s0.y);
+        let ytop = select(y0i + 1.0, ceil(s0.y), s0.y == s1.y);
+        let b = (dy * c + dx * (ytop - s0.y)) * idxdy;
+        let x0i = i32(xt0 * sign + 0.5 * (sign - 1.0));
+        let z = floor(a * f32(seg_within_line) + b);
+        let x = x0i + i32(sign * z);
+        let y = i32(y0i + f32(seg_within_line) - z);
+
+        let path = paths[line.path_ix];
+        let bbox = vec4<i32>(path.bbox);
+        let stride = bbox.z - bbox.x;
+        let tile_ix = i32(path.tiles) + (y - bbox.y) * stride + x - bbox.x;
+        let tile = tiles[tile_ix];
+        let seg_start = ~tile.segment_count_or_ix;
+        if i32(seg_start) < 0 {
+            return;
+        }
+        let tile_xy = vec2(f32(x) * f32(TILE_WIDTH), f32(y) * f32(TILE_HEIGHT));
+        let tile_xy1 = tile_xy + vec2(f32(TILE_WIDTH), f32(TILE_HEIGHT));
+
+        if seg_within_line > 0u {
+            let z_prev = floor(a * (f32(seg_within_line) - 1.0) + b);
+            if z == z_prev {
+                // Top edge is clipped
+                var xt = xy0.x + (xy1.x - xy0.x) * (tile_xy.y - xy0.y) / (xy1.y - xy0.y);
+                // TODO: we want to switch to tile-relative coordinates
+                xt = clamp(xt, tile_xy.x + 1e-3, tile_xy1.x);
+                xy0 = vec2(xt, tile_xy.y);
+            } else {
+                // If is_positive_slope, left edge is clipped, otherwise right
+                let x_clip = select(tile_xy1.x, tile_xy.x, is_positive_slope);
+                var yt = xy0.y + (xy1.y - xy0.y) * (x_clip - xy0.x) / (xy1.x - xy0.x);
+                yt = clamp(yt, tile_xy.y + 1e-3, tile_xy1.y);
+                xy0 = vec2(x_clip, yt);
+            }
+        }
+        if seg_within_line < count - 1u {
+            let z_next = floor(a * (f32(seg_within_line) + 1.0) + b);
+            if z == z_next {
+                // Bottom edge is clipped
+                var xt = xy0.x + (xy1.x - xy0.x) * (tile_xy1.y - xy0.y) / (xy1.y - xy0.y);
+                xt = clamp(xt, tile_xy.x + 1e-3, tile_xy1.x);
+                xy1 = vec2(xt, tile_xy1.y);
+            } else {
+                // If is_positive_slope, right edge is clipped, otherwise left
+                let x_clip = select(tile_xy.x, tile_xy1.x, is_positive_slope);
+                var yt = xy0.y + (xy1.y - xy0.y) * (x_clip - xy0.x) / (xy1.x - xy0.x);
+                yt = clamp(yt, tile_xy.y + 1e-3, tile_xy1.y);
+                xy1 = vec2(x_clip, yt);
+            }
+        }
+        // See comments in CPU version of shader
+        var y_edge = 1e9;
+        if xy0.x == tile_xy.x {
+            y_edge = xy0.y;
+        } else if xy1.x == tile_xy.x {
+            y_edge = xy1.y;
+        }
+        if !is_down {
+            let tmp = xy0;
+            xy0 = xy1;
+            xy1 = tmp;
+        }
+        let segment = Segment(xy0, xy1 - xy0, y_edge);
+        segments[seg_start + seg_within_slice] = segment;
+    }
+}
diff --git a/shader/path_tiling_setup.wgsl b/shader/path_tiling_setup.wgsl
new file mode 100644
index 00000000..722be191
--- /dev/null
+++ b/shader/path_tiling_setup.wgsl
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+// Set up dispatch size for path tiling stage.
+
+#import bump
+
+@group(0) @binding(0)
+var<storage> bump: BumpAllocators;
+
+@group(0) @binding(1)
+var<storage, read_write> indirect: IndirectCount;
+
+// Partition size for path tiling stage
+let WG_SIZE = 256u;
+
+@compute @workgroup_size(1)
+fn main() {
+    let segments = atomicLoad(&bump.seg_counts);
+    indirect.count_x = (segments + (WG_SIZE - 1u)) / WG_SIZE;
+    indirect.count_y = 1u;
+    indirect.count_z = 1u;
+}
diff --git a/shader/shared/bump.wgsl b/shader/shared/bump.wgsl
index 4864f313..6c5789f1 100644
--- a/shader/shared/bump.wgsl
+++ b/shader/shared/bump.wgsl
@@ -6,13 +6,21 @@ let STAGE_TILE_ALLOC: u32 = 0x2u;
 let STAGE_PATH_COARSE: u32 = 0x4u;
 let STAGE_COARSE: u32 = 0x8u;
 
-// This must be kept in sync with the struct in src/render.rs
+// This must be kept in sync with the struct in config.rs in the encoding crate.
 struct BumpAllocators {
     // Bitmask of stages that have failed allocation.
     failed: atomic<u32>,
     binning: atomic<u32>,
     ptcl: atomic<u32>,
     tile: atomic<u32>,
+    seg_counts: atomic<u32>,
     segments: atomic<u32>,
     blend: atomic<u32>,
+    lines: atomic<u32>,
+}
+
+struct IndirectCount {
+    count_x: u32,
+    count_y: u32,
+    count_z: u32,
 }
diff --git a/shader/shared/config.wgsl b/shader/shared/config.wgsl
index 4ac718c3..1e553f9a 100644
--- a/shader/shared/config.wgsl
+++ b/shader/shared/config.wgsl
@@ -48,6 +48,9 @@ let N_TILE_Y = 16u;
 //let N_TILE = N_TILE_X * N_TILE_Y;
 let N_TILE = 256u;
 
+// Not currently supporting non-square tiles
+let TILE_SCALE = 0.0625;
+
 let BLEND_STACK_SPLIT = 4u;
 
 // The following are computed in draw_leaf from the generic gradient parameters
diff --git a/shader/shared/ptcl.wgsl b/shader/shared/ptcl.wgsl
index d5fc5d4f..cf759a5b 100644
--- a/shader/shared/ptcl.wgsl
+++ b/shader/shared/ptcl.wgsl
@@ -25,7 +25,8 @@ let CMD_JUMP = 11u;
 // hand in the relevant shaders
 
 struct CmdFill {
-    tile: u32,
+    size_and_rule: u32,
+    seg_data: u32,
     backdrop: i32,
 }
 
diff --git a/shader/shared/segment.wgsl b/shader/shared/segment.wgsl
index 7d035926..9fe13447 100644
--- a/shader/shared/segment.wgsl
+++ b/shader/shared/segment.wgsl
@@ -1,8 +1,33 @@
 // SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
 
+// Segments laid out for contiguous storage
 struct Segment {
     origin: vec2<f32>,
     delta: vec2<f32>,
     y_edge: f32,
-    next: u32,
+}
+
+// A line segment produced by flattening and ready for rasterization.
+//
+// The name is perhaps too playful, but reflects the fact that these
+// lines are completely unordered. They will flow through coarse path
+// rasterization, then the per-tile segments will be scatter-written into
+// segment storage so that each (tile, path) tuple gets a contiguous
+// slice of segments.
+struct LineSoup {
+    path_ix: u32,
+    // Note: this creates an alignment gap. Don't worry about
+    // this now, but maybe move to scalars.
+    p0: vec2<f32>,
+    p1: vec2<f32>,
+}
+
+// An intermediate data structure for sorting tile segments.
+struct SegmentCount {
+    // Reference to element of LineSoup array
+    line_ix: u32,
+    // Two count values packed into a single u32
+    // Lower 16 bits: index of segment within line
+    // Upper 16 bits: index of segment within segment slice
+    counts: u32,
 }
diff --git a/shader/shared/tile.wgsl b/shader/shared/tile.wgsl
index 7af8a625..063a5bc1 100644
--- a/shader/shared/tile.wgsl
+++ b/shader/shared/tile.wgsl
@@ -11,5 +11,10 @@ struct Path {
 
 struct Tile {
     backdrop: i32,
-    segments: u32,
+    // This is used for the count of the number of segments in the
+    // tile up to coarse rasterization, and the index afterwards.
+    // In the latter variant, the bits are inverted so that tiling
+    // can detect whether the tile was allocated; it's best to
+    // consider this an enum packed into a u32.
+    segment_count_or_ix: u32,
 }
diff --git a/src/lib.rs b/src/lib.rs
index f45bdcf2..55147acd 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -251,7 +251,9 @@ impl Renderer {
     ) -> Result<Option<BumpAllocators>> {
         let mut render = Render::new();
         let encoding = scene.data();
-        let recording = render.render_encoding_coarse(encoding, &self.shaders, params, true);
+        // TODO: turn this on; the download feature interacts with CPU dispatch
+        let robust = false;
+        let recording = render.render_encoding_coarse(encoding, &self.shaders, params, robust);
         let target = render.out_image();
         let bump_buf = render.bump_buf();
         self.engine.run_recording(
diff --git a/src/render.rs b/src/render.rs
index 03e64844..46256363 100644
--- a/src/render.rs
+++ b/src/render.rs
@@ -190,17 +190,21 @@ impl Render {
             wg_counts.bbox_clear,
             [config_buf, path_bbox_buf],
         );
-        let cubic_buf =
-            ResourceProxy::new_buf(buffer_sizes.cubics.size_in_bytes().into(), "cubic_buf");
+        let bump_buf = BufProxy::new(buffer_sizes.bump_alloc.size_in_bytes().into(), "bump_buf");
+        recording.clear_all(bump_buf);
+        let bump_buf = ResourceProxy::Buf(bump_buf);
+        let lines_buf =
+            ResourceProxy::new_buf(buffer_sizes.lines.size_in_bytes().into(), "lines_buf");
         recording.dispatch(
-            shaders.pathseg,
-            wg_counts.path_seg,
+            shaders.flatten,
+            wg_counts.flatten,
             [
                 config_buf,
                 scene_buf,
                 tagmonoid_buf,
                 path_bbox_buf,
-                cubic_buf,
+                bump_buf,
+                lines_buf,
             ],
         );
         let draw_reduced_buf = ResourceProxy::new_buf(
@@ -273,13 +277,10 @@ impl Render {
             buffer_sizes.draw_bboxes.size_in_bytes().into(),
             "draw_bbox_buf",
         );
-        let bump_buf = BufProxy::new(buffer_sizes.bump_alloc.size_in_bytes().into(), "bump_buf");
         let bin_header_buf = ResourceProxy::new_buf(
             buffer_sizes.bin_headers.size_in_bytes().into(),
             "bin_header_buf",
         );
-        recording.clear_all(bump_buf);
-        let bump_buf = ResourceProxy::Buf(bump_buf);
         recording.dispatch(
             shaders.binning,
             wg_counts.binning,
@@ -314,21 +315,33 @@ impl Render {
             ],
         );
         recording.free_resource(draw_bbox_buf);
+        recording.free_resource(tagmonoid_buf);
+        let indirect_count_buf = BufProxy::new(
+            buffer_sizes.indirect_count.size_in_bytes().into(),
+            "indirect_count",
+        );
         recording.dispatch(
-            shaders.path_coarse,
-            wg_counts.path_coarse,
+            shaders.path_count_setup,
+            (1, 1, 1),
+            [bump_buf, indirect_count_buf.into()],
+        );
+        let seg_counts_buf = ResourceProxy::new_buf(
+            buffer_sizes.seg_counts.size_in_bytes().into(),
+            "seg_counts_buf",
+        );
+        recording.dispatch_indirect(
+            shaders.path_count,
+            indirect_count_buf,
+            0,
             [
                 config_buf,
-                scene_buf,
-                cubic_buf,
-                path_buf,
                 bump_buf,
+                lines_buf,
+                path_buf,
                 tile_buf,
-                segments_buf,
+                seg_counts_buf,
             ],
         );
-        recording.free_resource(tagmonoid_buf);
-        recording.free_resource(cubic_buf);
         recording.dispatch(
             shaders.backdrop,
             wg_counts.backdrop,
@@ -349,6 +362,27 @@ impl Render {
                 ptcl_buf,
             ],
         );
+        recording.dispatch(
+            shaders.path_tiling_setup,
+            (1, 1, 1),
+            [bump_buf, indirect_count_buf.into()],
+        );
+        recording.dispatch_indirect(
+            shaders.path_tiling,
+            indirect_count_buf,
+            0,
+            [
+                bump_buf,
+                seg_counts_buf,
+                lines_buf,
+                path_buf,
+                tile_buf,
+                segments_buf,
+            ],
+        );
+        recording.free_buf(indirect_count_buf);
+        recording.free_resource(seg_counts_buf);
+        recording.free_resource(lines_buf);
         recording.free_resource(scene_buf);
         recording.free_resource(draw_monoid_buf);
         recording.free_resource(bin_header_buf);
diff --git a/src/shaders.rs b/src/shaders.rs
index b55f158c..23a3950f 100644
--- a/src/shaders.rs
+++ b/src/shaders.rs
@@ -65,16 +65,19 @@ pub struct FullShaders {
     pub pathtag_scan: ShaderId,
     pub pathtag_scan_large: ShaderId,
     pub bbox_clear: ShaderId,
-    pub pathseg: ShaderId,
+    pub flatten: ShaderId,
     pub draw_reduce: ShaderId,
     pub draw_leaf: ShaderId,
     pub clip_reduce: ShaderId,
     pub clip_leaf: ShaderId,
     pub binning: ShaderId,
     pub tile_alloc: ShaderId,
-    pub path_coarse: ShaderId,
     pub backdrop: ShaderId,
+    pub path_count_setup: ShaderId,
+    pub path_count: ShaderId,
     pub coarse: ShaderId,
+    pub path_tiling_setup: ShaderId,
+    pub path_tiling: ShaderId,
     pub fine: ShaderId,
 }
 
@@ -147,16 +150,17 @@ pub fn full_shaders(
         preprocess::preprocess(shader!("bbox_clear"), &empty, &imports).into(),
         &[BindType::Uniform, BindType::Buffer],
     )?;
-    let pathseg = engine.add_shader(
+    let flatten = engine.add_shader(
         device,
-        "pathseg",
-        preprocess::preprocess(shader!("pathseg"), &full_config, &imports).into(),
+        "flatten",
+        preprocess::preprocess(shader!("flatten"), &full_config, &imports).into(),
         &[
             BindType::Uniform,
             BindType::BufReadOnly,
             BindType::BufReadOnly,
             BindType::Buffer,
             BindType::Buffer,
+            BindType::Buffer,
         ],
     )?;
     let draw_reduce = engine.add_shader(
@@ -232,17 +236,21 @@ pub fn full_shaders(
             BindType::Buffer,
         ],
     )?;
-
-    let path_coarse = engine.add_shader(
+    let path_count_setup = engine.add_shader(
+        device,
+        "path_count_setup",
+        preprocess::preprocess(shader!("path_count_setup"), &empty, &imports).into(),
+        &[BindType::BufReadOnly, BindType::Buffer],
+    )?;
+    let path_count = engine.add_shader(
         device,
-        "path_coarse_full",
-        preprocess::preprocess(shader!("path_coarse_full"), &full_config, &imports).into(),
+        "path_count",
+        preprocess::preprocess(shader!("path_count"), &full_config, &imports).into(),
         &[
             BindType::Uniform,
+            BindType::Buffer,
             BindType::BufReadOnly,
             BindType::BufReadOnly,
-            BindType::BufReadOnly,
-            BindType::Buffer,
             BindType::Buffer,
             BindType::Buffer,
         ],
@@ -264,9 +272,28 @@ pub fn full_shaders(
             BindType::BufReadOnly,
             BindType::BufReadOnly,
             BindType::BufReadOnly,
-            BindType::BufReadOnly,
             BindType::Buffer,
             BindType::Buffer,
+            BindType::Buffer,
+        ],
+    )?;
+    let path_tiling_setup = engine.add_shader(
+        device,
+        "path_tiling_setup",
+        preprocess::preprocess(shader!("path_tiling_setup"), &empty, &imports).into(),
+        &[BindType::BufReadOnly, BindType::Buffer],
+    )?;
+    let path_tiling = engine.add_shader(
+        device,
+        "path_tiling",
+        preprocess::preprocess(shader!("path_tiling"), &empty, &imports).into(),
+        &[
+            BindType::BufReadOnly,
+            BindType::BufReadOnly,
+            BindType::BufReadOnly,
+            BindType::BufReadOnly,
+            BindType::BufReadOnly,
+            BindType::Buffer,
         ],
     )?;
     let fine = engine.add_shader(
@@ -290,16 +317,19 @@ pub fn full_shaders(
         pathtag_scan1,
         pathtag_scan_large,
         bbox_clear,
-        pathseg,
+        flatten,
         draw_reduce,
         draw_leaf,
         clip_reduce,
         clip_leaf,
         binning,
         tile_alloc,
-        path_coarse,
+        path_count_setup,
+        path_count,
         backdrop,
         coarse,
+        path_tiling_setup,
+        path_tiling,
         fine,
     })
 }