diff --git a/shader/flatten.wgsl b/shader/flatten.wgsl
index bdd1a55e0..160876f1a 100644
--- a/shader/flatten.wgsl
+++ b/shader/flatten.wgsl
@@ -52,7 +52,7 @@ fn approx_parabola_inv_integral(x: f32) -> f32 {
     return x * sqrt(1.0 - B + (B * B + 0.5 * x * x));
 }
 
-fn estimate_subdiv(p0: vec2f, p1: vec2f, p2: vec2f, sqrt_tol: f32) -> SubdivResult {
+fn estimate_subdiv(p0: vec2f, p1: vec2f, p2: vec2f, sqrt_tol: f32, transform_scale: f32) -> SubdivResult {
     let d01 = p1 - p0;
     let d12 = p2 - p1;
     let dd = d01 - d12;
@@ -60,7 +60,7 @@ fn estimate_subdiv(p0: vec2f, p1: vec2f, p2: vec2f, sqrt_tol: f32) -> SubdivResu
     let cross_inv = select(1.0 / cross, 1.0e9, abs(cross) < 1.0e-9);
     let x0 = dot(d01, dd) * cross_inv;
     let x2 = dot(d12, dd) * cross_inv;
-    let scale = abs(cross / (length(dd) * (x2 - x0)));
+    let scale = abs(transform_scale * cross / (length(dd) * (x2 - x0)));
 
     let a0 = approx_parabola_integral(x0);
     let a2 = approx_parabola_integral(x2);
@@ -128,7 +128,7 @@ fn cubic_end_normal(p0: vec2f, p1: vec2f, p2: vec2f, p3: vec2f) -> vec2f {
 
 let MAX_QUADS = 16u;
 
-fn flatten_cubic(cubic: Cubic) {
+fn flatten_cubic(cubic: Cubic, transform: Transform, offset: f32) {
     let p0 = cubic.p0;
     let p1 = cubic.p1;
     let p2 = cubic.p2;
@@ -139,7 +139,9 @@ fn flatten_cubic(cubic: Cubic) {
     let Q_ACCURACY = ACCURACY * 0.1;
     let REM_ACCURACY = ACCURACY - Q_ACCURACY;
     let MAX_HYPOT2 = 432.0 * Q_ACCURACY * Q_ACCURACY;
-    var n_quads = max(u32(ceil(pow(err * (1.0 / MAX_HYPOT2), 1.0 / 6.0))), 1u);
+    let scale = vec2(length(transform.mat.xz), length(transform.mat.yw));
+    let scale_factor = max(scale.x, scale.y);
+    var n_quads = max(u32(ceil(pow(err * (1.0 / MAX_HYPOT2), 1.0 / 6.0)) * scale_factor), 1u);
     n_quads = min(n_quads, MAX_QUADS);
     var keep_params: array<SubdivResult, MAX_QUADS>;
     var val = 0.0;
@@ -151,20 +153,16 @@ fn flatten_cubic(cubic: Cubic) {
         var qp1 = eval_cubic(p0, p1, p2, p3, t - 0.5 * step);
         qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2);
 
-        // HACK: this increases subdivision count as a function of the stroke width for shitty
-        // strokes. This isn't systematic or correct and shouldn't be relied on in the long term.
-        var tol = sqrt(REM_ACCURACY);
-        if cubic.flags == CUBIC_IS_STROKE {
-            tol *= min(1000., dot(cubic.stroke, cubic.stroke));
-        }
-        let params = estimate_subdiv(qp0, qp1, qp2, tol);
+        // TODO: Estimate an accurate subdivision count for strokes, handling cusps.
+        let tol = sqrt(REM_ACCURACY);
+        let params = estimate_subdiv(qp0, qp1, qp2, tol, scale_factor);
         keep_params[i] = params;
         val += params.val;
         qp0 = qp2;
     }
 
-    // HACK: normal vector used to offset line segments for shitty stroke handling.
-    var n0 = cubic_start_normal(p0, p1, p2, p3) * cubic.stroke;
+    // Normal vector to calculate the start point of the offset curve.
+    var n0 = offset * cubic_start_normal(p0, p1, p2, p3);
 
     let n = max(u32(ceil(val * (0.5 / sqrt(REM_ACCURACY)))), 1u);
     var lp0 = p0;
@@ -208,16 +206,15 @@ fn flatten_cubic(cubic: Cubic) {
                 } else {
                     n1 = eval_quad_normal(qp0, qp1, qp2, t1);
                 }
-                n1 *= cubic.stroke;
-                let line_ix = atomicAdd(&bump.lines, 2u);
-                lines[line_ix]      = LineSoup(cubic.path_ix, lp0 + n0, lp1 + n1);
-                lines[line_ix + 1u] = LineSoup(cubic.path_ix, lp1 - n1, lp0 - n0);
+                n1 *= offset;
+                output_two_lines_with_transform(cubic.path_ix,
+                                                lp0 + n0, lp1 + n1,
+                                                lp1 - n1, lp0 - n0,
+                                                transform);
                 n0 = n1;
             } else {
                 // Output line segment lp0..lp1
-                let line_ix = atomicAdd(&bump.lines, 1u);
-                // TODO: check failure
-                lines[line_ix] = LineSoup(cubic.path_ix, lp0, lp1);
+                output_line_with_transform(cubic.path_ix, lp0, lp1, transform);
             }
             n_out += 1u;
             val_target += v_step;
@@ -231,14 +228,15 @@ fn flatten_cubic(cubic: Cubic) {
 fn draw_join(
     stroke: vec2f, path_ix: u32, style_flags: u32, p0: vec2f,
     tan_prev: vec2f, tan_next: vec2f,
-    n_prev: vec2f, n_next: vec2f
-) -> vec4f {
-    var miter_pt_bbox = vec4(1e31, 1e31, -1e31, -1e31);
+    n_prev: vec2f, n_next: vec2f,
+    transform: Transform,
+) {
     switch style_flags & STYLE_FLAGS_JOIN_MASK {
         case /*STYLE_FLAGS_JOIN_BEVEL*/0u: {
-            let line_ix = atomicAdd(&bump.lines, 2u);
-            lines[line_ix]      = LineSoup(path_ix, p0 + n_prev, p0 + n_next);
-            lines[line_ix + 1u] = LineSoup(path_ix, p0 - n_next, p0 - n_prev);
+            output_two_lines_with_transform(path_ix,
+                                            p0 + n_prev, p0 + n_next,
+                                            p0 - n_next, p0 - n_prev,
+                                            transform);
         }
         case /*STYLE_FLAGS_JOIN_MITER*/0x10000000u: {
             let c = tan_prev.x * tan_next.y - tan_prev.y * tan_next.x;
@@ -260,33 +258,27 @@ fn draw_join(
                 let v = fp_this - fp_last;
                 let h = (tan_prev.x * v.y - tan_prev.y * v.x) / c;
                 let miter_pt = fp_this - tan_next * h;
+                output_line_with_transform(path_ix, p, miter_pt, transform);
 
-                let line_ix = atomicAdd(&bump.lines, 1u);
-                lines[line_ix] = LineSoup(path_ix, p, miter_pt);
                 if is_backside {
                     back0 = miter_pt;
                 } else {
                     front0 = miter_pt;
                 }
-                miter_pt_bbox = vec4(miter_pt, miter_pt);
             }
-            let line_ix = atomicAdd(&bump.lines, 2u);
-            lines[line_ix]      = LineSoup(path_ix, front0, front1);
-            lines[line_ix + 1u] = LineSoup(path_ix, back0, back1);
+            output_two_lines_with_transform(path_ix, front0, front1, back0, back1, transform);
         }
         case /*STYLE_FLAGS_JOIN_ROUND*/0x20000000u: {
             // TODO: round join
-            let line_ix = atomicAdd(&bump.lines, 2u);
-            lines[line_ix]      = LineSoup(path_ix, p0 + n_prev, p0 + n_next);
-            lines[line_ix + 1u] = LineSoup(path_ix, p0 - n_next, p0 - n_prev);
+            output_two_lines_with_transform(path_ix,
+                                            p0 + n_prev, p0 + n_next,
+                                            p0 - n_next, p0 - n_prev,
+                                            transform);
         }
         default: {}
     }
-    return miter_pt_bbox;
 }
 
-var<private> pathdata_base: u32;
-
 fn read_f32_point(ix: u32) -> vec2f {
     let x = bitcast<f32>(scene[pathdata_base + ix]);
     let y = bitcast<f32>(scene[pathdata_base + ix + 1u]);
@@ -352,7 +344,7 @@ struct CubicPoints {
     p3: vec2f,
 }
 
-fn read_path_segment(tag: PathTagData, transform: Transform, is_stroke: bool) -> CubicPoints {
+fn read_path_segment(tag: PathTagData, is_stroke: bool) -> CubicPoints {
     var p0: vec2f;
     var p1: vec2f;
     var p2: vec2f;
@@ -391,12 +383,9 @@ fn read_path_segment(tag: PathTagData, transform: Transform, is_stroke: bool) ->
         // This is encoded this way because encoding this as a lineto would require adding a moveto,
         // which would terminate the subpath too early (by setting the SUBPATH_END on the
         // segment preceding the cap marker). This scheme is only used for strokes.
-        p0 = transform_apply(transform, p1);
-        p1 = transform_apply(transform, p2);
+        p0 = p1;
+        p1 = p2;
         seg_type = PATH_TAG_LINETO;
-    } else {
-        p0 = transform_apply(transform, p0);
-        p1 = transform_apply(transform, p1);
     }
 
     // Degree-raise
@@ -404,20 +393,47 @@ fn read_path_segment(tag: PathTagData, transform: Transform, is_stroke: bool) ->
         p3 = p1;
         p2 = mix(p3, p0, 1.0 / 3.0);
         p1 = mix(p0, p3, 1.0 / 3.0);
-    } else if seg_type >= PATH_TAG_QUADTO {
-        p2 = transform_apply(transform, p2);
-        if seg_type == PATH_TAG_CUBICTO {
-            p3 = transform_apply(transform, p3);
-        } else {
-            p3 = p2;
-            p2 = mix(p1, p2, 1.0 / 3.0);
-            p1 = mix(p1, p0, 1.0 / 3.0);
-        }
+    } else if seg_type == PATH_TAG_QUADTO {
+        p3 = p2;
+        p2 = mix(p1, p2, 1.0 / 3.0);
+        p1 = mix(p1, p0, 1.0 / 3.0);
     }
 
     return CubicPoints(p0, p1, p2, p3);
 }
 
+fn output_line(path_ix: u32, p0: vec2f, p1: vec2f) {
+    let line_ix = atomicAdd(&bump.lines, 1u);
+    bbox = vec4(min(bbox.xy, min(p0, p1)), max(bbox.zw, max(p0, p1)));
+    lines[line_ix] = LineSoup(path_ix, p0, p1);
+}
+
+fn output_line_with_transform(path_ix: u32, p0: vec2f, p1: vec2f, transform: Transform) {
+    let line_ix = atomicAdd(&bump.lines, 1u);
+    let tp0 = transform_apply(transform, p0);
+    let tp1 = transform_apply(transform, p1);
+    bbox = vec4(min(bbox.xy, min(tp0, tp1)), max(bbox.zw, max(tp0, tp1)));
+    lines[line_ix] = LineSoup(path_ix, tp0, tp1);
+}
+
+fn output_two_lines_with_transform(
+    path_ix: u32,
+    p00: vec2f, p01: vec2f,
+    p10: vec2f, p11: vec2f,
+    transform: Transform
+) {
+    let line_ix = atomicAdd(&bump.lines, 2u);
+    let tp00 = transform_apply(transform, p00);
+    let tp01 = transform_apply(transform, p01);
+    let tp10 = transform_apply(transform, p10);
+    let tp11 = transform_apply(transform, p11);
+
+    bbox = vec4(min(bbox.xy, min(tp00, tp01)), max(bbox.zw, max(tp00, tp01)));
+    bbox = vec4(min(bbox.xy, min(tp10, tp11)), max(bbox.zw, max(tp10, tp11)));
+    lines[line_ix]      = LineSoup(path_ix, tp00, tp01);
+    lines[line_ix + 1u] = LineSoup(path_ix, tp10, tp11);
+}
+
 struct NeighboringSegment {
     do_join: bool,
     p0: vec2f,
@@ -428,8 +444,7 @@ struct NeighboringSegment {
 
 fn read_neighboring_segment(ix: u32) -> NeighboringSegment {
     let tag = compute_tag_monoid(ix);
-    let transform = read_transform(config.transform_base, tag.monoid.trans_ix);
-    let pts = read_path_segment(tag, transform, true);
+    let pts = read_path_segment(tag, true);
 
     let is_closed = (tag.tag_byte & PATH_TAG_SEG_TYPE) == PATH_TAG_LINETO;
     let is_stroke_cap_marker = (tag.tag_byte & PATH_TAG_SUBPATH_END) != 0u;
@@ -439,6 +454,13 @@ fn read_neighboring_segment(ix: u32) -> NeighboringSegment {
     return NeighboringSegment(do_join, p0, tangent);
 }
 
+// `pathdata_base` is decoded once and reused by helpers above.
+var<private> pathdata_base: u32;
+
+// This is the bounding box of the shape flattened by a single shader invocation. This is adjusted
+// as lines are generated.
+var<private> bbox: vec4f;
+
 @compute @workgroup_size(256)
 fn main(
     @builtin(global_invocation_id) global_id: vec3<u32>,
@@ -446,6 +468,7 @@ fn main(
 ) {
     let ix = global_id.x;
     pathdata_base = config.pathdata_base;
+    bbox = vec4(1e31, 1e31, -1e31, -1e31);
 
     let tag = compute_tag_monoid(ix);
     let path_ix = tag.monoid.path_ix;
@@ -465,56 +488,48 @@ fn main(
     if seg_type != 0u {
         let is_stroke = (style_flags & STYLE_FLAGS_STYLE) != 0u;
         let transform = read_transform(config.transform_base, trans_ix);
-        let pts = read_path_segment(tag, transform, is_stroke);
-        var bbox = vec4(min(pts.p0, pts.p1), max(pts.p0, pts.p1));
-        bbox = vec4(min(bbox.xy, pts.p2), max(bbox.zw, pts.p2));
-        bbox = vec4(min(bbox.xy, pts.p3), max(bbox.zw, pts.p3));
+        let pts = read_path_segment(tag, is_stroke);
 
         var stroke = vec2(0.0, 0.0);
         if is_stroke {
             let linewidth = bitcast<f32>(scene[config.style_base + style_ix + 1u]);
+            let offset = 0.5 * linewidth;
+
             // See https://www.iquilezles.org/www/articles/ellipses/ellipses.htm
             // This is the correct bounding box, but we're not handling rendering
             // in the isotropic case, so it may mismatch.
-            stroke = 0.5 * linewidth * vec2(length(transform.mat.xz), length(transform.mat.yw));
-            bbox += vec4(-stroke, stroke);
+            stroke = offset * vec2(length(transform.mat.xz), length(transform.mat.yw));
+
             let is_open = (tag.tag_byte & PATH_TAG_SEG_TYPE) != PATH_TAG_LINETO;
             let is_stroke_cap_marker = (tag.tag_byte & PATH_TAG_SUBPATH_END) != 0u;
             if is_stroke_cap_marker {
                 if is_open {
-                    let n = cubic_start_normal(pts.p0, pts.p1, pts.p2, pts.p3) * stroke;
-
-                    // Draw start cap
-                    let line_ix = atomicAdd(&bump.lines, 1u);
-                    lines[line_ix] = LineSoup(path_ix, pts.p0 - n, pts.p0 + n);
+                    // Draw start cap (butt)
+                    let n = offset * cubic_start_normal(pts.p0, pts.p1, pts.p2, pts.p3);
+                    output_line_with_transform(path_ix, pts.p0 - n, pts.p0 + n, transform);
                 } else {
                     // Don't draw anything if the path is closed.
                 }
-                // The stroke cap marker does not contribute to the path's bounding box. The stroke
-                // width is accounted for when computing the bbox for regular segments.
-                bbox = vec4(1., 1., -1., -1.);
             } else {
                 // Render offset curves
-                flatten_cubic(Cubic(pts.p0, pts.p1, pts.p2, pts.p3, stroke, path_ix, u32(is_stroke)));
+                flatten_cubic(Cubic(pts.p0, pts.p1, pts.p2, pts.p3, stroke, path_ix, u32(is_stroke)), transform, offset);
 
                 // Read the neighboring segment.
                 let neighbor = read_neighboring_segment(ix + 1u);
                 let tan_prev = cubic_end_tangent(pts.p0, pts.p1, pts.p2, pts.p3);
                 let tan_next = neighbor.tangent;
-                let n_prev = normalize(tan_prev).yx * vec2f(-1., 1.) * stroke;
-                let n_next = normalize(tan_next).yx * vec2f(-1., 1.) * stroke;
+                let n_prev = offset * (normalize(tan_prev).yx * vec2f(-1., 1.));
+                let n_next = offset * (normalize(tan_next).yx * vec2f(-1., 1.));
                 if neighbor.do_join {
-                    let miter_pt = draw_join(stroke, path_ix, style_flags, pts.p3,
-                                             tan_prev, tan_next, n_prev, n_next);
-                    bbox = vec4(min(miter_pt.xy, bbox.xy), max(miter_pt.zw, bbox.zw));
+                    draw_join(stroke, path_ix, style_flags, pts.p3,
+                              tan_prev, tan_next, n_prev, n_next, transform);
                 } else {
                     // Draw end cap.
-                    let line_ix = atomicAdd(&bump.lines, 1u);
-                    lines[line_ix] = LineSoup(path_ix, pts.p3 + n_prev, pts.p3 - n_prev);
+                    output_line_with_transform(path_ix, pts.p3 + n_prev, pts.p3 - n_prev, transform);
                 }
             }
         } else {
-            flatten_cubic(Cubic(pts.p0, pts.p1, pts.p2, pts.p3, stroke, path_ix, u32(is_stroke)));
+            flatten_cubic(Cubic(pts.p0, pts.p1, pts.p2, pts.p3, stroke, path_ix, u32(is_stroke)), transform, 0.);
         }
         // Update bounding box using atomics only. Computing a monoid is a
         // potential future optimization.