Add multisampled antialiasing #377

Merged
merged 2 commits on Oct 12, 2023
2 changes: 1 addition & 1 deletion .vscode/settings.json
@@ -16,6 +16,6 @@
},
"wgsl-analyzer.diagnostics.nagaVersion": "main",
"wgsl-analyzer.preprocessor.shaderDefs": [
"full"
"full", "msaa16", "msaa"
]
}
319 changes: 309 additions & 10 deletions shader/fine.wgsl
@@ -2,8 +2,10 @@

// Fine rasterizer. This can run in simple (just path rendering) and full
// modes, controllable by #define.
//
// To enable multisampled rendering, turn on both the msaa ifdef and one of msaa8
// or msaa16.

// This is a cut'n'paste w/ backdrop.
struct Tile {
backdrop: i32,
segments: u32,
@@ -18,8 +20,6 @@ var<uniform> config: Config;
@group(0) @binding(1)
var<storage> segments: array<Segment>;

#ifdef full

#import blend
#import ptcl

@@ -40,6 +40,304 @@ var gradients: texture_2d<f32>;
@group(0) @binding(6)
var image_atlas: texture_2d<f32>;

#ifdef msaa8
let MASK_WIDTH = 32u;
let MASK_HEIGHT = 32u;
let SH_SAMPLES_SIZE = 256u;
let SAMPLE_WORDS_PER_PIXEL = 1u;
// This might be better in uniform, but that has 16 byte alignment
@group(0) @binding(7)
var<storage> mask_lut: array<u32, 256u>;
#endif

#ifdef msaa16
let MASK_WIDTH = 64u;
let MASK_HEIGHT = 64u;
let SH_SAMPLES_SIZE = 512u;
let SAMPLE_WORDS_PER_PIXEL = 2u;
@group(0) @binding(7)
var<storage> mask_lut: array<u32, 2048u>;
#endif

#ifdef msaa
let WG_SIZE = 64u;
var<workgroup> sh_count: array<u32, WG_SIZE>;

// This is 8 winding numbers packed to a u32, 4 bits per sample
var<workgroup> sh_winding: array<atomic<u32>, 32u>;
// Same packing, one group of 8 per pixel
var<workgroup> sh_samples: array<atomic<u32>, SH_SAMPLES_SIZE>;
// Same packing, accumulating winding numbers for vertical edge crossings
var<workgroup> sh_winding_y: array<atomic<u32>, 2u>;
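// sh_winding, sh_samples and sh_winding_y all use an excess-8 encoding: each
// 4-bit nibble stores a winding value plus 8, so 0x88888888u is the all-zero
// state and packed per-nibble deltas can be added atomically without separate
// sign handling.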

// number of integer cells spanned by interval defined by a, b
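// e.g. span(0.3, 2.1) == 3u (cells 0, 1 and 2) and span(1.0, 1.0) == 1u.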
fn span(a: f32, b: f32) -> u32 {
return u32(max(ceil(max(a, b)) - floor(min(a, b)), 1.0));
}

let SEG_SIZE = 5u;

// New multisampled algorithm.
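// Outline (one iteration per batch of WG_SIZE segments):
// 1. Each thread takes one segment, counts how many pixels it touches, and
//    records any crossing of the tile's left edge into sh_winding_y.
// 2. A workgroup prefix sum over those counts partitions the per-pixel work.
// 3. Threads walk the work items (binary-searching for the owning segment),
//    accumulating packed winding deltas in sh_winding and per-sample masks
//    in sh_samples.
// 4. After the last batch, each thread resolves its PIXELS_PER_THREAD pixels
//    from the packed winding prefix sums and sample words to produce coverage.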
fn fill_path_ms(fill: CmdFill, wg_id: vec2<u32>, local_id: vec2<u32>) -> array<f32, PIXELS_PER_THREAD> {
let n_segs = fill.size_and_rule >> 1u;
let even_odd = (fill.size_and_rule & 1u) != 0u;
let tile_origin = vec2(f32(wg_id.x) * f32(TILE_HEIGHT), f32(wg_id.y) * f32(TILE_WIDTH));
let th_ix = local_id.y * (TILE_WIDTH / PIXELS_PER_THREAD) + local_id.x;
if th_ix < 32u {
if th_ix < 2u {
atomicStore(&sh_winding_y[th_ix], 0x88888888u);
}
atomicStore(&sh_winding[th_ix], 0x88888888u);
}
let sample_count = PIXELS_PER_THREAD * SAMPLE_WORDS_PER_PIXEL;
for (var i = 0u; i < sample_count; i++) {
atomicStore(&sh_samples[th_ix * sample_count + i], 0x88888888u);
}
workgroupBarrier();
let n_batch = (n_segs + (WG_SIZE - 1u)) / WG_SIZE;
for (var batch = 0u; batch < n_batch; batch++) {
let seg_ix = batch * WG_SIZE + th_ix;
let seg_off = fill.seg_data + seg_ix;
var count = 0u;
let slice_size = min(n_segs - batch * WG_SIZE, WG_SIZE);
// TODO: might save a register rewriting this in terms of limit
if th_ix < slice_size {
let segment = segments[seg_off];
// Note: coords relative to tile origin probably a good idea in coarse path,
// especially as f16 would work. But keeping existing scheme for compatibility.
let xy0 = segment.origin - tile_origin;
let xy1 = xy0 + segment.delta;
var y_edge_f = f32(TILE_HEIGHT);
var delta = select(-1, 1, xy1.x <= xy0.x);
if xy0.x == 0.0 && xy1.x == 0.0 {
if xy0.y == 0.0 {
y_edge_f = 0.0;
} else if xy1.y == 0.0 {
y_edge_f = 0.0;
delta = -delta;
}
} else {
if xy0.x == 0.0 {
if xy0.y != 0.0 {
y_edge_f = xy0.y;
}
} else if xy1.x == 0.0 && xy1.y != 0.0 {
y_edge_f = xy1.y;
}
// discard horizontal lines aligned to pixel grid
if !(xy0.y == xy1.y && xy0.y == floor(xy0.y)) {
count = span(xy0.x, xy1.x) + span(xy0.y, xy1.y) - 1u;
}
}
let y_edge = u32(ceil(y_edge_f));
if y_edge < TILE_HEIGHT {
atomicAdd(&sh_winding_y[y_edge >> 3u], u32(delta) << ((y_edge & 7u) << 2u));
}
}
// workgroup prefix sum of counts
sh_count[th_ix] = count;
let lg_n = firstLeadingBit(slice_size * 2u - 1u);
for (var i = 0u; i < lg_n; i++) {
workgroupBarrier();
if th_ix >= 1u << i {
count += sh_count[th_ix - (1u << i)];
}
workgroupBarrier();
sh_count[th_ix] = count;
}
let total = workgroupUniformLoad(&sh_count[slice_size - 1u]);
for (var i = th_ix; i < total; i += WG_SIZE) {
// binary search to find pixel
var lo = 0u;
var hi = slice_size;
let goal = i;
while hi > lo + 1u {
let mid = (lo + hi) >> 1u;
if goal >= sh_count[mid - 1u] {
lo = mid;
} else {
hi = mid;
}
}
let el_ix = lo;
let last_pixel = i + 1u == sh_count[el_ix];
let sub_ix = i - select(0u, sh_count[el_ix - 1u], el_ix > 0u);
let seg_off = fill.seg_data + batch * WG_SIZE + el_ix;
let segment = segments[seg_off];
let xy0_in = segment.origin - tile_origin;
let xy1_in = xy0_in + segment.delta;
let is_down = xy1_in.y >= xy0_in.y;
let xy0 = select(xy1_in, xy0_in, is_down);
let xy1 = select(xy0_in, xy1_in, is_down);

// Set up data for line rasterization
// Note: this is duplicated work if total count exceeds a workgroup.
// One alternative is to compute it in a separate dispatch.
let dx = abs(xy1.x - xy0.x);
let dy = xy1.y - xy0.y;
// TODO: apply numerical robustness and optimization
let dy_dxdy = dy / (dx + dy);
let a = dx / (dx + dy);
Comment on lines +179 to +180

Collaborator:
You may want to define and reuse idxdy = 1.0 / (dx + dy) to avoid computing it twice. An optimizing shader compiler could reduce the divisor in both lines into one computation, but I don't know if that's guaranteed.

Contributor Author:
I've added a TODO here. When #378 is fixed, we'll want to apply it here as well.

An optimizing compiler could do it, but one that satisfied IEEE arithmetic could not.
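For reference, a minimal sketch of the refactor being discussed (illustrative only, not part of the merged diff, which keeps the two divisions and records a TODO):

// Compute the shared reciprocal once and reuse it for both terms.
let idxdy = 1.0 / (dx + dy);
let dy_dxdy = dy * idxdy;
let a = dx * idxdy;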

let is_positive_slope = xy1.x >= xy0.x;
let sign = select(-1.0, 1.0, is_positive_slope);
let xt0 = floor(xy0.x * sign);
let c = xy0.x * sign - xt0;
let y0i = floor(xy0.y);
let ytop = y0i + 1.0;
let b = dy_dxdy * c + a * (ytop - xy0.y);
let x0i = i32(xt0 * sign + 0.5 * (sign - 1.0));
// Use line equation to plot pixel coordinates

let zf = a * f32(sub_ix) + b;
let z = floor(zf);
let x = x0i + i32(sign * z);
let y = i32(y0i) + i32(sub_ix) - i32(z);
var is_delta: bool;
// We need to adjust winding number if slope is positive and there
// is a crossing at the left edge of the pixel.
var is_bump = false;
let zp = floor(a * f32(sub_ix - 1u) + b);
if sub_ix == 0u {
is_delta = y0i == xy0.y && y0i != xy1.y;
is_bump = xy0.x == 0.0;
} else {
is_delta = z == zp;
is_bump = is_positive_slope && !is_delta;
}
let pix_ix = u32(y) * TILE_WIDTH + u32(x);
if u32(x) < TILE_WIDTH - 1u && u32(y) < TILE_HEIGHT {
let delta_pix = pix_ix + 1u;
if is_delta {
let delta = select(u32(-1), 1u, is_down) << ((delta_pix & 7u) << 2u);
atomicAdd(&sh_winding[delta_pix >> 3u], delta);
}
}
// Apply sample mask
let mask_block = u32(is_positive_slope) * (MASK_WIDTH * MASK_HEIGHT / 2u);
let half_height = f32(MASK_HEIGHT / 2u);
let mask_row = floor(min(a * half_height, half_height - 1.0)) * f32(MASK_WIDTH);
let mask_col = floor((zf - z) * f32(MASK_WIDTH));
let mask_ix = mask_block + u32(mask_row + mask_col);
#ifdef msaa8
var mask = mask_lut[mask_ix / 4u] >> ((mask_ix % 4u) * 8u);
mask &= 0xffu;
// Intersect with y half-plane masks
if sub_ix == 0u && !is_bump {
let mask_shift = u32(round(8.0 * (xy0.y - f32(y))));
mask &= 0xffu << mask_shift;
}
if last_pixel && xy1.x != 0.0 {
let mask_shift = u32(round(8.0 * (xy1.y - f32(y))));
mask &= ~(0xffu << mask_shift);
}
let mask_a = mask | (mask << 6u);
let mask_b = mask_a | (mask_a << 12u);
let mask_exp = (mask_b & 0x1010101u) | ((mask_b << 3u) & 0x10101010u);
var mask_signed = select(mask_exp, u32(-i32(mask_exp)), is_down);
if is_bump {
mask_signed += select(u32(-0x11111111), 0x11111111u, is_down);
}
atomicAdd(&sh_samples[pix_ix], mask_signed);
#endif
#ifdef msaa16
var mask = mask_lut[mask_ix / 2u] >> ((mask_ix % 2u) * 16u);
mask &= 0xffffu;
// Intersect with y half-plane masks
if sub_ix == 0u && !is_bump {
let mask_shift = u32(round(16.0 * (xy0.y - f32(y))));
mask &= 0xffffu << mask_shift;
}
if last_pixel && xy1.x != 0.0 {
let mask_shift = u32(round(16.0 * (xy1.y - f32(y))));
mask &= ~(0xffffu << mask_shift);
}
let mask0 = mask & 0xffu;
let mask0_a = mask0 | (mask0 << 6u);
let mask0_b = mask0_a | (mask0_a << 12u);
let mask0_exp = (mask0_b & 0x1010101u) | ((mask0_b << 3u) & 0x10101010u);
var mask0_signed = select(mask0_exp, u32(-i32(mask0_exp)), is_down);
let mask1 = (mask >> 8u) & 0xffu;
let mask1_a = mask1 | (mask1 << 6u);
let mask1_b = mask1_a | (mask1_a << 12u);
let mask1_exp = (mask1_b & 0x1010101u) | ((mask1_b << 3u) & 0x10101010u);
var mask1_signed = select(mask1_exp, u32(-i32(mask1_exp)), is_down);
if is_bump {
let bump_delta = select(u32(-0x11111111), 0x11111111u, is_down);
mask0_signed += bump_delta;
mask1_signed += bump_delta;
}
atomicAdd(&sh_samples[pix_ix * 2u], mask0_signed);
atomicAdd(&sh_samples[pix_ix * 2u + 1u], mask1_signed);
#endif
}
workgroupBarrier();
}
var area: array<f32, PIXELS_PER_THREAD>;
let major = (th_ix * PIXELS_PER_THREAD) >> 3u;
var packed_w = atomicLoad(&sh_winding[major]);
// Prefix sum of packed 4 bit values within u32
packed_w += (packed_w - 0x8888888u) << 4u;
packed_w += (packed_w - 0x888888u) << 8u;
packed_w += (packed_w - 0x8888u) << 16u;
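// After these three steps, nibble k of packed_w holds 8 plus the inclusive
// prefix sum of the first k+1 per-pixel windings (assuming the sums stay
// within the 4-bit range), i.e. the bias is preserved.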
// Note: could probably do bias in one go, but it would be inscrutable
if (major & 1u) != 0u {
// We could use shmem to communicate the value from another thread;
// if we had subgroups that would almost certainly be the most
// efficient way. But we just calculate again for simplicity.
var last_packed = atomicLoad(&sh_winding[major - 1u]);
last_packed += (last_packed - 0x8888888u) << 4u;
last_packed += (last_packed - 0x888888u) << 8u;
last_packed += (last_packed - 0x8888u) << 16u;
let bump = ((last_packed >> 28u) - 8u) * 0x11111111u;
packed_w += bump;
}
var packed_y = atomicLoad(&sh_winding_y[local_id.y >> 3u]);
packed_y += (packed_y - 0x8888888u) << 4u;
packed_y += (packed_y - 0x888888u) << 8u;
packed_y += (packed_y - 0x8888u) << 16u;
if th_ix == 0u {
atomicStore(&sh_winding_y[0], packed_y);
}
workgroupBarrier();
var wind_y = (packed_y >> ((local_id.y & 7u) << 2u)) - 8u;
if local_id.y >= 8u {
wind_y += (atomicLoad(&sh_winding_y[0]) >> 28u) - 8u;
}

for (var i = 0u; i < PIXELS_PER_THREAD; i++) {
let pix_ix = th_ix * PIXELS_PER_THREAD + i;
let minor = pix_ix & 7u;
//let nonzero = ((packed_w >> (minor << 2u)) & 0xfu) != u32(8 + backdrop);
// TODO: math might be off here
let expected_zero = (((packed_w >> (minor * 4u)) + wind_y) & 0xfu) - u32(fill.backdrop);
if expected_zero >= 16u {
area[i] = 1.0;
} else {
#ifdef msaa8
let samples = atomicLoad(&sh_samples[pix_ix]);
let xored = (expected_zero * 0x11111111u) ^ samples;
// Each 4-bit nibble in xored is 0 for winding = 0, nonzero otherwise
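// The two OR steps fold each nibble into its top bit, so the popcount over
// 0x88888888u counts the samples with nonzero winding (coverage out of 8).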
let xored2 = xored | (xored * 2u);
let xored4 = xored2 | (xored2 * 4u);
area[i] = f32(countOneBits(xored4 & 0x88888888u)) * 0.125;
#endif
#ifdef msaa16
let samples0 = atomicLoad(&sh_samples[pix_ix * 2u]);
let samples1 = atomicLoad(&sh_samples[pix_ix * 2u + 1u]);
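// 16-sample variant of the msaa8 trick above: after the interleave, bit 3 of
// each nibble of xored4 records whether the corresponding nibble of samples0
// is nonzero and bit 2 does the same for samples1, so the popcount over
// 0xCCCCCCCCu counts covered samples out of 16.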
let xored0 = (expected_zero * 0x11111111u) ^ samples0;
let xored0_2 = xored0 | (xored0 * 2u);
let xored1 = (expected_zero * 0x11111111u) ^ samples1;
let xored1_2 = xored1 | (xored1 >> 1u);
let xored2 = (xored0_2 & 0xAAAAAAAAu) | (xored1_2 & 0x55555555u);
let xored4 = xored2 | (xored2 * 4u);
area[i] = f32(countOneBits(xored4 & 0xCCCCCCCCu)) * 0.0625;
#endif
}
}
return area;
}
#endif

fn read_fill(cmd_ix: u32) -> CmdFill {
let size_and_rule = ptcl[cmd_ix + 1u];
let seg_data = ptcl[cmd_ix + 2u];
@@ -126,15 +424,12 @@ fn extend_mode(t: f32, mode: u32) -> f32 {
}
}

#else

@group(0) @binding(3)
var output: texture_storage_2d<r8unorm, write>;

#endif

let PIXELS_PER_THREAD = 4u;

// Analytic area antialiasing.
//
// This is currently dead code if msaa is enabled, but it would be fairly straightforward
// to wire this so it's a dynamic choice (even per-path).
fn fill_path(fill: CmdFill, xy: vec2<f32>) -> array<f32, PIXELS_PER_THREAD> {
let n_segs = fill.size_and_rule >> 1u;
let even_odd = (fill.size_and_rule & 1u) != 0u;
@@ -220,7 +515,11 @@ fn main(
// CMD_FILL
case 1u: {
let fill = read_fill(cmd_ix);
#ifdef msaa
area = fill_path_ms(fill, wg_id.xy, local_id.xy);
#else
area = fill_path(fill, xy);
#endif
cmd_ix += 4u;
}
// CMD_STROKE
14 changes: 14 additions & 0 deletions src/lib.rs
@@ -17,6 +17,7 @@
mod cpu_dispatch;
mod cpu_shader;
mod engine;
mod mask;
mod render;
mod scene;
mod shaders;
@@ -61,6 +62,19 @@ pub type Error = Box<dyn std::error::Error>;
/// Specialization of `Result` for our catch-all error type.
pub type Result<T> = std::result::Result<T, Error>;

/// Possible configurations for antialiasing.
#[derive(PartialEq, Eq)]
#[allow(unused)]
enum AaConfig {
Area,
Msaa8,
Msaa16,
}

/// Configuration of antialiasing. Currently this is static, but could be switched to
/// a launch option or even finer-grained.
const ANTIALIASING: AaConfig = AaConfig::Area;
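As an illustration of how this static choice ties into the shader defines above (the function name and plumbing here are assumptions for exposition, not part of this diff), the mapping is roughly:

// Hypothetical sketch: translate the compile-time AaConfig into the shader
// defines consumed by fine.wgsl ("full", plus "msaa" and one of "msaa8"/"msaa16").
fn aa_shader_defs(aa: &AaConfig) -> Vec<&'static str> {
    match aa {
        AaConfig::Area => vec!["full"],
        AaConfig::Msaa8 => vec!["full", "msaa", "msaa8"],
        AaConfig::Msaa16 => vec!["full", "msaa", "msaa16"],
    }
}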

/// Renders a scene into a texture or surface.
#[cfg(feature = "wgpu")]
pub struct Renderer {