diff --git a/.vscode/settings.json b/.vscode/settings.json index a75948f7..57a124c7 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -16,6 +16,6 @@ }, "wgsl-analyzer.diagnostics.nagaVersion": "main", "wgsl-analyzer.preprocessor.shaderDefs": [ - "full" + "full", "msaa16", "msaa" ] } diff --git a/shader/fine.wgsl b/shader/fine.wgsl index 108c88cb..f41747d8 100644 --- a/shader/fine.wgsl +++ b/shader/fine.wgsl @@ -2,8 +2,10 @@ // Fine rasterizer. This can run in simple (just path rendering) and full // modes, controllable by #define. +// +// To enable multisampled rendering, turn on both the msaa ifdef and one of msaa8 +// or msaa16. -// This is a cut'n'paste w/ backdrop. struct Tile { backdrop: i32, segments: u32, @@ -18,8 +20,6 @@ var config: Config; @group(0) @binding(1) var segments: array; -#ifdef full - #import blend #import ptcl @@ -40,6 +40,304 @@ var gradients: texture_2d; @group(0) @binding(6) var image_atlas: texture_2d; +#ifdef msaa8 +let MASK_WIDTH = 32u; +let MASK_HEIGHT = 32u; +let SH_SAMPLES_SIZE = 256u; +let SAMPLE_WORDS_PER_PIXEL = 1u; +// This might be better in uniform, but that has 16 byte alignment +@group(0) @binding(7) +var mask_lut: array; +#endif + +#ifdef msaa16 +let MASK_WIDTH = 64u; +let MASK_HEIGHT = 64u; +let SH_SAMPLES_SIZE = 512u; +let SAMPLE_WORDS_PER_PIXEL = 2u; +@group(0) @binding(7) +var mask_lut: array; +#endif + +#ifdef msaa +let WG_SIZE = 64u; +var sh_count: array; + +// This is 8 winding numbers packed to a u32, 4 bits per sample (each nibble is biased by 8, so 0x88888888 is all zeros) +var sh_winding: array, 32u>; +// Same packing, one group of 8 per pixel +var sh_samples: array, SH_SAMPLES_SIZE>; +// Same packing, accumulating winding numbers for vertical edge crossings +var sh_winding_y: array, 2u>; + +// Number of integer cells spanned by the interval between a and b (given in either order); never less than 1 +fn span(a: f32, b: f32) -> u32 { + return u32(max(ceil(max(a, b)) - floor(min(a, b)), 1.0)); +} + +let SEG_SIZE = 5u; + +// New multisampled algorithm.
+fn fill_path_ms(fill: CmdFill, wg_id: vec2, local_id: vec2) -> array { + let n_segs = fill.size_and_rule >> 1u; + let even_odd = (fill.size_and_rule & 1u) != 0u; + let tile_origin = vec2(f32(wg_id.x) * f32(TILE_WIDTH), f32(wg_id.y) * f32(TILE_HEIGHT)); + let th_ix = local_id.y * (TILE_WIDTH / PIXELS_PER_THREAD) + local_id.x; + if th_ix < 32u { + if th_ix < 2u { + atomicStore(&sh_winding_y[th_ix], 0x88888888u); + } + atomicStore(&sh_winding[th_ix], 0x88888888u); + } + let sample_count = PIXELS_PER_THREAD * SAMPLE_WORDS_PER_PIXEL; + for (var i = 0u; i < sample_count; i++) { + atomicStore(&sh_samples[th_ix * sample_count + i], 0x88888888u); + } + workgroupBarrier(); + let n_batch = (n_segs + (WG_SIZE - 1u)) / WG_SIZE; + for (var batch = 0u; batch < n_batch; batch++) { + let seg_ix = batch * WG_SIZE + th_ix; + let seg_off = fill.seg_data + seg_ix; + var count = 0u; + let slice_size = min(n_segs - batch * WG_SIZE, WG_SIZE); + // TODO: might save a register rewriting this in terms of limit + if th_ix < slice_size { + let segment = segments[seg_off]; + // Note: coords relative to tile origin probably a good idea in coarse path, + // especially as f16 would work. But keeping existing scheme for compatibility.
+ let xy0 = segment.origin - tile_origin; + let xy1 = xy0 + segment.delta; + var y_edge_f = f32(TILE_HEIGHT); + var delta = select(-1, 1, xy1.x <= xy0.x); + if xy0.x == 0.0 && xy1.x == 0.0 { + if xy0.y == 0.0 { + y_edge_f = 0.0; + } else if xy1.y == 0.0 { + y_edge_f = 0.0; + delta = -delta; + } + } else { + if xy0.x == 0.0 { + if xy0.y != 0.0 { + y_edge_f = xy0.y; + } + } else if xy1.x == 0.0 && xy1.y != 0.0 { + y_edge_f = xy1.y; + } + // discard horizontal lines aligned to pixel grid + if !(xy0.y == xy1.y && xy0.y == floor(xy0.y)) { + count = span(xy0.x, xy1.x) + span(xy0.y, xy1.y) - 1u; + } + } + let y_edge = u32(ceil(y_edge_f)); + if y_edge < TILE_HEIGHT { + atomicAdd(&sh_winding_y[y_edge >> 3u], u32(delta) << ((y_edge & 7u) << 2u)); + } + } + // workgroup prefix sum of counts + sh_count[th_ix] = count; + let lg_n = firstLeadingBit(slice_size * 2u - 1u); + for (var i = 0u; i < lg_n; i++) { + workgroupBarrier(); + if th_ix >= 1u << i { + count += sh_count[th_ix - (1u << i)]; + } + workgroupBarrier(); + sh_count[th_ix] = count; + } + let total = workgroupUniformLoad(&sh_count[slice_size - 1u]); + for (var i = th_ix; i < total; i += WG_SIZE) { + // binary search for the segment (el_ix) whose range of the prefix sum contains i + var lo = 0u; + var hi = slice_size; + let goal = i; + while hi > lo + 1u { + let mid = (lo + hi) >> 1u; + if goal >= sh_count[mid - 1u] { + lo = mid; + } else { + hi = mid; + } + } + let el_ix = lo; + let last_pixel = i + 1u == sh_count[el_ix]; + let sub_ix = i - select(0u, sh_count[el_ix - 1u], el_ix > 0u); + let seg_off = fill.seg_data + batch * WG_SIZE + el_ix; + let segment = segments[seg_off]; + let xy0_in = segment.origin - tile_origin; + let xy1_in = xy0_in + segment.delta; + let is_down = xy1_in.y >= xy0_in.y; + let xy0 = select(xy1_in, xy0_in, is_down); + let xy1 = select(xy0_in, xy1_in, is_down); + + // Set up data for line rasterization + // Note: this is duplicated work if total count exceeds a workgroup. + // One alternative is to compute it in a separate dispatch.
+ let dx = abs(xy1.x - xy0.x); + let dy = xy1.y - xy0.y; + // TODO: apply numerical robustness and optimization + let dy_dxdy = dy / (dx + dy); + let a = dx / (dx + dy); + let is_positive_slope = xy1.x >= xy0.x; + let sign = select(-1.0, 1.0, is_positive_slope); + let xt0 = floor(xy0.x * sign); + let c = xy0.x * sign - xt0; + let y0i = floor(xy0.y); + let ytop = y0i + 1.0; + let b = dy_dxdy * c + a * (ytop - xy0.y); + let x0i = i32(xt0 * sign + 0.5 * (sign - 1.0)); + // Use line equation to plot pixel coordinates + + let zf = a * f32(sub_ix) + b; + let z = floor(zf); + let x = x0i + i32(sign * z); + let y = i32(y0i) + i32(sub_ix) - i32(z); + var is_delta: bool; + // We need to adjust winding number if slope is positive and there + // is a crossing at the left edge of the pixel. + var is_bump = false; + let zp = floor(a * f32(sub_ix - 1u) + b); + if sub_ix == 0u { + is_delta = y0i == xy0.y && y0i != xy1.y; + is_bump = xy0.x == 0.0; + } else { + is_delta = z == zp; + is_bump = is_positive_slope && !is_delta; + } + let pix_ix = u32(y) * TILE_WIDTH + u32(x); + if u32(x) < TILE_WIDTH - 1u && u32(y) < TILE_HEIGHT { + let delta_pix = pix_ix + 1u; + if is_delta { + let delta = select(u32(-1), 1u, is_down) << ((delta_pix & 7u) << 2u); + atomicAdd(&sh_winding[delta_pix >> 3u], delta); + } + } + // Apply sample mask + let mask_block = u32(is_positive_slope) * (MASK_WIDTH * MASK_HEIGHT / 2u); + let half_height = f32(MASK_HEIGHT / 2u); + let mask_row = floor(min(a * half_height, half_height - 1.0)) * f32(MASK_WIDTH); + let mask_col = floor((zf - z) * f32(MASK_WIDTH)); + let mask_ix = mask_block + u32(mask_row + mask_col); +#ifdef msaa8 + var mask = mask_lut[mask_ix / 4u] >> ((mask_ix % 4u) * 8u); + mask &= 0xffu; + // Intersect with y half-plane masks + if sub_ix == 0u && !is_bump { + let mask_shift = u32(round(8.0 * (xy0.y - f32(y)))); + mask &= 0xffu << mask_shift; + } + if last_pixel && xy1.x != 0.0 { + let mask_shift = u32(round(8.0 * (xy1.y - f32(y)))); + mask &= ~(0xffu
<< mask_shift); + } + let mask_a = mask | (mask << 6u); + let mask_b = mask_a | (mask_a << 12u); + let mask_exp = (mask_b & 0x1010101u) | ((mask_b << 3u) & 0x10101010u); + var mask_signed = select(mask_exp, u32(-i32(mask_exp)), is_down); + if is_bump { + mask_signed += select(u32(-0x11111111), 0x11111111u, is_down); + } + atomicAdd(&sh_samples[pix_ix], mask_signed); +#endif +#ifdef msaa16 + var mask = mask_lut[mask_ix / 2u] >> ((mask_ix % 2u) * 16u); + mask &= 0xffffu; + // Intersect with y half-plane masks + if sub_ix == 0u && !is_bump { + let mask_shift = u32(round(16.0 * (xy0.y - f32(y)))); + mask &= 0xffffu << mask_shift; + } + if last_pixel && xy1.x != 0.0 { + let mask_shift = u32(round(16.0 * (xy1.y - f32(y)))); + mask &= ~(0xffffu << mask_shift); + } + let mask0 = mask & 0xffu; + let mask0_a = mask0 | (mask0 << 6u); + let mask0_b = mask0_a | (mask0_a << 12u); + let mask0_exp = (mask0_b & 0x1010101u) | ((mask0_b << 3u) & 0x10101010u); + var mask0_signed = select(mask0_exp, u32(-i32(mask0_exp)), is_down); + let mask1 = (mask >> 8u) & 0xffu; + let mask1_a = mask1 | (mask1 << 6u); + let mask1_b = mask1_a | (mask1_a << 12u); + let mask1_exp = (mask1_b & 0x1010101u) | ((mask1_b << 3u) & 0x10101010u); + var mask1_signed = select(mask1_exp, u32(-i32(mask1_exp)), is_down); + if is_bump { + let bump_delta = select(u32(-0x11111111), 0x11111111u, is_down); + mask0_signed += bump_delta; + mask1_signed += bump_delta; + } + atomicAdd(&sh_samples[pix_ix * 2u], mask0_signed); + atomicAdd(&sh_samples[pix_ix * 2u + 1u], mask1_signed); +#endif + } + workgroupBarrier(); + } + var area: array; + let major = (th_ix * PIXELS_PER_THREAD) >> 3u; + var packed_w = atomicLoad(&sh_winding[major]); + // Prefix sum of packed 4 bit values within u32 + packed_w += (packed_w - 0x8888888u) << 4u; + packed_w += (packed_w - 0x888888u) << 8u; + packed_w += (packed_w - 0x8888u) << 16u; + // Note: could probably do bias in one go, but it would be inscrutable + if (major & 1u) != 0u { + // We could
use shmem to communicate the value from another thread; + // if we had subgroups that would almost certainly be the most + // efficient way. But we just calculate again for simplicity. + var last_packed = atomicLoad(&sh_winding[major - 1u]); + last_packed += (last_packed - 0x8888888u) << 4u; + last_packed += (last_packed - 0x888888u) << 8u; + last_packed += (last_packed - 0x8888u) << 16u; + let bump = ((last_packed >> 28u) - 8u) * 0x11111111u; + packed_w += bump; + } + var packed_y = atomicLoad(&sh_winding_y[local_id.y >> 3u]); + packed_y += (packed_y - 0x8888888u) << 4u; + packed_y += (packed_y - 0x888888u) << 8u; + packed_y += (packed_y - 0x8888u) << 16u; + if th_ix == 0u { + atomicStore(&sh_winding_y[0], packed_y); + } + workgroupBarrier(); + var wind_y = (packed_y >> ((local_id.y & 7u) << 2u)) - 8u; + if local_id.y >= 8u { + wind_y += (atomicLoad(&sh_winding_y[0]) >> 28u) - 8u; + } + + for (var i = 0u; i < PIXELS_PER_THREAD; i++) { + let pix_ix = th_ix * PIXELS_PER_THREAD + i; + let minor = pix_ix & 7u; + //let nonzero = ((packed_w >> (minor << 2u)) & 0xfu) != u32(8 + backdrop); + // TODO: math might be off here + let expected_zero = (((packed_w >> (minor * 4u)) + wind_y) & 0xfu) - u32(fill.backdrop); + if expected_zero >= 16u { + area[i] = 1.0; + } else { +#ifdef msaa8 + let samples = atomicLoad(&sh_samples[pix_ix]); + let xored = (expected_zero * 0x11111111u) ^ samples; + // Each 4-bit nibble in xored is 0 for winding = 0, nonzero otherwise + let xored2 = xored | (xored * 2u); + let xored4 = xored2 | (xored2 * 4u); + area[i] = f32(countOneBits(xored4 & 0x88888888u)) * 0.125; +#endif +#ifdef msaa16 + let samples0 = atomicLoad(&sh_samples[pix_ix * 2u]); + let samples1 = atomicLoad(&sh_samples[pix_ix * 2u + 1u]); + let xored0 = (expected_zero * 0x11111111u) ^ samples0; + let xored0_2 = xored0 | (xored0 * 2u); + let xored1 = (expected_zero * 0x11111111u) ^ samples1; + let xored1_2 = xored1 | (xored1 >> 1u); + let xored2 = (xored0_2 & 0xAAAAAAAAu) | (xored1_2 &
0x55555555u); + let xored4 = xored2 | (xored2 * 4u); + area[i] = f32(countOneBits(xored4 & 0xCCCCCCCCu)) * 0.0625; +#endif + } + } + return area; +} +#endif + fn read_fill(cmd_ix: u32) -> CmdFill { let size_and_rule = ptcl[cmd_ix + 1u]; let seg_data = ptcl[cmd_ix + 2u]; @@ -126,15 +424,12 @@ fn extend_mode(t: f32, mode: u32) -> f32 { } } -#else - -@group(0) @binding(3) -var output: texture_storage_2d; - -#endif - let PIXELS_PER_THREAD = 4u; +// Analytic area antialiasing. +// +// This is currently dead code if msaa is enabled, but it would be fairly straightforward +// to wire this so it's a dynamic choice (even per-path). fn fill_path(fill: CmdFill, xy: vec2) -> array { let n_segs = fill.size_and_rule >> 1u; let even_odd = (fill.size_and_rule & 1u) != 0u; @@ -220,7 +515,11 @@ fn main( // CMD_FILL case 1u: { let fill = read_fill(cmd_ix); +#ifdef msaa + area = fill_path_ms(fill, wg_id.xy, local_id.xy); +#else area = fill_path(fill, xy); +#endif cmd_ix += 4u; } // CMD_STROKE diff --git a/src/lib.rs b/src/lib.rs index 28e5bf7f..006accd3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -17,6 +17,7 @@ mod cpu_dispatch; mod cpu_shader; mod engine; +mod mask; mod render; mod scene; mod shaders; @@ -61,6 +62,19 @@ pub type Error = Box; /// Specialization of `Result` for our catch-all error type. pub type Result = std::result::Result; +/// Possible configurations for antialiasing. +#[derive(PartialEq, Eq)] +#[allow(unused)] +enum AaConfig { + Area, + Msaa8, + Msaa16, +} + +/// Configuration of antialiasing. Currently this is static, but could be switched to +/// a launch option or even finer-grained. +const ANTIALIASING: AaConfig = AaConfig::Area; + /// Renders a scene into a texture or surface. #[cfg(feature = "wgpu")] pub struct Renderer { diff --git a/src/mask.rs b/src/mask.rs new file mode 100644 index 00000000..61cacf0b --- /dev/null +++ b/src/mask.rs @@ -0,0 +1,98 @@ +// Copyright 2022 The Vello authors +// SPDX-License-Identifier: Apache-2.0 OR MIT + +//!
Create a lookup table of half-plane sample masks. + +// Width is the number of discrete translations +const MASK_WIDTH: usize = 32; +// Height is the number of discrete slopes +const MASK_HEIGHT: usize = 32; + +const PATTERN: [u8; 8] = [0, 5, 3, 7, 1, 4, 6, 2]; + +fn one_mask(slope: f64, mut translation: f64, is_pos: bool) -> u8 { + if is_pos { + translation = 1. - translation; + } + let mut result = 0; + for (i, item) in PATTERN.iter().enumerate() { + let mut y = (i as f64 + 0.5) * 0.125; + let x = (*item as f64 + 0.5) * 0.125; + if !is_pos { + y = 1. - y; + } + if (x - (1.0 - translation)) * (1. - slope) - (y - translation) * slope >= 0. { + result |= 1 << i; + } + } + result +} + +/// Make a lookup table of half-plane masks. +/// +/// The table is organized into two blocks each with MASK_HEIGHT/2 slopes. +/// The first block is negative slopes (x decreases as y increases), +/// the second as positive. +pub fn make_mask_lut() -> Vec { + (0..MASK_WIDTH * MASK_HEIGHT) + .map(|i| { + const HALF_HEIGHT: usize = MASK_HEIGHT / 2; + let u = i % MASK_WIDTH; + let v = i / MASK_WIDTH; + let is_pos = v >= HALF_HEIGHT; + let y = ((v % HALF_HEIGHT) as f64 + 0.5) * (1.0 / HALF_HEIGHT as f64); + let x = (u as f64 + 0.5) * (1.0 / MASK_WIDTH as f64); + one_mask(y, x, is_pos) + }) + .collect() +} + +// Width is the number of discrete translations +const MASK16_WIDTH: usize = 64; +// Height is the number of discrete slopes +const MASK16_HEIGHT: usize = 64; + +// This is based on the [D3D11 standard sample pattern]. +// +// [D3D11 standard sample pattern]: https://learn.microsoft.com/en-us/windows/win32/api/d3d11/ne-d3d11-d3d11_standard_multisample_quality_levels +const PATTERN_16: [u8; 16] = [1, 8, 4, 11, 15, 7, 3, 12, 0, 9, 5, 13, 2, 10, 6, 14]; + +fn one_mask_16(slope: f64, mut translation: f64, is_pos: bool) -> u16 { + if is_pos { + translation = 1.
- translation; + } + let mut result = 0; + for (i, item) in PATTERN_16.iter().enumerate() { + let mut y = (i as f64 + 0.5) * 0.0625; + let x = (*item as f64 + 0.5) * 0.0625; + if !is_pos { + y = 1. - y; + } + if (x - (1.0 - translation)) * (1. - slope) - (y - translation) * slope >= 0. { + result |= 1 << i; + } + } + result +} + +/// Make a lookup table of half-plane masks. +/// +/// The table is organized into two blocks each with MASK16_HEIGHT/2 slopes. +/// The first block is negative slopes (x decreases as y increases), +/// the second as positive. +pub fn make_mask_lut_16() -> Vec { + let v16 = (0..MASK16_WIDTH * MASK16_HEIGHT) + .map(|i| { + const HALF_HEIGHT: usize = MASK16_HEIGHT / 2; + let u = i % MASK16_WIDTH; + let v = i / MASK16_WIDTH; + let is_pos = v >= HALF_HEIGHT; + let y = ((v % HALF_HEIGHT) as f64 + 0.5) * (1.0 / HALF_HEIGHT as f64); + let x = (u as f64 + 0.5) * (1.0 / MASK16_WIDTH as f64); + one_mask_16(y, x, is_pos) + }) + .collect::>(); + // This annoyingly makes another copy. We can avoid that by pushing two + // bytes per iteration of the above loop. + bytemuck::cast_slice(&v16).into() +} diff --git a/src/render.rs b/src/render.rs index 268007fa..0bb65795 100644 --- a/src/render.rs +++ b/src/render.rs @@ -3,7 +3,7 @@ use crate::{ engine::{BufProxy, ImageFormat, ImageProxy, Recording, ResourceProxy}, shaders::FullShaders, - RenderParams, Scene, + AaConfig, RenderParams, Scene, ANTIALIASING, }; use vello_encoding::{Encoding, WorkgroupSize}; @@ -11,6 +11,7 @@ use vello_encoding::{Encoding, WorkgroupSize}; pub struct Render { fine_wg_count: Option, fine_resources: Option, + mask_buf: Option, } /// Resources produced by pipeline, needed for fine rasterization.
@@ -62,6 +63,7 @@ impl Render { Render { fine_wg_count: None, fine_resources: None, + mask_buf: None, } } @@ -412,19 +414,48 @@ impl Render { pub fn record_fine(&mut self, shaders: &FullShaders, recording: &mut Recording) { let fine_wg_count = self.fine_wg_count.take().unwrap(); let fine = self.fine_resources.take().unwrap(); - recording.dispatch( - shaders.fine, - fine_wg_count, - [ - fine.config_buf, - fine.segments_buf, - fine.ptcl_buf, - fine.info_bin_data_buf, - ResourceProxy::Image(fine.out_image), - fine.gradient_image, - fine.image_atlas, - ], - ); + match ANTIALIASING { + AaConfig::Area => { + recording.dispatch( + shaders.fine, + fine_wg_count, + [ + fine.config_buf, + fine.segments_buf, + fine.ptcl_buf, + fine.info_bin_data_buf, + ResourceProxy::Image(fine.out_image), + fine.gradient_image, + fine.image_atlas, + ], + ); + } + _ => { + if self.mask_buf.is_none() { + let mask_lut = match ANTIALIASING { + AaConfig::Msaa16 => crate::mask::make_mask_lut_16(), + AaConfig::Msaa8 => crate::mask::make_mask_lut(), + _ => unreachable!(), + }; + let buf = recording.upload("mask lut", mask_lut); + self.mask_buf = Some(buf.into()); + } + recording.dispatch( + shaders.fine, + fine_wg_count, + [ + fine.config_buf, + fine.segments_buf, + fine.ptcl_buf, + fine.info_bin_data_buf, + ResourceProxy::Image(fine.out_image), + fine.gradient_image, + fine.image_atlas, + self.mask_buf.unwrap(), + ], + ); + } + } recording.free_resource(fine.config_buf); recording.free_resource(fine.tile_buf); recording.free_resource(fine.segments_buf); @@ -432,6 +463,10 @@ impl Render { recording.free_resource(fine.gradient_image); recording.free_resource(fine.image_atlas); recording.free_resource(fine.info_bin_data_buf); + // TODO: make mask buf persistent + if let Some(mask_buf) = self.mask_buf.take() { + recording.free_resource(mask_buf); + } } /// Get the output image. 
diff --git a/src/shaders.rs b/src/shaders.rs index 86e6ed7b..668dafac 100644 --- a/src/shaders.rs +++ b/src/shaders.rs @@ -86,6 +86,8 @@ pub struct FullShaders { #[cfg(feature = "wgpu")] pub fn full_shaders(device: &Device, engine: &mut WgpuEngine) -> Result { + use crate::ANTIALIASING; + let imports = SHARED_SHADERS .iter() .copied() @@ -93,6 +95,17 @@ pub fn full_shaders(device: &Device, engine: &mut WgpuEngine) -> Result { + full_config.insert("msaa".into()); + full_config.insert("msaa16".into()); + } + crate::AaConfig::Msaa8 => { + full_config.insert("msaa".into()); + full_config.insert("msaa8".into()); + } + crate::AaConfig::Area => (), + } let mut small_config = HashSet::new(); small_config.insert("full".into()); small_config.insert("small".into()); @@ -292,20 +305,39 @@ pub fn full_shaders(device: &Device, engine: &mut WgpuEngine) -> Result engine.add_shader( + device, + "fine", + preprocess::preprocess(shader!("fine"), &full_config, &imports).into(), + &[ + BindType::Uniform, + BindType::BufReadOnly, + BindType::BufReadOnly, + BindType::BufReadOnly, + BindType::Image(ImageFormat::Rgba8), + BindType::ImageRead(ImageFormat::Rgba8), + BindType::ImageRead(ImageFormat::Rgba8), + ], + )?, + _ => { + engine.add_shader( + device, + "fine", + preprocess::preprocess(shader!("fine"), &full_config, &imports).into(), + &[ + BindType::Uniform, + BindType::BufReadOnly, + BindType::BufReadOnly, + BindType::BufReadOnly, + BindType::Image(ImageFormat::Rgba8), + BindType::ImageRead(ImageFormat::Rgba8), + BindType::ImageRead(ImageFormat::Rgba8), + BindType::BufReadOnly, // mask buffer + ], + )? + } + }; Ok(FullShaders { pathtag_reduce, pathtag_reduce2,