Skip to content

Commit 17ffece

Browse files
committed
WIP: A draft of the pipeline, bearing in mind that use_cpu_from is useful
1 parent 57c55db commit 17ffece

File tree

3 files changed

+197
-41
lines changed

3 files changed

+197
-41
lines changed

vello/src/cpu.rs

Lines changed: 75 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
1+
use std::ops::ControlFlow;
2+
13
use vello_encoding::{
24
BinHeader, BufferSize, BumpAllocators, Clip, ClipBbox, ClipBic, ClipElement, DrawBbox,
3-
DrawMonoid, Encoding, IndirectCount, Layout, LineSoup, Path, PathBbox, PathMonoid, PathSegment,
5+
DrawMonoid, Encoding, IndirectCount, LineSoup, Path, PathBbox, PathMonoid, PathSegment,
46
RenderConfig, Resolver, SegmentCount, Tile,
57
};
68
use vello_shaders::{
@@ -28,7 +30,7 @@ pub struct Buffer<T: bytemuck::Zeroable + bytemuck::NoUninit> {
2830
}
2931

3032
impl<T: bytemuck::Zeroable + bytemuck::NoUninit> Buffer<T> {
31-
fn to_fit(&mut self, size: BufferSize<T>) -> &mut [T] {
33+
fn fit_slice(&mut self, size: BufferSize<T>) -> &mut [T] {
3234
self.inner
3335
.resize_with(size.len().try_into().expect("32 bit platform"), || {
3436
T::zeroed()
@@ -67,11 +69,36 @@ pub struct CoarseBuffers {
6769
ptcl: Buffer<u32>,
6870
}
6971

72+
/// The stages of the CPU coarse pipeline.
///
/// Variants are declared in the same order in which the visible parts of
/// `run_coarse_cpu_internal` call the corresponding `*_main` functions, so the derived
/// ordering (`<`, `>`) compares positions in the pipeline.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum Stages {
    PathTagReduce,
    PathTagScan,
    BboxClear,
    Flatten,
    DrawReduce,
    DrawLeaf,
    ClipReduce,
    ClipLeaf,
    Binning,
    TileAlloc,
    PathCount,
    Backdrop,
    Coarse,
    PathTiling,
}
88+
7089
pub fn run_coarse_cpu(
7190
params: &RenderParams,
7291
buffers: &mut CoarseBuffers,
7392
cpu_config: &RenderConfig,
7493
) {
94+
let _ = run_coarse_cpu_internal(params, buffers, cpu_config);
95+
}
96+
97+
fn run_coarse_cpu_internal(
98+
params: &RenderParams,
99+
buffers: &mut CoarseBuffers,
100+
cpu_config: &RenderConfig,
101+
) -> ControlFlow<()> {
75102
let packed = &mut buffers.packed;
76103

77104
// HACK: The coarse workgroup counts is the number of active bins.
@@ -90,47 +117,52 @@ pub fn run_coarse_cpu(
90117
let buffer_sizes = &cpu_config.buffer_sizes;
91118
let wg_counts = &cpu_config.workgroup_counts;
92119

120+
// Some buffers are marked as "write-only". This means it would be performant to write them
121+
// directly into a https://docs.rs/wgpu/latest/wgpu/struct.BufferViewMut.html (not yet set up).
122+
93123
// TODO: This is an alignment hazard, which just happens to work on mainstream platforms
94124
// Maybe don't merge as-is?
95125
let scene_buf = bytemuck::cast_slice(packed);
96126
let config_buf = cpu_config.gpu;
97-
let info_bin_data_buf = buffers.bin_data.to_fit(buffer_sizes.bin_data);
98-
let tile_buf = buffers.tiles.to_fit(buffer_sizes.tiles);
99-
let segments_buf = buffers.segments.to_fit(buffer_sizes.segments);
127+
let info_bin_data_buf = buffers.bin_data.fit_slice(buffer_sizes.bin_data);
128+
let tile_buf = buffers.tiles.fit_slice(buffer_sizes.tiles);
129+
// Write-only
130+
let segments_buf = buffers.segments.fit_slice(buffer_sizes.segments);
131+
// Write-only
132+
let ptcl_buf = buffers.ptcl.fit_slice(buffer_sizes.ptcl);
100133

101-
let ptcl_buf = buffers.ptcl.to_fit(buffer_sizes.ptcl);
102-
let reduced_buf = buffers.path_reduced.to_fit(buffer_sizes.path_reduced);
134+
let reduced_buf = buffers.path_reduced.fit_slice(buffer_sizes.path_reduced);
103135

104136
pathtag_reduce_main(wg_counts.path_reduce.0, &config_buf, scene_buf, reduced_buf);
105137

106-
let tagmonoid_buf = buffers.path_monoids.to_fit(buffer_sizes.path_monoids);
138+
let tagmonoid_buf = buffers.path_monoids.fit_slice(buffer_sizes.path_monoids);
107139

108140
pathtag_scan_main(
109141
wg_counts.path_scan.0,
110142
&config_buf,
111143
scene_buf,
112-
reduced_buf,
144+
&*reduced_buf,
113145
tagmonoid_buf,
114146
);
115147

116148
// Could re-use `reduced_buf` from this point
117149

118-
let path_bbox_buf = buffers.path_bboxes.to_fit(buffer_sizes.path_bboxes);
150+
let path_bbox_buf = buffers.path_bboxes.fit_slice(buffer_sizes.path_bboxes);
119151

120152
bbox_clear_main(&config_buf, path_bbox_buf);
121153
let bump_buf = &mut buffers.bump_alloc;
122-
let lines_buf = buffers.lines.to_fit(buffer_sizes.lines);
154+
let lines_buf = buffers.lines.fit_slice(buffer_sizes.lines);
123155
flatten_main(
124156
wg_counts.flatten.0,
125157
&config_buf,
126158
scene_buf,
127-
tagmonoid_buf,
159+
&*tagmonoid_buf,
128160
path_bbox_buf,
129161
bump_buf,
130162
lines_buf,
131163
);
132164

133-
let draw_reduced_buf = buffers.draw_reduced.to_fit(buffer_sizes.draw_reduced);
165+
let draw_reduced_buf = buffers.draw_reduced.fit_slice(buffer_sizes.draw_reduced);
134166

135167
draw_reduce_main(
136168
wg_counts.draw_reduce.0,
@@ -139,58 +171,58 @@ pub fn run_coarse_cpu(
139171
draw_reduced_buf,
140172
);
141173

142-
let draw_monoid_buf = buffers.draw_monoids.to_fit(buffer_sizes.draw_monoids);
143-
let clip_inp_buf = buffers.clip_inps.to_fit(buffer_sizes.clip_inps);
174+
let draw_monoid_buf = buffers.draw_monoids.fit_slice(buffer_sizes.draw_monoids);
175+
let clip_inp_buf = buffers.clip_inps.fit_slice(buffer_sizes.clip_inps);
144176
draw_leaf_main(
145177
wg_counts.draw_leaf.0,
146178
&config_buf,
147179
scene_buf,
148-
draw_reduced_buf,
149-
path_bbox_buf,
180+
&*draw_reduced_buf,
181+
&*path_bbox_buf,
150182
draw_monoid_buf,
151183
info_bin_data_buf,
152184
clip_inp_buf,
153185
);
154186

155187
// Could re-use `draw_reduced_buf` from this point
156188

157-
let clip_el_buf = buffers.clip_els.to_fit(buffer_sizes.clip_els);
189+
let clip_el_buf = buffers.clip_els.fit_slice(buffer_sizes.clip_els);
158190

159-
let clip_bic_buf = buffers.clip_bics.to_fit(buffer_sizes.clip_bics);
191+
let clip_bic_buf = buffers.clip_bics.fit_slice(buffer_sizes.clip_bics);
160192

161193
if wg_counts.clip_reduce.0 > 0 {
162194
clip_reduce_main(
163195
wg_counts.clip_reduce.0,
164-
clip_inp_buf,
165-
path_bbox_buf,
196+
&*clip_inp_buf,
197+
&*path_bbox_buf,
166198
clip_bic_buf,
167199
clip_el_buf,
168200
);
169201
}
170-
let clip_bbox_buf = buffers.clip_bboxes.to_fit(buffer_sizes.clip_bboxes);
202+
let clip_bbox_buf = buffers.clip_bboxes.fit_slice(buffer_sizes.clip_bboxes);
171203

172204
if wg_counts.clip_leaf.0 > 0 {
173205
clip_leaf_main(
174206
&config_buf,
175207
clip_inp_buf,
176-
path_bbox_buf,
208+
&*path_bbox_buf,
177209
draw_monoid_buf,
178210
clip_bbox_buf,
179211
);
180212
}
181213

182214
// Could re-use `clip_inp_buf`, `clip_bic_buf`, and `clip_el_buf` from this point
183215

184-
let draw_bbox_buf = buffers.draw_bboxes.to_fit(buffer_sizes.draw_bboxes);
216+
let draw_bbox_buf = buffers.draw_bboxes.fit_slice(buffer_sizes.draw_bboxes);
185217

186-
let bin_header_buf = buffers.bin_headers.to_fit(buffer_sizes.bin_headers);
218+
let bin_header_buf = buffers.bin_headers.fit_slice(buffer_sizes.bin_headers);
187219

188220
binning_main(
189221
wg_counts.binning.0,
190222
&config_buf,
191-
draw_monoid_buf,
192-
path_bbox_buf,
193-
clip_bbox_buf,
223+
&*draw_monoid_buf,
224+
&*path_bbox_buf,
225+
&*clip_bbox_buf,
194226
draw_bbox_buf,
195227
bump_buf,
196228
info_bin_data_buf,
@@ -202,11 +234,11 @@ pub fn run_coarse_cpu(
202234
// TODO: What does this comment mean?
203235
// Note: this only needs to be rounded up because of the workaround to store the tile_offset
204236
// in storage rather than workgroup memory.
205-
let path_buf = buffers.paths.to_fit(buffer_sizes.paths);
237+
let path_buf = buffers.paths.fit_slice(buffer_sizes.paths);
206238
tile_alloc_main(
207239
&config_buf,
208240
scene_buf,
209-
draw_bbox_buf,
241+
&*draw_bbox_buf,
210242
bump_buf,
211243
path_buf,
212244
tile_buf,
@@ -218,36 +250,38 @@ pub fn run_coarse_cpu(
218250

219251
path_count_setup_main(bump_buf, &mut indirect_count_buf);
220252

221-
let seg_counts_buf = buffers.seg_counts.to_fit(buffer_sizes.seg_counts);
222-
path_count_main(bump_buf, lines_buf, path_buf, tile_buf, seg_counts_buf);
253+
let seg_counts_buf = buffers.seg_counts.fit_slice(buffer_sizes.seg_counts);
254+
path_count_main(bump_buf, &*lines_buf, &*path_buf, tile_buf, seg_counts_buf);
223255

224-
backdrop_main(&config_buf, bump_buf, path_buf, tile_buf);
256+
backdrop_main(&config_buf, &*bump_buf, &*path_buf, tile_buf);
225257

226258
coarse_main(
227259
&config_buf,
228260
scene_buf,
229-
draw_monoid_buf,
230-
bin_header_buf,
231-
info_bin_data_buf,
232-
path_buf,
261+
&*draw_monoid_buf,
262+
&*bin_header_buf,
263+
&*info_bin_data_buf,
264+
&*path_buf,
233265
tile_buf,
234266
bump_buf,
235267
ptcl_buf,
236268
);
237269

270+
// TODO: Remove
238271
path_tiling_setup_main(
239272
bump_buf,
240273
&mut indirect_count_buf, /* ptcl_buf (for forwarding errors to fine)*/
241274
);
242275

243276
path_tiling_main(
244277
bump_buf,
245-
seg_counts_buf,
246-
lines_buf,
247-
path_buf,
248-
tile_buf,
278+
&*seg_counts_buf,
279+
&*lines_buf,
280+
&*path_buf,
281+
&*tile_buf,
249282
segments_buf,
250283
);
284+
ControlFlow::Continue(())
251285
}
252286

253287
pub fn render_to_texture(

vello/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ mod shaders;
120120
pub mod cpu;
121121
#[cfg(feature = "wgpu")]
122122
pub mod util;
123+
// `v2` uses `wgpu` types unconditionally, so it can only be built with the `wgpu` feature.
#[cfg(feature = "wgpu")]
pub mod v2;
123124
#[cfg(feature = "wgpu")]
124125
mod wgpu_engine;
125126

vello/src/v2.rs

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
// Copyright 2025 the Vello Authors
2+
// SPDX-License-Identifier: Apache-2.0 OR MIT
3+
4+
// Thinking about it: what do we need?
5+
6+
// Use case: Runs on demand?
7+
// 1) CPU then GPU
8+
9+
// Use case: Debug stages
10+
// 1) Run single stage with fixed input
11+
// 2) Maybe generate inputs on CPU first
12+
// 3) Download results
13+
14+
use std::ops::ControlFlow;
15+
16+
/// Selects which pipeline steps are handled on the CPU.
#[derive(Debug)]
pub struct CpuSteps {
    /// The last step handled on the CPU; `start_stage` breaks for any later step.
    end_cpu_after: PipelineStep,
    /// Copied into every [`StepMeta`] handed out by `start_stage`.
    run: bool,
}

/// Per-step information passed to the buffer accessors.
#[derive(Clone, Copy, Debug)]
struct StepMeta {
    /// When `false`, the buffer accessors only record the access and hand back empty slices.
    run: bool,
}

/// Placeholder steps of the model pipeline.
///
/// The derived ordering follows declaration order, so `One < Two`.
#[derive(Clone, Copy, Debug, PartialEq, PartialOrd, Eq, Ord)]
enum PipelineStep {
    One,
    Two,
}

impl CpuSteps {
    /// Begins `step`.
    ///
    /// Returns `ControlFlow::Break(())` when `step` comes after `end_cpu_after`; otherwise
    /// returns `ControlFlow::Continue` carrying a [`StepMeta`] with this value's `run` flag.
    fn start_stage(&mut self, step: PipelineStep) -> ControlFlow<(), StepMeta> {
        if step > self.end_cpu_after {
            // We're a later step than the final CPU step.
            ControlFlow::Break(())
        } else {
            ControlFlow::Continue(StepMeta { run: self.run })
        }
    }
}
41+
struct Buffer<T> {
42+
cpu_write_count: u16,
43+
cpu_read_count: u16,
44+
remaining_writes_cpu: u16,
45+
remaining_reads_cpu: u16,
46+
cpu_content: Vec<T>,
47+
staging_buffer: wgpu::Buffer,
48+
staging_written: bool,
49+
50+
gpu_written: bool,
51+
gpu_buffer: wgpu::Buffer,
52+
staging_queue: Vec<wgpu::Buffer>,
53+
}
54+
impl<T> Buffer<T> {
55+
fn read(&mut self, stage: StepMeta) -> &[T] {
56+
if stage.run {
57+
self.remaining_reads_cpu -= 1;
58+
&self.cpu_content
59+
} else {
60+
self.cpu_read_count += 1;
61+
&[]
62+
}
63+
}
64+
fn write(&mut self, stage: StepMeta) -> &mut [T] {
65+
if stage.run {
66+
self.remaining_writes_cpu -= 1;
67+
if self.remaining_reads_cpu == 0 && self.remaining_writes_cpu == 0 {
68+
// self.staging_written = true;
69+
// return self
70+
// .staging_buffer
71+
// .slice(..)
72+
// .get_mapped_range_mut()
73+
// .deref_mut();
74+
}
75+
&mut self.cpu_content
76+
} else {
77+
self.cpu_write_count += 1;
78+
&mut []
79+
}
80+
}
81+
fn read_write(&mut self, stage: StepMeta) -> &mut [T] {
82+
if stage.run {
83+
self.remaining_reads_cpu -= 1;
84+
self.remaining_writes_cpu -= 1;
85+
&mut self.cpu_content
86+
} else {
87+
self.cpu_write_count += 1;
88+
self.cpu_read_count += 1;
89+
&mut []
90+
}
91+
}
92+
}
93+
94+
struct Buffers {
95+
a: Buffer<u8>,
96+
b: Buffer<u16>,
97+
c: Buffer<u16>,
98+
}
99+
100+
/// Drives the tiny pipeline model by running `cpu_stage_1` three times in a row.
///
/// Stops and returns as soon as an invocation yields `ControlFlow::Break`; otherwise returns
/// the result of the third invocation.
// NOTE(review): `Buffers` is a private type but this function is `pub` — confirm the
// intended visibility.
pub fn tiny_pipeline_model(mut stages: CpuSteps, buffers: &mut Buffers) -> ControlFlow<()> {
    // The first two runs only propagate a break; the third run's result is the return value.
    for _ in 0..2 {
        cpu_stage_1(&mut stages, buffers)?;
    }
    cpu_stage_1(&mut stages, buffers)
}
105+
106+
fn cpu_stage_1(stages: &mut CpuSteps, buffers: &mut Buffers) -> ControlFlow<()> {
107+
let meta = stages.start_stage(PipelineStep::One)?;
108+
let a = buffers.a.read(meta);
109+
let b = buffers.b.write(meta);
110+
let c = buffers.c.read_write(meta);
111+
if meta.run {
112+
stage_1::stage_1(a, &*b, c);
113+
}
114+
ControlFlow::Continue(())
115+
}
116+
117+
/// Stand-in module for the implementation of the first model stage.
mod stage_1 {
    /// Placeholder stage body.
    ///
    /// Takes `a` and `b` as read-only slices and `c` as a mutable slice; currently does
    /// nothing with any of them.
    pub fn stage_1(a: &[u8], b: &[u16], c: &mut [u16]) {
        // Not implemented yet.
    }
}

0 commit comments

Comments
 (0)