Skip to content

Commit 807d1aa

Browse files
committed
perf: update
Signed-off-by: Niu Zhihong <zhihong@nzhnb.com>
1 parent 61c1f1a commit 807d1aa

2 files changed

Lines changed: 75 additions & 71 deletions

File tree

simple_renderer/src/renderers/deferred.rs

Lines changed: 63 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -4,25 +4,27 @@
44
//!
55
//! Algorithm:
66
//! 1. Vertex transform (sequential — `vertex_shader` needs `&mut self`)
7-
//! 2. Parallel rasterization: collect ALL fragments per pixel (no backface culling)
8-
//! 3. Merge per-thread fragment buffers
9-
//! 4. Depth resolve: find closest fragment per pixel
10-
//! 5. Deferred shading: shade only winner fragments
11-
//! 6. Write to output buffer
7+
//! 2. Parallel rasterization with per-thread depth testing:
8+
//! each thread keeps only the closest fragment per pixel (NO backface culling)
9+
//! 3. Parallel merge + deferred shading:
10+
//! find closest fragment across threads and shade only winners (in parallel)
11+
//! 4. Write to output buffer
1212
1313
use log::debug;
1414
use std::time::Instant;
1515

1616
use rayon::prelude::*;
1717

18+
use crate::color::Color;
1819
use crate::fragment::Fragment;
20+
use crate::math::{Vec2, Vec3};
1921
use crate::model::Model;
2022
use crate::rasterizer::Rasterizer;
2123
use crate::renderers::base;
2224
use crate::renderers::Renderer;
2325
use crate::shader::Shader;
2426

25-
/// AoS deferred renderer: collect all fragments, then shade only the winners.
27+
/// AoS deferred renderer: collect closest fragments per thread, then shade only the winners.
2628
///
2729
/// Key difference from `PerTriangleRenderer`:
2830
/// - NO backface culling — fragments from every face are depth-tested; only the closest per pixel is kept
@@ -41,12 +43,6 @@ impl DeferredRenderer {
4143
}
4244
}
4345

44-
/// Per-pixel fragment entry: stores fragment + face index for material lookup.
45-
struct FragmentEntry {
46-
fragment: Fragment,
47-
face_index: usize,
48-
}
49-
5046
impl Renderer for DeferredRenderer {
5147
fn render(
5248
&self,
@@ -74,27 +70,38 @@ impl Renderer for DeferredRenderer {
7470
let vertex_ms = t.elapsed().as_secs_f64() * 1000.0;
7571

7672
let t = Instant::now();
77-
// 3. Parallel rasterization: collect ALL fragments (NO backface culling)
73+
// 3. Parallel rasterization with per-thread depth testing
74+
//
75+
// Each thread keeps only the CLOSEST fragment per pixel, drastically
76+
// reducing memory from O(threads × pixels × fragments_per_pixel) to
77+
// O(threads × pixels).
7878
let num_pixels = width * height;
7979
let faces = model.faces();
8080
let rasterizer = Rasterizer::new(width, height);
8181
let num_threads = rayon::current_num_threads();
8282
let chunk_size = std::cmp::max(faces.len() / num_threads, 1);
8383

84-
// Per-thread fragment buffers: Vec<Vec<FragmentEntry>> per pixel
85-
let chunk_results: Vec<Vec<Vec<FragmentEntry>>> = faces
84+
// Dummy fragment for buffer initialization (never read — only valid
85+
// entries where depth_buf < INFINITY are accessed during merge).
86+
let dummy = Fragment {
87+
screen_coord: [0, 0],
88+
normal: Vec3::ZERO,
89+
uv: Vec2::ZERO,
90+
color: Color::new(0, 0, 0, 0),
91+
depth: f32::INFINITY,
92+
};
93+
94+
// Per-chunk result (one tuple per `par_chunks` chunk): (depth_buf, fragment_buf, face_index_buf)
95+
let chunk_results: Vec<(Vec<f32>, Vec<Fragment>, Vec<usize>)> = faces
8696
.par_chunks(chunk_size)
8797
.enumerate()
88-
.map(|(_chunk_idx, face_chunk)| {
89-
let mut pixel_fragments: Vec<Vec<FragmentEntry>> =
90-
(0..num_pixels).map(|_| Vec::new()).collect();
91-
92-
// Compute starting face index for this chunk
93-
let chunk_start = face_chunk.as_ptr() as usize - faces.as_ptr() as usize;
94-
let chunk_start_idx = chunk_start / std::mem::size_of_val(&faces[0]);
98+
.map(|(chunk_idx, face_chunk)| {
99+
let mut depth_buf = vec![f32::INFINITY; num_pixels];
100+
let mut frag_buf = vec![dummy.clone(); num_pixels];
101+
let mut face_buf = vec![0usize; num_pixels];
95102

96103
for (local_idx, face) in face_chunk.iter().enumerate() {
97-
let face_idx = chunk_start_idx + local_idx;
104+
let face_idx = chunk_idx * chunk_size + local_idx;
98105
let v0 = &processed_vertices[face.indices[0]];
99106
let v1 = &processed_vertices[face.indices[1]];
100107
let v2 = &processed_vertices[face.indices[2]];
@@ -114,44 +121,50 @@ impl Renderer for DeferredRenderer {
114121
continue;
115122
}
116123
let idx = x + y * width;
117-
pixel_fragments[idx].push(FragmentEntry {
118-
fragment: frag,
119-
face_index: face_idx,
120-
});
124+
// Per-thread depth test: keep only the closest fragment
125+
if frag.depth < depth_buf[idx] {
126+
depth_buf[idx] = frag.depth;
127+
frag_buf[idx] = frag;
128+
face_buf[idx] = face_idx;
129+
}
121130
}
122131
}
123132

124-
pixel_fragments
133+
(depth_buf, frag_buf, face_buf)
125134
})
126135
.collect();
127136

128137
let collect_ms = t.elapsed().as_secs_f64() * 1000.0;
129138

130-
// 4. Merge per-thread fragment buffers + depth resolve + deferred shading
139+
// 4. Parallel merge + deferred shading
140+
//
141+
// For each pixel, find the closest fragment across all threads,
142+
// then shade only that winner. Both merge and shade run in parallel.
131143
let t = Instant::now();
132-
// For each pixel: collect from all threads, find min depth, shade winner
133-
for i in 0..num_pixels {
134-
let mut best_entry: Option<(&FragmentEntry, f32)> = None;
135-
136-
for thread_buf in &chunk_results {
137-
for entry in &thread_buf[i] {
138-
let depth = entry.fragment.depth;
139-
match best_entry {
140-
None => best_entry = Some((entry, depth)),
141-
Some((_, best_depth)) if depth < best_depth => {
142-
best_entry = Some((entry, depth));
143-
}
144-
_ => {}
144+
let final_buffer: Vec<u32> = (0..num_pixels)
145+
.into_par_iter()
146+
.map(|i| {
147+
let mut best_depth = f32::INFINITY;
148+
let mut best_chunk: Option<usize> = None;
149+
150+
for (chunk_idx, (depth_buf, _, _)) in chunk_results.iter().enumerate() {
151+
if depth_buf[i] < best_depth {
152+
best_depth = depth_buf[i];
153+
best_chunk = Some(chunk_idx);
145154
}
146155
}
147-
}
148156

149-
if let Some((winner, _)) = best_entry {
150-
let material = &faces[winner.face_index].material;
151-
let color = shader.fragment_shader(&winner.fragment, material);
152-
out_buffer[i] = u32::from(color);
153-
}
154-
}
157+
if let Some(chunk_idx) = best_chunk {
158+
let winner_frag = &chunk_results[chunk_idx].1[i];
159+
let winner_face_idx = chunk_results[chunk_idx].2[i];
160+
let material = &faces[winner_face_idx].material;
161+
u32::from(shader.fragment_shader(winner_frag, material))
162+
} else {
163+
0u32
164+
}
165+
})
166+
.collect();
167+
out_buffer[..num_pixels].copy_from_slice(&final_buffer);
155168
let shade_ms = t.elapsed().as_secs_f64() * 1000.0;
156169

157170
let sum_ms = vertex_ms + collect_ms + shade_ms;
@@ -173,9 +186,8 @@ impl Renderer for DeferredRenderer {
173186
#[cfg(test)]
174187
mod tests {
175188
use super::*;
176-
use crate::color::Color;
177189
use crate::light::Light;
178-
use crate::math::{Mat4, Vec3};
190+
use crate::math::Mat4;
179191

180192
/// Set up a shader with identity matrices and a simple light for testing.
181193
fn test_shader() -> Shader {

simple_renderer/src/renderers/tile_based_deferred.rs

Lines changed: 12 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -93,16 +93,12 @@ impl Renderer for TileBasedDeferredRenderer {
9393
let tile_triangles = tile_common::triangle_tile_binning(model, &grid);
9494
let binning_ms = t.elapsed().as_secs_f64() * 1000.0;
9595

96-
// 5. Global framebuffer
97-
let num_pixels = width * height;
98-
let mut global_color = vec![COLOR_CLEAR; num_pixels];
99-
let mut global_depth = vec![DEPTH_CLEAR; num_pixels];
10096

10197
let t = Instant::now();
10298
// 6. Parallel 2-pass rasterization per tile
10399
let total_tiles = tiles_x * tiles_y;
104100

105-
let tile_results: Vec<(Vec<f32>, Vec<u32>, usize, usize, usize, usize)> =
101+
let tile_results: Vec<(Vec<u32>, usize, usize, usize, usize)> =
106102
(0..total_tiles)
107103
.into_par_iter()
108104
.map(|tile_id| {
@@ -136,8 +132,9 @@ impl Renderer for TileBasedDeferredRenderer {
136132
height,
137133
);
138134

135+
// tile_depth is only used as z-buffer within
136+
// rasterize_tile_deferred — no need to return it
139137
(
140-
tile_depth,
141138
tile_color,
142139
screen_x_start,
143140
screen_y_start,
@@ -149,23 +146,18 @@ impl Renderer for TileBasedDeferredRenderer {
149146

150147
let raster_ms = t.elapsed().as_secs_f64() * 1000.0;
151148

152-
// 7. Copy tile results to global framebuffer
149+
// 7. Copy tile results directly to output buffer
153150
let t = Instant::now();
154-
for (tile_depth, tile_color, sx, sy, tw, th) in &tile_results {
151+
for (tile_color, sx, sy, tw, th) in &tile_results {
155152
for y in 0..*th {
156153
let tile_row_off = y * tw;
157-
let global_row_off = (sy + y) * width + sx;
158-
global_color[global_row_off..global_row_off + tw]
154+
let out_row_off = (sy + y) * width + sx;
155+
out_buffer[out_row_off..out_row_off + tw]
159156
.copy_from_slice(&tile_color[tile_row_off..tile_row_off + tw]);
160-
global_depth[global_row_off..global_row_off + tw]
161-
.copy_from_slice(&tile_depth[tile_row_off..tile_row_off + tw]);
162157
}
163158
}
164159
let copy_ms = t.elapsed().as_secs_f64() * 1000.0;
165160

166-
// 8. Copy to output
167-
out_buffer[..num_pixels].copy_from_slice(&global_color);
168-
169161
let sum_ms = vertex_ms + setup_ms + binning_ms + raster_ms + copy_ms;
170162
if sum_ms > 0.0 {
171163
debug!("=== TILE-BASED DEFERRED RENDERING PERFORMANCE ===");
@@ -204,7 +196,7 @@ fn rasterize_tile_deferred(
204196
let tile_pixels = tile_width * tile_height;
205197

206198
// Per-pixel state for 2-pass
207-
let mut zmin = vec![DEPTH_CLEAR; tile_pixels];
199+
// tile_depth is used as zmin buffer (Pass A) and output depth (Pass B)
208200
let mut winner: Vec<i32> = vec![-1; tile_pixels];
209201
let mut b0c_buf = vec![0.0f32; tile_pixels];
210202
let mut b1c_buf = vec![0.0f32; tile_pixels];
@@ -319,8 +311,8 @@ fn rasterize_tile_deferred(
319311
let local_y = (y - screen_y_start as i32) as usize;
320312
let idx = local_x + local_y * tile_width;
321313

322-
if z < zmin[idx] - 1e-8 {
323-
zmin[idx] = z;
314+
if z < tile_depth[idx] - 1e-8 {
315+
tile_depth[idx] = z;
324316
winner[idx] = tri_local_idx as i32;
325317
b0c_buf[idx] = b0c;
326318
b1c_buf[idx] = b1c;
@@ -374,11 +366,11 @@ fn rasterize_tile_deferred(
374366
normal,
375367
uv,
376368
color,
377-
depth: zmin[idx],
369+
depth: tile_depth[idx],
378370
};
379371

380372
let out_color = shader.fragment_shader(&frag, &faces[tri.face_index].material);
381-
tile_depth[idx] = frag.depth;
373+
// tile_depth[idx] already set in Pass A
382374
tile_color[idx] = u32::from(out_color);
383375
}
384376
}

0 commit comments

Comments
 (0)