44//!
55//! Algorithm:
66//! 1. Vertex transform (sequential — `vertex_shader` needs `&mut self`)
7- //! 2. Parallel rasterization: collect ALL fragments per pixel (no backface culling)
8- //! 3. Merge per- thread fragment buffers
9- //! 4. Depth resolve: find closest fragment per pixel
10- //! 5. Deferred shading: shade only winner fragments
11- //! 6 . Write to output buffer
7+ //! 2. Parallel rasterization with per-thread depth testing:
8+ //! each thread keeps only the closest fragment per pixel (NO backface culling)
9+ //! 3. Parallel merge + deferred shading:
10+ //! find closest fragment across threads and shade only winners (in parallel)
11+ //! 4. Write to output buffer
1212
1313use log:: debug;
1414use std:: time:: Instant ;
1515
1616use rayon:: prelude:: * ;
1717
18+ use crate :: color:: Color ;
1819use crate :: fragment:: Fragment ;
20+ use crate :: math:: { Vec2 , Vec3 } ;
1921use crate :: model:: Model ;
2022use crate :: rasterizer:: Rasterizer ;
2123use crate :: renderers:: base;
2224use crate :: renderers:: Renderer ;
2325use crate :: shader:: Shader ;
2426
25- /// AoS deferred renderer: collect all fragments, then shade only the winners.
27+ /// AoS deferred renderer: collect closest fragments per thread, then shade only the winners.
2628///
2729/// Key difference from `PerTriangleRenderer`:
2830/// - NO backface culling — every face is rasterized, but only the closest fragment per pixel is kept
@@ -41,12 +43,6 @@ impl DeferredRenderer {
4143 }
4244}
4345
44- /// Per-pixel fragment entry: stores fragment + face index for material lookup.
45- struct FragmentEntry {
46- fragment : Fragment ,
47- face_index : usize ,
48- }
49-
5046impl Renderer for DeferredRenderer {
5147 fn render (
5248 & self ,
@@ -74,27 +70,38 @@ impl Renderer for DeferredRenderer {
7470 let vertex_ms = t. elapsed ( ) . as_secs_f64 ( ) * 1000.0 ;
7571
7672 let t = Instant :: now ( ) ;
77- // 3. Parallel rasterization: collect ALL fragments (NO backface culling)
73+ // 3. Parallel rasterization with per-thread depth testing
74+ //
75+ // Each thread keeps only the CLOSEST fragment per pixel, drastically
76+ // reducing memory from O(threads × pixels × fragments_per_pixel) to
77+ // O(threads × pixels).
7878 let num_pixels = width * height;
7979 let faces = model. faces ( ) ;
8080 let rasterizer = Rasterizer :: new ( width, height) ;
8181 let num_threads = rayon:: current_num_threads ( ) ;
8282 let chunk_size = std:: cmp:: max ( faces. len ( ) / num_threads, 1 ) ;
8383
84- // Per-thread fragment buffers: Vec<Vec<FragmentEntry>> per pixel
85- let chunk_results: Vec < Vec < Vec < FragmentEntry > > > = faces
84+ // Dummy fragment for buffer initialization (never read — only valid
85+ // entries where depth_buf < INFINITY are accessed during merge).
86+ let dummy = Fragment {
87+ screen_coord : [ 0 , 0 ] ,
88+ normal : Vec3 :: ZERO ,
89+ uv : Vec2 :: ZERO ,
90+ color : Color :: new ( 0 , 0 , 0 , 0 ) ,
91+ depth : f32:: INFINITY ,
92+ } ;
93+
94+ // Per-thread result: (depth_buf, fragment_buf, face_index_buf)
95+ let chunk_results: Vec < ( Vec < f32 > , Vec < Fragment > , Vec < usize > ) > = faces
8696 . par_chunks ( chunk_size)
8797 . enumerate ( )
88- . map ( |( _chunk_idx, face_chunk) | {
89- let mut pixel_fragments: Vec < Vec < FragmentEntry > > =
90- ( 0 ..num_pixels) . map ( |_| Vec :: new ( ) ) . collect ( ) ;
91-
92- // Compute starting face index for this chunk
93- let chunk_start = face_chunk. as_ptr ( ) as usize - faces. as_ptr ( ) as usize ;
94- let chunk_start_idx = chunk_start / std:: mem:: size_of_val ( & faces[ 0 ] ) ;
98+ . map ( |( chunk_idx, face_chunk) | {
99+ let mut depth_buf = vec ! [ f32 :: INFINITY ; num_pixels] ;
100+ let mut frag_buf = vec ! [ dummy. clone( ) ; num_pixels] ;
101+ let mut face_buf = vec ! [ 0usize ; num_pixels] ;
95102
96103 for ( local_idx, face) in face_chunk. iter ( ) . enumerate ( ) {
97- let face_idx = chunk_start_idx + local_idx;
104+ let face_idx = chunk_idx * chunk_size + local_idx;
98105 let v0 = & processed_vertices[ face. indices [ 0 ] ] ;
99106 let v1 = & processed_vertices[ face. indices [ 1 ] ] ;
100107 let v2 = & processed_vertices[ face. indices [ 2 ] ] ;
@@ -114,44 +121,50 @@ impl Renderer for DeferredRenderer {
114121 continue ;
115122 }
116123 let idx = x + y * width;
117- pixel_fragments[ idx] . push ( FragmentEntry {
118- fragment : frag,
119- face_index : face_idx,
120- } ) ;
124+ // Per-thread depth test: keep only the closest fragment
125+ if frag. depth < depth_buf[ idx] {
126+ depth_buf[ idx] = frag. depth ;
127+ frag_buf[ idx] = frag;
128+ face_buf[ idx] = face_idx;
129+ }
121130 }
122131 }
123132
124- pixel_fragments
133+ ( depth_buf , frag_buf , face_buf )
125134 } )
126135 . collect ( ) ;
127136
128137 let collect_ms = t. elapsed ( ) . as_secs_f64 ( ) * 1000.0 ;
129138
130- // 4. Merge per-thread fragment buffers + depth resolve + deferred shading
139+ // 4. Parallel merge + deferred shading
140+ //
141+ // For each pixel, find the closest fragment across all threads,
142+ // then shade only that winner. Both merge and shade run in parallel.
131143 let t = Instant :: now ( ) ;
132- // For each pixel: collect from all threads, find min depth, shade winner
133- for i in 0 ..num_pixels {
134- let mut best_entry: Option < ( & FragmentEntry , f32 ) > = None ;
135-
136- for thread_buf in & chunk_results {
137- for entry in & thread_buf[ i] {
138- let depth = entry. fragment . depth ;
139- match best_entry {
140- None => best_entry = Some ( ( entry, depth) ) ,
141- Some ( ( _, best_depth) ) if depth < best_depth => {
142- best_entry = Some ( ( entry, depth) ) ;
143- }
144- _ => { }
144+ let final_buffer: Vec < u32 > = ( 0 ..num_pixels)
145+ . into_par_iter ( )
146+ . map ( |i| {
147+ let mut best_depth = f32:: INFINITY ;
148+ let mut best_chunk: Option < usize > = None ;
149+
150+ for ( chunk_idx, ( depth_buf, _, _) ) in chunk_results. iter ( ) . enumerate ( ) {
151+ if depth_buf[ i] < best_depth {
152+ best_depth = depth_buf[ i] ;
153+ best_chunk = Some ( chunk_idx) ;
145154 }
146155 }
147- }
148156
149- if let Some ( ( winner, _) ) = best_entry {
150- let material = & faces[ winner. face_index ] . material ;
151- let color = shader. fragment_shader ( & winner. fragment , material) ;
152- out_buffer[ i] = u32:: from ( color) ;
153- }
154- }
157+ if let Some ( chunk_idx) = best_chunk {
158+ let winner_frag = & chunk_results[ chunk_idx] . 1 [ i] ;
159+ let winner_face_idx = chunk_results[ chunk_idx] . 2 [ i] ;
160+ let material = & faces[ winner_face_idx] . material ;
161+ u32:: from ( shader. fragment_shader ( winner_frag, material) )
162+ } else {
163+ 0u32
164+ }
165+ } )
166+ . collect ( ) ;
167+ out_buffer[ ..num_pixels] . copy_from_slice ( & final_buffer) ;
155168 let shade_ms = t. elapsed ( ) . as_secs_f64 ( ) * 1000.0 ;
156169
157170 let sum_ms = vertex_ms + collect_ms + shade_ms;
@@ -173,9 +186,8 @@ impl Renderer for DeferredRenderer {
173186#[ cfg( test) ]
174187mod tests {
175188 use super :: * ;
176- use crate :: color:: Color ;
177189 use crate :: light:: Light ;
178- use crate :: math:: { Mat4 , Vec3 } ;
190+ use crate :: math:: Mat4 ;
179191
180192 /// Set up a shader with identity matrices and a simple light for testing.
181193 fn test_shader ( ) -> Shader {
0 commit comments