@@ -751,21 +751,14 @@ SimpleRenderer::TileRenderStats SimpleRenderer::ExecuteTileBasedPipeline(
751751 auto binning_duration = std::chrono::duration_cast<std::chrono::microseconds>(
752752 binning_end_time - binning_start_time);
753753
754- // 3. 为每个线程创建framebuffer
754+ // 3. 全局 framebuffer(单份)
755+ // 直接让每个 tile 写入这份全局缓冲区,避免末端 O(W*H*kNProc) 合并开销
755756 auto buffer_alloc_start_time = std::chrono::high_resolution_clock::now ();
756- std::vector<std::unique_ptr<float []>> depthBuffer_all_thread (kNProc );
757- std::vector<std::unique_ptr<uint32_t []>> colorBuffer_all_thread (kNProc );
758-
759- for (size_t thread_id = 0 ; thread_id < kNProc ; thread_id++) {
760- depthBuffer_all_thread[thread_id] =
761- std::make_unique<float []>(width_ * height_);
762- colorBuffer_all_thread[thread_id] =
763- std::make_unique<uint32_t []>(width_ * height_);
764-
765- std::fill_n (depthBuffer_all_thread[thread_id].get (), width_ * height_,
766- std::numeric_limits<float >::infinity ());
767- std::fill_n (colorBuffer_all_thread[thread_id].get (), width_ * height_, 0 );
768- }
757+ std::unique_ptr<float []> depthBuffer = std::make_unique<float []>(width_ * height_);
758+ std::unique_ptr<uint32_t []> colorBuffer = std::make_unique<uint32_t []>(width_ * height_);
759+ // 深度初始化为最远值,颜色清零
760+ std::fill_n (depthBuffer.get (), width_ * height_, std::numeric_limits<float >::infinity ());
761+ std::fill_n (colorBuffer.get (), width_ * height_, 0 );
769762 auto buffer_alloc_end_time = std::chrono::high_resolution_clock::now ();
770763 auto buffer_alloc_duration = std::chrono::duration_cast<std::chrono::microseconds>(
771764 buffer_alloc_end_time - buffer_alloc_start_time);
@@ -774,14 +767,12 @@ SimpleRenderer::TileRenderStats SimpleRenderer::ExecuteTileBasedPipeline(
774767 auto rasterization_start_time = std::chrono::high_resolution_clock::now ();
775768#pragma omp parallel num_threads(kNProc) default(none) \
776769 shared (tile_triangles, rasterizer_, shader_, width_, height_, \
777- depthBuffer_all_thread, colorBuffer_all_thread , tiles_x, tiles_y, total_tiles, \
770+ depthBuffer, colorBuffer , tiles_x, tiles_y, total_tiles, \
778771 early_z_enabled_, soa)
779772 {
780773 int thread_id = omp_get_thread_num ();
781- auto &depthBuffer_per_thread = depthBuffer_all_thread[thread_id];
782- auto &colorBuffer_per_thread = colorBuffer_all_thread[thread_id];
783774
784- // 为当前线程创建tile局部缓冲区
775+ // 为当前线程创建 tile 局部缓冲区(避免在全局缓冲上直接逐像素竞争)
785776 std::unique_ptr<float []> tile_depth_buffer =
786777 std::make_unique<float []>(TILE_SIZE * TILE_SIZE);
787778 std::unique_ptr<uint32_t []> tile_color_buffer =
@@ -794,48 +785,24 @@ SimpleRenderer::TileRenderStats SimpleRenderer::ExecuteTileBasedPipeline(
794785#pragma omp for
795786 for (size_t tile_id = 0 ; tile_id < total_tiles; tile_id++) {
796787 // 按照 tile 进行光栅化(SoA)
788+ // 直接写入单份全局 framebuffer;不同 tile 不重叠,无需加锁
797789 RasterizeTile (tile_id, tile_triangles[tile_id],
798- tiles_x, tiles_y, TILE_SIZE,
799- tile_depth_buffer.get (), tile_color_buffer.get (),
800- depthBuffer_per_thread, colorBuffer_per_thread ,
801- soa, early_z_enabled_, &scratch_fragments);
790+ tiles_x, tiles_y, TILE_SIZE,
791+ tile_depth_buffer.get (), tile_color_buffer.get (),
792+ depthBuffer, colorBuffer ,
793+ soa, early_z_enabled_, &scratch_fragments);
802794 }
803795 }
804796 auto rasterization_end_time = std::chrono::high_resolution_clock::now ();
805797 auto rasterization_duration = std::chrono::duration_cast<std::chrono::microseconds>(
806798 rasterization_end_time - rasterization_start_time);
807799
808- // 5. 合并所有线程结果
809- auto merge_start_time = std::chrono::high_resolution_clock::now ();
810- std::unique_ptr<float []> depthBuffer =
811- std::make_unique<float []>(width_ * height_);
812- std::unique_ptr<uint32_t []> colorBuffer =
813- std::make_unique<uint32_t []>(width_ * height_);
814-
815- std::fill_n (depthBuffer.get (), width_ * height_,
816- std::numeric_limits<float >::infinity ());
817- std::fill_n (colorBuffer.get (), width_ * height_, 0 );
818-
819- #pragma omp parallel for
820- for (size_t i = 0 ; i < width_ * height_; i++) {
821- float min_depth = std::numeric_limits<float >::infinity ();
822- uint32_t color = 0 ;
823-
824- for (size_t thread_id = 0 ; thread_id < kNProc ; thread_id++) {
825- float depth = depthBuffer_all_thread[thread_id][i];
826- if (depth < min_depth) {
827- min_depth = depth;
828- color = colorBuffer_all_thread[thread_id][i];
829- }
830- }
831- depthBuffer[i] = min_depth;
832- colorBuffer[i] = color;
833- }
834-
800+ // 5. 直接将单份全局 colorBuffer 拷贝到输出
801+ auto present_start_time = std::chrono::high_resolution_clock::now ();
835802 std::memcpy (buffer, colorBuffer.get (), width_ * height_ * sizeof (uint32_t ));
836- auto merge_end_time = std::chrono::high_resolution_clock::now ();
837- auto merge_duration = std::chrono::duration_cast<std::chrono::microseconds>(
838- merge_end_time - merge_start_time );
803+ auto present_end_time = std::chrono::high_resolution_clock::now ();
804+ auto present_duration = std::chrono::duration_cast<std::chrono::microseconds>(
805+ present_end_time - present_start_time );
839806
840807 auto total_end_time = std::chrono::high_resolution_clock::now ();
841808 auto total_duration = std::chrono::duration_cast<std::chrono::microseconds>(
@@ -846,7 +813,8 @@ SimpleRenderer::TileRenderStats SimpleRenderer::ExecuteTileBasedPipeline(
846813 stats.binning_ms = binning_duration.count () / 1000.0 ;
847814 stats.buffer_alloc_ms = buffer_alloc_duration.count () / 1000.0 ;
848815 stats.rasterization_ms = rasterization_duration.count () / 1000.0 ;
849- stats.merge_ms = merge_duration.count () / 1000.0 ;
816+ // 合并阶段已被消除,仅为拷贝开销
817+ stats.merge_ms = present_duration.count () / 1000.0 ;
850818 stats.total_ms = total_duration.count () / 1000.0 ;
851819
852820 return stats;
0 commit comments