Skip to content

Commit 0549211

Browse files
committed
TBR: Use global framebuffer to avoid merge overhead
Signed-off-by: ZhouFANG <indevn@outlook.com>
1 parent bb5acc1 commit 0549211

2 files changed

Lines changed: 21 additions & 119 deletions

File tree

src/rasterizer.cpp

Lines changed: 0 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -92,72 +92,6 @@ std::vector<Fragment> Rasterizer::Rasterize(const Vertex& v0, const Vertex& v1,
9292
return fragments;
9393
}
9494

95-
void Rasterizer::RasterizeTo(const Vertex& v0, const Vertex& v1, const Vertex& v2,
96-
int x0, int y0, int x1, int y1,
97-
std::vector<Fragment>& out) {
98-
// Compute the triangle's axis-aligned bounding box from the vertices' x/y positions (screen space)
99-
Vector2f a = Vector2f(v0.GetPosition().x, v0.GetPosition().y);
100-
Vector2f b = Vector2f(v1.GetPosition().x, v1.GetPosition().y);
101-
Vector2f c = Vector2f(v2.GetPosition().x, v2.GetPosition().y);
102-
103-
Vector2f bboxMin =
104-
Vector2f{std::min({a.x, b.x, c.x}), std::min({a.y, b.y, c.y})};
105-
Vector2f bboxMax =
106-
Vector2f{std::max({a.x, b.x, c.x}), std::max({a.y, b.y, c.y})};
107-
108-
// Clamp the bounding box to the screen: [0, width_ - 1] x [0, height_ - 1]
109-
float minx = std::max(0.0f, bboxMin.x);
110-
float miny = std::max(0.0f, bboxMin.y);
111-
float maxx = std::min(float(width_ - 1), bboxMax.x);
112-
float maxy = std::min(float(height_ - 1), bboxMax.y);
113-
114-
// Intersect with the caller-supplied clip region (half-open: [x0, x1) x [y0, y1)) and convert to inclusive bounds for scanning
115-
int sx = std::max(x0, int(std::floor(minx)));
116-
int sy = std::max(y0, int(std::floor(miny)));
117-
int ex = std::min(x1 - 1, int(std::floor(maxx)));
118-
int ey = std::min(y1 - 1, int(std::floor(maxy)));
119-
120-
if (sx > ex || sy > ey) {
121-
return; // No overlap with the clip region
122-
}
123-
124-
// Perspective-correct interpolation uses the same logic as Rasterize, but appends to `out` from a single thread. NOTE(review): the *_inv names suggest GetPosition().w already holds 1/w -- confirm.
125-
float w0_inv = v0.GetPosition().w;
126-
float w1_inv = v1.GetPosition().w;
127-
float w2_inv = v2.GetPosition().w;
128-
129-
for (int x = sx; x <= ex; ++x) {
130-
for (int y = sy; y <= ey; ++y) {
131-
auto [is_inside, barycentric_coord] = GetBarycentricCoord(
132-
v0.GetPosition(), v1.GetPosition(), v2.GetPosition(),
133-
Vector3f(static_cast<float>(x), static_cast<float>(y), 0));
134-
if (!is_inside) continue;
135-
136-
// Interpolate 1/w, then rescale the barycentric weights to apply perspective correction
137-
float w_inv_interpolated = Interpolate(w0_inv, w1_inv, w2_inv, barycentric_coord);
138-
Vector3f corrected_bary(
139-
barycentric_coord.x * w0_inv / w_inv_interpolated,
140-
barycentric_coord.y * w1_inv / w_inv_interpolated,
141-
barycentric_coord.z * w2_inv / w_inv_interpolated);
142-
143-
auto z = Interpolate(v0.GetPosition().z, v1.GetPosition().z,
144-
v2.GetPosition().z, corrected_bary);
145-
146-
Fragment fragment;
147-
fragment.screen_coord = {x, y};
148-
fragment.normal = Interpolate(v0.GetNormal(), v1.GetNormal(),
149-
v2.GetNormal(), corrected_bary);
150-
fragment.uv = Interpolate(v0.GetTexCoords(), v1.GetTexCoords(),
151-
v2.GetTexCoords(), corrected_bary);
152-
fragment.color = InterpolateColor(v0.GetColor(), v1.GetColor(),
153-
v2.GetColor(), corrected_bary);
154-
fragment.depth = z;
155-
156-
out.push_back(fragment);
157-
}
158-
}
159-
}
160-
16195
void Rasterizer::RasterizeTo(const VertexSoA& soa, size_t i0, size_t i1, size_t i2,
16296
int x0, int y0, int x1, int y1,
16397
std::vector<Fragment>& out) {

src/renderer.cpp

Lines changed: 21 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -751,21 +751,14 @@ SimpleRenderer::TileRenderStats SimpleRenderer::ExecuteTileBasedPipeline(
751751
auto binning_duration = std::chrono::duration_cast<std::chrono::microseconds>(
752752
binning_end_time - binning_start_time);
753753

754-
// 3. 为每个线程创建framebuffer
754+
// 3. 全局 framebuffer(单份)
755+
// 直接让每个 tile 写入这份全局缓冲区,避免末端 O(W*H*kNProc) 合并开销
755756
auto buffer_alloc_start_time = std::chrono::high_resolution_clock::now();
756-
std::vector<std::unique_ptr<float[]>> depthBuffer_all_thread(kNProc);
757-
std::vector<std::unique_ptr<uint32_t[]>> colorBuffer_all_thread(kNProc);
758-
759-
for (size_t thread_id = 0; thread_id < kNProc; thread_id++) {
760-
depthBuffer_all_thread[thread_id] =
761-
std::make_unique<float[]>(width_ * height_);
762-
colorBuffer_all_thread[thread_id] =
763-
std::make_unique<uint32_t[]>(width_ * height_);
764-
765-
std::fill_n(depthBuffer_all_thread[thread_id].get(), width_ * height_,
766-
std::numeric_limits<float>::infinity());
767-
std::fill_n(colorBuffer_all_thread[thread_id].get(), width_ * height_, 0);
768-
}
757+
std::unique_ptr<float[]> depthBuffer = std::make_unique<float[]>(width_ * height_);
758+
std::unique_ptr<uint32_t[]> colorBuffer = std::make_unique<uint32_t[]>(width_ * height_);
759+
// 深度初始化为最远值,颜色清零
760+
std::fill_n(depthBuffer.get(), width_ * height_, std::numeric_limits<float>::infinity());
761+
std::fill_n(colorBuffer.get(), width_ * height_, 0);
769762
auto buffer_alloc_end_time = std::chrono::high_resolution_clock::now();
770763
auto buffer_alloc_duration = std::chrono::duration_cast<std::chrono::microseconds>(
771764
buffer_alloc_end_time - buffer_alloc_start_time);
@@ -774,14 +767,12 @@ SimpleRenderer::TileRenderStats SimpleRenderer::ExecuteTileBasedPipeline(
774767
auto rasterization_start_time = std::chrono::high_resolution_clock::now();
775768
#pragma omp parallel num_threads(kNProc) default(none) \
776769
shared(tile_triangles, rasterizer_, shader_, width_, height_, \
777-
depthBuffer_all_thread, colorBuffer_all_thread, tiles_x, tiles_y, total_tiles, \
770+
depthBuffer, colorBuffer, tiles_x, tiles_y, total_tiles, \
778771
early_z_enabled_, soa)
779772
{
780773
int thread_id = omp_get_thread_num();
781-
auto &depthBuffer_per_thread = depthBuffer_all_thread[thread_id];
782-
auto &colorBuffer_per_thread = colorBuffer_all_thread[thread_id];
783774

784-
// 为当前线程创建tile局部缓冲区
775+
// 为当前线程创建 tile 局部缓冲区(避免在全局缓冲上直接逐像素竞争)
785776
std::unique_ptr<float[]> tile_depth_buffer =
786777
std::make_unique<float[]>(TILE_SIZE * TILE_SIZE);
787778
std::unique_ptr<uint32_t[]> tile_color_buffer =
@@ -794,48 +785,24 @@ SimpleRenderer::TileRenderStats SimpleRenderer::ExecuteTileBasedPipeline(
794785
#pragma omp for
795786
for (size_t tile_id = 0; tile_id < total_tiles; tile_id++) {
796787
// 按照 tile 进行光栅化(SoA)
788+
// 直接写入单份全局 framebuffer;不同 tile 不重叠,无需加锁
797789
RasterizeTile(tile_id, tile_triangles[tile_id],
798-
tiles_x, tiles_y, TILE_SIZE,
799-
tile_depth_buffer.get(), tile_color_buffer.get(),
800-
depthBuffer_per_thread, colorBuffer_per_thread,
801-
soa, early_z_enabled_, &scratch_fragments);
790+
tiles_x, tiles_y, TILE_SIZE,
791+
tile_depth_buffer.get(), tile_color_buffer.get(),
792+
depthBuffer, colorBuffer,
793+
soa, early_z_enabled_, &scratch_fragments);
802794
}
803795
}
804796
auto rasterization_end_time = std::chrono::high_resolution_clock::now();
805797
auto rasterization_duration = std::chrono::duration_cast<std::chrono::microseconds>(
806798
rasterization_end_time - rasterization_start_time);
807799

808-
// 5. 合并所有线程结果
809-
auto merge_start_time = std::chrono::high_resolution_clock::now();
810-
std::unique_ptr<float[]> depthBuffer =
811-
std::make_unique<float[]>(width_ * height_);
812-
std::unique_ptr<uint32_t[]> colorBuffer =
813-
std::make_unique<uint32_t[]>(width_ * height_);
814-
815-
std::fill_n(depthBuffer.get(), width_ * height_,
816-
std::numeric_limits<float>::infinity());
817-
std::fill_n(colorBuffer.get(), width_ * height_, 0);
818-
819-
#pragma omp parallel for
820-
for (size_t i = 0; i < width_ * height_; i++) {
821-
float min_depth = std::numeric_limits<float>::infinity();
822-
uint32_t color = 0;
823-
824-
for (size_t thread_id = 0; thread_id < kNProc; thread_id++) {
825-
float depth = depthBuffer_all_thread[thread_id][i];
826-
if (depth < min_depth) {
827-
min_depth = depth;
828-
color = colorBuffer_all_thread[thread_id][i];
829-
}
830-
}
831-
depthBuffer[i] = min_depth;
832-
colorBuffer[i] = color;
833-
}
834-
800+
// 5. 直接将单份全局 colorBuffer 拷贝到输出
801+
auto present_start_time = std::chrono::high_resolution_clock::now();
835802
std::memcpy(buffer, colorBuffer.get(), width_ * height_ * sizeof(uint32_t));
836-
auto merge_end_time = std::chrono::high_resolution_clock::now();
837-
auto merge_duration = std::chrono::duration_cast<std::chrono::microseconds>(
838-
merge_end_time - merge_start_time);
803+
auto present_end_time = std::chrono::high_resolution_clock::now();
804+
auto present_duration = std::chrono::duration_cast<std::chrono::microseconds>(
805+
present_end_time - present_start_time);
839806

840807
auto total_end_time = std::chrono::high_resolution_clock::now();
841808
auto total_duration = std::chrono::duration_cast<std::chrono::microseconds>(
@@ -846,7 +813,8 @@ SimpleRenderer::TileRenderStats SimpleRenderer::ExecuteTileBasedPipeline(
846813
stats.binning_ms = binning_duration.count() / 1000.0;
847814
stats.buffer_alloc_ms = buffer_alloc_duration.count() / 1000.0;
848815
stats.rasterization_ms = rasterization_duration.count() / 1000.0;
849-
stats.merge_ms = merge_duration.count() / 1000.0;
816+
// 合并阶段已被消除,仅为拷贝开销
817+
stats.merge_ms = present_duration.count() / 1000.0;
850818
stats.total_ms = total_duration.count() / 1000.0;
851819

852820
return stats;

0 commit comments

Comments
 (0)