Skip to content

Commit 1d2d9a9

Browse files
committed
TBR: Pre-allocate and reuse fragment caches; add RasterizeTo; two-pass counting in Binning to eliminate frequent dynamic memory reallocations.
Signed-off-by: ZhouFANG <indevn@outlook.com>
1 parent b57d907 commit 1d2d9a9

4 files changed

Lines changed: 224 additions & 32 deletions

File tree

src/include/rasterizer.hpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,13 @@ class Rasterizer {
2020
std::vector<Fragment> Rasterize(const Vertex& v0, const Vertex& v1,
2121
const Vertex& v2);
2222

23+
// Variant of Rasterize that appends fragments to a caller-provided container
// instead of returning a freshly built vector.
24+
// The clip region is the half-open rectangle [x0, x1) × [y0, y1); only pixels
// inside it (and inside the framebuffer) are visited.
25+
// Used by the tile-based renderer (TBR): rasterization is limited to one
// tile's bounds so an external scratch container can be reused across calls.
// `out` is appended to, not cleared — the caller resets it when needed.
26+
void RasterizeTo(const Vertex& v0, const Vertex& v1, const Vertex& v2,
27+
int x0, int y0, int x1, int y1,
28+
std::vector<Fragment>& out);
29+
2330
private:
2431
size_t width_, height_;
2532

src/include/renderer.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -190,7 +190,8 @@ class SimpleRenderer {
190190
float* tile_depth_buffer, uint32_t* tile_color_buffer,
191191
std::unique_ptr<float[]> &global_depth_buffer,
192192
std::unique_ptr<uint32_t[]> &global_color_buffer,
193-
bool use_early_z = false);
193+
bool use_early_z = false,
194+
std::vector<Fragment>* scratch_fragments = nullptr);
194195

195196

196197
/**

src/rasterizer.cpp

Lines changed: 69 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
#include "rasterizer.hpp"
22

33
#include <omp.h>
4+
#include <algorithm>
5+
#include <cmath>
46

57
namespace simple_renderer {
68

@@ -90,6 +92,72 @@ std::vector<Fragment> Rasterizer::Rasterize(const Vertex& v0, const Vertex& v1,
9092
return fragments;
9193
}
9294

95+
void Rasterizer::RasterizeTo(const Vertex& v0, const Vertex& v1, const Vertex& v2,
96+
int x0, int y0, int x1, int y1,
97+
std::vector<Fragment>& out) {
98+
// 获取三角形的最小 box(屏幕空间)
99+
Vector2f a = Vector2f(v0.GetPosition().x, v0.GetPosition().y);
100+
Vector2f b = Vector2f(v1.GetPosition().x, v1.GetPosition().y);
101+
Vector2f c = Vector2f(v2.GetPosition().x, v2.GetPosition().y);
102+
103+
Vector2f bboxMin =
104+
Vector2f{std::min({a.x, b.x, c.x}), std::min({a.y, b.y, c.y})};
105+
Vector2f bboxMax =
106+
Vector2f{std::max({a.x, b.x, c.x}), std::max({a.y, b.y, c.y})};
107+
108+
// Clamp 到屏幕尺寸
109+
float minx = std::max(0.0f, bboxMin.x);
110+
float miny = std::max(0.0f, bboxMin.y);
111+
float maxx = std::min(float(width_ - 1), bboxMax.x);
112+
float maxy = std::min(float(height_ - 1), bboxMax.y);
113+
114+
// 与外部提供的裁剪区域(半开区间)相交,转成闭区间扫描
115+
int sx = std::max(x0, int(std::floor(minx)));
116+
int sy = std::max(y0, int(std::floor(miny)));
117+
int ex = std::min(x1 - 1, int(std::floor(maxx)));
118+
int ey = std::min(y1 - 1, int(std::floor(maxy)));
119+
120+
if (sx > ex || sy > ey) {
121+
return; // 与裁剪区域无交
122+
}
123+
124+
// 透视矫正插值使用与 Rasterize 相同逻辑,但单线程写入 out
125+
float w0_inv = v0.GetPosition().w;
126+
float w1_inv = v1.GetPosition().w;
127+
float w2_inv = v2.GetPosition().w;
128+
129+
for (int x = sx; x <= ex; ++x) {
130+
for (int y = sy; y <= ey; ++y) {
131+
auto [is_inside, barycentric_coord] = GetBarycentricCoord(
132+
v0.GetPosition(), v1.GetPosition(), v2.GetPosition(),
133+
Vector3f(static_cast<float>(x), static_cast<float>(y), 0));
134+
if (!is_inside) continue;
135+
136+
// 插值 1/w 并进行透视矫正
137+
float w_inv_interpolated = Interpolate(w0_inv, w1_inv, w2_inv, barycentric_coord);
138+
Vector3f corrected_bary(
139+
barycentric_coord.x * w0_inv / w_inv_interpolated,
140+
barycentric_coord.y * w1_inv / w_inv_interpolated,
141+
barycentric_coord.z * w2_inv / w_inv_interpolated);
142+
143+
auto z = Interpolate(v0.GetPosition().z, v1.GetPosition().z,
144+
v2.GetPosition().z, corrected_bary);
145+
146+
Fragment fragment;
147+
fragment.screen_coord = {x, y};
148+
fragment.normal = Interpolate(v0.GetNormal(), v1.GetNormal(),
149+
v2.GetNormal(), corrected_bary);
150+
fragment.uv = Interpolate(v0.GetTexCoords(), v1.GetTexCoords(),
151+
v2.GetTexCoords(), corrected_bary);
152+
fragment.color = InterpolateColor(v0.GetColor(), v1.GetColor(),
153+
v2.GetColor(), corrected_bary);
154+
fragment.depth = z;
155+
156+
out.push_back(fragment);
157+
}
158+
}
159+
}
160+
93161
std::pair<bool, Vector3f> Rasterizer::GetBarycentricCoord(const Vector3f& p0,
94162
const Vector3f& p1,
95163
const Vector3f& p2,
@@ -157,4 +225,4 @@ Vector3f Rasterizer::CalculateNormal(const Vector3f& v0, const Vector3f& v1,
157225
glm::cross(edge1, edge2));
158226
}
159227

160-
} // namespace simple_renderer
228+
} // namespace simple_renderer

src/renderer.cpp

Lines changed: 146 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -405,6 +405,86 @@ void SimpleRenderer::TriangleTileBinning(
405405
SPDLOG_INFO("Screen dimensions: {}x{}, Tile size: {}, Tiles: {}x{}",
406406
width_, height_, tile_size, tiles_x, tiles_y);
407407

408+
// 第一遍:仅统计每个 tile 的三角形数量以便预分配,避免 push_back 扩容
409+
std::vector<size_t> tile_counts(tiles_x * tiles_y, 0);
410+
for (size_t tri_idx = 0; tri_idx < model.GetFaces().size(); tri_idx++) {
411+
const auto &f = model.GetFaces()[tri_idx];
412+
auto v0 = screenVertices[f.GetIndex(0)];
413+
auto v1 = screenVertices[f.GetIndex(1)];
414+
auto v2 = screenVertices[f.GetIndex(2)];
415+
416+
if (v0.HasClipPosition()) {
417+
Vector4f c0 = v0.GetClipPosition();
418+
Vector4f c1 = v1.GetClipPosition();
419+
Vector4f c2 = v2.GetClipPosition();
420+
// Trivial frustum rejection: drop the triangle only when all three vertices
// lie outside the SAME clip plane. In clip space a vertex is inside when
// -w <= x, y, z <= w, so each vertex must be compared against its own w.
// The negative-side tests previously compared c1 and c2 against -c0.w.
// NOTE(review): the binning pass that follows this counting pass should use
// the identical test, otherwise the counts used for reserve() will not match
// the number of triangles actually pushed.
bool frustum_cull =
    (c0.x > c0.w && c1.x > c1.w && c2.x > c2.w) ||
    (c0.x < -c0.w && c1.x < -c1.w && c2.x < -c2.w) ||
    (c0.y > c0.w && c1.y > c1.w && c2.y > c2.w) ||
    (c0.y < -c0.w && c1.y < -c1.w && c2.y < -c2.w) ||
    (c0.z > c0.w && c1.z > c1.w && c2.z > c2.w) ||
    (c0.z < -c0.w && c1.z < -c1.w && c2.z < -c2.w);
427+
if (frustum_cull) {
428+
continue;
429+
}
430+
}
431+
432+
Vector4f pos0 = v0.GetPosition();
433+
Vector4f pos1 = v1.GetPosition();
434+
Vector4f pos2 = v2.GetPosition();
435+
436+
Vector2f screen0(pos0.x, pos0.y);
437+
Vector2f screen1(pos1.x, pos1.y);
438+
Vector2f screen2(pos2.x, pos2.y);
439+
Vector2f edge1 = screen1 - screen0;
440+
Vector2f edge2 = screen2 - screen0;
441+
float cross_product = edge1.x * edge2.y - edge1.y * edge2.x;
442+
if (cross_product > 0.0f) {
443+
continue;
444+
}
445+
446+
bool has_clipped_vertex = (pos0.x == -1000.0f || pos1.x == -1000.0f || pos2.x == -1000.0f);
447+
if (has_clipped_vertex) {
448+
continue;
449+
}
450+
451+
float screen_x0 = pos0.x;
452+
float screen_y0 = pos0.y;
453+
float screen_x1 = pos1.x;
454+
float screen_y1 = pos1.y;
455+
float screen_x2 = pos2.x;
456+
float screen_y2 = pos2.y;
457+
458+
float min_x = std::min({screen_x0, screen_x1, screen_x2});
459+
float max_x = std::max({screen_x0, screen_x1, screen_x2});
460+
float min_y = std::min({screen_y0, screen_y1, screen_y2});
461+
float max_y = std::max({screen_y0, screen_y1, screen_y2});
462+
463+
int start_tile_x = std::max(0, static_cast<int>(min_x) / static_cast<int>(tile_size));
464+
int end_tile_x = std::min(static_cast<int>(tiles_x - 1),
465+
static_cast<int>(max_x) / static_cast<int>(tile_size));
466+
int start_tile_y = std::max(0, static_cast<int>(min_y) / static_cast<int>(tile_size));
467+
int end_tile_y = std::min(static_cast<int>(tiles_y - 1),
468+
static_cast<int>(max_y) / static_cast<int>(tile_size));
469+
470+
if (start_tile_x > end_tile_x || start_tile_y > end_tile_y) {
471+
continue;
472+
}
473+
474+
for (int ty = start_tile_y; ty <= end_tile_y; ++ty) {
475+
for (int tx = start_tile_x; tx <= end_tile_x; ++tx) {
476+
size_t tile_id = ty * tiles_x + tx;
477+
tile_counts[tile_id]++;
478+
}
479+
}
480+
}
481+
482+
// 依据统计结果进行容量预留
483+
for (size_t tile_id = 0; tile_id < tile_triangles.size(); ++tile_id) {
484+
if (tile_counts[tile_id] > 0) {
485+
tile_triangles[tile_id].reserve(tile_counts[tile_id]);
486+
}
487+
}
408488
for (size_t tri_idx = 0; tri_idx < model.GetFaces().size(); tri_idx++) {
409489
const auto &f = model.GetFaces()[tri_idx];
410490
auto v0 = screenVertices[f.GetIndex(0)];
@@ -522,7 +602,8 @@ void SimpleRenderer::RasterizeTile(
522602
float* tile_depth_buffer, uint32_t* tile_color_buffer,
523603
std::unique_ptr<float[]> &global_depth_buffer,
524604
std::unique_ptr<uint32_t[]> &global_color_buffer,
525-
bool use_early_z) {
605+
bool use_early_z,
606+
std::vector<Fragment>* scratch_fragments) {
526607
// 计算tile在屏幕空间的范围
527608
size_t tile_x = tile_id % tiles_x;
528609
size_t tile_y = tile_id / tiles_x;
@@ -539,38 +620,69 @@ void SimpleRenderer::RasterizeTile(
539620
std::fill_n(tile_color_buffer, tile_width * tile_height, 0);
540621

541622
// 在tile内光栅化所有三角形
623+
(void)tiles_y; // 避免未使用参数告警
542624
for (const auto &triangle : triangles) {
543-
auto fragments = rasterizer_->Rasterize(triangle.v0, triangle.v1, triangle.v2);
544-
545-
for (auto &fragment : fragments) {
546-
fragment.material = triangle.material;
547-
548-
size_t screen_x = fragment.screen_coord[0];
549-
size_t screen_y = fragment.screen_coord[1];
550-
551-
// 检查fragment是否在当前tile内
552-
if (screen_x >= screen_x_start && screen_x < screen_x_end &&
553-
screen_y >= screen_y_start && screen_y < screen_y_end) {
554-
555-
size_t tile_local_x = screen_x - screen_x_start;
556-
size_t tile_local_y = screen_y - screen_y_start;
557-
size_t tile_index = tile_local_x + tile_local_y * tile_width;
558-
559-
// tile内深度测试
560-
if (use_early_z) { // Early-Z模式:深度测试在Fragment Shader之前
561-
if (fragment.depth < tile_depth_buffer[tile_index]) {
625+
// 复用线程本地 scratch 容器,限制在 tile 边界内栅格化
626+
if (scratch_fragments) { // 提供scratch容器
627+
scratch_fragments->clear();
628+
if (scratch_fragments->capacity() < tile_width * tile_height) { // 二次确认,为日后可能的可变tile进行设计
629+
scratch_fragments->reserve(tile_width * tile_height);
630+
}
631+
rasterizer_->RasterizeTo(triangle.v0, triangle.v1, triangle.v2,
632+
static_cast<int>(screen_x_start), static_cast<int>(screen_y_start),
633+
static_cast<int>(screen_x_end), static_cast<int>(screen_y_end),
634+
*scratch_fragments);
635+
636+
for (auto &fragment : *scratch_fragments) {
637+
fragment.material = triangle.material;
638+
size_t screen_x = fragment.screen_coord[0];
639+
size_t screen_y = fragment.screen_coord[1];
640+
if (screen_x >= screen_x_start && screen_x < screen_x_end &&
641+
screen_y >= screen_y_start && screen_y < screen_y_end) {
642+
size_t tile_local_x = screen_x - screen_x_start;
643+
size_t tile_local_y = screen_y - screen_y_start;
644+
size_t tile_index = tile_local_x + tile_local_y * tile_width;
645+
if (use_early_z) {
646+
if (fragment.depth < tile_depth_buffer[tile_index]) {
647+
auto color = shader_->FragmentShader(fragment);
648+
tile_depth_buffer[tile_index] = fragment.depth;
649+
tile_color_buffer[tile_index] = uint32_t(color);
650+
}
651+
} else {
562652
auto color = shader_->FragmentShader(fragment);
563-
tile_depth_buffer[tile_index] = fragment.depth;
564-
tile_color_buffer[tile_index] = uint32_t(color);
565-
}
566-
} else { // Late-Z模式:Fragment Shader在深度测试之前
567-
auto color = shader_->FragmentShader(fragment);
568-
if (fragment.depth < tile_depth_buffer[tile_index]) {
569-
tile_depth_buffer[tile_index] = fragment.depth;
570-
tile_color_buffer[tile_index] = uint32_t(color);
653+
if (fragment.depth < tile_depth_buffer[tile_index]) {
654+
tile_depth_buffer[tile_index] = fragment.depth;
655+
tile_color_buffer[tile_index] = uint32_t(color);
656+
}
571657
}
572658
}
659+
}
660+
} else { // 不提供scratch容器的版本
661+
auto fragments = rasterizer_->Rasterize(triangle.v0, triangle.v1, triangle.v2);
662+
for (auto &fragment : fragments) {
663+
fragment.material = triangle.material;
664+
size_t screen_x = fragment.screen_coord[0];
665+
size_t screen_y = fragment.screen_coord[1];
666+
if (screen_x >= screen_x_start && screen_x < screen_x_end &&
667+
screen_y >= screen_y_start && screen_y < screen_y_end) {
668+
size_t tile_local_x = screen_x - screen_x_start;
669+
size_t tile_local_y = screen_y - screen_y_start;
670+
size_t tile_index = tile_local_x + tile_local_y * tile_width;
671+
if (use_early_z) {
672+
if (fragment.depth < tile_depth_buffer[tile_index]) {
673+
auto color = shader_->FragmentShader(fragment);
674+
tile_depth_buffer[tile_index] = fragment.depth;
675+
tile_color_buffer[tile_index] = uint32_t(color);
676+
}
677+
} else {
678+
auto color = shader_->FragmentShader(fragment);
679+
if (fragment.depth < tile_depth_buffer[tile_index]) {
680+
tile_depth_buffer[tile_index] = fragment.depth;
681+
tile_color_buffer[tile_index] = uint32_t(color);
682+
}
573683
}
684+
}
685+
}
574686
}
575687
}
576688

@@ -785,14 +897,18 @@ SimpleRenderer::TileRenderStats SimpleRenderer::ExecuteTileBasedPipeline(
785897
std::unique_ptr<uint32_t[]> tile_color_buffer =
786898
std::make_unique<uint32_t[]>(TILE_SIZE * TILE_SIZE);
787899

900+
// 线程本地片段 scratch 容器(复用),容量按单 tile 上限预估
901+
std::vector<Fragment> scratch_fragments;
902+
scratch_fragments.reserve(TILE_SIZE * TILE_SIZE);
903+
788904
#pragma omp for
789905
for (size_t tile_id = 0; tile_id < total_tiles; tile_id++) {
790-
// 按照tile进行光栅化
906+
// 按照tile进行光栅化,每个Tile进行区域限制+scratch复用,区域限制避免了可能的数据竞争
791907
RasterizeTile(tile_id, tile_triangles[tile_id],
792908
tiles_x, tiles_y, TILE_SIZE,
793909
tile_depth_buffer.get(), tile_color_buffer.get(),
794910
depthBuffer_per_thread, colorBuffer_per_thread,
795-
early_z_enabled_);
911+
early_z_enabled_, &scratch_fragments);
796912
}
797913
}
798914
auto rasterization_end_time = std::chrono::high_resolution_clock::now();

0 commit comments

Comments
 (0)