Skip to content

Commit b627545

Browse files
Thiemo Wiedemeyerxlz
authored andcommitted
opencl: Use precomputed sin/cos tables
Instead of computing the sine and cosine for the p0 table and the phases on the GPU, they are now precomputed once on the CPU. Details: Replaced sin(a+b) by sin(a)*cos(b)+cos(a)*sin(b), where sin(a),cos(b),cos(a),sin(b) are stored in a LUT. Simplyfied processPixelStage1 code and removed processMeasurementTriple. Moved one if from decodePixelMeasurement to processPixelStage1. Removed the first part of `valid && any(...)` because valid has been checked before.
1 parent 6f7a600 commit b627545

2 files changed

Lines changed: 71 additions & 56 deletions

File tree

src/opencl_depth_packet_processor.cl

Lines changed: 28 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,17 @@
2424
* either License.
2525
*/
2626

27+
#define PHASE_SIN (float3)(PHASE_IN_RAD0_SIN, PHASE_IN_RAD1_SIN, PHASE_IN_RAD2_SIN)
28+
#define PHASE_COS (float3)(PHASE_IN_RAD0_COS, PHASE_IN_RAD1_COS, PHASE_IN_RAD2_COS)
29+
#define AB_MULTIPLIER_PER_FRQ (float3)(AB_MULTIPLIER_PER_FRQ0, AB_MULTIPLIER_PER_FRQ1, AB_MULTIPLIER_PER_FRQ2)
30+
2731
/*******************************************************************************
2832
* Process pixel stage 1
2933
******************************************************************************/
3034

3135
float decodePixelMeasurement(global const ushort *data, global const short *lut11to16, const uint sub, const uint x, const uint y)
3236
{
33-
uint row_idx = (424 * sub + (y < 212 ? y + 212 : 423 - y)) * 352;
37+
uint row_idx = (424 * sub + y) * 352;
3438
uint idx = (((x >> 2) + ((x << 7) & BFI_BITMASK)) * 11) & (uint)0xffffffff;
3539

3640
uint col_idx = idx >> 4;
@@ -43,60 +47,50 @@ float decodePixelMeasurement(global const ushort *data, global const short *lut1
4347
return (float)lut11to16[(x < 1 || 510 < x || col_idx > 352) ? 0 : ((data[data_idx0] >> upper_bytes) | (data[data_idx1] << lower_bytes)) & 2047];
4448
}
4549

46-
float2 processMeasurementTriple(const float ab_multiplier_per_frq, const float p0, const float3 v, int *invalid)
47-
{
48-
float3 p0vec = (float3)(p0 + PHASE_IN_RAD0, p0 + PHASE_IN_RAD1, p0 + PHASE_IN_RAD2);
49-
float3 p0cos = cos(p0vec);
50-
float3 p0sin = sin(-p0vec);
51-
52-
*invalid = *invalid && any(isequal(v, (float3)(32767.0f)));
53-
54-
return (float2)(dot(v, p0cos), dot(v, p0sin)) * ab_multiplier_per_frq;
55-
}
56-
57-
void kernel processPixelStage1(global const short *lut11to16, global const float *z_table, global const float3 *p0_table, global const ushort *data,
58-
global float3 *a_out, global float3 *b_out, global float3 *n_out, global float *ir_out)
50+
void kernel processPixelStage1(global const short *lut11to16, global const float *z_table, global const float3 *p0_sin_table, global const float3 *p0_cos_table,
51+
global const ushort *data, global float3 *a_out, global float3 *b_out, global float3 *n_out, global float *ir_out)
5952
{
6053
const uint i = get_global_id(0);
6154

6255
const uint x = i % 512;
6356
const uint y = i / 512;
6457

65-
const uint y_in = (423 - y);
58+
const uint y_tmp = (423 - y);
59+
const uint y_in = (y_tmp < 212 ? y_tmp + 212 : 423 - y_tmp);
60+
61+
const int3 invalid = (int)(0.0f >= z_table[i]);
62+
const float3 p0_sin = p0_sin_table[i];
63+
const float3 p0_cos = p0_cos_table[i];
6664

67-
const float zmultiplier = z_table[i];
68-
int valid = (int)(0.0f < zmultiplier);
69-
int saturatedX = valid;
70-
int saturatedY = valid;
71-
int saturatedZ = valid;
72-
int3 invalid_pixel = (int3)((int)(!valid));
73-
const float3 p0 = p0_table[i];
65+
int3 invalid_pixel = (int3)(invalid);
7466

7567
const float3 v0 = (float3)(decodePixelMeasurement(data, lut11to16, 0, x, y_in),
7668
decodePixelMeasurement(data, lut11to16, 1, x, y_in),
7769
decodePixelMeasurement(data, lut11to16, 2, x, y_in));
78-
const float2 ab0 = processMeasurementTriple(AB_MULTIPLIER_PER_FRQ0, p0.x, v0, &saturatedX);
79-
8070
const float3 v1 = (float3)(decodePixelMeasurement(data, lut11to16, 3, x, y_in),
8171
decodePixelMeasurement(data, lut11to16, 4, x, y_in),
8272
decodePixelMeasurement(data, lut11to16, 5, x, y_in));
83-
const float2 ab1 = processMeasurementTriple(AB_MULTIPLIER_PER_FRQ1, p0.y, v1, &saturatedY);
84-
8573
const float3 v2 = (float3)(decodePixelMeasurement(data, lut11to16, 6, x, y_in),
8674
decodePixelMeasurement(data, lut11to16, 7, x, y_in),
8775
decodePixelMeasurement(data, lut11to16, 8, x, y_in));
88-
const float2 ab2 = processMeasurementTriple(AB_MULTIPLIER_PER_FRQ2, p0.z, v2, &saturatedZ);
8976

90-
float3 a = select((float3)(ab0.x, ab1.x, ab2.x), (float3)(0.0f), invalid_pixel);
91-
float3 b = select((float3)(ab0.y, ab1.y, ab2.y), (float3)(0.0f), invalid_pixel);
77+
float3 a = (float3)(dot(v0, PHASE_COS * p0_cos.x - PHASE_SIN * p0_sin.x),
78+
dot(v1, PHASE_COS * p0_cos.y - PHASE_SIN * p0_sin.y),
79+
dot(v2, PHASE_COS * p0_cos.z - PHASE_SIN * p0_sin.z)) * AB_MULTIPLIER_PER_FRQ;
80+
float3 b = (float3)(dot(v0, PHASE_COS * p0_sin.x + PHASE_SIN * p0_cos.x),
81+
dot(v1, PHASE_COS * p0_sin.y + PHASE_SIN * p0_cos.y),
82+
dot(v2, PHASE_COS * p0_sin.z + PHASE_SIN * p0_cos.z)) * AB_MULTIPLIER_PER_FRQ;
83+
84+
a = select(a, (float3)(0.0f), invalid_pixel);
85+
b = select(b, (float3)(0.0f), invalid_pixel);
9286
float3 n = sqrt(a * a + b * b);
9387

94-
int3 saturated = (int3)(saturatedX, saturatedY, saturatedZ);
95-
a = select(a, (float3)(0.0f), saturated);
96-
b = select(b, (float3)(0.0f), saturated);
88+
int3 saturated = (int3)(any(isequal(v0, (float3)(32767.0f))),
89+
any(isequal(v1, (float3)(32767.0f))),
90+
any(isequal(v2, (float3)(32767.0f))));
9791

98-
a_out[i] = a;
99-
b_out[i] = b;
92+
a_out[i] = select(a, (float3)(0.0f), saturated);
93+
b_out[i] = select(b, (float3)(0.0f), saturated);
10094
n_out[i] = n;
10195
ir_out[i] = min(dot(select(n, (float3)(65535.0f), saturated), (float3)(0.333333333f * AB_MULTIPLIER * AB_OUTPUT_MULTIPLIER)), 65535.0f);
10296
}

src/opencl_depth_packet_processor.cpp

Lines changed: 43 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,8 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging
7878
cl_short lut11to16[2048];
7979
cl_float x_table[512 * 424];
8080
cl_float z_table[512 * 424];
81-
cl_float3 p0_table[512 * 424];
81+
cl_float3 p0_sin_table[512 * 424];
82+
cl_float3 p0_cos_table[512 * 424];
8283
libfreenect2::DepthPacketProcessor::Config config;
8384
DepthPacketProcessor::Parameters params;
8485

@@ -105,7 +106,8 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging
105106
size_t buf_packet_size;
106107

107108
cl::Buffer buf_lut11to16;
108-
cl::Buffer buf_p0_table;
109+
cl::Buffer buf_p0_sin_table;
110+
cl::Buffer buf_p0_cos_table;
109111
cl::Buffer buf_x_table;
110112
cl::Buffer buf_z_table;
111113
cl::Buffer buf_packet;
@@ -200,9 +202,12 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging
200202
oss << " -D AB_MULTIPLIER_PER_FRQ2=" << params.ab_multiplier_per_frq[2] << "f";
201203
oss << " -D AB_OUTPUT_MULTIPLIER=" << params.ab_output_multiplier << "f";
202204

203-
oss << " -D PHASE_IN_RAD0=" << params.phase_in_rad[0] << "f";
204-
oss << " -D PHASE_IN_RAD1=" << params.phase_in_rad[1] << "f";
205-
oss << " -D PHASE_IN_RAD2=" << params.phase_in_rad[2] << "f";
205+
oss << " -D PHASE_IN_RAD0_SIN=" << std::sin(-params.phase_in_rad[0]) << "f";
206+
oss << " -D PHASE_IN_RAD0_COS=" << std::cos(params.phase_in_rad[0]) << "f";
207+
oss << " -D PHASE_IN_RAD1_SIN=" << std::sin(-params.phase_in_rad[1]) << "f";
208+
oss << " -D PHASE_IN_RAD1_COS=" << std::cos(params.phase_in_rad[1]) << "f";
209+
oss << " -D PHASE_IN_RAD2_SIN=" << std::sin(-params.phase_in_rad[2]) << "f";
210+
oss << " -D PHASE_IN_RAD2_COS=" << std::cos(params.phase_in_rad[2]) << "f";
206211

207212
oss << " -D JOINT_BILATERAL_AB_THRESHOLD=" << params.joint_bilateral_ab_threshold << "f";
208213
oss << " -D JOINT_BILATERAL_MAX_EDGE=" << params.joint_bilateral_max_edge << "f";
@@ -382,7 +387,9 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging
382387

383388
buf_lut11to16 = cl::Buffer(context, CL_READ_ONLY_CACHE, buf_lut11to16_size, NULL, &err);
384389
CHECK_CL_ERROR(err, "cl::Buffer");
385-
buf_p0_table = cl::Buffer(context, CL_READ_ONLY_CACHE, buf_p0_table_size, NULL, &err);
390+
buf_p0_sin_table = cl::Buffer(context, CL_READ_ONLY_CACHE, buf_p0_table_size, NULL, &err);
391+
CHECK_CL_ERROR(err, "cl::Buffer");
392+
buf_p0_cos_table = cl::Buffer(context, CL_READ_ONLY_CACHE, buf_p0_table_size, NULL, &err);
386393
CHECK_CL_ERROR(err, "cl::Buffer");
387394
buf_x_table = cl::Buffer(context, CL_READ_ONLY_CACHE, buf_x_table_size, NULL, &err);
388395
CHECK_CL_ERROR(err, "cl::Buffer");
@@ -430,17 +437,19 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging
430437
CHECK_CL_ERROR(err, "setArg");
431438
err = kernel_processPixelStage1.setArg(1, buf_z_table);
432439
CHECK_CL_ERROR(err, "setArg");
433-
err = kernel_processPixelStage1.setArg(2, buf_p0_table);
440+
err = kernel_processPixelStage1.setArg(2, buf_p0_sin_table);
441+
CHECK_CL_ERROR(err, "setArg");
442+
err = kernel_processPixelStage1.setArg(3, buf_p0_cos_table);
434443
CHECK_CL_ERROR(err, "setArg");
435-
err = kernel_processPixelStage1.setArg(3, buf_packet);
444+
err = kernel_processPixelStage1.setArg(4, buf_packet);
436445
CHECK_CL_ERROR(err, "setArg");
437-
err = kernel_processPixelStage1.setArg(4, buf_a);
446+
err = kernel_processPixelStage1.setArg(5, buf_a);
438447
CHECK_CL_ERROR(err, "setArg");
439-
err = kernel_processPixelStage1.setArg(5, buf_b);
448+
err = kernel_processPixelStage1.setArg(6, buf_b);
440449
CHECK_CL_ERROR(err, "setArg");
441-
err = kernel_processPixelStage1.setArg(6, buf_n);
450+
err = kernel_processPixelStage1.setArg(7, buf_n);
442451
CHECK_CL_ERROR(err, "setArg");
443-
err = kernel_processPixelStage1.setArg(7, buf_ir);
452+
err = kernel_processPixelStage1.setArg(8, buf_ir);
444453
CHECK_CL_ERROR(err, "setArg");
445454

446455
kernel_filterPixelStage1 = cl::Kernel(program, "filterPixelStage1", &err);
@@ -484,14 +493,16 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging
484493
err = kernel_filterPixelStage2.setArg(3, buf_filtered);
485494
CHECK_CL_ERROR(err, "setArg");
486495

487-
cl::Event event0, event1, event2, event3;
496+
cl::Event event0, event1, event2, event3, event4;
488497
err = queue.enqueueWriteBuffer(buf_lut11to16, CL_FALSE, 0, buf_lut11to16_size, lut11to16, NULL, &event0);
489498
CHECK_CL_ERROR(err, "enqueueWriteBuffer");
490-
err = queue.enqueueWriteBuffer(buf_p0_table, CL_FALSE, 0, buf_p0_table_size, p0_table, NULL, &event1);
499+
err = queue.enqueueWriteBuffer(buf_p0_sin_table, CL_FALSE, 0, buf_p0_table_size, p0_sin_table, NULL, &event1);
491500
CHECK_CL_ERROR(err, "enqueueWriteBuffer");
492-
err = queue.enqueueWriteBuffer(buf_x_table, CL_FALSE, 0, buf_x_table_size, x_table, NULL, &event2);
501+
err = queue.enqueueWriteBuffer(buf_p0_cos_table, CL_FALSE, 0, buf_p0_table_size, p0_cos_table, NULL, &event2);
493502
CHECK_CL_ERROR(err, "enqueueWriteBuffer");
494-
err = queue.enqueueWriteBuffer(buf_z_table, CL_FALSE, 0, buf_z_table_size, z_table, NULL, &event3);
503+
err = queue.enqueueWriteBuffer(buf_x_table, CL_FALSE, 0, buf_x_table_size, x_table, NULL, &event3);
504+
CHECK_CL_ERROR(err, "enqueueWriteBuffer");
505+
err = queue.enqueueWriteBuffer(buf_z_table, CL_FALSE, 0, buf_z_table_size, z_table, NULL, &event4);
495506
CHECK_CL_ERROR(err, "enqueueWriteBuffer");
496507

497508
err = event0.wait();
@@ -502,6 +513,8 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging
502513
CHECK_CL_ERROR(err, "wait");
503514
err = event3.wait();
504515
CHECK_CL_ERROR(err, "wait");
516+
err = event4.wait();
517+
CHECK_CL_ERROR(err, "wait");
505518
}
506519

507520
programInitialized = true;
@@ -606,16 +619,24 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging
606619
{
607620
for(int r = 0; r < 424; ++r)
608621
{
609-
cl_float3 *it = &p0_table[r * 512];
622+
cl_float3 *itS = &p0_sin_table[r * 512];
623+
cl_float3 *itC = &p0_cos_table[r * 512];
610624
const uint16_t *it0 = &p0table->p0table0[r * 512];
611625
const uint16_t *it1 = &p0table->p0table1[r * 512];
612626
const uint16_t *it2 = &p0table->p0table2[r * 512];
613-
for(int c = 0; c < 512; ++c, ++it, ++it0, ++it1, ++it2)
627+
for(int c = 0; c < 512; ++c, ++itS, ++itC, ++it0, ++it1, ++it2)
614628
{
615-
it->s[0] = -((float) * it0) * 0.000031 * M_PI;
616-
it->s[1] = -((float) * it1) * 0.000031 * M_PI;
617-
it->s[2] = -((float) * it2) * 0.000031 * M_PI;
618-
it->s[3] = 0.0f;
629+
const float x = ((float)*it0) * 0.000031 * M_PI;
630+
const float y = ((float)*it1) * 0.000031 * M_PI;
631+
const float z = ((float)*it2) * 0.000031 * M_PI;
632+
itS->s[0] = std::sin(x);
633+
itS->s[1] = std::sin(y);
634+
itS->s[2] = std::sin(z);
635+
itS->s[3] = 0.0f;
636+
itC->s[0] = std::cos(-x);
637+
itC->s[1] = std::cos(-y);
638+
itC->s[2] = std::cos(-z);
639+
itC->s[3] = 0.0f;
619640
}
620641
}
621642
}

0 commit comments

Comments
 (0)