opencl: Use precomputed sin/cos tables

Thiemo Wiedemeyer · xlz · commit b627545362be · 2016-02-20T20:18:31.000-05:00
Instead of computing the sine and cosine for the p0 table and the
phases on the GPU, they are now precomputed once on the CPU.

Details: Replaced sin(a+b) by sin(a)*cos(b)+cos(a)*sin(b), where
sin(a),cos(b),cos(a),sin(b) are stored in a LUT.  Simplified
processPixelStage1 code and removed processMeasurementTriple.

Moved one if from decodePixelMeasurement to processPixelStage1.

Removed the first part of `valid && any(...)` because valid has been
checked before.
diff --git a/src/opencl_depth_packet_processor.cl b/src/opencl_depth_packet_processor.cl
@@ -24,13 +24,17 @@
  * either License.
  */
 
+#define PHASE_SIN (float3)(PHASE_IN_RAD0_SIN, PHASE_IN_RAD1_SIN, PHASE_IN_RAD2_SIN)
+#define PHASE_COS (float3)(PHASE_IN_RAD0_COS, PHASE_IN_RAD1_COS, PHASE_IN_RAD2_COS)
+#define AB_MULTIPLIER_PER_FRQ (float3)(AB_MULTIPLIER_PER_FRQ0, AB_MULTIPLIER_PER_FRQ1, AB_MULTIPLIER_PER_FRQ2)
+
 /*******************************************************************************
  * Process pixel stage 1
  ******************************************************************************/
 
 float decodePixelMeasurement(global const ushort *data, global const short *lut11to16, const uint sub, const uint x, const uint y)
 {
-  uint row_idx = (424 * sub + (y < 212 ? y + 212 : 423 - y)) * 352;
+  uint row_idx = (424 * sub + y) * 352;
   uint idx = (((x >> 2) + ((x << 7) & BFI_BITMASK)) * 11) & (uint)0xffffffff;
 
   uint col_idx = idx >> 4;
@@ -43,60 +47,50 @@ float decodePixelMeasurement(global const ushort *data, global const short *lut1
   return (float)lut11to16[(x < 1 || 510 < x || col_idx > 352) ? 0 : ((data[data_idx0] >> upper_bytes) | (data[data_idx1] << lower_bytes)) & 2047];
 }
 
-float2 processMeasurementTriple(const float ab_multiplier_per_frq, const float p0, const float3 v, int *invalid)
-{
-  float3 p0vec = (float3)(p0 + PHASE_IN_RAD0, p0 + PHASE_IN_RAD1, p0 + PHASE_IN_RAD2);
-  float3 p0cos = cos(p0vec);
-  float3 p0sin = sin(-p0vec);
-
-  *invalid = *invalid && any(isequal(v, (float3)(32767.0f)));
-
-  return (float2)(dot(v, p0cos), dot(v, p0sin)) * ab_multiplier_per_frq;
-}
-
-void kernel processPixelStage1(global const short *lut11to16, global const float *z_table, global const float3 *p0_table, global const ushort *data,
-                               global float3 *a_out, global float3 *b_out, global float3 *n_out, global float *ir_out)
+void kernel processPixelStage1(global const short *lut11to16, global const float *z_table, global const float3 *p0_sin_table, global const float3 *p0_cos_table,
+                               global const ushort *data, global float3 *a_out, global float3 *b_out, global float3 *n_out, global float *ir_out)
 {
   const uint i = get_global_id(0);
 
   const uint x = i % 512;
   const uint y = i / 512;
 
-  const uint y_in = (423 - y);
+  const uint y_tmp = (423 - y);
+  const uint y_in = (y_tmp < 212 ? y_tmp + 212 : 423 - y_tmp);
+
+  const int3 invalid = (int)(0.0f >= z_table[i]);
+  const float3 p0_sin = p0_sin_table[i];
+  const float3 p0_cos = p0_cos_table[i];
 
-  const float zmultiplier = z_table[i];
-  int valid = (int)(0.0f < zmultiplier);
-  int saturatedX = valid;
-  int saturatedY = valid;
-  int saturatedZ = valid;
-  int3 invalid_pixel = (int3)((int)(!valid));
-  const float3 p0 = p0_table[i];
+  int3 invalid_pixel = (int3)(invalid);
 
   const float3 v0 = (float3)(decodePixelMeasurement(data, lut11to16, 0, x, y_in),
                              decodePixelMeasurement(data, lut11to16, 1, x, y_in),
                              decodePixelMeasurement(data, lut11to16, 2, x, y_in));
-  const float2 ab0 = processMeasurementTriple(AB_MULTIPLIER_PER_FRQ0, p0.x, v0, &saturatedX);
-
   const float3 v1 = (float3)(decodePixelMeasurement(data, lut11to16, 3, x, y_in),
                              decodePixelMeasurement(data, lut11to16, 4, x, y_in),
                              decodePixelMeasurement(data, lut11to16, 5, x, y_in));
-  const float2 ab1 = processMeasurementTriple(AB_MULTIPLIER_PER_FRQ1, p0.y, v1, &saturatedY);
-
   const float3 v2 = (float3)(decodePixelMeasurement(data, lut11to16, 6, x, y_in),
                              decodePixelMeasurement(data, lut11to16, 7, x, y_in),
                              decodePixelMeasurement(data, lut11to16, 8, x, y_in));
-  const float2 ab2 = processMeasurementTriple(AB_MULTIPLIER_PER_FRQ2, p0.z, v2, &saturatedZ);
 
-  float3 a = select((float3)(ab0.x, ab1.x, ab2.x), (float3)(0.0f), invalid_pixel);
-  float3 b = select((float3)(ab0.y, ab1.y, ab2.y), (float3)(0.0f), invalid_pixel);
+  float3 a = (float3)(dot(v0, PHASE_COS * p0_cos.x - PHASE_SIN * p0_sin.x),
+                      dot(v1, PHASE_COS * p0_cos.y - PHASE_SIN * p0_sin.y),
+                      dot(v2, PHASE_COS * p0_cos.z - PHASE_SIN * p0_sin.z)) * AB_MULTIPLIER_PER_FRQ;
+  float3 b = (float3)(dot(v0, PHASE_COS * p0_sin.x + PHASE_SIN * p0_cos.x),
+                      dot(v1, PHASE_COS * p0_sin.y + PHASE_SIN * p0_cos.y),
+                      dot(v2, PHASE_COS * p0_sin.z + PHASE_SIN * p0_cos.z)) * AB_MULTIPLIER_PER_FRQ;
+
+  a = select(a, (float3)(0.0f), invalid_pixel);
+  b = select(b, (float3)(0.0f), invalid_pixel);
   float3 n = sqrt(a * a + b * b);
 
-  int3 saturated = (int3)(saturatedX, saturatedY, saturatedZ);
-  a = select(a, (float3)(0.0f), saturated);
-  b = select(b, (float3)(0.0f), saturated);
+  int3 saturated = (int3)(any(isequal(v0, (float3)(32767.0f))),
+                          any(isequal(v1, (float3)(32767.0f))),
+                          any(isequal(v2, (float3)(32767.0f))));
 
-  a_out[i] = a;
-  b_out[i] = b;
+  a_out[i] = select(a, (float3)(0.0f), saturated);
+  b_out[i] = select(b, (float3)(0.0f), saturated);
   n_out[i] = n;
   ir_out[i] = min(dot(select(n, (float3)(65535.0f), saturated), (float3)(0.333333333f  * AB_MULTIPLIER * AB_OUTPUT_MULTIPLIER)), 65535.0f);
 }
diff --git a/src/opencl_depth_packet_processor.cpp b/src/opencl_depth_packet_processor.cpp
@@ -78,7 +78,8 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging
   cl_short lut11to16[2048];
   cl_float x_table[512 * 424];
   cl_float z_table[512 * 424];
-  cl_float3 p0_table[512 * 424];
+  cl_float3 p0_sin_table[512 * 424];
+  cl_float3 p0_cos_table[512 * 424];
   libfreenect2::DepthPacketProcessor::Config config;
   DepthPacketProcessor::Parameters params;
 
@@ -105,7 +106,8 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging
   size_t buf_packet_size;
 
   cl::Buffer buf_lut11to16;
-  cl::Buffer buf_p0_table;
+  cl::Buffer buf_p0_sin_table;
+  cl::Buffer buf_p0_cos_table;
   cl::Buffer buf_x_table;
   cl::Buffer buf_z_table;
   cl::Buffer buf_packet;
@@ -200,9 +202,12 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging
     oss << " -D AB_MULTIPLIER_PER_FRQ2=" << params.ab_multiplier_per_frq[2] << "f";
     oss << " -D AB_OUTPUT_MULTIPLIER=" << params.ab_output_multiplier << "f";
 
-    oss << " -D PHASE_IN_RAD0=" << params.phase_in_rad[0] << "f";
-    oss << " -D PHASE_IN_RAD1=" << params.phase_in_rad[1] << "f";
-    oss << " -D PHASE_IN_RAD2=" << params.phase_in_rad[2] << "f";
+    oss << " -D PHASE_IN_RAD0_SIN=" << std::sin(-params.phase_in_rad[0]) << "f";
+    oss << " -D PHASE_IN_RAD0_COS=" << std::cos(params.phase_in_rad[0]) << "f";
+    oss << " -D PHASE_IN_RAD1_SIN=" << std::sin(-params.phase_in_rad[1]) << "f";
+    oss << " -D PHASE_IN_RAD1_COS=" << std::cos(params.phase_in_rad[1]) << "f";
+    oss << " -D PHASE_IN_RAD2_SIN=" << std::sin(-params.phase_in_rad[2]) << "f";
+    oss << " -D PHASE_IN_RAD2_COS=" << std::cos(params.phase_in_rad[2]) << "f";
 
     oss << " -D JOINT_BILATERAL_AB_THRESHOLD=" << params.joint_bilateral_ab_threshold << "f";
     oss << " -D JOINT_BILATERAL_MAX_EDGE=" << params.joint_bilateral_max_edge << "f";
@@ -382,7 +387,9 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging
 
       buf_lut11to16 = cl::Buffer(context, CL_READ_ONLY_CACHE, buf_lut11to16_size, NULL, &err);
       CHECK_CL_ERROR(err, "cl::Buffer");
-      buf_p0_table = cl::Buffer(context, CL_READ_ONLY_CACHE, buf_p0_table_size, NULL, &err);
+      buf_p0_sin_table = cl::Buffer(context, CL_READ_ONLY_CACHE, buf_p0_table_size, NULL, &err);
+      CHECK_CL_ERROR(err, "cl::Buffer");
+      buf_p0_cos_table = cl::Buffer(context, CL_READ_ONLY_CACHE, buf_p0_table_size, NULL, &err);
       CHECK_CL_ERROR(err, "cl::Buffer");
       buf_x_table = cl::Buffer(context, CL_READ_ONLY_CACHE, buf_x_table_size, NULL, &err);
       CHECK_CL_ERROR(err, "cl::Buffer");
@@ -430,17 +437,19 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging
       CHECK_CL_ERROR(err, "setArg");
       err = kernel_processPixelStage1.setArg(1, buf_z_table);
       CHECK_CL_ERROR(err, "setArg");
-      err = kernel_processPixelStage1.setArg(2, buf_p0_table);
+      err = kernel_processPixelStage1.setArg(2, buf_p0_sin_table);
+      CHECK_CL_ERROR(err, "setArg");
+      err = kernel_processPixelStage1.setArg(3, buf_p0_cos_table);
       CHECK_CL_ERROR(err, "setArg");
-      err = kernel_processPixelStage1.setArg(3, buf_packet);
+      err = kernel_processPixelStage1.setArg(4, buf_packet);
       CHECK_CL_ERROR(err, "setArg");
-      err = kernel_processPixelStage1.setArg(4, buf_a);
+      err = kernel_processPixelStage1.setArg(5, buf_a);
       CHECK_CL_ERROR(err, "setArg");
-      err = kernel_processPixelStage1.setArg(5, buf_b);
+      err = kernel_processPixelStage1.setArg(6, buf_b);
       CHECK_CL_ERROR(err, "setArg");
-      err = kernel_processPixelStage1.setArg(6, buf_n);
+      err = kernel_processPixelStage1.setArg(7, buf_n);
       CHECK_CL_ERROR(err, "setArg");
-      err = kernel_processPixelStage1.setArg(7, buf_ir);
+      err = kernel_processPixelStage1.setArg(8, buf_ir);
       CHECK_CL_ERROR(err, "setArg");
 
       kernel_filterPixelStage1 = cl::Kernel(program, "filterPixelStage1", &err);
@@ -484,14 +493,16 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging
       err = kernel_filterPixelStage2.setArg(3, buf_filtered);
       CHECK_CL_ERROR(err, "setArg");
 
-      cl::Event event0, event1, event2, event3;
+      cl::Event event0, event1, event2, event3, event4;
       err = queue.enqueueWriteBuffer(buf_lut11to16, CL_FALSE, 0, buf_lut11to16_size, lut11to16, NULL, &event0);
       CHECK_CL_ERROR(err, "enqueueWriteBuffer");
-      err = queue.enqueueWriteBuffer(buf_p0_table, CL_FALSE, 0, buf_p0_table_size, p0_table, NULL, &event1);
+      err = queue.enqueueWriteBuffer(buf_p0_sin_table, CL_FALSE, 0, buf_p0_table_size, p0_sin_table, NULL, &event1);
       CHECK_CL_ERROR(err, "enqueueWriteBuffer");
-      err = queue.enqueueWriteBuffer(buf_x_table, CL_FALSE, 0, buf_x_table_size, x_table, NULL, &event2);
+      err = queue.enqueueWriteBuffer(buf_p0_cos_table, CL_FALSE, 0, buf_p0_table_size, p0_cos_table, NULL, &event2);
       CHECK_CL_ERROR(err, "enqueueWriteBuffer");
-      err = queue.enqueueWriteBuffer(buf_z_table, CL_FALSE, 0, buf_z_table_size, z_table, NULL, &event3);
+      err = queue.enqueueWriteBuffer(buf_x_table, CL_FALSE, 0, buf_x_table_size, x_table, NULL, &event3);
+      CHECK_CL_ERROR(err, "enqueueWriteBuffer");
+      err = queue.enqueueWriteBuffer(buf_z_table, CL_FALSE, 0, buf_z_table_size, z_table, NULL, &event4);
       CHECK_CL_ERROR(err, "enqueueWriteBuffer");
 
       err = event0.wait();
@@ -502,6 +513,8 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging
       CHECK_CL_ERROR(err, "wait");
       err = event3.wait();
       CHECK_CL_ERROR(err, "wait");
+      err = event4.wait();
+      CHECK_CL_ERROR(err, "wait");
     }
 
     programInitialized = true;
@@ -606,16 +619,24 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging
   {
     for(int r = 0; r < 424; ++r)
     {
-      cl_float3 *it = &p0_table[r * 512];
+      cl_float3 *itS = &p0_sin_table[r * 512];
+      cl_float3 *itC = &p0_cos_table[r * 512];
       const uint16_t *it0 = &p0table->p0table0[r * 512];
       const uint16_t *it1 = &p0table->p0table1[r * 512];
       const uint16_t *it2 = &p0table->p0table2[r * 512];
-      for(int c = 0; c < 512; ++c, ++it, ++it0, ++it1, ++it2)
+      for(int c = 0; c < 512; ++c, ++itS, ++itC, ++it0, ++it1, ++it2)
       {
-        it->s[0] = -((float) * it0) * 0.000031 * M_PI;
-        it->s[1] = -((float) * it1) * 0.000031 * M_PI;
-        it->s[2] = -((float) * it2) * 0.000031 * M_PI;
-        it->s[3] = 0.0f;
+        const float x = ((float)*it0) * 0.000031 * M_PI;
+        const float y = ((float)*it1) * 0.000031 * M_PI;
+        const float z = ((float)*it2) * 0.000031 * M_PI;
+        itS->s[0] = std::sin(x);
+        itS->s[1] = std::sin(y);
+        itS->s[2] = std::sin(z);
+        itS->s[3] = 0.0f;
+        itC->s[0] = std::cos(-x);
+        itC->s[1] = std::cos(-y);
+        itC->s[2] = std::cos(-z);
+        itC->s[3] = 0.0f;
       }
     }
   }