Refactor convolutions (#23807)

* one conv with defines * add conv * building works on C3 * this is num_outputs too, process replay is so useful Co-authored-by: Comma Device <device@comma.ai>
4 years ago · 2c7542d34e
parent 719801845b
commit 2c7542d34e
7 changed files with 292 additions and 599 deletions
--- a/selfdrive/modeld/thneed/kernels/convolution_.cl
+++ b/selfdrive/modeld/thneed/kernels/convolution_.cl
@ -0,0 +1,272 @@
+    read_only image2d_t input,
+#ifndef DEPTHWISE
+    short startPackedInputChannel,
+    short numPackedInputChannelsForGroup, short totalNumPackedInputChannels,
+    // typo required for API compatibility
+    short packedOuputChannelOffset, short totalNumPackedOutputChannels,
+#else
+    short totalNumPackedChannels,
+#endif
+    read_only image2d_t weights, __constant float *biases,
+    short filterSizeX, short filterSizeY,
+    write_only image2d_t output,
+    short paddingX, short paddingY, short strideX, short strideY,
+#ifdef SUPPORT_DILATION
+    short dilationX, short dilationY,
+#endif
+    short neuron, float a, float b, float min_clamp, float max_clamp,
+#ifndef DEPTHWISE
+    // note: these are not supported
+    __constant float *parameters, __constant float *batchNormBiases,
+#endif
+    short numOutputColumns
+#ifdef SUPPORT_ACCUMULATION
+    , short doAccumulate, read_only image2d_t accumulator
+#endif
+    ) {
+
+#ifndef NUM_OUTPUTS
+  #define NUM_OUTPUTS 4
+#endif
+
+  // init
+  const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+  short packedOutputChannel = get_global_id(0);
+  short startOutputColumn = mul24((short)get_global_id(1), NUM_OUTPUTS);
+  short outputRow = get_global_id(2);
+
+#ifdef DEPTHWISE
+  short totalNumPackedInputChannels = totalNumPackedChannels;
+  short totalNumPackedOutputChannels = totalNumPackedChannels;
+  short startPackedInputChannel = packedOutputChannel;
+#endif
+
+  short startX = mad24(mad24(startOutputColumn, strideX, -paddingX), totalNumPackedInputChannels, startPackedInputChannel);
+  short strideWithChannels = mul24(strideX, totalNumPackedInputChannels);
+
+  float4 outputValues[NUM_OUTPUTS];
+  for (short i = 0; i < NUM_OUTPUTS; ++i) {
+    outputValues[i] = (float4)(0, 0, 0, 0);
+  }
+
+  int2 inputLocation;
+  inputLocation.y = mad24(outputRow, strideY, -paddingY);
+
+  int2 weightLocation;
+  weightLocation.x = 0;
+  weightLocation.y = packedOutputChannel;
+
+#ifdef DEPTHWISE
+
+#ifdef SUPPORT_DILATION
+
+  // depthwise convolution
+  for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) {
+    for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) {
+      short dilatedStepX = mul24(totalNumPackedChannels, dilationX);
+      inputLocation.x = mad24(rfColumn, dilatedStepX, startX);
+      float4 inputValues[4];
+      for (short i = 0; i < 4; ++i) {
+        inputValues[i] = read_imagef(input, smp, inputLocation);
+        inputLocation.x += strideWithChannels;
+      }
+      float4 weightValues = read_imagef(weights, smp, weightLocation);
+      ++weightLocation.x;
+      outputValues[0] += inputValues[0] * weightValues;
+      outputValues[1] += inputValues[1] * weightValues;
+      outputValues[2] += inputValues[2] * weightValues;
+      outputValues[3] += inputValues[3] * weightValues;
+    }
+    inputLocation.y += dilationY;
+  }
+
+#else
+
+  // depthwise unstrided convolution
+  for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) {
+    float4 inputValues[4];
+    inputLocation.x = startX;
+    for (short i = 1; i < 4; ++i) {
+      inputValues[i] = read_imagef(input, smp, inputLocation);
+      inputLocation.x += totalNumPackedOutputChannels;
+    }
+    for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) {
+      inputValues[0] = inputValues[1];
+      inputValues[1] = inputValues[2];
+      inputValues[2] = inputValues[3];
+      inputValues[3] = read_imagef(input, smp, inputLocation);
+      inputLocation.x += totalNumPackedChannels;
+      float4 weightValues = read_imagef(weights, smp, weightLocation);
+      ++weightLocation.x;
+      outputValues[0] += inputValues[0] * weightValues;
+      outputValues[1] += inputValues[1] * weightValues;
+      outputValues[2] += inputValues[2] * weightValues;
+      outputValues[3] += inputValues[3] * weightValues;
+    }
+    ++inputLocation.y;
+  }
+
+#endif
+
+#elif defined(ONLY_1X1_CONV)
+
+  // 1x1 convolution
+  short endPackedInputChannel = startPackedInputChannel + numPackedInputChannelsForGroup;
+  for (short packedInputChannel = startPackedInputChannel; packedInputChannel < endPackedInputChannel; ++packedInputChannel) {
+    float4 weightValues[4];
+    for (short outChIdx = 0; outChIdx < 4; ++outChIdx) {
+      weightValues[outChIdx] = read_imagef(weights, smp, weightLocation);
+      ++weightLocation.x;
+    }
+
+    inputLocation.x = startX + packedInputChannel;
+    float4 inputValues[NUM_OUTPUTS];
+    for (short i = 0; i < NUM_OUTPUTS; ++i) {
+      inputValues[i] = read_imagef(input, smp, inputLocation);
+      inputLocation.x += strideWithChannels;
+    }
+
+    for (short i = 0; i < NUM_OUTPUTS; ++i) {
+      float4 curOutputValues = outputValues[i];
+      curOutputValues.x += inputValues[i].x * weightValues[0].x;
+      curOutputValues.x += inputValues[i].y * weightValues[0].y;
+      curOutputValues.x += inputValues[i].z * weightValues[0].z;
+      curOutputValues.x += inputValues[i].w * weightValues[0].w;
+      curOutputValues.y += inputValues[i].x * weightValues[1].x;
+      curOutputValues.y += inputValues[i].y * weightValues[1].y;
+      curOutputValues.y += inputValues[i].z * weightValues[1].z;
+      curOutputValues.y += inputValues[i].w * weightValues[1].w;
+      curOutputValues.z += inputValues[i].x * weightValues[2].x;
+      curOutputValues.z += inputValues[i].y * weightValues[2].y;
+      curOutputValues.z += inputValues[i].z * weightValues[2].z;
+      curOutputValues.z += inputValues[i].w * weightValues[2].w;
+      curOutputValues.w += inputValues[i].x * weightValues[3].x;
+      curOutputValues.w += inputValues[i].y * weightValues[3].y;
+      curOutputValues.w += inputValues[i].z * weightValues[3].z;
+      curOutputValues.w += inputValues[i].w * weightValues[3].w;
+      outputValues[i] = curOutputValues;
+    }
+  }
+  packedOutputChannel += packedOuputChannelOffset;
+
+#else
+
+  // normal convolution
+  for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) {
+    for (short packedInputChannel = 0; packedInputChannel < numPackedInputChannelsForGroup; ++packedInputChannel) {
+      short startXForChannel = startX + packedInputChannel;
+      for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) {
+
+        float4 weightValues[4];
+        for (short outChIdx = 0; outChIdx < 4; ++outChIdx) {
+          weightValues[outChIdx] = read_imagef(weights, smp, weightLocation);
+          ++weightLocation.x;
+        }
+
+#ifdef SUPPORT_DILATION
+        short dilatedStepX = mul24(totalNumPackedInputChannels, dilationX);
+        inputLocation.x = mad24(rfColumn, dilatedStepX, startXForChannel);
+#else
+        inputLocation.x = mad24(rfColumn, totalNumPackedInputChannels, startXForChannel);
+#endif
+        float4 inputValues[NUM_OUTPUTS];
+        for (short i = 0; i < NUM_OUTPUTS; ++i) {
+          inputValues[i] = read_imagef(input, smp, inputLocation);
+          inputLocation.x += strideWithChannels;
+        }
+        for (short i = 0; i < NUM_OUTPUTS; ++i) {
+          float4 curOutputValues = outputValues[i];
+          curOutputValues.x += inputValues[i].x * weightValues[0].x;
+          curOutputValues.x += inputValues[i].y * weightValues[0].y;
+          curOutputValues.x += inputValues[i].z * weightValues[0].z;
+          curOutputValues.x += inputValues[i].w * weightValues[0].w;
+          curOutputValues.y += inputValues[i].x * weightValues[1].x;
+          curOutputValues.y += inputValues[i].y * weightValues[1].y;
+          curOutputValues.y += inputValues[i].z * weightValues[1].z;
+          curOutputValues.y += inputValues[i].w * weightValues[1].w;
+          curOutputValues.z += inputValues[i].x * weightValues[2].x;
+          curOutputValues.z += inputValues[i].y * weightValues[2].y;
+          curOutputValues.z += inputValues[i].z * weightValues[2].z;
+          curOutputValues.z += inputValues[i].w * weightValues[2].w;
+          curOutputValues.w += inputValues[i].x * weightValues[3].x;
+          curOutputValues.w += inputValues[i].y * weightValues[3].y;
+          curOutputValues.w += inputValues[i].z * weightValues[3].z;
+          curOutputValues.w += inputValues[i].w * weightValues[3].w;
+          outputValues[i] = curOutputValues;
+        }
+      }
+    }
+#ifdef SUPPORT_DILATION
+    inputLocation.y += dilationY;
+#else
+    ++inputLocation.y;
+#endif
+  }
+  packedOutputChannel += packedOuputChannelOffset;
+#endif
+
+  // bias
+  short outputChannel = mul24(packedOutputChannel, 4);
+  float4 biasValues = vload4(0, biases + outputChannel);
+  for (short i = 0; i < NUM_OUTPUTS; ++i) {
+    outputValues[i] += biasValues;
+  }
+
+#ifdef SUPPORT_ACCUMULATION
+  // accumulate
+  if (doAccumulate) {
+    int2 outputLocation;
+    short outputColumn = startOutputColumn;
+    outputLocation.y = outputRow;
+    for (short i = 0; i < NUM_OUTPUTS; ++i) {
+      outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel);
+      if (outputColumn < numOutputColumns) {
+        outputValues[i] += read_imagef(accumulator, smp, outputLocation);
+      }
+      ++outputColumn;
+    }
+  }
+#endif
+
+  // activation
+  switch (neuron) {
+  case 1:
+    for (short i = 0; i < NUM_OUTPUTS; ++i) {
+      outputValues[i] = max(outputValues[i], 0.0f);
+    }
+    break;
+  case 2:
+    for (short i = 0; i < NUM_OUTPUTS; ++i) {
+      outputValues[i] = a * tanh(b * outputValues[i]);
+    }
+    break;
+  case 3:
+    for (short i = 0; i < NUM_OUTPUTS; ++i) {
+      outputValues[i] = native_recip(1.0f + native_exp(-a * outputValues[i] + b));
+    }
+    break;
+  case 4:
+    for (short i = 0; i < NUM_OUTPUTS; ++i) {
+      outputValues[i] = max(outputValues[i], min_clamp);
+      outputValues[i] = min(outputValues[i], max_clamp);
+    }
+    break;
+  case 5:
+    for (short i = 0; i < NUM_OUTPUTS; ++i) {
+      outputValues[i] = max(outputValues[i], 0.0f) + a * (native_exp(min(outputValues[i], 0.0f)) - 1.0f);
+    }
+    break;
+  }
+
+  // output
+  int2 outputLocation;
+  short outputColumn = startOutputColumn;
+  outputLocation.y = outputRow;
+  for (short i = 0; i < NUM_OUTPUTS; ++i) {
+    outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel);
+    if (outputColumn < numOutputColumns) {
+      write_imagef(output, outputLocation, outputValues[i]);
+    }
+    ++outputColumn;
+  }
+}
--- a/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads.cl
+++ b/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads.cl
@ -1,129 +1,4 @@
-__kernel void convolution_horizontal_reduced_reads(
-    read_only image2d_t input,
-    short startPackedInputChannel,
-    short numPackedInputChannelsForGroup, short totalNumPackedInputChannels,
-    short packedOuputChannelOffset, short totalNumPackedOutputChannels,
-    read_only image2d_t weights, __constant float *biases,
-    short filterSizeX, short filterSizeY,
-    write_only image2d_t output,
-    short paddingX, short paddingY, short strideX, short strideY,
-    short dilationX, short dilationY,
-    short neuron, float a, float b, float min_clamp, float max_clamp,
-    __constant float *parameters, __constant float *batchNormBiases,
-    short numOutputColumns) {
-
-  // init
-  const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-  short packedOutputChannel = get_global_id(0);
-  short startOutputColumn = mul24((short)get_global_id(1), 4);
-  short outputRow = get_global_id(2);
-  short startX = mad24(mad24(startOutputColumn, strideX, -paddingX),
-                       totalNumPackedInputChannels, startPackedInputChannel);
-  short strideWithChannels = mul24(strideX, totalNumPackedInputChannels);
-
-  float4 outputValues[4];
-  for (short i = 0; i < 4; ++i) {
-    outputValues[i] = (float4)(0, 0, 0, 0);
-  }
-
-  int2 inputLocation;
-  inputLocation.y = mad24(outputRow, strideY, -paddingY);
-
-  int2 weightLocation;
-  weightLocation.x = 0;
-  weightLocation.y = packedOutputChannel;
+#define SUPPORT_DILATION

-  // convolution
-  for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) {
-    for (short packedInputChannel = 0;
-         packedInputChannel < numPackedInputChannelsForGroup;
-         ++packedInputChannel) {
-      short startXForChannel = startX + packedInputChannel;
-      for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) {
-        float4 weightValues[4];
-        for (short outChIdx = 0; outChIdx < 4; ++outChIdx) {
-          weightValues[outChIdx] = read_imagef(weights, smp, weightLocation);
-          ++weightLocation.x;
-        }
-        short dilatedStepX = mul24(totalNumPackedInputChannels, dilationX);
-        inputLocation.x = mad24(rfColumn, dilatedStepX, startXForChannel);
-        float4 inputValues[4];
-        for (short i = 0; i < 4; ++i) {
-          inputValues[i] = read_imagef(input, smp, inputLocation);
-          inputLocation.x += strideWithChannels;
-        }
-        for (short i = 0; i < 4; ++i) {
-          float4 curOutputValues = outputValues[i];
-          curOutputValues.x += inputValues[i].x * weightValues[0].x;
-          curOutputValues.x += inputValues[i].y * weightValues[0].y;
-          curOutputValues.x += inputValues[i].z * weightValues[0].z;
-          curOutputValues.x += inputValues[i].w * weightValues[0].w;
-          curOutputValues.y += inputValues[i].x * weightValues[1].x;
-          curOutputValues.y += inputValues[i].y * weightValues[1].y;
-          curOutputValues.y += inputValues[i].z * weightValues[1].z;
-          curOutputValues.y += inputValues[i].w * weightValues[1].w;
-          curOutputValues.z += inputValues[i].x * weightValues[2].x;
-          curOutputValues.z += inputValues[i].y * weightValues[2].y;
-          curOutputValues.z += inputValues[i].z * weightValues[2].z;
-          curOutputValues.z += inputValues[i].w * weightValues[2].w;
-          curOutputValues.w += inputValues[i].x * weightValues[3].x;
-          curOutputValues.w += inputValues[i].y * weightValues[3].y;
-          curOutputValues.w += inputValues[i].z * weightValues[3].z;
-          curOutputValues.w += inputValues[i].w * weightValues[3].w;
-          outputValues[i] = curOutputValues;
-        }
-      }
-    }
-    inputLocation.y += dilationY;
-  }
-
-  // bias
-  packedOutputChannel += packedOuputChannelOffset;
-  short outputChannel = mul24(packedOutputChannel, 4);
-  float4 biasValues = vload4(0, biases + outputChannel);
-  for (short i = 0; i < 4; ++i) {
-    outputValues[i] += biasValues;
-  }
-
-  // activation
-  switch (neuron) {
-  case 1:
-    for (short i = 0; i < 4; ++i) {
-      outputValues[i] = max(outputValues[i], 0.0f);
-    }
-    break;
-  case 2:
-    for (short i = 0; i < 4; ++i) {
-      outputValues[i] = a * tanh(b * outputValues[i]);
-    }
-    break;
-  case 3:
-    for (short i = 0; i < 4; ++i) {
-      outputValues[i] = native_recip(1.0f + native_exp(-a * outputValues[i] + b));
-    }
-    break;
-  case 4:
-    for (short i = 0; i < 4; ++i) {
-      outputValues[i] = max(outputValues[i], min_clamp);
-      outputValues[i] = min(outputValues[i], max_clamp);
-    }
-    break;
-  case 5:
-    for (short i = 0; i < 4; ++i) {
-      outputValues[i] = max(outputValues[i], 0.0f) + a * (native_exp(min(outputValues[i], 0.0f)) - 1.0f);
-    }
-    break;
-  }
-
-  // output
-  int2 outputLocation;
-  short outputColumn = startOutputColumn;
-  outputLocation.y = outputRow;
-  for (short i = 0; i < 4; ++i) {
-    outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel);
-    if (outputColumn < numOutputColumns) {
-      write_imagef(output, outputLocation, outputValues[i]);
-    }
-    ++outputColumn;
-  }
-}
+__kernel void convolution_horizontal_reduced_reads(
+#include "convolution_.cl"
--- a/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads_1x1.cl
+++ b/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads_1x1.cl
@ -1,140 +1,5 @@
-__kernel void convolution_horizontal_reduced_reads_1x1(
-    read_only image2d_t input,
-    short startPackedInputChannel,
-    short numPackedInputChannelsForGroup, short totalNumPackedInputChannels,
-    short packedOuputChannelOffset, short totalNumPackedOutputChannels,
-    read_only image2d_t weights, __constant float *biases,
-    short filterSizeX, short filterSizeY,
-    write_only image2d_t output,
-    short paddingX, short paddingY, short strideX, short strideY,
-    short neuron, float a, float b, float min_clamp, float max_clamp,
-    __constant float *parameters, __constant float *batchNormBiases,
-    short numOutputColumns,
-    short doAccumulate, read_only image2d_t accumulator) {
-
-  // init
-  const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-  short packedOutputChannel = get_global_id(0);
-  short startOutputColumn = mul24((short)get_global_id(1), 4);
-  short outputRow = get_global_id(2);
-  short endPackedInputChannel = startPackedInputChannel + numPackedInputChannelsForGroup;
-  short startX = mad24(mad24(startOutputColumn, strideX, -paddingX),
-                       totalNumPackedInputChannels, startPackedInputChannel);
-  short strideWithChannels = mul24(strideX, totalNumPackedInputChannels);
-
-  float4 outputValues[4];
-  for (short i = 0; i < 4; ++i) {
-    outputValues[i] = (float4)(0, 0, 0, 0);
-  }
-
-  int2 inputLocation;
-  inputLocation.y = mad24(outputRow, strideY, -paddingY);
-
-  int2 weightLocation;
-  weightLocation.x = 0;
-  weightLocation.y = packedOutputChannel;
-
-  // convolution
-  for (short packedInputChannel = startPackedInputChannel;
-       packedInputChannel < endPackedInputChannel; ++packedInputChannel) {
-
-    float4 weightValues[4];
-    for (short outChIdx = 0; outChIdx < 4; ++outChIdx) {
-      weightValues[outChIdx] = read_imagef(weights, smp, weightLocation);
-      ++weightLocation.x;
-    }
-
-    inputLocation.x = startX + packedInputChannel;
-    float4 inputValues[4];
-    for (short i = 0; i < 4; ++i) {
-      inputValues[i] = read_imagef(input, smp, inputLocation);
-      inputLocation.x += strideWithChannels;
-    }
-
-    for (short i = 0; i < 4; ++i) {
-      float4 curOutputValues = outputValues[i];
-      curOutputValues.x += inputValues[i].x * weightValues[0].x;
-      curOutputValues.x += inputValues[i].y * weightValues[0].y;
-      curOutputValues.x += inputValues[i].z * weightValues[0].z;
-      curOutputValues.x += inputValues[i].w * weightValues[0].w;
-      curOutputValues.y += inputValues[i].x * weightValues[1].x;
-      curOutputValues.y += inputValues[i].y * weightValues[1].y;
-      curOutputValues.y += inputValues[i].z * weightValues[1].z;
-      curOutputValues.y += inputValues[i].w * weightValues[1].w;
-      curOutputValues.z += inputValues[i].x * weightValues[2].x;
-      curOutputValues.z += inputValues[i].y * weightValues[2].y;
-      curOutputValues.z += inputValues[i].z * weightValues[2].z;
-      curOutputValues.z += inputValues[i].w * weightValues[2].w;
-      curOutputValues.w += inputValues[i].x * weightValues[3].x;
-      curOutputValues.w += inputValues[i].y * weightValues[3].y;
-      curOutputValues.w += inputValues[i].z * weightValues[3].z;
-      curOutputValues.w += inputValues[i].w * weightValues[3].w;
-      outputValues[i] = curOutputValues;
-    }
-  }
-
-  // bias
-  packedOutputChannel += packedOuputChannelOffset;
-  short outputChannel = mul24(packedOutputChannel, 4);
-  float4 biasValues = vload4(0, biases + outputChannel);
-  for (short i = 0; i < 4; ++i) {
-    outputValues[i] += biasValues;
-  }
-
-  // accumulate
-  if (doAccumulate) {
-    int2 outputLocation;
-    short outputColumn = startOutputColumn;
-    outputLocation.y = outputRow;
-    for (short i = 0; i < 4; ++i) {
-      outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel);
-      if (outputColumn < numOutputColumns) {
-        outputValues[i] += read_imagef(accumulator, smp, outputLocation);
-      }
-      ++outputColumn;
-    }
-  }
-
-  // activation
-  switch (neuron) {
-  case 1:
-    for (short i = 0; i < 4; ++i) {
-      outputValues[i] = max(outputValues[i], 0.0f);
-    }
-    break;
-  case 2:
-    for (short i = 0; i < 4; ++i) {
-      outputValues[i] = a * tanh(b * outputValues[i]);
-    }
-    break;
-  case 3:
-    for (short i = 0; i < 4; ++i) {
-      outputValues[i] = native_recip(1.0f + native_exp(-a * outputValues[i] + b));
-    }
-    break;
-  case 4:
-    for (short i = 0; i < 4; ++i) {
-      outputValues[i] = max(outputValues[i], min_clamp);
-      outputValues[i] = min(outputValues[i], max_clamp);
-    }
-    break;
-  case 5:
-    for (short i = 0; i < 4; ++i) {
-      outputValues[i] = max(outputValues[i], 0.0f) + a * (native_exp(min(outputValues[i], 0.0f)) - 1.0f);
-    }
-    break;
-  }
-
-  // output
-  int2 outputLocation;
-  short outputColumn = startOutputColumn;
-  outputLocation.y = outputRow;
-  for (short i = 0; i < 4; ++i) {
-    outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel);
-    if (outputColumn < numOutputColumns) {
-      write_imagef(output, outputLocation, outputValues[i]);
-    }
-    ++outputColumn;
-  }
-}
+#define ONLY_1X1_CONV
+#define SUPPORT_ACCUMULATION

+__kernel void convolution_horizontal_reduced_reads_1x1(
+#include "convolution_.cl"
--- a/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads_5_outputs.cl
+++ b/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads_5_outputs.cl
@ -1,130 +1,4 @@
-__kernel void convolution_horizontal_reduced_reads_5_outputs(
-    read_only image2d_t input,
-    short startPackedInputChannel,
-    short numPackedInputChannelsForGroup, short totalNumPackedInputChannels,
-    short packedOuputChannelOffset, short totalNumPackedOutputChannels,
-    read_only image2d_t weights, __constant float *biases,
-    short filterSizeX, short filterSizeY,
-    write_only image2d_t output,
-    short paddingX, short paddingY, short strideX, short strideY,
-    short neuron, float a, float b, float min_clamp, float max_clamp,
-    __constant float *parameters, __constant float *batchNormBiases,
-    short numOutputColumns) {
-
-  // init
-  const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-  short packedOutputChannel = get_global_id(0);
-  short startOutputColumn = mul24((short)get_global_id(1), 5);
-  short outputRow = get_global_id(2);
-  short startX = mad24(mad24(startOutputColumn, strideX, -paddingX),
-                       totalNumPackedInputChannels, startPackedInputChannel);
-  short strideWithChannels = mul24(strideX, totalNumPackedInputChannels);
-
-  float4 outputValues[5];
-  for (short i = 0; i < 5; ++i) {
-    outputValues[i] = (float4)(0, 0, 0, 0);
-  }
-
-  int2 inputLocation;
-  inputLocation.y = mad24(outputRow, strideY, -paddingY);
-
-  int2 weightLocation;
-  weightLocation.x = 0;
-  weightLocation.y = packedOutputChannel;
-
-  // convolution
-  for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) {
-    for (short packedInputChannel = 0;
-         packedInputChannel < numPackedInputChannelsForGroup;
-         ++packedInputChannel) {
-      short startXForChannel = startX + packedInputChannel;
-      for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) {
-        float4 weightValues[4];
-        for (short outChIdx = 0; outChIdx < 4; ++outChIdx) {
-          weightValues[outChIdx] = read_imagef(weights, smp, weightLocation);
-          ++weightLocation.x;
-        }
-        inputLocation.x =
-            mad24(rfColumn, totalNumPackedInputChannels, startXForChannel);
+#define NUM_OUTPUTS 5

-        float4 inputValues[5];
-        for (short i = 0; i < 5; ++i) {
-          inputValues[i] = read_imagef(input, smp, inputLocation);
-          inputLocation.x += strideWithChannels;
-        }
-
-        for (short i = 0; i < 5; ++i) {
-          float4 curOutputValues = outputValues[i];
-          curOutputValues.x += inputValues[i].x * weightValues[0].x;
-          curOutputValues.x += inputValues[i].y * weightValues[0].y;
-          curOutputValues.x += inputValues[i].z * weightValues[0].z;
-          curOutputValues.x += inputValues[i].w * weightValues[0].w;
-          curOutputValues.y += inputValues[i].x * weightValues[1].x;
-          curOutputValues.y += inputValues[i].y * weightValues[1].y;
-          curOutputValues.y += inputValues[i].z * weightValues[1].z;
-          curOutputValues.y += inputValues[i].w * weightValues[1].w;
-          curOutputValues.z += inputValues[i].x * weightValues[2].x;
-          curOutputValues.z += inputValues[i].y * weightValues[2].y;
-          curOutputValues.z += inputValues[i].z * weightValues[2].z;
-          curOutputValues.z += inputValues[i].w * weightValues[2].w;
-          curOutputValues.w += inputValues[i].x * weightValues[3].x;
-          curOutputValues.w += inputValues[i].y * weightValues[3].y;
-          curOutputValues.w += inputValues[i].z * weightValues[3].z;
-          curOutputValues.w += inputValues[i].w * weightValues[3].w;
-          outputValues[i] = curOutputValues;
-        }
-      }
-    }
-    ++inputLocation.y;
-  }
-
-  // bias
-  packedOutputChannel += packedOuputChannelOffset;
-  short outputChannel = mul24(packedOutputChannel, 4);
-  float4 biasValues = vload4(0, biases + outputChannel);
-  for (short i = 0; i < 5; ++i) {
-    outputValues[i] += biasValues;
-  }
-
-  // activation
-  switch (neuron) {
-  case 1:
-    for (short i = 0; i < 5; ++i) {
-      outputValues[i] = max(outputValues[i], 0.0f);
-    }
-    break;
-  case 2:
-    for (short i = 0; i < 5; ++i) {
-      outputValues[i] = a * tanh(b * outputValues[i]);
-    }
-    break;
-  case 3:
-    for (short i = 0; i < 5; ++i) {
-      outputValues[i] = native_recip(1.0f + native_exp(-a * outputValues[i] + b));
-    }
-    break;
-  case 4:
-    for (short i = 0; i < 5; ++i) {
-      outputValues[i] = max(outputValues[i], min_clamp);
-      outputValues[i] = min(outputValues[i], max_clamp);
-    }
-    break;
-  case 5:
-    for (short i = 0; i < 5; ++i) {
-      outputValues[i] = max(outputValues[i], 0.0f) + a * (native_exp(min(outputValues[i], 0.0f)) - 1.0f);
-    }
-    break;
-  }
-
-  // output
-  int2 outputLocation;
-  short outputColumn = startOutputColumn;
-  outputLocation.y = outputRow;
-  for (short i = 0; i < 5; ++i) {
-    outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel);
-    if (outputColumn < numOutputColumns) {
-      write_imagef(output, outputLocation, outputValues[i]);
-    }
-    ++outputColumn;
-  }
-}
+__kernel void convolution_horizontal_reduced_reads_5_outputs(
+#include "convolution_.cl"
--- a/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads_depthwise.cl
+++ b/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads_depthwise.cl
@ -1,101 +1,5 @@
-__kernel void convolution_horizontal_reduced_reads_depthwise(
-    read_only image2d_t input,
-    short totalNumPackedChannels,
-    read_only image2d_t weights, __constant float *biases,
-    short filterSizeX, short filterSizeY,
-    write_only image2d_t output,
-    short paddingX, short paddingY, short strideX, short strideY,
-    short dilationX, short dilationY,
-    short neuron, float a, float b, float min_clamp, float max_clamp,
-    short numOutputColumns) {
-
-  // init
-  const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-  short packedChannel = get_global_id(0);
-  short startOutputColumn = mul24((short)get_global_id(1), 4);
-  short outputRow = get_global_id(2);
-  short startXForChannel = mad24(mad24(startOutputColumn, strideX, -paddingX),
-                                 totalNumPackedChannels, packedChannel);
-  short strideWithChannels = mul24(strideX, totalNumPackedChannels);
-
-  float4 outputValues[4];
-  for (short i = 0; i < 4; ++i) {
-    outputValues[i] = (float4)(0, 0, 0, 0);
-  }
-
-  int2 inputLocation;
-  inputLocation.y = mad24(outputRow, strideY, -paddingY);
-
-  int2 weightLocation;
-  weightLocation.x = 0;
-  weightLocation.y = packedChannel;
+#define DEPTHWISE
+#define SUPPORT_DILATION

-  // convolution
-  for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) {
-    for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) {
-      short dilatedStepX = mul24(totalNumPackedChannels, dilationX);
-      inputLocation.x = mad24(rfColumn, dilatedStepX, startXForChannel);
-      float4 inputValues[4];
-      for (short i = 0; i < 4; ++i) {
-        inputValues[i] = read_imagef(input, smp, inputLocation);
-        inputLocation.x += strideWithChannels;
-      }
-      float4 weightValues = read_imagef(weights, smp, weightLocation);
-      ++weightLocation.x;
-      outputValues[0] += inputValues[0] * weightValues;
-      outputValues[1] += inputValues[1] * weightValues;
-      outputValues[2] += inputValues[2] * weightValues;
-      outputValues[3] += inputValues[3] * weightValues;
-    }
-    inputLocation.y += dilationY;
-  }
-
-  // bias
-  short outputChannel = mul24(packedChannel, 4);
-  float4 biasValues = vload4(0, biases + outputChannel);
-  for (short i = 0; i < 4; ++i) {
-    outputValues[i] += biasValues;
-  }
-
-  // activation
-  switch (neuron) {
-  case 1:
-    for (short i = 0; i < 4; ++i) {
-      outputValues[i] = max(outputValues[i], 0.0f);
-    }
-    break;
-  case 2:
-    for (short i = 0; i < 4; ++i) {
-      outputValues[i] = a * tanh(b * outputValues[i]);
-    }
-    break;
-  case 3:
-    for (short i = 0; i < 4; ++i) {
-      outputValues[i] = native_recip(1.0f + native_exp(-a * outputValues[i] + b));
-    }
-    break;
-  case 4:
-    for (short i = 0; i < 4; ++i) {
-      outputValues[i] = max(outputValues[i], min_clamp);
-      outputValues[i] = min(outputValues[i], max_clamp);
-    }
-    break;
-  case 5:
-    for (short i = 0; i < 4; ++i) {
-      outputValues[i] = max(outputValues[i], 0.0f) + a * (native_exp(min(outputValues[i], 0.0f)) - 1.0f);
-    }
-    break;
-  }
-
-  // output
-  int2 outputLocation;
-  short outputColumn = startOutputColumn;
-  outputLocation.y = outputRow;
-  for (short i = 0; i < 4; ++i) {
-    outputLocation.x = mad24(outputColumn, totalNumPackedChannels, packedChannel);
-    if (outputColumn < numOutputColumns) {
-      write_imagef(output, outputLocation, outputValues[i]);
-    }
-    ++outputColumn;
-  }
-}
+__kernel void convolution_horizontal_reduced_reads_depthwise(
+#include "convolution_.cl"
--- a/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads_depthwise_stride_1.cl
+++ b/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads_depthwise_stride_1.cl
@ -1,103 +1,4 @@
-__kernel void convolution_horizontal_reduced_reads_depthwise_stride_1(
-    read_only image2d_t input,
-    short totalNumPackedChannels,
-    read_only image2d_t weights, __constant float *biases,
-    short filterSizeX, short filterSizeY,
-    write_only image2d_t output,
-    short paddingX, short paddingY, short strideX, short strideY,
-    short neuron, float a, float b, float min_clamp, float max_clamp,
-    short numOutputColumns) {
-
-  // init
-  const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-  short packedChannel = get_global_id(0);
-  short startOutputColumn = mul24((short)get_global_id(1), 4);
-  short outputRow = get_global_id(2);
-  short startXForChannel = mad24(mad24(startOutputColumn, strideX, -paddingX),
-                                 totalNumPackedChannels, packedChannel);
-
-  float4 outputValues[4];
-  for (short i = 0; i < 4; ++i) {
-    outputValues[i] = (float4)(0, 0, 0, 0);
-  }
-
-  int2 inputLocation;
-  inputLocation.y = mad24(outputRow, strideY, -paddingY);
-
-  int2 weightLocation;
-  weightLocation.x = 0;
-  weightLocation.y = packedChannel;
+#define DEPTHWISE

-  // convolution
-  for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) {
-    float4 inputValues[4];
-    inputLocation.x = startXForChannel;
-    for (short i = 1; i < 4; ++i) {
-      inputValues[i] = read_imagef(input, smp, inputLocation);
-      inputLocation.x += totalNumPackedChannels;
-    }
-    for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) {
-      inputValues[0] = inputValues[1];
-      inputValues[1] = inputValues[2];
-      inputValues[2] = inputValues[3];
-      inputValues[3] = read_imagef(input, smp, inputLocation);
-      inputLocation.x += totalNumPackedChannels;
-      float4 weightValues = read_imagef(weights, smp, weightLocation);
-      ++weightLocation.x;
-      outputValues[0] += inputValues[0] * weightValues;
-      outputValues[1] += inputValues[1] * weightValues;
-      outputValues[2] += inputValues[2] * weightValues;
-      outputValues[3] += inputValues[3] * weightValues;
-    }
-    ++inputLocation.y;
-  }
-
-  // bias
-  short outputChannel = mul24(packedChannel, 4);
-  float4 biasValues = vload4(0, biases + outputChannel);
-  for (short i = 0; i < 4; ++i) {
-    outputValues[i] += biasValues;
-  }
-
-  // activation
-  switch (neuron) {
-  case 1:
-    for (short i = 0; i < 4; ++i) {
-      outputValues[i] = max(outputValues[i], 0.0f);
-    }
-    break;
-  case 2:
-    for (short i = 0; i < 4; ++i) {
-      outputValues[i] = a * tanh(b * outputValues[i]);
-    }
-    break;
-  case 3:
-    for (short i = 0; i < 4; ++i) {
-      outputValues[i] = native_recip(1.0f + native_exp(-a * outputValues[i] + b));
-    }
-    break;
-  case 4:
-    for (short i = 0; i < 4; ++i) {
-      outputValues[i] = max(outputValues[i], min_clamp);
-      outputValues[i] = min(outputValues[i], max_clamp);
-    }
-    break;
-  case 5:
-    for (short i = 0; i < 4; ++i) {
-      outputValues[i] = max(outputValues[i], 0.0f) + a * (native_exp(min(outputValues[i], 0.0f)) - 1.0f);
-    }
-    break;
-  }
-
-  // output
-  int2 outputLocation;
-  short outputColumn = startOutputColumn;
-  outputLocation.y = outputRow;
-  for (short i = 0; i < 4; ++i) {
-    outputLocation.x = mad24(outputColumn, totalNumPackedChannels, packedChannel);
-    if (outputColumn < numOutputColumns) {
-      write_imagef(output, outputLocation, outputValues[i]);
-    }
-    ++outputColumn;
-  }
-}
+__kernel void convolution_horizontal_reduced_reads_depthwise_stride_1(
+#include "convolution_.cl"
--- a/selfdrive/modeld/thneed/optimizer.cc
+++ b/selfdrive/modeld/thneed/optimizer.cc
@ -80,7 +80,9 @@ int Thneed::optimize() {

        printf("building kernel %s\n", k->name.c_str());
        k->program = clCreateProgramWithSource(context, 1, srcs, &length, NULL);
-        int err = clBuildProgram(k->program, 1, &device_id, "", NULL, NULL);
+        char options[0x100];
+        snprintf(options, sizeof(options)-1, "-I %s", kernel_path);
+        int err = clBuildProgram(k->program, 1, &device_id, options, NULL, NULL);

        if (err != 0) {
          printf("got err %d\n", err);