Refactor convolutions (#23807)

* one conv with defines * add conv * building works on C3 * this is num_outputs too, process replay is so useful Co-authored-by: Comma Device <device@comma.ai>
4 years ago · 2c7542d34e
parent 719801845b
commit 2c7542d34e
7 changed files with 292 additions and 599 deletions
--- a/selfdrive/modeld/thneed/kernels/convolution_.cl
+++ b/selfdrive/modeld/thneed/kernels/convolution_.cl
@ -0,0 +1,272 @@
    read_only image2d_t input,
 #ifndef DEPTHWISE
    short startPackedInputChannel,
    short numPackedInputChannelsForGroup, short totalNumPackedInputChannels,
    // typo required for API compatibility
    short packedOuputChannelOffset, short totalNumPackedOutputChannels,
 #else
    short totalNumPackedChannels,
 #endif
    read_only image2d_t weights, __constant float *biases,
    short filterSizeX, short filterSizeY,
    write_only image2d_t output,
    short paddingX, short paddingY, short strideX, short strideY,
 #ifdef SUPPORT_DILATION
    short dilationX, short dilationY,
 #endif
    short neuron, float a, float b, float min_clamp, float max_clamp,
 #ifndef DEPTHWISE
    // note: these are not supported
    __constant float *parameters, __constant float *batchNormBiases,
 #endif
    short numOutputColumns
 #ifdef SUPPORT_ACCUMULATION
    , short doAccumulate, read_only image2d_t accumulator
 #endif
    ) {
 #ifndef NUM_OUTPUTS
  #define NUM_OUTPUTS 4
 #endif
  // init
  const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
  short packedOutputChannel = get_global_id(0);
  short startOutputColumn = mul24((short)get_global_id(1), NUM_OUTPUTS);
  short outputRow = get_global_id(2);
 #ifdef DEPTHWISE
  short totalNumPackedInputChannels = totalNumPackedChannels;
  short totalNumPackedOutputChannels = totalNumPackedChannels;
  short startPackedInputChannel = packedOutputChannel;
 #endif
  short startX = mad24(mad24(startOutputColumn, strideX, -paddingX), totalNumPackedInputChannels, startPackedInputChannel);
  short strideWithChannels = mul24(strideX, totalNumPackedInputChannels);
  float4 outputValues[NUM_OUTPUTS];
  for (short i = 0; i < NUM_OUTPUTS; ++i) {
    outputValues[i] = (float4)(0, 0, 0, 0);
  }
  int2 inputLocation;
  inputLocation.y = mad24(outputRow, strideY, -paddingY);
  int2 weightLocation;
  weightLocation.x = 0;
  weightLocation.y = packedOutputChannel;
 #ifdef DEPTHWISE
 #ifdef SUPPORT_DILATION
  // depthwise convolution
  for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) {
    for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) {
      short dilatedStepX = mul24(totalNumPackedChannels, dilationX);
      inputLocation.x = mad24(rfColumn, dilatedStepX, startX);
      float4 inputValues[4];
      for (short i = 0; i < 4; ++i) {
        inputValues[i] = read_imagef(input, smp, inputLocation);
        inputLocation.x += strideWithChannels;
      }
      float4 weightValues = read_imagef(weights, smp, weightLocation);
      ++weightLocation.x;
      outputValues[0] += inputValues[0] * weightValues;
      outputValues[1] += inputValues[1] * weightValues;
      outputValues[2] += inputValues[2] * weightValues;
      outputValues[3] += inputValues[3] * weightValues;
    }
    inputLocation.y += dilationY;
  }
 #else
  // depthwise unstrided convolution
  for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) {
    float4 inputValues[4];
    inputLocation.x = startX;
    for (short i = 1; i < 4; ++i) {
      inputValues[i] = read_imagef(input, smp, inputLocation);
      inputLocation.x += totalNumPackedOutputChannels;
    }
    for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) {
      inputValues[0] = inputValues[1];
      inputValues[1] = inputValues[2];
      inputValues[2] = inputValues[3];
      inputValues[3] = read_imagef(input, smp, inputLocation);
      inputLocation.x += totalNumPackedChannels;
      float4 weightValues = read_imagef(weights, smp, weightLocation);
      ++weightLocation.x;
      outputValues[0] += inputValues[0] * weightValues;
      outputValues[1] += inputValues[1] * weightValues;
      outputValues[2] += inputValues[2] * weightValues;
      outputValues[3] += inputValues[3] * weightValues;
    }
    ++inputLocation.y;
  }
 #endif
 #elif defined(ONLY_1X1_CONV)
  // 1x1 convolution
  short endPackedInputChannel = startPackedInputChannel + numPackedInputChannelsForGroup;
  for (short packedInputChannel = startPackedInputChannel; packedInputChannel < endPackedInputChannel; ++packedInputChannel) {
    float4 weightValues[4];
    for (short outChIdx = 0; outChIdx < 4; ++outChIdx) {
      weightValues[outChIdx] = read_imagef(weights, smp, weightLocation);
      ++weightLocation.x;
    }
    inputLocation.x = startX + packedInputChannel;
    float4 inputValues[NUM_OUTPUTS];
    for (short i = 0; i < NUM_OUTPUTS; ++i) {
      inputValues[i] = read_imagef(input, smp, inputLocation);
      inputLocation.x += strideWithChannels;
    }
    for (short i = 0; i < NUM_OUTPUTS; ++i) {
      float4 curOutputValues = outputValues[i];
      curOutputValues.x += inputValues[i].x * weightValues[0].x;
      curOutputValues.x += inputValues[i].y * weightValues[0].y;
      curOutputValues.x += inputValues[i].z * weightValues[0].z;
      curOutputValues.x += inputValues[i].w * weightValues[0].w;
      curOutputValues.y += inputValues[i].x * weightValues[1].x;
      curOutputValues.y += inputValues[i].y * weightValues[1].y;
      curOutputValues.y += inputValues[i].z * weightValues[1].z;
      curOutputValues.y += inputValues[i].w * weightValues[1].w;
      curOutputValues.z += inputValues[i].x * weightValues[2].x;
      curOutputValues.z += inputValues[i].y * weightValues[2].y;
      curOutputValues.z += inputValues[i].z * weightValues[2].z;
      curOutputValues.z += inputValues[i].w * weightValues[2].w;
      curOutputValues.w += inputValues[i].x * weightValues[3].x;
      curOutputValues.w += inputValues[i].y * weightValues[3].y;
      curOutputValues.w += inputValues[i].z * weightValues[3].z;
      curOutputValues.w += inputValues[i].w * weightValues[3].w;
      outputValues[i] = curOutputValues;
    }
  }
  packedOutputChannel += packedOuputChannelOffset;
 #else
  // normal convolution
  for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) {
    for (short packedInputChannel = 0; packedInputChannel < numPackedInputChannelsForGroup; ++packedInputChannel) {
      short startXForChannel = startX + packedInputChannel;
      for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) {
        float4 weightValues[4];
        for (short outChIdx = 0; outChIdx < 4; ++outChIdx) {
          weightValues[outChIdx] = read_imagef(weights, smp, weightLocation);
          ++weightLocation.x;
        }
 #ifdef SUPPORT_DILATION
        short dilatedStepX = mul24(totalNumPackedInputChannels, dilationX);
        inputLocation.x = mad24(rfColumn, dilatedStepX, startXForChannel);
 #else
        inputLocation.x = mad24(rfColumn, totalNumPackedInputChannels, startXForChannel);
 #endif
        float4 inputValues[NUM_OUTPUTS];
        for (short i = 0; i < NUM_OUTPUTS; ++i) {
          inputValues[i] = read_imagef(input, smp, inputLocation);
          inputLocation.x += strideWithChannels;
        }
        for (short i = 0; i < NUM_OUTPUTS; ++i) {
          float4 curOutputValues = outputValues[i];
          curOutputValues.x += inputValues[i].x * weightValues[0].x;
          curOutputValues.x += inputValues[i].y * weightValues[0].y;
          curOutputValues.x += inputValues[i].z * weightValues[0].z;
          curOutputValues.x += inputValues[i].w * weightValues[0].w;
          curOutputValues.y += inputValues[i].x * weightValues[1].x;
          curOutputValues.y += inputValues[i].y * weightValues[1].y;
          curOutputValues.y += inputValues[i].z * weightValues[1].z;
          curOutputValues.y += inputValues[i].w * weightValues[1].w;
          curOutputValues.z += inputValues[i].x * weightValues[2].x;
          curOutputValues.z += inputValues[i].y * weightValues[2].y;
          curOutputValues.z += inputValues[i].z * weightValues[2].z;
          curOutputValues.z += inputValues[i].w * weightValues[2].w;
          curOutputValues.w += inputValues[i].x * weightValues[3].x;
          curOutputValues.w += inputValues[i].y * weightValues[3].y;
          curOutputValues.w += inputValues[i].z * weightValues[3].z;
          curOutputValues.w += inputValues[i].w * weightValues[3].w;
          outputValues[i] = curOutputValues;
        }
      }
    }
 #ifdef SUPPORT_DILATION
    inputLocation.y += dilationY;
 #else
    ++inputLocation.y;
 #endif
  }
  packedOutputChannel += packedOuputChannelOffset;
 #endif
  // bias
  short outputChannel = mul24(packedOutputChannel, 4);
  float4 biasValues = vload4(0, biases + outputChannel);
  for (short i = 0; i < NUM_OUTPUTS; ++i) {
    outputValues[i] += biasValues;
  }
 #ifdef SUPPORT_ACCUMULATION
  // accumulate
  if (doAccumulate) {
    int2 outputLocation;
    short outputColumn = startOutputColumn;
    outputLocation.y = outputRow;
    for (short i = 0; i < NUM_OUTPUTS; ++i) {
      outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel);
      if (outputColumn < numOutputColumns) {
        outputValues[i] += read_imagef(accumulator, smp, outputLocation);
      }
      ++outputColumn;
    }
  }
 #endif
  // activation
  switch (neuron) {
  case 1:
    for (short i = 0; i < NUM_OUTPUTS; ++i) {
      outputValues[i] = max(outputValues[i], 0.0f);
    }
    break;
  case 2:
    for (short i = 0; i < NUM_OUTPUTS; ++i) {
      outputValues[i] = a * tanh(b * outputValues[i]);
    }
    break;
  case 3:
    for (short i = 0; i < NUM_OUTPUTS; ++i) {
      outputValues[i] = native_recip(1.0f + native_exp(-a * outputValues[i] + b));
    }
    break;
  case 4:
    for (short i = 0; i < NUM_OUTPUTS; ++i) {
      outputValues[i] = max(outputValues[i], min_clamp);
      outputValues[i] = min(outputValues[i], max_clamp);
    }
    break;
  case 5:
    for (short i = 0; i < NUM_OUTPUTS; ++i) {
      outputValues[i] = max(outputValues[i], 0.0f) + a * (native_exp(min(outputValues[i], 0.0f)) - 1.0f);
    }
    break;
  }
  // output
  int2 outputLocation;
  short outputColumn = startOutputColumn;
  outputLocation.y = outputRow;
  for (short i = 0; i < NUM_OUTPUTS; ++i) {
    outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel);
    if (outputColumn < numOutputColumns) {
      write_imagef(output, outputLocation, outputValues[i]);
    }
    ++outputColumn;
  }
 }
--- a/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads.cl
+++ b/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads.cl
@ -1,129 +1,4 @@
-__kernel void convolution_horizontal_reduced_reads(
+#define SUPPORT_DILATION
    read_only image2d_t input,
    short startPackedInputChannel,
    short numPackedInputChannelsForGroup, short totalNumPackedInputChannels,
    short packedOuputChannelOffset, short totalNumPackedOutputChannels,
    read_only image2d_t weights, __constant float *biases,
    short filterSizeX, short filterSizeY,
    write_only image2d_t output,
    short paddingX, short paddingY, short strideX, short strideY,
    short dilationX, short dilationY,
    short neuron, float a, float b, float min_clamp, float max_clamp,
    __constant float *parameters, __constant float *batchNormBiases,
    short numOutputColumns) {
  // init
  const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
  short packedOutputChannel = get_global_id(0);
  short startOutputColumn = mul24((short)get_global_id(1), 4);
  short outputRow = get_global_id(2);
  short startX = mad24(mad24(startOutputColumn, strideX, -paddingX),
                       totalNumPackedInputChannels, startPackedInputChannel);
  short strideWithChannels = mul24(strideX, totalNumPackedInputChannels);
  float4 outputValues[4];
  for (short i = 0; i < 4; ++i) {
    outputValues[i] = (float4)(0, 0, 0, 0);
  }
  int2 inputLocation;
  inputLocation.y = mad24(outputRow, strideY, -paddingY);
  int2 weightLocation;
  weightLocation.x = 0;
  weightLocation.y = packedOutputChannel;
-  // convolution
+__kernel void convolution_horizontal_reduced_reads(
-  for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) {
+#include "convolution_.cl"
    for (short packedInputChannel = 0;
         packedInputChannel < numPackedInputChannelsForGroup;
         ++packedInputChannel) {
      short startXForChannel = startX + packedInputChannel;
      for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) {
        float4 weightValues[4];
        for (short outChIdx = 0; outChIdx < 4; ++outChIdx) {
          weightValues[outChIdx] = read_imagef(weights, smp, weightLocation);
          ++weightLocation.x;
        }
        short dilatedStepX = mul24(totalNumPackedInputChannels, dilationX);
        inputLocation.x = mad24(rfColumn, dilatedStepX, startXForChannel);
        float4 inputValues[4];
        for (short i = 0; i < 4; ++i) {
          inputValues[i] = read_imagef(input, smp, inputLocation);
          inputLocation.x += strideWithChannels;
        }
        for (short i = 0; i < 4; ++i) {
          float4 curOutputValues = outputValues[i];
          curOutputValues.x += inputValues[i].x * weightValues[0].x;
          curOutputValues.x += inputValues[i].y * weightValues[0].y;
          curOutputValues.x += inputValues[i].z * weightValues[0].z;
          curOutputValues.x += inputValues[i].w * weightValues[0].w;
          curOutputValues.y += inputValues[i].x * weightValues[1].x;
          curOutputValues.y += inputValues[i].y * weightValues[1].y;
          curOutputValues.y += inputValues[i].z * weightValues[1].z;
          curOutputValues.y += inputValues[i].w * weightValues[1].w;
          curOutputValues.z += inputValues[i].x * weightValues[2].x;
          curOutputValues.z += inputValues[i].y * weightValues[2].y;
          curOutputValues.z += inputValues[i].z * weightValues[2].z;
          curOutputValues.z += inputValues[i].w * weightValues[2].w;
          curOutputValues.w += inputValues[i].x * weightValues[3].x;
          curOutputValues.w += inputValues[i].y * weightValues[3].y;
          curOutputValues.w += inputValues[i].z * weightValues[3].z;
          curOutputValues.w += inputValues[i].w * weightValues[3].w;
          outputValues[i] = curOutputValues;
        }
      }
    }
    inputLocation.y += dilationY;
  }
  // bias
  packedOutputChannel += packedOuputChannelOffset;
  short outputChannel = mul24(packedOutputChannel, 4);
  float4 biasValues = vload4(0, biases + outputChannel);
  for (short i = 0; i < 4; ++i) {
    outputValues[i] += biasValues;
  }
  // activation
  switch (neuron) {
  case 1:
    for (short i = 0; i < 4; ++i) {
      outputValues[i] = max(outputValues[i], 0.0f);
    }
    break;
  case 2:
    for (short i = 0; i < 4; ++i) {
      outputValues[i] = a * tanh(b * outputValues[i]);
    }
    break;
  case 3:
    for (short i = 0; i < 4; ++i) {
      outputValues[i] = native_recip(1.0f + native_exp(-a * outputValues[i] + b));
    }
    break;
  case 4:
    for (short i = 0; i < 4; ++i) {
      outputValues[i] = max(outputValues[i], min_clamp);
      outputValues[i] = min(outputValues[i], max_clamp);
    }
    break;
  case 5:
    for (short i = 0; i < 4; ++i) {
      outputValues[i] = max(outputValues[i], 0.0f) + a * (native_exp(min(outputValues[i], 0.0f)) - 1.0f);
    }
    break;
  }
  // output
  int2 outputLocation;
  short outputColumn = startOutputColumn;
  outputLocation.y = outputRow;
  for (short i = 0; i < 4; ++i) {
    outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel);
    if (outputColumn < numOutputColumns) {
      write_imagef(output, outputLocation, outputValues[i]);
    }
    ++outputColumn;
  }
 }
--- a/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads_1x1.cl
+++ b/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads_1x1.cl
@ -1,140 +1,5 @@
-__kernel void convolution_horizontal_reduced_reads_1x1(
+#define ONLY_1X1_CONV
-    read_only image2d_t input,
+#define SUPPORT_ACCUMULATION
    short startPackedInputChannel,
    short numPackedInputChannelsForGroup, short totalNumPackedInputChannels,
    short packedOuputChannelOffset, short totalNumPackedOutputChannels,
    read_only image2d_t weights, __constant float *biases,
    short filterSizeX, short filterSizeY,
    write_only image2d_t output,
    short paddingX, short paddingY, short strideX, short strideY,
    short neuron, float a, float b, float min_clamp, float max_clamp,
    __constant float *parameters, __constant float *batchNormBiases,
    short numOutputColumns,
    short doAccumulate, read_only image2d_t accumulator) {
  // init
  const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
  short packedOutputChannel = get_global_id(0);
  short startOutputColumn = mul24((short)get_global_id(1), 4);
  short outputRow = get_global_id(2);
  short endPackedInputChannel = startPackedInputChannel + numPackedInputChannelsForGroup;
  short startX = mad24(mad24(startOutputColumn, strideX, -paddingX),
                       totalNumPackedInputChannels, startPackedInputChannel);
  short strideWithChannels = mul24(strideX, totalNumPackedInputChannels);
  float4 outputValues[4];
  for (short i = 0; i < 4; ++i) {
    outputValues[i] = (float4)(0, 0, 0, 0);
  }
  int2 inputLocation;
  inputLocation.y = mad24(outputRow, strideY, -paddingY);
  int2 weightLocation;
  weightLocation.x = 0;
  weightLocation.y = packedOutputChannel;
  // convolution
  for (short packedInputChannel = startPackedInputChannel;
       packedInputChannel < endPackedInputChannel; ++packedInputChannel) {
    float4 weightValues[4];
    for (short outChIdx = 0; outChIdx < 4; ++outChIdx) {
      weightValues[outChIdx] = read_imagef(weights, smp, weightLocation);
      ++weightLocation.x;
    }
    inputLocation.x = startX + packedInputChannel;
    float4 inputValues[4];
    for (short i = 0; i < 4; ++i) {
      inputValues[i] = read_imagef(input, smp, inputLocation);
      inputLocation.x += strideWithChannels;
    }
    for (short i = 0; i < 4; ++i) {
      float4 curOutputValues = outputValues[i];
      curOutputValues.x += inputValues[i].x * weightValues[0].x;
      curOutputValues.x += inputValues[i].y * weightValues[0].y;
      curOutputValues.x += inputValues[i].z * weightValues[0].z;
      curOutputValues.x += inputValues[i].w * weightValues[0].w;
      curOutputValues.y += inputValues[i].x * weightValues[1].x;
      curOutputValues.y += inputValues[i].y * weightValues[1].y;
      curOutputValues.y += inputValues[i].z * weightValues[1].z;
      curOutputValues.y += inputValues[i].w * weightValues[1].w;
      curOutputValues.z += inputValues[i].x * weightValues[2].x;
      curOutputValues.z += inputValues[i].y * weightValues[2].y;
      curOutputValues.z += inputValues[i].z * weightValues[2].z;
      curOutputValues.z += inputValues[i].w * weightValues[2].w;
      curOutputValues.w += inputValues[i].x * weightValues[3].x;
      curOutputValues.w += inputValues[i].y * weightValues[3].y;
      curOutputValues.w += inputValues[i].z * weightValues[3].z;
      curOutputValues.w += inputValues[i].w * weightValues[3].w;
      outputValues[i] = curOutputValues;
    }
  }
  // bias
  packedOutputChannel += packedOuputChannelOffset;
  short outputChannel = mul24(packedOutputChannel, 4);
  float4 biasValues = vload4(0, biases + outputChannel);
  for (short i = 0; i < 4; ++i) {
    outputValues[i] += biasValues;
  }
  // accumulate
  if (doAccumulate) {
    int2 outputLocation;
    short outputColumn = startOutputColumn;
    outputLocation.y = outputRow;
    for (short i = 0; i < 4; ++i) {
      outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel);
      if (outputColumn < numOutputColumns) {
        outputValues[i] += read_imagef(accumulator, smp, outputLocation);
      }
      ++outputColumn;
    }
  }
  // activation
  switch (neuron) {
  case 1:
    for (short i = 0; i < 4; ++i) {
      outputValues[i] = max(outputValues[i], 0.0f);
    }
    break;
  case 2:
    for (short i = 0; i < 4; ++i) {
      outputValues[i] = a * tanh(b * outputValues[i]);
    }
    break;
  case 3:
    for (short i = 0; i < 4; ++i) {
      outputValues[i] = native_recip(1.0f + native_exp(-a * outputValues[i] + b));
    }
    break;
  case 4:
    for (short i = 0; i < 4; ++i) {
      outputValues[i] = max(outputValues[i], min_clamp);
      outputValues[i] = min(outputValues[i], max_clamp);
    }
    break;
  case 5:
    for (short i = 0; i < 4; ++i) {
      outputValues[i] = max(outputValues[i], 0.0f) + a * (native_exp(min(outputValues[i], 0.0f)) - 1.0f);
    }
    break;
  }
  // output
  int2 outputLocation;
  short outputColumn = startOutputColumn;
  outputLocation.y = outputRow;
  for (short i = 0; i < 4; ++i) {
    outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel);
    if (outputColumn < numOutputColumns) {
      write_imagef(output, outputLocation, outputValues[i]);
    }
    ++outputColumn;
  }
 }
 __kernel void convolution_horizontal_reduced_reads_1x1(
 #include "convolution_.cl"
--- a/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads_5_outputs.cl
+++ b/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads_5_outputs.cl
@ -1,130 +1,4 @@
-__kernel void convolution_horizontal_reduced_reads_5_outputs(
+#define NUM_OUTPUTS 5
    read_only image2d_t input,
    short startPackedInputChannel,
    short numPackedInputChannelsForGroup, short totalNumPackedInputChannels,
    short packedOuputChannelOffset, short totalNumPackedOutputChannels,
    read_only image2d_t weights, __constant float *biases,
    short filterSizeX, short filterSizeY,
    write_only image2d_t output,
    short paddingX, short paddingY, short strideX, short strideY,
    short neuron, float a, float b, float min_clamp, float max_clamp,
    __constant float *parameters, __constant float *batchNormBiases,
    short numOutputColumns) {
  // init
  const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
  short packedOutputChannel = get_global_id(0);
  short startOutputColumn = mul24((short)get_global_id(1), 5);
  short outputRow = get_global_id(2);
  short startX = mad24(mad24(startOutputColumn, strideX, -paddingX),
                       totalNumPackedInputChannels, startPackedInputChannel);
  short strideWithChannels = mul24(strideX, totalNumPackedInputChannels);
  float4 outputValues[5];
  for (short i = 0; i < 5; ++i) {
    outputValues[i] = (float4)(0, 0, 0, 0);
  }
  int2 inputLocation;
  inputLocation.y = mad24(outputRow, strideY, -paddingY);
  int2 weightLocation;
  weightLocation.x = 0;
  weightLocation.y = packedOutputChannel;
  // convolution
  for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) {
    for (short packedInputChannel = 0;
         packedInputChannel < numPackedInputChannelsForGroup;
         ++packedInputChannel) {
      short startXForChannel = startX + packedInputChannel;
      for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) {
        float4 weightValues[4];
        for (short outChIdx = 0; outChIdx < 4; ++outChIdx) {
          weightValues[outChIdx] = read_imagef(weights, smp, weightLocation);
          ++weightLocation.x;
        }
        inputLocation.x =
            mad24(rfColumn, totalNumPackedInputChannels, startXForChannel);
-        float4 inputValues[5];
+__kernel void convolution_horizontal_reduced_reads_5_outputs(
-        for (short i = 0; i < 5; ++i) {
+#include "convolution_.cl"
          inputValues[i] = read_imagef(input, smp, inputLocation);
          inputLocation.x += strideWithChannels;
        }
        for (short i = 0; i < 5; ++i) {
          float4 curOutputValues = outputValues[i];
          curOutputValues.x += inputValues[i].x * weightValues[0].x;
          curOutputValues.x += inputValues[i].y * weightValues[0].y;
          curOutputValues.x += inputValues[i].z * weightValues[0].z;
          curOutputValues.x += inputValues[i].w * weightValues[0].w;
          curOutputValues.y += inputValues[i].x * weightValues[1].x;
          curOutputValues.y += inputValues[i].y * weightValues[1].y;
          curOutputValues.y += inputValues[i].z * weightValues[1].z;
          curOutputValues.y += inputValues[i].w * weightValues[1].w;
          curOutputValues.z += inputValues[i].x * weightValues[2].x;
          curOutputValues.z += inputValues[i].y * weightValues[2].y;
          curOutputValues.z += inputValues[i].z * weightValues[2].z;
          curOutputValues.z += inputValues[i].w * weightValues[2].w;
          curOutputValues.w += inputValues[i].x * weightValues[3].x;
          curOutputValues.w += inputValues[i].y * weightValues[3].y;
          curOutputValues.w += inputValues[i].z * weightValues[3].z;
          curOutputValues.w += inputValues[i].w * weightValues[3].w;
          outputValues[i] = curOutputValues;
        }
      }
    }
    ++inputLocation.y;
  }
  // bias
  packedOutputChannel += packedOuputChannelOffset;
  short outputChannel = mul24(packedOutputChannel, 4);
  float4 biasValues = vload4(0, biases + outputChannel);
  for (short i = 0; i < 5; ++i) {
    outputValues[i] += biasValues;
  }
  // activation
  switch (neuron) {
  case 1:
    for (short i = 0; i < 5; ++i) {
      outputValues[i] = max(outputValues[i], 0.0f);
    }
    break;
  case 2:
    for (short i = 0; i < 5; ++i) {
      outputValues[i] = a * tanh(b * outputValues[i]);
    }
    break;
  case 3:
    for (short i = 0; i < 5; ++i) {
      outputValues[i] = native_recip(1.0f + native_exp(-a * outputValues[i] + b));
    }
    break;
  case 4:
    for (short i = 0; i < 5; ++i) {
      outputValues[i] = max(outputValues[i], min_clamp);
      outputValues[i] = min(outputValues[i], max_clamp);
    }
    break;
  case 5:
    for (short i = 0; i < 5; ++i) {
      outputValues[i] = max(outputValues[i], 0.0f) + a * (native_exp(min(outputValues[i], 0.0f)) - 1.0f);
    }
    break;
  }
  // output
  int2 outputLocation;
  short outputColumn = startOutputColumn;
  outputLocation.y = outputRow;
  for (short i = 0; i < 5; ++i) {
    outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel);
    if (outputColumn < numOutputColumns) {
      write_imagef(output, outputLocation, outputValues[i]);
    }
    ++outputColumn;
  }
 }
--- a/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads_depthwise.cl
+++ b/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads_depthwise.cl
@ -1,101 +1,5 @@
-__kernel void convolution_horizontal_reduced_reads_depthwise(
+#define DEPTHWISE
-    read_only image2d_t input,
+#define SUPPORT_DILATION
    short totalNumPackedChannels,
    read_only image2d_t weights, __constant float *biases,
    short filterSizeX, short filterSizeY,
    write_only image2d_t output,
    short paddingX, short paddingY, short strideX, short strideY,
    short dilationX, short dilationY,
    short neuron, float a, float b, float min_clamp, float max_clamp,
    short numOutputColumns) {
  // init
  const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
  short packedChannel = get_global_id(0);
  short startOutputColumn = mul24((short)get_global_id(1), 4);
  short outputRow = get_global_id(2);
  short startXForChannel = mad24(mad24(startOutputColumn, strideX, -paddingX),
                                 totalNumPackedChannels, packedChannel);
  short strideWithChannels = mul24(strideX, totalNumPackedChannels);
  float4 outputValues[4];
  for (short i = 0; i < 4; ++i) {
    outputValues[i] = (float4)(0, 0, 0, 0);
  }
  int2 inputLocation;
  inputLocation.y = mad24(outputRow, strideY, -paddingY);
  int2 weightLocation;
  weightLocation.x = 0;
  weightLocation.y = packedChannel;
-  // convolution
+__kernel void convolution_horizontal_reduced_reads_depthwise(
-  for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) {
+#include "convolution_.cl"
    for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) {
      short dilatedStepX = mul24(totalNumPackedChannels, dilationX);
      inputLocation.x = mad24(rfColumn, dilatedStepX, startXForChannel);
      float4 inputValues[4];
      for (short i = 0; i < 4; ++i) {
        inputValues[i] = read_imagef(input, smp, inputLocation);
        inputLocation.x += strideWithChannels;
      }
      float4 weightValues = read_imagef(weights, smp, weightLocation);
      ++weightLocation.x;
      outputValues[0] += inputValues[0] * weightValues;
      outputValues[1] += inputValues[1] * weightValues;
      outputValues[2] += inputValues[2] * weightValues;
      outputValues[3] += inputValues[3] * weightValues;
    }
    inputLocation.y += dilationY;
  }
  // bias
  short outputChannel = mul24(packedChannel, 4);
  float4 biasValues = vload4(0, biases + outputChannel);
  for (short i = 0; i < 4; ++i) {
    outputValues[i] += biasValues;
  }
  // activation
  switch (neuron) {
  case 1:
    for (short i = 0; i < 4; ++i) {
      outputValues[i] = max(outputValues[i], 0.0f);
    }
    break;
  case 2:
    for (short i = 0; i < 4; ++i) {
      outputValues[i] = a * tanh(b * outputValues[i]);
    }
    break;
  case 3:
    for (short i = 0; i < 4; ++i) {
      outputValues[i] = native_recip(1.0f + native_exp(-a * outputValues[i] + b));
    }
    break;
  case 4:
    for (short i = 0; i < 4; ++i) {
      outputValues[i] = max(outputValues[i], min_clamp);
      outputValues[i] = min(outputValues[i], max_clamp);
    }
    break;
  case 5:
    for (short i = 0; i < 4; ++i) {
      outputValues[i] = max(outputValues[i], 0.0f) + a * (native_exp(min(outputValues[i], 0.0f)) - 1.0f);
    }
    break;
  }
  // output
  int2 outputLocation;
  short outputColumn = startOutputColumn;
  outputLocation.y = outputRow;
  for (short i = 0; i < 4; ++i) {
    outputLocation.x = mad24(outputColumn, totalNumPackedChannels, packedChannel);
    if (outputColumn < numOutputColumns) {
      write_imagef(output, outputLocation, outputValues[i]);
    }
    ++outputColumn;
  }
 }
--- a/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads_depthwise_stride_1.cl
+++ b/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads_depthwise_stride_1.cl
@ -1,103 +1,4 @@
-__kernel void convolution_horizontal_reduced_reads_depthwise_stride_1(
+#define DEPTHWISE
    read_only image2d_t input,
    short totalNumPackedChannels,
    read_only image2d_t weights, __constant float *biases,
    short filterSizeX, short filterSizeY,
    write_only image2d_t output,
    short paddingX, short paddingY, short strideX, short strideY,
    short neuron, float a, float b, float min_clamp, float max_clamp,
    short numOutputColumns) {
  // init
  const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
  short packedChannel = get_global_id(0);
  short startOutputColumn = mul24((short)get_global_id(1), 4);
  short outputRow = get_global_id(2);
  short startXForChannel = mad24(mad24(startOutputColumn, strideX, -paddingX),
                                 totalNumPackedChannels, packedChannel);
  float4 outputValues[4];
  for (short i = 0; i < 4; ++i) {
    outputValues[i] = (float4)(0, 0, 0, 0);
  }
  int2 inputLocation;
  inputLocation.y = mad24(outputRow, strideY, -paddingY);
  int2 weightLocation;
  weightLocation.x = 0;
  weightLocation.y = packedChannel;
-  // convolution
+__kernel void convolution_horizontal_reduced_reads_depthwise_stride_1(
-  for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) {
+#include "convolution_.cl"
    float4 inputValues[4];
    inputLocation.x = startXForChannel;
    for (short i = 1; i < 4; ++i) {
      inputValues[i] = read_imagef(input, smp, inputLocation);
      inputLocation.x += totalNumPackedChannels;
    }
    for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) {
      inputValues[0] = inputValues[1];
      inputValues[1] = inputValues[2];
      inputValues[2] = inputValues[3];
      inputValues[3] = read_imagef(input, smp, inputLocation);
      inputLocation.x += totalNumPackedChannels;
      float4 weightValues = read_imagef(weights, smp, weightLocation);
      ++weightLocation.x;
      outputValues[0] += inputValues[0] * weightValues;
      outputValues[1] += inputValues[1] * weightValues;
      outputValues[2] += inputValues[2] * weightValues;
      outputValues[3] += inputValues[3] * weightValues;
    }
    ++inputLocation.y;
  }
  // bias
  short outputChannel = mul24(packedChannel, 4);
  float4 biasValues = vload4(0, biases + outputChannel);
  for (short i = 0; i < 4; ++i) {
    outputValues[i] += biasValues;
  }
  // activation
  switch (neuron) {
  case 1:
    for (short i = 0; i < 4; ++i) {
      outputValues[i] = max(outputValues[i], 0.0f);
    }
    break;
  case 2:
    for (short i = 0; i < 4; ++i) {
      outputValues[i] = a * tanh(b * outputValues[i]);
    }
    break;
  case 3:
    for (short i = 0; i < 4; ++i) {
      outputValues[i] = native_recip(1.0f + native_exp(-a * outputValues[i] + b));
    }
    break;
  case 4:
    for (short i = 0; i < 4; ++i) {
      outputValues[i] = max(outputValues[i], min_clamp);
      outputValues[i] = min(outputValues[i], max_clamp);
    }
    break;
  case 5:
    for (short i = 0; i < 4; ++i) {
      outputValues[i] = max(outputValues[i], 0.0f) + a * (native_exp(min(outputValues[i], 0.0f)) - 1.0f);
    }
    break;
  }
  // output
  int2 outputLocation;
  short outputColumn = startOutputColumn;
  outputLocation.y = outputRow;
  for (short i = 0; i < 4; ++i) {
    outputLocation.x = mad24(outputColumn, totalNumPackedChannels, packedChannel);
    if (outputColumn < numOutputColumns) {
      write_imagef(output, outputLocation, outputValues[i]);
    }
    ++outputColumn;
  }
 }
--- a/selfdrive/modeld/thneed/optimizer.cc
+++ b/selfdrive/modeld/thneed/optimizer.cc
@ -80,7 +80,9 @@ int Thneed::optimize() {
        printf("building kernel %s\n", k->name.c_str());
        k->program = clCreateProgramWithSource(context, 1, srcs, &length, NULL);
-        int err = clBuildProgram(k->program, 1, &device_id, "", NULL, NULL);
+        char options[0x100];
        snprintf(options, sizeof(options)-1, "-I %s", kernel_path);
        int err = clBuildProgram(k->program, 1, &device_id, options, NULL, NULL);
        if (err != 0) {
          printf("got err %d\n", err);