add thneed optimizer (#23772)

* add thneed optimizer
* local work group opt
* kernels and final mods
* release files
* build system touchups
* fix kernel path, rand inputs for self test
* broken since extra is gone
* update model replay ref

Co-authored-by: Comma Device <device@comma.ai>

parent 7176f5c401
commit 90beaebefb

12 changed files with 903 additions and 7 deletions
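The replacement kernels added below are looked up by kernel name under the KERNEL_PATH environment variable, falling back to the on-device default used in Thneed::optimize() further down. A minimal sketch of that lookup, assuming a standalone helper (resolve_kernel_path is not part of this commit):

// Sketch only: mirrors the path resolution in Thneed::optimize(); the helper name is hypothetical.
#include <cstdlib>
#include <string>

static std::string resolve_kernel_path(const std::string &kernel_name) {
  const char *kernel_path = getenv("KERNEL_PATH");
  if (!kernel_path) kernel_path = "/data/openpilot/selfdrive/modeld/thneed/kernels";  // default from the diff
  // each enqueued kernel is replaced by "<KERNEL_PATH>/<kernel name>.cl" if that file exists
  return std::string(kernel_path) + "/" + kernel_name + ".cl";
}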
@@ -0,0 +1,129 @@
__kernel void convolution_horizontal_reduced_reads(
    read_only image2d_t input,
    short startPackedInputChannel,
    short numPackedInputChannelsForGroup, short totalNumPackedInputChannels,
    short packedOuputChannelOffset, short totalNumPackedOutputChannels,
    read_only image2d_t weights, __constant float *biases,
    short filterSizeX, short filterSizeY,
    write_only image2d_t output,
    short paddingX, short paddingY, short strideX, short strideY,
    short dilationX, short dilationY,
    short neuron, float a, float b, float min_clamp, float max_clamp,
    __constant float *parameters, __constant float *batchNormBiases,
    short numOutputColumns) {

  // init
  const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
  short packedOutputChannel = get_global_id(0);
  short startOutputColumn = mul24((short)get_global_id(1), 4);
  short outputRow = get_global_id(2);
  short startX = mad24(mad24(startOutputColumn, strideX, -paddingX),
                       totalNumPackedInputChannels, startPackedInputChannel);
  short strideWithChannels = mul24(strideX, totalNumPackedInputChannels);

  float4 outputValues[4];
  for (short i = 0; i < 4; ++i) {
    outputValues[i] = (float4)(0, 0, 0, 0);
  }

  int2 inputLocation;
  inputLocation.y = mad24(outputRow, strideY, -paddingY);

  int2 weightLocation;
  weightLocation.x = 0;
  weightLocation.y = packedOutputChannel;

  // convolution
  for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) {
    for (short packedInputChannel = 0;
         packedInputChannel < numPackedInputChannelsForGroup;
         ++packedInputChannel) {
      short startXForChannel = startX + packedInputChannel;
      for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) {
        float4 weightValues[4];
        for (short outChIdx = 0; outChIdx < 4; ++outChIdx) {
          weightValues[outChIdx] = read_imagef(weights, smp, weightLocation);
          ++weightLocation.x;
        }
        short dilatedStepX = mul24(totalNumPackedInputChannels, dilationX);
        inputLocation.x = mad24(rfColumn, dilatedStepX, startXForChannel);
        float4 inputValues[4];
        for (short i = 0; i < 4; ++i) {
          inputValues[i] = read_imagef(input, smp, inputLocation);
          inputLocation.x += strideWithChannels;
        }
        for (short i = 0; i < 4; ++i) {
          float4 curOutputValues = outputValues[i];
          curOutputValues.x += inputValues[i].x * weightValues[0].x;
          curOutputValues.x += inputValues[i].y * weightValues[0].y;
          curOutputValues.x += inputValues[i].z * weightValues[0].z;
          curOutputValues.x += inputValues[i].w * weightValues[0].w;
          curOutputValues.y += inputValues[i].x * weightValues[1].x;
          curOutputValues.y += inputValues[i].y * weightValues[1].y;
          curOutputValues.y += inputValues[i].z * weightValues[1].z;
          curOutputValues.y += inputValues[i].w * weightValues[1].w;
          curOutputValues.z += inputValues[i].x * weightValues[2].x;
          curOutputValues.z += inputValues[i].y * weightValues[2].y;
          curOutputValues.z += inputValues[i].z * weightValues[2].z;
          curOutputValues.z += inputValues[i].w * weightValues[2].w;
          curOutputValues.w += inputValues[i].x * weightValues[3].x;
          curOutputValues.w += inputValues[i].y * weightValues[3].y;
          curOutputValues.w += inputValues[i].z * weightValues[3].z;
          curOutputValues.w += inputValues[i].w * weightValues[3].w;
          outputValues[i] = curOutputValues;
        }
      }
    }
    inputLocation.y += dilationY;
  }

  // bias
  packedOutputChannel += packedOuputChannelOffset;
  short outputChannel = mul24(packedOutputChannel, 4);
  float4 biasValues = vload4(0, biases + outputChannel);
  for (short i = 0; i < 4; ++i) {
    outputValues[i] += biasValues;
  }

  // activation
  switch (neuron) {
  case 1:
    for (short i = 0; i < 4; ++i) {
      outputValues[i] = max(outputValues[i], 0.0f);
    }
    break;
  case 2:
    for (short i = 0; i < 4; ++i) {
      outputValues[i] = a * tanh(b * outputValues[i]);
    }
    break;
  case 3:
    for (short i = 0; i < 4; ++i) {
      outputValues[i] = native_recip(1.0f + native_exp(-a * outputValues[i] + b));
    }
    break;
  case 4:
    for (short i = 0; i < 4; ++i) {
      outputValues[i] = max(outputValues[i], min_clamp);
      outputValues[i] = min(outputValues[i], max_clamp);
    }
    break;
  case 5:
    for (short i = 0; i < 4; ++i) {
      outputValues[i] = max(outputValues[i], 0.0f) + a * (native_exp(min(outputValues[i], 0.0f)) - 1.0f);
    }
    break;
  }

  // output
  int2 outputLocation;
  short outputColumn = startOutputColumn;
  outputLocation.y = outputRow;
  for (short i = 0; i < 4; ++i) {
    outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel);
    if (outputColumn < numOutputColumns) {
      write_imagef(output, outputLocation, outputValues[i]);
    }
    ++outputColumn;
  }
}
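All of the kernels in this commit share the same `neuron` convention for the fused activation. Reading the switch in the kernel above, the codes map as follows; the enum below is only a reader's aid with invented names, not part of the diff:

// Hypothetical reference enum; the kernels just receive `neuron` as a short.
enum NeuronType {
  NEURON_NONE    = 0,  // no matching case: output passes through unchanged
  NEURON_RELU    = 1,  // max(x, 0)
  NEURON_TANH    = 2,  // a * tanh(b * x)
  NEURON_SIGMOID = 3,  // 1 / (1 + exp(-a * x + b))
  NEURON_CLAMP   = 4,  // clamp to [min_clamp, max_clamp]
  NEURON_ELU     = 5,  // max(x, 0) + a * (exp(min(x, 0)) - 1)
};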
@@ -0,0 +1,140 @@
__kernel void convolution_horizontal_reduced_reads_1x1(
    read_only image2d_t input,
    short startPackedInputChannel,
    short numPackedInputChannelsForGroup, short totalNumPackedInputChannels,
    short packedOuputChannelOffset, short totalNumPackedOutputChannels,
    read_only image2d_t weights, __constant float *biases,
    short filterSizeX, short filterSizeY,
    write_only image2d_t output,
    short paddingX, short paddingY, short strideX, short strideY,
    short neuron, float a, float b, float min_clamp, float max_clamp,
    __constant float *parameters, __constant float *batchNormBiases,
    short numOutputColumns,
    short doAccumulate, read_only image2d_t accumulator) {

  // init
  const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
  short packedOutputChannel = get_global_id(0);
  short startOutputColumn = mul24((short)get_global_id(1), 4);
  short outputRow = get_global_id(2);
  short endPackedInputChannel = startPackedInputChannel + numPackedInputChannelsForGroup;
  short startX = mad24(mad24(startOutputColumn, strideX, -paddingX),
                       totalNumPackedInputChannels, startPackedInputChannel);
  short strideWithChannels = mul24(strideX, totalNumPackedInputChannels);

  float4 outputValues[4];
  for (short i = 0; i < 4; ++i) {
    outputValues[i] = (float4)(0, 0, 0, 0);
  }

  int2 inputLocation;
  inputLocation.y = mad24(outputRow, strideY, -paddingY);

  int2 weightLocation;
  weightLocation.x = 0;
  weightLocation.y = packedOutputChannel;

  // convolution
  for (short packedInputChannel = startPackedInputChannel;
       packedInputChannel < endPackedInputChannel; ++packedInputChannel) {

    float4 weightValues[4];
    for (short outChIdx = 0; outChIdx < 4; ++outChIdx) {
      weightValues[outChIdx] = read_imagef(weights, smp, weightLocation);
      ++weightLocation.x;
    }

    inputLocation.x = startX + packedInputChannel;
    float4 inputValues[4];
    for (short i = 0; i < 4; ++i) {
      inputValues[i] = read_imagef(input, smp, inputLocation);
      inputLocation.x += strideWithChannels;
    }

    for (short i = 0; i < 4; ++i) {
      float4 curOutputValues = outputValues[i];
      curOutputValues.x += inputValues[i].x * weightValues[0].x;
      curOutputValues.x += inputValues[i].y * weightValues[0].y;
      curOutputValues.x += inputValues[i].z * weightValues[0].z;
      curOutputValues.x += inputValues[i].w * weightValues[0].w;
      curOutputValues.y += inputValues[i].x * weightValues[1].x;
      curOutputValues.y += inputValues[i].y * weightValues[1].y;
      curOutputValues.y += inputValues[i].z * weightValues[1].z;
      curOutputValues.y += inputValues[i].w * weightValues[1].w;
      curOutputValues.z += inputValues[i].x * weightValues[2].x;
      curOutputValues.z += inputValues[i].y * weightValues[2].y;
      curOutputValues.z += inputValues[i].z * weightValues[2].z;
      curOutputValues.z += inputValues[i].w * weightValues[2].w;
      curOutputValues.w += inputValues[i].x * weightValues[3].x;
      curOutputValues.w += inputValues[i].y * weightValues[3].y;
      curOutputValues.w += inputValues[i].z * weightValues[3].z;
      curOutputValues.w += inputValues[i].w * weightValues[3].w;
      outputValues[i] = curOutputValues;
    }
  }

  // bias
  packedOutputChannel += packedOuputChannelOffset;
  short outputChannel = mul24(packedOutputChannel, 4);
  float4 biasValues = vload4(0, biases + outputChannel);
  for (short i = 0; i < 4; ++i) {
    outputValues[i] += biasValues;
  }

  // accumulate
  if (doAccumulate) {
    int2 outputLocation;
    short outputColumn = startOutputColumn;
    outputLocation.y = outputRow;
    for (short i = 0; i < 4; ++i) {
      outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel);
      if (outputColumn < numOutputColumns) {
        outputValues[i] += read_imagef(accumulator, smp, outputLocation);
      }
      ++outputColumn;
    }
  }

  // activation
  switch (neuron) {
  case 1:
    for (short i = 0; i < 4; ++i) {
      outputValues[i] = max(outputValues[i], 0.0f);
    }
    break;
  case 2:
    for (short i = 0; i < 4; ++i) {
      outputValues[i] = a * tanh(b * outputValues[i]);
    }
    break;
  case 3:
    for (short i = 0; i < 4; ++i) {
      outputValues[i] = native_recip(1.0f + native_exp(-a * outputValues[i] + b));
    }
    break;
  case 4:
    for (short i = 0; i < 4; ++i) {
      outputValues[i] = max(outputValues[i], min_clamp);
      outputValues[i] = min(outputValues[i], max_clamp);
    }
    break;
  case 5:
    for (short i = 0; i < 4; ++i) {
      outputValues[i] = max(outputValues[i], 0.0f) + a * (native_exp(min(outputValues[i], 0.0f)) - 1.0f);
    }
    break;
  }

  // output
  int2 outputLocation;
  short outputColumn = startOutputColumn;
  outputLocation.y = outputRow;
  for (short i = 0; i < 4; ++i) {
    outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel);
    if (outputColumn < numOutputColumns) {
      write_imagef(output, outputLocation, outputValues[i]);
    }
    ++outputColumn;
  }
}
@@ -0,0 +1,130 @@
__kernel void convolution_horizontal_reduced_reads_5_outputs(
    read_only image2d_t input,
    short startPackedInputChannel,
    short numPackedInputChannelsForGroup, short totalNumPackedInputChannels,
    short packedOuputChannelOffset, short totalNumPackedOutputChannels,
    read_only image2d_t weights, __constant float *biases,
    short filterSizeX, short filterSizeY,
    write_only image2d_t output,
    short paddingX, short paddingY, short strideX, short strideY,
    short neuron, float a, float b, float min_clamp, float max_clamp,
    __constant float *parameters, __constant float *batchNormBiases,
    short numOutputColumns) {

  // init
  const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
  short packedOutputChannel = get_global_id(0);
  short startOutputColumn = mul24((short)get_global_id(1), 5);
  short outputRow = get_global_id(2);
  short startX = mad24(mad24(startOutputColumn, strideX, -paddingX),
                       totalNumPackedInputChannels, startPackedInputChannel);
  short strideWithChannels = mul24(strideX, totalNumPackedInputChannels);

  float4 outputValues[5];
  for (short i = 0; i < 5; ++i) {
    outputValues[i] = (float4)(0, 0, 0, 0);
  }

  int2 inputLocation;
  inputLocation.y = mad24(outputRow, strideY, -paddingY);

  int2 weightLocation;
  weightLocation.x = 0;
  weightLocation.y = packedOutputChannel;

  // convolution
  for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) {
    for (short packedInputChannel = 0;
         packedInputChannel < numPackedInputChannelsForGroup;
         ++packedInputChannel) {
      short startXForChannel = startX + packedInputChannel;
      for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) {
        float4 weightValues[4];
        for (short outChIdx = 0; outChIdx < 4; ++outChIdx) {
          weightValues[outChIdx] = read_imagef(weights, smp, weightLocation);
          ++weightLocation.x;
        }
        inputLocation.x =
            mad24(rfColumn, totalNumPackedInputChannels, startXForChannel);

        float4 inputValues[5];
        for (short i = 0; i < 5; ++i) {
          inputValues[i] = read_imagef(input, smp, inputLocation);
          inputLocation.x += strideWithChannels;
        }

        for (short i = 0; i < 5; ++i) {
          float4 curOutputValues = outputValues[i];
          curOutputValues.x += inputValues[i].x * weightValues[0].x;
          curOutputValues.x += inputValues[i].y * weightValues[0].y;
          curOutputValues.x += inputValues[i].z * weightValues[0].z;
          curOutputValues.x += inputValues[i].w * weightValues[0].w;
          curOutputValues.y += inputValues[i].x * weightValues[1].x;
          curOutputValues.y += inputValues[i].y * weightValues[1].y;
          curOutputValues.y += inputValues[i].z * weightValues[1].z;
          curOutputValues.y += inputValues[i].w * weightValues[1].w;
          curOutputValues.z += inputValues[i].x * weightValues[2].x;
          curOutputValues.z += inputValues[i].y * weightValues[2].y;
          curOutputValues.z += inputValues[i].z * weightValues[2].z;
          curOutputValues.z += inputValues[i].w * weightValues[2].w;
          curOutputValues.w += inputValues[i].x * weightValues[3].x;
          curOutputValues.w += inputValues[i].y * weightValues[3].y;
          curOutputValues.w += inputValues[i].z * weightValues[3].z;
          curOutputValues.w += inputValues[i].w * weightValues[3].w;
          outputValues[i] = curOutputValues;
        }
      }
    }
    ++inputLocation.y;
  }

  // bias
  packedOutputChannel += packedOuputChannelOffset;
  short outputChannel = mul24(packedOutputChannel, 4);
  float4 biasValues = vload4(0, biases + outputChannel);
  for (short i = 0; i < 5; ++i) {
    outputValues[i] += biasValues;
  }

  // activation
  switch (neuron) {
  case 1:
    for (short i = 0; i < 5; ++i) {
      outputValues[i] = max(outputValues[i], 0.0f);
    }
    break;
  case 2:
    for (short i = 0; i < 5; ++i) {
      outputValues[i] = a * tanh(b * outputValues[i]);
    }
    break;
  case 3:
    for (short i = 0; i < 5; ++i) {
      outputValues[i] = native_recip(1.0f + native_exp(-a * outputValues[i] + b));
    }
    break;
  case 4:
    for (short i = 0; i < 5; ++i) {
      outputValues[i] = max(outputValues[i], min_clamp);
      outputValues[i] = min(outputValues[i], max_clamp);
    }
    break;
  case 5:
    for (short i = 0; i < 5; ++i) {
      outputValues[i] = max(outputValues[i], 0.0f) + a * (native_exp(min(outputValues[i], 0.0f)) - 1.0f);
    }
    break;
  }

  // output
  int2 outputLocation;
  short outputColumn = startOutputColumn;
  outputLocation.y = outputRow;
  for (short i = 0; i < 5; ++i) {
    outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel);
    if (outputColumn < numOutputColumns) {
      write_imagef(output, outputLocation, outputValues[i]);
    }
    ++outputColumn;
  }
}
@@ -0,0 +1,101 @@
__kernel void convolution_horizontal_reduced_reads_depthwise(
    read_only image2d_t input,
    short totalNumPackedChannels,
    read_only image2d_t weights, __constant float *biases,
    short filterSizeX, short filterSizeY,
    write_only image2d_t output,
    short paddingX, short paddingY, short strideX, short strideY,
    short dilationX, short dilationY,
    short neuron, float a, float b, float min_clamp, float max_clamp,
    short numOutputColumns) {

  // init
  const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
  short packedChannel = get_global_id(0);
  short startOutputColumn = mul24((short)get_global_id(1), 4);
  short outputRow = get_global_id(2);
  short startXForChannel = mad24(mad24(startOutputColumn, strideX, -paddingX),
                                 totalNumPackedChannels, packedChannel);
  short strideWithChannels = mul24(strideX, totalNumPackedChannels);

  float4 outputValues[4];
  for (short i = 0; i < 4; ++i) {
    outputValues[i] = (float4)(0, 0, 0, 0);
  }

  int2 inputLocation;
  inputLocation.y = mad24(outputRow, strideY, -paddingY);

  int2 weightLocation;
  weightLocation.x = 0;
  weightLocation.y = packedChannel;

  // convolution
  for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) {
    for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) {
      short dilatedStepX = mul24(totalNumPackedChannels, dilationX);
      inputLocation.x = mad24(rfColumn, dilatedStepX, startXForChannel);
      float4 inputValues[4];
      for (short i = 0; i < 4; ++i) {
        inputValues[i] = read_imagef(input, smp, inputLocation);
        inputLocation.x += strideWithChannels;
      }
      float4 weightValues = read_imagef(weights, smp, weightLocation);
      ++weightLocation.x;
      outputValues[0] += inputValues[0] * weightValues;
      outputValues[1] += inputValues[1] * weightValues;
      outputValues[2] += inputValues[2] * weightValues;
      outputValues[3] += inputValues[3] * weightValues;
    }
    inputLocation.y += dilationY;
  }

  // bias
  short outputChannel = mul24(packedChannel, 4);
  float4 biasValues = vload4(0, biases + outputChannel);
  for (short i = 0; i < 4; ++i) {
    outputValues[i] += biasValues;
  }

  // activation
  switch (neuron) {
  case 1:
    for (short i = 0; i < 4; ++i) {
      outputValues[i] = max(outputValues[i], 0.0f);
    }
    break;
  case 2:
    for (short i = 0; i < 4; ++i) {
      outputValues[i] = a * tanh(b * outputValues[i]);
    }
    break;
  case 3:
    for (short i = 0; i < 4; ++i) {
      outputValues[i] = native_recip(1.0f + native_exp(-a * outputValues[i] + b));
    }
    break;
  case 4:
    for (short i = 0; i < 4; ++i) {
      outputValues[i] = max(outputValues[i], min_clamp);
      outputValues[i] = min(outputValues[i], max_clamp);
    }
    break;
  case 5:
    for (short i = 0; i < 4; ++i) {
      outputValues[i] = max(outputValues[i], 0.0f) + a * (native_exp(min(outputValues[i], 0.0f)) - 1.0f);
    }
    break;
  }

  // output
  int2 outputLocation;
  short outputColumn = startOutputColumn;
  outputLocation.y = outputRow;
  for (short i = 0; i < 4; ++i) {
    outputLocation.x = mad24(outputColumn, totalNumPackedChannels, packedChannel);
    if (outputColumn < numOutputColumns) {
      write_imagef(output, outputLocation, outputValues[i]);
    }
    ++outputColumn;
  }
}
@@ -0,0 +1,103 @@
__kernel void convolution_horizontal_reduced_reads_depthwise_stride_1(
    read_only image2d_t input,
    short totalNumPackedChannels,
    read_only image2d_t weights, __constant float *biases,
    short filterSizeX, short filterSizeY,
    write_only image2d_t output,
    short paddingX, short paddingY, short strideX, short strideY,
    short neuron, float a, float b, float min_clamp, float max_clamp,
    short numOutputColumns) {

  // init
  const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
  short packedChannel = get_global_id(0);
  short startOutputColumn = mul24((short)get_global_id(1), 4);
  short outputRow = get_global_id(2);
  short startXForChannel = mad24(mad24(startOutputColumn, strideX, -paddingX),
                                 totalNumPackedChannels, packedChannel);

  float4 outputValues[4];
  for (short i = 0; i < 4; ++i) {
    outputValues[i] = (float4)(0, 0, 0, 0);
  }

  int2 inputLocation;
  inputLocation.y = mad24(outputRow, strideY, -paddingY);

  int2 weightLocation;
  weightLocation.x = 0;
  weightLocation.y = packedChannel;

  // convolution
  for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) {
    float4 inputValues[4];
    inputLocation.x = startXForChannel;
    for (short i = 1; i < 4; ++i) {
      inputValues[i] = read_imagef(input, smp, inputLocation);
      inputLocation.x += totalNumPackedChannels;
    }
    for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) {
      inputValues[0] = inputValues[1];
      inputValues[1] = inputValues[2];
      inputValues[2] = inputValues[3];
      inputValues[3] = read_imagef(input, smp, inputLocation);
      inputLocation.x += totalNumPackedChannels;
      float4 weightValues = read_imagef(weights, smp, weightLocation);
      ++weightLocation.x;
      outputValues[0] += inputValues[0] * weightValues;
      outputValues[1] += inputValues[1] * weightValues;
      outputValues[2] += inputValues[2] * weightValues;
      outputValues[3] += inputValues[3] * weightValues;
    }
    ++inputLocation.y;
  }

  // bias
  short outputChannel = mul24(packedChannel, 4);
  float4 biasValues = vload4(0, biases + outputChannel);
  for (short i = 0; i < 4; ++i) {
    outputValues[i] += biasValues;
  }

  // activation
  switch (neuron) {
  case 1:
    for (short i = 0; i < 4; ++i) {
      outputValues[i] = max(outputValues[i], 0.0f);
    }
    break;
  case 2:
    for (short i = 0; i < 4; ++i) {
      outputValues[i] = a * tanh(b * outputValues[i]);
    }
    break;
  case 3:
    for (short i = 0; i < 4; ++i) {
      outputValues[i] = native_recip(1.0f + native_exp(-a * outputValues[i] + b));
    }
    break;
  case 4:
    for (short i = 0; i < 4; ++i) {
      outputValues[i] = max(outputValues[i], min_clamp);
      outputValues[i] = min(outputValues[i], max_clamp);
    }
    break;
  case 5:
    for (short i = 0; i < 4; ++i) {
      outputValues[i] = max(outputValues[i], 0.0f) + a * (native_exp(min(outputValues[i], 0.0f)) - 1.0f);
    }
    break;
  }

  // output
  int2 outputLocation;
  short outputColumn = startOutputColumn;
  outputLocation.y = outputRow;
  for (short i = 0; i < 4; ++i) {
    outputLocation.x = mad24(outputColumn, totalNumPackedChannels, packedChannel);
    if (outputColumn < numOutputColumns) {
      write_imagef(output, outputLocation, outputValues[i]);
    }
    ++outputColumn;
  }
}
@@ -0,0 +1,259 @@
#include <map>
#include <string>
#include <string.h>
#include <assert.h>
#include "thneed.h"

extern map<cl_program, string> g_program_source;

static int is_same_size_image(cl_mem a, cl_mem b) {
  size_t a_width, a_height, a_depth, a_array_size, a_row_pitch, a_slice_pitch;
  clGetImageInfo(a, CL_IMAGE_WIDTH, sizeof(a_width), &a_width, NULL);
  clGetImageInfo(a, CL_IMAGE_HEIGHT, sizeof(a_height), &a_height, NULL);
  clGetImageInfo(a, CL_IMAGE_DEPTH, sizeof(a_depth), &a_depth, NULL);
  clGetImageInfo(a, CL_IMAGE_ARRAY_SIZE, sizeof(a_array_size), &a_array_size, NULL);
  clGetImageInfo(a, CL_IMAGE_ROW_PITCH, sizeof(a_row_pitch), &a_row_pitch, NULL);
  clGetImageInfo(a, CL_IMAGE_SLICE_PITCH, sizeof(a_slice_pitch), &a_slice_pitch, NULL);

  size_t b_width, b_height, b_depth, b_array_size, b_row_pitch, b_slice_pitch;
  clGetImageInfo(b, CL_IMAGE_WIDTH, sizeof(b_width), &b_width, NULL);
  clGetImageInfo(b, CL_IMAGE_HEIGHT, sizeof(b_height), &b_height, NULL);
  clGetImageInfo(b, CL_IMAGE_DEPTH, sizeof(b_depth), &b_depth, NULL);
  clGetImageInfo(b, CL_IMAGE_ARRAY_SIZE, sizeof(b_array_size), &b_array_size, NULL);
  clGetImageInfo(b, CL_IMAGE_ROW_PITCH, sizeof(b_row_pitch), &b_row_pitch, NULL);
  clGetImageInfo(b, CL_IMAGE_SLICE_PITCH, sizeof(b_slice_pitch), &b_slice_pitch, NULL);

  return (a_width == b_width) && (a_height == b_height) &&
    (a_depth == b_depth) && (a_array_size == b_array_size) &&
    (a_row_pitch == b_row_pitch) && (a_slice_pitch == b_slice_pitch);
}

static cl_mem make_image_like(cl_context context, cl_mem val) {
  cl_image_format format;
  size_t width, height, row_pitch;
  clGetImageInfo(val, CL_IMAGE_FORMAT, sizeof(format), &format, NULL);
  assert(format.image_channel_order == CL_RGBA);
  assert(format.image_channel_data_type == CL_HALF_FLOAT);
  clGetImageInfo(val, CL_IMAGE_WIDTH, sizeof(width), &width, NULL);
  clGetImageInfo(val, CL_IMAGE_HEIGHT, sizeof(height), &height, NULL);
  clGetImageInfo(val, CL_IMAGE_ROW_PITCH, sizeof(row_pitch), &row_pitch, NULL);

  cl_image_desc desc = {0};
  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
  desc.image_width = width;
  desc.image_height = height;
  desc.image_row_pitch = row_pitch;

  cl_mem buf = clCreateBuffer(context, CL_MEM_READ_WRITE, row_pitch*height, NULL, NULL);
  assert(buf != NULL);
  desc.buffer = buf;

  cl_int err;
  cl_mem tmp = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
  //printf("got %d for image %zux%zu %zu\n", err, width, height, row_pitch);
  assert(tmp != NULL);

  return tmp;
}

// convolution_horizontal_reduced_reads_1x1 is 66% of the model runtime
// make that faster and the model gets faster

// this cuts ~2 ms off the model runtime right now
int Thneed::optimize() {
  const char *kernel_path = getenv("KERNEL_PATH");
  if (!kernel_path) { kernel_path = "/data/openpilot/selfdrive/modeld/thneed/kernels"; printf("no KERNEL_PATH set, defaulting to %s\n", kernel_path); }

  // load custom kernels
  map<string, cl_program> g_programs;
  for (auto &k : kq) {
    // replace program?
    if (g_programs.find(k->name) == g_programs.end()) {
      char fn[0x100];
      snprintf(fn, sizeof(fn), "%s/%s.cl", kernel_path, k->name.c_str());
      FILE *g = fopen(fn, "rb");
      if (g != NULL) {
        char *src[0x10000];
        const char *srcs[1]; srcs[0] = (const char *)src;
        memset(src, 0, sizeof(src));
        size_t length = fread(src, 1, sizeof(src), g);
        fclose(g);

        printf("building kernel %s\n", k->name.c_str());
        k->program = clCreateProgramWithSource(context, 1, srcs, &length, NULL);
        int err = clBuildProgram(k->program, 1, &device_id, "", NULL, NULL);

        if (err != 0) {
          printf("got err %d\n", err);
          size_t err_length;
          char buffer[2048];
          clGetProgramBuildInfo(k->program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &err_length);
          buffer[err_length] = '\0';
          printf("%s\n", buffer);
        }
        assert(err == 0);

        // save in cache
        g_programs[k->name] = k->program;
        g_program_source[k->program] = string((char *)src, length);
      } else {
        g_programs[k->name] = NULL;
      }
    } else {
      // cached replacement
      if (g_programs[k->name] != NULL) {
        k->program = g_programs[k->name];
      }
    }

    // hack in accumulator to convolution_horizontal_reduced_reads_1x1
    if (k->name == "convolution_horizontal_reduced_reads_1x1") {
      k->arg_names.push_back("doAccumulate");
      short doAccumulate = 0;
      k->args.push_back(string((char *)&doAccumulate, sizeof(doAccumulate)));
      k->args_size.push_back(2);
      k->arg_names.push_back("accumulator");
      k->args.push_back(k->args[k->get_arg_num("output")]);
      k->args_size.push_back(8);
      k->num_args += 2;
    }

    // assert that parameters + batchNormBiases are not used
    // since they aren't supported in custom replacement kernels
    if (k->name == "convolution_horizontal_reduced_reads_1x1" ||
        k->name == "convolution_horizontal_reduced_reads" ||
        k->name == "convolution_horizontal_reduced_reads_5_outputs") {
      string p1 = k->args[k->get_arg_num("parameters")];
      string p2 = k->args[k->get_arg_num("batchNormBiases")];
      assert(p1.length() == 8 && *((uint64_t*)p1.data()) == 0);
      assert(p2.length() == 8 && *((uint64_t*)p2.data()) == 0);
    }
  }

  // optimizer
  size_t start_size;
  do {
    start_size = kq.size();

    // get optimizations
    map<string, string> replacements;
    for (int i = 0; i < kq.size(); i++) {
      // fusing elementwise_sum + activate_image will save 3 enqueues

      // delete useless copy layers
      // saves ~0.7 ms
      if (kq[i]->name == "concatenation" || kq[i]->name == "flatten") {
        string in = kq[i]->args[kq[i]->get_arg_num("input")];
        string out = kq[i]->args[kq[i]->get_arg_num("output")];
        if (is_same_size_image(*(cl_mem*)in.data(), *(cl_mem*)out.data())) {
          cl_mem tmp = make_image_like(context, *(cl_mem *)in.data());
          replacements[in] = string((char *)&tmp, sizeof(tmp));
          replacements[out] = string((char *)&tmp, sizeof(tmp));

          kq.erase(kq.begin()+i); --i;
        }
      }

      // NOTE: if activations/accumulation are done in the wrong order, this will be wrong

      // fuse activations into convs and fc_Wtx
      // saves ~1.5 ms
      // NOTE: this changes the outputs because of rounding, should be better now!
      if (i != 0 && kq[i]->name == "activate_image") {
        if (kq[i-1]->name == "convolution_horizontal_reduced_reads_1x1" ||
            kq[i-1]->name == "convolution_horizontal_reduced_reads_5_outputs" ||
            kq[i-1]->name == "convolution_horizontal_reduced_reads" ||
            kq[i-1]->name == "convolution_horizontal_reduced_reads_depthwise" ||
            kq[i-1]->name == "convolution_horizontal_reduced_reads_depthwise_stride_1" ||
            kq[i-1]->name == "fc_Wtx") {
          string lastout = kq[i-1]->args[kq[i-1]->get_arg_num("output")];
          string in = kq[i]->args[kq[i]->get_arg_num("input")];
          string out = kq[i]->args[kq[i]->get_arg_num("output")];

          if (lastout == in) {
            short neuron = *(int*)kq[i]->args[kq[i]->get_arg_num("neuron")].data();
            kq[i-1]->args[kq[i-1]->get_arg_num("neuron")] = string((char *)&neuron, sizeof(neuron));

            cl_mem tmp = make_image_like(context, *(cl_mem *)lastout.data());
            replacements[in] = string((char *)&tmp, sizeof(tmp));
            replacements[out] = string((char *)&tmp, sizeof(tmp));

            kq.erase(kq.begin()+i); --i;
          }
        }
      }

      // fuse accumulation into convs and fc_Wtx
      if (i != 0 && kq[i]->name == "elementwise_sum") {
        if (kq[i-1]->name == "convolution_horizontal_reduced_reads_1x1" ||
            kq[i-1]->name == "fc_Wtx") {
          string lastout = kq[i-1]->args[kq[i-1]->get_arg_num("output")];
          string a = kq[i]->args[kq[i]->get_arg_num("a")];
          string b = kq[i]->args[kq[i]->get_arg_num("b")];
          string out = kq[i]->args[kq[i]->get_arg_num("output")];

          if (lastout == a) {
            kq[i-1]->args[kq[i-1]->get_arg_num("accumulator")] = b;
          } else if (lastout == b) {
            kq[i-1]->args[kq[i-1]->get_arg_num("accumulator")] = a;
          } else {
            continue;
          }

          cl_mem tmp = make_image_like(context, *(cl_mem *)lastout.data());
          replacements[lastout] = string((char *)&tmp, sizeof(tmp));
          replacements[out] = string((char *)&tmp, sizeof(tmp));

          short doAccumulate = 1;
          kq[i-1]->args[kq[i-1]->get_arg_num("doAccumulate")] = string((char *)&doAccumulate, sizeof(doAccumulate));

          kq.erase(kq.begin()+i); --i;
        }
      }
    }

    // remap inputs and outputs, and clear the kernels
    for (int i = 0; i < kq.size(); i++) {
      kq[i]->kernel = NULL;
      for (int j = 0; j < kq[i]->num_args; j++) {
        if (replacements.find(kq[i]->args[j]) != replacements.end()) {
          kq[i]->args[j] = replacements[kq[i]->args[j]];
        }
      }
    }

    printf("optimize %lu -> %lu\n", start_size, kq.size());
  } while (kq.size() != start_size);

  size_t work_group_size = 0;
  clGetDeviceInfo(device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(work_group_size), &work_group_size, NULL);
  printf("max work group size %lu\n", work_group_size);

  // local work group optimizer
  for (auto &k : kq) {
    // only do it for convs, since others might share memory
    if (k->name.rfind("convolution_", 0) == 0) {
      int best = -1;
      if (k->local_work_size[0] * k->local_work_size[1] * k->local_work_size[2] < work_group_size/2) {
        uint64_t base_time = k->benchmark();
        uint64_t best_time = base_time;
        for (int i = 0; i < 3; i++) {
          k->local_work_size[i] *= 2;
          uint64_t this_time = k->benchmark();
          if (this_time < best_time) {
            best = i;
            best_time = this_time;
          }
          k->local_work_size[i] /= 2;
        }
        if (best != -1) {
          k->local_work_size[best] *= 2;
          //printf("%s %.2f ms doubled %d to %.2f ms\n", k->name.c_str(), base_time/1e6, best, best_time/1e6);
        }
      }

    }
  }

  return 0;
}
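A hedged usage sketch: optimize() is meant to run on a Thneed instance whose kernel queue (kq) has already been captured from a model run. Everything here other than the optimize() call itself is an assumption about the surrounding harness, not something this diff defines:

// Sketch only: how the optimizer might be driven. Thneed's capture API
// (everything except optimize()) is assumed, not shown in this commit.
Thneed *thneed = new Thneed();
// ... run the model once so thneed records its kernel queue (kq) ...
thneed->optimize();  // swap in custom kernels, fuse layers, tune local work group sizes
// replays of the queue should now run roughly 2 ms faster, per the comment above optimize()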
@@ -1 +1 @@
7d3ad941bc4ba4c923af7a1d7b48544bfc0d3e13
0c94f7c258bcdabc34c7f7be6cb6c2502afbb339