You can not select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
			
				
					273 lines
				
				9.9 KiB
			
		
		
			
		
	
	
					273 lines
				
				9.9 KiB
			| 
								 
											4 years ago
										 
									 | 
							
								    read_only image2d_t input,
							 | 
						||
| 
								 | 
							
								#ifndef DEPTHWISE
							 | 
						||
| 
								 | 
							
								    short startPackedInputChannel,
							 | 
						||
| 
								 | 
							
								    short numPackedInputChannelsForGroup, short totalNumPackedInputChannels,
							 | 
						||
| 
								 | 
							
								    // typo required for API compatibility
							 | 
						||
| 
								 | 
							
								    short packedOuputChannelOffset, short totalNumPackedOutputChannels,
							 | 
						||
| 
								 | 
							
								#else
							 | 
						||
| 
								 | 
							
								    short totalNumPackedChannels,
							 | 
						||
| 
								 | 
							
								#endif
							 | 
						||
| 
								 | 
							
								    read_only image2d_t weights, __constant float *biases,
							 | 
						||
| 
								 | 
							
								    short filterSizeX, short filterSizeY,
							 | 
						||
| 
								 | 
							
								    write_only image2d_t output,
							 | 
						||
| 
								 | 
							
								    short paddingX, short paddingY, short strideX, short strideY,
							 | 
						||
| 
								 | 
							
								#ifdef SUPPORT_DILATION
							 | 
						||
| 
								 | 
							
								    short dilationX, short dilationY,
							 | 
						||
| 
								 | 
							
								#endif
							 | 
						||
| 
								 | 
							
								    short neuron, float a, float b, float min_clamp, float max_clamp,
							 | 
						||
| 
								 | 
							
								#ifndef DEPTHWISE
							 | 
						||
| 
								 | 
							
								    // note: these are not supported
							 | 
						||
| 
								 | 
							
								    __constant float *parameters, __constant float *batchNormBiases,
							 | 
						||
| 
								 | 
							
								#endif
							 | 
						||
| 
								 | 
							
								    short numOutputColumns
							 | 
						||
| 
								 | 
							
								#ifdef SUPPORT_ACCUMULATION
							 | 
						||
| 
								 | 
							
								    , short doAccumulate, read_only image2d_t accumulator
							 | 
						||
| 
								 | 
							
								#endif
							 | 
						||
| 
								 | 
							
								    ) {
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#ifndef NUM_OUTPUTS
							 | 
						||
| 
								 | 
							
								  #define NUM_OUTPUTS 4
							 | 
						||
| 
								 | 
							
								#endif
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								  // init
							 | 
						||
| 
								 | 
							
								  const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
							 | 
						||
| 
								 | 
							
								  short packedOutputChannel = get_global_id(0);
							 | 
						||
| 
								 | 
							
								  short startOutputColumn = mul24((short)get_global_id(1), NUM_OUTPUTS);
							 | 
						||
| 
								 | 
							
								  short outputRow = get_global_id(2);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#ifdef DEPTHWISE
							 | 
						||
| 
								 | 
							
								  short totalNumPackedInputChannels = totalNumPackedChannels;
							 | 
						||
| 
								 | 
							
								  short totalNumPackedOutputChannels = totalNumPackedChannels;
							 | 
						||
| 
								 | 
							
								  short startPackedInputChannel = packedOutputChannel;
							 | 
						||
| 
								 | 
							
								#endif
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								  short startX = mad24(mad24(startOutputColumn, strideX, -paddingX), totalNumPackedInputChannels, startPackedInputChannel);
							 | 
						||
| 
								 | 
							
								  short strideWithChannels = mul24(strideX, totalNumPackedInputChannels);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								  float4 outputValues[NUM_OUTPUTS];
							 | 
						||
| 
								 | 
							
								  for (short i = 0; i < NUM_OUTPUTS; ++i) {
							 | 
						||
| 
								 | 
							
								    outputValues[i] = (float4)(0, 0, 0, 0);
							 | 
						||
| 
								 | 
							
								  }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								  int2 inputLocation;
							 | 
						||
| 
								 | 
							
								  inputLocation.y = mad24(outputRow, strideY, -paddingY);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								  int2 weightLocation;
							 | 
						||
| 
								 | 
							
								  weightLocation.x = 0;
							 | 
						||
| 
								 | 
							
								  weightLocation.y = packedOutputChannel;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#ifdef DEPTHWISE
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#ifdef SUPPORT_DILATION
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								  // depthwise convolution
							 | 
						||
| 
								 | 
							
								  for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) {
							 | 
						||
| 
								 | 
							
								    for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) {
							 | 
						||
| 
								 | 
							
								      short dilatedStepX = mul24(totalNumPackedChannels, dilationX);
							 | 
						||
| 
								 | 
							
								      inputLocation.x = mad24(rfColumn, dilatedStepX, startX);
							 | 
						||
| 
								 | 
							
								      float4 inputValues[4];
							 | 
						||
| 
								 | 
							
								      for (short i = 0; i < 4; ++i) {
							 | 
						||
| 
								 | 
							
								        inputValues[i] = read_imagef(input, smp, inputLocation);
							 | 
						||
| 
								 | 
							
								        inputLocation.x += strideWithChannels;
							 | 
						||
| 
								 | 
							
								      }
							 | 
						||
| 
								 | 
							
								      float4 weightValues = read_imagef(weights, smp, weightLocation);
							 | 
						||
| 
								 | 
							
								      ++weightLocation.x;
							 | 
						||
| 
								 | 
							
								      outputValues[0] += inputValues[0] * weightValues;
							 | 
						||
| 
								 | 
							
								      outputValues[1] += inputValues[1] * weightValues;
							 | 
						||
| 
								 | 
							
								      outputValues[2] += inputValues[2] * weightValues;
							 | 
						||
| 
								 | 
							
								      outputValues[3] += inputValues[3] * weightValues;
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								    inputLocation.y += dilationY;
							 | 
						||
| 
								 | 
							
								  }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#else
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								  // depthwise unstrided convolution
							 | 
						||
| 
								 | 
							
								  for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) {
							 | 
						||
| 
								 | 
							
								    float4 inputValues[4];
							 | 
						||
| 
								 | 
							
								    inputLocation.x = startX;
							 | 
						||
| 
								 | 
							
								    for (short i = 1; i < 4; ++i) {
							 | 
						||
| 
								 | 
							
								      inputValues[i] = read_imagef(input, smp, inputLocation);
							 | 
						||
| 
								 | 
							
								      inputLocation.x += totalNumPackedOutputChannels;
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								    for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) {
							 | 
						||
| 
								 | 
							
								      inputValues[0] = inputValues[1];
							 | 
						||
| 
								 | 
							
								      inputValues[1] = inputValues[2];
							 | 
						||
| 
								 | 
							
								      inputValues[2] = inputValues[3];
							 | 
						||
| 
								 | 
							
								      inputValues[3] = read_imagef(input, smp, inputLocation);
							 | 
						||
| 
								 | 
							
								      inputLocation.x += totalNumPackedChannels;
							 | 
						||
| 
								 | 
							
								      float4 weightValues = read_imagef(weights, smp, weightLocation);
							 | 
						||
| 
								 | 
							
								      ++weightLocation.x;
							 | 
						||
| 
								 | 
							
								      outputValues[0] += inputValues[0] * weightValues;
							 | 
						||
| 
								 | 
							
								      outputValues[1] += inputValues[1] * weightValues;
							 | 
						||
| 
								 | 
							
								      outputValues[2] += inputValues[2] * weightValues;
							 | 
						||
| 
								 | 
							
								      outputValues[3] += inputValues[3] * weightValues;
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								    ++inputLocation.y;
							 | 
						||
| 
								 | 
							
								  }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#endif
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#elif defined(ONLY_1X1_CONV)
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								  // 1x1 convolution
							 | 
						||
| 
								 | 
							
								  short endPackedInputChannel = startPackedInputChannel + numPackedInputChannelsForGroup;
							 | 
						||
| 
								 | 
							
								  for (short packedInputChannel = startPackedInputChannel; packedInputChannel < endPackedInputChannel; ++packedInputChannel) {
							 | 
						||
| 
								 | 
							
								    float4 weightValues[4];
							 | 
						||
| 
								 | 
							
								    for (short outChIdx = 0; outChIdx < 4; ++outChIdx) {
							 | 
						||
| 
								 | 
							
								      weightValues[outChIdx] = read_imagef(weights, smp, weightLocation);
							 | 
						||
| 
								 | 
							
								      ++weightLocation.x;
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    inputLocation.x = startX + packedInputChannel;
							 | 
						||
| 
								 | 
							
								    float4 inputValues[NUM_OUTPUTS];
							 | 
						||
| 
								 | 
							
								    for (short i = 0; i < NUM_OUTPUTS; ++i) {
							 | 
						||
| 
								 | 
							
								      inputValues[i] = read_imagef(input, smp, inputLocation);
							 | 
						||
| 
								 | 
							
								      inputLocation.x += strideWithChannels;
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    for (short i = 0; i < NUM_OUTPUTS; ++i) {
							 | 
						||
| 
								 | 
							
								      float4 curOutputValues = outputValues[i];
							 | 
						||
| 
								 | 
							
								      curOutputValues.x += inputValues[i].x * weightValues[0].x;
							 | 
						||
| 
								 | 
							
								      curOutputValues.x += inputValues[i].y * weightValues[0].y;
							 | 
						||
| 
								 | 
							
								      curOutputValues.x += inputValues[i].z * weightValues[0].z;
							 | 
						||
| 
								 | 
							
								      curOutputValues.x += inputValues[i].w * weightValues[0].w;
							 | 
						||
| 
								 | 
							
								      curOutputValues.y += inputValues[i].x * weightValues[1].x;
							 | 
						||
| 
								 | 
							
								      curOutputValues.y += inputValues[i].y * weightValues[1].y;
							 | 
						||
| 
								 | 
							
								      curOutputValues.y += inputValues[i].z * weightValues[1].z;
							 | 
						||
| 
								 | 
							
								      curOutputValues.y += inputValues[i].w * weightValues[1].w;
							 | 
						||
| 
								 | 
							
								      curOutputValues.z += inputValues[i].x * weightValues[2].x;
							 | 
						||
| 
								 | 
							
								      curOutputValues.z += inputValues[i].y * weightValues[2].y;
							 | 
						||
| 
								 | 
							
								      curOutputValues.z += inputValues[i].z * weightValues[2].z;
							 | 
						||
| 
								 | 
							
								      curOutputValues.z += inputValues[i].w * weightValues[2].w;
							 | 
						||
| 
								 | 
							
								      curOutputValues.w += inputValues[i].x * weightValues[3].x;
							 | 
						||
| 
								 | 
							
								      curOutputValues.w += inputValues[i].y * weightValues[3].y;
							 | 
						||
| 
								 | 
							
								      curOutputValues.w += inputValues[i].z * weightValues[3].z;
							 | 
						||
| 
								 | 
							
								      curOutputValues.w += inputValues[i].w * weightValues[3].w;
							 | 
						||
| 
								 | 
							
								      outputValues[i] = curOutputValues;
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								  }
							 | 
						||
| 
								 | 
							
								  packedOutputChannel += packedOuputChannelOffset;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#else
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								  // normal convolution
							 | 
						||
| 
								 | 
							
								  for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) {
							 | 
						||
| 
								 | 
							
								    for (short packedInputChannel = 0; packedInputChannel < numPackedInputChannelsForGroup; ++packedInputChannel) {
							 | 
						||
| 
								 | 
							
								      short startXForChannel = startX + packedInputChannel;
							 | 
						||
| 
								 | 
							
								      for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) {
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        float4 weightValues[4];
							 | 
						||
| 
								 | 
							
								        for (short outChIdx = 0; outChIdx < 4; ++outChIdx) {
							 | 
						||
| 
								 | 
							
								          weightValues[outChIdx] = read_imagef(weights, smp, weightLocation);
							 | 
						||
| 
								 | 
							
								          ++weightLocation.x;
							 | 
						||
| 
								 | 
							
								        }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#ifdef SUPPORT_DILATION
							 | 
						||
| 
								 | 
							
								        short dilatedStepX = mul24(totalNumPackedInputChannels, dilationX);
							 | 
						||
| 
								 | 
							
								        inputLocation.x = mad24(rfColumn, dilatedStepX, startXForChannel);
							 | 
						||
| 
								 | 
							
								#else
							 | 
						||
| 
								 | 
							
								        inputLocation.x = mad24(rfColumn, totalNumPackedInputChannels, startXForChannel);
							 | 
						||
| 
								 | 
							
								#endif
							 | 
						||
| 
								 | 
							
								        float4 inputValues[NUM_OUTPUTS];
							 | 
						||
| 
								 | 
							
								        for (short i = 0; i < NUM_OUTPUTS; ++i) {
							 | 
						||
| 
								 | 
							
								          inputValues[i] = read_imagef(input, smp, inputLocation);
							 | 
						||
| 
								 | 
							
								          inputLocation.x += strideWithChannels;
							 | 
						||
| 
								 | 
							
								        }
							 | 
						||
| 
								 | 
							
								        for (short i = 0; i < NUM_OUTPUTS; ++i) {
							 | 
						||
| 
								 | 
							
								          float4 curOutputValues = outputValues[i];
							 | 
						||
| 
								 | 
							
								          curOutputValues.x += inputValues[i].x * weightValues[0].x;
							 | 
						||
| 
								 | 
							
								          curOutputValues.x += inputValues[i].y * weightValues[0].y;
							 | 
						||
| 
								 | 
							
								          curOutputValues.x += inputValues[i].z * weightValues[0].z;
							 | 
						||
| 
								 | 
							
								          curOutputValues.x += inputValues[i].w * weightValues[0].w;
							 | 
						||
| 
								 | 
							
								          curOutputValues.y += inputValues[i].x * weightValues[1].x;
							 | 
						||
| 
								 | 
							
								          curOutputValues.y += inputValues[i].y * weightValues[1].y;
							 | 
						||
| 
								 | 
							
								          curOutputValues.y += inputValues[i].z * weightValues[1].z;
							 | 
						||
| 
								 | 
							
								          curOutputValues.y += inputValues[i].w * weightValues[1].w;
							 | 
						||
| 
								 | 
							
								          curOutputValues.z += inputValues[i].x * weightValues[2].x;
							 | 
						||
| 
								 | 
							
								          curOutputValues.z += inputValues[i].y * weightValues[2].y;
							 | 
						||
| 
								 | 
							
								          curOutputValues.z += inputValues[i].z * weightValues[2].z;
							 | 
						||
| 
								 | 
							
								          curOutputValues.z += inputValues[i].w * weightValues[2].w;
							 | 
						||
| 
								 | 
							
								          curOutputValues.w += inputValues[i].x * weightValues[3].x;
							 | 
						||
| 
								 | 
							
								          curOutputValues.w += inputValues[i].y * weightValues[3].y;
							 | 
						||
| 
								 | 
							
								          curOutputValues.w += inputValues[i].z * weightValues[3].z;
							 | 
						||
| 
								 | 
							
								          curOutputValues.w += inputValues[i].w * weightValues[3].w;
							 | 
						||
| 
								 | 
							
								          outputValues[i] = curOutputValues;
							 | 
						||
| 
								 | 
							
								        }
							 | 
						||
| 
								 | 
							
								      }
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								#ifdef SUPPORT_DILATION
							 | 
						||
| 
								 | 
							
								    inputLocation.y += dilationY;
							 | 
						||
| 
								 | 
							
								#else
							 | 
						||
| 
								 | 
							
								    ++inputLocation.y;
							 | 
						||
| 
								 | 
							
								#endif
							 | 
						||
| 
								 | 
							
								  }
							 | 
						||
| 
								 | 
							
								  packedOutputChannel += packedOuputChannelOffset;
							 | 
						||
| 
								 | 
							
								#endif
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								  // bias
							 | 
						||
| 
								 | 
							
								  short outputChannel = mul24(packedOutputChannel, 4);
							 | 
						||
| 
								 | 
							
								  float4 biasValues = vload4(0, biases + outputChannel);
							 | 
						||
| 
								 | 
							
								  for (short i = 0; i < NUM_OUTPUTS; ++i) {
							 | 
						||
| 
								 | 
							
								    outputValues[i] += biasValues;
							 | 
						||
| 
								 | 
							
								  }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#ifdef SUPPORT_ACCUMULATION
							 | 
						||
| 
								 | 
							
								  // accumulate
							 | 
						||
| 
								 | 
							
								  if (doAccumulate) {
							 | 
						||
| 
								 | 
							
								    int2 outputLocation;
							 | 
						||
| 
								 | 
							
								    short outputColumn = startOutputColumn;
							 | 
						||
| 
								 | 
							
								    outputLocation.y = outputRow;
							 | 
						||
| 
								 | 
							
								    for (short i = 0; i < NUM_OUTPUTS; ++i) {
							 | 
						||
| 
								 | 
							
								      outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel);
							 | 
						||
| 
								 | 
							
								      if (outputColumn < numOutputColumns) {
							 | 
						||
| 
								 | 
							
								        outputValues[i] += read_imagef(accumulator, smp, outputLocation);
							 | 
						||
| 
								 | 
							
								      }
							 | 
						||
| 
								 | 
							
								      ++outputColumn;
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								  }
							 | 
						||
| 
								 | 
							
								#endif
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								  // activation
							 | 
						||
| 
								 | 
							
								  switch (neuron) {
							 | 
						||
| 
								 | 
							
								  case 1:
							 | 
						||
| 
								 | 
							
								    for (short i = 0; i < NUM_OUTPUTS; ++i) {
							 | 
						||
| 
								 | 
							
								      outputValues[i] = max(outputValues[i], 0.0f);
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								    break;
							 | 
						||
| 
								 | 
							
								  case 2:
							 | 
						||
| 
								 | 
							
								    for (short i = 0; i < NUM_OUTPUTS; ++i) {
							 | 
						||
| 
								 | 
							
								      outputValues[i] = a * tanh(b * outputValues[i]);
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								    break;
							 | 
						||
| 
								 | 
							
								  case 3:
							 | 
						||
| 
								 | 
							
								    for (short i = 0; i < NUM_OUTPUTS; ++i) {
							 | 
						||
| 
								 | 
							
								      outputValues[i] = native_recip(1.0f + native_exp(-a * outputValues[i] + b));
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								    break;
							 | 
						||
| 
								 | 
							
								  case 4:
							 | 
						||
| 
								 | 
							
								    for (short i = 0; i < NUM_OUTPUTS; ++i) {
							 | 
						||
| 
								 | 
							
								      outputValues[i] = max(outputValues[i], min_clamp);
							 | 
						||
| 
								 | 
							
								      outputValues[i] = min(outputValues[i], max_clamp);
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								    break;
							 | 
						||
| 
								 | 
							
								  case 5:
							 | 
						||
| 
								 | 
							
								    for (short i = 0; i < NUM_OUTPUTS; ++i) {
							 | 
						||
| 
								 | 
							
								      outputValues[i] = max(outputValues[i], 0.0f) + a * (native_exp(min(outputValues[i], 0.0f)) - 1.0f);
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								    break;
							 | 
						||
| 
								 | 
							
								  }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								  // output
							 | 
						||
| 
								 | 
							
								  int2 outputLocation;
							 | 
						||
| 
								 | 
							
								  short outputColumn = startOutputColumn;
							 | 
						||
| 
								 | 
							
								  outputLocation.y = outputRow;
							 | 
						||
| 
								 | 
							
								  for (short i = 0; i < NUM_OUTPUTS; ++i) {
							 | 
						||
| 
								 | 
							
								    outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel);
							 | 
						||
| 
								 | 
							
								    if (outputColumn < numOutputColumns) {
							 | 
						||
| 
								 | 
							
								      write_imagef(output, outputLocation, outputValues[i]);
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								    ++outputColumn;
							 | 
						||
| 
								 | 
							
								  }
							 | 
						||
| 
								 | 
							
								}
							 |