You can not select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
					
					
						
							272 lines
						
					
					
						
							9.9 KiB
						
					
					
				
			
		
		
	
	
							272 lines
						
					
					
						
							9.9 KiB
						
					
					
				|     read_only image2d_t input,
 | |
| #ifndef DEPTHWISE
 | |
|     short startPackedInputChannel,
 | |
|     short numPackedInputChannelsForGroup, short totalNumPackedInputChannels,
 | |
|     // typo required for API compatibility
 | |
|     short packedOuputChannelOffset, short totalNumPackedOutputChannels,
 | |
| #else
 | |
|     short totalNumPackedChannels,
 | |
| #endif
 | |
|     read_only image2d_t weights, __constant float *biases,
 | |
|     short filterSizeX, short filterSizeY,
 | |
|     write_only image2d_t output,
 | |
|     short paddingX, short paddingY, short strideX, short strideY,
 | |
| #ifdef SUPPORT_DILATION
 | |
|     short dilationX, short dilationY,
 | |
| #endif
 | |
|     short neuron, float a, float b, float min_clamp, float max_clamp,
 | |
| #ifndef DEPTHWISE
 | |
|     // note: these are not supported
 | |
|     __constant float *parameters, __constant float *batchNormBiases,
 | |
| #endif
 | |
|     short numOutputColumns
 | |
| #ifdef SUPPORT_ACCUMULATION
 | |
|     , short doAccumulate, read_only image2d_t accumulator
 | |
| #endif
 | |
|     ) {
 | |
| 
 | |
| #ifndef NUM_OUTPUTS
 | |
|   #define NUM_OUTPUTS 4
 | |
| #endif
 | |
| 
 | |
|   // init
 | |
|   const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 | |
|   short packedOutputChannel = get_global_id(0);
 | |
|   short startOutputColumn = mul24((short)get_global_id(1), NUM_OUTPUTS);
 | |
|   short outputRow = get_global_id(2);
 | |
| 
 | |
| #ifdef DEPTHWISE
 | |
|   short totalNumPackedInputChannels = totalNumPackedChannels;
 | |
|   short totalNumPackedOutputChannels = totalNumPackedChannels;
 | |
|   short startPackedInputChannel = packedOutputChannel;
 | |
| #endif
 | |
| 
 | |
|   short startX = mad24(mad24(startOutputColumn, strideX, -paddingX), totalNumPackedInputChannels, startPackedInputChannel);
 | |
|   short strideWithChannels = mul24(strideX, totalNumPackedInputChannels);
 | |
| 
 | |
|   float4 outputValues[NUM_OUTPUTS];
 | |
|   for (short i = 0; i < NUM_OUTPUTS; ++i) {
 | |
|     outputValues[i] = (float4)(0, 0, 0, 0);
 | |
|   }
 | |
| 
 | |
|   int2 inputLocation;
 | |
|   inputLocation.y = mad24(outputRow, strideY, -paddingY);
 | |
| 
 | |
|   int2 weightLocation;
 | |
|   weightLocation.x = 0;
 | |
|   weightLocation.y = packedOutputChannel;
 | |
| 
 | |
| #ifdef DEPTHWISE
 | |
| 
 | |
| #ifdef SUPPORT_DILATION
 | |
| 
 | |
|   // depthwise convolution
 | |
|   for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) {
 | |
|     for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) {
 | |
|       short dilatedStepX = mul24(totalNumPackedChannels, dilationX);
 | |
|       inputLocation.x = mad24(rfColumn, dilatedStepX, startX);
 | |
|       float4 inputValues[4];
 | |
|       for (short i = 0; i < 4; ++i) {
 | |
|         inputValues[i] = read_imagef(input, smp, inputLocation);
 | |
|         inputLocation.x += strideWithChannels;
 | |
|       }
 | |
|       float4 weightValues = read_imagef(weights, smp, weightLocation);
 | |
|       ++weightLocation.x;
 | |
|       outputValues[0] += inputValues[0] * weightValues;
 | |
|       outputValues[1] += inputValues[1] * weightValues;
 | |
|       outputValues[2] += inputValues[2] * weightValues;
 | |
|       outputValues[3] += inputValues[3] * weightValues;
 | |
|     }
 | |
|     inputLocation.y += dilationY;
 | |
|   }
 | |
| 
 | |
| #else
 | |
| 
 | |
|   // depthwise unstrided convolution
 | |
|   for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) {
 | |
|     float4 inputValues[4];
 | |
|     inputLocation.x = startX;
 | |
|     for (short i = 1; i < 4; ++i) {
 | |
|       inputValues[i] = read_imagef(input, smp, inputLocation);
 | |
|       inputLocation.x += totalNumPackedOutputChannels;
 | |
|     }
 | |
|     for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) {
 | |
|       inputValues[0] = inputValues[1];
 | |
|       inputValues[1] = inputValues[2];
 | |
|       inputValues[2] = inputValues[3];
 | |
|       inputValues[3] = read_imagef(input, smp, inputLocation);
 | |
|       inputLocation.x += totalNumPackedChannels;
 | |
|       float4 weightValues = read_imagef(weights, smp, weightLocation);
 | |
|       ++weightLocation.x;
 | |
|       outputValues[0] += inputValues[0] * weightValues;
 | |
|       outputValues[1] += inputValues[1] * weightValues;
 | |
|       outputValues[2] += inputValues[2] * weightValues;
 | |
|       outputValues[3] += inputValues[3] * weightValues;
 | |
|     }
 | |
|     ++inputLocation.y;
 | |
|   }
 | |
| 
 | |
| #endif
 | |
| 
 | |
| #elif defined(ONLY_1X1_CONV)
 | |
| 
 | |
|   // 1x1 convolution
 | |
|   short endPackedInputChannel = startPackedInputChannel + numPackedInputChannelsForGroup;
 | |
|   for (short packedInputChannel = startPackedInputChannel; packedInputChannel < endPackedInputChannel; ++packedInputChannel) {
 | |
|     float4 weightValues[4];
 | |
|     for (short outChIdx = 0; outChIdx < 4; ++outChIdx) {
 | |
|       weightValues[outChIdx] = read_imagef(weights, smp, weightLocation);
 | |
|       ++weightLocation.x;
 | |
|     }
 | |
| 
 | |
|     inputLocation.x = startX + packedInputChannel;
 | |
|     float4 inputValues[NUM_OUTPUTS];
 | |
|     for (short i = 0; i < NUM_OUTPUTS; ++i) {
 | |
|       inputValues[i] = read_imagef(input, smp, inputLocation);
 | |
|       inputLocation.x += strideWithChannels;
 | |
|     }
 | |
| 
 | |
|     for (short i = 0; i < NUM_OUTPUTS; ++i) {
 | |
|       float4 curOutputValues = outputValues[i];
 | |
|       curOutputValues.x += inputValues[i].x * weightValues[0].x;
 | |
|       curOutputValues.x += inputValues[i].y * weightValues[0].y;
 | |
|       curOutputValues.x += inputValues[i].z * weightValues[0].z;
 | |
|       curOutputValues.x += inputValues[i].w * weightValues[0].w;
 | |
|       curOutputValues.y += inputValues[i].x * weightValues[1].x;
 | |
|       curOutputValues.y += inputValues[i].y * weightValues[1].y;
 | |
|       curOutputValues.y += inputValues[i].z * weightValues[1].z;
 | |
|       curOutputValues.y += inputValues[i].w * weightValues[1].w;
 | |
|       curOutputValues.z += inputValues[i].x * weightValues[2].x;
 | |
|       curOutputValues.z += inputValues[i].y * weightValues[2].y;
 | |
|       curOutputValues.z += inputValues[i].z * weightValues[2].z;
 | |
|       curOutputValues.z += inputValues[i].w * weightValues[2].w;
 | |
|       curOutputValues.w += inputValues[i].x * weightValues[3].x;
 | |
|       curOutputValues.w += inputValues[i].y * weightValues[3].y;
 | |
|       curOutputValues.w += inputValues[i].z * weightValues[3].z;
 | |
|       curOutputValues.w += inputValues[i].w * weightValues[3].w;
 | |
|       outputValues[i] = curOutputValues;
 | |
|     }
 | |
|   }
 | |
|   packedOutputChannel += packedOuputChannelOffset;
 | |
| 
 | |
| #else
 | |
| 
 | |
|   // normal convolution
 | |
|   for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) {
 | |
|     for (short packedInputChannel = 0; packedInputChannel < numPackedInputChannelsForGroup; ++packedInputChannel) {
 | |
|       short startXForChannel = startX + packedInputChannel;
 | |
|       for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) {
 | |
| 
 | |
|         float4 weightValues[4];
 | |
|         for (short outChIdx = 0; outChIdx < 4; ++outChIdx) {
 | |
|           weightValues[outChIdx] = read_imagef(weights, smp, weightLocation);
 | |
|           ++weightLocation.x;
 | |
|         }
 | |
| 
 | |
| #ifdef SUPPORT_DILATION
 | |
|         short dilatedStepX = mul24(totalNumPackedInputChannels, dilationX);
 | |
|         inputLocation.x = mad24(rfColumn, dilatedStepX, startXForChannel);
 | |
| #else
 | |
|         inputLocation.x = mad24(rfColumn, totalNumPackedInputChannels, startXForChannel);
 | |
| #endif
 | |
|         float4 inputValues[NUM_OUTPUTS];
 | |
|         for (short i = 0; i < NUM_OUTPUTS; ++i) {
 | |
|           inputValues[i] = read_imagef(input, smp, inputLocation);
 | |
|           inputLocation.x += strideWithChannels;
 | |
|         }
 | |
|         for (short i = 0; i < NUM_OUTPUTS; ++i) {
 | |
|           float4 curOutputValues = outputValues[i];
 | |
|           curOutputValues.x += inputValues[i].x * weightValues[0].x;
 | |
|           curOutputValues.x += inputValues[i].y * weightValues[0].y;
 | |
|           curOutputValues.x += inputValues[i].z * weightValues[0].z;
 | |
|           curOutputValues.x += inputValues[i].w * weightValues[0].w;
 | |
|           curOutputValues.y += inputValues[i].x * weightValues[1].x;
 | |
|           curOutputValues.y += inputValues[i].y * weightValues[1].y;
 | |
|           curOutputValues.y += inputValues[i].z * weightValues[1].z;
 | |
|           curOutputValues.y += inputValues[i].w * weightValues[1].w;
 | |
|           curOutputValues.z += inputValues[i].x * weightValues[2].x;
 | |
|           curOutputValues.z += inputValues[i].y * weightValues[2].y;
 | |
|           curOutputValues.z += inputValues[i].z * weightValues[2].z;
 | |
|           curOutputValues.z += inputValues[i].w * weightValues[2].w;
 | |
|           curOutputValues.w += inputValues[i].x * weightValues[3].x;
 | |
|           curOutputValues.w += inputValues[i].y * weightValues[3].y;
 | |
|           curOutputValues.w += inputValues[i].z * weightValues[3].z;
 | |
|           curOutputValues.w += inputValues[i].w * weightValues[3].w;
 | |
|           outputValues[i] = curOutputValues;
 | |
|         }
 | |
|       }
 | |
|     }
 | |
| #ifdef SUPPORT_DILATION
 | |
|     inputLocation.y += dilationY;
 | |
| #else
 | |
|     ++inputLocation.y;
 | |
| #endif
 | |
|   }
 | |
|   packedOutputChannel += packedOuputChannelOffset;
 | |
| #endif
 | |
| 
 | |
|   // bias
 | |
|   short outputChannel = mul24(packedOutputChannel, 4);
 | |
|   float4 biasValues = vload4(0, biases + outputChannel);
 | |
|   for (short i = 0; i < NUM_OUTPUTS; ++i) {
 | |
|     outputValues[i] += biasValues;
 | |
|   }
 | |
| 
 | |
| #ifdef SUPPORT_ACCUMULATION
 | |
|   // accumulate
 | |
|   if (doAccumulate) {
 | |
|     int2 outputLocation;
 | |
|     short outputColumn = startOutputColumn;
 | |
|     outputLocation.y = outputRow;
 | |
|     for (short i = 0; i < NUM_OUTPUTS; ++i) {
 | |
|       outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel);
 | |
|       if (outputColumn < numOutputColumns) {
 | |
|         outputValues[i] += read_imagef(accumulator, smp, outputLocation);
 | |
|       }
 | |
|       ++outputColumn;
 | |
|     }
 | |
|   }
 | |
| #endif
 | |
| 
 | |
|   // activation
 | |
|   switch (neuron) {
 | |
|   case 1:
 | |
|     for (short i = 0; i < NUM_OUTPUTS; ++i) {
 | |
|       outputValues[i] = max(outputValues[i], 0.0f);
 | |
|     }
 | |
|     break;
 | |
|   case 2:
 | |
|     for (short i = 0; i < NUM_OUTPUTS; ++i) {
 | |
|       outputValues[i] = a * tanh(b * outputValues[i]);
 | |
|     }
 | |
|     break;
 | |
|   case 3:
 | |
|     for (short i = 0; i < NUM_OUTPUTS; ++i) {
 | |
|       outputValues[i] = native_recip(1.0f + native_exp(-a * outputValues[i] + b));
 | |
|     }
 | |
|     break;
 | |
|   case 4:
 | |
|     for (short i = 0; i < NUM_OUTPUTS; ++i) {
 | |
|       outputValues[i] = max(outputValues[i], min_clamp);
 | |
|       outputValues[i] = min(outputValues[i], max_clamp);
 | |
|     }
 | |
|     break;
 | |
|   case 5:
 | |
|     for (short i = 0; i < NUM_OUTPUTS; ++i) {
 | |
|       outputValues[i] = max(outputValues[i], 0.0f) + a * (native_exp(min(outputValues[i], 0.0f)) - 1.0f);
 | |
|     }
 | |
|     break;
 | |
|   }
 | |
| 
 | |
|   // output
 | |
|   int2 outputLocation;
 | |
|   short outputColumn = startOutputColumn;
 | |
|   outputLocation.y = outputRow;
 | |
|   for (short i = 0; i < NUM_OUTPUTS; ++i) {
 | |
|     outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel);
 | |
|     if (outputColumn < numOutputColumns) {
 | |
|       write_imagef(output, outputLocation, outputValues[i]);
 | |
|     }
 | |
|     ++outputColumn;
 | |
|   }
 | |
| }
 | |
| 
 |