Refactor convolutions (#23807)
* one conv with defines * add conv * building works on C3 * this is num_outputs too, process replay is so useful Co-authored-by: Comma Device <device@comma.ai>pull/23811/head
parent
719801845b
commit
2c7542d34e
7 changed files with 292 additions and 599 deletions
@ -0,0 +1,272 @@ |
||||
read_only image2d_t input, |
||||
#ifndef DEPTHWISE |
||||
short startPackedInputChannel, |
||||
short numPackedInputChannelsForGroup, short totalNumPackedInputChannels, |
||||
// typo required for API compatibility |
||||
short packedOuputChannelOffset, short totalNumPackedOutputChannels, |
||||
#else |
||||
short totalNumPackedChannels, |
||||
#endif |
||||
read_only image2d_t weights, __constant float *biases, |
||||
short filterSizeX, short filterSizeY, |
||||
write_only image2d_t output, |
||||
short paddingX, short paddingY, short strideX, short strideY, |
||||
#ifdef SUPPORT_DILATION |
||||
short dilationX, short dilationY, |
||||
#endif |
||||
short neuron, float a, float b, float min_clamp, float max_clamp, |
||||
#ifndef DEPTHWISE |
||||
// note: these are not supported |
||||
__constant float *parameters, __constant float *batchNormBiases, |
||||
#endif |
||||
short numOutputColumns |
||||
#ifdef SUPPORT_ACCUMULATION |
||||
, short doAccumulate, read_only image2d_t accumulator |
||||
#endif |
||||
) { |
||||
|
||||
#ifndef NUM_OUTPUTS |
||||
#define NUM_OUTPUTS 4 |
||||
#endif |
||||
|
||||
// init |
||||
const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; |
||||
short packedOutputChannel = get_global_id(0); |
||||
short startOutputColumn = mul24((short)get_global_id(1), NUM_OUTPUTS); |
||||
short outputRow = get_global_id(2); |
||||
|
||||
#ifdef DEPTHWISE |
||||
short totalNumPackedInputChannels = totalNumPackedChannels; |
||||
short totalNumPackedOutputChannels = totalNumPackedChannels; |
||||
short startPackedInputChannel = packedOutputChannel; |
||||
#endif |
||||
|
||||
short startX = mad24(mad24(startOutputColumn, strideX, -paddingX), totalNumPackedInputChannels, startPackedInputChannel); |
||||
short strideWithChannels = mul24(strideX, totalNumPackedInputChannels); |
||||
|
||||
float4 outputValues[NUM_OUTPUTS]; |
||||
for (short i = 0; i < NUM_OUTPUTS; ++i) { |
||||
outputValues[i] = (float4)(0, 0, 0, 0); |
||||
} |
||||
|
||||
int2 inputLocation; |
||||
inputLocation.y = mad24(outputRow, strideY, -paddingY); |
||||
|
||||
int2 weightLocation; |
||||
weightLocation.x = 0; |
||||
weightLocation.y = packedOutputChannel; |
||||
|
||||
#ifdef DEPTHWISE |
||||
|
||||
#ifdef SUPPORT_DILATION |
||||
|
||||
// depthwise convolution |
||||
for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) { |
||||
for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) { |
||||
short dilatedStepX = mul24(totalNumPackedChannels, dilationX); |
||||
inputLocation.x = mad24(rfColumn, dilatedStepX, startX); |
||||
float4 inputValues[4]; |
||||
for (short i = 0; i < 4; ++i) { |
||||
inputValues[i] = read_imagef(input, smp, inputLocation); |
||||
inputLocation.x += strideWithChannels; |
||||
} |
||||
float4 weightValues = read_imagef(weights, smp, weightLocation); |
||||
++weightLocation.x; |
||||
outputValues[0] += inputValues[0] * weightValues; |
||||
outputValues[1] += inputValues[1] * weightValues; |
||||
outputValues[2] += inputValues[2] * weightValues; |
||||
outputValues[3] += inputValues[3] * weightValues; |
||||
} |
||||
inputLocation.y += dilationY; |
||||
} |
||||
|
||||
#else |
||||
|
||||
// depthwise unstrided convolution |
||||
for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) { |
||||
float4 inputValues[4]; |
||||
inputLocation.x = startX; |
||||
for (short i = 1; i < 4; ++i) { |
||||
inputValues[i] = read_imagef(input, smp, inputLocation); |
||||
inputLocation.x += totalNumPackedOutputChannels; |
||||
} |
||||
for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) { |
||||
inputValues[0] = inputValues[1]; |
||||
inputValues[1] = inputValues[2]; |
||||
inputValues[2] = inputValues[3]; |
||||
inputValues[3] = read_imagef(input, smp, inputLocation); |
||||
inputLocation.x += totalNumPackedChannels; |
||||
float4 weightValues = read_imagef(weights, smp, weightLocation); |
||||
++weightLocation.x; |
||||
outputValues[0] += inputValues[0] * weightValues; |
||||
outputValues[1] += inputValues[1] * weightValues; |
||||
outputValues[2] += inputValues[2] * weightValues; |
||||
outputValues[3] += inputValues[3] * weightValues; |
||||
} |
||||
++inputLocation.y; |
||||
} |
||||
|
||||
#endif |
||||
|
||||
#elif defined(ONLY_1X1_CONV) |
||||
|
||||
// 1x1 convolution |
||||
short endPackedInputChannel = startPackedInputChannel + numPackedInputChannelsForGroup; |
||||
for (short packedInputChannel = startPackedInputChannel; packedInputChannel < endPackedInputChannel; ++packedInputChannel) { |
||||
float4 weightValues[4]; |
||||
for (short outChIdx = 0; outChIdx < 4; ++outChIdx) { |
||||
weightValues[outChIdx] = read_imagef(weights, smp, weightLocation); |
||||
++weightLocation.x; |
||||
} |
||||
|
||||
inputLocation.x = startX + packedInputChannel; |
||||
float4 inputValues[NUM_OUTPUTS]; |
||||
for (short i = 0; i < NUM_OUTPUTS; ++i) { |
||||
inputValues[i] = read_imagef(input, smp, inputLocation); |
||||
inputLocation.x += strideWithChannels; |
||||
} |
||||
|
||||
for (short i = 0; i < NUM_OUTPUTS; ++i) { |
||||
float4 curOutputValues = outputValues[i]; |
||||
curOutputValues.x += inputValues[i].x * weightValues[0].x; |
||||
curOutputValues.x += inputValues[i].y * weightValues[0].y; |
||||
curOutputValues.x += inputValues[i].z * weightValues[0].z; |
||||
curOutputValues.x += inputValues[i].w * weightValues[0].w; |
||||
curOutputValues.y += inputValues[i].x * weightValues[1].x; |
||||
curOutputValues.y += inputValues[i].y * weightValues[1].y; |
||||
curOutputValues.y += inputValues[i].z * weightValues[1].z; |
||||
curOutputValues.y += inputValues[i].w * weightValues[1].w; |
||||
curOutputValues.z += inputValues[i].x * weightValues[2].x; |
||||
curOutputValues.z += inputValues[i].y * weightValues[2].y; |
||||
curOutputValues.z += inputValues[i].z * weightValues[2].z; |
||||
curOutputValues.z += inputValues[i].w * weightValues[2].w; |
||||
curOutputValues.w += inputValues[i].x * weightValues[3].x; |
||||
curOutputValues.w += inputValues[i].y * weightValues[3].y; |
||||
curOutputValues.w += inputValues[i].z * weightValues[3].z; |
||||
curOutputValues.w += inputValues[i].w * weightValues[3].w; |
||||
outputValues[i] = curOutputValues; |
||||
} |
||||
} |
||||
packedOutputChannel += packedOuputChannelOffset; |
||||
|
||||
#else |
||||
|
||||
// normal convolution |
||||
for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) { |
||||
for (short packedInputChannel = 0; packedInputChannel < numPackedInputChannelsForGroup; ++packedInputChannel) { |
||||
short startXForChannel = startX + packedInputChannel; |
||||
for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) { |
||||
|
||||
float4 weightValues[4]; |
||||
for (short outChIdx = 0; outChIdx < 4; ++outChIdx) { |
||||
weightValues[outChIdx] = read_imagef(weights, smp, weightLocation); |
||||
++weightLocation.x; |
||||
} |
||||
|
||||
#ifdef SUPPORT_DILATION |
||||
short dilatedStepX = mul24(totalNumPackedInputChannels, dilationX); |
||||
inputLocation.x = mad24(rfColumn, dilatedStepX, startXForChannel); |
||||
#else |
||||
inputLocation.x = mad24(rfColumn, totalNumPackedInputChannels, startXForChannel); |
||||
#endif |
||||
float4 inputValues[NUM_OUTPUTS]; |
||||
for (short i = 0; i < NUM_OUTPUTS; ++i) { |
||||
inputValues[i] = read_imagef(input, smp, inputLocation); |
||||
inputLocation.x += strideWithChannels; |
||||
} |
||||
for (short i = 0; i < NUM_OUTPUTS; ++i) { |
||||
float4 curOutputValues = outputValues[i]; |
||||
curOutputValues.x += inputValues[i].x * weightValues[0].x; |
||||
curOutputValues.x += inputValues[i].y * weightValues[0].y; |
||||
curOutputValues.x += inputValues[i].z * weightValues[0].z; |
||||
curOutputValues.x += inputValues[i].w * weightValues[0].w; |
||||
curOutputValues.y += inputValues[i].x * weightValues[1].x; |
||||
curOutputValues.y += inputValues[i].y * weightValues[1].y; |
||||
curOutputValues.y += inputValues[i].z * weightValues[1].z; |
||||
curOutputValues.y += inputValues[i].w * weightValues[1].w; |
||||
curOutputValues.z += inputValues[i].x * weightValues[2].x; |
||||
curOutputValues.z += inputValues[i].y * weightValues[2].y; |
||||
curOutputValues.z += inputValues[i].z * weightValues[2].z; |
||||
curOutputValues.z += inputValues[i].w * weightValues[2].w; |
||||
curOutputValues.w += inputValues[i].x * weightValues[3].x; |
||||
curOutputValues.w += inputValues[i].y * weightValues[3].y; |
||||
curOutputValues.w += inputValues[i].z * weightValues[3].z; |
||||
curOutputValues.w += inputValues[i].w * weightValues[3].w; |
||||
outputValues[i] = curOutputValues; |
||||
} |
||||
} |
||||
} |
||||
#ifdef SUPPORT_DILATION |
||||
inputLocation.y += dilationY; |
||||
#else |
||||
++inputLocation.y; |
||||
#endif |
||||
} |
||||
packedOutputChannel += packedOuputChannelOffset; |
||||
#endif |
||||
|
||||
// bias |
||||
short outputChannel = mul24(packedOutputChannel, 4); |
||||
float4 biasValues = vload4(0, biases + outputChannel); |
||||
for (short i = 0; i < NUM_OUTPUTS; ++i) { |
||||
outputValues[i] += biasValues; |
||||
} |
||||
|
||||
#ifdef SUPPORT_ACCUMULATION |
||||
// accumulate |
||||
if (doAccumulate) { |
||||
int2 outputLocation; |
||||
short outputColumn = startOutputColumn; |
||||
outputLocation.y = outputRow; |
||||
for (short i = 0; i < NUM_OUTPUTS; ++i) { |
||||
outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel); |
||||
if (outputColumn < numOutputColumns) { |
||||
outputValues[i] += read_imagef(accumulator, smp, outputLocation); |
||||
} |
||||
++outputColumn; |
||||
} |
||||
} |
||||
#endif |
||||
|
||||
// activation |
||||
switch (neuron) { |
||||
case 1: |
||||
for (short i = 0; i < NUM_OUTPUTS; ++i) { |
||||
outputValues[i] = max(outputValues[i], 0.0f); |
||||
} |
||||
break; |
||||
case 2: |
||||
for (short i = 0; i < NUM_OUTPUTS; ++i) { |
||||
outputValues[i] = a * tanh(b * outputValues[i]); |
||||
} |
||||
break; |
||||
case 3: |
||||
for (short i = 0; i < NUM_OUTPUTS; ++i) { |
||||
outputValues[i] = native_recip(1.0f + native_exp(-a * outputValues[i] + b)); |
||||
} |
||||
break; |
||||
case 4: |
||||
for (short i = 0; i < NUM_OUTPUTS; ++i) { |
||||
outputValues[i] = max(outputValues[i], min_clamp); |
||||
outputValues[i] = min(outputValues[i], max_clamp); |
||||
} |
||||
break; |
||||
case 5: |
||||
for (short i = 0; i < NUM_OUTPUTS; ++i) { |
||||
outputValues[i] = max(outputValues[i], 0.0f) + a * (native_exp(min(outputValues[i], 0.0f)) - 1.0f); |
||||
} |
||||
break; |
||||
} |
||||
|
||||
// output |
||||
int2 outputLocation; |
||||
short outputColumn = startOutputColumn; |
||||
outputLocation.y = outputRow; |
||||
for (short i = 0; i < NUM_OUTPUTS; ++i) { |
||||
outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel); |
||||
if (outputColumn < numOutputColumns) { |
||||
write_imagef(output, outputLocation, outputValues[i]); |
||||
} |
||||
++outputColumn; |
||||
} |
||||
} |
@ -1,129 +1,4 @@ |
||||
__kernel void convolution_horizontal_reduced_reads( |
||||
read_only image2d_t input, |
||||
short startPackedInputChannel, |
||||
short numPackedInputChannelsForGroup, short totalNumPackedInputChannels, |
||||
short packedOuputChannelOffset, short totalNumPackedOutputChannels, |
||||
read_only image2d_t weights, __constant float *biases, |
||||
short filterSizeX, short filterSizeY, |
||||
write_only image2d_t output, |
||||
short paddingX, short paddingY, short strideX, short strideY, |
||||
short dilationX, short dilationY, |
||||
short neuron, float a, float b, float min_clamp, float max_clamp, |
||||
__constant float *parameters, __constant float *batchNormBiases, |
||||
short numOutputColumns) { |
||||
|
||||
// init |
||||
const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; |
||||
short packedOutputChannel = get_global_id(0); |
||||
short startOutputColumn = mul24((short)get_global_id(1), 4); |
||||
short outputRow = get_global_id(2); |
||||
short startX = mad24(mad24(startOutputColumn, strideX, -paddingX), |
||||
totalNumPackedInputChannels, startPackedInputChannel); |
||||
short strideWithChannels = mul24(strideX, totalNumPackedInputChannels); |
||||
|
||||
float4 outputValues[4]; |
||||
for (short i = 0; i < 4; ++i) { |
||||
outputValues[i] = (float4)(0, 0, 0, 0); |
||||
} |
||||
|
||||
int2 inputLocation; |
||||
inputLocation.y = mad24(outputRow, strideY, -paddingY); |
||||
|
||||
int2 weightLocation; |
||||
weightLocation.x = 0; |
||||
weightLocation.y = packedOutputChannel; |
||||
#define SUPPORT_DILATION |
||||
|
||||
// convolution |
||||
for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) { |
||||
for (short packedInputChannel = 0; |
||||
packedInputChannel < numPackedInputChannelsForGroup; |
||||
++packedInputChannel) { |
||||
short startXForChannel = startX + packedInputChannel; |
||||
for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) { |
||||
float4 weightValues[4]; |
||||
for (short outChIdx = 0; outChIdx < 4; ++outChIdx) { |
||||
weightValues[outChIdx] = read_imagef(weights, smp, weightLocation); |
||||
++weightLocation.x; |
||||
} |
||||
short dilatedStepX = mul24(totalNumPackedInputChannels, dilationX); |
||||
inputLocation.x = mad24(rfColumn, dilatedStepX, startXForChannel); |
||||
float4 inputValues[4]; |
||||
for (short i = 0; i < 4; ++i) { |
||||
inputValues[i] = read_imagef(input, smp, inputLocation); |
||||
inputLocation.x += strideWithChannels; |
||||
} |
||||
for (short i = 0; i < 4; ++i) { |
||||
float4 curOutputValues = outputValues[i]; |
||||
curOutputValues.x += inputValues[i].x * weightValues[0].x; |
||||
curOutputValues.x += inputValues[i].y * weightValues[0].y; |
||||
curOutputValues.x += inputValues[i].z * weightValues[0].z; |
||||
curOutputValues.x += inputValues[i].w * weightValues[0].w; |
||||
curOutputValues.y += inputValues[i].x * weightValues[1].x; |
||||
curOutputValues.y += inputValues[i].y * weightValues[1].y; |
||||
curOutputValues.y += inputValues[i].z * weightValues[1].z; |
||||
curOutputValues.y += inputValues[i].w * weightValues[1].w; |
||||
curOutputValues.z += inputValues[i].x * weightValues[2].x; |
||||
curOutputValues.z += inputValues[i].y * weightValues[2].y; |
||||
curOutputValues.z += inputValues[i].z * weightValues[2].z; |
||||
curOutputValues.z += inputValues[i].w * weightValues[2].w; |
||||
curOutputValues.w += inputValues[i].x * weightValues[3].x; |
||||
curOutputValues.w += inputValues[i].y * weightValues[3].y; |
||||
curOutputValues.w += inputValues[i].z * weightValues[3].z; |
||||
curOutputValues.w += inputValues[i].w * weightValues[3].w; |
||||
outputValues[i] = curOutputValues; |
||||
} |
||||
} |
||||
} |
||||
inputLocation.y += dilationY; |
||||
} |
||||
|
||||
// bias |
||||
packedOutputChannel += packedOuputChannelOffset; |
||||
short outputChannel = mul24(packedOutputChannel, 4); |
||||
float4 biasValues = vload4(0, biases + outputChannel); |
||||
for (short i = 0; i < 4; ++i) { |
||||
outputValues[i] += biasValues; |
||||
} |
||||
|
||||
// activation |
||||
switch (neuron) { |
||||
case 1: |
||||
for (short i = 0; i < 4; ++i) { |
||||
outputValues[i] = max(outputValues[i], 0.0f); |
||||
} |
||||
break; |
||||
case 2: |
||||
for (short i = 0; i < 4; ++i) { |
||||
outputValues[i] = a * tanh(b * outputValues[i]); |
||||
} |
||||
break; |
||||
case 3: |
||||
for (short i = 0; i < 4; ++i) { |
||||
outputValues[i] = native_recip(1.0f + native_exp(-a * outputValues[i] + b)); |
||||
} |
||||
break; |
||||
case 4: |
||||
for (short i = 0; i < 4; ++i) { |
||||
outputValues[i] = max(outputValues[i], min_clamp); |
||||
outputValues[i] = min(outputValues[i], max_clamp); |
||||
} |
||||
break; |
||||
case 5: |
||||
for (short i = 0; i < 4; ++i) { |
||||
outputValues[i] = max(outputValues[i], 0.0f) + a * (native_exp(min(outputValues[i], 0.0f)) - 1.0f); |
||||
} |
||||
break; |
||||
} |
||||
|
||||
// output |
||||
int2 outputLocation; |
||||
short outputColumn = startOutputColumn; |
||||
outputLocation.y = outputRow; |
||||
for (short i = 0; i < 4; ++i) { |
||||
outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel); |
||||
if (outputColumn < numOutputColumns) { |
||||
write_imagef(output, outputLocation, outputValues[i]); |
||||
} |
||||
++outputColumn; |
||||
} |
||||
} |
||||
__kernel void convolution_horizontal_reduced_reads( |
||||
#include "convolution_.cl" |
||||
|
@ -1,140 +1,5 @@ |
||||
__kernel void convolution_horizontal_reduced_reads_1x1( |
||||
read_only image2d_t input, |
||||
short startPackedInputChannel, |
||||
short numPackedInputChannelsForGroup, short totalNumPackedInputChannels, |
||||
short packedOuputChannelOffset, short totalNumPackedOutputChannels, |
||||
read_only image2d_t weights, __constant float *biases, |
||||
short filterSizeX, short filterSizeY, |
||||
write_only image2d_t output, |
||||
short paddingX, short paddingY, short strideX, short strideY, |
||||
short neuron, float a, float b, float min_clamp, float max_clamp, |
||||
__constant float *parameters, __constant float *batchNormBiases, |
||||
short numOutputColumns, |
||||
short doAccumulate, read_only image2d_t accumulator) { |
||||
|
||||
// init |
||||
const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; |
||||
short packedOutputChannel = get_global_id(0); |
||||
short startOutputColumn = mul24((short)get_global_id(1), 4); |
||||
short outputRow = get_global_id(2); |
||||
short endPackedInputChannel = startPackedInputChannel + numPackedInputChannelsForGroup; |
||||
short startX = mad24(mad24(startOutputColumn, strideX, -paddingX), |
||||
totalNumPackedInputChannels, startPackedInputChannel); |
||||
short strideWithChannels = mul24(strideX, totalNumPackedInputChannels); |
||||
|
||||
float4 outputValues[4]; |
||||
for (short i = 0; i < 4; ++i) { |
||||
outputValues[i] = (float4)(0, 0, 0, 0); |
||||
} |
||||
|
||||
int2 inputLocation; |
||||
inputLocation.y = mad24(outputRow, strideY, -paddingY); |
||||
|
||||
int2 weightLocation; |
||||
weightLocation.x = 0; |
||||
weightLocation.y = packedOutputChannel; |
||||
|
||||
// convolution |
||||
for (short packedInputChannel = startPackedInputChannel; |
||||
packedInputChannel < endPackedInputChannel; ++packedInputChannel) { |
||||
|
||||
float4 weightValues[4]; |
||||
for (short outChIdx = 0; outChIdx < 4; ++outChIdx) { |
||||
weightValues[outChIdx] = read_imagef(weights, smp, weightLocation); |
||||
++weightLocation.x; |
||||
} |
||||
|
||||
inputLocation.x = startX + packedInputChannel; |
||||
float4 inputValues[4]; |
||||
for (short i = 0; i < 4; ++i) { |
||||
inputValues[i] = read_imagef(input, smp, inputLocation); |
||||
inputLocation.x += strideWithChannels; |
||||
} |
||||
|
||||
for (short i = 0; i < 4; ++i) { |
||||
float4 curOutputValues = outputValues[i]; |
||||
curOutputValues.x += inputValues[i].x * weightValues[0].x; |
||||
curOutputValues.x += inputValues[i].y * weightValues[0].y; |
||||
curOutputValues.x += inputValues[i].z * weightValues[0].z; |
||||
curOutputValues.x += inputValues[i].w * weightValues[0].w; |
||||
curOutputValues.y += inputValues[i].x * weightValues[1].x; |
||||
curOutputValues.y += inputValues[i].y * weightValues[1].y; |
||||
curOutputValues.y += inputValues[i].z * weightValues[1].z; |
||||
curOutputValues.y += inputValues[i].w * weightValues[1].w; |
||||
curOutputValues.z += inputValues[i].x * weightValues[2].x; |
||||
curOutputValues.z += inputValues[i].y * weightValues[2].y; |
||||
curOutputValues.z += inputValues[i].z * weightValues[2].z; |
||||
curOutputValues.z += inputValues[i].w * weightValues[2].w; |
||||
curOutputValues.w += inputValues[i].x * weightValues[3].x; |
||||
curOutputValues.w += inputValues[i].y * weightValues[3].y; |
||||
curOutputValues.w += inputValues[i].z * weightValues[3].z; |
||||
curOutputValues.w += inputValues[i].w * weightValues[3].w; |
||||
outputValues[i] = curOutputValues; |
||||
} |
||||
} |
||||
|
||||
// bias |
||||
packedOutputChannel += packedOuputChannelOffset; |
||||
short outputChannel = mul24(packedOutputChannel, 4); |
||||
float4 biasValues = vload4(0, biases + outputChannel); |
||||
for (short i = 0; i < 4; ++i) { |
||||
outputValues[i] += biasValues; |
||||
} |
||||
|
||||
// accumulate |
||||
if (doAccumulate) { |
||||
int2 outputLocation; |
||||
short outputColumn = startOutputColumn; |
||||
outputLocation.y = outputRow; |
||||
for (short i = 0; i < 4; ++i) { |
||||
outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel); |
||||
if (outputColumn < numOutputColumns) { |
||||
outputValues[i] += read_imagef(accumulator, smp, outputLocation); |
||||
} |
||||
++outputColumn; |
||||
} |
||||
} |
||||
|
||||
// activation |
||||
switch (neuron) { |
||||
case 1: |
||||
for (short i = 0; i < 4; ++i) { |
||||
outputValues[i] = max(outputValues[i], 0.0f); |
||||
} |
||||
break; |
||||
case 2: |
||||
for (short i = 0; i < 4; ++i) { |
||||
outputValues[i] = a * tanh(b * outputValues[i]); |
||||
} |
||||
break; |
||||
case 3: |
||||
for (short i = 0; i < 4; ++i) { |
||||
outputValues[i] = native_recip(1.0f + native_exp(-a * outputValues[i] + b)); |
||||
} |
||||
break; |
||||
case 4: |
||||
for (short i = 0; i < 4; ++i) { |
||||
outputValues[i] = max(outputValues[i], min_clamp); |
||||
outputValues[i] = min(outputValues[i], max_clamp); |
||||
} |
||||
break; |
||||
case 5: |
||||
for (short i = 0; i < 4; ++i) { |
||||
outputValues[i] = max(outputValues[i], 0.0f) + a * (native_exp(min(outputValues[i], 0.0f)) - 1.0f); |
||||
} |
||||
break; |
||||
} |
||||
|
||||
// output |
||||
int2 outputLocation; |
||||
short outputColumn = startOutputColumn; |
||||
outputLocation.y = outputRow; |
||||
for (short i = 0; i < 4; ++i) { |
||||
outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel); |
||||
if (outputColumn < numOutputColumns) { |
||||
write_imagef(output, outputLocation, outputValues[i]); |
||||
} |
||||
++outputColumn; |
||||
} |
||||
} |
||||
#define ONLY_1X1_CONV |
||||
#define SUPPORT_ACCUMULATION |
||||
|
||||
__kernel void convolution_horizontal_reduced_reads_1x1( |
||||
#include "convolution_.cl" |
||||
|
@ -1,130 +1,4 @@ |
||||
__kernel void convolution_horizontal_reduced_reads_5_outputs( |
||||
read_only image2d_t input, |
||||
short startPackedInputChannel, |
||||
short numPackedInputChannelsForGroup, short totalNumPackedInputChannels, |
||||
short packedOuputChannelOffset, short totalNumPackedOutputChannels, |
||||
read_only image2d_t weights, __constant float *biases, |
||||
short filterSizeX, short filterSizeY, |
||||
write_only image2d_t output, |
||||
short paddingX, short paddingY, short strideX, short strideY, |
||||
short neuron, float a, float b, float min_clamp, float max_clamp, |
||||
__constant float *parameters, __constant float *batchNormBiases, |
||||
short numOutputColumns) { |
||||
|
||||
// init |
||||
const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; |
||||
short packedOutputChannel = get_global_id(0); |
||||
short startOutputColumn = mul24((short)get_global_id(1), 5); |
||||
short outputRow = get_global_id(2); |
||||
short startX = mad24(mad24(startOutputColumn, strideX, -paddingX), |
||||
totalNumPackedInputChannels, startPackedInputChannel); |
||||
short strideWithChannels = mul24(strideX, totalNumPackedInputChannels); |
||||
|
||||
float4 outputValues[5]; |
||||
for (short i = 0; i < 5; ++i) { |
||||
outputValues[i] = (float4)(0, 0, 0, 0); |
||||
} |
||||
|
||||
int2 inputLocation; |
||||
inputLocation.y = mad24(outputRow, strideY, -paddingY); |
||||
|
||||
int2 weightLocation; |
||||
weightLocation.x = 0; |
||||
weightLocation.y = packedOutputChannel; |
||||
|
||||
// convolution |
||||
for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) { |
||||
for (short packedInputChannel = 0; |
||||
packedInputChannel < numPackedInputChannelsForGroup; |
||||
++packedInputChannel) { |
||||
short startXForChannel = startX + packedInputChannel; |
||||
for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) { |
||||
float4 weightValues[4]; |
||||
for (short outChIdx = 0; outChIdx < 4; ++outChIdx) { |
||||
weightValues[outChIdx] = read_imagef(weights, smp, weightLocation); |
||||
++weightLocation.x; |
||||
} |
||||
inputLocation.x = |
||||
mad24(rfColumn, totalNumPackedInputChannels, startXForChannel); |
||||
#define NUM_OUTPUTS 5 |
||||
|
||||
float4 inputValues[5]; |
||||
for (short i = 0; i < 5; ++i) { |
||||
inputValues[i] = read_imagef(input, smp, inputLocation); |
||||
inputLocation.x += strideWithChannels; |
||||
} |
||||
|
||||
for (short i = 0; i < 5; ++i) { |
||||
float4 curOutputValues = outputValues[i]; |
||||
curOutputValues.x += inputValues[i].x * weightValues[0].x; |
||||
curOutputValues.x += inputValues[i].y * weightValues[0].y; |
||||
curOutputValues.x += inputValues[i].z * weightValues[0].z; |
||||
curOutputValues.x += inputValues[i].w * weightValues[0].w; |
||||
curOutputValues.y += inputValues[i].x * weightValues[1].x; |
||||
curOutputValues.y += inputValues[i].y * weightValues[1].y; |
||||
curOutputValues.y += inputValues[i].z * weightValues[1].z; |
||||
curOutputValues.y += inputValues[i].w * weightValues[1].w; |
||||
curOutputValues.z += inputValues[i].x * weightValues[2].x; |
||||
curOutputValues.z += inputValues[i].y * weightValues[2].y; |
||||
curOutputValues.z += inputValues[i].z * weightValues[2].z; |
||||
curOutputValues.z += inputValues[i].w * weightValues[2].w; |
||||
curOutputValues.w += inputValues[i].x * weightValues[3].x; |
||||
curOutputValues.w += inputValues[i].y * weightValues[3].y; |
||||
curOutputValues.w += inputValues[i].z * weightValues[3].z; |
||||
curOutputValues.w += inputValues[i].w * weightValues[3].w; |
||||
outputValues[i] = curOutputValues; |
||||
} |
||||
} |
||||
} |
||||
++inputLocation.y; |
||||
} |
||||
|
||||
// bias |
||||
packedOutputChannel += packedOuputChannelOffset; |
||||
short outputChannel = mul24(packedOutputChannel, 4); |
||||
float4 biasValues = vload4(0, biases + outputChannel); |
||||
for (short i = 0; i < 5; ++i) { |
||||
outputValues[i] += biasValues; |
||||
} |
||||
|
||||
// activation |
||||
switch (neuron) { |
||||
case 1: |
||||
for (short i = 0; i < 5; ++i) { |
||||
outputValues[i] = max(outputValues[i], 0.0f); |
||||
} |
||||
break; |
||||
case 2: |
||||
for (short i = 0; i < 5; ++i) { |
||||
outputValues[i] = a * tanh(b * outputValues[i]); |
||||
} |
||||
break; |
||||
case 3: |
||||
for (short i = 0; i < 5; ++i) { |
||||
outputValues[i] = native_recip(1.0f + native_exp(-a * outputValues[i] + b)); |
||||
} |
||||
break; |
||||
case 4: |
||||
for (short i = 0; i < 5; ++i) { |
||||
outputValues[i] = max(outputValues[i], min_clamp); |
||||
outputValues[i] = min(outputValues[i], max_clamp); |
||||
} |
||||
break; |
||||
case 5: |
||||
for (short i = 0; i < 5; ++i) { |
||||
outputValues[i] = max(outputValues[i], 0.0f) + a * (native_exp(min(outputValues[i], 0.0f)) - 1.0f); |
||||
} |
||||
break; |
||||
} |
||||
|
||||
// output |
||||
int2 outputLocation; |
||||
short outputColumn = startOutputColumn; |
||||
outputLocation.y = outputRow; |
||||
for (short i = 0; i < 5; ++i) { |
||||
outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel); |
||||
if (outputColumn < numOutputColumns) { |
||||
write_imagef(output, outputLocation, outputValues[i]); |
||||
} |
||||
++outputColumn; |
||||
} |
||||
} |
||||
__kernel void convolution_horizontal_reduced_reads_5_outputs( |
||||
#include "convolution_.cl" |
||||
|
@ -1,101 +1,5 @@ |
||||
__kernel void convolution_horizontal_reduced_reads_depthwise( |
||||
read_only image2d_t input, |
||||
short totalNumPackedChannels, |
||||
read_only image2d_t weights, __constant float *biases, |
||||
short filterSizeX, short filterSizeY, |
||||
write_only image2d_t output, |
||||
short paddingX, short paddingY, short strideX, short strideY, |
||||
short dilationX, short dilationY, |
||||
short neuron, float a, float b, float min_clamp, float max_clamp, |
||||
short numOutputColumns) { |
||||
|
||||
// init |
||||
const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; |
||||
short packedChannel = get_global_id(0); |
||||
short startOutputColumn = mul24((short)get_global_id(1), 4); |
||||
short outputRow = get_global_id(2); |
||||
short startXForChannel = mad24(mad24(startOutputColumn, strideX, -paddingX), |
||||
totalNumPackedChannels, packedChannel); |
||||
short strideWithChannels = mul24(strideX, totalNumPackedChannels); |
||||
|
||||
float4 outputValues[4]; |
||||
for (short i = 0; i < 4; ++i) { |
||||
outputValues[i] = (float4)(0, 0, 0, 0); |
||||
} |
||||
|
||||
int2 inputLocation; |
||||
inputLocation.y = mad24(outputRow, strideY, -paddingY); |
||||
|
||||
int2 weightLocation; |
||||
weightLocation.x = 0; |
||||
weightLocation.y = packedChannel; |
||||
#define DEPTHWISE |
||||
#define SUPPORT_DILATION |
||||
|
||||
// convolution |
||||
for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) { |
||||
for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) { |
||||
short dilatedStepX = mul24(totalNumPackedChannels, dilationX); |
||||
inputLocation.x = mad24(rfColumn, dilatedStepX, startXForChannel); |
||||
float4 inputValues[4]; |
||||
for (short i = 0; i < 4; ++i) { |
||||
inputValues[i] = read_imagef(input, smp, inputLocation); |
||||
inputLocation.x += strideWithChannels; |
||||
} |
||||
float4 weightValues = read_imagef(weights, smp, weightLocation); |
||||
++weightLocation.x; |
||||
outputValues[0] += inputValues[0] * weightValues; |
||||
outputValues[1] += inputValues[1] * weightValues; |
||||
outputValues[2] += inputValues[2] * weightValues; |
||||
outputValues[3] += inputValues[3] * weightValues; |
||||
} |
||||
inputLocation.y += dilationY; |
||||
} |
||||
|
||||
// bias |
||||
short outputChannel = mul24(packedChannel, 4); |
||||
float4 biasValues = vload4(0, biases + outputChannel); |
||||
for (short i = 0; i < 4; ++i) { |
||||
outputValues[i] += biasValues; |
||||
} |
||||
|
||||
// activation |
||||
switch (neuron) { |
||||
case 1: |
||||
for (short i = 0; i < 4; ++i) { |
||||
outputValues[i] = max(outputValues[i], 0.0f); |
||||
} |
||||
break; |
||||
case 2: |
||||
for (short i = 0; i < 4; ++i) { |
||||
outputValues[i] = a * tanh(b * outputValues[i]); |
||||
} |
||||
break; |
||||
case 3: |
||||
for (short i = 0; i < 4; ++i) { |
||||
outputValues[i] = native_recip(1.0f + native_exp(-a * outputValues[i] + b)); |
||||
} |
||||
break; |
||||
case 4: |
||||
for (short i = 0; i < 4; ++i) { |
||||
outputValues[i] = max(outputValues[i], min_clamp); |
||||
outputValues[i] = min(outputValues[i], max_clamp); |
||||
} |
||||
break; |
||||
case 5: |
||||
for (short i = 0; i < 4; ++i) { |
||||
outputValues[i] = max(outputValues[i], 0.0f) + a * (native_exp(min(outputValues[i], 0.0f)) - 1.0f); |
||||
} |
||||
break; |
||||
} |
||||
|
||||
// output |
||||
int2 outputLocation; |
||||
short outputColumn = startOutputColumn; |
||||
outputLocation.y = outputRow; |
||||
for (short i = 0; i < 4; ++i) { |
||||
outputLocation.x = mad24(outputColumn, totalNumPackedChannels, packedChannel); |
||||
if (outputColumn < numOutputColumns) { |
||||
write_imagef(output, outputLocation, outputValues[i]); |
||||
} |
||||
++outputColumn; |
||||
} |
||||
} |
||||
__kernel void convolution_horizontal_reduced_reads_depthwise( |
||||
#include "convolution_.cl" |
||||
|
@ -1,103 +1,4 @@ |
||||
__kernel void convolution_horizontal_reduced_reads_depthwise_stride_1( |
||||
read_only image2d_t input, |
||||
short totalNumPackedChannels, |
||||
read_only image2d_t weights, __constant float *biases, |
||||
short filterSizeX, short filterSizeY, |
||||
write_only image2d_t output, |
||||
short paddingX, short paddingY, short strideX, short strideY, |
||||
short neuron, float a, float b, float min_clamp, float max_clamp, |
||||
short numOutputColumns) { |
||||
|
||||
// init |
||||
const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; |
||||
short packedChannel = get_global_id(0); |
||||
short startOutputColumn = mul24((short)get_global_id(1), 4); |
||||
short outputRow = get_global_id(2); |
||||
short startXForChannel = mad24(mad24(startOutputColumn, strideX, -paddingX), |
||||
totalNumPackedChannels, packedChannel); |
||||
|
||||
float4 outputValues[4]; |
||||
for (short i = 0; i < 4; ++i) { |
||||
outputValues[i] = (float4)(0, 0, 0, 0); |
||||
} |
||||
|
||||
int2 inputLocation; |
||||
inputLocation.y = mad24(outputRow, strideY, -paddingY); |
||||
|
||||
int2 weightLocation; |
||||
weightLocation.x = 0; |
||||
weightLocation.y = packedChannel; |
||||
#define DEPTHWISE |
||||
|
||||
// convolution |
||||
for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) { |
||||
float4 inputValues[4]; |
||||
inputLocation.x = startXForChannel; |
||||
for (short i = 1; i < 4; ++i) { |
||||
inputValues[i] = read_imagef(input, smp, inputLocation); |
||||
inputLocation.x += totalNumPackedChannels; |
||||
} |
||||
for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) { |
||||
inputValues[0] = inputValues[1]; |
||||
inputValues[1] = inputValues[2]; |
||||
inputValues[2] = inputValues[3]; |
||||
inputValues[3] = read_imagef(input, smp, inputLocation); |
||||
inputLocation.x += totalNumPackedChannels; |
||||
float4 weightValues = read_imagef(weights, smp, weightLocation); |
||||
++weightLocation.x; |
||||
outputValues[0] += inputValues[0] * weightValues; |
||||
outputValues[1] += inputValues[1] * weightValues; |
||||
outputValues[2] += inputValues[2] * weightValues; |
||||
outputValues[3] += inputValues[3] * weightValues; |
||||
} |
||||
++inputLocation.y; |
||||
} |
||||
|
||||
// bias |
||||
short outputChannel = mul24(packedChannel, 4); |
||||
float4 biasValues = vload4(0, biases + outputChannel); |
||||
for (short i = 0; i < 4; ++i) { |
||||
outputValues[i] += biasValues; |
||||
} |
||||
|
||||
// activation |
||||
switch (neuron) { |
||||
case 1: |
||||
for (short i = 0; i < 4; ++i) { |
||||
outputValues[i] = max(outputValues[i], 0.0f); |
||||
} |
||||
break; |
||||
case 2: |
||||
for (short i = 0; i < 4; ++i) { |
||||
outputValues[i] = a * tanh(b * outputValues[i]); |
||||
} |
||||
break; |
||||
case 3: |
||||
for (short i = 0; i < 4; ++i) { |
||||
outputValues[i] = native_recip(1.0f + native_exp(-a * outputValues[i] + b)); |
||||
} |
||||
break; |
||||
case 4: |
||||
for (short i = 0; i < 4; ++i) { |
||||
outputValues[i] = max(outputValues[i], min_clamp); |
||||
outputValues[i] = min(outputValues[i], max_clamp); |
||||
} |
||||
break; |
||||
case 5: |
||||
for (short i = 0; i < 4; ++i) { |
||||
outputValues[i] = max(outputValues[i], 0.0f) + a * (native_exp(min(outputValues[i], 0.0f)) - 1.0f); |
||||
} |
||||
break; |
||||
} |
||||
|
||||
// output |
||||
int2 outputLocation; |
||||
short outputColumn = startOutputColumn; |
||||
outputLocation.y = outputRow; |
||||
for (short i = 0; i < 4; ++i) { |
||||
outputLocation.x = mad24(outputColumn, totalNumPackedChannels, packedChannel); |
||||
if (outputColumn < numOutputColumns) { |
||||
write_imagef(output, outputLocation, outputValues[i]); |
||||
} |
||||
++outputColumn; |
||||
} |
||||
} |
||||
__kernel void convolution_horizontal_reduced_reads_depthwise_stride_1( |
||||
#include "convolution_.cl" |
||||
|
Loading…
Reference in new issue