diff --git a/selfdrive/modeld/thneed/kernels/convolution_.cl b/selfdrive/modeld/thneed/kernels/convolution_.cl new file mode 100644 index 0000000000..1b9d74b83f --- /dev/null +++ b/selfdrive/modeld/thneed/kernels/convolution_.cl @@ -0,0 +1,272 @@ + read_only image2d_t input, +#ifndef DEPTHWISE + short startPackedInputChannel, + short numPackedInputChannelsForGroup, short totalNumPackedInputChannels, + // typo required for API compatibility + short packedOuputChannelOffset, short totalNumPackedOutputChannels, +#else + short totalNumPackedChannels, +#endif + read_only image2d_t weights, __constant float *biases, + short filterSizeX, short filterSizeY, + write_only image2d_t output, + short paddingX, short paddingY, short strideX, short strideY, +#ifdef SUPPORT_DILATION + short dilationX, short dilationY, +#endif + short neuron, float a, float b, float min_clamp, float max_clamp, +#ifndef DEPTHWISE + // note: these are not supported + __constant float *parameters, __constant float *batchNormBiases, +#endif + short numOutputColumns +#ifdef SUPPORT_ACCUMULATION + , short doAccumulate, read_only image2d_t accumulator +#endif + ) { + +#ifndef NUM_OUTPUTS + #define NUM_OUTPUTS 4 +#endif + + // init + const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + short packedOutputChannel = get_global_id(0); + short startOutputColumn = mul24((short)get_global_id(1), NUM_OUTPUTS); + short outputRow = get_global_id(2); + +#ifdef DEPTHWISE + short totalNumPackedInputChannels = totalNumPackedChannels; + short totalNumPackedOutputChannels = totalNumPackedChannels; + short startPackedInputChannel = packedOutputChannel; +#endif + + short startX = mad24(mad24(startOutputColumn, strideX, -paddingX), totalNumPackedInputChannels, startPackedInputChannel); + short strideWithChannels = mul24(strideX, totalNumPackedInputChannels); + + float4 outputValues[NUM_OUTPUTS]; + for (short i = 0; i < NUM_OUTPUTS; ++i) { + outputValues[i] = (float4)(0, 0, 0, 0); + } + + int2 inputLocation; + inputLocation.y = mad24(outputRow, strideY, -paddingY); + + int2 weightLocation; + weightLocation.x = 0; + weightLocation.y = packedOutputChannel; + +#ifdef DEPTHWISE + +#ifdef SUPPORT_DILATION + + // depthwise convolution + for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) { + for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) { + short dilatedStepX = mul24(totalNumPackedChannels, dilationX); + inputLocation.x = mad24(rfColumn, dilatedStepX, startX); + float4 inputValues[4]; + for (short i = 0; i < 4; ++i) { + inputValues[i] = read_imagef(input, smp, inputLocation); + inputLocation.x += strideWithChannels; + } + float4 weightValues = read_imagef(weights, smp, weightLocation); + ++weightLocation.x; + outputValues[0] += inputValues[0] * weightValues; + outputValues[1] += inputValues[1] * weightValues; + outputValues[2] += inputValues[2] * weightValues; + outputValues[3] += inputValues[3] * weightValues; + } + inputLocation.y += dilationY; + } + +#else + + // depthwise unstrided convolution + for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) { + float4 inputValues[4]; + inputLocation.x = startX; + for (short i = 1; i < 4; ++i) { + inputValues[i] = read_imagef(input, smp, inputLocation); + inputLocation.x += totalNumPackedOutputChannels; + } + for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) { + inputValues[0] = inputValues[1]; + inputValues[1] = inputValues[2]; + inputValues[2] = inputValues[3]; + inputValues[3] = read_imagef(input, smp, inputLocation); + inputLocation.x += totalNumPackedChannels; + float4 weightValues = read_imagef(weights, smp, weightLocation); + ++weightLocation.x; + outputValues[0] += inputValues[0] * weightValues; + outputValues[1] += inputValues[1] * weightValues; + outputValues[2] += inputValues[2] * weightValues; + outputValues[3] += inputValues[3] * weightValues; + } + ++inputLocation.y; + } + +#endif + +#elif defined(ONLY_1X1_CONV) + + // 1x1 convolution + short endPackedInputChannel = startPackedInputChannel + numPackedInputChannelsForGroup; + for (short packedInputChannel = startPackedInputChannel; packedInputChannel < endPackedInputChannel; ++packedInputChannel) { + float4 weightValues[4]; + for (short outChIdx = 0; outChIdx < 4; ++outChIdx) { + weightValues[outChIdx] = read_imagef(weights, smp, weightLocation); + ++weightLocation.x; + } + + inputLocation.x = startX + packedInputChannel; + float4 inputValues[NUM_OUTPUTS]; + for (short i = 0; i < NUM_OUTPUTS; ++i) { + inputValues[i] = read_imagef(input, smp, inputLocation); + inputLocation.x += strideWithChannels; + } + + for (short i = 0; i < NUM_OUTPUTS; ++i) { + float4 curOutputValues = outputValues[i]; + curOutputValues.x += inputValues[i].x * weightValues[0].x; + curOutputValues.x += inputValues[i].y * weightValues[0].y; + curOutputValues.x += inputValues[i].z * weightValues[0].z; + curOutputValues.x += inputValues[i].w * weightValues[0].w; + curOutputValues.y += inputValues[i].x * weightValues[1].x; + curOutputValues.y += inputValues[i].y * weightValues[1].y; + curOutputValues.y += inputValues[i].z * weightValues[1].z; + curOutputValues.y += inputValues[i].w * weightValues[1].w; + curOutputValues.z += inputValues[i].x * weightValues[2].x; + curOutputValues.z += inputValues[i].y * weightValues[2].y; + curOutputValues.z += inputValues[i].z * weightValues[2].z; + curOutputValues.z += inputValues[i].w * weightValues[2].w; + curOutputValues.w += inputValues[i].x * weightValues[3].x; + curOutputValues.w += inputValues[i].y * weightValues[3].y; + curOutputValues.w += inputValues[i].z * weightValues[3].z; + curOutputValues.w += inputValues[i].w * weightValues[3].w; + outputValues[i] = curOutputValues; + } + } + packedOutputChannel += packedOuputChannelOffset; + +#else + + // normal convolution + for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) { + for (short packedInputChannel = 0; packedInputChannel < numPackedInputChannelsForGroup; ++packedInputChannel) { + short startXForChannel = startX + packedInputChannel; + for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) { + + float4 weightValues[4]; + for (short outChIdx = 0; outChIdx < 4; ++outChIdx) { + weightValues[outChIdx] = read_imagef(weights, smp, weightLocation); + ++weightLocation.x; + } + +#ifdef SUPPORT_DILATION + short dilatedStepX = mul24(totalNumPackedInputChannels, dilationX); + inputLocation.x = mad24(rfColumn, dilatedStepX, startXForChannel); +#else + inputLocation.x = mad24(rfColumn, totalNumPackedInputChannels, startXForChannel); +#endif + float4 inputValues[NUM_OUTPUTS]; + for (short i = 0; i < NUM_OUTPUTS; ++i) { + inputValues[i] = read_imagef(input, smp, inputLocation); + inputLocation.x += strideWithChannels; + } + for (short i = 0; i < NUM_OUTPUTS; ++i) { + float4 curOutputValues = outputValues[i]; + curOutputValues.x += inputValues[i].x * weightValues[0].x; + curOutputValues.x += inputValues[i].y * weightValues[0].y; + curOutputValues.x += inputValues[i].z * weightValues[0].z; + curOutputValues.x += inputValues[i].w * weightValues[0].w; + curOutputValues.y += inputValues[i].x * weightValues[1].x; + curOutputValues.y += inputValues[i].y * weightValues[1].y; + curOutputValues.y += inputValues[i].z * weightValues[1].z; + curOutputValues.y += inputValues[i].w * weightValues[1].w; + curOutputValues.z += inputValues[i].x * weightValues[2].x; + curOutputValues.z += inputValues[i].y * weightValues[2].y; + curOutputValues.z += inputValues[i].z * weightValues[2].z; + curOutputValues.z += inputValues[i].w * weightValues[2].w; + curOutputValues.w += inputValues[i].x * weightValues[3].x; + curOutputValues.w += inputValues[i].y * weightValues[3].y; + curOutputValues.w += inputValues[i].z * weightValues[3].z; + curOutputValues.w += inputValues[i].w * weightValues[3].w; + outputValues[i] = curOutputValues; + } + } + } +#ifdef SUPPORT_DILATION + inputLocation.y += dilationY; +#else + ++inputLocation.y; +#endif + } + packedOutputChannel += packedOuputChannelOffset; +#endif + + // bias + short outputChannel = mul24(packedOutputChannel, 4); + float4 biasValues = vload4(0, biases + outputChannel); + for (short i = 0; i < NUM_OUTPUTS; ++i) { + outputValues[i] += biasValues; + } + +#ifdef SUPPORT_ACCUMULATION + // accumulate + if (doAccumulate) { + int2 outputLocation; + short outputColumn = startOutputColumn; + outputLocation.y = outputRow; + for (short i = 0; i < NUM_OUTPUTS; ++i) { + outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel); + if (outputColumn < numOutputColumns) { + outputValues[i] += read_imagef(accumulator, smp, outputLocation); + } + ++outputColumn; + } + } +#endif + + // activation + switch (neuron) { + case 1: + for (short i = 0; i < NUM_OUTPUTS; ++i) { + outputValues[i] = max(outputValues[i], 0.0f); + } + break; + case 2: + for (short i = 0; i < NUM_OUTPUTS; ++i) { + outputValues[i] = a * tanh(b * outputValues[i]); + } + break; + case 3: + for (short i = 0; i < NUM_OUTPUTS; ++i) { + outputValues[i] = native_recip(1.0f + native_exp(-a * outputValues[i] + b)); + } + break; + case 4: + for (short i = 0; i < NUM_OUTPUTS; ++i) { + outputValues[i] = max(outputValues[i], min_clamp); + outputValues[i] = min(outputValues[i], max_clamp); + } + break; + case 5: + for (short i = 0; i < NUM_OUTPUTS; ++i) { + outputValues[i] = max(outputValues[i], 0.0f) + a * (native_exp(min(outputValues[i], 0.0f)) - 1.0f); + } + break; + } + + // output + int2 outputLocation; + short outputColumn = startOutputColumn; + outputLocation.y = outputRow; + for (short i = 0; i < NUM_OUTPUTS; ++i) { + outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel); + if (outputColumn < numOutputColumns) { + write_imagef(output, outputLocation, outputValues[i]); + } + ++outputColumn; + } +} diff --git a/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads.cl b/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads.cl index 1cd077aecb..bc8add79aa 100644 --- a/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads.cl +++ b/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads.cl @@ -1,129 +1,4 @@ -__kernel void convolution_horizontal_reduced_reads( - read_only image2d_t input, - short startPackedInputChannel, - short numPackedInputChannelsForGroup, short totalNumPackedInputChannels, - short packedOuputChannelOffset, short totalNumPackedOutputChannels, - read_only image2d_t weights, __constant float *biases, - short filterSizeX, short filterSizeY, - write_only image2d_t output, - short paddingX, short paddingY, short strideX, short strideY, - short dilationX, short dilationY, - short neuron, float a, float b, float min_clamp, float max_clamp, - __constant float *parameters, __constant float *batchNormBiases, - short numOutputColumns) { - - // init - const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - short packedOutputChannel = get_global_id(0); - short startOutputColumn = mul24((short)get_global_id(1), 4); - short outputRow = get_global_id(2); - short startX = mad24(mad24(startOutputColumn, strideX, -paddingX), - totalNumPackedInputChannels, startPackedInputChannel); - short strideWithChannels = mul24(strideX, totalNumPackedInputChannels); - - float4 outputValues[4]; - for (short i = 0; i < 4; ++i) { - outputValues[i] = (float4)(0, 0, 0, 0); - } - - int2 inputLocation; - inputLocation.y = mad24(outputRow, strideY, -paddingY); - - int2 weightLocation; - weightLocation.x = 0; - weightLocation.y = packedOutputChannel; +#define SUPPORT_DILATION - // convolution - for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) { - for (short packedInputChannel = 0; - packedInputChannel < numPackedInputChannelsForGroup; - ++packedInputChannel) { - short startXForChannel = startX + packedInputChannel; - for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) { - float4 weightValues[4]; - for (short outChIdx = 0; outChIdx < 4; ++outChIdx) { - weightValues[outChIdx] = read_imagef(weights, smp, weightLocation); - ++weightLocation.x; - } - short dilatedStepX = mul24(totalNumPackedInputChannels, dilationX); - inputLocation.x = mad24(rfColumn, dilatedStepX, startXForChannel); - float4 inputValues[4]; - for (short i = 0; i < 4; ++i) { - inputValues[i] = read_imagef(input, smp, inputLocation); - inputLocation.x += strideWithChannels; - } - for (short i = 0; i < 4; ++i) { - float4 curOutputValues = outputValues[i]; - curOutputValues.x += inputValues[i].x * weightValues[0].x; - curOutputValues.x += inputValues[i].y * weightValues[0].y; - curOutputValues.x += inputValues[i].z * weightValues[0].z; - curOutputValues.x += inputValues[i].w * weightValues[0].w; - curOutputValues.y += inputValues[i].x * weightValues[1].x; - curOutputValues.y += inputValues[i].y * weightValues[1].y; - curOutputValues.y += inputValues[i].z * weightValues[1].z; - curOutputValues.y += inputValues[i].w * weightValues[1].w; - curOutputValues.z += inputValues[i].x * weightValues[2].x; - curOutputValues.z += inputValues[i].y * weightValues[2].y; - curOutputValues.z += inputValues[i].z * weightValues[2].z; - curOutputValues.z += inputValues[i].w * weightValues[2].w; - curOutputValues.w += inputValues[i].x * weightValues[3].x; - curOutputValues.w += inputValues[i].y * weightValues[3].y; - curOutputValues.w += inputValues[i].z * weightValues[3].z; - curOutputValues.w += inputValues[i].w * weightValues[3].w; - outputValues[i] = curOutputValues; - } - } - } - inputLocation.y += dilationY; - } - - // bias - packedOutputChannel += packedOuputChannelOffset; - short outputChannel = mul24(packedOutputChannel, 4); - float4 biasValues = vload4(0, biases + outputChannel); - for (short i = 0; i < 4; ++i) { - outputValues[i] += biasValues; - } - - // activation - switch (neuron) { - case 1: - for (short i = 0; i < 4; ++i) { - outputValues[i] = max(outputValues[i], 0.0f); - } - break; - case 2: - for (short i = 0; i < 4; ++i) { - outputValues[i] = a * tanh(b * outputValues[i]); - } - break; - case 3: - for (short i = 0; i < 4; ++i) { - outputValues[i] = native_recip(1.0f + native_exp(-a * outputValues[i] + b)); - } - break; - case 4: - for (short i = 0; i < 4; ++i) { - outputValues[i] = max(outputValues[i], min_clamp); - outputValues[i] = min(outputValues[i], max_clamp); - } - break; - case 5: - for (short i = 0; i < 4; ++i) { - outputValues[i] = max(outputValues[i], 0.0f) + a * (native_exp(min(outputValues[i], 0.0f)) - 1.0f); - } - break; - } - - // output - int2 outputLocation; - short outputColumn = startOutputColumn; - outputLocation.y = outputRow; - for (short i = 0; i < 4; ++i) { - outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel); - if (outputColumn < numOutputColumns) { - write_imagef(output, outputLocation, outputValues[i]); - } - ++outputColumn; - } -} +__kernel void convolution_horizontal_reduced_reads( +#include "convolution_.cl" diff --git a/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads_1x1.cl b/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads_1x1.cl index bbadf50bfa..75a090ca22 100644 --- a/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads_1x1.cl +++ b/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads_1x1.cl @@ -1,140 +1,5 @@ -__kernel void convolution_horizontal_reduced_reads_1x1( - read_only image2d_t input, - short startPackedInputChannel, - short numPackedInputChannelsForGroup, short totalNumPackedInputChannels, - short packedOuputChannelOffset, short totalNumPackedOutputChannels, - read_only image2d_t weights, __constant float *biases, - short filterSizeX, short filterSizeY, - write_only image2d_t output, - short paddingX, short paddingY, short strideX, short strideY, - short neuron, float a, float b, float min_clamp, float max_clamp, - __constant float *parameters, __constant float *batchNormBiases, - short numOutputColumns, - short doAccumulate, read_only image2d_t accumulator) { - - // init - const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - short packedOutputChannel = get_global_id(0); - short startOutputColumn = mul24((short)get_global_id(1), 4); - short outputRow = get_global_id(2); - short endPackedInputChannel = startPackedInputChannel + numPackedInputChannelsForGroup; - short startX = mad24(mad24(startOutputColumn, strideX, -paddingX), - totalNumPackedInputChannels, startPackedInputChannel); - short strideWithChannels = mul24(strideX, totalNumPackedInputChannels); - - float4 outputValues[4]; - for (short i = 0; i < 4; ++i) { - outputValues[i] = (float4)(0, 0, 0, 0); - } - - int2 inputLocation; - inputLocation.y = mad24(outputRow, strideY, -paddingY); - - int2 weightLocation; - weightLocation.x = 0; - weightLocation.y = packedOutputChannel; - - // convolution - for (short packedInputChannel = startPackedInputChannel; - packedInputChannel < endPackedInputChannel; ++packedInputChannel) { - - float4 weightValues[4]; - for (short outChIdx = 0; outChIdx < 4; ++outChIdx) { - weightValues[outChIdx] = read_imagef(weights, smp, weightLocation); - ++weightLocation.x; - } - - inputLocation.x = startX + packedInputChannel; - float4 inputValues[4]; - for (short i = 0; i < 4; ++i) { - inputValues[i] = read_imagef(input, smp, inputLocation); - inputLocation.x += strideWithChannels; - } - - for (short i = 0; i < 4; ++i) { - float4 curOutputValues = outputValues[i]; - curOutputValues.x += inputValues[i].x * weightValues[0].x; - curOutputValues.x += inputValues[i].y * weightValues[0].y; - curOutputValues.x += inputValues[i].z * weightValues[0].z; - curOutputValues.x += inputValues[i].w * weightValues[0].w; - curOutputValues.y += inputValues[i].x * weightValues[1].x; - curOutputValues.y += inputValues[i].y * weightValues[1].y; - curOutputValues.y += inputValues[i].z * weightValues[1].z; - curOutputValues.y += inputValues[i].w * weightValues[1].w; - curOutputValues.z += inputValues[i].x * weightValues[2].x; - curOutputValues.z += inputValues[i].y * weightValues[2].y; - curOutputValues.z += inputValues[i].z * weightValues[2].z; - curOutputValues.z += inputValues[i].w * weightValues[2].w; - curOutputValues.w += inputValues[i].x * weightValues[3].x; - curOutputValues.w += inputValues[i].y * weightValues[3].y; - curOutputValues.w += inputValues[i].z * weightValues[3].z; - curOutputValues.w += inputValues[i].w * weightValues[3].w; - outputValues[i] = curOutputValues; - } - } - - // bias - packedOutputChannel += packedOuputChannelOffset; - short outputChannel = mul24(packedOutputChannel, 4); - float4 biasValues = vload4(0, biases + outputChannel); - for (short i = 0; i < 4; ++i) { - outputValues[i] += biasValues; - } - - // accumulate - if (doAccumulate) { - int2 outputLocation; - short outputColumn = startOutputColumn; - outputLocation.y = outputRow; - for (short i = 0; i < 4; ++i) { - outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel); - if (outputColumn < numOutputColumns) { - outputValues[i] += read_imagef(accumulator, smp, outputLocation); - } - ++outputColumn; - } - } - - // activation - switch (neuron) { - case 1: - for (short i = 0; i < 4; ++i) { - outputValues[i] = max(outputValues[i], 0.0f); - } - break; - case 2: - for (short i = 0; i < 4; ++i) { - outputValues[i] = a * tanh(b * outputValues[i]); - } - break; - case 3: - for (short i = 0; i < 4; ++i) { - outputValues[i] = native_recip(1.0f + native_exp(-a * outputValues[i] + b)); - } - break; - case 4: - for (short i = 0; i < 4; ++i) { - outputValues[i] = max(outputValues[i], min_clamp); - outputValues[i] = min(outputValues[i], max_clamp); - } - break; - case 5: - for (short i = 0; i < 4; ++i) { - outputValues[i] = max(outputValues[i], 0.0f) + a * (native_exp(min(outputValues[i], 0.0f)) - 1.0f); - } - break; - } - - // output - int2 outputLocation; - short outputColumn = startOutputColumn; - outputLocation.y = outputRow; - for (short i = 0; i < 4; ++i) { - outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel); - if (outputColumn < numOutputColumns) { - write_imagef(output, outputLocation, outputValues[i]); - } - ++outputColumn; - } -} +#define ONLY_1X1_CONV +#define SUPPORT_ACCUMULATION +__kernel void convolution_horizontal_reduced_reads_1x1( +#include "convolution_.cl" diff --git a/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads_5_outputs.cl b/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads_5_outputs.cl index 399e1a77cd..980e7d1f67 100644 --- a/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads_5_outputs.cl +++ b/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads_5_outputs.cl @@ -1,130 +1,4 @@ -__kernel void convolution_horizontal_reduced_reads_5_outputs( - read_only image2d_t input, - short startPackedInputChannel, - short numPackedInputChannelsForGroup, short totalNumPackedInputChannels, - short packedOuputChannelOffset, short totalNumPackedOutputChannels, - read_only image2d_t weights, __constant float *biases, - short filterSizeX, short filterSizeY, - write_only image2d_t output, - short paddingX, short paddingY, short strideX, short strideY, - short neuron, float a, float b, float min_clamp, float max_clamp, - __constant float *parameters, __constant float *batchNormBiases, - short numOutputColumns) { - - // init - const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - short packedOutputChannel = get_global_id(0); - short startOutputColumn = mul24((short)get_global_id(1), 5); - short outputRow = get_global_id(2); - short startX = mad24(mad24(startOutputColumn, strideX, -paddingX), - totalNumPackedInputChannels, startPackedInputChannel); - short strideWithChannels = mul24(strideX, totalNumPackedInputChannels); - - float4 outputValues[5]; - for (short i = 0; i < 5; ++i) { - outputValues[i] = (float4)(0, 0, 0, 0); - } - - int2 inputLocation; - inputLocation.y = mad24(outputRow, strideY, -paddingY); - - int2 weightLocation; - weightLocation.x = 0; - weightLocation.y = packedOutputChannel; - - // convolution - for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) { - for (short packedInputChannel = 0; - packedInputChannel < numPackedInputChannelsForGroup; - ++packedInputChannel) { - short startXForChannel = startX + packedInputChannel; - for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) { - float4 weightValues[4]; - for (short outChIdx = 0; outChIdx < 4; ++outChIdx) { - weightValues[outChIdx] = read_imagef(weights, smp, weightLocation); - ++weightLocation.x; - } - inputLocation.x = - mad24(rfColumn, totalNumPackedInputChannels, startXForChannel); +#define NUM_OUTPUTS 5 - float4 inputValues[5]; - for (short i = 0; i < 5; ++i) { - inputValues[i] = read_imagef(input, smp, inputLocation); - inputLocation.x += strideWithChannels; - } - - for (short i = 0; i < 5; ++i) { - float4 curOutputValues = outputValues[i]; - curOutputValues.x += inputValues[i].x * weightValues[0].x; - curOutputValues.x += inputValues[i].y * weightValues[0].y; - curOutputValues.x += inputValues[i].z * weightValues[0].z; - curOutputValues.x += inputValues[i].w * weightValues[0].w; - curOutputValues.y += inputValues[i].x * weightValues[1].x; - curOutputValues.y += inputValues[i].y * weightValues[1].y; - curOutputValues.y += inputValues[i].z * weightValues[1].z; - curOutputValues.y += inputValues[i].w * weightValues[1].w; - curOutputValues.z += inputValues[i].x * weightValues[2].x; - curOutputValues.z += inputValues[i].y * weightValues[2].y; - curOutputValues.z += inputValues[i].z * weightValues[2].z; - curOutputValues.z += inputValues[i].w * weightValues[2].w; - curOutputValues.w += inputValues[i].x * weightValues[3].x; - curOutputValues.w += inputValues[i].y * weightValues[3].y; - curOutputValues.w += inputValues[i].z * weightValues[3].z; - curOutputValues.w += inputValues[i].w * weightValues[3].w; - outputValues[i] = curOutputValues; - } - } - } - ++inputLocation.y; - } - - // bias - packedOutputChannel += packedOuputChannelOffset; - short outputChannel = mul24(packedOutputChannel, 4); - float4 biasValues = vload4(0, biases + outputChannel); - for (short i = 0; i < 5; ++i) { - outputValues[i] += biasValues; - } - - // activation - switch (neuron) { - case 1: - for (short i = 0; i < 5; ++i) { - outputValues[i] = max(outputValues[i], 0.0f); - } - break; - case 2: - for (short i = 0; i < 5; ++i) { - outputValues[i] = a * tanh(b * outputValues[i]); - } - break; - case 3: - for (short i = 0; i < 5; ++i) { - outputValues[i] = native_recip(1.0f + native_exp(-a * outputValues[i] + b)); - } - break; - case 4: - for (short i = 0; i < 5; ++i) { - outputValues[i] = max(outputValues[i], min_clamp); - outputValues[i] = min(outputValues[i], max_clamp); - } - break; - case 5: - for (short i = 0; i < 5; ++i) { - outputValues[i] = max(outputValues[i], 0.0f) + a * (native_exp(min(outputValues[i], 0.0f)) - 1.0f); - } - break; - } - - // output - int2 outputLocation; - short outputColumn = startOutputColumn; - outputLocation.y = outputRow; - for (short i = 0; i < 5; ++i) { - outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel); - if (outputColumn < numOutputColumns) { - write_imagef(output, outputLocation, outputValues[i]); - } - ++outputColumn; - } -} +__kernel void convolution_horizontal_reduced_reads_5_outputs( +#include "convolution_.cl" diff --git a/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads_depthwise.cl b/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads_depthwise.cl index b3ff16bcd0..80be0da924 100644 --- a/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads_depthwise.cl +++ b/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads_depthwise.cl @@ -1,101 +1,5 @@ -__kernel void convolution_horizontal_reduced_reads_depthwise( - read_only image2d_t input, - short totalNumPackedChannels, - read_only image2d_t weights, __constant float *biases, - short filterSizeX, short filterSizeY, - write_only image2d_t output, - short paddingX, short paddingY, short strideX, short strideY, - short dilationX, short dilationY, - short neuron, float a, float b, float min_clamp, float max_clamp, - short numOutputColumns) { - - // init - const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - short packedChannel = get_global_id(0); - short startOutputColumn = mul24((short)get_global_id(1), 4); - short outputRow = get_global_id(2); - short startXForChannel = mad24(mad24(startOutputColumn, strideX, -paddingX), - totalNumPackedChannels, packedChannel); - short strideWithChannels = mul24(strideX, totalNumPackedChannels); - - float4 outputValues[4]; - for (short i = 0; i < 4; ++i) { - outputValues[i] = (float4)(0, 0, 0, 0); - } - - int2 inputLocation; - inputLocation.y = mad24(outputRow, strideY, -paddingY); - - int2 weightLocation; - weightLocation.x = 0; - weightLocation.y = packedChannel; +#define DEPTHWISE +#define SUPPORT_DILATION - // convolution - for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) { - for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) { - short dilatedStepX = mul24(totalNumPackedChannels, dilationX); - inputLocation.x = mad24(rfColumn, dilatedStepX, startXForChannel); - float4 inputValues[4]; - for (short i = 0; i < 4; ++i) { - inputValues[i] = read_imagef(input, smp, inputLocation); - inputLocation.x += strideWithChannels; - } - float4 weightValues = read_imagef(weights, smp, weightLocation); - ++weightLocation.x; - outputValues[0] += inputValues[0] * weightValues; - outputValues[1] += inputValues[1] * weightValues; - outputValues[2] += inputValues[2] * weightValues; - outputValues[3] += inputValues[3] * weightValues; - } - inputLocation.y += dilationY; - } - - // bias - short outputChannel = mul24(packedChannel, 4); - float4 biasValues = vload4(0, biases + outputChannel); - for (short i = 0; i < 4; ++i) { - outputValues[i] += biasValues; - } - - // activation - switch (neuron) { - case 1: - for (short i = 0; i < 4; ++i) { - outputValues[i] = max(outputValues[i], 0.0f); - } - break; - case 2: - for (short i = 0; i < 4; ++i) { - outputValues[i] = a * tanh(b * outputValues[i]); - } - break; - case 3: - for (short i = 0; i < 4; ++i) { - outputValues[i] = native_recip(1.0f + native_exp(-a * outputValues[i] + b)); - } - break; - case 4: - for (short i = 0; i < 4; ++i) { - outputValues[i] = max(outputValues[i], min_clamp); - outputValues[i] = min(outputValues[i], max_clamp); - } - break; - case 5: - for (short i = 0; i < 4; ++i) { - outputValues[i] = max(outputValues[i], 0.0f) + a * (native_exp(min(outputValues[i], 0.0f)) - 1.0f); - } - break; - } - - // output - int2 outputLocation; - short outputColumn = startOutputColumn; - outputLocation.y = outputRow; - for (short i = 0; i < 4; ++i) { - outputLocation.x = mad24(outputColumn, totalNumPackedChannels, packedChannel); - if (outputColumn < numOutputColumns) { - write_imagef(output, outputLocation, outputValues[i]); - } - ++outputColumn; - } -} +__kernel void convolution_horizontal_reduced_reads_depthwise( +#include "convolution_.cl" diff --git a/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads_depthwise_stride_1.cl b/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads_depthwise_stride_1.cl index 0ac2b7796f..3d651c229b 100644 --- a/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads_depthwise_stride_1.cl +++ b/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads_depthwise_stride_1.cl @@ -1,103 +1,4 @@ -__kernel void convolution_horizontal_reduced_reads_depthwise_stride_1( - read_only image2d_t input, - short totalNumPackedChannels, - read_only image2d_t weights, __constant float *biases, - short filterSizeX, short filterSizeY, - write_only image2d_t output, - short paddingX, short paddingY, short strideX, short strideY, - short neuron, float a, float b, float min_clamp, float max_clamp, - short numOutputColumns) { - - // init - const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - short packedChannel = get_global_id(0); - short startOutputColumn = mul24((short)get_global_id(1), 4); - short outputRow = get_global_id(2); - short startXForChannel = mad24(mad24(startOutputColumn, strideX, -paddingX), - totalNumPackedChannels, packedChannel); - - float4 outputValues[4]; - for (short i = 0; i < 4; ++i) { - outputValues[i] = (float4)(0, 0, 0, 0); - } - - int2 inputLocation; - inputLocation.y = mad24(outputRow, strideY, -paddingY); - - int2 weightLocation; - weightLocation.x = 0; - weightLocation.y = packedChannel; +#define DEPTHWISE - // convolution - for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) { - float4 inputValues[4]; - inputLocation.x = startXForChannel; - for (short i = 1; i < 4; ++i) { - inputValues[i] = read_imagef(input, smp, inputLocation); - inputLocation.x += totalNumPackedChannels; - } - for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) { - inputValues[0] = inputValues[1]; - inputValues[1] = inputValues[2]; - inputValues[2] = inputValues[3]; - inputValues[3] = read_imagef(input, smp, inputLocation); - inputLocation.x += totalNumPackedChannels; - float4 weightValues = read_imagef(weights, smp, weightLocation); - ++weightLocation.x; - outputValues[0] += inputValues[0] * weightValues; - outputValues[1] += inputValues[1] * weightValues; - outputValues[2] += inputValues[2] * weightValues; - outputValues[3] += inputValues[3] * weightValues; - } - ++inputLocation.y; - } - - // bias - short outputChannel = mul24(packedChannel, 4); - float4 biasValues = vload4(0, biases + outputChannel); - for (short i = 0; i < 4; ++i) { - outputValues[i] += biasValues; - } - - // activation - switch (neuron) { - case 1: - for (short i = 0; i < 4; ++i) { - outputValues[i] = max(outputValues[i], 0.0f); - } - break; - case 2: - for (short i = 0; i < 4; ++i) { - outputValues[i] = a * tanh(b * outputValues[i]); - } - break; - case 3: - for (short i = 0; i < 4; ++i) { - outputValues[i] = native_recip(1.0f + native_exp(-a * outputValues[i] + b)); - } - break; - case 4: - for (short i = 0; i < 4; ++i) { - outputValues[i] = max(outputValues[i], min_clamp); - outputValues[i] = min(outputValues[i], max_clamp); - } - break; - case 5: - for (short i = 0; i < 4; ++i) { - outputValues[i] = max(outputValues[i], 0.0f) + a * (native_exp(min(outputValues[i], 0.0f)) - 1.0f); - } - break; - } - - // output - int2 outputLocation; - short outputColumn = startOutputColumn; - outputLocation.y = outputRow; - for (short i = 0; i < 4; ++i) { - outputLocation.x = mad24(outputColumn, totalNumPackedChannels, packedChannel); - if (outputColumn < numOutputColumns) { - write_imagef(output, outputLocation, outputValues[i]); - } - ++outputColumn; - } -} +__kernel void convolution_horizontal_reduced_reads_depthwise_stride_1( +#include "convolution_.cl" diff --git a/selfdrive/modeld/thneed/optimizer.cc b/selfdrive/modeld/thneed/optimizer.cc index 0d1da18ec0..d5e6ce063c 100644 --- a/selfdrive/modeld/thneed/optimizer.cc +++ b/selfdrive/modeld/thneed/optimizer.cc @@ -80,7 +80,9 @@ int Thneed::optimize() { printf("building kernel %s\n", k->name.c_str()); k->program = clCreateProgramWithSource(context, 1, srcs, &length, NULL); - int err = clBuildProgram(k->program, 1, &device_id, "", NULL, NULL); + char options[0x100]; + snprintf(options, sizeof(options)-1, "-I %s", kernel_path); + int err = clBuildProgram(k->program, 1, &device_id, options, NULL, NULL); if (err != 0) { printf("got err %d\n", err);