modeld: delete unused SNPE stuff after move to tinygrad (#25635)
* delete unused stuff * remove CL interceptor from thneed since we don't use SNPE anymore * remove dead files from release * that's removed * oops, didn't save
parent
6e062ea5d2
commit
6c39382d71
18 changed files with 14 additions and 1158 deletions
@ -1,81 +0,0 @@ |
|||||||
// compile.cc: offline tool that loads an SNPE model, runs it once so thneed
// records the CL command queue, optionally optimizes the recording, then
// saves and replays the resulting .thneed file as a smoke test.
//
// usage: -i <input file> -o <output file> [--binary] [--optimize]
#include <cstdio>
#include <cstdlib>
#include <cstring>

#include <getopt.h>

#include "selfdrive/modeld/runners/snpemodel.h"
#include "selfdrive/modeld/thneed/thneed.h"
#include "system/hardware/hw.h"

#define TEMPORAL_SIZE 512
#define DESIRE_LEN 8
#define TRAFFIC_CONVENTION_LEN 2

// TODO: This should probably use SNPE directly.
int main(int argc, char* argv[]) {
  bool run_optimizer = false, save_binaries = false;
  const char *input_file = NULL, *output_file = NULL;

  static struct option long_options[] = {
    {"in", required_argument, 0, 'i' },
    {"out", required_argument, 0, 'o' },
    {"binary", no_argument, 0, 'b' },
    {"optimize", no_argument, 0, 'f' },
    {0, 0, 0, 0 }
  };
  int long_index = 0, opt = 0;
  while ((opt = getopt_long_only(argc, argv, "", long_options, &long_index)) != -1) {
    switch (opt) {
      case 'i': input_file = optarg; break;
      case 'o': output_file = optarg; break;
      case 'b': save_binaries = true; break;
      case 'f': run_optimizer = true; break;
      default: break;
    }
  }

  // no input?
  if (!input_file) {
    printf("usage: -i <input file> -o <output file> --binary --optimize\n");
    return -1;
  }

#define OUTPUT_SIZE 0x10000

  // Buffers are sized generously since this tool doesn't know the real model
  // dimensions up front; they live until process exit.
  float *output = (float*)calloc(OUTPUT_SIZE, sizeof(float));
  float *input = (float*)calloc(0x1000000, sizeof(float));
  float *extra = (float*)calloc(0x1000000, sizeof(float));
  // fix: the original never checked these allocations before use
  if (!output || !input || !extra) {
    printf("allocation failed\n");
    return -1;
  }

  SNPEModel mdl(input_file, output, 0, USE_GPU_RUNTIME, true);
  mdl.thneed->run_optimizer = run_optimizer;

  float state[TEMPORAL_SIZE] = {0};
  float desire[DESIRE_LEN] = {0};
  float traffic_convention[TRAFFIC_CONVENTION_LEN] = {0};

  mdl.addRecurrent(state, TEMPORAL_SIZE);
  mdl.addDesire(desire, DESIRE_LEN);
  mdl.addTrafficConvention(traffic_convention, TRAFFIC_CONVENTION_LEN);
  mdl.addImage(input, 0);
  mdl.addExtra(extra, 0);

  // first run: this is the execution thneed records
  printf("************** execute 1 **************\n");
  memset(output, 0, OUTPUT_SIZE * sizeof(float));
  mdl.execute();

  // don't save?
  if (!output_file) {
    printf("no output file, exiting\n");
    return 0;
  }

  // save model
  printf("saving %s with binary %d\n", output_file, save_binaries);
  mdl.thneed->save(output_file, save_binaries);

  // test model: reload what was just written and replay it once
  auto thneed = new Thneed(true);
  thneed->record = false;
  thneed->load(output_file);
  thneed->clexec();
  thneed->find_inputs_outputs();

  return 0;
}
@ -1,272 +0,0 @@ |
|||||||
// Shared body of the convolution kernel family (OpenCL C). This file is
// appended at build time to a small per-variant header that sets the
// configuration macros (DEPTHWISE, ONLY_1X1_CONV, SUPPORT_DILATION,
// SUPPORT_ACCUMULATION, NUM_OUTPUTS) and opens the __kernel declaration,
// so the file intentionally begins with the parameter list.
    read_only image2d_t input,
#ifndef DEPTHWISE
    short startPackedInputChannel,
    short numPackedInputChannelsForGroup, short totalNumPackedInputChannels,
    // typo required for API compatibility
    short packedOuputChannelOffset, short totalNumPackedOutputChannels,
#else
    short totalNumPackedChannels,
#endif
    read_only image2d_t weights, __constant float *biases,
    short filterSizeX, short filterSizeY,
    write_only image2d_t output,
    short paddingX, short paddingY, short strideX, short strideY,
#ifdef SUPPORT_DILATION
    short dilationX, short dilationY,
#endif
    short neuron, float a, float b, float min_clamp, float max_clamp,
#ifndef DEPTHWISE
    // note: these are not supported
    __constant float *parameters, __constant float *batchNormBiases,
#endif
    short numOutputColumns
#ifdef SUPPORT_ACCUMULATION
    , short doAccumulate, read_only image2d_t accumulator
#endif
) {

#ifndef NUM_OUTPUTS
#define NUM_OUTPUTS 4
#endif

  // init: each work item produces NUM_OUTPUTS horizontally adjacent float4
  // output pixels for one packed (4-wide) output channel
  const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
  short packedOutputChannel = get_global_id(0);
  short startOutputColumn = mul24((short)get_global_id(1), NUM_OUTPUTS);
  short outputRow = get_global_id(2);

#ifdef DEPTHWISE
  // depthwise: one packed input channel maps to one packed output channel
  short totalNumPackedInputChannels = totalNumPackedChannels;
  short totalNumPackedOutputChannels = totalNumPackedChannels;
  short startPackedInputChannel = packedOutputChannel;
#endif

  short startX = mad24(mad24(startOutputColumn, strideX, -paddingX), totalNumPackedInputChannels, startPackedInputChannel);
  short strideWithChannels = mul24(strideX, totalNumPackedInputChannels);

  float4 outputValues[NUM_OUTPUTS];
  for (short i = 0; i < NUM_OUTPUTS; ++i) {
    outputValues[i] = (float4)(0, 0, 0, 0);
  }

  int2 inputLocation;
  inputLocation.y = mad24(outputRow, strideY, -paddingY);

  int2 weightLocation;
  weightLocation.x = 0;
  weightLocation.y = packedOutputChannel;

#ifdef DEPTHWISE

#ifdef SUPPORT_DILATION

  // depthwise convolution
  for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) {
    for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) {
      short dilatedStepX = mul24(totalNumPackedChannels, dilationX);
      inputLocation.x = mad24(rfColumn, dilatedStepX, startX);
      float4 inputValues[4];
      for (short i = 0; i < 4; ++i) {
        inputValues[i] = read_imagef(input, smp, inputLocation);
        inputLocation.x += strideWithChannels;
      }
      float4 weightValues = read_imagef(weights, smp, weightLocation);
      ++weightLocation.x;
      outputValues[0] += inputValues[0] * weightValues;
      outputValues[1] += inputValues[1] * weightValues;
      outputValues[2] += inputValues[2] * weightValues;
      outputValues[3] += inputValues[3] * weightValues;
    }
    inputLocation.y += dilationY;
  }

#else

  // depthwise unstrided convolution: slide a 4-wide window of input pixels,
  // reading only one new pixel per filter column
  for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) {
    float4 inputValues[4];
    inputLocation.x = startX;
    for (short i = 1; i < 4; ++i) {
      inputValues[i] = read_imagef(input, smp, inputLocation);
      inputLocation.x += totalNumPackedOutputChannels;
    }
    for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) {
      inputValues[0] = inputValues[1];
      inputValues[1] = inputValues[2];
      inputValues[2] = inputValues[3];
      inputValues[3] = read_imagef(input, smp, inputLocation);
      inputLocation.x += totalNumPackedChannels;
      float4 weightValues = read_imagef(weights, smp, weightLocation);
      ++weightLocation.x;
      outputValues[0] += inputValues[0] * weightValues;
      outputValues[1] += inputValues[1] * weightValues;
      outputValues[2] += inputValues[2] * weightValues;
      outputValues[3] += inputValues[3] * weightValues;
    }
    ++inputLocation.y;
  }

#endif

#elif defined(ONLY_1X1_CONV)

  // 1x1 convolution
  short endPackedInputChannel = startPackedInputChannel + numPackedInputChannelsForGroup;
  for (short packedInputChannel = startPackedInputChannel; packedInputChannel < endPackedInputChannel; ++packedInputChannel) {
    // 4 weight rows -> one float4 output pixel (4 output channels)
    float4 weightValues[4];
    for (short outChIdx = 0; outChIdx < 4; ++outChIdx) {
      weightValues[outChIdx] = read_imagef(weights, smp, weightLocation);
      ++weightLocation.x;
    }

    inputLocation.x = startX + packedInputChannel;
    float4 inputValues[NUM_OUTPUTS];
    for (short i = 0; i < NUM_OUTPUTS; ++i) {
      inputValues[i] = read_imagef(input, smp, inputLocation);
      inputLocation.x += strideWithChannels;
    }

    for (short i = 0; i < NUM_OUTPUTS; ++i) {
      float4 curOutputValues = outputValues[i];
      curOutputValues.x += inputValues[i].x * weightValues[0].x;
      curOutputValues.x += inputValues[i].y * weightValues[0].y;
      curOutputValues.x += inputValues[i].z * weightValues[0].z;
      curOutputValues.x += inputValues[i].w * weightValues[0].w;
      curOutputValues.y += inputValues[i].x * weightValues[1].x;
      curOutputValues.y += inputValues[i].y * weightValues[1].y;
      curOutputValues.y += inputValues[i].z * weightValues[1].z;
      curOutputValues.y += inputValues[i].w * weightValues[1].w;
      curOutputValues.z += inputValues[i].x * weightValues[2].x;
      curOutputValues.z += inputValues[i].y * weightValues[2].y;
      curOutputValues.z += inputValues[i].z * weightValues[2].z;
      curOutputValues.z += inputValues[i].w * weightValues[2].w;
      curOutputValues.w += inputValues[i].x * weightValues[3].x;
      curOutputValues.w += inputValues[i].y * weightValues[3].y;
      curOutputValues.w += inputValues[i].z * weightValues[3].z;
      curOutputValues.w += inputValues[i].w * weightValues[3].w;
      outputValues[i] = curOutputValues;
    }
  }
  packedOutputChannel += packedOuputChannelOffset;

#else

  // normal convolution
  for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) {
    for (short packedInputChannel = 0; packedInputChannel < numPackedInputChannelsForGroup; ++packedInputChannel) {
      short startXForChannel = startX + packedInputChannel;
      for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) {

        float4 weightValues[4];
        for (short outChIdx = 0; outChIdx < 4; ++outChIdx) {
          weightValues[outChIdx] = read_imagef(weights, smp, weightLocation);
          ++weightLocation.x;
        }

#ifdef SUPPORT_DILATION
        short dilatedStepX = mul24(totalNumPackedInputChannels, dilationX);
        inputLocation.x = mad24(rfColumn, dilatedStepX, startXForChannel);
#else
        inputLocation.x = mad24(rfColumn, totalNumPackedInputChannels, startXForChannel);
#endif
        float4 inputValues[NUM_OUTPUTS];
        for (short i = 0; i < NUM_OUTPUTS; ++i) {
          inputValues[i] = read_imagef(input, smp, inputLocation);
          inputLocation.x += strideWithChannels;
        }
        for (short i = 0; i < NUM_OUTPUTS; ++i) {
          float4 curOutputValues = outputValues[i];
          curOutputValues.x += inputValues[i].x * weightValues[0].x;
          curOutputValues.x += inputValues[i].y * weightValues[0].y;
          curOutputValues.x += inputValues[i].z * weightValues[0].z;
          curOutputValues.x += inputValues[i].w * weightValues[0].w;
          curOutputValues.y += inputValues[i].x * weightValues[1].x;
          curOutputValues.y += inputValues[i].y * weightValues[1].y;
          curOutputValues.y += inputValues[i].z * weightValues[1].z;
          curOutputValues.y += inputValues[i].w * weightValues[1].w;
          curOutputValues.z += inputValues[i].x * weightValues[2].x;
          curOutputValues.z += inputValues[i].y * weightValues[2].y;
          curOutputValues.z += inputValues[i].z * weightValues[2].z;
          curOutputValues.z += inputValues[i].w * weightValues[2].w;
          curOutputValues.w += inputValues[i].x * weightValues[3].x;
          curOutputValues.w += inputValues[i].y * weightValues[3].y;
          curOutputValues.w += inputValues[i].z * weightValues[3].z;
          curOutputValues.w += inputValues[i].w * weightValues[3].w;
          outputValues[i] = curOutputValues;
        }
      }
    }
#ifdef SUPPORT_DILATION
    inputLocation.y += dilationY;
#else
    ++inputLocation.y;
#endif
  }
  packedOutputChannel += packedOuputChannelOffset;
#endif

  // bias
  short outputChannel = mul24(packedOutputChannel, 4);
  float4 biasValues = vload4(0, biases + outputChannel);
  for (short i = 0; i < NUM_OUTPUTS; ++i) {
    outputValues[i] += biasValues;
  }

#ifdef SUPPORT_ACCUMULATION
  // accumulate: optionally add a previous layer's output (thneed fuses
  // elementwise_sum into the conv this way)
  if (doAccumulate) {
    int2 outputLocation;
    short outputColumn = startOutputColumn;
    outputLocation.y = outputRow;
    for (short i = 0; i < NUM_OUTPUTS; ++i) {
      outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel);
      if (outputColumn < numOutputColumns) {
        outputValues[i] += read_imagef(accumulator, smp, outputLocation);
      }
      ++outputColumn;
    }
  }
#endif

  // activation (selected by the fused "neuron" argument)
  switch (neuron) {
    case 1:  // ReLU
      for (short i = 0; i < NUM_OUTPUTS; ++i) {
        outputValues[i] = max(outputValues[i], 0.0f);
      }
      break;
    case 2:  // scaled tanh
      for (short i = 0; i < NUM_OUTPUTS; ++i) {
        outputValues[i] = a * tanh(b * outputValues[i]);
      }
      break;
    case 3:  // sigmoid
      for (short i = 0; i < NUM_OUTPUTS; ++i) {
        outputValues[i] = native_recip(1.0f + native_exp(-a * outputValues[i] + b));
      }
      break;
    case 4:  // clamp
      for (short i = 0; i < NUM_OUTPUTS; ++i) {
        outputValues[i] = max(outputValues[i], min_clamp);
        outputValues[i] = min(outputValues[i], max_clamp);
      }
      break;
    case 5:  // ELU
      for (short i = 0; i < NUM_OUTPUTS; ++i) {
        outputValues[i] = max(outputValues[i], 0.0f) + a * (native_exp(min(outputValues[i], 0.0f)) - 1.0f);
      }
      break;
  }

  // output: write the NUM_OUTPUTS pixels, skipping columns past the edge
  int2 outputLocation;
  short outputColumn = startOutputColumn;
  outputLocation.y = outputRow;
  for (short i = 0; i < NUM_OUTPUTS; ++i) {
    outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel);
    if (outputColumn < numOutputColumns) {
      write_imagef(output, outputLocation, outputValues[i]);
    }
    ++outputColumn;
  }
}
@ -1,3 +0,0 @@ |
|||||||
#define SUPPORT_DILATION |
|
||||||
|
|
||||||
__kernel void convolution_horizontal_reduced_reads( |
|
@ -1,4 +0,0 @@ |
|||||||
#define ONLY_1X1_CONV |
|
||||||
#define SUPPORT_ACCUMULATION |
|
||||||
|
|
||||||
__kernel void convolution_horizontal_reduced_reads_1x1( |
|
@ -1,3 +0,0 @@ |
|||||||
#define NUM_OUTPUTS 5 |
|
||||||
|
|
||||||
__kernel void convolution_horizontal_reduced_reads_5_outputs( |
|
@ -1,4 +0,0 @@ |
|||||||
#define DEPTHWISE |
|
||||||
#define SUPPORT_DILATION |
|
||||||
|
|
||||||
__kernel void convolution_horizontal_reduced_reads_depthwise( |
|
@ -1,3 +0,0 @@ |
|||||||
#define DEPTHWISE |
|
||||||
|
|
||||||
__kernel void convolution_horizontal_reduced_reads_depthwise_stride_1( |
|
@ -1,261 +0,0 @@ |
|||||||
#include <map> |
|
||||||
#include <string> |
|
||||||
#include <string.h> |
|
||||||
#include <assert.h> |
|
||||||
#include "thneed.h" |
|
||||||
|
|
||||||
#include "common/util.h" |
|
||||||
#include "common/clutil.h" |
|
||||||
|
|
||||||
extern map<cl_program, string> g_program_source; |
|
||||||
|
|
||||||
/*static int is_same_size_image(cl_mem a, cl_mem b) {
|
|
||||||
size_t a_width, a_height, a_depth, a_array_size, a_row_pitch, a_slice_pitch; |
|
||||||
clGetImageInfo(a, CL_IMAGE_WIDTH, sizeof(a_width), &a_width, NULL); |
|
||||||
clGetImageInfo(a, CL_IMAGE_HEIGHT, sizeof(a_height), &a_height, NULL); |
|
||||||
clGetImageInfo(a, CL_IMAGE_DEPTH, sizeof(a_depth), &a_depth, NULL); |
|
||||||
clGetImageInfo(a, CL_IMAGE_ARRAY_SIZE, sizeof(a_array_size), &a_array_size, NULL); |
|
||||||
clGetImageInfo(a, CL_IMAGE_ROW_PITCH, sizeof(a_row_pitch), &a_row_pitch, NULL); |
|
||||||
clGetImageInfo(a, CL_IMAGE_SLICE_PITCH, sizeof(a_slice_pitch), &a_slice_pitch, NULL); |
|
||||||
|
|
||||||
size_t b_width, b_height, b_depth, b_array_size, b_row_pitch, b_slice_pitch; |
|
||||||
clGetImageInfo(b, CL_IMAGE_WIDTH, sizeof(b_width), &b_width, NULL); |
|
||||||
clGetImageInfo(b, CL_IMAGE_HEIGHT, sizeof(b_height), &b_height, NULL); |
|
||||||
clGetImageInfo(b, CL_IMAGE_DEPTH, sizeof(b_depth), &b_depth, NULL); |
|
||||||
clGetImageInfo(b, CL_IMAGE_ARRAY_SIZE, sizeof(b_array_size), &b_array_size, NULL); |
|
||||||
clGetImageInfo(b, CL_IMAGE_ROW_PITCH, sizeof(b_row_pitch), &b_row_pitch, NULL); |
|
||||||
clGetImageInfo(b, CL_IMAGE_SLICE_PITCH, sizeof(b_slice_pitch), &b_slice_pitch, NULL); |
|
||||||
|
|
||||||
return (a_width == b_width) && (a_height == b_height) && |
|
||||||
(a_depth == b_depth) && (a_array_size == b_array_size) && |
|
||||||
(a_row_pitch == b_row_pitch) && (a_slice_pitch == b_slice_pitch); |
|
||||||
}*/ |
|
||||||
|
|
||||||
// Create a fresh buffer-backed 2D image with the same format and geometry as
// val. Used by the optimizer to materialize a new intermediate when kernels
// are fused. The backing buffer is never freed here; it lives as long as the
// context (this runs once at compile time).
static cl_mem make_image_like(cl_context context, cl_mem val) {
  cl_image_format format;
  size_t width, height, row_pitch;
  clGetImageInfo(val, CL_IMAGE_FORMAT, sizeof(format), &format, NULL);
  // thneed only deals in RGBA fp16 images
  assert(format.image_channel_order == CL_RGBA);
  assert(format.image_channel_data_type == CL_HALF_FLOAT);
  clGetImageInfo(val, CL_IMAGE_WIDTH, sizeof(width), &width, NULL);
  clGetImageInfo(val, CL_IMAGE_HEIGHT, sizeof(height), &height, NULL);
  clGetImageInfo(val, CL_IMAGE_ROW_PITCH, sizeof(row_pitch), &row_pitch, NULL);

  cl_image_desc desc = {0};
  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
  desc.image_width = width;
  desc.image_height = height;
  desc.image_row_pitch = row_pitch;

  cl_mem buf = clCreateBuffer(context, CL_MEM_READ_WRITE, row_pitch*height, NULL, NULL);
  assert(buf != NULL);
  desc.buffer = buf;

  cl_int err;
  cl_mem tmp = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
  //printf("got %d for image %zux%zu %zu\n", err, width, height, row_pitch);
  // fix: the original captured err but never checked it
  assert(err == CL_SUCCESS);
  assert(tmp != NULL);

  return tmp;
}
||||||
|
|
||||||
// convolution_horizontal_reduced_reads_1x1 is 66% of the model runtime
|
|
||||||
// make that faster and the model gets faster
|
|
||||||
|
|
||||||
// this cuts ~2 ms off the model runtime right now
|
|
||||||
int Thneed::optimize() { |
|
||||||
const char *kernel_path = getenv("KERNEL_PATH"); |
|
||||||
if (!kernel_path) { kernel_path = "/data/openpilot/selfdrive/modeld/thneed/kernels"; printf("no KERNEL_PATH set, defaulting to %s\n", kernel_path); } |
|
||||||
|
|
||||||
string convolution_; |
|
||||||
{ |
|
||||||
char fn[0x100]; |
|
||||||
snprintf(fn, sizeof(fn), "%s/%s.cl", kernel_path, "convolution_"); |
|
||||||
convolution_ = util::read_file(fn); |
|
||||||
} |
|
||||||
|
|
||||||
// load custom kernels
|
|
||||||
map<string, cl_program> g_programs; |
|
||||||
for (auto &k : kq) { |
|
||||||
// replace program?
|
|
||||||
if (g_programs.find(k->name) == g_programs.end()) { |
|
||||||
char fn[0x100]; |
|
||||||
snprintf(fn, sizeof(fn), "%s/%s.cl", kernel_path, k->name.c_str()); |
|
||||||
if (util::file_exists(fn)) { |
|
||||||
string kernel_src = util::read_file(fn); |
|
||||||
if (k->name.rfind("convolution_", 0) == 0) { |
|
||||||
kernel_src += convolution_; |
|
||||||
} |
|
||||||
printf("building kernel %s with len %lu\n", k->name.c_str(), kernel_src.length()); |
|
||||||
k->program = cl_program_from_source(context, device_id, kernel_src); |
|
||||||
|
|
||||||
// save in cache
|
|
||||||
g_programs[k->name] = k->program; |
|
||||||
g_program_source[k->program] = kernel_src; |
|
||||||
} else { |
|
||||||
g_programs[k->name] = NULL; |
|
||||||
} |
|
||||||
} else { |
|
||||||
// cached replacement
|
|
||||||
if (g_programs[k->name] != NULL) { |
|
||||||
k->program = g_programs[k->name]; |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
// hack in accumulator to convolution_horizontal_reduced_reads_1x1
|
|
||||||
if (k->name == "convolution_horizontal_reduced_reads_1x1") { |
|
||||||
k->arg_names.push_back("doAccumulate"); |
|
||||||
short doAccumulate = 0; |
|
||||||
k->args.push_back(string((char *)&doAccumulate, sizeof(doAccumulate))); |
|
||||||
k->args_size.push_back(2); |
|
||||||
k->arg_names.push_back("accumulator"); |
|
||||||
k->args.push_back(k->args[k->get_arg_num("output")]); |
|
||||||
k->args_size.push_back(8); |
|
||||||
k->num_args += 2; |
|
||||||
} |
|
||||||
|
|
||||||
// assert that parameters + batchNormBiases are not used
|
|
||||||
// since they aren't supported in custom replacement kernels
|
|
||||||
if (k->name == "convolution_horizontal_reduced_reads_1x1" || |
|
||||||
k->name == "convolution_horizontal_reduced_reads" || |
|
||||||
k->name == "convolution_horizontal_reduced_reads_5_outputs") { |
|
||||||
string p1 = k->args[k->get_arg_num("parameters")]; |
|
||||||
string p2 = k->args[k->get_arg_num("batchNormBiases")]; |
|
||||||
assert(p1.length() == 8 && *((uint64_t*)p1.data()) == 0); |
|
||||||
assert(p2.length() == 8 && *((uint64_t*)p2.data()) == 0); |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
// optimizer
|
|
||||||
size_t start_size; |
|
||||||
do { |
|
||||||
start_size = kq.size(); |
|
||||||
|
|
||||||
// get optimizations
|
|
||||||
map<string, string> replacements; |
|
||||||
for (int i = 0; i < kq.size(); i++) { |
|
||||||
// fusing elementwise_sum + activate_image will save 3 enqueues
|
|
||||||
|
|
||||||
// delete useless copy layers
|
|
||||||
// saves ~0.7 ms
|
|
||||||
/*if (kq[i]->name == "concatenation" || kq[i]->name == "flatten") {
|
|
||||||
string in = kq[i]->args[kq[i]->get_arg_num("input")]; |
|
||||||
string out = kq[i]->args[kq[i]->get_arg_num("output")]; |
|
||||||
if (is_same_size_image(*(cl_mem*)in.data(), *(cl_mem*)out.data())) { |
|
||||||
cl_mem tmp = make_image_like(context, *(cl_mem *)in.data()); |
|
||||||
replacements[in] = string((char *)&tmp, sizeof(tmp)); |
|
||||||
replacements[out] = string((char *)&tmp, sizeof(tmp)); |
|
||||||
|
|
||||||
kq.erase(kq.begin()+i); --i; |
|
||||||
} |
|
||||||
}*/ |
|
||||||
|
|
||||||
// NOTE: if activations/accumulation are done in the wrong order, this will be wrong
|
|
||||||
|
|
||||||
// fuse activations into convs and fc_Wtx
|
|
||||||
// saves ~1.5 ms
|
|
||||||
// NOTE: this changes the outputs because of rounding, should be better now!
|
|
||||||
if (i != 0 && kq[i]->name == "activate_image") { |
|
||||||
if (kq[i-1]->name == "convolution_horizontal_reduced_reads_1x1" || |
|
||||||
kq[i-1]->name == "convolution_horizontal_reduced_reads_5_outputs" || |
|
||||||
kq[i-1]->name == "convolution_horizontal_reduced_reads" || |
|
||||||
kq[i-1]->name == "convolution_horizontal_reduced_reads_depthwise" || |
|
||||||
kq[i-1]->name == "convolution_horizontal_reduced_reads_depthwise_stride_1" || |
|
||||||
kq[i-1]->name == "fc_Wtx") { |
|
||||||
string lastout = kq[i-1]->args[kq[i-1]->get_arg_num("output")]; |
|
||||||
string in = kq[i]->args[kq[i]->get_arg_num("input")]; |
|
||||||
string out = kq[i]->args[kq[i]->get_arg_num("output")]; |
|
||||||
|
|
||||||
if (lastout == in) { |
|
||||||
short neuron = *(int*)kq[i]->args[kq[i]->get_arg_num("neuron")].data(); |
|
||||||
assert(neuron <= 5); |
|
||||||
|
|
||||||
// ELU isn't supported in fc_Wtx
|
|
||||||
assert(!(kq[i-1]->name == "fc_Wtx" && neuron == 5)); |
|
||||||
|
|
||||||
kq[i-1]->args[kq[i-1]->get_arg_num("neuron")] = string((char *)&neuron, sizeof(neuron)); |
|
||||||
|
|
||||||
cl_mem tmp = make_image_like(context, *(cl_mem *)lastout.data()); |
|
||||||
replacements[in] = string((char *)&tmp, sizeof(tmp)); |
|
||||||
replacements[out] = string((char *)&tmp, sizeof(tmp)); |
|
||||||
|
|
||||||
kq.erase(kq.begin()+i); --i; |
|
||||||
} |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
// fuse accumulation into convs and fc_Wtx
|
|
||||||
if (i != 0 && kq[i]->name == "elementwise_sum") { |
|
||||||
if (kq[i-1]->name == "convolution_horizontal_reduced_reads_1x1" || |
|
||||||
kq[i-1]->name == "fc_Wtx") { |
|
||||||
string lastout = kq[i-1]->args[kq[i-1]->get_arg_num("output")]; |
|
||||||
string a = kq[i]->args[kq[i]->get_arg_num("a")]; |
|
||||||
string b = kq[i]->args[kq[i]->get_arg_num("b")]; |
|
||||||
string out = kq[i]->args[kq[i]->get_arg_num("output")]; |
|
||||||
|
|
||||||
if (lastout == a) { |
|
||||||
kq[i-1]->args[kq[i-1]->get_arg_num("accumulator")] = b; |
|
||||||
} else if (lastout == b) { |
|
||||||
kq[i-1]->args[kq[i-1]->get_arg_num("accumulator")] = a; |
|
||||||
} else { |
|
||||||
continue; |
|
||||||
} |
|
||||||
|
|
||||||
cl_mem tmp = make_image_like(context, *(cl_mem *)lastout.data()); |
|
||||||
replacements[lastout] = string((char *)&tmp, sizeof(tmp)); |
|
||||||
replacements[out] = string((char *)&tmp, sizeof(tmp)); |
|
||||||
|
|
||||||
short doAccumulate = 1; |
|
||||||
kq[i-1]->args[kq[i-1]->get_arg_num("doAccumulate")] = string((char *)&doAccumulate, sizeof(doAccumulate)); |
|
||||||
|
|
||||||
kq.erase(kq.begin()+i); --i; |
|
||||||
} |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
// remap inputs and outputs, and clear the kernels
|
|
||||||
for (int i = 0; i < kq.size(); i++) { |
|
||||||
kq[i]->kernel = NULL; |
|
||||||
for (int j = 0; j < kq[i]->num_args; j++) { |
|
||||||
if (replacements.find(kq[i]->args[j]) != replacements.end()) { |
|
||||||
kq[i]->args[j] = replacements[kq[i]->args[j]]; |
|
||||||
} |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
printf("optimize %lu -> %lu\n", start_size, kq.size()); |
|
||||||
} while (kq.size() != start_size); |
|
||||||
|
|
||||||
size_t work_group_size = 0; |
|
||||||
clGetDeviceInfo(device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(work_group_size), &work_group_size, NULL); |
|
||||||
printf("max work group size %lu\n", work_group_size); |
|
||||||
|
|
||||||
// local work group optimizer
|
|
||||||
for (auto &k : kq) { |
|
||||||
// only do it for convs, since others might share memory
|
|
||||||
if (k->name.rfind("convolution_", 0) == 0) { |
|
||||||
int best = -1; |
|
||||||
if (k->local_work_size[0] * k->local_work_size[1] * k->local_work_size[2] < work_group_size/2) { |
|
||||||
uint64_t base_time = k->benchmark(); |
|
||||||
uint64_t best_time = base_time; |
|
||||||
for (int i = 0; i < 3; i++) { |
|
||||||
k->local_work_size[i] *= 2; |
|
||||||
uint64_t this_time = k->benchmark(); |
|
||||||
if (this_time < best_time) { |
|
||||||
best = i; |
|
||||||
best_time = this_time; |
|
||||||
} |
|
||||||
k->local_work_size[i] /= 2; |
|
||||||
} |
|
||||||
if (best != -1) { |
|
||||||
k->local_work_size[best] *= 2; |
|
||||||
//printf("%s %.2f ms doubled %d to %.2f ms\n", k->name.c_str(), base_time/1e6, best, best_time/1e6);
|
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
return 0; |
|
||||||
} |
|
||||||
|
|
@ -1,146 +0,0 @@ |
|||||||
#!/usr/bin/env python3 |
|
||||||
import os |
|
||||||
import struct |
|
||||||
import zipfile |
|
||||||
import numpy as np |
|
||||||
from tqdm import tqdm |
|
||||||
|
|
||||||
from common.basedir import BASEDIR |
|
||||||
from selfdrive.modeld.thneed.lib import load_thneed, save_thneed |
|
||||||
|
|
||||||
# this is junk code, but it doesn't have deps |
|
||||||
def load_dlc_weights(fn):
  """Extract weight tensors from an SNPE .dlc archive (a zip containing model.params).

  fn may be a path or a file-like object. Returns a list of
  (None, [np.ndarray, np.ndarray]) tuples, one per parameter record, ordered
  by descending record offset. 4-D tensors are transposed with (3,2,0,1) —
  presumably HWIO -> OIHW, matching the (oc, ic, kh, kw) layout the caller
  expects; confirm against the dlc format if reused.
  """
  # fix: close the archive deterministically (the original leaked the ZipFile)
  with zipfile.ZipFile(fn, 'r') as archive:
    dlc_params = archive.read("model.params")

  def extract(rdat):
    # each tensor payload sits 8 bytes after this tag: u32 count, count f32s,
    # u32 ndims, then ndims u32 shape entries
    idx = rdat.find(b"\x00\x00\x00\x09\x04\x00\x00\x00")
    rdat = rdat[idx+8:]
    ll = struct.unpack("I", rdat[0:4])[0]
    buf = np.frombuffer(rdat[4:4+ll*4], dtype=np.float32)
    rdat = rdat[4+ll*4:]
    dims = struct.unpack("I", rdat[0:4])[0]
    buf = buf.reshape(struct.unpack("I"*dims, rdat[4:4+dims*4]))
    if len(buf.shape) == 4:
      buf = np.transpose(buf, (3,2,0,1))
    return buf

  def parse(tdat):
    # a record holds two tensors (weights, biases); the second starts past
    # the first length prefix
    ll = struct.unpack("I", tdat[0:4])[0] + 4
    return (None, [extract(tdat[0:]), extract(tdat[ll:])])

  ptr = 0x20
  def r4():
    nonlocal ptr
    ret = struct.unpack("I", dlc_params[ptr:ptr+4])[0]
    ptr += 4
    return ret

  ranges = []
  cnt = r4()
  for _ in range(cnt):
    o = r4() + ptr
    # the header is 0xC: payload length plus two constant sanity words
    plen, is_4, is_2 = struct.unpack("III", dlc_params[o:o+0xC])
    assert is_4 == 4 and is_2 == 2
    ranges.append((o+0xC, o+plen+0xC))
  ranges = sorted(ranges, reverse=True)

  return [parse(dlc_params[s:e]) for s,e in ranges]
|
||||||
|
|
||||||
# this won't run on device without onnx |
|
||||||
def load_onnx_weights(fn):
  """Return [(node_name, [np.ndarray, ...])] for every graph node that has at
  least one initializer among its inputs, in graph order."""
  # this won't run on device without onnx
  import onnx
  from onnx import numpy_helper

  model = onnx.load(fn)
  graph = model.graph  # pylint: disable=maybe-no-member
  init = {x.name: x for x in graph.initializer}

  onnx_layers = []
  for node in graph.node:
    vals = [numpy_helper.to_array(init[inp]) for inp in node.input if inp in init]
    if vals:
      onnx_layers.append((node.name, vals))
  return onnx_layers
||||||
|
|
||||||
def weights_fixup(target, source_thneed, dlc):
  """Rewrite the fp16 weights inside a thneed file directly from the fp32 dlc
  weights, then save the result to target.

  target: output .thneed path
  source_thneed: .thneed produced from the dlc
  dlc: the source .dlc holding fp32 weights
  """
  #onnx_layers = load_onnx_weights(os.path.join(BASEDIR, "models/supercombo.onnx"))
  onnx_layers = load_dlc_weights(dlc)
  jdat = load_thneed(source_thneed)

  bufs = {o['id']: o for o in jdat['objects']}

  # per kernel, collect the args backed by buffers whose data gets loaded
  thneed_layers = []
  for k in jdat['kernels']:
    vals = []
    for a in k['args']:
      if a in bufs:
        o = bufs[a]
        if o['needs_load'] or ('buffer_id' in o and bufs[o['buffer_id']]['needs_load']):
          vals.append(o)
    if len(vals) > 0:
      thneed_layers.append((k['name'], vals))

  # layers with weights must line up one-to-one with the dlc records
  assert len(thneed_layers) == len(onnx_layers)

  # fix up weights
  for tl, ol in tqdm(zip(thneed_layers, onnx_layers), total=len(thneed_layers)):
    assert len(tl[1]) == len(ol[1])
    for o, onnx_weight in zip(tl[1], ol[1]):
      if o['arg_type'] == "image2d_t":
        obuf = bufs[o['buffer_id']]
        saved_weights = np.frombuffer(obuf['data'], dtype=np.float16).reshape(o['height'], o['row_pitch']//2)

        if len(onnx_weight.shape) == 4:
          # convolution: repack (oc, ic, kh, kw) into the image layout
          oc, ic, ch, cw = onnx_weight.shape
          if 'depthwise' in tl[0]:
            assert ic == 1
            weights = np.transpose(onnx_weight.reshape(oc//4,4,ch,cw), (0,2,3,1)).reshape(o['height'], o['width']*4)
          else:
            weights = np.transpose(onnx_weight.reshape(oc//4,4,ic//4,4,ch,cw), (0,4,2,5,1,3)).reshape(o['height'], o['width']*4)
        else:
          # fc_Wtx
          weights = onnx_weight

        # pad rows out to the image row pitch
        new_weights = np.zeros((o['height'], o['row_pitch']//2), dtype=np.float32)
        new_weights[:, :weights.shape[1]] = weights

        # weights shouldn't be too far off
        err = np.mean((saved_weights.astype(np.float32) - new_weights)**2)
        assert err < 1e-3
        rerr = np.mean(np.abs((saved_weights.astype(np.float32) - new_weights)/(new_weights+1e-12)))
        assert rerr < 0.5

        # fix should improve things
        fixed_err = np.mean((new_weights.astype(np.float16).astype(np.float32) - new_weights)**2)
        assert (err/fixed_err) >= 1

        obuf['data'] = new_weights.astype(np.float16).tobytes()

      elif o['arg_type'] == "float*":
        # unconverted floats are correct; just sanity-check the round-trip
        new_weights = np.zeros(o['size']//4, dtype=np.float32)
        new_weights[:onnx_weight.shape[0]] = onnx_weight
        assert new_weights.tobytes() == o['data']

  save_thneed(jdat, target)
||||||
|
|
||||||
if __name__ == "__main__":
  # rebuild supercombo_fixed.thneed from supercombo.thneed + supercombo.dlc
  model_dir = os.path.join(BASEDIR, "selfdrive/modeld/models/")
  target, source, dlc = (os.path.join(model_dir, f) for f in
                         ("supercombo_fixed.thneed", "supercombo.thneed", "supercombo.dlc"))
  weights_fixup(target, source, dlc)
Loading…
Reference in new issue