diff --git a/release/files_common b/release/files_common index 5be07b1c75..10c66a8960 100644 --- a/release/files_common +++ b/release/files_common @@ -367,10 +367,7 @@ selfdrive/modeld/thneed/thneed.h selfdrive/modeld/thneed/thneed_common.cc selfdrive/modeld/thneed/thneed_qcom2.cc selfdrive/modeld/thneed/serialize.cc -selfdrive/modeld/thneed/compile.cc -selfdrive/modeld/thneed/optimizer.cc selfdrive/modeld/thneed/include/* -selfdrive/modeld/thneed/kernels/*.cl selfdrive/modeld/runners/snpemodel.cc selfdrive/modeld/runners/snpemodel.h diff --git a/selfdrive/modeld/SConscript b/selfdrive/modeld/SConscript index 2544607aa4..246f8c2941 100644 --- a/selfdrive/modeld/SConscript +++ b/selfdrive/modeld/SConscript @@ -26,7 +26,6 @@ thneed_src = [ "thneed/thneed_common.cc", "thneed/thneed_qcom2.cc", "thneed/serialize.cc", - "thneed/optimizer.cc", "runners/thneedmodel.cc", ] @@ -95,18 +94,6 @@ if use_thneed and arch == "larch64" or GetOption('pc_thneed'): "#tinygrad_repo/tinygrad/nn/__init__.py" ], cmd) - # old thneed compiler. TODO: remove this once tinygrad stuff is stable - - #compiler = lenv.Program('thneed/compile', ["thneed/compile.cc"]+common_model, LIBS=libs) - #cmd = f"cd {Dir('.').abspath} && {compiler[0].abspath} --in {fn}.dlc --out {fn}.thneed --binary --optimize" - - #lib_paths = ':'.join(Dir(p).abspath for p in lenv["LIBPATH"]) - #kernel_path = os.path.join(Dir('.').abspath, "thneed", "kernels") - #cenv = Environment(ENV={'LD_LIBRARY_PATH': f"{lib_paths}:{lenv['ENV']['LD_LIBRARY_PATH']}", 'KERNEL_PATH': kernel_path}) - - #kernels = [os.path.join(kernel_path, x) for x in os.listdir(kernel_path) if x.endswith(".cl")] - #cenv.Command(fn + ".thneed", [fn + ".dlc", kernels, compiler], cmd) - llenv = lenv.Clone() if GetOption('pc_thneed'): pc_thneed_src = [ diff --git a/selfdrive/modeld/runners/snpemodel.cc b/selfdrive/modeld/runners/snpemodel.cc index 95ee5fe822..ff4adcd8d3 100644 --- a/selfdrive/modeld/runners/snpemodel.cc +++ b/selfdrive/modeld/runners/snpemodel.cc @@ -186,75 +186,14 @@ std::unique_ptr SNPEModel::addExtra(float *state, in } void SNPEModel::execute() { -#ifdef USE_THNEED - if (Runtime == zdl::DlSystem::Runtime_t::GPU) { - if (!thneed_recorded) { - bool ret = inputBuffer->setBufferAddress(input); - assert(ret == true); - if (use_extra) { - assert(extra != NULL); - bool extra_ret = extraBuffer->setBufferAddress(extra); - assert(extra_ret == true); - } - if (!snpe->execute(inputMap, outputMap)) { - PrintErrorStringAndExit(); - } - memset(recurrent, 0, recurrent_size*sizeof(float)); - thneed->record = true; - if (!snpe->execute(inputMap, outputMap)) { - PrintErrorStringAndExit(); - } - thneed->stop(); - printf("thneed cached\n"); - - // doing self test - float *outputs_golden = (float *)malloc(output_size*sizeof(float)); - memcpy(outputs_golden, output, output_size*sizeof(float)); - memset(output, 0, output_size*sizeof(float)); - memset(recurrent, 0, recurrent_size*sizeof(float)); - uint64_t start_time = nanos_since_boot(); - if (extra != NULL) { - float *inputs[5] = {recurrent, trafficConvention, desire, extra, input}; - thneed->execute(inputs, output); - } else { - float *inputs[4] = {recurrent, trafficConvention, desire, input}; - thneed->execute(inputs, output); - } - uint64_t elapsed_time = nanos_since_boot() - start_time; - printf("ran model in %.2f ms\n", float(elapsed_time)/1e6); - - if (memcmp(output, outputs_golden, output_size*sizeof(float)) == 0) { - printf("thneed selftest passed\n"); - } else { - for (int i = 0; i < output_size; i++) { - printf("mismatch %3d: %f %f\n", i, output[i], outputs_golden[i]); - } - assert(false); - } - free(outputs_golden); - thneed_recorded = true; - } else { - if (use_extra) { - float *inputs[5] = {recurrent, trafficConvention, desire, extra, input}; - thneed->execute(inputs, output); - } else { - float *inputs[4] = {recurrent, trafficConvention, desire, input}; - thneed->execute(inputs, output); - } - } - } else { -#endif - bool ret = inputBuffer->setBufferAddress(input); - assert(ret == true); - if (use_extra) { - bool extra_ret = extraBuffer->setBufferAddress(extra); - assert(extra_ret == true); - } - if (!snpe->execute(inputMap, outputMap)) { - PrintErrorStringAndExit(); - } -#ifdef USE_THNEED + bool ret = inputBuffer->setBufferAddress(input); + assert(ret == true); + if (use_extra) { + bool extra_ret = extraBuffer->setBufferAddress(extra); + assert(extra_ret == true); + } + if (!snpe->execute(inputMap, outputMap)) { + PrintErrorStringAndExit(); } -#endif } diff --git a/selfdrive/modeld/runners/thneedmodel.cc b/selfdrive/modeld/runners/thneedmodel.cc index d55a8104ee..67db01bb95 100644 --- a/selfdrive/modeld/runners/thneedmodel.cc +++ b/selfdrive/modeld/runners/thneedmodel.cc @@ -6,7 +6,6 @@ ThneedModel::ThneedModel(const char *path, float *loutput, size_t loutput_size, thneed = new Thneed(true, context); thneed->load(path); thneed->clexec(); - thneed->find_inputs_outputs(); recorded = false; output = loutput; diff --git a/selfdrive/modeld/thneed/compile.cc b/selfdrive/modeld/thneed/compile.cc deleted file mode 100644 index f76c63b2b9..0000000000 --- a/selfdrive/modeld/thneed/compile.cc +++ /dev/null @@ -1,81 +0,0 @@ -#include -#include - -#include "selfdrive/modeld/runners/snpemodel.h" -#include "selfdrive/modeld/thneed/thneed.h" -#include "system/hardware/hw.h" - -#define TEMPORAL_SIZE 512 -#define DESIRE_LEN 8 -#define TRAFFIC_CONVENTION_LEN 2 - -// TODO: This should probably use SNPE directly. -int main(int argc, char* argv[]) { - bool run_optimizer = false, save_binaries = false; - const char *input_file = NULL, *output_file = NULL; - static struct option long_options[] = { - {"in", required_argument, 0, 'i' }, - {"out", required_argument, 0, 'o' }, - {"binary", no_argument, 0, 'b' }, - {"optimize", no_argument, 0, 'f' }, - {0, 0, 0, 0 } - }; - int long_index = 0, opt = 0; - while ((opt = getopt_long_only(argc, argv,"", long_options, &long_index)) != -1) { - switch (opt) { - case 'i': input_file = optarg; break; - case 'o': output_file = optarg; break; - case 'b': save_binaries = true; break; - case 'f': run_optimizer = true; break; - } - } - - // no input? - if (!input_file) { - printf("usage: -i -o --binary --optimize\n"); - return -1; - } - - #define OUTPUT_SIZE 0x10000 - - float *output = (float*)calloc(OUTPUT_SIZE, sizeof(float)); - SNPEModel mdl(input_file, output, 0, USE_GPU_RUNTIME, true); - mdl.thneed->run_optimizer = run_optimizer; - - float state[TEMPORAL_SIZE] = {0}; - float desire[DESIRE_LEN] = {0}; - float traffic_convention[TRAFFIC_CONVENTION_LEN] = {0}; - float *input = (float*)calloc(0x1000000, sizeof(float)); - float *extra = (float*)calloc(0x1000000, sizeof(float)); - - mdl.addRecurrent(state, TEMPORAL_SIZE); - mdl.addDesire(desire, DESIRE_LEN); - mdl.addTrafficConvention(traffic_convention, TRAFFIC_CONVENTION_LEN); - mdl.addImage(input, 0); - mdl.addExtra(extra, 0); - - // first run - printf("************** execute 1 **************\n"); - memset(output, 0, OUTPUT_SIZE * sizeof(float)); - mdl.execute(); - - // don't save? - if (!output_file) { - printf("no output file, exiting\n"); - return 0; - } - - // save model - printf("saving %s with binary %d\n", output_file, save_binaries); - mdl.thneed->save(output_file, save_binaries); - - // test model - auto thneed = new Thneed(true); - thneed->record = false; - thneed->load(output_file); - thneed->clexec(); - thneed->find_inputs_outputs(); - - return 0; -} - diff --git a/selfdrive/modeld/thneed/kernels/convolution_.cl b/selfdrive/modeld/thneed/kernels/convolution_.cl deleted file mode 100644 index 1b9d74b83f..0000000000 --- a/selfdrive/modeld/thneed/kernels/convolution_.cl +++ /dev/null @@ -1,272 +0,0 @@ - read_only image2d_t input, -#ifndef DEPTHWISE - short startPackedInputChannel, - short numPackedInputChannelsForGroup, short totalNumPackedInputChannels, - // typo required for API compatibility - short packedOuputChannelOffset, short totalNumPackedOutputChannels, -#else - short totalNumPackedChannels, -#endif - read_only image2d_t weights, __constant float *biases, - short filterSizeX, short filterSizeY, - write_only image2d_t output, - short paddingX, short paddingY, short strideX, short strideY, -#ifdef SUPPORT_DILATION - short dilationX, short dilationY, -#endif - short neuron, float a, float b, float min_clamp, float max_clamp, -#ifndef DEPTHWISE - // note: these are not supported - __constant float *parameters, __constant float *batchNormBiases, -#endif - short numOutputColumns -#ifdef SUPPORT_ACCUMULATION - , short doAccumulate, read_only image2d_t accumulator -#endif - ) { - -#ifndef NUM_OUTPUTS - #define NUM_OUTPUTS 4 -#endif - - // init - const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - short packedOutputChannel = get_global_id(0); - short startOutputColumn = mul24((short)get_global_id(1), NUM_OUTPUTS); - short outputRow = get_global_id(2); - -#ifdef DEPTHWISE - short totalNumPackedInputChannels = totalNumPackedChannels; - short totalNumPackedOutputChannels = totalNumPackedChannels; - short startPackedInputChannel = packedOutputChannel; -#endif - - short startX = mad24(mad24(startOutputColumn, strideX, -paddingX), totalNumPackedInputChannels, startPackedInputChannel); - short strideWithChannels = mul24(strideX, totalNumPackedInputChannels); - - float4 outputValues[NUM_OUTPUTS]; - for (short i = 0; i < NUM_OUTPUTS; ++i) { - outputValues[i] = (float4)(0, 0, 0, 0); - } - - int2 inputLocation; - inputLocation.y = mad24(outputRow, strideY, -paddingY); - - int2 weightLocation; - weightLocation.x = 0; - weightLocation.y = packedOutputChannel; - -#ifdef DEPTHWISE - -#ifdef SUPPORT_DILATION - - // depthwise convolution - for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) { - for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) { - short dilatedStepX = mul24(totalNumPackedChannels, dilationX); - inputLocation.x = mad24(rfColumn, dilatedStepX, startX); - float4 inputValues[4]; - for (short i = 0; i < 4; ++i) { - inputValues[i] = read_imagef(input, smp, inputLocation); - inputLocation.x += strideWithChannels; - } - float4 weightValues = read_imagef(weights, smp, weightLocation); - ++weightLocation.x; - outputValues[0] += inputValues[0] * weightValues; - outputValues[1] += inputValues[1] * weightValues; - outputValues[2] += inputValues[2] * weightValues; - outputValues[3] += inputValues[3] * weightValues; - } - inputLocation.y += dilationY; - } - -#else - - // depthwise unstrided convolution - for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) { - float4 inputValues[4]; - inputLocation.x = startX; - for (short i = 1; i < 4; ++i) { - inputValues[i] = read_imagef(input, smp, inputLocation); - inputLocation.x += totalNumPackedOutputChannels; - } - for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) { - inputValues[0] = inputValues[1]; - inputValues[1] = inputValues[2]; - inputValues[2] = inputValues[3]; - inputValues[3] = read_imagef(input, smp, inputLocation); - inputLocation.x += totalNumPackedChannels; - float4 weightValues = read_imagef(weights, smp, weightLocation); - ++weightLocation.x; - outputValues[0] += inputValues[0] * weightValues; - outputValues[1] += inputValues[1] * weightValues; - outputValues[2] += inputValues[2] * weightValues; - outputValues[3] += inputValues[3] * weightValues; - } - ++inputLocation.y; - } - -#endif - -#elif defined(ONLY_1X1_CONV) - - // 1x1 convolution - short endPackedInputChannel = startPackedInputChannel + numPackedInputChannelsForGroup; - for (short packedInputChannel = startPackedInputChannel; packedInputChannel < endPackedInputChannel; ++packedInputChannel) { - float4 weightValues[4]; - for (short outChIdx = 0; outChIdx < 4; ++outChIdx) { - weightValues[outChIdx] = read_imagef(weights, smp, weightLocation); - ++weightLocation.x; - } - - inputLocation.x = startX + packedInputChannel; - float4 inputValues[NUM_OUTPUTS]; - for (short i = 0; i < NUM_OUTPUTS; ++i) { - inputValues[i] = read_imagef(input, smp, inputLocation); - inputLocation.x += strideWithChannels; - } - - for (short i = 0; i < NUM_OUTPUTS; ++i) { - float4 curOutputValues = outputValues[i]; - curOutputValues.x += inputValues[i].x * weightValues[0].x; - curOutputValues.x += inputValues[i].y * weightValues[0].y; - curOutputValues.x += inputValues[i].z * weightValues[0].z; - curOutputValues.x += inputValues[i].w * weightValues[0].w; - curOutputValues.y += inputValues[i].x * weightValues[1].x; - curOutputValues.y += inputValues[i].y * weightValues[1].y; - curOutputValues.y += inputValues[i].z * weightValues[1].z; - curOutputValues.y += inputValues[i].w * weightValues[1].w; - curOutputValues.z += inputValues[i].x * weightValues[2].x; - curOutputValues.z += inputValues[i].y * weightValues[2].y; - curOutputValues.z += inputValues[i].z * weightValues[2].z; - curOutputValues.z += inputValues[i].w * weightValues[2].w; - curOutputValues.w += inputValues[i].x * weightValues[3].x; - curOutputValues.w += inputValues[i].y * weightValues[3].y; - curOutputValues.w += inputValues[i].z * weightValues[3].z; - curOutputValues.w += inputValues[i].w * weightValues[3].w; - outputValues[i] = curOutputValues; - } - } - packedOutputChannel += packedOuputChannelOffset; - -#else - - // normal convolution - for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) { - for (short packedInputChannel = 0; packedInputChannel < numPackedInputChannelsForGroup; ++packedInputChannel) { - short startXForChannel = startX + packedInputChannel; - for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) { - - float4 weightValues[4]; - for (short outChIdx = 0; outChIdx < 4; ++outChIdx) { - weightValues[outChIdx] = read_imagef(weights, smp, weightLocation); - ++weightLocation.x; - } - -#ifdef SUPPORT_DILATION - short dilatedStepX = mul24(totalNumPackedInputChannels, dilationX); - inputLocation.x = mad24(rfColumn, dilatedStepX, startXForChannel); -#else - inputLocation.x = mad24(rfColumn, totalNumPackedInputChannels, startXForChannel); -#endif - float4 inputValues[NUM_OUTPUTS]; - for (short i = 0; i < NUM_OUTPUTS; ++i) { - inputValues[i] = read_imagef(input, smp, inputLocation); - inputLocation.x += strideWithChannels; - } - for (short i = 0; i < NUM_OUTPUTS; ++i) { - float4 curOutputValues = outputValues[i]; - curOutputValues.x += inputValues[i].x * weightValues[0].x; - curOutputValues.x += inputValues[i].y * weightValues[0].y; - curOutputValues.x += inputValues[i].z * weightValues[0].z; - curOutputValues.x += inputValues[i].w * weightValues[0].w; - curOutputValues.y += inputValues[i].x * weightValues[1].x; - curOutputValues.y += inputValues[i].y * weightValues[1].y; - curOutputValues.y += inputValues[i].z * weightValues[1].z; - curOutputValues.y += inputValues[i].w * weightValues[1].w; - curOutputValues.z += inputValues[i].x * weightValues[2].x; - curOutputValues.z += inputValues[i].y * weightValues[2].y; - curOutputValues.z += inputValues[i].z * weightValues[2].z; - curOutputValues.z += inputValues[i].w * weightValues[2].w; - curOutputValues.w += inputValues[i].x * weightValues[3].x; - curOutputValues.w += inputValues[i].y * weightValues[3].y; - curOutputValues.w += inputValues[i].z * weightValues[3].z; - curOutputValues.w += inputValues[i].w * weightValues[3].w; - outputValues[i] = curOutputValues; - } - } - } -#ifdef SUPPORT_DILATION - inputLocation.y += dilationY; -#else - ++inputLocation.y; -#endif - } - packedOutputChannel += packedOuputChannelOffset; -#endif - - // bias - short outputChannel = mul24(packedOutputChannel, 4); - float4 biasValues = vload4(0, biases + outputChannel); - for (short i = 0; i < NUM_OUTPUTS; ++i) { - outputValues[i] += biasValues; - } - -#ifdef SUPPORT_ACCUMULATION - // accumulate - if (doAccumulate) { - int2 outputLocation; - short outputColumn = startOutputColumn; - outputLocation.y = outputRow; - for (short i = 0; i < NUM_OUTPUTS; ++i) { - outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel); - if (outputColumn < numOutputColumns) { - outputValues[i] += read_imagef(accumulator, smp, outputLocation); - } - ++outputColumn; - } - } -#endif - - // activation - switch (neuron) { - case 1: - for (short i = 0; i < NUM_OUTPUTS; ++i) { - outputValues[i] = max(outputValues[i], 0.0f); - } - break; - case 2: - for (short i = 0; i < NUM_OUTPUTS; ++i) { - outputValues[i] = a * tanh(b * outputValues[i]); - } - break; - case 3: - for (short i = 0; i < NUM_OUTPUTS; ++i) { - outputValues[i] = native_recip(1.0f + native_exp(-a * outputValues[i] + b)); - } - break; - case 4: - for (short i = 0; i < NUM_OUTPUTS; ++i) { - outputValues[i] = max(outputValues[i], min_clamp); - outputValues[i] = min(outputValues[i], max_clamp); - } - break; - case 5: - for (short i = 0; i < NUM_OUTPUTS; ++i) { - outputValues[i] = max(outputValues[i], 0.0f) + a * (native_exp(min(outputValues[i], 0.0f)) - 1.0f); - } - break; - } - - // output - int2 outputLocation; - short outputColumn = startOutputColumn; - outputLocation.y = outputRow; - for (short i = 0; i < NUM_OUTPUTS; ++i) { - outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel); - if (outputColumn < numOutputColumns) { - write_imagef(output, outputLocation, outputValues[i]); - } - ++outputColumn; - } -} diff --git a/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads.cl b/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads.cl deleted file mode 100644 index fcea88ce90..0000000000 --- a/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads.cl +++ /dev/null @@ -1,3 +0,0 @@ -#define SUPPORT_DILATION - -__kernel void convolution_horizontal_reduced_reads( diff --git a/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads_1x1.cl b/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads_1x1.cl deleted file mode 100644 index 0d15d80581..0000000000 --- a/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads_1x1.cl +++ /dev/null @@ -1,4 +0,0 @@ -#define ONLY_1X1_CONV -#define SUPPORT_ACCUMULATION - -__kernel void convolution_horizontal_reduced_reads_1x1( diff --git a/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads_5_outputs.cl b/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads_5_outputs.cl deleted file mode 100644 index 69421fc2a9..0000000000 --- a/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads_5_outputs.cl +++ /dev/null @@ -1,3 +0,0 @@ -#define NUM_OUTPUTS 5 - -__kernel void convolution_horizontal_reduced_reads_5_outputs( diff --git a/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads_depthwise.cl b/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads_depthwise.cl deleted file mode 100644 index 50e39941d4..0000000000 --- a/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads_depthwise.cl +++ /dev/null @@ -1,4 +0,0 @@ -#define DEPTHWISE -#define SUPPORT_DILATION - -__kernel void convolution_horizontal_reduced_reads_depthwise( diff --git a/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads_depthwise_stride_1.cl b/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads_depthwise_stride_1.cl deleted file mode 100644 index b347cb6c71..0000000000 --- a/selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads_depthwise_stride_1.cl +++ /dev/null @@ -1,3 +0,0 @@ -#define DEPTHWISE - -__kernel void convolution_horizontal_reduced_reads_depthwise_stride_1( diff --git a/selfdrive/modeld/thneed/optimizer.cc b/selfdrive/modeld/thneed/optimizer.cc deleted file mode 100644 index 39737d3d76..0000000000 --- a/selfdrive/modeld/thneed/optimizer.cc +++ /dev/null @@ -1,261 +0,0 @@ -#include -#include -#include -#include -#include "thneed.h" - -#include "common/util.h" -#include "common/clutil.h" - -extern map g_program_source; - -/*static int is_same_size_image(cl_mem a, cl_mem b) { - size_t a_width, a_height, a_depth, a_array_size, a_row_pitch, a_slice_pitch; - clGetImageInfo(a, CL_IMAGE_WIDTH, sizeof(a_width), &a_width, NULL); - clGetImageInfo(a, CL_IMAGE_HEIGHT, sizeof(a_height), &a_height, NULL); - clGetImageInfo(a, CL_IMAGE_DEPTH, sizeof(a_depth), &a_depth, NULL); - clGetImageInfo(a, CL_IMAGE_ARRAY_SIZE, sizeof(a_array_size), &a_array_size, NULL); - clGetImageInfo(a, CL_IMAGE_ROW_PITCH, sizeof(a_row_pitch), &a_row_pitch, NULL); - clGetImageInfo(a, CL_IMAGE_SLICE_PITCH, sizeof(a_slice_pitch), &a_slice_pitch, NULL); - - size_t b_width, b_height, b_depth, b_array_size, b_row_pitch, b_slice_pitch; - clGetImageInfo(b, CL_IMAGE_WIDTH, sizeof(b_width), &b_width, NULL); - clGetImageInfo(b, CL_IMAGE_HEIGHT, sizeof(b_height), &b_height, NULL); - clGetImageInfo(b, CL_IMAGE_DEPTH, sizeof(b_depth), &b_depth, NULL); - clGetImageInfo(b, CL_IMAGE_ARRAY_SIZE, sizeof(b_array_size), &b_array_size, NULL); - clGetImageInfo(b, CL_IMAGE_ROW_PITCH, sizeof(b_row_pitch), &b_row_pitch, NULL); - clGetImageInfo(b, CL_IMAGE_SLICE_PITCH, sizeof(b_slice_pitch), &b_slice_pitch, NULL); - - return (a_width == b_width) && (a_height == b_height) && - (a_depth == b_depth) && (a_array_size == b_array_size) && - (a_row_pitch == b_row_pitch) && (a_slice_pitch == b_slice_pitch); -}*/ - -static cl_mem make_image_like(cl_context context, cl_mem val) { - cl_image_format format; - size_t width, height, row_pitch; - clGetImageInfo(val, CL_IMAGE_FORMAT, sizeof(format), &format, NULL); - assert(format.image_channel_order == CL_RGBA); - assert(format.image_channel_data_type == CL_HALF_FLOAT); - clGetImageInfo(val, CL_IMAGE_WIDTH, sizeof(width), &width, NULL); - clGetImageInfo(val, CL_IMAGE_HEIGHT, sizeof(height), &height, NULL); - clGetImageInfo(val, CL_IMAGE_ROW_PITCH, sizeof(row_pitch), &row_pitch, NULL); - - cl_image_desc desc = {0}; - desc.image_type = CL_MEM_OBJECT_IMAGE2D; - desc.image_width = width; - desc.image_height = height; - desc.image_row_pitch = row_pitch; - - cl_mem buf = clCreateBuffer(context, CL_MEM_READ_WRITE, row_pitch*height, NULL, NULL); - assert(buf != NULL); - desc.buffer = buf; - - cl_int err; - cl_mem tmp = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &err); - //printf("got %d for image %zux%zu %zu\n", err, width, height, row_pitch); - assert(tmp != NULL); - - return tmp; -} - -// convolution_horizontal_reduced_reads_1x1 is 66% of the model runtime -// make that faster and the model gets faster - -// this cuts ~2 ms off the model runtime right now -int Thneed::optimize() { - const char *kernel_path = getenv("KERNEL_PATH"); - if (!kernel_path) { kernel_path = "/data/openpilot/selfdrive/modeld/thneed/kernels"; printf("no KERNEL_PATH set, defaulting to %s\n", kernel_path); } - - string convolution_; - { - char fn[0x100]; - snprintf(fn, sizeof(fn), "%s/%s.cl", kernel_path, "convolution_"); - convolution_ = util::read_file(fn); - } - - // load custom kernels - map g_programs; - for (auto &k : kq) { - // replace program? - if (g_programs.find(k->name) == g_programs.end()) { - char fn[0x100]; - snprintf(fn, sizeof(fn), "%s/%s.cl", kernel_path, k->name.c_str()); - if (util::file_exists(fn)) { - string kernel_src = util::read_file(fn); - if (k->name.rfind("convolution_", 0) == 0) { - kernel_src += convolution_; - } - printf("building kernel %s with len %lu\n", k->name.c_str(), kernel_src.length()); - k->program = cl_program_from_source(context, device_id, kernel_src); - - // save in cache - g_programs[k->name] = k->program; - g_program_source[k->program] = kernel_src; - } else { - g_programs[k->name] = NULL; - } - } else { - // cached replacement - if (g_programs[k->name] != NULL) { - k->program = g_programs[k->name]; - } - } - - // hack in accumulator to convolution_horizontal_reduced_reads_1x1 - if (k->name == "convolution_horizontal_reduced_reads_1x1") { - k->arg_names.push_back("doAccumulate"); - short doAccumulate = 0; - k->args.push_back(string((char *)&doAccumulate, sizeof(doAccumulate))); - k->args_size.push_back(2); - k->arg_names.push_back("accumulator"); - k->args.push_back(k->args[k->get_arg_num("output")]); - k->args_size.push_back(8); - k->num_args += 2; - } - - // assert that parameters + batchNormBiases are not used - // since they aren't supported in custom replacement kernels - if (k->name == "convolution_horizontal_reduced_reads_1x1" || - k->name == "convolution_horizontal_reduced_reads" || - k->name == "convolution_horizontal_reduced_reads_5_outputs") { - string p1 = k->args[k->get_arg_num("parameters")]; - string p2 = k->args[k->get_arg_num("batchNormBiases")]; - assert(p1.length() == 8 && *((uint64_t*)p1.data()) == 0); - assert(p2.length() == 8 && *((uint64_t*)p2.data()) == 0); - } - } - - // optimizer - size_t start_size; - do { - start_size = kq.size(); - - // get optimizations - map replacements; - for (int i = 0; i < kq.size(); i++) { - // fusing elementwise_sum + activate_image will save 3 enqueues - - // delete useless copy layers - // saves ~0.7 ms - /*if (kq[i]->name == "concatenation" || kq[i]->name == "flatten") { - string in = kq[i]->args[kq[i]->get_arg_num("input")]; - string out = kq[i]->args[kq[i]->get_arg_num("output")]; - if (is_same_size_image(*(cl_mem*)in.data(), *(cl_mem*)out.data())) { - cl_mem tmp = make_image_like(context, *(cl_mem *)in.data()); - replacements[in] = string((char *)&tmp, sizeof(tmp)); - replacements[out] = string((char *)&tmp, sizeof(tmp)); - - kq.erase(kq.begin()+i); --i; - } - }*/ - - // NOTE: if activations/accumulation are done in the wrong order, this will be wrong - - // fuse activations into convs and fc_Wtx - // saves ~1.5 ms - // NOTE: this changes the outputs because of rounding, should be better now! - if (i != 0 && kq[i]->name == "activate_image") { - if (kq[i-1]->name == "convolution_horizontal_reduced_reads_1x1" || - kq[i-1]->name == "convolution_horizontal_reduced_reads_5_outputs" || - kq[i-1]->name == "convolution_horizontal_reduced_reads" || - kq[i-1]->name == "convolution_horizontal_reduced_reads_depthwise" || - kq[i-1]->name == "convolution_horizontal_reduced_reads_depthwise_stride_1" || - kq[i-1]->name == "fc_Wtx") { - string lastout = kq[i-1]->args[kq[i-1]->get_arg_num("output")]; - string in = kq[i]->args[kq[i]->get_arg_num("input")]; - string out = kq[i]->args[kq[i]->get_arg_num("output")]; - - if (lastout == in) { - short neuron = *(int*)kq[i]->args[kq[i]->get_arg_num("neuron")].data(); - assert(neuron <= 5); - - // ELU isn't supported in fc_Wtx - assert(!(kq[i-1]->name == "fc_Wtx" && neuron == 5)); - - kq[i-1]->args[kq[i-1]->get_arg_num("neuron")] = string((char *)&neuron, sizeof(neuron)); - - cl_mem tmp = make_image_like(context, *(cl_mem *)lastout.data()); - replacements[in] = string((char *)&tmp, sizeof(tmp)); - replacements[out] = string((char *)&tmp, sizeof(tmp)); - - kq.erase(kq.begin()+i); --i; - } - } - } - - // fuse accumulation into convs and fc_Wtx - if (i != 0 && kq[i]->name == "elementwise_sum") { - if (kq[i-1]->name == "convolution_horizontal_reduced_reads_1x1" || - kq[i-1]->name == "fc_Wtx") { - string lastout = kq[i-1]->args[kq[i-1]->get_arg_num("output")]; - string a = kq[i]->args[kq[i]->get_arg_num("a")]; - string b = kq[i]->args[kq[i]->get_arg_num("b")]; - string out = kq[i]->args[kq[i]->get_arg_num("output")]; - - if (lastout == a) { - kq[i-1]->args[kq[i-1]->get_arg_num("accumulator")] = b; - } else if (lastout == b) { - kq[i-1]->args[kq[i-1]->get_arg_num("accumulator")] = a; - } else { - continue; - } - - cl_mem tmp = make_image_like(context, *(cl_mem *)lastout.data()); - replacements[lastout] = string((char *)&tmp, sizeof(tmp)); - replacements[out] = string((char *)&tmp, sizeof(tmp)); - - short doAccumulate = 1; - kq[i-1]->args[kq[i-1]->get_arg_num("doAccumulate")] = string((char *)&doAccumulate, sizeof(doAccumulate)); - - kq.erase(kq.begin()+i); --i; - } - } - } - - // remap inputs and outputs, and clear the kernels - for (int i = 0; i < kq.size(); i++) { - kq[i]->kernel = NULL; - for (int j = 0; j < kq[i]->num_args; j++) { - if (replacements.find(kq[i]->args[j]) != replacements.end()) { - kq[i]->args[j] = replacements[kq[i]->args[j]]; - } - } - } - - printf("optimize %lu -> %lu\n", start_size, kq.size()); - } while (kq.size() != start_size); - - size_t work_group_size = 0; - clGetDeviceInfo(device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(work_group_size), &work_group_size, NULL); - printf("max work group size %lu\n", work_group_size); - - // local work group optimizer - for (auto &k : kq) { - // only do it for convs, since others might share memory - if (k->name.rfind("convolution_", 0) == 0) { - int best = -1; - if (k->local_work_size[0] * k->local_work_size[1] * k->local_work_size[2] < work_group_size/2) { - uint64_t base_time = k->benchmark(); - uint64_t best_time = base_time; - for (int i = 0; i < 3; i++) { - k->local_work_size[i] *= 2; - uint64_t this_time = k->benchmark(); - if (this_time < best_time) { - best = i; - best_time = this_time; - } - k->local_work_size[i] /= 2; - } - if (best != -1) { - k->local_work_size[best] *= 2; - //printf("%s %.2f ms doubled %d to %.2f ms\n", k->name.c_str(), base_time/1e6, best, best_time/1e6); - } - } - - } - } - - return 0; -} - diff --git a/selfdrive/modeld/thneed/serialize.cc b/selfdrive/modeld/thneed/serialize.cc index afc84ee769..f789e5bf57 100644 --- a/selfdrive/modeld/thneed/serialize.cc +++ b/selfdrive/modeld/thneed/serialize.cc @@ -152,155 +152,3 @@ void Thneed::load(const char *filename) { clFinish(command_queue); } - -void Thneed::save(const char *filename, bool save_binaries) { - printf("Thneed::save: saving to %s\n", filename); - - // get kernels - std::vector kernels; - std::set saved_objects; - std::vector objects; - std::map programs; - std::map binaries; - - for (auto &k : kq) { - kernels.push_back(k->to_json()); - - // check args for objects - int i = 0; - for (auto &a : k->args) { - if (a.size() == 8) { - if (saved_objects.find(a) == saved_objects.end()) { - saved_objects.insert(a); - cl_mem val = *(cl_mem*)(a.data()); - if (val != NULL) { - bool needs_load = k->arg_names[i] == "weights" || k->arg_names[i] == "biases"; - - auto jj = Json::object({ - {"id", a}, - {"arg_type", k->arg_types[i]}, - }); - - if (k->arg_types[i] == "image2d_t" || k->arg_types[i] == "image1d_t") { - cl_mem buf = NULL; - clGetImageInfo(val, CL_IMAGE_BUFFER, sizeof(buf), &buf, NULL); - string aa = string((char *)&buf, sizeof(buf)); - jj["buffer_id"] = aa; - - size_t width, height, row_pitch; - clGetImageInfo(val, CL_IMAGE_WIDTH, sizeof(width), &width, NULL); - clGetImageInfo(val, CL_IMAGE_HEIGHT, sizeof(height), &height, NULL); - clGetImageInfo(val, CL_IMAGE_ROW_PITCH, sizeof(row_pitch), &row_pitch, NULL); - jj["width"] = (int)width; - jj["height"] = (int)height; - jj["row_pitch"] = (int)row_pitch; - jj["size"] = (int)(height * row_pitch); - jj["needs_load"] = false; - jj["float32"] = false; - - if (saved_objects.find(aa) == saved_objects.end()) { - saved_objects.insert(aa); - size_t sz; - clGetMemObjectInfo(buf, CL_MEM_SIZE, sizeof(sz), &sz, NULL); - // save the buffer - objects.push_back(Json::object({ - {"id", aa}, - {"arg_type", ""}, - {"needs_load", needs_load}, - {"size", (int)sz} - })); - if (needs_load) assert(sz == height * row_pitch); - } - } else { - size_t sz = 0; - clGetMemObjectInfo(val, CL_MEM_SIZE, sizeof(sz), &sz, NULL); - jj["size"] = (int)sz; - jj["needs_load"] = needs_load; - } - - objects.push_back(jj); - } - } - } - i++; - } - - if (save_binaries) { - int err; - size_t binary_size = 0; - err = clGetProgramInfo(k->program, CL_PROGRAM_BINARY_SIZES, sizeof(binary_size), &binary_size, NULL); - assert(err == 0); - assert(binary_size > 0); - string sv(binary_size, '\x00'); - - uint8_t* bufs[1] = { (uint8_t*)sv.data(), }; - err = clGetProgramInfo(k->program, CL_PROGRAM_BINARIES, sizeof(bufs), &bufs, NULL); - assert(err == 0); - - binaries[k->name] = sv; - } else { - programs[k->name] = g_program_source[k->program]; - } - } - - vector saved_buffers; - for (auto &obj : objects) { - auto mobj = obj.object_items(); - cl_mem val = *(cl_mem*)(mobj["id"].string_value().data()); - int sz = mobj["size"].int_value(); - if (mobj["needs_load"].bool_value()) { - char *buf = (char *)malloc(sz); - if (mobj["arg_type"] == "image2d_t" || mobj["arg_type"] == "image1d_t") { - assert(false); - } else { - // buffers allocated with CL_MEM_HOST_WRITE_ONLY, hence this hack - //hexdump((uint32_t*)val, 0x100); - - // the worst hack in thneed, the flags are at 0x14 - ((uint32_t*)val)[0x14] &= ~CL_MEM_HOST_WRITE_ONLY; - cl_int ret = clEnqueueReadBuffer(command_queue, val, CL_TRUE, 0, sz, buf, 0, NULL, NULL); - assert(ret == CL_SUCCESS); - } - //printf("saving buffer: %d %p %s\n", sz, buf, mobj["arg_type"].string_value().c_str()); - saved_buffers.push_back(string(buf, sz)); - free(buf); - } - } - - std::vector jbinaries; - for (auto &obj : binaries) { - jbinaries.push_back(Json::object({{"name", obj.first}, {"length", (int)obj.second.size()}})); - saved_buffers.push_back(obj.second); - } - - Json jdat = Json::object({ - {"kernels", kernels}, - {"objects", objects}, - {"programs", programs}, - {"binaries", jbinaries}, - }); - - string str = jdat.dump(); - int jsz = str.length(); - - FILE *f = fopen(filename, "wb"); - fwrite(&jsz, 1, sizeof(jsz), f); - fwrite(str.data(), 1, jsz, f); - for (auto &s : saved_buffers) { - fwrite(s.data(), 1, s.length(), f); - } - fclose(f); -} - -Json CLQueuedKernel::to_json() const { - return Json::object { - { "name", name }, - { "work_dim", (int)work_dim }, - { "global_work_size", Json::array { (int)global_work_size[0], (int)global_work_size[1], (int)global_work_size[2] } }, - { "local_work_size", Json::array { (int)local_work_size[0], (int)local_work_size[1], (int)local_work_size[2] } }, - { "num_args", (int)num_args }, - { "args", args }, - { "args_size", args_size }, - }; -} - diff --git a/selfdrive/modeld/thneed/thneed.h b/selfdrive/modeld/thneed/thneed.h index 2a5800f302..65475ccf7f 100644 --- a/selfdrive/modeld/thneed/thneed.h +++ b/selfdrive/modeld/thneed/thneed.h @@ -17,7 +17,6 @@ using namespace std; cl_int thneed_clSetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value); -cl_program thneed_clCreateProgramWithSource(cl_context context, cl_uint count, const char **strings, const size_t *lengths, cl_int *errcode_ret); namespace json11 { class Json; @@ -43,7 +42,6 @@ class CLQueuedKernel { const size_t *_global_work_size, const size_t *_local_work_size); cl_int exec(); - uint64_t benchmark(); void debug_print(bool verbose); int get_arg_num(const char *search_arg_name); cl_program program; @@ -96,8 +94,6 @@ class Thneed { void stop(); void execute(float **finputs, float *foutput, bool slow=false); void wait(); - int optimize(); - bool run_optimizer = false; vector input_clmem; vector inputs; @@ -121,7 +117,6 @@ class Thneed { #endif // all CL kernels - void find_inputs_outputs(); void copy_inputs(float **finputs, bool internal=false); void copy_output(float *foutput); cl_int clexec(); @@ -130,9 +125,8 @@ class Thneed { // pending CL kernels vector > ckq; - // loading and saving + // loading void load(const char *filename); - void save(const char *filename, bool save_binaries=false); private: void clinit(); }; diff --git a/selfdrive/modeld/thneed/thneed_common.cc b/selfdrive/modeld/thneed/thneed_common.cc index a3f5c908f9..21170b13a6 100644 --- a/selfdrive/modeld/thneed/thneed_common.cc +++ b/selfdrive/modeld/thneed/thneed_common.cc @@ -11,6 +11,11 @@ map, string> g_args; map, int> g_args_size; map g_program_source; +void Thneed::stop() { + printf("Thneed::stop: recorded %lu commands\n", cmds.size()); + record = false; +} + void Thneed::clinit() { device_id = cl_get_device_id(CL_DEVICE_TYPE_DEFAULT); if (context == NULL) context = CL_CHECK_ERR(clCreateContext(NULL, 1, &device_id, NULL, NULL, &err)); @@ -131,23 +136,6 @@ cl_int CLQueuedKernel::exec() { kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, NULL); } -uint64_t CLQueuedKernel::benchmark() { - uint64_t ret = 0; - int old_record = thneed->record; - thneed->record = 0; - clFinish(thneed->command_queue); - // TODO: benchmarking at a lower level will make this more accurate - for (int i = 0; i < 10; i++) { - uint64_t sb = nanos_since_boot(); - exec(); - clFinish(thneed->command_queue); - uint64_t et = nanos_since_boot() - sb; - if (ret == 0 || et < ret) ret = et; - } - thneed->record = old_record; - return ret; -} - void CLQueuedKernel::debug_print(bool verbose) { printf("%p %56s -- ", kernel, name.c_str()); for (int i = 0; i < work_dim; i++) { @@ -226,10 +214,3 @@ cl_int thneed_clSetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_siz cl_int ret = clSetKernelArg(kernel, arg_index, arg_size, arg_value); return ret; } - -cl_program thneed_clCreateProgramWithSource(cl_context context, cl_uint count, const char **strings, const size_t *lengths, cl_int *errcode_ret) { - assert(count == 1); - cl_program ret = clCreateProgramWithSource(context, count, strings, lengths, errcode_ret); - g_program_source[ret] = strings[0]; - return ret; -} \ No newline at end of file diff --git a/selfdrive/modeld/thneed/thneed_pc.cc b/selfdrive/modeld/thneed/thneed_pc.cc index e32dd289ec..8d0037628e 100644 --- a/selfdrive/modeld/thneed/thneed_pc.cc +++ b/selfdrive/modeld/thneed/thneed_pc.cc @@ -30,11 +30,3 @@ void Thneed::execute(float **finputs, float *foutput, bool slow) { printf("model exec in %lu us\n", (te-tb)/1000); } } - -void Thneed::stop() { -} - -void Thneed::find_inputs_outputs() { - // thneed on PC doesn't work on old style inputs/outputs -} - diff --git a/selfdrive/modeld/thneed/thneed_qcom2.cc b/selfdrive/modeld/thneed/thneed_qcom2.cc index f35317d2a7..a29a82c8c8 100644 --- a/selfdrive/modeld/thneed/thneed_qcom2.cc +++ b/selfdrive/modeld/thneed/thneed_qcom2.cc @@ -218,39 +218,6 @@ Thneed::Thneed(bool do_clinit, cl_context _context) { debug = (thneed_debug_env != NULL) ? atoi(thneed_debug_env) : 0; } -void Thneed::stop() { - find_inputs_outputs(); - printf("Thneed::stop: recorded %lu commands\n", cmds.size()); - record = false; -} - -void Thneed::find_inputs_outputs() { - cl_int err; - if (inputs.size() > 0) return; - - // save the global inputs/outputs - for (auto &k : kq) { - for (int i = 0; i < k->num_args; i++) { - if (k->name == "zero_pad_image_float" && k->arg_names[i] == "input") { - cl_mem aa = *(cl_mem*)(k->args[i].data()); - input_clmem.push_back(aa); - - size_t sz; - clGetMemObjectInfo(aa, CL_MEM_SIZE, sizeof(sz), &sz, NULL); - input_sizes.push_back(sz); - - void *ret = clEnqueueMapBuffer(command_queue, aa, CL_TRUE, CL_MAP_WRITE, 0, sz, 0, NULL, NULL, &err); - assert(err == CL_SUCCESS); - inputs.push_back(ret); - } - - if (k->name == "image2d_to_buffer_float" && k->arg_names[i] == "output") { - output = *(cl_mem*)(k->args[i].data()); - } - } - } -} - void Thneed::wait() { struct kgsl_device_waittimestamp_ctxtid wait; wait.context_id = context_id; @@ -314,74 +281,3 @@ void Thneed::execute(float **finputs, float *foutput, bool slow) { printf("model exec in %lu us\n", (te-tb)/1000); } } - -// *********** OpenCL interceptor *********** - -cl_int thneed_clEnqueueNDRangeKernel(cl_command_queue command_queue, - cl_kernel kernel, - cl_uint work_dim, - const size_t *global_work_offset, - const size_t *global_work_size, - const size_t *local_work_size, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) { - - Thneed *thneed = g_thneed; - - // SNPE doesn't use these - assert(num_events_in_wait_list == 0); - assert(global_work_offset == NULL); - assert(event_wait_list == NULL); - - cl_int ret = 0; - if (thneed != NULL && thneed->record) { - if (thneed->context == NULL) { - thneed->command_queue = command_queue; - clGetKernelInfo(kernel, CL_KERNEL_CONTEXT, sizeof(thneed->context), &thneed->context, NULL); - clGetContextInfo(thneed->context, CL_CONTEXT_DEVICES, sizeof(thneed->device_id), &thneed->device_id, NULL); - } - - // if we are recording, we don't actually enqueue the kernel - thneed->kq.push_back(unique_ptr(new CLQueuedKernel(thneed, kernel, work_dim, global_work_size, local_work_size))); - *event = NULL; - } else { - ret = clEnqueueNDRangeKernel(command_queue, kernel, work_dim, - global_work_offset, global_work_size, local_work_size, - num_events_in_wait_list, event_wait_list, event); - } - - return ret; -} - -cl_int thneed_clFinish(cl_command_queue command_queue) { - Thneed *thneed = g_thneed; - - if (thneed != NULL && thneed->record) { - if (thneed->run_optimizer) thneed->optimize(); - return thneed->clexec(); - } else { - return clFinish(command_queue); - } -} - -void *dlsym(void *handle, const char *symbol) { -#ifdef QCOM2 - void *(*my_dlsym)(void *handle, const char *symbol) = (void *(*)(void *handle, const char *symbol))((uintptr_t)dlopen + DLSYM_OFFSET); -#else - #error "Unsupported platform for thneed" -#endif - if (memcmp("REAL_", symbol, 5) == 0) { - return my_dlsym(handle, symbol+5); - } else if (strcmp("clFinish", symbol) == 0) { - return (void*)thneed_clFinish; - } else if (strcmp("clEnqueueNDRangeKernel", symbol) == 0) { - return (void*)thneed_clEnqueueNDRangeKernel; - } else if (strcmp("clSetKernelArg", symbol) == 0) { - return (void*)thneed_clSetKernelArg; - } else if (strcmp("clCreateProgramWithSource", symbol) == 0) { - return (void*)thneed_clCreateProgramWithSource; - } else { - return my_dlsym(handle, symbol); - } -} diff --git a/selfdrive/modeld/thneed/weights_fixup.py b/selfdrive/modeld/thneed/weights_fixup.py deleted file mode 100755 index 539b1b5d32..0000000000 --- a/selfdrive/modeld/thneed/weights_fixup.py +++ /dev/null @@ -1,146 +0,0 @@ -#!/usr/bin/env python3 -import os -import struct -import zipfile -import numpy as np -from tqdm import tqdm - -from common.basedir import BASEDIR -from selfdrive.modeld.thneed.lib import load_thneed, save_thneed - -# this is junk code, but it doesn't have deps -def load_dlc_weights(fn): - archive = zipfile.ZipFile(fn, 'r') - dlc_params = archive.read("model.params") - - def extract(rdat): - idx = rdat.find(b"\x00\x00\x00\x09\x04\x00\x00\x00") - rdat = rdat[idx+8:] - ll = struct.unpack("I", rdat[0:4])[0] - buf = np.frombuffer(rdat[4:4+ll*4], dtype=np.float32) - rdat = rdat[4+ll*4:] - dims = struct.unpack("I", rdat[0:4])[0] - buf = buf.reshape(struct.unpack("I"*dims, rdat[4:4+dims*4])) - if len(buf.shape) == 4: - buf = np.transpose(buf, (3,2,0,1)) - return buf - - def parse(tdat): - ll = struct.unpack("I", tdat[0:4])[0] + 4 - return (None, [extract(tdat[0:]), extract(tdat[ll:])]) - - ptr = 0x20 - def r4(): - nonlocal ptr - ret = struct.unpack("I", dlc_params[ptr:ptr+4])[0] - ptr += 4 - return ret - ranges = [] - cnt = r4() - for _ in range(cnt): - o = r4() + ptr - # the header is 0xC - plen, is_4, is_2 = struct.unpack("III", dlc_params[o:o+0xC]) - assert is_4 == 4 and is_2 == 2 - ranges.append((o+0xC, o+plen+0xC)) - ranges = sorted(ranges, reverse=True) - - return [parse(dlc_params[s:e]) for s,e in ranges] - -# this won't run on device without onnx -def load_onnx_weights(fn): - import onnx - from onnx import numpy_helper - - model = onnx.load(fn) - graph = model.graph # pylint: disable=maybe-no-member - init = {x.name:x for x in graph.initializer} - - onnx_layers = [] - for node in graph.node: - #print(node.name, node.op_type, node.input, node.output) - vals = [] - for inp in node.input: - if inp in init: - vals.append(numpy_helper.to_array(init[inp])) - if len(vals) > 0: - onnx_layers.append((node.name, vals)) - return onnx_layers - -def weights_fixup(target, source_thneed, dlc): - #onnx_layers = load_onnx_weights(os.path.join(BASEDIR, "models/supercombo.onnx")) - onnx_layers = load_dlc_weights(dlc) - jdat = load_thneed(source_thneed) - - bufs = {} - for o in jdat['objects']: - bufs[o['id']] = o - - thneed_layers = [] - for k in jdat['kernels']: - #print(k['name']) - vals = [] - for a in k['args']: - if a in bufs: - o = bufs[a] - if o['needs_load'] or ('buffer_id' in o and bufs[o['buffer_id']]['needs_load']): - #print(" ", o['arg_type']) - vals.append(o) - if len(vals) > 0: - thneed_layers.append((k['name'], vals)) - - assert len(thneed_layers) == len(onnx_layers) - - # fix up weights - for tl, ol in tqdm(zip(thneed_layers, onnx_layers), total=len(thneed_layers)): - #print(tl[0], ol[0]) - assert len(tl[1]) == len(ol[1]) - for o, onnx_weight in zip(tl[1], ol[1]): - if o['arg_type'] == "image2d_t": - obuf = bufs[o['buffer_id']] - saved_weights = np.frombuffer(obuf['data'], dtype=np.float16).reshape(o['height'], o['row_pitch']//2) - - if len(onnx_weight.shape) == 4: - # convolution - oc,ic,ch,cw = onnx_weight.shape - - if 'depthwise' in tl[0]: - assert ic == 1 - weights = np.transpose(onnx_weight.reshape(oc//4,4,ch,cw), (0,2,3,1)).reshape(o['height'], o['width']*4) - else: - weights = np.transpose(onnx_weight.reshape(oc//4,4,ic//4,4,ch,cw), (0,4,2,5,1,3)).reshape(o['height'], o['width']*4) - else: - # fc_Wtx - weights = onnx_weight - - new_weights = np.zeros((o['height'], o['row_pitch']//2), dtype=np.float32) - new_weights[:, :weights.shape[1]] = weights - - # weights shouldn't be too far off - err = np.mean((saved_weights.astype(np.float32) - new_weights)**2) - assert err < 1e-3 - rerr = np.mean(np.abs((saved_weights.astype(np.float32) - new_weights)/(new_weights+1e-12))) - assert rerr < 0.5 - - # fix should improve things - fixed_err = np.mean((new_weights.astype(np.float16).astype(np.float32) - new_weights)**2) - assert (err/fixed_err) >= 1 - - #print(" ", o['size'], onnx_weight.shape, o['row_pitch'], o['width'], o['height'], "err %.2fx better" % (err/fixed_err)) - - obuf['data'] = new_weights.astype(np.float16).tobytes() - - elif o['arg_type'] == "float*": - # unconverted floats are correct - new_weights = np.zeros(o['size']//4, dtype=np.float32) - new_weights[:onnx_weight.shape[0]] = onnx_weight - assert new_weights.tobytes() == o['data'] - #print(" ", o['size'], onnx_weight.shape) - - save_thneed(jdat, target) - -if __name__ == "__main__": - model_dir = os.path.join(BASEDIR, "selfdrive/modeld/models/") - weights_fixup(os.path.join(model_dir, "supercombo_fixed.thneed"), - os.path.join(model_dir, "supercombo.thneed"), - os.path.join(model_dir, "supercombo.dlc"))