modeld: delete unused SNPE stuff after move to tinygrad (#25635)
* delete unused stuff
* remove CL interceptor from thneed since we don't use SNPE anymore
* remove dead files from release
* that's removed
* oops, didn't save
old-commit-hash: 6c39382d71
taco
parent 42f56d1013
commit 273fab518b
18 changed files with 14 additions and 1158 deletions
@@ -1,81 +0,0 @@
#include <cstring>
#include <getopt.h>

#include "selfdrive/modeld/runners/snpemodel.h"
#include "selfdrive/modeld/thneed/thneed.h"
#include "system/hardware/hw.h"

#define TEMPORAL_SIZE 512
#define DESIRE_LEN 8
#define TRAFFIC_CONVENTION_LEN 2

// TODO: This should probably use SNPE directly.
int main(int argc, char* argv[]) {
  bool run_optimizer = false, save_binaries = false;
  const char *input_file = NULL, *output_file = NULL;
  static struct option long_options[] = {
    {"in", required_argument, 0, 'i' },
    {"out", required_argument, 0, 'o' },
    {"binary", no_argument, 0, 'b' },
    {"optimize", no_argument, 0, 'f' },
    {0, 0, 0, 0 }
  };
  int long_index = 0, opt = 0;
  while ((opt = getopt_long_only(argc, argv,"", long_options, &long_index)) != -1) {
    switch (opt) {
      case 'i': input_file = optarg; break;
      case 'o': output_file = optarg; break;
      case 'b': save_binaries = true; break;
      case 'f': run_optimizer = true; break;
    }
  }

  // no input?
  if (!input_file) {
    printf("usage: -i <input file> -o <output file> --binary --optimize\n");
    return -1;
  }

#define OUTPUT_SIZE 0x10000

  float *output = (float*)calloc(OUTPUT_SIZE, sizeof(float));
  SNPEModel mdl(input_file, output, 0, USE_GPU_RUNTIME, true);
  mdl.thneed->run_optimizer = run_optimizer;

  float state[TEMPORAL_SIZE] = {0};
  float desire[DESIRE_LEN] = {0};
  float traffic_convention[TRAFFIC_CONVENTION_LEN] = {0};
  float *input = (float*)calloc(0x1000000, sizeof(float));
  float *extra = (float*)calloc(0x1000000, sizeof(float));

  mdl.addRecurrent(state, TEMPORAL_SIZE);
  mdl.addDesire(desire, DESIRE_LEN);
  mdl.addTrafficConvention(traffic_convention, TRAFFIC_CONVENTION_LEN);
  mdl.addImage(input, 0);
  mdl.addExtra(extra, 0);

  // first run
  printf("************** execute 1 **************\n");
  memset(output, 0, OUTPUT_SIZE * sizeof(float));
  mdl.execute();

  // don't save?
  if (!output_file) {
    printf("no output file, exiting\n");
    return 0;
  }

  // save model
  printf("saving %s with binary %d\n", output_file, save_binaries);
  mdl.thneed->save(output_file, save_binaries);

  // test model
  auto thneed = new Thneed(true);
  thneed->record = false;
  thneed->load(output_file);
  thneed->clexec();
  thneed->find_inputs_outputs();

  return 0;
}

@@ -1,272 +0,0 @@
  read_only image2d_t input,
#ifndef DEPTHWISE
  short startPackedInputChannel,
  short numPackedInputChannelsForGroup, short totalNumPackedInputChannels,
  // typo required for API compatibility
  short packedOuputChannelOffset, short totalNumPackedOutputChannels,
#else
  short totalNumPackedChannels,
#endif
  read_only image2d_t weights, __constant float *biases,
  short filterSizeX, short filterSizeY,
  write_only image2d_t output,
  short paddingX, short paddingY, short strideX, short strideY,
#ifdef SUPPORT_DILATION
  short dilationX, short dilationY,
#endif
  short neuron, float a, float b, float min_clamp, float max_clamp,
#ifndef DEPTHWISE
  // note: these are not supported
  __constant float *parameters, __constant float *batchNormBiases,
#endif
  short numOutputColumns
#ifdef SUPPORT_ACCUMULATION
  , short doAccumulate, read_only image2d_t accumulator
#endif
) {

#ifndef NUM_OUTPUTS
#define NUM_OUTPUTS 4
#endif

  // init
  const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
  short packedOutputChannel = get_global_id(0);
  short startOutputColumn = mul24((short)get_global_id(1), NUM_OUTPUTS);
  short outputRow = get_global_id(2);

#ifdef DEPTHWISE
  short totalNumPackedInputChannels = totalNumPackedChannels;
  short totalNumPackedOutputChannels = totalNumPackedChannels;
  short startPackedInputChannel = packedOutputChannel;
#endif

  short startX = mad24(mad24(startOutputColumn, strideX, -paddingX), totalNumPackedInputChannels, startPackedInputChannel);
  short strideWithChannels = mul24(strideX, totalNumPackedInputChannels);

  float4 outputValues[NUM_OUTPUTS];
  for (short i = 0; i < NUM_OUTPUTS; ++i) {
    outputValues[i] = (float4)(0, 0, 0, 0);
  }

  int2 inputLocation;
  inputLocation.y = mad24(outputRow, strideY, -paddingY);

  int2 weightLocation;
  weightLocation.x = 0;
  weightLocation.y = packedOutputChannel;

#ifdef DEPTHWISE

#ifdef SUPPORT_DILATION

  // depthwise convolution
  for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) {
    for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) {
      short dilatedStepX = mul24(totalNumPackedChannels, dilationX);
      inputLocation.x = mad24(rfColumn, dilatedStepX, startX);
      float4 inputValues[4];
      for (short i = 0; i < 4; ++i) {
        inputValues[i] = read_imagef(input, smp, inputLocation);
        inputLocation.x += strideWithChannels;
      }
      float4 weightValues = read_imagef(weights, smp, weightLocation);
      ++weightLocation.x;
      outputValues[0] += inputValues[0] * weightValues;
      outputValues[1] += inputValues[1] * weightValues;
      outputValues[2] += inputValues[2] * weightValues;
      outputValues[3] += inputValues[3] * weightValues;
    }
    inputLocation.y += dilationY;
  }

#else

  // depthwise unstrided convolution
  for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) {
    float4 inputValues[4];
    inputLocation.x = startX;
    for (short i = 1; i < 4; ++i) {
      inputValues[i] = read_imagef(input, smp, inputLocation);
      inputLocation.x += totalNumPackedOutputChannels;
    }
    for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) {
      inputValues[0] = inputValues[1];
      inputValues[1] = inputValues[2];
      inputValues[2] = inputValues[3];
      inputValues[3] = read_imagef(input, smp, inputLocation);
      inputLocation.x += totalNumPackedChannels;
      float4 weightValues = read_imagef(weights, smp, weightLocation);
      ++weightLocation.x;
      outputValues[0] += inputValues[0] * weightValues;
      outputValues[1] += inputValues[1] * weightValues;
      outputValues[2] += inputValues[2] * weightValues;
      outputValues[3] += inputValues[3] * weightValues;
    }
    ++inputLocation.y;
  }

#endif

#elif defined(ONLY_1X1_CONV)

  // 1x1 convolution
  short endPackedInputChannel = startPackedInputChannel + numPackedInputChannelsForGroup;
  for (short packedInputChannel = startPackedInputChannel; packedInputChannel < endPackedInputChannel; ++packedInputChannel) {
    float4 weightValues[4];
    for (short outChIdx = 0; outChIdx < 4; ++outChIdx) {
      weightValues[outChIdx] = read_imagef(weights, smp, weightLocation);
      ++weightLocation.x;
    }

    inputLocation.x = startX + packedInputChannel;
    float4 inputValues[NUM_OUTPUTS];
    for (short i = 0; i < NUM_OUTPUTS; ++i) {
      inputValues[i] = read_imagef(input, smp, inputLocation);
      inputLocation.x += strideWithChannels;
    }

    for (short i = 0; i < NUM_OUTPUTS; ++i) {
      float4 curOutputValues = outputValues[i];
      curOutputValues.x += inputValues[i].x * weightValues[0].x;
      curOutputValues.x += inputValues[i].y * weightValues[0].y;
      curOutputValues.x += inputValues[i].z * weightValues[0].z;
      curOutputValues.x += inputValues[i].w * weightValues[0].w;
      curOutputValues.y += inputValues[i].x * weightValues[1].x;
      curOutputValues.y += inputValues[i].y * weightValues[1].y;
      curOutputValues.y += inputValues[i].z * weightValues[1].z;
      curOutputValues.y += inputValues[i].w * weightValues[1].w;
      curOutputValues.z += inputValues[i].x * weightValues[2].x;
      curOutputValues.z += inputValues[i].y * weightValues[2].y;
      curOutputValues.z += inputValues[i].z * weightValues[2].z;
      curOutputValues.z += inputValues[i].w * weightValues[2].w;
      curOutputValues.w += inputValues[i].x * weightValues[3].x;
      curOutputValues.w += inputValues[i].y * weightValues[3].y;
      curOutputValues.w += inputValues[i].z * weightValues[3].z;
      curOutputValues.w += inputValues[i].w * weightValues[3].w;
      outputValues[i] = curOutputValues;
    }
  }
  packedOutputChannel += packedOuputChannelOffset;

#else

  // normal convolution
  for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) {
    for (short packedInputChannel = 0; packedInputChannel < numPackedInputChannelsForGroup; ++packedInputChannel) {
      short startXForChannel = startX + packedInputChannel;
      for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) {

        float4 weightValues[4];
        for (short outChIdx = 0; outChIdx < 4; ++outChIdx) {
          weightValues[outChIdx] = read_imagef(weights, smp, weightLocation);
          ++weightLocation.x;
        }

#ifdef SUPPORT_DILATION
        short dilatedStepX = mul24(totalNumPackedInputChannels, dilationX);
        inputLocation.x = mad24(rfColumn, dilatedStepX, startXForChannel);
#else
        inputLocation.x = mad24(rfColumn, totalNumPackedInputChannels, startXForChannel);
#endif
        float4 inputValues[NUM_OUTPUTS];
        for (short i = 0; i < NUM_OUTPUTS; ++i) {
          inputValues[i] = read_imagef(input, smp, inputLocation);
          inputLocation.x += strideWithChannels;
        }
        for (short i = 0; i < NUM_OUTPUTS; ++i) {
          float4 curOutputValues = outputValues[i];
          curOutputValues.x += inputValues[i].x * weightValues[0].x;
          curOutputValues.x += inputValues[i].y * weightValues[0].y;
          curOutputValues.x += inputValues[i].z * weightValues[0].z;
          curOutputValues.x += inputValues[i].w * weightValues[0].w;
          curOutputValues.y += inputValues[i].x * weightValues[1].x;
          curOutputValues.y += inputValues[i].y * weightValues[1].y;
          curOutputValues.y += inputValues[i].z * weightValues[1].z;
          curOutputValues.y += inputValues[i].w * weightValues[1].w;
          curOutputValues.z += inputValues[i].x * weightValues[2].x;
          curOutputValues.z += inputValues[i].y * weightValues[2].y;
          curOutputValues.z += inputValues[i].z * weightValues[2].z;
          curOutputValues.z += inputValues[i].w * weightValues[2].w;
          curOutputValues.w += inputValues[i].x * weightValues[3].x;
          curOutputValues.w += inputValues[i].y * weightValues[3].y;
          curOutputValues.w += inputValues[i].z * weightValues[3].z;
          curOutputValues.w += inputValues[i].w * weightValues[3].w;
          outputValues[i] = curOutputValues;
        }
      }
    }
#ifdef SUPPORT_DILATION
    inputLocation.y += dilationY;
#else
    ++inputLocation.y;
#endif
  }
  packedOutputChannel += packedOuputChannelOffset;
#endif

  // bias
  short outputChannel = mul24(packedOutputChannel, 4);
  float4 biasValues = vload4(0, biases + outputChannel);
  for (short i = 0; i < NUM_OUTPUTS; ++i) {
    outputValues[i] += biasValues;
  }

#ifdef SUPPORT_ACCUMULATION
  // accumulate
  if (doAccumulate) {
    int2 outputLocation;
    short outputColumn = startOutputColumn;
    outputLocation.y = outputRow;
    for (short i = 0; i < NUM_OUTPUTS; ++i) {
      outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel);
      if (outputColumn < numOutputColumns) {
        outputValues[i] += read_imagef(accumulator, smp, outputLocation);
      }
      ++outputColumn;
    }
  }
#endif

  // activation
  switch (neuron) {
    case 1:
      for (short i = 0; i < NUM_OUTPUTS; ++i) {
        outputValues[i] = max(outputValues[i], 0.0f);
      }
      break;
    case 2:
      for (short i = 0; i < NUM_OUTPUTS; ++i) {
        outputValues[i] = a * tanh(b * outputValues[i]);
      }
      break;
    case 3:
      for (short i = 0; i < NUM_OUTPUTS; ++i) {
        outputValues[i] = native_recip(1.0f + native_exp(-a * outputValues[i] + b));
      }
      break;
    case 4:
      for (short i = 0; i < NUM_OUTPUTS; ++i) {
        outputValues[i] = max(outputValues[i], min_clamp);
        outputValues[i] = min(outputValues[i], max_clamp);
      }
      break;
    case 5:
      for (short i = 0; i < NUM_OUTPUTS; ++i) {
        outputValues[i] = max(outputValues[i], 0.0f) + a * (native_exp(min(outputValues[i], 0.0f)) - 1.0f);
      }
      break;
  }

  // output
  int2 outputLocation;
  short outputColumn = startOutputColumn;
  outputLocation.y = outputRow;
  for (short i = 0; i < NUM_OUTPUTS; ++i) {
    outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel);
    if (outputColumn < numOutputColumns) {
      write_imagef(output, outputLocation, outputValues[i]);
    }
    ++outputColumn;
  }
}
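For reference, the neuron codes handled by the activation switch above are the same codes the fusion pass in optimizer.cc (further down) writes into the conv kernels' args when it folds an activate_image kernel away. The following host-side C++ sketch of that mapping is added here for documentation only; it is not part of any deleted file, and the pass-through behavior for an unrecognized code is an assumption.

#include <algorithm>
#include <cmath>

// Host-side reference for the "neuron" activation codes used by the OpenCL
// kernel above and by the fusion pass in Thneed::optimize() below.
static float apply_neuron(short neuron, float x, float a, float b,
                          float min_clamp, float max_clamp) {
  switch (neuron) {
    case 1: return std::max(x, 0.0f);                                 // ReLU
    case 2: return a * std::tanh(b * x);                              // scaled tanh
    case 3: return 1.0f / (1.0f + std::exp(-a * x + b));              // sigmoid
    case 4: return std::min(std::max(x, min_clamp), max_clamp);       // clamp
    case 5: return std::max(x, 0.0f)
                 + a * (std::exp(std::min(x, 0.0f)) - 1.0f);          // ELU
    default: return x;  // assumed pass-through when no activation is fused
  }
}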
@@ -1,3 +0,0 @@
#define SUPPORT_DILATION

__kernel void convolution_horizontal_reduced_reads(
@@ -1,4 +0,0 @@
#define ONLY_1X1_CONV
#define SUPPORT_ACCUMULATION

__kernel void convolution_horizontal_reduced_reads_1x1(
@@ -1,3 +0,0 @@
#define NUM_OUTPUTS 5

__kernel void convolution_horizontal_reduced_reads_5_outputs(
@@ -1,4 +0,0 @@
#define DEPTHWISE
#define SUPPORT_DILATION

__kernel void convolution_horizontal_reduced_reads_depthwise(
@@ -1,3 +0,0 @@
#define DEPTHWISE

__kernel void convolution_horizontal_reduced_reads_depthwise_stride_1(
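These small wrapper files only set the #defines and the kernel signature; the shared body shown above (convolution_.cl) is appended to them at load time by Thneed::optimize() in optimizer.cc below (kernel_src += convolution_;). A minimal standalone C++ sketch of that assembly step follows; the read_file helper here is illustrative only, standing in for util::read_file() from common/util.h.

#include <fstream>
#include <sstream>
#include <string>

// Illustrative stand-in for util::read_file() from common/util.h.
static std::string read_file(const std::string &path) {
  std::ifstream f(path);
  std::stringstream ss;
  ss << f.rdbuf();
  return ss.str();
}

// Assemble the full OpenCL source for one specialized kernel: the wrapper file
// (#defines + signature) followed by the shared convolution_ body.
std::string build_kernel_source(const std::string &kernel_path, const std::string &name) {
  std::string src = read_file(kernel_path + "/" + name + ".cl");
  if (name.rfind("convolution_", 0) == 0) {
    src += read_file(kernel_path + "/convolution_.cl");  // shared body
  }
  return src;  // the optimizer builds this with cl_program_from_source()
}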
@@ -1,261 +0,0 @@
#include <map>
#include <string>
#include <string.h>
#include <assert.h>
#include "thneed.h"

#include "common/util.h"
#include "common/clutil.h"

extern map<cl_program, string> g_program_source;

/*static int is_same_size_image(cl_mem a, cl_mem b) {
  size_t a_width, a_height, a_depth, a_array_size, a_row_pitch, a_slice_pitch;
  clGetImageInfo(a, CL_IMAGE_WIDTH, sizeof(a_width), &a_width, NULL);
  clGetImageInfo(a, CL_IMAGE_HEIGHT, sizeof(a_height), &a_height, NULL);
  clGetImageInfo(a, CL_IMAGE_DEPTH, sizeof(a_depth), &a_depth, NULL);
  clGetImageInfo(a, CL_IMAGE_ARRAY_SIZE, sizeof(a_array_size), &a_array_size, NULL);
  clGetImageInfo(a, CL_IMAGE_ROW_PITCH, sizeof(a_row_pitch), &a_row_pitch, NULL);
  clGetImageInfo(a, CL_IMAGE_SLICE_PITCH, sizeof(a_slice_pitch), &a_slice_pitch, NULL);

  size_t b_width, b_height, b_depth, b_array_size, b_row_pitch, b_slice_pitch;
  clGetImageInfo(b, CL_IMAGE_WIDTH, sizeof(b_width), &b_width, NULL);
  clGetImageInfo(b, CL_IMAGE_HEIGHT, sizeof(b_height), &b_height, NULL);
  clGetImageInfo(b, CL_IMAGE_DEPTH, sizeof(b_depth), &b_depth, NULL);
  clGetImageInfo(b, CL_IMAGE_ARRAY_SIZE, sizeof(b_array_size), &b_array_size, NULL);
  clGetImageInfo(b, CL_IMAGE_ROW_PITCH, sizeof(b_row_pitch), &b_row_pitch, NULL);
  clGetImageInfo(b, CL_IMAGE_SLICE_PITCH, sizeof(b_slice_pitch), &b_slice_pitch, NULL);

  return (a_width == b_width) && (a_height == b_height) &&
    (a_depth == b_depth) && (a_array_size == b_array_size) &&
    (a_row_pitch == b_row_pitch) && (a_slice_pitch == b_slice_pitch);
}*/

static cl_mem make_image_like(cl_context context, cl_mem val) {
  cl_image_format format;
  size_t width, height, row_pitch;
  clGetImageInfo(val, CL_IMAGE_FORMAT, sizeof(format), &format, NULL);
  assert(format.image_channel_order == CL_RGBA);
  assert(format.image_channel_data_type == CL_HALF_FLOAT);
  clGetImageInfo(val, CL_IMAGE_WIDTH, sizeof(width), &width, NULL);
  clGetImageInfo(val, CL_IMAGE_HEIGHT, sizeof(height), &height, NULL);
  clGetImageInfo(val, CL_IMAGE_ROW_PITCH, sizeof(row_pitch), &row_pitch, NULL);

  cl_image_desc desc = {0};
  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
  desc.image_width = width;
  desc.image_height = height;
  desc.image_row_pitch = row_pitch;

  cl_mem buf = clCreateBuffer(context, CL_MEM_READ_WRITE, row_pitch*height, NULL, NULL);
  assert(buf != NULL);
  desc.buffer = buf;

  cl_int err;
  cl_mem tmp = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
  //printf("got %d for image %zux%zu %zu\n", err, width, height, row_pitch);
  assert(tmp != NULL);

  return tmp;
}

// convolution_horizontal_reduced_reads_1x1 is 66% of the model runtime
// make that faster and the model gets faster

// this cuts ~2 ms off the model runtime right now
int Thneed::optimize() {
  const char *kernel_path = getenv("KERNEL_PATH");
  if (!kernel_path) { kernel_path = "/data/openpilot/selfdrive/modeld/thneed/kernels"; printf("no KERNEL_PATH set, defaulting to %s\n", kernel_path); }

  string convolution_;
  {
    char fn[0x100];
    snprintf(fn, sizeof(fn), "%s/%s.cl", kernel_path, "convolution_");
    convolution_ = util::read_file(fn);
  }

  // load custom kernels
  map<string, cl_program> g_programs;
  for (auto &k : kq) {
    // replace program?
    if (g_programs.find(k->name) == g_programs.end()) {
      char fn[0x100];
      snprintf(fn, sizeof(fn), "%s/%s.cl", kernel_path, k->name.c_str());
      if (util::file_exists(fn)) {
        string kernel_src = util::read_file(fn);
        if (k->name.rfind("convolution_", 0) == 0) {
          kernel_src += convolution_;
        }
        printf("building kernel %s with len %lu\n", k->name.c_str(), kernel_src.length());
        k->program = cl_program_from_source(context, device_id, kernel_src);

        // save in cache
        g_programs[k->name] = k->program;
        g_program_source[k->program] = kernel_src;
      } else {
        g_programs[k->name] = NULL;
      }
    } else {
      // cached replacement
      if (g_programs[k->name] != NULL) {
        k->program = g_programs[k->name];
      }
    }

    // hack in accumulator to convolution_horizontal_reduced_reads_1x1
    if (k->name == "convolution_horizontal_reduced_reads_1x1") {
      k->arg_names.push_back("doAccumulate");
      short doAccumulate = 0;
      k->args.push_back(string((char *)&doAccumulate, sizeof(doAccumulate)));
      k->args_size.push_back(2);
      k->arg_names.push_back("accumulator");
      k->args.push_back(k->args[k->get_arg_num("output")]);
      k->args_size.push_back(8);
      k->num_args += 2;
    }

    // assert that parameters + batchNormBiases are not used
    // since they aren't supported in custom replacement kernels
    if (k->name == "convolution_horizontal_reduced_reads_1x1" ||
        k->name == "convolution_horizontal_reduced_reads" ||
        k->name == "convolution_horizontal_reduced_reads_5_outputs") {
      string p1 = k->args[k->get_arg_num("parameters")];
      string p2 = k->args[k->get_arg_num("batchNormBiases")];
      assert(p1.length() == 8 && *((uint64_t*)p1.data()) == 0);
      assert(p2.length() == 8 && *((uint64_t*)p2.data()) == 0);
    }
  }

  // optimizer
  size_t start_size;
  do {
    start_size = kq.size();

    // get optimizations
    map<string, string> replacements;
    for (int i = 0; i < kq.size(); i++) {
      // fusing elementwise_sum + activate_image will save 3 enqueues

      // delete useless copy layers
      // saves ~0.7 ms
      /*if (kq[i]->name == "concatenation" || kq[i]->name == "flatten") {
        string in = kq[i]->args[kq[i]->get_arg_num("input")];
        string out = kq[i]->args[kq[i]->get_arg_num("output")];
        if (is_same_size_image(*(cl_mem*)in.data(), *(cl_mem*)out.data())) {
          cl_mem tmp = make_image_like(context, *(cl_mem *)in.data());
          replacements[in] = string((char *)&tmp, sizeof(tmp));
          replacements[out] = string((char *)&tmp, sizeof(tmp));

          kq.erase(kq.begin()+i); --i;
        }
      }*/

      // NOTE: if activations/accumulation are done in the wrong order, this will be wrong

      // fuse activations into convs and fc_Wtx
      // saves ~1.5 ms
      // NOTE: this changes the outputs because of rounding, should be better now!
      if (i != 0 && kq[i]->name == "activate_image") {
        if (kq[i-1]->name == "convolution_horizontal_reduced_reads_1x1" ||
            kq[i-1]->name == "convolution_horizontal_reduced_reads_5_outputs" ||
            kq[i-1]->name == "convolution_horizontal_reduced_reads" ||
            kq[i-1]->name == "convolution_horizontal_reduced_reads_depthwise" ||
            kq[i-1]->name == "convolution_horizontal_reduced_reads_depthwise_stride_1" ||
            kq[i-1]->name == "fc_Wtx") {
          string lastout = kq[i-1]->args[kq[i-1]->get_arg_num("output")];
          string in = kq[i]->args[kq[i]->get_arg_num("input")];
          string out = kq[i]->args[kq[i]->get_arg_num("output")];

          if (lastout == in) {
            short neuron = *(int*)kq[i]->args[kq[i]->get_arg_num("neuron")].data();
            assert(neuron <= 5);

            // ELU isn't supported in fc_Wtx
            assert(!(kq[i-1]->name == "fc_Wtx" && neuron == 5));

            kq[i-1]->args[kq[i-1]->get_arg_num("neuron")] = string((char *)&neuron, sizeof(neuron));

            cl_mem tmp = make_image_like(context, *(cl_mem *)lastout.data());
            replacements[in] = string((char *)&tmp, sizeof(tmp));
            replacements[out] = string((char *)&tmp, sizeof(tmp));

            kq.erase(kq.begin()+i); --i;
          }
        }
      }

      // fuse accumulation into convs and fc_Wtx
      if (i != 0 && kq[i]->name == "elementwise_sum") {
        if (kq[i-1]->name == "convolution_horizontal_reduced_reads_1x1" ||
            kq[i-1]->name == "fc_Wtx") {
          string lastout = kq[i-1]->args[kq[i-1]->get_arg_num("output")];
          string a = kq[i]->args[kq[i]->get_arg_num("a")];
          string b = kq[i]->args[kq[i]->get_arg_num("b")];
          string out = kq[i]->args[kq[i]->get_arg_num("output")];

          if (lastout == a) {
            kq[i-1]->args[kq[i-1]->get_arg_num("accumulator")] = b;
          } else if (lastout == b) {
            kq[i-1]->args[kq[i-1]->get_arg_num("accumulator")] = a;
          } else {
            continue;
          }

          cl_mem tmp = make_image_like(context, *(cl_mem *)lastout.data());
          replacements[lastout] = string((char *)&tmp, sizeof(tmp));
          replacements[out] = string((char *)&tmp, sizeof(tmp));

          short doAccumulate = 1;
          kq[i-1]->args[kq[i-1]->get_arg_num("doAccumulate")] = string((char *)&doAccumulate, sizeof(doAccumulate));

          kq.erase(kq.begin()+i); --i;
        }
      }
    }

    // remap inputs and outputs, and clear the kernels
    for (int i = 0; i < kq.size(); i++) {
      kq[i]->kernel = NULL;
      for (int j = 0; j < kq[i]->num_args; j++) {
        if (replacements.find(kq[i]->args[j]) != replacements.end()) {
          kq[i]->args[j] = replacements[kq[i]->args[j]];
        }
      }
    }

    printf("optimize %lu -> %lu\n", start_size, kq.size());
  } while (kq.size() != start_size);

  size_t work_group_size = 0;
  clGetDeviceInfo(device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(work_group_size), &work_group_size, NULL);
  printf("max work group size %lu\n", work_group_size);

  // local work group optimizer
  for (auto &k : kq) {
    // only do it for convs, since others might share memory
    if (k->name.rfind("convolution_", 0) == 0) {
      int best = -1;
      if (k->local_work_size[0] * k->local_work_size[1] * k->local_work_size[2] < work_group_size/2) {
        uint64_t base_time = k->benchmark();
        uint64_t best_time = base_time;
        for (int i = 0; i < 3; i++) {
          k->local_work_size[i] *= 2;
          uint64_t this_time = k->benchmark();
          if (this_time < best_time) {
            best = i;
            best_time = this_time;
          }
          k->local_work_size[i] /= 2;
        }
        if (best != -1) {
          k->local_work_size[best] *= 2;
          //printf("%s %.2f ms doubled %d to %.2f ms\n", k->name.c_str(), base_time/1e6, best, best_time/1e6);
        }
      }

    }
  }

  return 0;
}

@@ -1,146 +0,0 @@
#!/usr/bin/env python3
import os
import struct
import zipfile
import numpy as np
from tqdm import tqdm

from common.basedir import BASEDIR
from selfdrive.modeld.thneed.lib import load_thneed, save_thneed

# this is junk code, but it doesn't have deps
def load_dlc_weights(fn):
  archive = zipfile.ZipFile(fn, 'r')
  dlc_params = archive.read("model.params")

  def extract(rdat):
    idx = rdat.find(b"\x00\x00\x00\x09\x04\x00\x00\x00")
    rdat = rdat[idx+8:]
    ll = struct.unpack("I", rdat[0:4])[0]
    buf = np.frombuffer(rdat[4:4+ll*4], dtype=np.float32)
    rdat = rdat[4+ll*4:]
    dims = struct.unpack("I", rdat[0:4])[0]
    buf = buf.reshape(struct.unpack("I"*dims, rdat[4:4+dims*4]))
    if len(buf.shape) == 4:
      buf = np.transpose(buf, (3,2,0,1))
    return buf

  def parse(tdat):
    ll = struct.unpack("I", tdat[0:4])[0] + 4
    return (None, [extract(tdat[0:]), extract(tdat[ll:])])

  ptr = 0x20
  def r4():
    nonlocal ptr
    ret = struct.unpack("I", dlc_params[ptr:ptr+4])[0]
    ptr += 4
    return ret
  ranges = []
  cnt = r4()
  for _ in range(cnt):
    o = r4() + ptr
    # the header is 0xC
    plen, is_4, is_2 = struct.unpack("III", dlc_params[o:o+0xC])
    assert is_4 == 4 and is_2 == 2
    ranges.append((o+0xC, o+plen+0xC))
  ranges = sorted(ranges, reverse=True)

  return [parse(dlc_params[s:e]) for s,e in ranges]

# this won't run on device without onnx
def load_onnx_weights(fn):
  import onnx
  from onnx import numpy_helper

  model = onnx.load(fn)
  graph = model.graph  # pylint: disable=maybe-no-member
  init = {x.name:x for x in graph.initializer}

  onnx_layers = []
  for node in graph.node:
    #print(node.name, node.op_type, node.input, node.output)
    vals = []
    for inp in node.input:
      if inp in init:
        vals.append(numpy_helper.to_array(init[inp]))
    if len(vals) > 0:
      onnx_layers.append((node.name, vals))
  return onnx_layers

def weights_fixup(target, source_thneed, dlc):
  #onnx_layers = load_onnx_weights(os.path.join(BASEDIR, "models/supercombo.onnx"))
  onnx_layers = load_dlc_weights(dlc)
  jdat = load_thneed(source_thneed)

  bufs = {}
  for o in jdat['objects']:
    bufs[o['id']] = o

  thneed_layers = []
  for k in jdat['kernels']:
    #print(k['name'])
    vals = []
    for a in k['args']:
      if a in bufs:
        o = bufs[a]
        if o['needs_load'] or ('buffer_id' in o and bufs[o['buffer_id']]['needs_load']):
          #print("  ", o['arg_type'])
          vals.append(o)
    if len(vals) > 0:
      thneed_layers.append((k['name'], vals))

  assert len(thneed_layers) == len(onnx_layers)

  # fix up weights
  for tl, ol in tqdm(zip(thneed_layers, onnx_layers), total=len(thneed_layers)):
    #print(tl[0], ol[0])
    assert len(tl[1]) == len(ol[1])
    for o, onnx_weight in zip(tl[1], ol[1]):
      if o['arg_type'] == "image2d_t":
        obuf = bufs[o['buffer_id']]
        saved_weights = np.frombuffer(obuf['data'], dtype=np.float16).reshape(o['height'], o['row_pitch']//2)

        if len(onnx_weight.shape) == 4:
          # convolution
          oc,ic,ch,cw = onnx_weight.shape

          if 'depthwise' in tl[0]:
            assert ic == 1
            weights = np.transpose(onnx_weight.reshape(oc//4,4,ch,cw), (0,2,3,1)).reshape(o['height'], o['width']*4)
          else:
            weights = np.transpose(onnx_weight.reshape(oc//4,4,ic//4,4,ch,cw), (0,4,2,5,1,3)).reshape(o['height'], o['width']*4)
        else:
          # fc_Wtx
          weights = onnx_weight

        new_weights = np.zeros((o['height'], o['row_pitch']//2), dtype=np.float32)
        new_weights[:, :weights.shape[1]] = weights

        # weights shouldn't be too far off
        err = np.mean((saved_weights.astype(np.float32) - new_weights)**2)
        assert err < 1e-3
        rerr = np.mean(np.abs((saved_weights.astype(np.float32) - new_weights)/(new_weights+1e-12)))
        assert rerr < 0.5

        # fix should improve things
        fixed_err = np.mean((new_weights.astype(np.float16).astype(np.float32) - new_weights)**2)
        assert (err/fixed_err) >= 1

        #print("  ", o['size'], onnx_weight.shape, o['row_pitch'], o['width'], o['height'], "err %.2fx better" % (err/fixed_err))

        obuf['data'] = new_weights.astype(np.float16).tobytes()

      elif o['arg_type'] == "float*":
        # unconverted floats are correct
        new_weights = np.zeros(o['size']//4, dtype=np.float32)
        new_weights[:onnx_weight.shape[0]] = onnx_weight
        assert new_weights.tobytes() == o['data']
        #print("  ", o['size'], onnx_weight.shape)

  save_thneed(jdat, target)

if __name__ == "__main__":
  model_dir = os.path.join(BASEDIR, "selfdrive/modeld/models/")
  weights_fixup(os.path.join(model_dir, "supercombo_fixed.thneed"),
                os.path.join(model_dir, "supercombo.thneed"),
                os.path.join(model_dir, "supercombo.dlc"))