thneed: add flag to enable optimizer (#24568)

* improve the thneed compiler

* only init thneed if we are using the GPU

Co-authored-by: Comma Device <device@comma.ai>
old-commit-hash: 0fc4b4df98
taco
George Hotz 3 years ago committed by GitHub
parent 6034c5414a
commit 1b18cef243
  1. selfdrive/modeld/SConscript (2 lines changed)
  2. selfdrive/modeld/runners/snpemodel.cc (11 lines changed)
  3. selfdrive/modeld/runners/snpemodel.h (3 lines changed)
  4. selfdrive/modeld/runners/thneedmodel.cc (1 line changed)
  5. selfdrive/modeld/thneed/compile.cc (41 lines changed)
  6. selfdrive/modeld/thneed/thneed.cc (14 lines changed)
  7. selfdrive/modeld/thneed/thneed.h (3 lines changed)

@ -65,7 +65,7 @@ common_model = lenv.Object(common_src)
if use_thneed and arch == "larch64": if use_thneed and arch == "larch64":
fn = File("models/supercombo").abspath fn = File("models/supercombo").abspath
compiler = lenv.Program('thneed/compile', ["thneed/compile.cc"]+common_model, LIBS=libs) compiler = lenv.Program('thneed/compile', ["thneed/compile.cc"]+common_model, LIBS=libs)
cmd = f"cd {Dir('.').abspath} && {compiler[0].abspath} {fn}.dlc {fn}_badweights.thneed --binary" cmd = f"cd {Dir('.').abspath} && {compiler[0].abspath} --in {fn}.dlc --out {fn}_badweights.thneed --binary --optimize"
lib_paths = ':'.join(Dir(p).abspath for p in lenv["LIBPATH"]) lib_paths = ':'.join(Dir(p).abspath for p in lenv["LIBPATH"])
kernel_path = os.path.join(Dir('.').abspath, "thneed", "kernels") kernel_path = os.path.join(Dir('.').abspath, "thneed", "kernels")

@ -123,6 +123,12 @@ SNPEModel::SNPEModel(const char *path, float *loutput, size_t loutput_size, int
outputBuffer = ubFactory.createUserBuffer(output, output_size * sizeof(float), outputStrides, &userBufferEncodingFloat); outputBuffer = ubFactory.createUserBuffer(output, output_size * sizeof(float), outputStrides, &userBufferEncodingFloat);
outputMap.add(output_tensor_name, outputBuffer.get()); outputMap.add(output_tensor_name, outputBuffer.get());
} }
#ifdef USE_THNEED
if (Runtime == zdl::DlSystem::Runtime_t::GPU) {
thneed.reset(new Thneed());
}
#endif
} }
void SNPEModel::addRecurrent(float *state, int state_size) { void SNPEModel::addRecurrent(float *state, int state_size) {
@ -176,7 +182,7 @@ std::unique_ptr<zdl::DlSystem::IUserBuffer> SNPEModel::addExtra(float *state, in
void SNPEModel::execute() { void SNPEModel::execute() {
#ifdef USE_THNEED #ifdef USE_THNEED
if (Runtime == zdl::DlSystem::Runtime_t::GPU) { if (Runtime == zdl::DlSystem::Runtime_t::GPU) {
if (thneed == NULL) { if (!thneed_recorded) {
bool ret = inputBuffer->setBufferAddress(input); bool ret = inputBuffer->setBufferAddress(input);
assert(ret == true); assert(ret == true);
if (use_extra) { if (use_extra) {
@ -188,7 +194,7 @@ void SNPEModel::execute() {
PrintErrorStringAndExit(); PrintErrorStringAndExit();
} }
memset(recurrent, 0, recurrent_size*sizeof(float)); memset(recurrent, 0, recurrent_size*sizeof(float));
thneed = new Thneed(); thneed->record = true;
if (!snpe->execute(inputMap, outputMap)) { if (!snpe->execute(inputMap, outputMap)) {
PrintErrorStringAndExit(); PrintErrorStringAndExit();
} }
@ -220,6 +226,7 @@ void SNPEModel::execute() {
assert(false); assert(false);
} }
free(outputs_golden); free(outputs_golden);
thneed_recorded = true;
} else { } else {
if (use_extra) { if (use_extra) {
float *inputs[5] = {recurrent, trafficConvention, desire, extra, input}; float *inputs[5] = {recurrent, trafficConvention, desire, extra, input};

@ -32,7 +32,8 @@ public:
void execute(); void execute();
#ifdef USE_THNEED #ifdef USE_THNEED
Thneed *thneed = NULL; std::unique_ptr<Thneed> thneed;
bool thneed_recorded = false;
#endif #endif
private: private:

@ -4,7 +4,6 @@
ThneedModel::ThneedModel(const char *path, float *loutput, size_t loutput_size, int runtime, bool luse_extra) { ThneedModel::ThneedModel(const char *path, float *loutput, size_t loutput_size, int runtime, bool luse_extra) {
thneed = new Thneed(true); thneed = new Thneed(true);
thneed->record = 0;
thneed->load(path); thneed->load(path);
thneed->clexec(); thneed->clexec();
thneed->find_inputs_outputs(); thneed->find_inputs_outputs();

@ -1,4 +1,5 @@
#include <cstring> #include <cstring>
#include <getopt.h>
#include "selfdrive/modeld/runners/snpemodel.h" #include "selfdrive/modeld/runners/snpemodel.h"
#include "selfdrive/modeld/thneed/thneed.h" #include "selfdrive/modeld/thneed/thneed.h"
@ -10,10 +11,36 @@
// TODO: This should probably use SNPE directly. // TODO: This should probably use SNPE directly.
int main(int argc, char* argv[]) { int main(int argc, char* argv[]) {
bool run_optimizer = false, save_binaries = false;
const char *input_file = NULL, *output_file = NULL;
static struct option long_options[] = {
{"in", required_argument, 0, 'i' },
{"out", required_argument, 0, 'o' },
{"binary", no_argument, 0, 'b' },
{"optimize", no_argument, 0, 'f' },
{0, 0, 0, 0 }
};
int long_index = 0, opt = 0;
while ((opt = getopt_long_only(argc, argv,"", long_options, &long_index)) != -1) {
switch (opt) {
case 'i': input_file = optarg; break;
case 'o': output_file = optarg; break;
case 'b': save_binaries = true; break;
case 'f': run_optimizer = true; break;
}
}
// no input?
if (!input_file) {
printf("usage: -i <input file> -o <output file> --binary --optimize\n");
return -1;
}
#define OUTPUT_SIZE 0x10000 #define OUTPUT_SIZE 0x10000
float *output = (float*)calloc(OUTPUT_SIZE, sizeof(float)); float *output = (float*)calloc(OUTPUT_SIZE, sizeof(float));
SNPEModel mdl(argv[1], output, 0, USE_GPU_RUNTIME, true); SNPEModel mdl(input_file, output, 0, USE_GPU_RUNTIME, true);
mdl.thneed->run_optimizer = run_optimizer;
float state[TEMPORAL_SIZE] = {0}; float state[TEMPORAL_SIZE] = {0};
float desire[DESIRE_LEN] = {0}; float desire[DESIRE_LEN] = {0};
@ -32,14 +59,20 @@ int main(int argc, char* argv[]) {
memset(output, 0, OUTPUT_SIZE * sizeof(float)); memset(output, 0, OUTPUT_SIZE * sizeof(float));
mdl.execute(); mdl.execute();
// don't save?
if (!output_file) {
printf("no output file, exiting\n");
return 0;
}
// save model // save model
bool save_binaries = (argc > 3) && (strcmp(argv[3], "--binary") == 0); printf("saving %s with binary %d\n", output_file, save_binaries);
mdl.thneed->save(argv[2], save_binaries); mdl.thneed->save(output_file, save_binaries);
// test model // test model
auto thneed = new Thneed(true); auto thneed = new Thneed(true);
thneed->record = false; thneed->record = false;
thneed->load(argv[2]); thneed->load(output_file);
thneed->clexec(); thneed->clexec();
thneed->find_inputs_outputs(); thneed->find_inputs_outputs();

@ -11,8 +11,6 @@
#include "selfdrive/common/clutil.h" #include "selfdrive/common/clutil.h"
#include "selfdrive/common/timing.h" #include "selfdrive/common/timing.h"
//#define RUN_DISASSEMBLER
#define RUN_OPTIMIZER
Thneed *g_thneed = NULL; Thneed *g_thneed = NULL;
int g_fd = -1; int g_fd = -1;
@ -203,11 +201,6 @@ void CachedCommand::exec() {
for (auto &it : kq) { for (auto &it : kq) {
it->debug_print(false); it->debug_print(false);
} }
#ifdef RUN_DISASSEMBLER
// assuming 2 commands
disassemble(0);
disassemble(1);
#endif
} }
assert(ret == 0); assert(ret == 0);
@ -220,7 +213,6 @@ Thneed::Thneed(bool do_clinit) {
assert(g_fd != -1); assert(g_fd != -1);
fd = g_fd; fd = g_fd;
ram = make_unique<GPUMalloc>(0x80000, fd); ram = make_unique<GPUMalloc>(0x80000, fd);
record = true;
timestamp = -1; timestamp = -1;
g_thneed = this; g_thneed = this;
char *thneed_debug_env = getenv("THNEED_DEBUG"); char *thneed_debug_env = getenv("THNEED_DEBUG");
@ -230,7 +222,7 @@ Thneed::Thneed(bool do_clinit) {
void Thneed::stop() { void Thneed::stop() {
find_inputs_outputs(); find_inputs_outputs();
printf("Thneed::stop: recorded %lu commands\n", cmds.size()); printf("Thneed::stop: recorded %lu commands\n", cmds.size());
record = 0; record = false;
} }
void Thneed::find_inputs_outputs() { void Thneed::find_inputs_outputs() {
@ -416,9 +408,7 @@ cl_int thneed_clFinish(cl_command_queue command_queue) {
Thneed *thneed = g_thneed; Thneed *thneed = g_thneed;
if (thneed != NULL && thneed->record) { if (thneed != NULL && thneed->record) {
#ifdef RUN_OPTIMIZER if (thneed->run_optimizer) thneed->optimize();
thneed->optimize();
#endif
return thneed->clexec(); return thneed->clexec();
} else { } else {
return clFinish(command_queue); return clFinish(command_queue);

@ -94,6 +94,7 @@ class Thneed {
void execute(float **finputs, float *foutput, bool slow=false); void execute(float **finputs, float *foutput, bool slow=false);
void wait(); void wait();
int optimize(); int optimize();
bool run_optimizer = false;
vector<cl_mem> input_clmem; vector<cl_mem> input_clmem;
vector<void *> inputs; vector<void *> inputs;
@ -106,7 +107,7 @@ class Thneed {
int context_id; int context_id;
// protected? // protected?
bool record; bool record = false;
int debug; int debug;
int timestamp; int timestamp;
unique_ptr<GPUMalloc> ram; unique_ptr<GPUMalloc> ram;

Loading…
Cancel
Save