diff --git a/selfdrive/modeld/SConscript b/selfdrive/modeld/SConscript index 9b34398c98..4fcded106c 100644 --- a/selfdrive/modeld/SConscript +++ b/selfdrive/modeld/SConscript @@ -3,8 +3,6 @@ lenv = env.Clone() libs = [cereal, messaging, common, 'OpenCL', 'SNPE', 'symphony-cpu', 'capnp', 'zmq', 'kj', 'yuv', gpucommon, visionipc] -TEST_THNEED = False - common_src = [ "models/commonmodel.cc", "runners/snpemodel.cc", @@ -14,16 +12,14 @@ common_src = [ if arch == "aarch64": libs += ['gsl', 'CB', 'gnustl_shared'] - if not TEST_THNEED: - common_src += ["thneed/thneed.cc"] - lenv['CFLAGS'].append("-DUSE_THNEED") - lenv['CXXFLAGS'].append("-DUSE_THNEED") + common_src += ["thneed/thneed.cc"] + lenv['CFLAGS'].append("-DUSE_THNEED") + lenv['CXXFLAGS'].append("-DUSE_THNEED") elif arch == "larch64": libs += ['gsl', 'CB', 'pthread', 'dl'] - if not TEST_THNEED: - common_src += ["thneed/thneed.cc"] - lenv['CFLAGS'].append("-DUSE_THNEED") - lenv['CXXFLAGS'].append("-DUSE_THNEED") + common_src += ["thneed/thneed.cc"] + lenv['CFLAGS'].append("-DUSE_THNEED") + lenv['CXXFLAGS'].append("-DUSE_THNEED") else: libs += ['pthread'] @@ -56,8 +52,3 @@ lenv.Program('_modeld', [ "models/driving.cc", ]+common, LIBS=libs) -if TEST_THNEED: - lenv.Program('thneed/debug/_thneed', [ - "thneed/thneed.cc", "thneed/debug/test.cc" - ]+common, LIBS=libs) - diff --git a/selfdrive/modeld/thneed/debug/.gitignore b/selfdrive/modeld/thneed/debug/.gitignore deleted file mode 100644 index f0ea768194..0000000000 --- a/selfdrive/modeld/thneed/debug/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8ac84c959869ac7c7df139f0d307734f162fec51735ec16c8c6f8c908e69a2ce -size 8 diff --git a/selfdrive/modeld/thneed/debug/decompiler/disasm-a3xx.c b/selfdrive/modeld/thneed/debug/decompiler/disasm-a3xx.c deleted file mode 100644 index 2bed514dab..0000000000 --- a/selfdrive/modeld/thneed/debug/decompiler/disasm-a3xx.c +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8a3d2041c848e4be23b7d535d10af5f454c1b08ae1f1a8ed9031bee24365c965 -size 39479 diff --git a/selfdrive/modeld/thneed/debug/decompiler/instr-a3xx.h b/selfdrive/modeld/thneed/debug/decompiler/instr-a3xx.h deleted file mode 100644 index d84e6b0435..0000000000 --- a/selfdrive/modeld/thneed/debug/decompiler/instr-a3xx.h +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2224e54e0113771123ff3ffec3585c2b675523d62cd759bc28c09406c8efe991 -size 29300 diff --git a/selfdrive/modeld/thneed/debug/decompiler/ir3.h b/selfdrive/modeld/thneed/debug/decompiler/ir3.h deleted file mode 100644 index 97991b7b42..0000000000 --- a/selfdrive/modeld/thneed/debug/decompiler/ir3.h +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a9068160ee23d5dac031b3e29aba8783cf1ec126dde8a6742c23e788d6de5f57 -size 47789 diff --git a/selfdrive/modeld/thneed/debug/decompiler/shader_enums.h b/selfdrive/modeld/thneed/debug/decompiler/shader_enums.h deleted file mode 100644 index 33e077d217..0000000000 --- a/selfdrive/modeld/thneed/debug/decompiler/shader_enums.h +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6bb1240a2a430f82e660b2e78abf01248ee0956632c17594f0ca5fb5924540f1 -size 30316 diff --git a/selfdrive/modeld/thneed/debug/decompiler/util/bitset.h b/selfdrive/modeld/thneed/debug/decompiler/util/bitset.h deleted file mode 100644 index a88cd4cd54..0000000000 --- a/selfdrive/modeld/thneed/debug/decompiler/util/bitset.h +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:99545448afae6443072235b945f525676d89dbf047aa88e5fb4b1f5ec0b389f3 -size 12093 diff --git a/selfdrive/modeld/thneed/debug/decompiler/util/list.h b/selfdrive/modeld/thneed/debug/decompiler/util/list.h deleted file mode 100644 index fe650684ad..0000000000 --- a/selfdrive/modeld/thneed/debug/decompiler/util/list.h +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:086776d92973088f980ff78029bee3e9191be9ac1c21f98a0f4688109eeaf5ab -size 9381 diff --git a/selfdrive/modeld/thneed/debug/decompiler/util/macros.h b/selfdrive/modeld/thneed/debug/decompiler/util/macros.h deleted file mode 100644 index c4c3072455..0000000000 --- a/selfdrive/modeld/thneed/debug/decompiler/util/macros.h +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:eb9cf345a46c36f86e246086b09b45c3bab1e29d9d91fbeec9b56efcf8dd4e8b -size 10376 diff --git a/selfdrive/modeld/thneed/debug/disassembler.cc b/selfdrive/modeld/thneed/debug/disassembler.cc deleted file mode 100644 index c9f7f17bf2..0000000000 --- a/selfdrive/modeld/thneed/debug/disassembler.cc +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:afd70142141fef1a4f722dccc06b321f3f83595ef68fad84a728c75d9fe13f87 -size 4236 diff --git a/selfdrive/modeld/thneed/debug/include/a5xx.xml.h b/selfdrive/modeld/thneed/debug/include/a5xx.xml.h deleted file mode 100644 index 10eb528890..0000000000 --- a/selfdrive/modeld/thneed/debug/include/a5xx.xml.h +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c26352f9921d4bf51b182bd6ae1cd56f4c93954cafad446e983cadeb7a41546e -size 184973 diff --git a/selfdrive/modeld/thneed/debug/include/adreno_pm4.xml.h b/selfdrive/modeld/thneed/debug/include/adreno_pm4.xml.h deleted file mode 100644 index b1915eb5c1..0000000000 --- a/selfdrive/modeld/thneed/debug/include/adreno_pm4.xml.h +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:784eb1d9af94889e2ec29f4dba60c25185454a94837e59f6f96ceb62d9b33465 -size 50159 diff --git a/selfdrive/modeld/thneed/debug/include/adreno_pm4types.h b/selfdrive/modeld/thneed/debug/include/adreno_pm4types.h deleted file mode 100644 index cd5fc1378c..0000000000 --- a/selfdrive/modeld/thneed/debug/include/adreno_pm4types.h +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:61c74fb0b2ead28ae4ce9c7e849c66f3b200517310f9b26b0f5dcb294079167d -size 13124 diff --git a/selfdrive/modeld/thneed/debug/main.cc b/selfdrive/modeld/thneed/debug/main.cc deleted file mode 100644 index a59fd91f3c..0000000000 --- a/selfdrive/modeld/thneed/debug/main.cc +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b6fcf547efd5bd6d6a740d9b7415e7dacd6624910eb3447e1a2063b0bfb6fba8 -size 24327 diff --git a/selfdrive/modeld/thneed/debug/microbenchmark/gemm.cl b/selfdrive/modeld/thneed/debug/microbenchmark/gemm.cl deleted file mode 100644 index 9269d9c492..0000000000 --- a/selfdrive/modeld/thneed/debug/microbenchmark/gemm.cl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:82956213d71cce8685bbfaf45aa44fa0032e435d3f235890a72837a8a981893a -size 1127 diff --git a/selfdrive/modeld/thneed/debug/microbenchmark/gemm_image.cl b/selfdrive/modeld/thneed/debug/microbenchmark/gemm_image.cl deleted file mode 100644 index 67f50be623..0000000000 --- a/selfdrive/modeld/thneed/debug/microbenchmark/gemm_image.cl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1ad38feef90e767d5edf723f38dc65666149ebd6692923c8c8aa80a9328b2845 -size 1816 diff --git a/selfdrive/modeld/thneed/debug/microbenchmark/go.c b/selfdrive/modeld/thneed/debug/microbenchmark/go.c deleted file mode 100644 index 5572b5c011..0000000000 --- a/selfdrive/modeld/thneed/debug/microbenchmark/go.c +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6bf800f2584a7320bd99d2fee3a1868e194913d47ac51d6ecdd6e0399c0c945f -size 9604 diff --git a/selfdrive/modeld/thneed/debug/microbenchmark/run.sh b/selfdrive/modeld/thneed/debug/microbenchmark/run.sh deleted file mode 100755 index d2cd2ea549..0000000000 --- a/selfdrive/modeld/thneed/debug/microbenchmark/run.sh +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:98ccebe6e903204a6fee90ac3b689e427448dc48efed095c435c0f40dab62ae7 -size 115 diff --git a/selfdrive/modeld/thneed/debug/test.cc b/selfdrive/modeld/thneed/debug/test.cc deleted file mode 100644 index 13c3114d06..0000000000 --- a/selfdrive/modeld/thneed/debug/test.cc +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:dc1020ef4bbdd7ae0fc3ea2bdf76478f82a1708c2838795f768b2c5a5fa86423 -size 3019 diff --git a/selfdrive/modeld/thneed/debug/thneed b/selfdrive/modeld/thneed/debug/thneed deleted file mode 100755 index f0e49e824d..0000000000 --- a/selfdrive/modeld/thneed/debug/thneed +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:96f39f024c77aa83a127eedb2751d09946ed03560ce5ec80dd57bb9756b00325 -size 200 diff --git a/selfdrive/modeld/thneed/thneed.cc b/selfdrive/modeld/thneed/thneed.cc index e08370a70d..4432856532 100644 --- a/selfdrive/modeld/thneed/thneed.cc +++ b/selfdrive/modeld/thneed/thneed.cc @@ -7,9 +7,16 @@ #include #include "thneed.h" +//#define SAVE_KERNELS + +//#define RUN_DISASSEMBLER +//#define RUN_OPTIMIZER + Thneed *g_thneed = NULL; int g_fd = -1; map, string> g_args; +map, int> g_args_size; +map g_program_source; static inline uint64_t nanos_since_boot() { struct timespec t; @@ -27,6 +34,8 @@ void hexdump(uint32_t *d, int len) { printf("\n"); } +// *********** ioctl interceptor *********** + extern "C" { int (*my_ioctl)(int filedes, unsigned long request, void *argp) = NULL; @@ -39,32 +48,33 @@ int ioctl(int filedes, unsigned long request, void *argp) { // save the fd if (request == IOCTL_KGSL_GPUOBJ_ALLOC) g_fd = filedes; + // note that this runs always, even without a thneed object if (request == IOCTL_KGSL_DRAWCTXT_CREATE) { struct kgsl_drawctxt_create *create = (struct kgsl_drawctxt_create *)argp; create->flags &= ~KGSL_CONTEXT_PRIORITY_MASK; create->flags |= 1 << KGSL_CONTEXT_PRIORITY_SHIFT; // priority from 1-15, 1 is max priority - printf("creating context with flags 0x%x\n", create->flags); + printf("IOCTL_KGSL_DRAWCTXT_CREATE: creating context with flags 0x%x\n", create->flags); } if (thneed != NULL) { if (request == IOCTL_KGSL_GPU_COMMAND) { struct kgsl_gpu_command *cmd = (struct kgsl_gpu_command *)argp; - if (thneed->record & 1) { + if (thneed->record & THNEED_RECORD) { thneed->timestamp = cmd->timestamp; thneed->context_id = cmd->context_id; thneed->cmds.push_back(unique_ptr(new CachedCommand(thneed, cmd))); } - if (thneed->record & 2) { - printf("IOCTL_KGSL_GPU_COMMAND(%2zu): flags: 0x%lx context_id: %u timestamp: %u\n", + if (thneed->record & THNEED_DEBUG) { + printf("IOCTL_KGSL_GPU_COMMAND(%2zu): flags: 0x%lx context_id: %u timestamp: %u numcmds: %d numobjs: %d\n", thneed->cmds.size(), cmd->flags, - cmd->context_id, cmd->timestamp); + cmd->context_id, cmd->timestamp, cmd->numcmds, cmd->numobjs); } } else if (request == IOCTL_KGSL_GPUOBJ_SYNC) { struct kgsl_gpuobj_sync *cmd = (struct kgsl_gpuobj_sync *)argp; struct kgsl_gpuobj_sync_obj *objs = (struct kgsl_gpuobj_sync_obj *)(cmd->objs); - if (thneed->record & 2) { + if (thneed->record & THNEED_DEBUG) { printf("IOCTL_KGSL_GPUOBJ_SYNC count:%d ", cmd->count); for (int i = 0; i < cmd->count; i++) { printf(" -- offset:0x%lx len:0x%lx id:%d op:%d ", objs[i].offset, objs[i].length, objs[i].id, objs[i].op); @@ -72,20 +82,20 @@ int ioctl(int filedes, unsigned long request, void *argp) { printf("\n"); } - if (thneed->record & 1) { + if (thneed->record & THNEED_RECORD) { thneed->syncobjs.push_back(string((char *)objs, sizeof(struct kgsl_gpuobj_sync_obj)*cmd->count)); } } else if (request == IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID) { struct kgsl_device_waittimestamp_ctxtid *cmd = (struct kgsl_device_waittimestamp_ctxtid *)argp; - if (thneed->record & 2) { + if (thneed->record & THNEED_DEBUG) { printf("IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID: context_id: %d timestamp: %d timeout: %d\n", cmd->context_id, cmd->timestamp, cmd->timeout); } } else if (request == IOCTL_KGSL_SETPROPERTY) { - if (thneed->record & 2) { + if (thneed->record & THNEED_DEBUG) { struct kgsl_device_getproperty *prop = (struct kgsl_device_getproperty *)argp; printf("IOCTL_KGSL_SETPROPERTY: 0x%x sizebytes:%zu\n", prop->type, prop->sizebytes); - if (thneed->record & 4) { + if (thneed->record & THNEED_VERBOSE_DEBUG) { hexdump((uint32_t *)prop->value, prop->sizebytes); if (prop->type == KGSL_PROP_PWR_CONSTRAINT) { struct kgsl_device_constraint *constraint = (struct kgsl_device_constraint *)prop->value; @@ -103,6 +113,8 @@ int ioctl(int filedes, unsigned long request, void *argp) { } +// *********** GPUMalloc *********** + GPUMalloc::GPUMalloc(int size, int fd) { struct kgsl_gpuobj_alloc alloc; memset(&alloc, 0, sizeof(alloc)); @@ -128,30 +140,38 @@ void *GPUMalloc::alloc(int size) { return ret; } +// *********** CachedCommand, at the ioctl layer *********** + CachedCommand::CachedCommand(Thneed *lthneed, struct kgsl_gpu_command *cmd) { thneed = lthneed; - assert(cmd->numcmds == 2); - assert(cmd->numobjs == 1); assert(cmd->numsyncs == 0); - memcpy(cmds, (void *)cmd->cmdlist, sizeof(struct kgsl_command_object)*2); - memcpy(objs, (void *)cmd->objlist, sizeof(struct kgsl_command_object)*1); - memcpy(&cache, cmd, sizeof(cache)); - cache.cmdlist = (uint64_t)cmds; - cache.objlist = (uint64_t)objs; - for (int i = 0; i < cmd->numcmds; i++) { - void *nn = thneed->ram->alloc(cmds[i].size); - memcpy(nn, (void*)cmds[i].gpuaddr, cmds[i].size); - cmds[i].gpuaddr = (uint64_t)nn; + if (cmd->numcmds > 0) { + cmds = make_unique(cmd->numcmds); + memcpy(cmds.get(), (void *)cmd->cmdlist, sizeof(struct kgsl_command_object)*cmd->numcmds); + cache.cmdlist = (uint64_t)cmds.get(); + for (int i = 0; i < cmd->numcmds; i++) { + void *nn = thneed->ram->alloc(cmds[i].size); + memcpy(nn, (void*)cmds[i].gpuaddr, cmds[i].size); + cmds[i].gpuaddr = (uint64_t)nn; + } } - for (int i = 0; i < cmd->numobjs; i++) { - void *nn = thneed->ram->alloc(objs[i].size); - memset(nn, 0, objs[i].size); - objs[i].gpuaddr = (uint64_t)nn; + if (cmd->numobjs > 0) { + objs = make_unique(cmd->numobjs); + memcpy(objs.get(), (void *)cmd->objlist, sizeof(struct kgsl_command_object)*cmd->numobjs); + cache.objlist = (uint64_t)objs.get(); + for (int i = 0; i < cmd->numobjs; i++) { + void *nn = thneed->ram->alloc(objs[i].size); + memset(nn, 0, objs[i].size); + objs[i].gpuaddr = (uint64_t)nn; + } } + + kq = thneed->ckq; + thneed->ckq.clear(); } void CachedCommand::exec(bool wait) { @@ -168,19 +188,33 @@ void CachedCommand::exec(bool wait) { int wret = ioctl(thneed->fd, IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID, &wait); uint64_t te = nanos_since_boot(); - if (thneed->record & 2) printf("exec %d wait %d after %lu us\n", ret, wret, (te-tb)/1000); + if (thneed->record & THNEED_DEBUG) printf("exec %d wait %d after %lu us\n", ret, wret, (te-tb)/1000); } else { - if (thneed->record & 2) printf("CachedCommand::exec got %d\n", ret); + if (thneed->record & THNEED_DEBUG) printf("CachedCommand::exec got %d\n", ret); + } + + if (thneed->record & THNEED_VERBOSE_DEBUG) { + for (auto &it : kq) { + it->debug_print(false); + } + #ifdef RUN_DISASSEMBLER + // assuming 2 commands + disassemble(0); + disassemble(1); + #endif } assert(ret == 0); } -Thneed::Thneed() { +// *********** Thneed *********** + +Thneed::Thneed(bool do_clinit) { + if (do_clinit) clinit(); assert(g_fd != -1); fd = g_fd; ram = make_unique(0x40000, fd); - record = 1; + record = THNEED_RECORD; timestamp = -1; g_thneed = this; } @@ -189,29 +223,18 @@ void Thneed::stop() { record = 0; } -//#define SAVE_LOG - void Thneed::execute(float **finputs, float *foutput, bool slow) { + int ret; uint64_t tb, te; - if (record & 2) tb = nanos_since_boot(); - - #ifdef SAVE_LOG - char fn[0x100]; - snprintf(fn, sizeof(fn), "/tmp/thneed_log_%d", timestamp); - FILE *f = fopen(fn, "wb"); - #endif + if (record & THNEED_DEBUG) tb = nanos_since_boot(); // ****** copy inputs for (int idx = 0; idx < inputs.size(); ++idx) { size_t sz; clGetMemObjectInfo(inputs[idx], CL_MEM_SIZE, sizeof(sz), &sz, NULL); - #ifdef SAVE_LOG - fwrite(&sz, 1, sizeof(sz), f); - fwrite(finputs[idx], 1, sz, f); - #endif - - if (record & 2) printf("copying %lu -- %p -> %p\n", sz, finputs[idx], inputs[idx]); + if (record & THNEED_DEBUG) printf("copying %lu -- %p -> %p\n", sz, finputs[idx], inputs[idx]); + // TODO: This shouldn't have to block clEnqueueWriteBuffer(command_queue, inputs[idx], CL_TRUE, 0, sz, finputs[idx], 0, NULL, NULL); } @@ -229,40 +252,38 @@ void Thneed::execute(float **finputs, float *foutput, bool slow) { prop.type = KGSL_PROP_PWR_CONSTRAINT; prop.value = (void*)&constraint; prop.sizebytes = sizeof(constraint); - int ret = ioctl(fd, IOCTL_KGSL_SETPROPERTY, &prop); + ret = ioctl(fd, IOCTL_KGSL_SETPROPERTY, &prop); assert(ret == 0); // ****** run commands int i = 0; - for (auto it = cmds.begin(); it != cmds.end(); ++it) { + for (auto &it : cmds) { ++i; - if (record & 2) printf("run %2d: ", i); - (*it)->exec((i == cmds.size()) || slow); + if (record & THNEED_DEBUG) printf("run %2d @ %7lu us: ", i, (nanos_since_boot()-tb)/1000); + it->exec((i == cmds.size()) || slow); } // ****** sync objects - for (auto it = syncobjs.begin(); it != syncobjs.end(); ++it) { + for (auto &it : syncobjs) { struct kgsl_gpuobj_sync cmd; - cmd.objs = (uint64_t)it->data(); - cmd.obj_len = it->length(); - cmd.count = it->length() / sizeof(struct kgsl_gpuobj_sync_obj); + cmd.objs = (uint64_t)it.data(); + cmd.obj_len = it.length(); + cmd.count = it.length() / sizeof(struct kgsl_gpuobj_sync_obj); ret = ioctl(fd, IOCTL_KGSL_GPUOBJ_SYNC, &cmd); assert(ret == 0); } // ****** copy outputs - size_t sz; - clGetMemObjectInfo(output, CL_MEM_SIZE, sizeof(sz), &sz, NULL); - if (record & 2) printf("copying %lu for output %p -> %p\n", sz, output, foutput); - clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, sz, foutput, 0, NULL, NULL); - - #ifdef SAVE_LOG - fwrite(&sz, 1, sizeof(sz), f); - fwrite(foutput, 1, sz, f); - fclose(f); - #endif + if (output != NULL) { + size_t sz; + clGetMemObjectInfo(output, CL_MEM_SIZE, sizeof(sz), &sz, NULL); + if (record & THNEED_DEBUG) printf("copying %lu for output %p -> %p\n", sz, output, foutput); + clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, sz, foutput, 0, NULL, NULL); + } else { + printf("CAUTION: model output is NULL, does it have no outputs?\n"); + } // ****** unset power constraint constraint.type = KGSL_CONSTRAINT_NONE; @@ -272,25 +293,59 @@ void Thneed::execute(float **finputs, float *foutput, bool slow) { ret = ioctl(fd, IOCTL_KGSL_SETPROPERTY, &prop); assert(ret == 0); - if (record & 2) { + if (record & THNEED_DEBUG) { te = nanos_since_boot(); printf("model exec in %lu us\n", (te-tb)/1000); } } -// TODO: with a different way of getting the input and output buffers, we don't have to intercept CL at all +void Thneed::clinit() { + cl_int err; + + cl_platform_id platform_id[2]; + cl_uint num_devices; + cl_uint num_platforms; + + err = clGetPlatformIDs(sizeof(platform_id)/sizeof(cl_platform_id), platform_id, &num_platforms); + assert(err == 0); + + err = clGetDeviceIDs(platform_id[0], CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &num_devices); + assert(err == 0); + + context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &err); + assert(err == 0); + + //cl_command_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0}; + cl_command_queue_properties props[3] = {CL_QUEUE_PROPERTIES, 0, 0}; + command_queue = clCreateCommandQueueWithProperties(context, device_id, props, &err); + assert(err == 0); + + printf("Thneed::clinit done\n"); +} + +cl_int Thneed::clexec() { + printf("Thneed::clexec: running %lu queued kernels\n", kq.size()); + for (auto &k : kq) { + if (record & THNEED_RECORD) ckq.push_back(k); + cl_int ret = k->exec(); + assert(ret == CL_SUCCESS); + } + return clFinish(command_queue); +} + +// *********** OpenCL interceptor *********** -cl_int (*my_clSetKernelArg)(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value) = NULL; cl_int thneed_clSetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value) { - if (my_clSetKernelArg == NULL) my_clSetKernelArg = reinterpret_cast(dlsym(RTLD_NEXT, "REAL_clSetKernelArg")); + g_args_size[make_pair(kernel, arg_index)] = arg_size; if (arg_value != NULL) { g_args[make_pair(kernel, arg_index)] = string((char*)arg_value, arg_size); + } else { + g_args[make_pair(kernel, arg_index)] = string(""); } - cl_int ret = my_clSetKernelArg(kernel, arg_index, arg_size, arg_value); + cl_int ret = clSetKernelArg(kernel, arg_index, arg_size, arg_value); return ret; } -cl_int (*my_clEnqueueNDRangeKernel)(cl_command_queue, cl_kernel, cl_uint, const size_t *, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *) = NULL; cl_int thneed_clEnqueueNDRangeKernel(cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, @@ -301,62 +356,183 @@ cl_int thneed_clEnqueueNDRangeKernel(cl_command_queue command_queue, const cl_event *event_wait_list, cl_event *event) { - if (my_clEnqueueNDRangeKernel == NULL) my_clEnqueueNDRangeKernel = reinterpret_cast(dlsym(RTLD_NEXT, "REAL_clEnqueueNDRangeKernel")); Thneed *thneed = g_thneed; // SNPE doesn't use these assert(num_events_in_wait_list == 0); assert(global_work_offset == NULL); + assert(event_wait_list == NULL); + + cl_int ret = 0; + if (thneed != NULL && thneed->record & THNEED_RECORD) { + if (thneed->context == NULL) { + thneed->command_queue = command_queue; + clGetKernelInfo(kernel, CL_KERNEL_CONTEXT, sizeof(thneed->context), &thneed->context, NULL); + clGetContextInfo(thneed->context, CL_CONTEXT_DEVICES, sizeof(thneed->device_id), &thneed->device_id, NULL); + } + + // if we are recording, we don't actually enqueue the kernel + thneed->kq.push_back(unique_ptr(new CLQueuedKernel(thneed, kernel, work_dim, global_work_size, local_work_size))); + *event = NULL; + } else { + ret = clEnqueueNDRangeKernel(command_queue, kernel, work_dim, + global_work_offset, global_work_size, local_work_size, + num_events_in_wait_list, event_wait_list, event); + } + + return ret; +} + +cl_int thneed_clFinish(cl_command_queue command_queue) { + Thneed *thneed = g_thneed; + + if (thneed != NULL && thneed->record & THNEED_RECORD) { + #ifdef RUN_OPTIMIZER + thneed->optimize(); + #endif + return thneed->clexec(); + } else { + return clFinish(command_queue); + } +} + +cl_program thneed_clCreateProgramWithSource(cl_context context, cl_uint count, const char **strings, const size_t *lengths, cl_int *errcode_ret) { + assert(count == 1); + cl_program ret = clCreateProgramWithSource(context, count, strings, lengths, errcode_ret); + g_program_source[ret] = strings[0]; + return ret; +} + +void *dlsym(void *handle, const char *symbol) { + // TODO: Find dlsym in a better way. Currently this is hand looked up in libdl.so +#if defined QCOM + void *(*my_dlsym)(void *handle, const char *symbol) = (void *(*)(void *handle, const char *symbol))((uintptr_t)dlopen-0x2d4); +#elif defined QCOM2 + void *(*my_dlsym)(void *handle, const char *symbol) = (void *(*)(void *handle, const char *symbol))((uintptr_t)dlopen+0x138); +#else + #error "Unsupported platform for thneed" +#endif + if (memcmp("REAL_", symbol, 5) == 0) { + return my_dlsym(handle, symbol+5); + } else if (strcmp("clFinish", symbol) == 0) { + return (void*)thneed_clFinish; + } else if (strcmp("clEnqueueNDRangeKernel", symbol) == 0) { + return (void*)thneed_clEnqueueNDRangeKernel; + } else if (strcmp("clSetKernelArg", symbol) == 0) { + return (void*)thneed_clSetKernelArg; + } else if (strcmp("clCreateProgramWithSource", symbol) == 0) { + return (void*)thneed_clCreateProgramWithSource; + } else { + return my_dlsym(handle, symbol); + } +} + +// *********** CLQueuedKernel *********** - char name[0x100]; - clGetKernelInfo(kernel, CL_KERNEL_FUNCTION_NAME, sizeof(name), name, NULL); +CLQueuedKernel::CLQueuedKernel(Thneed *lthneed, + cl_kernel _kernel, + cl_uint _work_dim, + const size_t *_global_work_size, + const size_t *_local_work_size) { + thneed = lthneed; + kernel = _kernel; + work_dim = _work_dim; + assert(work_dim <= 3); + for (int i = 0; i < work_dim; i++) { + global_work_size[i] = _global_work_size[i]; + local_work_size[i] = _local_work_size[i]; + } - cl_uint num_args; + char _name[0x100]; + clGetKernelInfo(kernel, CL_KERNEL_FUNCTION_NAME, sizeof(_name), _name, NULL); + name = string(_name); clGetKernelInfo(kernel, CL_KERNEL_NUM_ARGS, sizeof(num_args), &num_args, NULL); - if (thneed != NULL && thneed->record & 1) { - thneed->command_queue = command_queue; - for (int i = 0; i < num_args; i++) { + // get args + for (int i = 0; i < num_args; i++) { + char arg_name[0x100]; + clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_NAME, sizeof(arg_name), arg_name, NULL); + arg_names.push_back(string(arg_name)); + args.push_back(g_args[make_pair(kernel, i)]); + args_size.push_back(g_args_size[make_pair(kernel, i)]); + } + + // get program + clGetKernelInfo(kernel, CL_KERNEL_PROGRAM, sizeof(program), &program, NULL); +} + +int CLQueuedKernel::get_arg_num(const char *search_arg_name) { + for (int i = 0; i < num_args; i++) { + if (arg_names[i] == search_arg_name) return i; + } + printf("failed to find %s in %s\n", search_arg_name, name.c_str()); + assert(false); +} + +cl_int CLQueuedKernel::exec() { + if (kernel == NULL) { + kernel = clCreateKernel(program, name.c_str(), NULL); + arg_names.clear(); + + for (int j = 0; j < num_args; j++) { char arg_name[0x100]; - clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_NAME, sizeof(arg_name), arg_name, NULL); - string arg = g_args[make_pair(kernel, i)]; + clGetKernelArgInfo(kernel, j, CL_KERNEL_ARG_NAME, sizeof(arg_name), arg_name, NULL); + arg_names.push_back(string(arg_name)); + + cl_int ret; + if (args[j].size() != 0) { + assert(args[j].size() == args_size[j]); + ret = thneed_clSetKernelArg(kernel, j, args[j].size(), args[j].data()); + } else { + ret = thneed_clSetKernelArg(kernel, j, args_size[j], NULL); + } + assert(ret == CL_SUCCESS); + } + } - if (strcmp(arg_name, "input") == 0 && strcmp(name, "zero_pad_image_float") == 0) { - cl_mem mem; - memcpy(&mem, (void*)arg.data(), sizeof(mem)); - thneed->inputs.push_back(mem); + // save the global inputs/outputs + if (thneed->record & THNEED_RECORD) { + for (int i = 0; i < num_args; i++) { + if (name == "zero_pad_image_float" && arg_names[i] == "input") { + thneed->inputs.push_back(*(cl_mem*)(args[i].data())); } - if (strcmp(arg_name, "output") == 0 && strcmp(name, "image2d_to_buffer_float") == 0) { - cl_mem mem; - memcpy(&mem, (void*)arg.data(), sizeof(mem)); - thneed->output = mem; + if (name == "image2d_to_buffer_float" && arg_names[i] == "output") { + thneed->output = *(cl_mem*)(args[i].data()); } } } - if (thneed != NULL && thneed->record & 2) { - printf("%p %56s -- ", kernel, name); - for (int i = 0; i < work_dim; i++) { - printf("%4zu ", global_work_size[i]); - } - printf(" -- "); - for (int i = 0; i < work_dim; i++) { - printf("%4zu ", local_work_size[i]); - } - printf("\n"); + + if (thneed->record & THNEED_DEBUG) { + debug_print(thneed->record & THNEED_VERBOSE_DEBUG); + } + + return clEnqueueNDRangeKernel(thneed->command_queue, + kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, NULL); +} + +void CLQueuedKernel::debug_print(bool verbose) { + printf("%p %56s -- ", kernel, name.c_str()); + for (int i = 0; i < work_dim; i++) { + printf("%4zu ", global_work_size[i]); + } + printf(" -- "); + for (int i = 0; i < work_dim; i++) { + printf("%4zu ", local_work_size[i]); } - if (thneed != NULL && thneed->record & 4) { - // extreme debug + printf("\n"); + + if (verbose) { for (int i = 0; i < num_args; i++) { char arg_type[0x100]; - char arg_name[0x100]; clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_TYPE_NAME, sizeof(arg_type), arg_type, NULL); - clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_NAME, sizeof(arg_name), arg_name, NULL); - string arg = g_args[make_pair(kernel, i)]; - printf(" %s %s", arg_type, arg_name); + string arg = args[i]; + printf(" %s %s", arg_type, arg_names[i].c_str()); void *arg_value = (void*)arg.data(); int arg_size = arg.size(); - if (arg_size == 1) { + if (arg_size == 0) { + printf(" (size) %d", args_size[i]); + } else if (arg_size == 1) { printf(" = %d", *((char*)arg_value)); } else if (arg_size == 2) { printf(" = %d", *((short*)arg_value)); @@ -373,19 +549,24 @@ cl_int thneed_clEnqueueNDRangeKernel(cl_command_queue command_queue, if (strcmp("image2d_t", arg_type) == 0 || strcmp("image1d_t", arg_type) == 0) { cl_image_format format; size_t width, height, depth, array_size, row_pitch, slice_pitch; + cl_mem buf; clGetImageInfo(val, CL_IMAGE_FORMAT, sizeof(format), &format, NULL); + assert(format.image_channel_order == CL_RGBA); assert(format.image_channel_data_type == CL_HALF_FLOAT); clGetImageInfo(val, CL_IMAGE_WIDTH, sizeof(width), &width, NULL); clGetImageInfo(val, CL_IMAGE_HEIGHT, sizeof(height), &height, NULL); + clGetImageInfo(val, CL_IMAGE_ROW_PITCH, sizeof(row_pitch), &row_pitch, NULL); clGetImageInfo(val, CL_IMAGE_DEPTH, sizeof(depth), &depth, NULL); clGetImageInfo(val, CL_IMAGE_ARRAY_SIZE, sizeof(array_size), &array_size, NULL); - clGetImageInfo(val, CL_IMAGE_ROW_PITCH, sizeof(row_pitch), &row_pitch, NULL); clGetImageInfo(val, CL_IMAGE_SLICE_PITCH, sizeof(slice_pitch), &slice_pitch, NULL); assert(depth == 0); assert(array_size == 0); assert(slice_pitch == 0); - printf(" image %zu x %zu rp %zu", width, height, row_pitch); + clGetImageInfo(val, CL_IMAGE_BUFFER, sizeof(buf), &buf, NULL); + size_t sz; + clGetMemObjectInfo(buf, CL_MEM_SIZE, sizeof(sz), &sz, NULL); + printf(" image %zu x %zu rp %zu @ %p buffer %zu", width, height, row_pitch, buf, sz); } else { size_t sz; clGetMemObjectInfo(val, CL_MEM_SIZE, sizeof(sz), &sz, NULL); @@ -396,79 +577,5 @@ cl_int thneed_clEnqueueNDRangeKernel(cl_command_queue command_queue, printf("\n"); } } - - cl_int ret = my_clEnqueueNDRangeKernel(command_queue, kernel, work_dim, - global_work_offset, global_work_size, local_work_size, - num_events_in_wait_list, event_wait_list, event); - - /*uint64_t tb = nanos_since_boot(); - clWaitForEvents(1, event); - uint64_t te = nanos_since_boot(); - if (thneed != NULL && thneed->record & 2) { - printf(" wait %lu us\n", (te-tb)/1000); - }*/ - - return ret; -} - -//#define SAVE_KERNELS - -#ifdef SAVE_KERNELS -map program_source; - -cl_program (*my_clCreateProgramWithSource)(cl_context context, cl_uint count, const char **strings, const size_t *lengths, cl_int *errcode_ret) = NULL; -cl_program thneed_clCreateProgramWithSource(cl_context context, cl_uint count, const char **strings, const size_t *lengths, cl_int *errcode_ret) { - if (my_clCreateProgramWithSource == NULL) my_clCreateProgramWithSource = reinterpret_cast(dlsym(RTLD_NEXT, "REAL_clCreateProgramWithSource")); - assert(count == 1); - size_t my_lengths[1]; - my_lengths[0] = lengths[0]; - - char fn[0x100]; - snprintf(fn, sizeof(fn), "/tmp/program_%zu.cl", strlen(strings[0])); - FILE *f = fopen(fn, "wb"); - fprintf(f, "%s", strings[0]); - fclose(f); - - char tmp[0x10000]; - memset(tmp, 0, sizeof(tmp)); - snprintf(fn, sizeof(fn), "/tmp/patched_%zu.cl", strlen(strings[0])); - FILE *g = fopen(fn, "rb"); - if (g != NULL) { - printf("LOADING PATCHED PROGRAM %s\n", fn); - fread(tmp, 1, sizeof(tmp), g); - fclose(g); - strings[0] = tmp; - my_lengths[0] = strlen(tmp); - } - - program_source[ret] = strings[0]; - - cl_program ret = my_clCreateProgramWithSource(context, count, strings, my_lengths, errcode_ret); - return ret; -} -#endif - -void *dlsym(void *handle, const char *symbol) { - // TODO: Find dlsym in a better way. Currently this is hand looked up in libdl.so -#if defined QCOM - void *(*my_dlsym)(void *handle, const char *symbol) = (void *(*)(void *handle, const char *symbol))((uintptr_t)dlopen-0x2d4); -#elif defined QCOM2 - void *(*my_dlsym)(void *handle, const char *symbol) = (void *(*)(void *handle, const char *symbol))((uintptr_t)dlopen+0x138); -#else - #error "Unsupported platform for thneed" -#endif - if (memcmp("REAL_", symbol, 5) == 0) { - return my_dlsym(handle, symbol+5); - } else if (strcmp("clEnqueueNDRangeKernel", symbol) == 0) { - return (void*)thneed_clEnqueueNDRangeKernel; - } else if (strcmp("clSetKernelArg", symbol) == 0) { - return (void*)thneed_clSetKernelArg; -#ifdef SAVE_KERNELS - } else if (strcmp("clCreateProgramWithSource", symbol) == 0) { - return (void*)thneed_clCreateProgramWithSource; -#endif - } else { - return my_dlsym(handle, symbol); - } } diff --git a/selfdrive/modeld/thneed/thneed.h b/selfdrive/modeld/thneed/thneed.h index a145a28476..e1039efdff 100644 --- a/selfdrive/modeld/thneed/thneed.h +++ b/selfdrive/modeld/thneed/thneed.h @@ -9,10 +9,18 @@ #include "include/msm_kgsl.h" #include #include +#include #include +#define THNEED_RECORD 1 +#define THNEED_DEBUG 2 +#define THNEED_VERBOSE_DEBUG 4 + using namespace std; +namespace json11 { + class Json; +} class Thneed; class GPUMalloc { @@ -25,28 +33,59 @@ class GPUMalloc { int remaining; }; +class CLQueuedKernel { + public: + CLQueuedKernel(Thneed *lthneed) { thneed = lthneed; } + CLQueuedKernel(Thneed *lthneed, + cl_kernel _kernel, + cl_uint _work_dim, + const size_t *_global_work_size, + const size_t *_local_work_size); + cl_int exec(); + void debug_print(bool verbose); + int get_arg_num(const char *search_arg_name); + cl_program program; + string name; + cl_uint num_args; + vector arg_names; + vector args; + vector args_size; + cl_kernel kernel = NULL; + json11::Json to_json() const; + + cl_uint work_dim; + size_t global_work_size[3] = {0}; + size_t local_work_size[3] = {0}; + private: + Thneed *thneed; +}; + class CachedCommand { public: CachedCommand(Thneed *lthneed, struct kgsl_gpu_command *cmd); void exec(bool wait); - void disassemble(); + void disassemble(int cmd_index); private: struct kgsl_gpu_command cache; - struct kgsl_command_object cmds[2]; - struct kgsl_command_object objs[1]; + unique_ptr cmds; + unique_ptr objs; Thneed *thneed; + vector > kq; }; class Thneed { public: - Thneed(); + Thneed(bool do_clinit=false); void stop(); void execute(float **finputs, float *foutput, bool slow=false); + int optimize(); vector inputs; - cl_mem output; + cl_mem output = NULL; + cl_context context = NULL; cl_command_queue command_queue; + cl_device_id device_id; int context_id; // protected? @@ -56,5 +95,19 @@ class Thneed { vector > cmds; vector syncobjs; int fd; + + // all CL kernels + cl_int clexec(); + vector > kq; + + // pending CL kernels + vector > ckq; + + // loading and saving + void load(const char *filename); + void save(const char *filename); + private: + void clinit(); + json11::Json to_json(); };