diff --git a/release/files_common b/release/files_common
index ca05fbf756..eb849cce95 100644
--- a/release/files_common
+++ b/release/files_common
@@ -387,6 +387,9 @@ selfdrive/modeld/transforms/loadyuv.cl
 selfdrive/modeld/transforms/transform.[c,h]
 selfdrive/modeld/transforms/transform.cl
 
+selfdrive/modeld/thneed/thneed.*
+selfdrive/modeld/thneed/include/*
+
 selfdrive/modeld/runners/snpemodel.cc
 selfdrive/modeld/runners/snpemodel.h
 selfdrive/modeld/runners/runmodel.h
diff --git a/selfdrive/modeld/SConscript b/selfdrive/modeld/SConscript
index 0872b653a9..f70057d649 100644
--- a/selfdrive/modeld/SConscript
+++ b/selfdrive/modeld/SConscript
@@ -3,6 +3,8 @@ lenv = env.Clone()
 
 libs = [messaging, common, 'OpenCL', 'SNPE', 'capnp', 'zmq', 'kj', 'yuv', gpucommon, visionipc]
 
+TEST_THNEED = False
+
 common_src = [
   "models/commonmodel.c",
   "runners/snpemodel.cc",
@@ -11,6 +13,10 @@ common_src = [
 
 if arch == "aarch64":
   libs += ['gsl', 'CB', 'gnustl_shared']
+  if not TEST_THNEED:
+    common_src += ["thneed/thneed.cc"]
+    lenv['CFLAGS'].append("-DUSE_THNEED")
+    lenv['CXXFLAGS'].append("-DUSE_THNEED")
 elif arch == "larch64":
   libs += ['gsl', 'CB', 'symphony-cpu', 'pthread']
 else:
@@ -34,3 +40,8 @@ lenv.Program('_modeld', [
   "modeld.cc",
   "models/driving.cc",
 ]+common, LIBS=libs)
+if TEST_THNEED:
+  lenv.Program('thneed/debug/_thneed', [
+    "thneed/thneed.cc", "thneed/debug/test.cc"
+  ]+common, LIBS=libs)
+
diff --git a/selfdrive/modeld/runners/snpemodel.cc b/selfdrive/modeld/runners/snpemodel.cc
index 4bb442d5e1..bab7432207 100644
--- a/selfdrive/modeld/runners/snpemodel.cc
+++ b/selfdrive/modeld/runners/snpemodel.cc
@@ -9,9 +9,9 @@ void PrintErrorStringAndExit() {
   std::exit(EXIT_FAILURE);
 }
 
-SNPEModel::SNPEModel(const char *path, float *output, size_t output_size, int runtime) {
+SNPEModel::SNPEModel(const char *path, float *loutput, size_t output_size, int runtime) {
+  output = loutput;
 #ifdef QCOM
-  zdl::DlSystem::Runtime_t Runtime;
   if (runtime==USE_GPU_RUNTIME) {
     Runtime = zdl::DlSystem::Runtime_t::GPU;
   } else if (runtime==USE_DSP_RUNTIME) {
@@ -87,6 +87,13 @@ SNPEModel::SNPEModel(const char *path, float *output, size_t output_size, int ru
 
   // create output buffer
   {
+    const zdl::DlSystem::TensorShape& bufferShape = snpe->getInputOutputBufferAttributes(output_tensor_name)->getDims();
+    if (output_size != 0) {
+      assert(output_size == bufferShape[1]);
+    } else {
+      output_size = bufferShape[1];
+    }
+
     std::vector<size_t> outputStrides = {output_size * sizeof(float), sizeof(float)};
     outputBuffer = ubFactory.createUserBuffer(output, output_size * sizeof(float), outputStrides, &userBufferEncodingFloat);
     outputMap.add(output_tensor_name, outputBuffer.get());
@@ -94,14 +101,17 @@ SNPEModel::SNPEModel(const char *path, float *output, size_t output_size, int ru
 }
 
 void SNPEModel::addRecurrent(float *state, int state_size) {
+  recurrent = state;
   recurrentBuffer = this->addExtra(state, state_size, 3);
 }
 
 void SNPEModel::addTrafficConvention(float *state, int state_size) {
+  trafficConvention = state;
   trafficConventionBuffer = this->addExtra(state, state_size, 2);
 }
 
 void SNPEModel::addDesire(float *state, int state_size) {
+  desire = state;
   desireBuffer = this->addExtra(state, state_size, 1);
 }
 
@@ -122,9 +132,33 @@ std::unique_ptr<zdl::DlSystem::IUserBuffer> SNPEModel::addExtra(float *state, in
 }
 
 void SNPEModel::execute(float *net_input_buf, int buf_size) {
-  assert(inputBuffer->setBufferAddress(net_input_buf));
-  if (!snpe->execute(inputMap, outputMap)) {
-    PrintErrorStringAndExit();
+#ifdef USE_THNEED
+  if (Runtime == zdl::DlSystem::Runtime_t::GPU) {
+    if (thneed == NULL) {
+      assert(inputBuffer->setBufferAddress(net_input_buf));
+      if (!snpe->execute(inputMap, outputMap)) {
+        PrintErrorStringAndExit();
+      }
+      thneed = new Thneed();
+      //thneed->record = 3;
+      if (!snpe->execute(inputMap, outputMap)) {
+        PrintErrorStringAndExit();
+      }
+      thneed->stop();
+      //thneed->record = 2;
+      printf("thneed cached\n");
+    } else {
+      float *inputs[4] = {recurrent, trafficConvention, desire, net_input_buf};
+      thneed->execute(inputs, output);
+    }
+  } else {
+#endif
+    assert(inputBuffer->setBufferAddress(net_input_buf));
+    if (!snpe->execute(inputMap, outputMap)) {
+      PrintErrorStringAndExit();
+    }
+#ifdef USE_THNEED
   }
+#endif
 }
 
diff --git a/selfdrive/modeld/runners/snpemodel.h b/selfdrive/modeld/runners/snpemodel.h
index 9289444b09..496ad51db2 100644
--- a/selfdrive/modeld/runners/snpemodel.h
+++ b/selfdrive/modeld/runners/snpemodel.h
@@ -17,9 +17,13 @@
 #define USE_GPU_RUNTIME 1
 #define USE_DSP_RUNTIME 2
 
+#ifdef USE_THNEED
+#include "thneed/thneed.h"
+#endif
+
 class SNPEModel : public RunModel {
 public:
-  SNPEModel(const char *path, float *output, size_t output_size, int runtime);
+  SNPEModel(const char *path, float *loutput, size_t output_size, int runtime);
   ~SNPEModel() {
     if (model_data) free(model_data);
   }
@@ -30,6 +34,12 @@ public:
 
 private:
   uint8_t *model_data = NULL;
 
+#ifdef USE_THNEED
+  Thneed *thneed = NULL;
+#endif
+
+  zdl::DlSystem::Runtime_t Runtime;
+
   // snpe model stuff
   std::unique_ptr<zdl::SNPE::SNPE> snpe;
@@ -44,8 +54,11 @@ private:
 
   // recurrent and desire
   std::unique_ptr<zdl::DlSystem::IUserBuffer> addExtra(float *state, int state_size, int idx);
+  float *recurrent;
   std::unique_ptr<zdl::DlSystem::IUserBuffer> recurrentBuffer;
+  float *trafficConvention;
   std::unique_ptr<zdl::DlSystem::IUserBuffer> trafficConventionBuffer;
+  float *desire;
   std::unique_ptr<zdl::DlSystem::IUserBuffer> desireBuffer;
 };
 
diff --git a/selfdrive/modeld/thneed/README b/selfdrive/modeld/thneed/README
new file mode 100644
index 0000000000..f3bc66d8fc
--- /dev/null
+++ b/selfdrive/modeld/thneed/README
@@ -0,0 +1,8 @@
+thneed is an SNPE accelerator. I know SNPE is already an accelerator, but sometimes things need to go even faster...
+
+It runs on the local device, and caches a single model run. Then it replays it, but fast.
+
+thneed slices through abstraction layers like a fish.
+
+You need a thneed.
+
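Not part of the patch: a sketch of the flow the README describes, as wired into SNPEModel::execute() above. snpe_run() and model_frame() are placeholder names invented for illustration; recurrent, trafficConvention, desire and output are the buffer pointers SNPEModel already keeps.

  // Sketch only -- snpe_run() stands in for snpe->execute(inputMap, outputMap).
  Thneed *thneed = NULL;

  void model_frame(float *net_input_buf) {
    if (thneed == NULL) {
      snpe_run();               // warm-up run through SNPE; the hooks watch the CL/ioctl traffic
      thneed = new Thneed();    // constructor starts recording (record = 1)
      snpe_run();               // this run's KGSL GPU commands get cached
      thneed->stop();           // freeze the cache
    } else {
      // every later run skips SNPE and replays the cached GPU commands
      float *inputs[4] = {recurrent, trafficConvention, desire, net_input_buf};
      thneed->execute(inputs, output);
    }
  }

The first two runs still go through SNPE so the hooks in thneed.cc can see the kernel arguments and the command stream; everything after that is a replay.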
diff --git a/selfdrive/modeld/thneed/debug/.gitignore b/selfdrive/modeld/thneed/debug/.gitignore
new file mode 100644
index 0000000000..f0ea768194
--- /dev/null
+++ b/selfdrive/modeld/thneed/debug/.gitignore
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8ac84c959869ac7c7df139f0d307734f162fec51735ec16c8c6f8c908e69a2ce
+size 8
diff --git a/selfdrive/modeld/thneed/debug/include/a5xx.xml.h b/selfdrive/modeld/thneed/debug/include/a5xx.xml.h
new file mode 100644
index 0000000000..10eb528890
--- /dev/null
+++ b/selfdrive/modeld/thneed/debug/include/a5xx.xml.h
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c26352f9921d4bf51b182bd6ae1cd56f4c93954cafad446e983cadeb7a41546e
+size 184973
diff --git a/selfdrive/modeld/thneed/debug/include/adreno_pm4.xml.h b/selfdrive/modeld/thneed/debug/include/adreno_pm4.xml.h
new file mode 100644
index 0000000000..b1915eb5c1
--- /dev/null
+++ b/selfdrive/modeld/thneed/debug/include/adreno_pm4.xml.h
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:784eb1d9af94889e2ec29f4dba60c25185454a94837e59f6f96ceb62d9b33465
+size 50159
diff --git a/selfdrive/modeld/thneed/debug/include/adreno_pm4types.h b/selfdrive/modeld/thneed/debug/include/adreno_pm4types.h
new file mode 100644
index 0000000000..cd5fc1378c
--- /dev/null
+++ b/selfdrive/modeld/thneed/debug/include/adreno_pm4types.h
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:61c74fb0b2ead28ae4ce9c7e849c66f3b200517310f9b26b0f5dcb294079167d
+size 13124
diff --git a/selfdrive/modeld/thneed/debug/main.cc b/selfdrive/modeld/thneed/debug/main.cc
new file mode 100644
index 0000000000..660fe6c6df
--- /dev/null
+++ b/selfdrive/modeld/thneed/debug/main.cc
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bf5043514cf5b79912e54da6550f8a1bf3f378644827154c47ea7fd31de4093a
+size 24549
diff --git a/selfdrive/modeld/thneed/debug/test.cc b/selfdrive/modeld/thneed/debug/test.cc
new file mode 100644
index 0000000000..61f771402b
--- /dev/null
+++ b/selfdrive/modeld/thneed/debug/test.cc
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:24e057ba05689d07f82bb6f5cdca78e366d9dde9f29f18765c91954af6e6ff16
+size 2832
diff --git a/selfdrive/modeld/thneed/debug/thneed b/selfdrive/modeld/thneed/debug/thneed
new file mode 100755
index 0000000000..f0e49e824d
--- /dev/null
+++ b/selfdrive/modeld/thneed/debug/thneed
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:96f39f024c77aa83a127eedb2751d09946ed03560ce5ec80dd57bb9756b00325
+size 200
diff --git a/selfdrive/modeld/thneed/include/msm_kgsl.h b/selfdrive/modeld/thneed/include/msm_kgsl.h
new file mode 100644
index 0000000000..6653b07a59
--- /dev/null
+++ b/selfdrive/modeld/thneed/include/msm_kgsl.h
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:089b21b377325c0b0f04b96b6ed4a8e5975b1c050191598cd64dc0a3a3565a71
+size 45343
diff --git a/selfdrive/modeld/thneed/thneed.cc b/selfdrive/modeld/thneed/thneed.cc
new file mode 100644
index 0000000000..826015999f
--- /dev/null
+++ b/selfdrive/modeld/thneed/thneed.cc
@@ -0,0 +1,363 @@
+#include "thneed.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <errno.h>
+#include <time.h>
+#include <sys/mman.h>
+#include <dlfcn.h>
+#include <map>
+#include <string>
+
+Thneed *g_thneed = NULL;
+int g_fd = -1;
+std::map<std::pair<cl_kernel, int>, std::string> g_args;
+
+static inline uint64_t nanos_since_boot() {
+  struct timespec t;
+  clock_gettime(CLOCK_BOOTTIME, &t);
+  return t.tv_sec * 1000000000ULL + t.tv_nsec;
+}
+
+void hexdump(uint32_t *d, int len) {
+  assert((len%4) == 0);
+  printf(" dumping %p len 0x%x\n", d, len);
+  for (int i = 0; i < len/4; i++) {
+    if (i != 0 && (i%0x10) == 0) printf("\n");
+    printf("%8x ", d[i]);
+  }
+  printf("\n");
+}
+
+extern "C" {
+
+int (*my_ioctl)(int filedes, unsigned long request, void *argp) = NULL;
+#undef ioctl
+int ioctl(int filedes, unsigned long request, void *argp) {
+  if (my_ioctl == NULL) my_ioctl = reinterpret_cast<decltype(my_ioctl)>(dlsym(RTLD_NEXT, "ioctl"));
+  Thneed *thneed = g_thneed;
+
+  // save the fd
+  if (request == IOCTL_KGSL_GPUOBJ_ALLOC) g_fd = filedes;
+
+  if (thneed != NULL) {
+    if (request == IOCTL_KGSL_GPU_COMMAND) {
+      struct kgsl_gpu_command *cmd = (struct kgsl_gpu_command *)argp;
+      if (thneed->record & 1) {
+        thneed->timestamp = cmd->timestamp;
+        thneed->context_id = cmd->context_id;
+        CachedCommand *ccmd = new CachedCommand(thneed, cmd);
+        thneed->cmds.push_back(ccmd);
+      }
+      if (thneed->record & 2) {
+        printf("IOCTL_KGSL_GPU_COMMAND: flags: 0x%lx context_id: %u timestamp: %u\n",
+          cmd->flags,
+          cmd->context_id, cmd->timestamp);
+      }
+    } else if (request == IOCTL_KGSL_GPUOBJ_SYNC) {
+      struct kgsl_gpuobj_sync *cmd = (struct kgsl_gpuobj_sync *)argp;
+      struct kgsl_gpuobj_sync_obj *objs = (struct kgsl_gpuobj_sync_obj *)(cmd->objs);
+
+      if (thneed->record & 2) {
+        printf("IOCTL_KGSL_GPUOBJ_SYNC count:%d ", cmd->count);
+        for (int i = 0; i < cmd->count; i++) {
+          printf(" -- offset:0x%lx len:0x%lx id:%d op:%d ", objs[i].offset, objs[i].length, objs[i].id, objs[i].op);
+        }
+        printf("\n");
+      }
+
+      if (thneed->record & 1) {
+        struct kgsl_gpuobj_sync_obj *new_objs = (struct kgsl_gpuobj_sync_obj *)malloc(sizeof(struct kgsl_gpuobj_sync_obj)*cmd->count);
+        memcpy(new_objs, objs, sizeof(struct kgsl_gpuobj_sync_obj)*cmd->count);
+        thneed->syncobjs.push_back(std::make_pair(cmd->count, new_objs));
+      }
+    } else if (request == IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID) {
+      struct kgsl_device_waittimestamp_ctxtid *cmd = (struct kgsl_device_waittimestamp_ctxtid *)argp;
+      if (thneed->record & 2) {
+        printf("IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID: context_id: %d timestamp: %d timeout: %d\n",
+          cmd->context_id, cmd->timestamp, cmd->timeout);
+      }
+    } else if (request == IOCTL_KGSL_SETPROPERTY) {
+      if (thneed->record & 2) {
+        struct kgsl_device_getproperty *prop = (struct kgsl_device_getproperty *)argp;
+        printf("IOCTL_KGSL_SETPROPERTY: 0x%x sizebytes:%zu\n", prop->type, prop->sizebytes);
+        if (thneed->record & 4) {
+          hexdump((uint32_t *)prop->value, prop->sizebytes);
+          if (prop->type == KGSL_PROP_PWR_CONSTRAINT) {
+            struct kgsl_device_constraint *constraint = (struct kgsl_device_constraint *)prop->value;
+            hexdump((uint32_t *)constraint->data, constraint->size);
+          }
+        }
+      }
+    }
+  }
+
+  int ret = my_ioctl(filedes, request, argp);
+  if (ret != 0) printf("ioctl returned %d with errno %d\n", ret, errno);
+  return ret;
+}
+
+}
+
+GPUMalloc::GPUMalloc(int size, int fd) {
+  struct kgsl_gpuobj_alloc alloc;
+  memset(&alloc, 0, sizeof(alloc));
+  alloc.size = size;
+  alloc.flags = 0x10000a00;
+  int ret = ioctl(fd, IOCTL_KGSL_GPUOBJ_ALLOC, &alloc);
+  void *addr = mmap64(NULL, alloc.mmapsize, 0x3, 0x1, fd, alloc.id*0x1000);
+  assert(addr != MAP_FAILED);
+
+  base = (uint64_t)addr;
+  remaining = size;
+}
+
+void *GPUMalloc::alloc(int size) {
+  if (size > remaining) return NULL;
+  remaining -= size;
+  void *ret = (void*)base;
+  base += (size+0xff) & (~0xFF);
+  return ret;
+}
+
+CachedCommand::CachedCommand(Thneed *lthneed, struct kgsl_gpu_command *cmd) {
+  thneed = lthneed;
+  assert(cmd->numcmds == 2);
+  assert(cmd->numobjs == 1);
+  assert(cmd->numsyncs == 0);
+
+  memcpy(cmds, (void *)cmd->cmdlist, sizeof(struct kgsl_command_object)*2);
+  memcpy(objs, (void *)cmd->objlist, sizeof(struct kgsl_command_object)*1);
+
+  memcpy(&cache, cmd, sizeof(cache));
+  cache.cmdlist = (uint64_t)cmds;
+  cache.objlist = (uint64_t)objs;
+
+  for (int i = 0; i < cmd->numcmds; i++) {
+    void *nn = thneed->ram->alloc(cmds[i].size);
+    memcpy(nn, (void*)cmds[i].gpuaddr, cmds[i].size);
+    cmds[i].gpuaddr = (uint64_t)nn;
+  }
+
+  for (int i = 0; i < cmd->numobjs; i++) {
+    void *nn = thneed->ram->alloc(objs[i].size);
+    memset(nn, 0, objs[i].size);
+    objs[i].gpuaddr = (uint64_t)nn;
+  }
+}
+
+void CachedCommand::exec(bool wait) {
+  cache.timestamp = ++thneed->timestamp;
+  int ret = ioctl(thneed->fd, IOCTL_KGSL_GPU_COMMAND, &cache);
+
+  if (wait) {
+    struct kgsl_device_waittimestamp_ctxtid wait;
+    wait.context_id = cache.context_id;
+    wait.timestamp = cache.timestamp;
+    wait.timeout = -1;
+
+    uint64_t tb = nanos_since_boot();
+    int wret = ioctl(thneed->fd, IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID, &wait);
+    uint64_t te = nanos_since_boot();
+
+    if (thneed->record & 2) printf("exec %d wait %d after %lu us\n", ret, wret, (te-tb)/1000);
+  } else {
+    if (thneed->record & 2) printf("CachedCommand::exec got %d\n", ret);
+  }
+
+  assert(ret == 0);
+}
+
+Thneed::Thneed() {
+  assert(g_fd != -1);
+  fd = g_fd;
+  ram = new GPUMalloc(0x40000, fd);
+  record = 1;
+  timestamp = -1;
+  g_thneed = this;
+}
+
+void Thneed::stop() {
+  record = 0;
+}
+
+//#define SAVE_LOG
+
+void Thneed::execute(float **finputs, float *foutput) {
+  #ifdef SAVE_LOG
+  char fn[0x100];
+  snprintf(fn, sizeof(fn), "/tmp/thneed_log_%d", timestamp);
+  FILE *f = fopen(fn, "wb");
+  #endif
+
+  // ****** copy inputs
+  for (int idx = 0; idx < inputs.size(); ++idx) {
+    size_t sz;
+    clGetMemObjectInfo(inputs[idx], CL_MEM_SIZE, sizeof(sz), &sz, NULL);
+
+    #ifdef SAVE_LOG
+    fwrite(&sz, 1, sizeof(sz), f);
+    fwrite(finputs[idx], 1, sz, f);
+    #endif
+
+    if (record & 2) printf("copying %lu -- %p -> %p\n", sz, finputs[idx], inputs[idx]);
+    clEnqueueWriteBuffer(command_queue, inputs[idx], CL_TRUE, 0, sz, finputs[idx], 0, NULL, NULL);
+  }
+
+  // ****** set power constraint
+  struct kgsl_device_constraint_pwrlevel pwrlevel;
+  pwrlevel.level = KGSL_CONSTRAINT_PWR_MAX;
+
+  struct kgsl_device_constraint constraint;
+  constraint.type = KGSL_CONSTRAINT_PWRLEVEL;
+  constraint.context_id = context_id;
+  constraint.data = (void*)&pwrlevel;
+  constraint.size = sizeof(pwrlevel);
+
+  struct kgsl_device_getproperty prop;
+  prop.type = KGSL_PROP_PWR_CONSTRAINT;
+  prop.value = (void*)&constraint;
+  prop.sizebytes = sizeof(constraint);
+  int ret = ioctl(fd, IOCTL_KGSL_SETPROPERTY, &prop);
+  assert(ret == 0);
+
+  // ****** run commands
+  int i = 0;
+  for (auto it = cmds.begin(); it != cmds.end(); ++it) {
+    if (record & 2) printf("run %2d: ", i);
+    (*it)->exec((++i) == cmds.size());
+  }
+
+  // ****** sync objects
+  for (auto it = syncobjs.begin(); it != syncobjs.end(); ++it) {
+    struct kgsl_gpuobj_sync cmd;
+
+    cmd.objs = (uint64_t)it->second;
+    cmd.obj_len = it->first * sizeof(struct kgsl_gpuobj_sync_obj);
+    cmd.count = it->first;
+
+    ret = ioctl(fd, IOCTL_KGSL_GPUOBJ_SYNC, &cmd);
+    assert(ret == 0);
+  }
+
+  // ****** copy outputs
+  size_t sz;
+  clGetMemObjectInfo(output, CL_MEM_SIZE, sizeof(sz), &sz, NULL);
+  if (record & 2) printf("copying %lu for output %p -> %p\n", sz, output, foutput);
+  clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, sz, foutput, 0, NULL, NULL);
+
+  #ifdef SAVE_LOG
+  fwrite(&sz, 1, sizeof(sz), f);
+  fwrite(foutput, 1, sz, f);
+  fclose(f);
+  #endif
+
+  // ****** unset power constraint
+  constraint.type = KGSL_CONSTRAINT_NONE;
+  constraint.data = NULL;
+  constraint.size = 0;
+
+  ret = ioctl(fd, IOCTL_KGSL_SETPROPERTY, &prop);
+  assert(ret == 0);
+}
+
+cl_int (*my_clSetKernelArg)(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value) = NULL;
+cl_int clSetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value) {
+  if (my_clSetKernelArg == NULL) my_clSetKernelArg = reinterpret_cast<decltype(my_clSetKernelArg)>(dlsym(RTLD_NEXT, "REAL_clSetKernelArg"));
+  if (arg_value != NULL) {
+    g_args[std::make_pair(kernel, arg_index)] = std::string((char*)arg_value, arg_size);
+  }
+  cl_int ret = my_clSetKernelArg(kernel, arg_index, arg_size, arg_value);
+  return ret;
+}
+
+cl_int (*my_clEnqueueNDRangeKernel)(cl_command_queue, cl_kernel, cl_uint, const size_t *, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *) = NULL;
+cl_int clEnqueueNDRangeKernel(cl_command_queue command_queue,
+  cl_kernel kernel,
+  cl_uint work_dim,
+  const size_t *global_work_offset,
+  const size_t *global_work_size,
+  const size_t *local_work_size,
+  cl_uint num_events_in_wait_list,
+  const cl_event *event_wait_list,
+  cl_event *event) {
+
+  if (my_clEnqueueNDRangeKernel == NULL) my_clEnqueueNDRangeKernel = reinterpret_cast<decltype(my_clEnqueueNDRangeKernel)>(dlsym(RTLD_NEXT, "REAL_clEnqueueNDRangeKernel"));
+  Thneed *thneed = g_thneed;
+
+  // SNPE doesn't use these
+  assert(num_events_in_wait_list == 0);
+  assert(global_work_offset == NULL);
+
+  char name[0x100];
+  clGetKernelInfo(kernel, CL_KERNEL_FUNCTION_NAME, sizeof(name), name, NULL);
+
+  cl_uint num_args;
+  clGetKernelInfo(kernel, CL_KERNEL_NUM_ARGS, sizeof(num_args), &num_args, NULL);
+
+  if (thneed != NULL && thneed->record & 1) {
+    thneed->command_queue = command_queue;
+    for (int i = 0; i < num_args; i++) {
+      char arg_name[0x100];
+      clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_NAME, sizeof(arg_name), arg_name, NULL);
+      std::string arg = g_args[std::make_pair(kernel, i)];
+
+      if (strcmp(arg_name, "input") == 0 && strcmp(name, "zero_pad_image_float") == 0) {
+        cl_mem mem;
+        memcpy(&mem, (void*)arg.data(), sizeof(mem));
+        thneed->inputs.push_back(mem);
+      }
+
+      if (strcmp(arg_name, "output") == 0 && strcmp(name, "image2d_to_buffer_float") == 0) {
+        cl_mem mem;
+        memcpy(&mem, (void*)arg.data(), sizeof(mem));
+        thneed->output = mem;
+      }
+    }
+  }
+
+  if (thneed != NULL && thneed->record & 4) {
+    // extreme debug
+    printf("%s -- %p\n", name, kernel);
+    for (int i = 0; i < num_args; i++) {
+      char arg_type[0x100];
+      char arg_name[0x100];
+      clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_TYPE_NAME, sizeof(arg_type), arg_type, NULL);
+      clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_NAME, sizeof(arg_name), arg_name, NULL);
+      std::string arg = g_args[std::make_pair(kernel, i)];
+      printf(" %s %s", arg_type, arg_name);
+      void *arg_value = (void*)arg.data();
+      int arg_size = arg.size();
+      if (arg_size == 1) {
+        printf(" = %d", *((char*)arg_value));
+      } else if (arg_size == 2) {
+        printf(" = %d", *((short*)arg_value));
+      } else if (arg_size == 4) {
+        if (strcmp(arg_type, "float") == 0) {
+          printf(" = %f", *((float*)arg_value));
+        } else {
+          printf(" = %d", *((int*)arg_value));
+        }
+      } else if (arg_size == 8) {
+        cl_mem val = (cl_mem)(*((uintptr_t*)arg_value));
+        printf(" = %p", val);
+      }
+      printf("\n");
+    }
+  }
+
+  cl_int ret = my_clEnqueueNDRangeKernel(command_queue, kernel, work_dim,
+    global_work_offset, global_work_size, local_work_size,
+    num_events_in_wait_list, event_wait_list, event);
+  return ret;
+}
+
+void *dlsym(void *handle, const char *symbol) {
+  void *(*my_dlsym)(void *handle, const char *symbol) = (void *(*)(void *handle, const char *symbol))((uintptr_t)dlopen-0x2d4);
+  if (memcmp("REAL_", symbol, 5) == 0) {
+    return my_dlsym(handle, symbol+5);
+  } else if (strcmp("clEnqueueNDRangeKernel", symbol) == 0) {
+    return (void*)clEnqueueNDRangeKernel;
+  } else if (strcmp("clSetKernelArg", symbol) == 0) {
+    return (void*)clSetKernelArg;
+  } else {
+    return my_dlsym(handle, symbol);
+  }
+}
+
diff --git a/selfdrive/modeld/thneed/thneed.h b/selfdrive/modeld/thneed/thneed.h
new file mode 100644
index 0000000000..9f35f5dcfb
--- /dev/null
+++ b/selfdrive/modeld/thneed/thneed.h
@@ -0,0 +1,50 @@
+#pragma once
+
+#include <stdint.h>
+#include "include/msm_kgsl.h"
+#include <vector>
+#include <CL/cl.h>
+
+class Thneed;
+
+class GPUMalloc {
+  public:
+    GPUMalloc(int size, int fd);
+    void *alloc(int size);
+  private:
+    uint64_t base;
+    int remaining;
+};
+
+class CachedCommand {
+  public:
+    CachedCommand(Thneed *lthneed, struct kgsl_gpu_command *cmd);
+    void exec(bool wait);
+  private:
+    struct kgsl_gpu_command cache;
+    struct kgsl_command_object cmds[2];
+    struct kgsl_command_object objs[1];
+    Thneed *thneed;
+};
+
+class Thneed {
+  public:
+    Thneed();
+    void stop();
+    void execute(float **finputs, float *foutput);
+
+    std::vector<cl_mem> inputs;
+    cl_mem output;
+
+    cl_command_queue command_queue;
+    int context_id;
+
+    // protected?
+    int record;
+    int timestamp;
+    GPUMalloc *ram;
+    std::vector<CachedCommand *> cmds;
+    std::vector<std::pair<int, struct kgsl_gpuobj_sync_obj *> > syncobjs;
+    int fd;
+};
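A note on the hooking technique (illustrative, not from the patch): thneed works by interposing on ioctl, clSetKernelArg and clEnqueueNDRangeKernel and forwarding to the real implementations found via dlsym(RTLD_NEXT, ...). The same pattern in a self-contained form is below; the printf logging and variadic unpacking are illustrative, only the RTLD_NEXT lookup is the part thneed.cc actually relies on.

  // Minimal interposition sketch. Build into the binary (as thneed.cc is) or
  // into an LD_PRELOAD'd shared object; either way this definition shadows the
  // libc one and forwards to it after observing the call.
  #include <dlfcn.h>    // dlsym, RTLD_NEXT (may require _GNU_SOURCE on glibc)
  #include <stdarg.h>
  #include <stdio.h>

  extern "C" int ioctl(int fd, unsigned long request, ...) {
    static int (*real_ioctl)(int, unsigned long, void *) = NULL;
    if (real_ioctl == NULL) {
      real_ioctl = reinterpret_cast<decltype(real_ioctl)>(dlsym(RTLD_NEXT, "ioctl"));
    }

    // pull out the single pointer argument the kgsl ioctls use
    va_list ap;
    va_start(ap, request);
    void *argp = va_arg(ap, void *);
    va_end(ap);

    printf("ioctl(%d, 0x%lx, %p)\n", fd, request, argp);  // observe...
    return real_ioctl(fd, request, argp);                 // ...then pass through
  }

The clSetKernelArg / clEnqueueNDRangeKernel wrappers in thneed.cc follow the same shape; the extra dlsym() override is there because SNPE resolves the CL symbols dynamically, so the "REAL_" prefix trick is used to reach the original functions from inside the wrappers.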