Thneed refactors for future functions (#2673)

* delete debug

* thneed updates, but it seems slower

* thneed refactor

* refactor touchups

* add back asserts

* fix uaf

* track the size for local args

* final thneed refactor

* switch kgsl_command_object to avoid memory leak

* comments

* unused includes

Co-authored-by: Comma Device <device@comma.ai>
old-commit-hash: 5fdda8dbd8
commatwo_master
George Hotz 4 years ago committed by GitHub
parent 890df06246
commit 02a2f9ca15
  1. 9
      selfdrive/modeld/SConscript
  2. 3
      selfdrive/modeld/thneed/debug/.gitignore
  3. 3
      selfdrive/modeld/thneed/debug/decompiler/disasm-a3xx.c
  4. 3
      selfdrive/modeld/thneed/debug/decompiler/instr-a3xx.h
  5. 3
      selfdrive/modeld/thneed/debug/decompiler/ir3.h
  6. 3
      selfdrive/modeld/thneed/debug/decompiler/shader_enums.h
  7. 3
      selfdrive/modeld/thneed/debug/decompiler/util/bitset.h
  8. 3
      selfdrive/modeld/thneed/debug/decompiler/util/list.h
  9. 3
      selfdrive/modeld/thneed/debug/decompiler/util/macros.h
  10. 3
      selfdrive/modeld/thneed/debug/disassembler.cc
  11. 3
      selfdrive/modeld/thneed/debug/include/a5xx.xml.h
  12. 3
      selfdrive/modeld/thneed/debug/include/adreno_pm4.xml.h
  13. 3
      selfdrive/modeld/thneed/debug/include/adreno_pm4types.h
  14. 3
      selfdrive/modeld/thneed/debug/main.cc
  15. 3
      selfdrive/modeld/thneed/debug/microbenchmark/gemm.cl
  16. 3
      selfdrive/modeld/thneed/debug/microbenchmark/gemm_image.cl
  17. 3
      selfdrive/modeld/thneed/debug/microbenchmark/go.c
  18. 3
      selfdrive/modeld/thneed/debug/microbenchmark/run.sh
  19. 3
      selfdrive/modeld/thneed/debug/test.cc
  20. 3
      selfdrive/modeld/thneed/debug/thneed
  21. 423
      selfdrive/modeld/thneed/thneed.cc
  22. 63
      selfdrive/modeld/thneed/thneed.h

@ -3,8 +3,6 @@ lenv = env.Clone()
libs = [cereal, messaging, common, 'OpenCL', 'SNPE', 'symphony-cpu', 'capnp', 'zmq', 'kj', 'yuv', gpucommon, visionipc] libs = [cereal, messaging, common, 'OpenCL', 'SNPE', 'symphony-cpu', 'capnp', 'zmq', 'kj', 'yuv', gpucommon, visionipc]
TEST_THNEED = False
common_src = [ common_src = [
"models/commonmodel.cc", "models/commonmodel.cc",
"runners/snpemodel.cc", "runners/snpemodel.cc",
@ -14,13 +12,11 @@ common_src = [
if arch == "aarch64": if arch == "aarch64":
libs += ['gsl', 'CB', 'gnustl_shared'] libs += ['gsl', 'CB', 'gnustl_shared']
if not TEST_THNEED:
common_src += ["thneed/thneed.cc"] common_src += ["thneed/thneed.cc"]
lenv['CFLAGS'].append("-DUSE_THNEED") lenv['CFLAGS'].append("-DUSE_THNEED")
lenv['CXXFLAGS'].append("-DUSE_THNEED") lenv['CXXFLAGS'].append("-DUSE_THNEED")
elif arch == "larch64": elif arch == "larch64":
libs += ['gsl', 'CB', 'pthread', 'dl'] libs += ['gsl', 'CB', 'pthread', 'dl']
if not TEST_THNEED:
common_src += ["thneed/thneed.cc"] common_src += ["thneed/thneed.cc"]
lenv['CFLAGS'].append("-DUSE_THNEED") lenv['CFLAGS'].append("-DUSE_THNEED")
lenv['CXXFLAGS'].append("-DUSE_THNEED") lenv['CXXFLAGS'].append("-DUSE_THNEED")
@ -56,8 +52,3 @@ lenv.Program('_modeld', [
"models/driving.cc", "models/driving.cc",
]+common, LIBS=libs) ]+common, LIBS=libs)
if TEST_THNEED:
lenv.Program('thneed/debug/_thneed', [
"thneed/thneed.cc", "thneed/debug/test.cc"
]+common, LIBS=libs)

@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8ac84c959869ac7c7df139f0d307734f162fec51735ec16c8c6f8c908e69a2ce
size 8

@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8a3d2041c848e4be23b7d535d10af5f454c1b08ae1f1a8ed9031bee24365c965
size 39479

@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2224e54e0113771123ff3ffec3585c2b675523d62cd759bc28c09406c8efe991
size 29300

@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a9068160ee23d5dac031b3e29aba8783cf1ec126dde8a6742c23e788d6de5f57
size 47789

@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6bb1240a2a430f82e660b2e78abf01248ee0956632c17594f0ca5fb5924540f1
size 30316

@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:99545448afae6443072235b945f525676d89dbf047aa88e5fb4b1f5ec0b389f3
size 12093

@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:086776d92973088f980ff78029bee3e9191be9ac1c21f98a0f4688109eeaf5ab
size 9381

@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:eb9cf345a46c36f86e246086b09b45c3bab1e29d9d91fbeec9b56efcf8dd4e8b
size 10376

@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:afd70142141fef1a4f722dccc06b321f3f83595ef68fad84a728c75d9fe13f87
size 4236

@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c26352f9921d4bf51b182bd6ae1cd56f4c93954cafad446e983cadeb7a41546e
size 184973

@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:784eb1d9af94889e2ec29f4dba60c25185454a94837e59f6f96ceb62d9b33465
size 50159

@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:61c74fb0b2ead28ae4ce9c7e849c66f3b200517310f9b26b0f5dcb294079167d
size 13124

@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b6fcf547efd5bd6d6a740d9b7415e7dacd6624910eb3447e1a2063b0bfb6fba8
size 24327

@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:82956213d71cce8685bbfaf45aa44fa0032e435d3f235890a72837a8a981893a
size 1127

@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1ad38feef90e767d5edf723f38dc65666149ebd6692923c8c8aa80a9328b2845
size 1816

@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6bf800f2584a7320bd99d2fee3a1868e194913d47ac51d6ecdd6e0399c0c945f
size 9604

@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:98ccebe6e903204a6fee90ac3b689e427448dc48efed095c435c0f40dab62ae7
size 115

@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:dc1020ef4bbdd7ae0fc3ea2bdf76478f82a1708c2838795f768b2c5a5fa86423
size 3019

@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:96f39f024c77aa83a127eedb2751d09946ed03560ce5ec80dd57bb9756b00325
size 200

@ -7,9 +7,16 @@
#include <errno.h> #include <errno.h>
#include "thneed.h" #include "thneed.h"
//#define SAVE_KERNELS
//#define RUN_DISASSEMBLER
//#define RUN_OPTIMIZER
Thneed *g_thneed = NULL; Thneed *g_thneed = NULL;
int g_fd = -1; int g_fd = -1;
map<pair<cl_kernel, int>, string> g_args; map<pair<cl_kernel, int>, string> g_args;
map<pair<cl_kernel, int>, int> g_args_size;
map<cl_program, string> g_program_source;
static inline uint64_t nanos_since_boot() { static inline uint64_t nanos_since_boot() {
struct timespec t; struct timespec t;
@ -27,6 +34,8 @@ void hexdump(uint32_t *d, int len) {
printf("\n"); printf("\n");
} }
// *********** ioctl interceptor ***********
extern "C" { extern "C" {
int (*my_ioctl)(int filedes, unsigned long request, void *argp) = NULL; int (*my_ioctl)(int filedes, unsigned long request, void *argp) = NULL;
@ -39,32 +48,33 @@ int ioctl(int filedes, unsigned long request, void *argp) {
// save the fd // save the fd
if (request == IOCTL_KGSL_GPUOBJ_ALLOC) g_fd = filedes; if (request == IOCTL_KGSL_GPUOBJ_ALLOC) g_fd = filedes;
// note that this runs always, even without a thneed object
if (request == IOCTL_KGSL_DRAWCTXT_CREATE) { if (request == IOCTL_KGSL_DRAWCTXT_CREATE) {
struct kgsl_drawctxt_create *create = (struct kgsl_drawctxt_create *)argp; struct kgsl_drawctxt_create *create = (struct kgsl_drawctxt_create *)argp;
create->flags &= ~KGSL_CONTEXT_PRIORITY_MASK; create->flags &= ~KGSL_CONTEXT_PRIORITY_MASK;
create->flags |= 1 << KGSL_CONTEXT_PRIORITY_SHIFT; // priority from 1-15, 1 is max priority create->flags |= 1 << KGSL_CONTEXT_PRIORITY_SHIFT; // priority from 1-15, 1 is max priority
printf("creating context with flags 0x%x\n", create->flags); printf("IOCTL_KGSL_DRAWCTXT_CREATE: creating context with flags 0x%x\n", create->flags);
} }
if (thneed != NULL) { if (thneed != NULL) {
if (request == IOCTL_KGSL_GPU_COMMAND) { if (request == IOCTL_KGSL_GPU_COMMAND) {
struct kgsl_gpu_command *cmd = (struct kgsl_gpu_command *)argp; struct kgsl_gpu_command *cmd = (struct kgsl_gpu_command *)argp;
if (thneed->record & 1) { if (thneed->record & THNEED_RECORD) {
thneed->timestamp = cmd->timestamp; thneed->timestamp = cmd->timestamp;
thneed->context_id = cmd->context_id; thneed->context_id = cmd->context_id;
thneed->cmds.push_back(unique_ptr<CachedCommand>(new CachedCommand(thneed, cmd))); thneed->cmds.push_back(unique_ptr<CachedCommand>(new CachedCommand(thneed, cmd)));
} }
if (thneed->record & 2) { if (thneed->record & THNEED_DEBUG) {
printf("IOCTL_KGSL_GPU_COMMAND(%2zu): flags: 0x%lx context_id: %u timestamp: %u\n", printf("IOCTL_KGSL_GPU_COMMAND(%2zu): flags: 0x%lx context_id: %u timestamp: %u numcmds: %d numobjs: %d\n",
thneed->cmds.size(), thneed->cmds.size(),
cmd->flags, cmd->flags,
cmd->context_id, cmd->timestamp); cmd->context_id, cmd->timestamp, cmd->numcmds, cmd->numobjs);
} }
} else if (request == IOCTL_KGSL_GPUOBJ_SYNC) { } else if (request == IOCTL_KGSL_GPUOBJ_SYNC) {
struct kgsl_gpuobj_sync *cmd = (struct kgsl_gpuobj_sync *)argp; struct kgsl_gpuobj_sync *cmd = (struct kgsl_gpuobj_sync *)argp;
struct kgsl_gpuobj_sync_obj *objs = (struct kgsl_gpuobj_sync_obj *)(cmd->objs); struct kgsl_gpuobj_sync_obj *objs = (struct kgsl_gpuobj_sync_obj *)(cmd->objs);
if (thneed->record & 2) { if (thneed->record & THNEED_DEBUG) {
printf("IOCTL_KGSL_GPUOBJ_SYNC count:%d ", cmd->count); printf("IOCTL_KGSL_GPUOBJ_SYNC count:%d ", cmd->count);
for (int i = 0; i < cmd->count; i++) { for (int i = 0; i < cmd->count; i++) {
printf(" -- offset:0x%lx len:0x%lx id:%d op:%d ", objs[i].offset, objs[i].length, objs[i].id, objs[i].op); printf(" -- offset:0x%lx len:0x%lx id:%d op:%d ", objs[i].offset, objs[i].length, objs[i].id, objs[i].op);
@ -72,20 +82,20 @@ int ioctl(int filedes, unsigned long request, void *argp) {
printf("\n"); printf("\n");
} }
if (thneed->record & 1) { if (thneed->record & THNEED_RECORD) {
thneed->syncobjs.push_back(string((char *)objs, sizeof(struct kgsl_gpuobj_sync_obj)*cmd->count)); thneed->syncobjs.push_back(string((char *)objs, sizeof(struct kgsl_gpuobj_sync_obj)*cmd->count));
} }
} else if (request == IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID) { } else if (request == IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID) {
struct kgsl_device_waittimestamp_ctxtid *cmd = (struct kgsl_device_waittimestamp_ctxtid *)argp; struct kgsl_device_waittimestamp_ctxtid *cmd = (struct kgsl_device_waittimestamp_ctxtid *)argp;
if (thneed->record & 2) { if (thneed->record & THNEED_DEBUG) {
printf("IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID: context_id: %d timestamp: %d timeout: %d\n", printf("IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID: context_id: %d timestamp: %d timeout: %d\n",
cmd->context_id, cmd->timestamp, cmd->timeout); cmd->context_id, cmd->timestamp, cmd->timeout);
} }
} else if (request == IOCTL_KGSL_SETPROPERTY) { } else if (request == IOCTL_KGSL_SETPROPERTY) {
if (thneed->record & 2) { if (thneed->record & THNEED_DEBUG) {
struct kgsl_device_getproperty *prop = (struct kgsl_device_getproperty *)argp; struct kgsl_device_getproperty *prop = (struct kgsl_device_getproperty *)argp;
printf("IOCTL_KGSL_SETPROPERTY: 0x%x sizebytes:%zu\n", prop->type, prop->sizebytes); printf("IOCTL_KGSL_SETPROPERTY: 0x%x sizebytes:%zu\n", prop->type, prop->sizebytes);
if (thneed->record & 4) { if (thneed->record & THNEED_VERBOSE_DEBUG) {
hexdump((uint32_t *)prop->value, prop->sizebytes); hexdump((uint32_t *)prop->value, prop->sizebytes);
if (prop->type == KGSL_PROP_PWR_CONSTRAINT) { if (prop->type == KGSL_PROP_PWR_CONSTRAINT) {
struct kgsl_device_constraint *constraint = (struct kgsl_device_constraint *)prop->value; struct kgsl_device_constraint *constraint = (struct kgsl_device_constraint *)prop->value;
@ -103,6 +113,8 @@ int ioctl(int filedes, unsigned long request, void *argp) {
} }
// *********** GPUMalloc ***********
GPUMalloc::GPUMalloc(int size, int fd) { GPUMalloc::GPUMalloc(int size, int fd) {
struct kgsl_gpuobj_alloc alloc; struct kgsl_gpuobj_alloc alloc;
memset(&alloc, 0, sizeof(alloc)); memset(&alloc, 0, sizeof(alloc));
@ -128,25 +140,29 @@ void *GPUMalloc::alloc(int size) {
return ret; return ret;
} }
// *********** CachedCommand, at the ioctl layer ***********
CachedCommand::CachedCommand(Thneed *lthneed, struct kgsl_gpu_command *cmd) { CachedCommand::CachedCommand(Thneed *lthneed, struct kgsl_gpu_command *cmd) {
thneed = lthneed; thneed = lthneed;
assert(cmd->numcmds == 2);
assert(cmd->numobjs == 1);
assert(cmd->numsyncs == 0); assert(cmd->numsyncs == 0);
memcpy(cmds, (void *)cmd->cmdlist, sizeof(struct kgsl_command_object)*2);
memcpy(objs, (void *)cmd->objlist, sizeof(struct kgsl_command_object)*1);
memcpy(&cache, cmd, sizeof(cache)); memcpy(&cache, cmd, sizeof(cache));
cache.cmdlist = (uint64_t)cmds;
cache.objlist = (uint64_t)objs;
if (cmd->numcmds > 0) {
cmds = make_unique<struct kgsl_command_object[]>(cmd->numcmds);
memcpy(cmds.get(), (void *)cmd->cmdlist, sizeof(struct kgsl_command_object)*cmd->numcmds);
cache.cmdlist = (uint64_t)cmds.get();
for (int i = 0; i < cmd->numcmds; i++) { for (int i = 0; i < cmd->numcmds; i++) {
void *nn = thneed->ram->alloc(cmds[i].size); void *nn = thneed->ram->alloc(cmds[i].size);
memcpy(nn, (void*)cmds[i].gpuaddr, cmds[i].size); memcpy(nn, (void*)cmds[i].gpuaddr, cmds[i].size);
cmds[i].gpuaddr = (uint64_t)nn; cmds[i].gpuaddr = (uint64_t)nn;
} }
}
if (cmd->numobjs > 0) {
objs = make_unique<struct kgsl_command_object[]>(cmd->numobjs);
memcpy(objs.get(), (void *)cmd->objlist, sizeof(struct kgsl_command_object)*cmd->numobjs);
cache.objlist = (uint64_t)objs.get();
for (int i = 0; i < cmd->numobjs; i++) { for (int i = 0; i < cmd->numobjs; i++) {
void *nn = thneed->ram->alloc(objs[i].size); void *nn = thneed->ram->alloc(objs[i].size);
memset(nn, 0, objs[i].size); memset(nn, 0, objs[i].size);
@ -154,6 +170,10 @@ CachedCommand::CachedCommand(Thneed *lthneed, struct kgsl_gpu_command *cmd) {
} }
} }
kq = thneed->ckq;
thneed->ckq.clear();
}
void CachedCommand::exec(bool wait) { void CachedCommand::exec(bool wait) {
cache.timestamp = ++thneed->timestamp; cache.timestamp = ++thneed->timestamp;
int ret = ioctl(thneed->fd, IOCTL_KGSL_GPU_COMMAND, &cache); int ret = ioctl(thneed->fd, IOCTL_KGSL_GPU_COMMAND, &cache);
@ -168,19 +188,33 @@ void CachedCommand::exec(bool wait) {
int wret = ioctl(thneed->fd, IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID, &wait); int wret = ioctl(thneed->fd, IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID, &wait);
uint64_t te = nanos_since_boot(); uint64_t te = nanos_since_boot();
if (thneed->record & 2) printf("exec %d wait %d after %lu us\n", ret, wret, (te-tb)/1000); if (thneed->record & THNEED_DEBUG) printf("exec %d wait %d after %lu us\n", ret, wret, (te-tb)/1000);
} else { } else {
if (thneed->record & 2) printf("CachedCommand::exec got %d\n", ret); if (thneed->record & THNEED_DEBUG) printf("CachedCommand::exec got %d\n", ret);
}
if (thneed->record & THNEED_VERBOSE_DEBUG) {
for (auto &it : kq) {
it->debug_print(false);
}
#ifdef RUN_DISASSEMBLER
// assuming 2 commands
disassemble(0);
disassemble(1);
#endif
} }
assert(ret == 0); assert(ret == 0);
} }
Thneed::Thneed() { // *********** Thneed ***********
Thneed::Thneed(bool do_clinit) {
if (do_clinit) clinit();
assert(g_fd != -1); assert(g_fd != -1);
fd = g_fd; fd = g_fd;
ram = make_unique<GPUMalloc>(0x40000, fd); ram = make_unique<GPUMalloc>(0x40000, fd);
record = 1; record = THNEED_RECORD;
timestamp = -1; timestamp = -1;
g_thneed = this; g_thneed = this;
} }
@ -189,29 +223,18 @@ void Thneed::stop() {
record = 0; record = 0;
} }
//#define SAVE_LOG
void Thneed::execute(float **finputs, float *foutput, bool slow) { void Thneed::execute(float **finputs, float *foutput, bool slow) {
int ret;
uint64_t tb, te; uint64_t tb, te;
if (record & 2) tb = nanos_since_boot(); if (record & THNEED_DEBUG) tb = nanos_since_boot();
#ifdef SAVE_LOG
char fn[0x100];
snprintf(fn, sizeof(fn), "/tmp/thneed_log_%d", timestamp);
FILE *f = fopen(fn, "wb");
#endif
// ****** copy inputs // ****** copy inputs
for (int idx = 0; idx < inputs.size(); ++idx) { for (int idx = 0; idx < inputs.size(); ++idx) {
size_t sz; size_t sz;
clGetMemObjectInfo(inputs[idx], CL_MEM_SIZE, sizeof(sz), &sz, NULL); clGetMemObjectInfo(inputs[idx], CL_MEM_SIZE, sizeof(sz), &sz, NULL);
#ifdef SAVE_LOG if (record & THNEED_DEBUG) printf("copying %lu -- %p -> %p\n", sz, finputs[idx], inputs[idx]);
fwrite(&sz, 1, sizeof(sz), f); // TODO: This shouldn't have to block
fwrite(finputs[idx], 1, sz, f);
#endif
if (record & 2) printf("copying %lu -- %p -> %p\n", sz, finputs[idx], inputs[idx]);
clEnqueueWriteBuffer(command_queue, inputs[idx], CL_TRUE, 0, sz, finputs[idx], 0, NULL, NULL); clEnqueueWriteBuffer(command_queue, inputs[idx], CL_TRUE, 0, sz, finputs[idx], 0, NULL, NULL);
} }
@ -229,40 +252,38 @@ void Thneed::execute(float **finputs, float *foutput, bool slow) {
prop.type = KGSL_PROP_PWR_CONSTRAINT; prop.type = KGSL_PROP_PWR_CONSTRAINT;
prop.value = (void*)&constraint; prop.value = (void*)&constraint;
prop.sizebytes = sizeof(constraint); prop.sizebytes = sizeof(constraint);
int ret = ioctl(fd, IOCTL_KGSL_SETPROPERTY, &prop); ret = ioctl(fd, IOCTL_KGSL_SETPROPERTY, &prop);
assert(ret == 0); assert(ret == 0);
// ****** run commands // ****** run commands
int i = 0; int i = 0;
for (auto it = cmds.begin(); it != cmds.end(); ++it) { for (auto &it : cmds) {
++i; ++i;
if (record & 2) printf("run %2d: ", i); if (record & THNEED_DEBUG) printf("run %2d @ %7lu us: ", i, (nanos_since_boot()-tb)/1000);
(*it)->exec((i == cmds.size()) || slow); it->exec((i == cmds.size()) || slow);
} }
// ****** sync objects // ****** sync objects
for (auto it = syncobjs.begin(); it != syncobjs.end(); ++it) { for (auto &it : syncobjs) {
struct kgsl_gpuobj_sync cmd; struct kgsl_gpuobj_sync cmd;
cmd.objs = (uint64_t)it->data(); cmd.objs = (uint64_t)it.data();
cmd.obj_len = it->length(); cmd.obj_len = it.length();
cmd.count = it->length() / sizeof(struct kgsl_gpuobj_sync_obj); cmd.count = it.length() / sizeof(struct kgsl_gpuobj_sync_obj);
ret = ioctl(fd, IOCTL_KGSL_GPUOBJ_SYNC, &cmd); ret = ioctl(fd, IOCTL_KGSL_GPUOBJ_SYNC, &cmd);
assert(ret == 0); assert(ret == 0);
} }
// ****** copy outputs // ****** copy outputs
if (output != NULL) {
size_t sz; size_t sz;
clGetMemObjectInfo(output, CL_MEM_SIZE, sizeof(sz), &sz, NULL); clGetMemObjectInfo(output, CL_MEM_SIZE, sizeof(sz), &sz, NULL);
if (record & 2) printf("copying %lu for output %p -> %p\n", sz, output, foutput); if (record & THNEED_DEBUG) printf("copying %lu for output %p -> %p\n", sz, output, foutput);
clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, sz, foutput, 0, NULL, NULL); clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, sz, foutput, 0, NULL, NULL);
} else {
#ifdef SAVE_LOG printf("CAUTION: model output is NULL, does it have no outputs?\n");
fwrite(&sz, 1, sizeof(sz), f); }
fwrite(foutput, 1, sz, f);
fclose(f);
#endif
// ****** unset power constraint // ****** unset power constraint
constraint.type = KGSL_CONSTRAINT_NONE; constraint.type = KGSL_CONSTRAINT_NONE;
@ -272,25 +293,59 @@ void Thneed::execute(float **finputs, float *foutput, bool slow) {
ret = ioctl(fd, IOCTL_KGSL_SETPROPERTY, &prop); ret = ioctl(fd, IOCTL_KGSL_SETPROPERTY, &prop);
assert(ret == 0); assert(ret == 0);
if (record & 2) { if (record & THNEED_DEBUG) {
te = nanos_since_boot(); te = nanos_since_boot();
printf("model exec in %lu us\n", (te-tb)/1000); printf("model exec in %lu us\n", (te-tb)/1000);
} }
} }
// TODO: with a different way of getting the input and output buffers, we don't have to intercept CL at all void Thneed::clinit() {
cl_int err;
cl_platform_id platform_id[2];
cl_uint num_devices;
cl_uint num_platforms;
err = clGetPlatformIDs(sizeof(platform_id)/sizeof(cl_platform_id), platform_id, &num_platforms);
assert(err == 0);
err = clGetDeviceIDs(platform_id[0], CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &num_devices);
assert(err == 0);
context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &err);
assert(err == 0);
//cl_command_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
cl_command_queue_properties props[3] = {CL_QUEUE_PROPERTIES, 0, 0};
command_queue = clCreateCommandQueueWithProperties(context, device_id, props, &err);
assert(err == 0);
printf("Thneed::clinit done\n");
}
cl_int Thneed::clexec() {
printf("Thneed::clexec: running %lu queued kernels\n", kq.size());
for (auto &k : kq) {
if (record & THNEED_RECORD) ckq.push_back(k);
cl_int ret = k->exec();
assert(ret == CL_SUCCESS);
}
return clFinish(command_queue);
}
// *********** OpenCL interceptor ***********
cl_int (*my_clSetKernelArg)(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value) = NULL;
cl_int thneed_clSetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value) { cl_int thneed_clSetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value) {
if (my_clSetKernelArg == NULL) my_clSetKernelArg = reinterpret_cast<decltype(my_clSetKernelArg)>(dlsym(RTLD_NEXT, "REAL_clSetKernelArg")); g_args_size[make_pair(kernel, arg_index)] = arg_size;
if (arg_value != NULL) { if (arg_value != NULL) {
g_args[make_pair(kernel, arg_index)] = string((char*)arg_value, arg_size); g_args[make_pair(kernel, arg_index)] = string((char*)arg_value, arg_size);
} else {
g_args[make_pair(kernel, arg_index)] = string("");
} }
cl_int ret = my_clSetKernelArg(kernel, arg_index, arg_size, arg_value); cl_int ret = clSetKernelArg(kernel, arg_index, arg_size, arg_value);
return ret; return ret;
} }
cl_int (*my_clEnqueueNDRangeKernel)(cl_command_queue, cl_kernel, cl_uint, const size_t *, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *) = NULL;
cl_int thneed_clEnqueueNDRangeKernel(cl_command_queue command_queue, cl_int thneed_clEnqueueNDRangeKernel(cl_command_queue command_queue,
cl_kernel kernel, cl_kernel kernel,
cl_uint work_dim, cl_uint work_dim,
@ -301,41 +356,163 @@ cl_int thneed_clEnqueueNDRangeKernel(cl_command_queue command_queue,
const cl_event *event_wait_list, const cl_event *event_wait_list,
cl_event *event) { cl_event *event) {
if (my_clEnqueueNDRangeKernel == NULL) my_clEnqueueNDRangeKernel = reinterpret_cast<decltype(my_clEnqueueNDRangeKernel)>(dlsym(RTLD_NEXT, "REAL_clEnqueueNDRangeKernel"));
Thneed *thneed = g_thneed; Thneed *thneed = g_thneed;
// SNPE doesn't use these // SNPE doesn't use these
assert(num_events_in_wait_list == 0); assert(num_events_in_wait_list == 0);
assert(global_work_offset == NULL); assert(global_work_offset == NULL);
assert(event_wait_list == NULL);
cl_int ret = 0;
if (thneed != NULL && thneed->record & THNEED_RECORD) {
if (thneed->context == NULL) {
thneed->command_queue = command_queue;
clGetKernelInfo(kernel, CL_KERNEL_CONTEXT, sizeof(thneed->context), &thneed->context, NULL);
clGetContextInfo(thneed->context, CL_CONTEXT_DEVICES, sizeof(thneed->device_id), &thneed->device_id, NULL);
}
// if we are recording, we don't actually enqueue the kernel
thneed->kq.push_back(unique_ptr<CLQueuedKernel>(new CLQueuedKernel(thneed, kernel, work_dim, global_work_size, local_work_size)));
*event = NULL;
} else {
ret = clEnqueueNDRangeKernel(command_queue, kernel, work_dim,
global_work_offset, global_work_size, local_work_size,
num_events_in_wait_list, event_wait_list, event);
}
return ret;
}
cl_int thneed_clFinish(cl_command_queue command_queue) {
Thneed *thneed = g_thneed;
if (thneed != NULL && thneed->record & THNEED_RECORD) {
#ifdef RUN_OPTIMIZER
thneed->optimize();
#endif
return thneed->clexec();
} else {
return clFinish(command_queue);
}
}
cl_program thneed_clCreateProgramWithSource(cl_context context, cl_uint count, const char **strings, const size_t *lengths, cl_int *errcode_ret) {
assert(count == 1);
cl_program ret = clCreateProgramWithSource(context, count, strings, lengths, errcode_ret);
g_program_source[ret] = strings[0];
return ret;
}
void *dlsym(void *handle, const char *symbol) {
// TODO: Find dlsym in a better way. Currently this is hand looked up in libdl.so
#if defined QCOM
void *(*my_dlsym)(void *handle, const char *symbol) = (void *(*)(void *handle, const char *symbol))((uintptr_t)dlopen-0x2d4);
#elif defined QCOM2
void *(*my_dlsym)(void *handle, const char *symbol) = (void *(*)(void *handle, const char *symbol))((uintptr_t)dlopen+0x138);
#else
#error "Unsupported platform for thneed"
#endif
if (memcmp("REAL_", symbol, 5) == 0) {
return my_dlsym(handle, symbol+5);
} else if (strcmp("clFinish", symbol) == 0) {
return (void*)thneed_clFinish;
} else if (strcmp("clEnqueueNDRangeKernel", symbol) == 0) {
return (void*)thneed_clEnqueueNDRangeKernel;
} else if (strcmp("clSetKernelArg", symbol) == 0) {
return (void*)thneed_clSetKernelArg;
} else if (strcmp("clCreateProgramWithSource", symbol) == 0) {
return (void*)thneed_clCreateProgramWithSource;
} else {
return my_dlsym(handle, symbol);
}
}
char name[0x100]; // *********** CLQueuedKernel ***********
clGetKernelInfo(kernel, CL_KERNEL_FUNCTION_NAME, sizeof(name), name, NULL);
cl_uint num_args; CLQueuedKernel::CLQueuedKernel(Thneed *lthneed,
cl_kernel _kernel,
cl_uint _work_dim,
const size_t *_global_work_size,
const size_t *_local_work_size) {
thneed = lthneed;
kernel = _kernel;
work_dim = _work_dim;
assert(work_dim <= 3);
for (int i = 0; i < work_dim; i++) {
global_work_size[i] = _global_work_size[i];
local_work_size[i] = _local_work_size[i];
}
char _name[0x100];
clGetKernelInfo(kernel, CL_KERNEL_FUNCTION_NAME, sizeof(_name), _name, NULL);
name = string(_name);
clGetKernelInfo(kernel, CL_KERNEL_NUM_ARGS, sizeof(num_args), &num_args, NULL); clGetKernelInfo(kernel, CL_KERNEL_NUM_ARGS, sizeof(num_args), &num_args, NULL);
if (thneed != NULL && thneed->record & 1) { // get args
thneed->command_queue = command_queue;
for (int i = 0; i < num_args; i++) { for (int i = 0; i < num_args; i++) {
char arg_name[0x100]; char arg_name[0x100];
clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_NAME, sizeof(arg_name), arg_name, NULL); clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_NAME, sizeof(arg_name), arg_name, NULL);
string arg = g_args[make_pair(kernel, i)]; arg_names.push_back(string(arg_name));
args.push_back(g_args[make_pair(kernel, i)]);
args_size.push_back(g_args_size[make_pair(kernel, i)]);
}
// get program
clGetKernelInfo(kernel, CL_KERNEL_PROGRAM, sizeof(program), &program, NULL);
}
if (strcmp(arg_name, "input") == 0 && strcmp(name, "zero_pad_image_float") == 0) { int CLQueuedKernel::get_arg_num(const char *search_arg_name) {
cl_mem mem; for (int i = 0; i < num_args; i++) {
memcpy(&mem, (void*)arg.data(), sizeof(mem)); if (arg_names[i] == search_arg_name) return i;
thneed->inputs.push_back(mem); }
printf("failed to find %s in %s\n", search_arg_name, name.c_str());
assert(false);
} }
if (strcmp(arg_name, "output") == 0 && strcmp(name, "image2d_to_buffer_float") == 0) { cl_int CLQueuedKernel::exec() {
cl_mem mem; if (kernel == NULL) {
memcpy(&mem, (void*)arg.data(), sizeof(mem)); kernel = clCreateKernel(program, name.c_str(), NULL);
thneed->output = mem; arg_names.clear();
for (int j = 0; j < num_args; j++) {
char arg_name[0x100];
clGetKernelArgInfo(kernel, j, CL_KERNEL_ARG_NAME, sizeof(arg_name), arg_name, NULL);
arg_names.push_back(string(arg_name));
cl_int ret;
if (args[j].size() != 0) {
assert(args[j].size() == args_size[j]);
ret = thneed_clSetKernelArg(kernel, j, args[j].size(), args[j].data());
} else {
ret = thneed_clSetKernelArg(kernel, j, args_size[j], NULL);
} }
assert(ret == CL_SUCCESS);
} }
} }
if (thneed != NULL && thneed->record & 2) {
printf("%p %56s -- ", kernel, name); // save the global inputs/outputs
if (thneed->record & THNEED_RECORD) {
for (int i = 0; i < num_args; i++) {
if (name == "zero_pad_image_float" && arg_names[i] == "input") {
thneed->inputs.push_back(*(cl_mem*)(args[i].data()));
}
if (name == "image2d_to_buffer_float" && arg_names[i] == "output") {
thneed->output = *(cl_mem*)(args[i].data());
}
}
}
if (thneed->record & THNEED_DEBUG) {
debug_print(thneed->record & THNEED_VERBOSE_DEBUG);
}
return clEnqueueNDRangeKernel(thneed->command_queue,
kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, NULL);
}
void CLQueuedKernel::debug_print(bool verbose) {
printf("%p %56s -- ", kernel, name.c_str());
for (int i = 0; i < work_dim; i++) { for (int i = 0; i < work_dim; i++) {
printf("%4zu ", global_work_size[i]); printf("%4zu ", global_work_size[i]);
} }
@ -344,19 +521,18 @@ cl_int thneed_clEnqueueNDRangeKernel(cl_command_queue command_queue,
printf("%4zu ", local_work_size[i]); printf("%4zu ", local_work_size[i]);
} }
printf("\n"); printf("\n");
}
if (thneed != NULL && thneed->record & 4) { if (verbose) {
// extreme debug
for (int i = 0; i < num_args; i++) { for (int i = 0; i < num_args; i++) {
char arg_type[0x100]; char arg_type[0x100];
char arg_name[0x100];
clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_TYPE_NAME, sizeof(arg_type), arg_type, NULL); clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_TYPE_NAME, sizeof(arg_type), arg_type, NULL);
clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_NAME, sizeof(arg_name), arg_name, NULL); string arg = args[i];
string arg = g_args[make_pair(kernel, i)]; printf(" %s %s", arg_type, arg_names[i].c_str());
printf(" %s %s", arg_type, arg_name);
void *arg_value = (void*)arg.data(); void *arg_value = (void*)arg.data();
int arg_size = arg.size(); int arg_size = arg.size();
if (arg_size == 1) { if (arg_size == 0) {
printf(" (size) %d", args_size[i]);
} else if (arg_size == 1) {
printf(" = %d", *((char*)arg_value)); printf(" = %d", *((char*)arg_value));
} else if (arg_size == 2) { } else if (arg_size == 2) {
printf(" = %d", *((short*)arg_value)); printf(" = %d", *((short*)arg_value));
@ -373,19 +549,24 @@ cl_int thneed_clEnqueueNDRangeKernel(cl_command_queue command_queue,
if (strcmp("image2d_t", arg_type) == 0 || strcmp("image1d_t", arg_type) == 0) { if (strcmp("image2d_t", arg_type) == 0 || strcmp("image1d_t", arg_type) == 0) {
cl_image_format format; cl_image_format format;
size_t width, height, depth, array_size, row_pitch, slice_pitch; size_t width, height, depth, array_size, row_pitch, slice_pitch;
cl_mem buf;
clGetImageInfo(val, CL_IMAGE_FORMAT, sizeof(format), &format, NULL); clGetImageInfo(val, CL_IMAGE_FORMAT, sizeof(format), &format, NULL);
assert(format.image_channel_order == CL_RGBA);
assert(format.image_channel_data_type == CL_HALF_FLOAT); assert(format.image_channel_data_type == CL_HALF_FLOAT);
clGetImageInfo(val, CL_IMAGE_WIDTH, sizeof(width), &width, NULL); clGetImageInfo(val, CL_IMAGE_WIDTH, sizeof(width), &width, NULL);
clGetImageInfo(val, CL_IMAGE_HEIGHT, sizeof(height), &height, NULL); clGetImageInfo(val, CL_IMAGE_HEIGHT, sizeof(height), &height, NULL);
clGetImageInfo(val, CL_IMAGE_ROW_PITCH, sizeof(row_pitch), &row_pitch, NULL);
clGetImageInfo(val, CL_IMAGE_DEPTH, sizeof(depth), &depth, NULL); clGetImageInfo(val, CL_IMAGE_DEPTH, sizeof(depth), &depth, NULL);
clGetImageInfo(val, CL_IMAGE_ARRAY_SIZE, sizeof(array_size), &array_size, NULL); clGetImageInfo(val, CL_IMAGE_ARRAY_SIZE, sizeof(array_size), &array_size, NULL);
clGetImageInfo(val, CL_IMAGE_ROW_PITCH, sizeof(row_pitch), &row_pitch, NULL);
clGetImageInfo(val, CL_IMAGE_SLICE_PITCH, sizeof(slice_pitch), &slice_pitch, NULL); clGetImageInfo(val, CL_IMAGE_SLICE_PITCH, sizeof(slice_pitch), &slice_pitch, NULL);
assert(depth == 0); assert(depth == 0);
assert(array_size == 0); assert(array_size == 0);
assert(slice_pitch == 0); assert(slice_pitch == 0);
printf(" image %zu x %zu rp %zu", width, height, row_pitch); clGetImageInfo(val, CL_IMAGE_BUFFER, sizeof(buf), &buf, NULL);
size_t sz;
clGetMemObjectInfo(buf, CL_MEM_SIZE, sizeof(sz), &sz, NULL);
printf(" image %zu x %zu rp %zu @ %p buffer %zu", width, height, row_pitch, buf, sz);
} else { } else {
size_t sz; size_t sz;
clGetMemObjectInfo(val, CL_MEM_SIZE, sizeof(sz), &sz, NULL); clGetMemObjectInfo(val, CL_MEM_SIZE, sizeof(sz), &sz, NULL);
@ -396,79 +577,5 @@ cl_int thneed_clEnqueueNDRangeKernel(cl_command_queue command_queue,
printf("\n"); printf("\n");
} }
} }
cl_int ret = my_clEnqueueNDRangeKernel(command_queue, kernel, work_dim,
global_work_offset, global_work_size, local_work_size,
num_events_in_wait_list, event_wait_list, event);
/*uint64_t tb = nanos_since_boot();
clWaitForEvents(1, event);
uint64_t te = nanos_since_boot();
if (thneed != NULL && thneed->record & 2) {
printf(" wait %lu us\n", (te-tb)/1000);
}*/
return ret;
}
//#define SAVE_KERNELS
#ifdef SAVE_KERNELS
map<cl_program, string> program_source;
cl_program (*my_clCreateProgramWithSource)(cl_context context, cl_uint count, const char **strings, const size_t *lengths, cl_int *errcode_ret) = NULL;
cl_program thneed_clCreateProgramWithSource(cl_context context, cl_uint count, const char **strings, const size_t *lengths, cl_int *errcode_ret) {
if (my_clCreateProgramWithSource == NULL) my_clCreateProgramWithSource = reinterpret_cast<decltype(my_clCreateProgramWithSource)>(dlsym(RTLD_NEXT, "REAL_clCreateProgramWithSource"));
assert(count == 1);
size_t my_lengths[1];
my_lengths[0] = lengths[0];
char fn[0x100];
snprintf(fn, sizeof(fn), "/tmp/program_%zu.cl", strlen(strings[0]));
FILE *f = fopen(fn, "wb");
fprintf(f, "%s", strings[0]);
fclose(f);
char tmp[0x10000];
memset(tmp, 0, sizeof(tmp));
snprintf(fn, sizeof(fn), "/tmp/patched_%zu.cl", strlen(strings[0]));
FILE *g = fopen(fn, "rb");
if (g != NULL) {
printf("LOADING PATCHED PROGRAM %s\n", fn);
fread(tmp, 1, sizeof(tmp), g);
fclose(g);
strings[0] = tmp;
my_lengths[0] = strlen(tmp);
}
program_source[ret] = strings[0];
cl_program ret = my_clCreateProgramWithSource(context, count, strings, my_lengths, errcode_ret);
return ret;
}
#endif
void *dlsym(void *handle, const char *symbol) {
// TODO: Find dlsym in a better way. Currently this is hand looked up in libdl.so
#if defined QCOM
void *(*my_dlsym)(void *handle, const char *symbol) = (void *(*)(void *handle, const char *symbol))((uintptr_t)dlopen-0x2d4);
#elif defined QCOM2
void *(*my_dlsym)(void *handle, const char *symbol) = (void *(*)(void *handle, const char *symbol))((uintptr_t)dlopen+0x138);
#else
#error "Unsupported platform for thneed"
#endif
if (memcmp("REAL_", symbol, 5) == 0) {
return my_dlsym(handle, symbol+5);
} else if (strcmp("clEnqueueNDRangeKernel", symbol) == 0) {
return (void*)thneed_clEnqueueNDRangeKernel;
} else if (strcmp("clSetKernelArg", symbol) == 0) {
return (void*)thneed_clSetKernelArg;
#ifdef SAVE_KERNELS
} else if (strcmp("clCreateProgramWithSource", symbol) == 0) {
return (void*)thneed_clCreateProgramWithSource;
#endif
} else {
return my_dlsym(handle, symbol);
}
} }

@ -9,10 +9,18 @@
#include "include/msm_kgsl.h" #include "include/msm_kgsl.h"
#include <vector> #include <vector>
#include <memory> #include <memory>
#include <string>
#include <CL/cl.h> #include <CL/cl.h>
#define THNEED_RECORD 1
#define THNEED_DEBUG 2
#define THNEED_VERBOSE_DEBUG 4
using namespace std; using namespace std;
namespace json11 {
class Json;
}
class Thneed; class Thneed;
class GPUMalloc { class GPUMalloc {
@ -25,28 +33,59 @@ class GPUMalloc {
int remaining; int remaining;
}; };
class CLQueuedKernel {
public:
CLQueuedKernel(Thneed *lthneed) { thneed = lthneed; }
CLQueuedKernel(Thneed *lthneed,
cl_kernel _kernel,
cl_uint _work_dim,
const size_t *_global_work_size,
const size_t *_local_work_size);
cl_int exec();
void debug_print(bool verbose);
int get_arg_num(const char *search_arg_name);
cl_program program;
string name;
cl_uint num_args;
vector<string> arg_names;
vector<string> args;
vector<int> args_size;
cl_kernel kernel = NULL;
json11::Json to_json() const;
cl_uint work_dim;
size_t global_work_size[3] = {0};
size_t local_work_size[3] = {0};
private:
Thneed *thneed;
};
class CachedCommand { class CachedCommand {
public: public:
CachedCommand(Thneed *lthneed, struct kgsl_gpu_command *cmd); CachedCommand(Thneed *lthneed, struct kgsl_gpu_command *cmd);
void exec(bool wait); void exec(bool wait);
void disassemble(); void disassemble(int cmd_index);
private: private:
struct kgsl_gpu_command cache; struct kgsl_gpu_command cache;
struct kgsl_command_object cmds[2]; unique_ptr<kgsl_command_object[]> cmds;
struct kgsl_command_object objs[1]; unique_ptr<kgsl_command_object[]> objs;
Thneed *thneed; Thneed *thneed;
vector<shared_ptr<CLQueuedKernel> > kq;
}; };
class Thneed { class Thneed {
public: public:
Thneed(); Thneed(bool do_clinit=false);
void stop(); void stop();
void execute(float **finputs, float *foutput, bool slow=false); void execute(float **finputs, float *foutput, bool slow=false);
int optimize();
vector<cl_mem> inputs; vector<cl_mem> inputs;
cl_mem output; cl_mem output = NULL;
cl_context context = NULL;
cl_command_queue command_queue; cl_command_queue command_queue;
cl_device_id device_id;
int context_id; int context_id;
// protected? // protected?
@ -56,5 +95,19 @@ class Thneed {
vector<unique_ptr<CachedCommand> > cmds; vector<unique_ptr<CachedCommand> > cmds;
vector<string> syncobjs; vector<string> syncobjs;
int fd; int fd;
// all CL kernels
cl_int clexec();
vector<shared_ptr<CLQueuedKernel> > kq;
// pending CL kernels
vector<shared_ptr<CLQueuedKernel> > ckq;
// loading and saving
void load(const char *filename);
void save(const char *filename);
private:
void clinit();
json11::Json to_json();
}; };

Loading…
Cancel
Save