openpilot is an open source driver assistance system. openpilot performs the functions of Automated Lane Centering and Adaptive Cruise Control for over 200 supported car makes and models.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

467 lines
16 KiB

#include <cassert>
#include <sys/mman.h>
#include <dlfcn.h>
#include <map>
#include <string>
#include <string.h>
#include <errno.h>
#include "thneed.h"
Thneed *g_thneed = NULL;
int g_fd = -1;
map<pair<cl_kernel, int>, string> g_args;
static inline uint64_t nanos_since_boot() {
struct timespec t;
clock_gettime(CLOCK_BOOTTIME, &t);
return t.tv_sec * 1000000000ULL + t.tv_nsec;
}
void hexdump(uint32_t *d, int len) {
assert((len%4) == 0);
printf(" dumping %p len 0x%x\n", d, len);
for (int i = 0; i < len/4; i++) {
if (i != 0 && (i%0x10) == 0) printf("\n");
printf("%8x ", d[i]);
}
printf("\n");
}
extern "C" {
int (*my_ioctl)(int filedes, unsigned long request, void *argp) = NULL;
#undef ioctl
int ioctl(int filedes, unsigned long request, void *argp) {
request &= 0xFFFFFFFF; // needed on QCOM2
if (my_ioctl == NULL) my_ioctl = reinterpret_cast<decltype(my_ioctl)>(dlsym(RTLD_NEXT, "ioctl"));
Thneed *thneed = g_thneed;
// save the fd
if (request == IOCTL_KGSL_GPUOBJ_ALLOC) g_fd = filedes;
if (thneed != NULL) {
if (request == IOCTL_KGSL_GPU_COMMAND) {
struct kgsl_gpu_command *cmd = (struct kgsl_gpu_command *)argp;
if (thneed->record & 1) {
thneed->timestamp = cmd->timestamp;
thneed->context_id = cmd->context_id;
thneed->cmds.push_back(unique_ptr<CachedCommand>(new CachedCommand(thneed, cmd)));
}
if (thneed->record & 2) {
printf("IOCTL_KGSL_GPU_COMMAND(%2zu): flags: 0x%lx context_id: %u timestamp: %u\n",
thneed->cmds.size(),
cmd->flags,
cmd->context_id, cmd->timestamp);
}
} else if (request == IOCTL_KGSL_GPUOBJ_SYNC) {
struct kgsl_gpuobj_sync *cmd = (struct kgsl_gpuobj_sync *)argp;
struct kgsl_gpuobj_sync_obj *objs = (struct kgsl_gpuobj_sync_obj *)(cmd->objs);
if (thneed->record & 2) {
printf("IOCTL_KGSL_GPUOBJ_SYNC count:%d ", cmd->count);
for (int i = 0; i < cmd->count; i++) {
printf(" -- offset:0x%lx len:0x%lx id:%d op:%d ", objs[i].offset, objs[i].length, objs[i].id, objs[i].op);
}
printf("\n");
}
if (thneed->record & 1) {
thneed->syncobjs.push_back(string((char *)objs, sizeof(struct kgsl_gpuobj_sync_obj)*cmd->count));
}
} else if (request == IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID) {
struct kgsl_device_waittimestamp_ctxtid *cmd = (struct kgsl_device_waittimestamp_ctxtid *)argp;
if (thneed->record & 2) {
printf("IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID: context_id: %d timestamp: %d timeout: %d\n",
cmd->context_id, cmd->timestamp, cmd->timeout);
}
} else if (request == IOCTL_KGSL_SETPROPERTY) {
if (thneed->record & 2) {
struct kgsl_device_getproperty *prop = (struct kgsl_device_getproperty *)argp;
printf("IOCTL_KGSL_SETPROPERTY: 0x%x sizebytes:%zu\n", prop->type, prop->sizebytes);
if (thneed->record & 4) {
hexdump((uint32_t *)prop->value, prop->sizebytes);
if (prop->type == KGSL_PROP_PWR_CONSTRAINT) {
struct kgsl_device_constraint *constraint = (struct kgsl_device_constraint *)prop->value;
hexdump((uint32_t *)constraint->data, constraint->size);
}
}
}
}
}
int ret = my_ioctl(filedes, request, argp);
if (ret != 0) printf("ioctl returned %d with errno %d\n", ret, errno);
return ret;
}
}
GPUMalloc::GPUMalloc(int size, int fd) {
struct kgsl_gpuobj_alloc alloc;
memset(&alloc, 0, sizeof(alloc));
alloc.size = size;
alloc.flags = 0x10000a00;
ioctl(fd, IOCTL_KGSL_GPUOBJ_ALLOC, &alloc);
void *addr = mmap64(NULL, alloc.mmapsize, 0x3, 0x1, fd, alloc.id*0x1000);
assert(addr != MAP_FAILED);
base = (uint64_t)addr;
remaining = size;
}
GPUMalloc::~GPUMalloc() {
// TODO: free the GPU malloced area
}
void *GPUMalloc::alloc(int size) {
if (size > remaining) return NULL;
remaining -= size;
void *ret = (void*)base;
base += (size+0xff) & (~0xFF);
return ret;
}
CachedCommand::CachedCommand(Thneed *lthneed, struct kgsl_gpu_command *cmd) {
thneed = lthneed;
assert(cmd->numcmds == 2);
assert(cmd->numobjs == 1);
assert(cmd->numsyncs == 0);
memcpy(cmds, (void *)cmd->cmdlist, sizeof(struct kgsl_command_object)*2);
memcpy(objs, (void *)cmd->objlist, sizeof(struct kgsl_command_object)*1);
memcpy(&cache, cmd, sizeof(cache));
cache.cmdlist = (uint64_t)cmds;
cache.objlist = (uint64_t)objs;
for (int i = 0; i < cmd->numcmds; i++) {
void *nn = thneed->ram->alloc(cmds[i].size);
memcpy(nn, (void*)cmds[i].gpuaddr, cmds[i].size);
cmds[i].gpuaddr = (uint64_t)nn;
}
for (int i = 0; i < cmd->numobjs; i++) {
void *nn = thneed->ram->alloc(objs[i].size);
memset(nn, 0, objs[i].size);
objs[i].gpuaddr = (uint64_t)nn;
}
}
void CachedCommand::exec(bool wait) {
cache.timestamp = ++thneed->timestamp;
int ret = ioctl(thneed->fd, IOCTL_KGSL_GPU_COMMAND, &cache);
if (wait) {
struct kgsl_device_waittimestamp_ctxtid wait;
wait.context_id = cache.context_id;
wait.timestamp = cache.timestamp;
wait.timeout = -1;
uint64_t tb = nanos_since_boot();
int wret = ioctl(thneed->fd, IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID, &wait);
uint64_t te = nanos_since_boot();
if (thneed->record & 2) printf("exec %d wait %d after %lu us\n", ret, wret, (te-tb)/1000);
} else {
if (thneed->record & 2) printf("CachedCommand::exec got %d\n", ret);
}
assert(ret == 0);
}
Thneed::Thneed() {
assert(g_fd != -1);
fd = g_fd;
ram = make_unique<GPUMalloc>(0x40000, fd);
record = 1;
timestamp = -1;
g_thneed = this;
}
void Thneed::stop() {
record = 0;
}
//#define SAVE_LOG
void Thneed::execute(float **finputs, float *foutput, bool slow) {
uint64_t tb, te;
if (record & 2) tb = nanos_since_boot();
#ifdef SAVE_LOG
char fn[0x100];
snprintf(fn, sizeof(fn), "/tmp/thneed_log_%d", timestamp);
FILE *f = fopen(fn, "wb");
#endif
// ****** copy inputs
for (int idx = 0; idx < inputs.size(); ++idx) {
size_t sz;
clGetMemObjectInfo(inputs[idx], CL_MEM_SIZE, sizeof(sz), &sz, NULL);
#ifdef SAVE_LOG
fwrite(&sz, 1, sizeof(sz), f);
fwrite(finputs[idx], 1, sz, f);
#endif
if (record & 2) printf("copying %lu -- %p -> %p\n", sz, finputs[idx], inputs[idx]);
clEnqueueWriteBuffer(command_queue, inputs[idx], CL_TRUE, 0, sz, finputs[idx], 0, NULL, NULL);
}
// ****** set power constraint
struct kgsl_device_constraint_pwrlevel pwrlevel;
pwrlevel.level = KGSL_CONSTRAINT_PWR_MAX;
struct kgsl_device_constraint constraint;
constraint.type = KGSL_CONSTRAINT_PWRLEVEL;
constraint.context_id = context_id;
constraint.data = (void*)&pwrlevel;
constraint.size = sizeof(pwrlevel);
struct kgsl_device_getproperty prop;
prop.type = KGSL_PROP_PWR_CONSTRAINT;
prop.value = (void*)&constraint;
prop.sizebytes = sizeof(constraint);
int ret = ioctl(fd, IOCTL_KGSL_SETPROPERTY, &prop);
assert(ret == 0);
// ****** run commands
int i = 0;
for (auto it = cmds.begin(); it != cmds.end(); ++it) {
++i;
if (record & 2) printf("run %2d: ", i);
(*it)->exec((i == cmds.size()) || slow);
}
// ****** sync objects
for (auto it = syncobjs.begin(); it != syncobjs.end(); ++it) {
struct kgsl_gpuobj_sync cmd;
cmd.objs = (uint64_t)it->data();
cmd.obj_len = it->length();
cmd.count = it->length() / sizeof(struct kgsl_gpuobj_sync_obj);
ret = ioctl(fd, IOCTL_KGSL_GPUOBJ_SYNC, &cmd);
assert(ret == 0);
}
// ****** copy outputs
size_t sz;
clGetMemObjectInfo(output, CL_MEM_SIZE, sizeof(sz), &sz, NULL);
if (record & 2) printf("copying %lu for output %p -> %p\n", sz, output, foutput);
clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, sz, foutput, 0, NULL, NULL);
#ifdef SAVE_LOG
fwrite(&sz, 1, sizeof(sz), f);
fwrite(foutput, 1, sz, f);
fclose(f);
#endif
// ****** unset power constraint
constraint.type = KGSL_CONSTRAINT_NONE;
constraint.data = NULL;
constraint.size = 0;
ret = ioctl(fd, IOCTL_KGSL_SETPROPERTY, &prop);
assert(ret == 0);
if (record & 2) {
te = nanos_since_boot();
printf("model exec in %lu us\n", (te-tb)/1000);
}
}
// TODO: with a different way of getting the input and output buffers, we don't have to intercept CL at all
cl_int (*my_clSetKernelArg)(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value) = NULL;
cl_int thneed_clSetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value) {
if (my_clSetKernelArg == NULL) my_clSetKernelArg = reinterpret_cast<decltype(my_clSetKernelArg)>(dlsym(RTLD_NEXT, "REAL_clSetKernelArg"));
if (arg_value != NULL) {
g_args[make_pair(kernel, arg_index)] = string((char*)arg_value, arg_size);
}
cl_int ret = my_clSetKernelArg(kernel, arg_index, arg_size, arg_value);
return ret;
}
cl_int (*my_clEnqueueNDRangeKernel)(cl_command_queue, cl_kernel, cl_uint, const size_t *, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *) = NULL;
cl_int thneed_clEnqueueNDRangeKernel(cl_command_queue command_queue,
cl_kernel kernel,
cl_uint work_dim,
const size_t *global_work_offset,
const size_t *global_work_size,
const size_t *local_work_size,
cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) {
if (my_clEnqueueNDRangeKernel == NULL) my_clEnqueueNDRangeKernel = reinterpret_cast<decltype(my_clEnqueueNDRangeKernel)>(dlsym(RTLD_NEXT, "REAL_clEnqueueNDRangeKernel"));
Thneed *thneed = g_thneed;
// SNPE doesn't use these
assert(num_events_in_wait_list == 0);
assert(global_work_offset == NULL);
char name[0x100];
clGetKernelInfo(kernel, CL_KERNEL_FUNCTION_NAME, sizeof(name), name, NULL);
cl_uint num_args;
clGetKernelInfo(kernel, CL_KERNEL_NUM_ARGS, sizeof(num_args), &num_args, NULL);
if (thneed != NULL && thneed->record & 1) {
thneed->command_queue = command_queue;
for (int i = 0; i < num_args; i++) {
char arg_name[0x100];
clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_NAME, sizeof(arg_name), arg_name, NULL);
string arg = g_args[make_pair(kernel, i)];
if (strcmp(arg_name, "input") == 0 && strcmp(name, "zero_pad_image_float") == 0) {
cl_mem mem;
memcpy(&mem, (void*)arg.data(), sizeof(mem));
thneed->inputs.push_back(mem);
}
if (strcmp(arg_name, "output") == 0 && strcmp(name, "image2d_to_buffer_float") == 0) {
cl_mem mem;
memcpy(&mem, (void*)arg.data(), sizeof(mem));
thneed->output = mem;
}
}
}
if (thneed != NULL && thneed->record & 2) {
printf("%p %56s -- ", kernel, name);
for (int i = 0; i < work_dim; i++) {
printf("%4zu ", global_work_size[i]);
}
printf(" -- ");
for (int i = 0; i < work_dim; i++) {
printf("%4zu ", local_work_size[i]);
}
printf("\n");
}
if (thneed != NULL && thneed->record & 4) {
// extreme debug
for (int i = 0; i < num_args; i++) {
char arg_type[0x100];
char arg_name[0x100];
clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_TYPE_NAME, sizeof(arg_type), arg_type, NULL);
clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_NAME, sizeof(arg_name), arg_name, NULL);
string arg = g_args[make_pair(kernel, i)];
printf(" %s %s", arg_type, arg_name);
void *arg_value = (void*)arg.data();
int arg_size = arg.size();
if (arg_size == 1) {
printf(" = %d", *((char*)arg_value));
} else if (arg_size == 2) {
printf(" = %d", *((short*)arg_value));
} else if (arg_size == 4) {
if (strcmp(arg_type, "float") == 0) {
printf(" = %f", *((float*)arg_value));
} else {
printf(" = %d", *((int*)arg_value));
}
} else if (arg_size == 8) {
cl_mem val = (cl_mem)(*((uintptr_t*)arg_value));
printf(" = %p", val);
if (val != NULL) {
if (strcmp("image2d_t", arg_type) == 0 || strcmp("image1d_t", arg_type) == 0) {
cl_image_format format;
size_t width, height, depth, array_size, row_pitch, slice_pitch;
clGetImageInfo(val, CL_IMAGE_FORMAT, sizeof(format), &format, NULL);
assert(format.image_channel_data_type == CL_HALF_FLOAT);
clGetImageInfo(val, CL_IMAGE_WIDTH, sizeof(width), &width, NULL);
clGetImageInfo(val, CL_IMAGE_HEIGHT, sizeof(height), &height, NULL);
clGetImageInfo(val, CL_IMAGE_DEPTH, sizeof(depth), &depth, NULL);
clGetImageInfo(val, CL_IMAGE_ARRAY_SIZE, sizeof(array_size), &array_size, NULL);
clGetImageInfo(val, CL_IMAGE_ROW_PITCH, sizeof(row_pitch), &row_pitch, NULL);
clGetImageInfo(val, CL_IMAGE_SLICE_PITCH, sizeof(slice_pitch), &slice_pitch, NULL);
assert(depth == 0);
assert(array_size == 0);
assert(slice_pitch == 0);
printf(" image %zu x %zu rp %zu", width, height, row_pitch);
} else {
size_t sz;
clGetMemObjectInfo(val, CL_MEM_SIZE, sizeof(sz), &sz, NULL);
printf(" buffer %zu", sz);
}
}
}
printf("\n");
}
}
cl_int ret = my_clEnqueueNDRangeKernel(command_queue, kernel, work_dim,
global_work_offset, global_work_size, local_work_size,
num_events_in_wait_list, event_wait_list, event);
/*uint64_t tb = nanos_since_boot();
clWaitForEvents(1, event);
uint64_t te = nanos_since_boot();
if (thneed != NULL && thneed->record & 2) {
printf(" wait %lu us\n", (te-tb)/1000);
}*/
return ret;
}
//#define SAVE_KERNELS
#ifdef SAVE_KERNELS
map<cl_program, string> program_source;
cl_program (*my_clCreateProgramWithSource)(cl_context context, cl_uint count, const char **strings, const size_t *lengths, cl_int *errcode_ret) = NULL;
cl_program thneed_clCreateProgramWithSource(cl_context context, cl_uint count, const char **strings, const size_t *lengths, cl_int *errcode_ret) {
if (my_clCreateProgramWithSource == NULL) my_clCreateProgramWithSource = reinterpret_cast<decltype(my_clCreateProgramWithSource)>(dlsym(RTLD_NEXT, "REAL_clCreateProgramWithSource"));
assert(count == 1);
size_t my_lengths[1];
my_lengths[0] = lengths[0];
char fn[0x100];
snprintf(fn, sizeof(fn), "/tmp/program_%zu.cl", strlen(strings[0]));
FILE *f = fopen(fn, "wb");
fprintf(f, "%s", strings[0]);
fclose(f);
char tmp[0x10000];
memset(tmp, 0, sizeof(tmp));
snprintf(fn, sizeof(fn), "/tmp/patched_%zu.cl", strlen(strings[0]));
FILE *g = fopen(fn, "rb");
if (g != NULL) {
printf("LOADING PATCHED PROGRAM %s\n", fn);
fread(tmp, 1, sizeof(tmp), g);
fclose(g);
strings[0] = tmp;
my_lengths[0] = strlen(tmp);
}
program_source[ret] = strings[0];
cl_program ret = my_clCreateProgramWithSource(context, count, strings, my_lengths, errcode_ret);
return ret;
}
#endif
void *dlsym(void *handle, const char *symbol) {
// TODO: Find dlsym in a better way. Currently this is hand looked up in libdl.so
#if defined QCOM
void *(*my_dlsym)(void *handle, const char *symbol) = (void *(*)(void *handle, const char *symbol))((uintptr_t)dlopen-0x2d4);
#elif defined QCOM2
void *(*my_dlsym)(void *handle, const char *symbol) = (void *(*)(void *handle, const char *symbol))((uintptr_t)dlopen+0x138);
#else
#error "Unsupported platform for thneed"
#endif
if (memcmp("REAL_", symbol, 5) == 0) {
return my_dlsym(handle, symbol+5);
} else if (strcmp("clEnqueueNDRangeKernel", symbol) == 0) {
return (void*)thneed_clEnqueueNDRangeKernel;
} else if (strcmp("clSetKernelArg", symbol) == 0) {
return (void*)thneed_clSetKernelArg;
#ifdef SAVE_KERNELS
} else if (strcmp("clCreateProgramWithSource", symbol) == 0) {
return (void*)thneed_clCreateProgramWithSource;
#endif
} else {
return my_dlsym(handle, symbol);
}
}