diff --git a/selfdrive/modeld/models/commonmodel.cc b/selfdrive/modeld/models/commonmodel.cc index 9def7e49e3..4575199943 100644 --- a/selfdrive/modeld/models/commonmodel.cc +++ b/selfdrive/modeld/models/commonmodel.cc @@ -22,16 +22,24 @@ ModelFrame::ModelFrame(cl_device_id device_id, cl_context context) { loadyuv_init(&loadyuv, context, device_id, MODEL_WIDTH, MODEL_HEIGHT); } -float* ModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_height, const mat3 &transform) { +float* ModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_height, const mat3 &transform, cl_mem *output) { transform_queue(&this->transform, q, yuv_cl, frame_width, frame_height, y_cl, u_cl, v_cl, MODEL_WIDTH, MODEL_HEIGHT, transform); - loadyuv_queue(&loadyuv, q, y_cl, u_cl, v_cl, net_input_cl); - std::memmove(&input_frames[0], &input_frames[MODEL_FRAME_SIZE], sizeof(float) * MODEL_FRAME_SIZE); - clEnqueueReadBuffer(q, net_input_cl, CL_TRUE, 0, MODEL_FRAME_SIZE * sizeof(float), &input_frames[MODEL_FRAME_SIZE], 0, nullptr, nullptr); - clFinish(q); - return &input_frames[0]; + if (output == NULL) { + loadyuv_queue(&loadyuv, q, y_cl, u_cl, v_cl, net_input_cl); + + std::memmove(&input_frames[0], &input_frames[MODEL_FRAME_SIZE], sizeof(float) * MODEL_FRAME_SIZE); + CL_CHECK(clEnqueueReadBuffer(q, net_input_cl, CL_TRUE, 0, MODEL_FRAME_SIZE * sizeof(float), &input_frames[MODEL_FRAME_SIZE], 0, nullptr, nullptr)); + clFinish(q); + return &input_frames[0]; + } else { + loadyuv_queue(&loadyuv, q, y_cl, u_cl, v_cl, *output, true); + // NOTE: Since thneed is using a different command queue, this clFinish is needed to ensure the image is ready. + clFinish(q); + return NULL; + } } ModelFrame::~ModelFrame() { diff --git a/selfdrive/modeld/models/commonmodel.h b/selfdrive/modeld/models/commonmodel.h index e4e6563d7c..d7904489b1 100644 --- a/selfdrive/modeld/models/commonmodel.h +++ b/selfdrive/modeld/models/commonmodel.h @@ -30,7 +30,7 @@ class ModelFrame { public: ModelFrame(cl_device_id device_id, cl_context context); ~ModelFrame(); - float* prepare(cl_mem yuv_cl, int width, int height, const mat3& transform); + float* prepare(cl_mem yuv_cl, int width, int height, const mat3& transform, cl_mem *output); const int buf_size = MODEL_FRAME_SIZE * 2; diff --git a/selfdrive/modeld/models/driving.cc b/selfdrive/modeld/models/driving.cc index 0ebb9a374b..c1ce11bea2 100644 --- a/selfdrive/modeld/models/driving.cc +++ b/selfdrive/modeld/models/driving.cc @@ -105,7 +105,8 @@ ModelDataRaw model_eval_frame(ModelState* s, cl_mem yuv_cl, int width, int heigh //for (int i = 0; i < OUTPUT_SIZE + TEMPORAL_SIZE; i++) { printf("%f ", s->output[i]); } printf("\n"); - auto net_input_buf = s->frame->prepare(yuv_cl, width, height, transform); + // if getInputBuf is not NULL, net_input_buf will be + auto net_input_buf = s->frame->prepare(yuv_cl, width, height, transform, static_cast(s->m->getInputBuf())); s->m->execute(net_input_buf, s->frame->buf_size); // net outputs diff --git a/selfdrive/modeld/runners/runmodel.h b/selfdrive/modeld/runners/runmodel.h index 9eb7e1ea6c..0893a4acc6 100644 --- a/selfdrive/modeld/runners/runmodel.h +++ b/selfdrive/modeld/runners/runmodel.h @@ -5,5 +5,6 @@ public: virtual void addDesire(float *state, int state_size) {} virtual void addTrafficConvention(float *state, int state_size) {} virtual void execute(float *net_input_buf, int buf_size) {} + virtual void* getInputBuf() { return nullptr; } }; diff --git a/selfdrive/modeld/runners/thneedmodel.cc b/selfdrive/modeld/runners/thneedmodel.cc index be782b9741..0beb6a2794 100644 --- a/selfdrive/modeld/runners/thneedmodel.cc +++ b/selfdrive/modeld/runners/thneedmodel.cc @@ -25,6 +25,11 @@ void ThneedModel::addDesire(float *state, int state_size) { desire = state; } +void* ThneedModel::getInputBuf() { + if (thneed->input_clmem.size() > 3) return &(thneed->input_clmem[3]); + else return nullptr; +} + void ThneedModel::execute(float *net_input_buf, int buf_size) { float *inputs[4] = {recurrent, trafficConvention, desire, net_input_buf}; if (!recorded) { diff --git a/selfdrive/modeld/runners/thneedmodel.h b/selfdrive/modeld/runners/thneedmodel.h index 933c751d96..1720d5ff20 100644 --- a/selfdrive/modeld/runners/thneedmodel.h +++ b/selfdrive/modeld/runners/thneedmodel.h @@ -10,6 +10,7 @@ public: void addTrafficConvention(float *state, int state_size); void addDesire(float *state, int state_size); void execute(float *net_input_buf, int buf_size); + void* getInputBuf(); private: Thneed *thneed = NULL; bool recorded; diff --git a/selfdrive/modeld/thneed/thneed.cc b/selfdrive/modeld/thneed/thneed.cc index 89fdfe3910..242905d9f4 100644 --- a/selfdrive/modeld/thneed/thneed.cc +++ b/selfdrive/modeld/thneed/thneed.cc @@ -241,6 +241,7 @@ void Thneed::find_inputs_outputs() { for (int i = 0; i < k->num_args; i++) { if (k->name == "zero_pad_image_float" && k->arg_names[i] == "input") { cl_mem aa = *(cl_mem*)(k->args[i].data()); + input_clmem.push_back(aa); size_t sz; clGetMemObjectInfo(aa, CL_MEM_SIZE, sizeof(sz), &sz, NULL); @@ -262,7 +263,7 @@ void Thneed::copy_inputs(float **finputs) { //cl_int ret; for (int idx = 0; idx < inputs.size(); ++idx) { if (record & THNEED_DEBUG) printf("copying %lu -- %p -> %p\n", input_sizes[idx], finputs[idx], inputs[idx]); - memcpy(inputs[idx], finputs[idx], input_sizes[idx]); + if (finputs[idx] != NULL) memcpy(inputs[idx], finputs[idx], input_sizes[idx]); } } diff --git a/selfdrive/modeld/thneed/thneed.h b/selfdrive/modeld/thneed/thneed.h index 1ae7247477..77d27e0435 100644 --- a/selfdrive/modeld/thneed/thneed.h +++ b/selfdrive/modeld/thneed/thneed.h @@ -98,6 +98,7 @@ class Thneed { void wait(); int optimize(); + vector input_clmem; vector inputs; vector input_sizes; cl_mem output = NULL; diff --git a/selfdrive/modeld/transforms/loadyuv.cc b/selfdrive/modeld/transforms/loadyuv.cc index 9f3dd45aad..39f404a897 100644 --- a/selfdrive/modeld/transforms/loadyuv.cc +++ b/selfdrive/modeld/transforms/loadyuv.cc @@ -19,6 +19,7 @@ void loadyuv_init(LoadYUVState* s, cl_context ctx, cl_device_id device_id, int w s->loadys_krnl = CL_CHECK_ERR(clCreateKernel(prg, "loadys", &err)); s->loaduv_krnl = CL_CHECK_ERR(clCreateKernel(prg, "loaduv", &err)); + s->copy_krnl = CL_CHECK_ERR(clCreateKernel(prg, "copy", &err)); // done with this CL_CHECK(clReleaseProgram(prg)); @@ -27,33 +28,46 @@ void loadyuv_init(LoadYUVState* s, cl_context ctx, cl_device_id device_id, int w void loadyuv_destroy(LoadYUVState* s) { CL_CHECK(clReleaseKernel(s->loadys_krnl)); CL_CHECK(clReleaseKernel(s->loaduv_krnl)); + CL_CHECK(clReleaseKernel(s->copy_krnl)); } void loadyuv_queue(LoadYUVState* s, cl_command_queue q, cl_mem y_cl, cl_mem u_cl, cl_mem v_cl, - cl_mem out_cl) { + cl_mem out_cl, bool do_shift) { + cl_int global_out_off = 0; + if (do_shift) { + // shift the image in slot 1 to slot 0, then place the new image in slot 1 + global_out_off += (s->width*s->height) + (s->width/2)*(s->height/2)*2; + CL_CHECK(clSetKernelArg(s->copy_krnl, 0, sizeof(cl_mem), &out_cl)); + CL_CHECK(clSetKernelArg(s->copy_krnl, 1, sizeof(cl_int), &global_out_off)); + const size_t copy_work_size = global_out_off/8; + CL_CHECK(clEnqueueNDRangeKernel(q, s->copy_krnl, 1, NULL, + ©_work_size, NULL, 0, 0, NULL)); + } + CL_CHECK(clSetKernelArg(s->loadys_krnl, 0, sizeof(cl_mem), &y_cl)); CL_CHECK(clSetKernelArg(s->loadys_krnl, 1, sizeof(cl_mem), &out_cl)); + CL_CHECK(clSetKernelArg(s->loadys_krnl, 2, sizeof(cl_int), &global_out_off)); const size_t loadys_work_size = (s->width*s->height)/8; CL_CHECK(clEnqueueNDRangeKernel(q, s->loadys_krnl, 1, NULL, &loadys_work_size, NULL, 0, 0, NULL)); const size_t loaduv_work_size = ((s->width/2)*(s->height/2))/8; - cl_int loaduv_out_off = (s->width*s->height); + global_out_off += (s->width*s->height); CL_CHECK(clSetKernelArg(s->loaduv_krnl, 0, sizeof(cl_mem), &u_cl)); CL_CHECK(clSetKernelArg(s->loaduv_krnl, 1, sizeof(cl_mem), &out_cl)); - CL_CHECK(clSetKernelArg(s->loaduv_krnl, 2, sizeof(cl_int), &loaduv_out_off)); + CL_CHECK(clSetKernelArg(s->loaduv_krnl, 2, sizeof(cl_int), &global_out_off)); CL_CHECK(clEnqueueNDRangeKernel(q, s->loaduv_krnl, 1, NULL, &loaduv_work_size, NULL, 0, 0, NULL)); - loaduv_out_off += (s->width/2)*(s->height/2); + global_out_off += (s->width/2)*(s->height/2); CL_CHECK(clSetKernelArg(s->loaduv_krnl, 0, sizeof(cl_mem), &v_cl)); CL_CHECK(clSetKernelArg(s->loaduv_krnl, 1, sizeof(cl_mem), &out_cl)); - CL_CHECK(clSetKernelArg(s->loaduv_krnl, 2, sizeof(cl_int), &loaduv_out_off)); + CL_CHECK(clSetKernelArg(s->loaduv_krnl, 2, sizeof(cl_int), &global_out_off)); CL_CHECK(clEnqueueNDRangeKernel(q, s->loaduv_krnl, 1, NULL, &loaduv_work_size, NULL, 0, 0, NULL)); diff --git a/selfdrive/modeld/transforms/loadyuv.cl b/selfdrive/modeld/transforms/loadyuv.cl index fc7655b568..7dd3d973a3 100644 --- a/selfdrive/modeld/transforms/loadyuv.cl +++ b/selfdrive/modeld/transforms/loadyuv.cl @@ -1,7 +1,8 @@ #define UV_SIZE ((TRANSFORMED_WIDTH/2)*(TRANSFORMED_HEIGHT/2)) __kernel void loadys(__global uchar8 const * const Y, - __global float * out) + __global float * out, + int out_offset) { const int gid = get_global_id(0); const int ois = gid * 8; @@ -17,11 +18,11 @@ __kernel void loadys(__global uchar8 const * const Y, __global float* outy0; __global float* outy1; if ((oy & 1) == 0) { - outy0 = out; //y0 - outy1 = out + UV_SIZE*2; //y2 + outy0 = out + out_offset; //y0 + outy1 = out + out_offset + UV_SIZE*2; //y2 } else { - outy0 = out + UV_SIZE; //y1 - outy1 = out + UV_SIZE*3; //y3 + outy0 = out + out_offset + UV_SIZE; //y1 + outy1 = out + out_offset + UV_SIZE*3; //y3 } vstore4(ysf.s0246, 0, outy0 + (oy/2) * (TRANSFORMED_WIDTH/2) + ox/2); @@ -37,3 +38,10 @@ __kernel void loaduv(__global uchar8 const * const in, const float8 outv = convert_float8(inv); out[gid + out_offset / 8] = outv; } + +__kernel void copy(__global float8 * inout, + int in_offset) +{ + const int gid = get_global_id(0); + inout[gid] = inout[gid + in_offset / 8]; +} diff --git a/selfdrive/modeld/transforms/loadyuv.h b/selfdrive/modeld/transforms/loadyuv.h index a77258debd..a3161b2935 100644 --- a/selfdrive/modeld/transforms/loadyuv.h +++ b/selfdrive/modeld/transforms/loadyuv.h @@ -4,7 +4,7 @@ typedef struct { int width, height; - cl_kernel loadys_krnl, loaduv_krnl; + cl_kernel loadys_krnl, loaduv_krnl, copy_krnl; } LoadYUVState; void loadyuv_init(LoadYUVState* s, cl_context ctx, cl_device_id device_id, int width, int height); @@ -13,4 +13,4 @@ void loadyuv_destroy(LoadYUVState* s); void loadyuv_queue(LoadYUVState* s, cl_command_queue q, cl_mem y_cl, cl_mem u_cl, cl_mem v_cl, - cl_mem out_cl); + cl_mem out_cl, bool do_shift = false);