diff --git a/selfdrive/modeld/models/commonmodel.cc b/selfdrive/modeld/models/commonmodel.cc index d2d37bbc5a..a188a4ebd3 100644 --- a/selfdrive/modeld/models/commonmodel.cc +++ b/selfdrive/modeld/models/commonmodel.cc @@ -13,7 +13,7 @@ ModelFrame::ModelFrame(cl_device_id device_id, cl_context context) { y_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, MODEL_WIDTH * MODEL_HEIGHT, NULL, &err)); u_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, (MODEL_WIDTH / 2) * (MODEL_HEIGHT / 2), NULL, &err)); v_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, (MODEL_WIDTH / 2) * (MODEL_HEIGHT / 2), NULL, &err)); - net_input_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, MODEL_FRAME_SIZE * sizeof(uint8_t), NULL, &err)); + net_input_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, frame_size_bytes, NULL, &err)); transform_init(&transform, context, device_id); loadyuv_init(&loadyuv, context, device_id, MODEL_WIDTH, MODEL_HEIGHT); @@ -24,15 +24,16 @@ uint8_t* ModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_height, i yuv_cl, frame_width, frame_height, frame_stride, frame_uv_offset, y_cl, u_cl, v_cl, MODEL_WIDTH, MODEL_HEIGHT, projection); + loadyuv_queue(&loadyuv, q, y_cl, u_cl, v_cl, net_input_cl); if (output == NULL) { - loadyuv_queue(&loadyuv, q, y_cl, u_cl, v_cl, net_input_cl); - - std::memmove(&input_frames[0], &input_frames[MODEL_FRAME_SIZE], sizeof(uint8_t) * MODEL_FRAME_SIZE); - CL_CHECK(clEnqueueReadBuffer(q, net_input_cl, CL_TRUE, 0, MODEL_FRAME_SIZE * sizeof(uint8_t), &input_frames[MODEL_FRAME_SIZE], 0, nullptr, nullptr)); + std::memmove(&input_frames[0], &input_frames[MODEL_FRAME_SIZE], frame_size_bytes); + CL_CHECK(clEnqueueReadBuffer(q, net_input_cl, CL_TRUE, 0, frame_size_bytes, &input_frames[MODEL_FRAME_SIZE], 0, nullptr, nullptr)); clFinish(q); return &input_frames[0]; } else { - loadyuv_queue(&loadyuv, q, y_cl, u_cl, v_cl, *output, true); + copy_queue(&loadyuv, q, *output, *output, frame_size_bytes, 0, frame_size_bytes); + copy_queue(&loadyuv, q, net_input_cl, *output, 0, frame_size_bytes, frame_size_bytes); + // NOTE: Since thneed is using a different command queue, this clFinish is needed to ensure the image is ready. clFinish(q); return NULL; diff --git a/selfdrive/modeld/models/commonmodel.h b/selfdrive/modeld/models/commonmodel.h index ea39466670..fb527fc7a1 100644 --- a/selfdrive/modeld/models/commonmodel.h +++ b/selfdrive/modeld/models/commonmodel.h @@ -26,6 +26,7 @@ public: const int MODEL_HEIGHT = 256; const int MODEL_FRAME_SIZE = MODEL_WIDTH * MODEL_HEIGHT * 3 / 2; const int buf_size = MODEL_FRAME_SIZE * 2; + const size_t frame_size_bytes = MODEL_FRAME_SIZE * sizeof(uint8_t); private: Transform transform; diff --git a/selfdrive/modeld/transforms/loadyuv.cc b/selfdrive/modeld/transforms/loadyuv.cc index c7ce7b0830..c93f5cd038 100644 --- a/selfdrive/modeld/transforms/loadyuv.cc +++ b/selfdrive/modeld/transforms/loadyuv.cc @@ -33,17 +33,8 @@ void loadyuv_destroy(LoadYUVState* s) { void loadyuv_queue(LoadYUVState* s, cl_command_queue q, cl_mem y_cl, cl_mem u_cl, cl_mem v_cl, - cl_mem out_cl, bool do_shift) { + cl_mem out_cl) { cl_int global_out_off = 0; - if (do_shift) { - // shift the image in slot 1 to slot 0, then place the new image in slot 1 - global_out_off += (s->width*s->height) + (s->width/2)*(s->height/2)*2; - CL_CHECK(clSetKernelArg(s->copy_krnl, 0, sizeof(cl_mem), &out_cl)); - CL_CHECK(clSetKernelArg(s->copy_krnl, 1, sizeof(cl_int), &global_out_off)); - const size_t copy_work_size = global_out_off/8; - CL_CHECK(clEnqueueNDRangeKernel(q, s->copy_krnl, 1, NULL, - ©_work_size, NULL, 0, 0, NULL)); - } CL_CHECK(clSetKernelArg(s->loadys_krnl, 0, sizeof(cl_mem), &y_cl)); CL_CHECK(clSetKernelArg(s->loadys_krnl, 1, sizeof(cl_mem), &out_cl)); @@ -72,3 +63,14 @@ void loadyuv_queue(LoadYUVState* s, cl_command_queue q, CL_CHECK(clEnqueueNDRangeKernel(q, s->loaduv_krnl, 1, NULL, &loaduv_work_size, NULL, 0, 0, NULL)); } + +void copy_queue(LoadYUVState* s, cl_command_queue q, cl_mem src, cl_mem dst, + size_t src_offset, size_t dst_offset, size_t size) { + CL_CHECK(clSetKernelArg(s->copy_krnl, 0, sizeof(cl_mem), &src)); + CL_CHECK(clSetKernelArg(s->copy_krnl, 1, sizeof(cl_mem), &dst)); + CL_CHECK(clSetKernelArg(s->copy_krnl, 2, sizeof(cl_int), &src_offset)); + CL_CHECK(clSetKernelArg(s->copy_krnl, 3, sizeof(cl_int), &dst_offset)); + const size_t copy_work_size = size/8; + CL_CHECK(clEnqueueNDRangeKernel(q, s->copy_krnl, 1, NULL, + ©_work_size, NULL, 0, 0, NULL)); +} \ No newline at end of file diff --git a/selfdrive/modeld/transforms/loadyuv.cl b/selfdrive/modeld/transforms/loadyuv.cl index 3a6b3ebc4f..970187a6d7 100644 --- a/selfdrive/modeld/transforms/loadyuv.cl +++ b/selfdrive/modeld/transforms/loadyuv.cl @@ -37,9 +37,11 @@ __kernel void loaduv(__global uchar8 const * const in, out[gid + out_offset / 8] = inv; } -__kernel void copy(__global uchar8 * inout, - int in_offset) +__kernel void copy(__global uchar8 * in, + __global uchar8 * out, + int in_offset, + int out_offset) { const int gid = get_global_id(0); - inout[gid] = inout[gid + in_offset / 8]; + out[gid + out_offset / 8] = in[gid + in_offset / 8]; } diff --git a/selfdrive/modeld/transforms/loadyuv.h b/selfdrive/modeld/transforms/loadyuv.h index 7d27ef5d46..659059cd25 100644 --- a/selfdrive/modeld/transforms/loadyuv.h +++ b/selfdrive/modeld/transforms/loadyuv.h @@ -13,4 +13,8 @@ void loadyuv_destroy(LoadYUVState* s); void loadyuv_queue(LoadYUVState* s, cl_command_queue q, cl_mem y_cl, cl_mem u_cl, cl_mem v_cl, - cl_mem out_cl, bool do_shift = false); + cl_mem out_cl); + + +void copy_queue(LoadYUVState* s, cl_command_queue q, cl_mem src, cl_mem dst, + size_t src_offset, size_t dst_offset, size_t size); \ No newline at end of file