From 7352e612a26e17bee9cbb3caeda28aa017edd44c Mon Sep 17 00:00:00 2001 From: ZwX1616 Date: Mon, 16 Dec 2024 16:29:06 -0800 Subject: [PATCH] dmonitoringmodeld: use cl transform without tinygrad (#34266) * merge * why * self.buf_size * 0.05 more than with tg due to copy --------- Co-authored-by: Comma Device --- common/transformations/model.py | 9 ++- selfdrive/modeld/dmonitoringmodeld.py | 25 +++--- selfdrive/modeld/modeld.py | 10 +-- selfdrive/modeld/models/commonmodel.cc | 47 +++++++----- selfdrive/modeld/models/commonmodel.h | 76 +++++++++++++++++-- selfdrive/modeld/models/commonmodel.pxd | 10 ++- selfdrive/modeld/models/commonmodel_pyx.pyx | 41 ++++++++-- system/hardware/tici/tests/test_power_draw.py | 2 +- 8 files changed, 169 insertions(+), 51 deletions(-) diff --git a/common/transformations/model.py b/common/transformations/model.py index aaa12d776a..ea1dff30e8 100644 --- a/common/transformations/model.py +++ b/common/transformations/model.py @@ -1,7 +1,7 @@ import numpy as np from openpilot.common.transformations.orientation import rot_from_euler -from openpilot.common.transformations.camera import get_view_frame_from_calib_frame, view_frame_from_device_frame +from openpilot.common.transformations.camera import get_view_frame_from_calib_frame, view_frame_from_device_frame, _ar_ox_fisheye # segnet SEGNET_SIZE = (512, 384) @@ -39,6 +39,13 @@ sbigmodel_intrinsics = np.array([ [0.0, sbigmodel_fl, 0.5 * (256 + MEDMODEL_CY)], [0.0, 0.0, 1.0]]) +DM_INPUT_SIZE = (1440, 960) +dmonitoringmodel_fl = _ar_ox_fisheye.focal_length +dmonitoringmodel_intrinsics = np.array([ + [dmonitoringmodel_fl, 0.0, DM_INPUT_SIZE[0]/2], + [0.0, dmonitoringmodel_fl, DM_INPUT_SIZE[1]/2 - (_ar_ox_fisheye.height - DM_INPUT_SIZE[1])/2], + [0.0, 0.0, 1.0]]) + bigmodel_frame_from_calib_frame = np.dot(bigmodel_intrinsics, get_view_frame_from_calib_frame(0, 0, 0, 0)) diff --git a/selfdrive/modeld/dmonitoringmodeld.py b/selfdrive/modeld/dmonitoringmodeld.py index 7f04939c65..a29dc081e1 100755 --- a/selfdrive/modeld/dmonitoringmodeld.py +++ b/selfdrive/modeld/dmonitoringmodeld.py @@ -13,13 +13,13 @@ from cereal.messaging import PubMaster, SubMaster from msgq.visionipc import VisionIpcClient, VisionStreamType, VisionBuf from openpilot.common.swaglog import cloudlog from openpilot.common.realtime import set_realtime_priority +from openpilot.common.transformations.model import dmonitoringmodel_intrinsics +from openpilot.common.transformations.camera import _ar_ox_fisheye, _os_fisheye +from openpilot.selfdrive.modeld.models.commonmodel_pyx import CLContext, MonitoringModelFrame from openpilot.selfdrive.modeld.runners import ModelRunner, Runtime -from openpilot.selfdrive.modeld.models.commonmodel_pyx import CLContext from openpilot.selfdrive.modeld.parse_model_outputs import sigmoid CALIB_LEN = 3 -MODEL_WIDTH = 1440 -MODEL_HEIGHT = 960 FEATURE_LEN = 512 OUTPUT_SIZE = 84 + FEATURE_LEN @@ -62,25 +62,21 @@ class ModelState: def __init__(self, cl_ctx): assert ctypes.sizeof(DMonitoringModelResult) == OUTPUT_SIZE * ctypes.sizeof(ctypes.c_float) + + self.frame = MonitoringModelFrame(cl_ctx) self.output = np.zeros(OUTPUT_SIZE, dtype=np.float32) self.inputs = { - 'input_img': np.zeros(MODEL_HEIGHT * MODEL_WIDTH, dtype=np.uint8), 'calib': np.zeros(CALIB_LEN, dtype=np.float32)} self.model = ModelRunner(MODEL_PATHS, self.output, Runtime.GPU, False, cl_ctx) self.model.addInput("input_img", None) self.model.addInput("calib", self.inputs['calib']) - def run(self, buf:VisionBuf, calib:np.ndarray) -> tuple[np.ndarray, float]: + def run(self, buf:VisionBuf, calib:np.ndarray, transform:np.ndarray) -> tuple[np.ndarray, float]: self.inputs['calib'][:] = calib - v_offset = buf.height - MODEL_HEIGHT - h_offset = (buf.width - MODEL_WIDTH) // 2 - buf_data = buf.data.reshape(-1, buf.stride) - input_data = self.inputs['input_img'].reshape(MODEL_HEIGHT, MODEL_WIDTH) - input_data[:] = buf_data[v_offset:v_offset+MODEL_HEIGHT, h_offset:h_offset+MODEL_WIDTH] + self.model.setInputBuffer("input_img", self.frame.prepare(buf, transform.flatten(), None).view(np.float32)) - self.model.setInputBuffer("input_img", self.inputs['input_img'].view(np.float32)) t1 = time.perf_counter() self.model.execute() t2 = time.perf_counter() @@ -137,18 +133,23 @@ def main(): pm = PubMaster(["driverStateV2"]) calib = np.zeros(CALIB_LEN, dtype=np.float32) + model_transform = None while True: buf = vipc_client.recv() if buf is None: continue + if model_transform is None: + cam = _os_fisheye if buf.width == _os_fisheye.width else _ar_ox_fisheye + model_transform = np.linalg.inv(np.dot(dmonitoringmodel_intrinsics, np.linalg.inv(cam.intrinsics))).astype(np.float32) + sm.update(0) if sm.updated["liveCalibration"]: calib[:] = np.array(sm["liveCalibration"].rpyCalib) t1 = time.perf_counter() - model_output, gpu_execution_time = model.run(buf, calib) + model_output, gpu_execution_time = model.run(buf, calib, model_transform) t2 = time.perf_counter() pm.send("driverStateV2", get_driverstate_packet(model_output, vipc_client.frame_id, vipc_client.timestamp_sof, t2 - t1, gpu_execution_time)) diff --git a/selfdrive/modeld/modeld.py b/selfdrive/modeld/modeld.py index c160f9888c..e0f78c5aa0 100755 --- a/selfdrive/modeld/modeld.py +++ b/selfdrive/modeld/modeld.py @@ -22,7 +22,7 @@ from openpilot.selfdrive.modeld.runners import ModelRunner, Runtime from openpilot.selfdrive.modeld.parse_model_outputs import Parser from openpilot.selfdrive.modeld.fill_model_msg import fill_model_msg, fill_pose_msg, PublishState from openpilot.selfdrive.modeld.constants import ModelConstants -from openpilot.selfdrive.modeld.models.commonmodel_pyx import ModelFrame, CLContext +from openpilot.selfdrive.modeld.models.commonmodel_pyx import DrivingModelFrame, CLContext PROCESS_NAME = "selfdrive.modeld.modeld" SEND_RAW_PRED = os.getenv('SEND_RAW_PRED') @@ -44,16 +44,16 @@ class FrameMeta: self.frame_id, self.timestamp_sof, self.timestamp_eof = vipc.frame_id, vipc.timestamp_sof, vipc.timestamp_eof class ModelState: - frame: ModelFrame - wide_frame: ModelFrame + frame: DrivingModelFrame + wide_frame: DrivingModelFrame inputs: dict[str, np.ndarray] output: np.ndarray prev_desire: np.ndarray # for tracking the rising edge of the pulse model: ModelRunner def __init__(self, context: CLContext): - self.frame = ModelFrame(context) - self.wide_frame = ModelFrame(context) + self.frame = DrivingModelFrame(context) + self.wide_frame = DrivingModelFrame(context) self.prev_desire = np.zeros(ModelConstants.DESIRE_LEN, dtype=np.float32) self.full_features_20Hz = np.zeros((ModelConstants.FULL_HISTORY_BUFFER_LEN, ModelConstants.FEATURE_LEN), dtype=np.float32) self.desire_20Hz = np.zeros((ModelConstants.FULL_HISTORY_BUFFER_LEN + 1, ModelConstants.DESIRE_LEN), dtype=np.float32) diff --git a/selfdrive/modeld/models/commonmodel.cc b/selfdrive/modeld/models/commonmodel.cc index e8a5a7ed52..cd77903680 100644 --- a/selfdrive/modeld/models/commonmodel.cc +++ b/selfdrive/modeld/models/commonmodel.cc @@ -1,36 +1,30 @@ #include "selfdrive/modeld/models/commonmodel.h" -#include #include #include #include "common/clutil.h" -ModelFrame::ModelFrame(cl_device_id device_id, cl_context context) { +DrivingModelFrame::DrivingModelFrame(cl_device_id device_id, cl_context context) : ModelFrame(device_id, context) { input_frames = std::make_unique(buf_size); - - q = CL_CHECK_ERR(clCreateCommandQueue(context, device_id, 0, &err)); - y_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, MODEL_WIDTH * MODEL_HEIGHT, NULL, &err)); - u_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, (MODEL_WIDTH / 2) * (MODEL_HEIGHT / 2), NULL, &err)); - v_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, (MODEL_WIDTH / 2) * (MODEL_HEIGHT / 2), NULL, &err)); + //input_frames_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, buf_size, NULL, &err)); img_buffer_20hz_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, 5*frame_size_bytes, NULL, &err)); region.origin = 4 * frame_size_bytes; region.size = frame_size_bytes; last_img_cl = CL_CHECK_ERR(clCreateSubBuffer(img_buffer_20hz_cl, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err)); - transform_init(&transform, context, device_id); loadyuv_init(&loadyuv, context, device_id, MODEL_WIDTH, MODEL_HEIGHT); + init_transform(device_id, context, MODEL_WIDTH, MODEL_HEIGHT); } -uint8_t* ModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3 &projection, cl_mem *output) { - transform_queue(&this->transform, q, - yuv_cl, frame_width, frame_height, frame_stride, frame_uv_offset, - y_cl, u_cl, v_cl, MODEL_WIDTH, MODEL_HEIGHT, projection); +uint8_t* DrivingModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection, cl_mem* output) { + run_transform(yuv_cl, MODEL_WIDTH, MODEL_HEIGHT, frame_width, frame_height, frame_stride, frame_uv_offset, projection); for (int i = 0; i < 4; i++) { CL_CHECK(clEnqueueCopyBuffer(q, img_buffer_20hz_cl, img_buffer_20hz_cl, (i+1)*frame_size_bytes, i*frame_size_bytes, frame_size_bytes, 0, nullptr, nullptr)); } loadyuv_queue(&loadyuv, q, y_cl, u_cl, v_cl, last_img_cl); + if (output == NULL) { CL_CHECK(clEnqueueReadBuffer(q, img_buffer_20hz_cl, CL_TRUE, 0, frame_size_bytes, &input_frames[0], 0, nullptr, nullptr)); CL_CHECK(clEnqueueReadBuffer(q, last_img_cl, CL_TRUE, 0, frame_size_bytes, &input_frames[MODEL_FRAME_SIZE], 0, nullptr, nullptr)); @@ -46,13 +40,30 @@ uint8_t* ModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_height, i } } -ModelFrame::~ModelFrame() { - transform_destroy(&transform); +DrivingModelFrame::~DrivingModelFrame() { + deinit_transform(); loadyuv_destroy(&loadyuv); CL_CHECK(clReleaseMemObject(img_buffer_20hz_cl)); CL_CHECK(clReleaseMemObject(last_img_cl)); - CL_CHECK(clReleaseMemObject(v_cl)); - CL_CHECK(clReleaseMemObject(u_cl)); - CL_CHECK(clReleaseMemObject(y_cl)); CL_CHECK(clReleaseCommandQueue(q)); -} \ No newline at end of file +} + + +MonitoringModelFrame::MonitoringModelFrame(cl_device_id device_id, cl_context context) : ModelFrame(device_id, context) { + input_frames = std::make_unique(buf_size); + //input_frame_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, buf_size, NULL, &err)); + + init_transform(device_id, context, MODEL_WIDTH, MODEL_HEIGHT); +} +uint8_t* MonitoringModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection, cl_mem* output) { + run_transform(yuv_cl, MODEL_WIDTH, MODEL_HEIGHT, frame_width, frame_height, frame_stride, frame_uv_offset, projection); + CL_CHECK(clEnqueueReadBuffer(q, y_cl, CL_TRUE, 0, MODEL_FRAME_SIZE * sizeof(uint8_t), input_frames.get(), 0, nullptr, nullptr)); + clFinish(q); + //return &y_cl; + return input_frames.get(); +} + +MonitoringModelFrame::~MonitoringModelFrame() { + deinit_transform(); + CL_CHECK(clReleaseCommandQueue(q)); +} diff --git a/selfdrive/modeld/models/commonmodel.h b/selfdrive/modeld/models/commonmodel.h index 1c7360f159..e55f78e248 100644 --- a/selfdrive/modeld/models/commonmodel.h +++ b/selfdrive/modeld/models/commonmodel.h @@ -2,6 +2,7 @@ #include #include +#include #include @@ -18,9 +19,56 @@ class ModelFrame { public: - ModelFrame(cl_device_id device_id, cl_context context); - ~ModelFrame(); - uint8_t* prepare(cl_mem yuv_cl, int width, int height, int frame_stride, int frame_uv_offset, const mat3& transform, cl_mem *output); + ModelFrame(cl_device_id device_id, cl_context context) { + q = CL_CHECK_ERR(clCreateCommandQueue(context, device_id, 0, &err)); + } + virtual ~ModelFrame() {} + virtual uint8_t* prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection, cl_mem* output) { return NULL; } + /* + uint8_t* buffer_from_cl(cl_mem *in_frames, int buffer_size) { + CL_CHECK(clEnqueueReadBuffer(q, *in_frames, CL_TRUE, 0, buffer_size, input_frames.get(), 0, nullptr, nullptr)); + clFinish(q); + return &input_frames[0]; + } + */ + + int MODEL_WIDTH; + int MODEL_HEIGHT; + int MODEL_FRAME_SIZE; + int buf_size; + +protected: + cl_mem y_cl, u_cl, v_cl; + Transform transform; + cl_command_queue q; + std::unique_ptr input_frames; + + void init_transform(cl_device_id device_id, cl_context context, int model_width, int model_height) { + y_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, model_width * model_height, NULL, &err)); + u_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, (model_width / 2) * (model_height / 2), NULL, &err)); + v_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, (model_width / 2) * (model_height / 2), NULL, &err)); + transform_init(&transform, context, device_id); + } + + void deinit_transform() { + transform_destroy(&transform); + CL_CHECK(clReleaseMemObject(v_cl)); + CL_CHECK(clReleaseMemObject(u_cl)); + CL_CHECK(clReleaseMemObject(y_cl)); + } + + void run_transform(cl_mem yuv_cl, int model_width, int model_height, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection) { + transform_queue(&transform, q, + yuv_cl, frame_width, frame_height, frame_stride, frame_uv_offset, + y_cl, u_cl, v_cl, model_width, model_height, projection); + } +}; + +class DrivingModelFrame : public ModelFrame { +public: + DrivingModelFrame(cl_device_id device_id, cl_context context); + ~DrivingModelFrame(); + uint8_t* prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection, cl_mem* output); const int MODEL_WIDTH = 512; const int MODEL_HEIGHT = 256; @@ -29,10 +77,22 @@ public: const size_t frame_size_bytes = MODEL_FRAME_SIZE * sizeof(uint8_t); private: - Transform transform; LoadYUVState loadyuv; - cl_command_queue q; - cl_mem y_cl, u_cl, v_cl, img_buffer_20hz_cl, last_img_cl; + cl_mem img_buffer_20hz_cl, last_img_cl;//, input_frames_cl; cl_buffer_region region; - std::unique_ptr input_frames; -}; \ No newline at end of file +}; + +class MonitoringModelFrame : public ModelFrame { +public: + MonitoringModelFrame(cl_device_id device_id, cl_context context); + ~MonitoringModelFrame(); + uint8_t* prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection, cl_mem* output); + + const int MODEL_WIDTH = 1440; + const int MODEL_HEIGHT = 960; + const int MODEL_FRAME_SIZE = MODEL_WIDTH * MODEL_HEIGHT; + const int buf_size = MODEL_FRAME_SIZE; + +private: + // cl_mem input_frame_cl; +}; diff --git a/selfdrive/modeld/models/commonmodel.pxd b/selfdrive/modeld/models/commonmodel.pxd index 3348af3f17..61acb2bb2f 100644 --- a/selfdrive/modeld/models/commonmodel.pxd +++ b/selfdrive/modeld/models/commonmodel.pxd @@ -14,5 +14,13 @@ cdef extern from "common/clutil.h": cdef extern from "selfdrive/modeld/models/commonmodel.h": cppclass ModelFrame: int buf_size - ModelFrame(cl_device_id, cl_context) + # unsigned char * buffer_from_cl(cl_mem*, int); unsigned char * prepare(cl_mem, int, int, int, int, mat3, cl_mem*) + + cppclass DrivingModelFrame: + int buf_size + DrivingModelFrame(cl_device_id, cl_context) + + cppclass MonitoringModelFrame: + int buf_size + MonitoringModelFrame(cl_device_id, cl_context) diff --git a/selfdrive/modeld/models/commonmodel_pyx.pyx b/selfdrive/modeld/models/commonmodel_pyx.pyx index 99f9c5dc17..c7ea5ecac2 100644 --- a/selfdrive/modeld/models/commonmodel_pyx.pyx +++ b/selfdrive/modeld/models/commonmodel_pyx.pyx @@ -4,11 +4,12 @@ import numpy as np cimport numpy as cnp from libc.string cimport memcpy +from libc.stdint cimport uintptr_t from msgq.visionipc.visionipc cimport cl_mem from msgq.visionipc.visionipc_pyx cimport VisionBuf, CLContext as BaseCLContext from .commonmodel cimport CL_DEVICE_TYPE_DEFAULT, cl_get_device_id, cl_create_context -from .commonmodel cimport mat3, ModelFrame as cppModelFrame +from .commonmodel cimport mat3, ModelFrame as cppModelFrame, DrivingModelFrame as cppDrivingModelFrame, MonitoringModelFrame as cppMonitoringModelFrame cdef class CLContext(BaseCLContext): @@ -23,11 +24,17 @@ cdef class CLMem: mem.mem = cmem return mem + @property + def mem_address(self): + return (self.mem) + +def cl_from_visionbuf(VisionBuf buf): + return CLMem.create(&buf.buf.buf_cl) + + cdef class ModelFrame: cdef cppModelFrame * frame - - def __cinit__(self, CLContext context): - self.frame = new cppModelFrame(context.device_id, context.context) + cdef int buf_size def __dealloc__(self): del self.frame @@ -42,4 +49,28 @@ cdef class ModelFrame: data = self.frame.prepare(buf.buf.buf_cl, buf.width, buf.height, buf.stride, buf.uv_offset, cprojection, output.mem) if not data: return None - return np.asarray( data) + + return np.asarray( data) + # return CLMem.create(data) + + # def buffer_from_cl(self, CLMem in_frames): + # cdef unsigned char * data2 + # data2 = self.frame.buffer_from_cl(in_frames.mem, self.buf_size) + # return np.asarray( data2) + + +cdef class DrivingModelFrame(ModelFrame): + cdef cppDrivingModelFrame * _frame + + def __cinit__(self, CLContext context): + self._frame = new cppDrivingModelFrame(context.device_id, context.context) + self.frame = (self._frame) + self.buf_size = self._frame.buf_size + +cdef class MonitoringModelFrame(ModelFrame): + cdef cppMonitoringModelFrame * _frame + + def __cinit__(self, CLContext context): + self._frame = new cppMonitoringModelFrame(context.device_id, context.context) + self.frame = (self._frame) + self.buf_size = self._frame.buf_size diff --git a/system/hardware/tici/tests/test_power_draw.py b/system/hardware/tici/tests/test_power_draw.py index 8598b2faa2..22fd70e1a5 100644 --- a/system/hardware/tici/tests/test_power_draw.py +++ b/system/hardware/tici/tests/test_power_draw.py @@ -33,7 +33,7 @@ class Proc: PROCS = [ Proc(['camerad'], 1.75, msgs=['roadCameraState', 'wideRoadCameraState', 'driverCameraState']), Proc(['modeld'], 1.12, atol=0.2, msgs=['modelV2']), - Proc(['dmonitoringmodeld'], 0.5, msgs=['driverStateV2']), + Proc(['dmonitoringmodeld'], 0.65, msgs=['driverStateV2']), Proc(['encoderd'], 0.23, msgs=[]), ]