Revert Tinygrad (#34243)

* Revert "dmonitoringmodeld: use cl transform (#34235)"

This reverts commit 684b0b9d4d.

* Revert "load model before calling convert_fp16_to_fp32"

This reverts commit 31606a7d15.

* Revert "bump tinygrad"

This reverts commit 44f58ff758.

* Revert "Tinygrad runner (#34171)"

This reverts commit 7b5a4fbb03.

* Allow init buffer

* typo
Harald Schäfer
parent 4c27878f67 commit f6885dcbec
39 changed files (changed line counts in parentheses):
  1. common/transformations/model.py (9)
  2. pyproject.toml (3)
  3. release/release_files.py (2)
  4. selfdrive/modeld/SConscript (43)
  5. selfdrive/modeld/dmonitoringmodeld (6)
  6. selfdrive/modeld/dmonitoringmodeld.py (75)
  7. selfdrive/modeld/modeld.py (86)
  8. selfdrive/modeld/models/commonmodel.cc (65)
  9. selfdrive/modeld/models/commonmodel.h (74)
  10. selfdrive/modeld/models/commonmodel.pxd (12)
  11. selfdrive/modeld/models/commonmodel_pyx.pyx (51)
  12. selfdrive/modeld/runners/__init__.py (27)
  13. selfdrive/modeld/runners/onnxmodel.py (98)
  14. selfdrive/modeld/runners/ort_helpers.py (37)
  15. selfdrive/modeld/runners/run.h (4)
  16. selfdrive/modeld/runners/runmodel.h (49)
  17. selfdrive/modeld/runners/runmodel.pxd (14)
  18. selfdrive/modeld/runners/runmodel_pyx.pxd (6)
  19. selfdrive/modeld/runners/runmodel_pyx.pyx (37)
  20. selfdrive/modeld/runners/snpemodel.cc (116)
  21. selfdrive/modeld/runners/snpemodel.h (52)
  22. selfdrive/modeld/runners/snpemodel.pxd (9)
  23. selfdrive/modeld/runners/snpemodel_pyx.pyx (17)
  24. selfdrive/modeld/runners/thneedmodel.cc (58)
  25. selfdrive/modeld/runners/thneedmodel.h (17)
  26. selfdrive/modeld/runners/thneedmodel.pxd (9)
  27. selfdrive/modeld/runners/thneedmodel_pyx.pyx (14)
  28. selfdrive/modeld/runners/tinygrad_helpers.py (8)
  29. selfdrive/modeld/thneed/README (8)
  30. selfdrive/modeld/thneed/__init__.py (0)
  31. selfdrive/modeld/thneed/serialize.cc (154)
  32. selfdrive/modeld/thneed/thneed.h (133)
  33. selfdrive/modeld/thneed/thneed_common.cc (216)
  34. selfdrive/modeld/thneed/thneed_pc.cc (32)
  35. selfdrive/modeld/thneed/thneed_qcom2.cc (258)
  36. selfdrive/test/test_onroad.py (13)
  37. system/hardware/tici/tests/test_power_draw.py (2)
  38. tinygrad_repo (2)
  39. uv.lock (99)

common/transformations/model.py
@@ -1,7 +1,7 @@
 import numpy as np
 
 from openpilot.common.transformations.orientation import rot_from_euler
-from openpilot.common.transformations.camera import get_view_frame_from_calib_frame, view_frame_from_device_frame, _ar_ox_fisheye
+from openpilot.common.transformations.camera import get_view_frame_from_calib_frame, view_frame_from_device_frame
 
 # segnet
 SEGNET_SIZE = (512, 384)
@@ -39,13 +39,6 @@ sbigmodel_intrinsics = np.array([
   [0.0, sbigmodel_fl, 0.5 * (256 + MEDMODEL_CY)],
   [0.0, 0.0, 1.0]])

-DM_INPUT_SIZE = (1440, 960)
-dmonitoringmodel_fl = _ar_ox_fisheye.focal_length
-dmonitoringmodel_intrinsics = np.array([
-  [dmonitoringmodel_fl, 0.0, DM_INPUT_SIZE[0]/2],
-  [0.0, dmonitoringmodel_fl, DM_INPUT_SIZE[1]/2 - (_ar_ox_fisheye.height - DM_INPUT_SIZE[1])/2],
-  [0.0, 0.0, 1.0]])
-
 bigmodel_frame_from_calib_frame = np.dot(bigmodel_intrinsics,
                                          get_view_frame_from_calib_frame(0, 0, 0, 0))

pyproject.toml
@@ -42,7 +42,8 @@ dependencies = [
   # modeld
   "onnx >= 1.14.0",
-  "onnxruntime >=1.16.3",
+  "onnxruntime >=1.16.3; platform_system == 'Linux' and platform_machine == 'aarch64'",
+  "onnxruntime-gpu >=1.16.3; platform_system == 'Linux' and platform_machine == 'x86_64'",
 
   # logging
   "pyzmq",

release/release_files.py
@@ -54,7 +54,7 @@ whitelist = [
   "tools/joystick/",
   "tools/longitudinal_maneuvers/",
 
-  "tinygrad_repo/examples/openpilot/compile3.py",
+  "tinygrad_repo/openpilot/compile2.py",
   "tinygrad_repo/extra/onnx.py",
   "tinygrad_repo/extra/onnx_ops.py",
   "tinygrad_repo/extra/thneed.py",

selfdrive/modeld/SConscript
@@ -13,6 +13,20 @@ common_src = [
   "transforms/transform.cc",
 ]
 
+thneed_src_common = [
+  "thneed/thneed_common.cc",
+  "thneed/serialize.cc",
+]
+
+thneed_src_qcom = thneed_src_common + ["thneed/thneed_qcom2.cc"]
+thneed_src_pc = thneed_src_common + ["thneed/thneed_pc.cc"]
+thneed_src = thneed_src_qcom if arch == "larch64" else thneed_src_pc
+
+# SNPE except on Mac and ARM Linux
+snpe_lib = []
+if arch != "Darwin" and arch != "aarch64":
+  common_src += ['runners/snpemodel.cc']
+  snpe_lib += ['SNPE']
+
 # OpenCL is a framework on Mac
 if arch == "Darwin":
@@ -31,7 +45,11 @@ snpe_rpath_pc = f"{Dir('#').abspath}/third_party/snpe/x86_64-linux-clang"
 snpe_rpath = lenvCython['RPATH'] + [snpe_rpath_qcom if arch == "larch64" else snpe_rpath_pc]
 
 cython_libs = envCython["LIBS"] + libs
+snpemodel_lib = lenv.Library('snpemodel', ['runners/snpemodel.cc'])
 commonmodel_lib = lenv.Library('commonmodel', common_src)
 
+lenvCython.Program('runners/runmodel_pyx.so', 'runners/runmodel_pyx.pyx', LIBS=cython_libs, FRAMEWORKS=frameworks)
+lenvCython.Program('runners/snpemodel_pyx.so', 'runners/snpemodel_pyx.pyx', LIBS=[snpemodel_lib, snpe_lib, *cython_libs], FRAMEWORKS=frameworks, RPATH=snpe_rpath)
 lenvCython.Program('models/commonmodel_pyx.so', 'models/commonmodel_pyx.pyx', LIBS=[commonmodel_lib, *cython_libs], FRAMEWORKS=frameworks)
 
 tinygrad_files = ["#"+x for x in glob.glob(env.Dir("#tinygrad_repo").relpath + "/**", recursive=True, root_dir=env.Dir("#").abspath)]
@@ -41,17 +59,20 @@ fn = File("models/supercombo").abspath
 cmd = f'python3 {Dir("#selfdrive/modeld").abspath}/get_model_metadata.py {fn}.onnx'
 lenv.Command(fn + "_metadata.pkl", [fn + ".onnx"] + tinygrad_files, cmd)
 
-# Compile tinygrad model
-# TODO this is all super hacky
-pythonpath_string = 'PYTHONPATH="${PYTHONPATH}:' + env.Dir("#tinygrad_repo").abspath + '"'
-if arch == 'larch64':
-  device_string = 'QCOM=1'
-else:
-  device_string = 'CLANG=1 IMAGE=0'
-
-for model_name in ['supercombo', 'dmonitoring_model']:
-  fn = File(f"models/{model_name}").abspath
-  cmd = f'{pythonpath_string} {device_string} python3 {Dir("#tinygrad_repo").abspath}/examples/openpilot/compile3.py {fn}.onnx {fn}_tinygrad.pkl'
-  lenv.Command(fn + "_tinygrad.pkl", [fn + ".onnx"] + tinygrad_files, cmd)
+# Build thneed model
+if arch == "larch64" or GetOption('pc_thneed'):
+  tinygrad_opts = []
+  if not GetOption('pc_thneed'):
+    # use FLOAT16 on device for speed + don't cache the CL kernels for space
+    tinygrad_opts += ["FLOAT16=1", "PYOPENCL_NO_CACHE=1"]
+
+  cmd = f"cd {Dir('#').abspath}/tinygrad_repo && " + ' '.join(tinygrad_opts) + f" python3 openpilot/compile2.py {fn}.onnx {fn}.thneed"
+  lenv.Command(fn + ".thneed", [fn + ".onnx"] + tinygrad_files, cmd)
+
+  fn_dm = File("models/dmonitoring_model").abspath
+  cmd = f"cd {Dir('#').abspath}/tinygrad_repo && " + ' '.join(tinygrad_opts) + f" python3 openpilot/compile2.py {fn_dm}.onnx {fn_dm}.thneed"
+  lenv.Command(fn_dm + ".thneed", [fn_dm + ".onnx"] + tinygrad_files, cmd)
+
+thneed_lib = env.SharedLibrary('thneed', thneed_src, LIBS=[gpucommon, common, 'OpenCL', 'dl'])
+thneedmodel_lib = env.Library('thneedmodel', ['runners/thneedmodel.cc'])
+lenvCython.Program('runners/thneedmodel_pyx.so', 'runners/thneedmodel_pyx.pyx', LIBS=envCython["LIBS"]+[thneedmodel_lib, thneed_lib, gpucommon, common, 'dl', 'OpenCL'])

selfdrive/modeld/dmonitoringmodeld
@@ -1,4 +1,10 @@
 #!/usr/bin/env bash
 
 DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null && pwd)"
+cd "$DIR/../../"
+
+if [ -f "$DIR/libthneed.so" ]; then
+  export LD_PRELOAD="$DIR/libthneed.so"
+fi
 
 exec "$DIR/dmonitoringmodeld.py" "$@"

selfdrive/modeld/dmonitoringmodeld.py
@@ -1,18 +1,8 @@
 #!/usr/bin/env python3
 import os
-from openpilot.system.hardware import TICI
-## TODO this is hack
-if TICI:
-  from tinygrad.tensor import Tensor
-  from tinygrad.dtype import dtypes
-  from openpilot.selfdrive.modeld.runners.tinygrad_helpers import qcom_tensor_from_opencl_address
-  os.environ['QCOM'] = '1'
-else:
-  from openpilot.selfdrive.modeld.runners.ort_helpers import make_onnx_cpu_runner
 import gc
 import math
 import time
-import pickle
 import ctypes
 import numpy as np
 from pathlib import Path
@@ -23,20 +13,21 @@ from cereal.messaging import PubMaster, SubMaster
 from msgq.visionipc import VisionIpcClient, VisionStreamType, VisionBuf
 from openpilot.common.swaglog import cloudlog
 from openpilot.common.realtime import set_realtime_priority
-from openpilot.common.transformations.model import dmonitoringmodel_intrinsics, DM_INPUT_SIZE
-from openpilot.common.transformations.camera import _ar_ox_fisheye, _os_fisheye
-from openpilot.selfdrive.modeld.models.commonmodel_pyx import CLContext, MonitoringModelFrame
+from openpilot.selfdrive.modeld.runners import ModelRunner, Runtime
+from openpilot.selfdrive.modeld.models.commonmodel_pyx import CLContext
 from openpilot.selfdrive.modeld.parse_model_outputs import sigmoid
 
-MODEL_WIDTH, MODEL_HEIGHT = DM_INPUT_SIZE
 CALIB_LEN = 3
+MODEL_WIDTH = 1440
+MODEL_HEIGHT = 960
 FEATURE_LEN = 512
 OUTPUT_SIZE = 84 + FEATURE_LEN
 
 PROCESS_NAME = "selfdrive.modeld.dmonitoringmodeld"
 SEND_RAW_PRED = os.getenv('SEND_RAW_PRED')
-MODEL_PATH = Path(__file__).parent / 'models/dmonitoring_model.onnx'
-MODEL_PKL_PATH = Path(__file__).parent / 'models/dmonitoring_model_tinygrad.pkl'
+MODEL_PATHS = {
+  ModelRunner.THNEED: Path(__file__).parent / 'models/dmonitoring_model.thneed',
+  ModelRunner.ONNX: Path(__file__).parent / 'models/dmonitoring_model.onnx'}
 
 class DriverStateResult(ctypes.Structure):
   _fields_ = [
@@ -67,42 +58,33 @@ class DMonitoringModelResult(ctypes.Structure):
 class ModelState:
   inputs: dict[str, np.ndarray]
   output: np.ndarray
+  model: ModelRunner
 
   def __init__(self, cl_ctx):
     assert ctypes.sizeof(DMonitoringModelResult) == OUTPUT_SIZE * ctypes.sizeof(ctypes.c_float)
-    self.frame = MonitoringModelFrame(cl_ctx)
-    self.numpy_inputs = {
-      'calib': np.zeros((1, CALIB_LEN), dtype=np.float32),
-    }
+    self.output = np.zeros(OUTPUT_SIZE, dtype=np.float32)
+    self.inputs = {
+      'input_img': np.zeros(MODEL_HEIGHT * MODEL_WIDTH, dtype=np.uint8),
+      'calib': np.zeros(CALIB_LEN, dtype=np.float32)}
 
-    if TICI:
-      self.tensor_inputs = {k: Tensor(v, device='NPY').realize() for k,v in self.numpy_inputs.items()}
-      with open(MODEL_PKL_PATH, "rb") as f:
-        self.model_run = pickle.load(f)
-    else:
-      self.onnx_cpu_runner = make_onnx_cpu_runner(MODEL_PATH)
+    self.model = ModelRunner(MODEL_PATHS, self.output, Runtime.GPU, False, cl_ctx)
+    self.model.addInput("input_img", None)
+    self.model.addInput("calib", self.inputs['calib'])
 
-  def run(self, buf:VisionBuf, calib:np.ndarray, transform:np.ndarray) -> tuple[np.ndarray, float]:
-    self.numpy_inputs['calib'][0,:] = calib
+  def run(self, buf:VisionBuf, calib:np.ndarray) -> tuple[np.ndarray, float]:
+    self.inputs['calib'][:] = calib
+
+    v_offset = buf.height - MODEL_HEIGHT
+    h_offset = (buf.width - MODEL_WIDTH) // 2
+    buf_data = buf.data.reshape(-1, buf.stride)
+    input_data = self.inputs['input_img'].reshape(MODEL_HEIGHT, MODEL_WIDTH)
+    input_data[:] = buf_data[v_offset:v_offset+MODEL_HEIGHT, h_offset:h_offset+MODEL_WIDTH]
+
+    self.model.setInputBuffer("input_img", self.inputs['input_img'].view(np.float32))
 
     t1 = time.perf_counter()
-
-    input_img_cl = self.frame.prepare(buf, transform.flatten())
-    if TICI:
-      # The imgs tensors are backed by opencl memory, only need init once
-      if 'input_img' not in self.tensor_inputs:
-        self.tensor_inputs['input_img'] = qcom_tensor_from_opencl_address(input_img_cl.mem_address, (1, MODEL_WIDTH*MODEL_HEIGHT), dtype=dtypes.uint8)
-    else:
-      self.numpy_inputs['input_img'] = self.frame.buffer_from_cl(input_img_cl).reshape((1, MODEL_WIDTH*MODEL_HEIGHT))
-
-    if TICI:
-      output = self.model_run(**self.tensor_inputs).numpy().flatten()
-    else:
-      output = self.onnx_cpu_runner.run(None, self.numpy_inputs)[0].flatten()
-
+    self.model.execute()
     t2 = time.perf_counter()
-    return output, t2 - t1
+    return self.output, t2 - t1
 
 def fill_driver_state(msg, ds_result: DriverStateResult):
@@ -155,23 +137,18 @@ def main():
   pm = PubMaster(["driverStateV2"])
 
   calib = np.zeros(CALIB_LEN, dtype=np.float32)
-  model_transform = None
 
   while True:
     buf = vipc_client.recv()
     if buf is None:
      continue
 
-    if model_transform is None:
-      cam = _os_fisheye if buf.width == _os_fisheye.width else _ar_ox_fisheye
-      model_transform = np.linalg.inv(np.dot(dmonitoringmodel_intrinsics, np.linalg.inv(cam.intrinsics))).astype(np.float32)
-
     sm.update(0)
    if sm.updated["liveCalibration"]:
      calib[:] = np.array(sm["liveCalibration"].rpyCalib)
 
    t1 = time.perf_counter()
-    model_output, gpu_execution_time = model.run(buf, calib, model_transform)
+    model_output, gpu_execution_time = model.run(buf, calib)
    t2 = time.perf_counter()
    pm.send("driverStateV2", get_driverstate_packet(model_output, vipc_client.frame_id, vipc_client.timestamp_sof, t2 - t1, gpu_execution_time))

selfdrive/modeld/modeld.py
@@ -1,14 +1,5 @@
 #!/usr/bin/env python3
 import os
-from openpilot.system.hardware import TICI
-## TODO this is hack
-if TICI:
-  from tinygrad.tensor import Tensor
-  from tinygrad.dtype import dtypes
-  from openpilot.selfdrive.modeld.runners.tinygrad_helpers import qcom_tensor_from_opencl_address
-  os.environ['QCOM'] = '1'
-else:
-  from openpilot.selfdrive.modeld.runners.ort_helpers import make_onnx_cpu_runner
 import time
 import pickle
 import numpy as np
@@ -27,21 +18,21 @@ from openpilot.common.transformations.camera import DEVICE_CAMERAS
 from openpilot.common.transformations.model import get_warp_matrix
 from openpilot.system import sentry
 from openpilot.selfdrive.controls.lib.desire_helper import DesireHelper
+from openpilot.selfdrive.modeld.runners import ModelRunner, Runtime
 from openpilot.selfdrive.modeld.parse_model_outputs import Parser
 from openpilot.selfdrive.modeld.fill_model_msg import fill_model_msg, fill_pose_msg, PublishState
 from openpilot.selfdrive.modeld.constants import ModelConstants
-from openpilot.selfdrive.modeld.models.commonmodel_pyx import DrivingModelFrame, CLContext
+from openpilot.selfdrive.modeld.models.commonmodel_pyx import ModelFrame, CLContext
 
 PROCESS_NAME = "selfdrive.modeld.modeld"
 SEND_RAW_PRED = os.getenv('SEND_RAW_PRED')
 
-MODEL_PATH = Path(__file__).parent / 'models/supercombo.onnx'
-MODEL_PKL_PATH = Path(__file__).parent / 'models/supercombo_tinygrad.pkl'
+MODEL_PATHS = {
+  ModelRunner.THNEED: Path(__file__).parent / 'models/supercombo.thneed',
+  ModelRunner.ONNX: Path(__file__).parent / 'models/supercombo.onnx'}
 METADATA_PATH = Path(__file__).parent / 'models/supercombo_metadata.pkl'
 
-# TODO: should not hardcoded
-IMG_INPUT_SHAPE = (1, 12, 128, 256)
-
 class FrameMeta:
   frame_id: int = 0
@@ -53,27 +44,28 @@ class FrameMeta:
     self.frame_id, self.timestamp_sof, self.timestamp_eof = vipc.frame_id, vipc.timestamp_sof, vipc.timestamp_eof
 
 class ModelState:
-  frame: DrivingModelFrame
-  wide_frame: DrivingModelFrame
+  frame: ModelFrame
+  wide_frame: ModelFrame
   inputs: dict[str, np.ndarray]
   output: np.ndarray
   prev_desire: np.ndarray  # for tracking the rising edge of the pulse
+  model: ModelRunner
 
   def __init__(self, context: CLContext):
-    self.frame = DrivingModelFrame(context)
-    self.wide_frame = DrivingModelFrame(context)
+    self.frame = ModelFrame(context)
+    self.wide_frame = ModelFrame(context)
    self.prev_desire = np.zeros(ModelConstants.DESIRE_LEN, dtype=np.float32)
    self.full_features_20Hz = np.zeros((ModelConstants.FULL_HISTORY_BUFFER_LEN, ModelConstants.FEATURE_LEN), dtype=np.float32)
    self.desire_20Hz = np.zeros((ModelConstants.FULL_HISTORY_BUFFER_LEN + 1, ModelConstants.DESIRE_LEN), dtype=np.float32)
    self.prev_desired_curv_20hz = np.zeros((ModelConstants.FULL_HISTORY_BUFFER_LEN + 1, ModelConstants.PREV_DESIRED_CURV_LEN), dtype=np.float32)
 
    # img buffers are managed in openCL transform code
-    self.numpy_inputs = {
-      'desire': np.zeros((1, (ModelConstants.HISTORY_BUFFER_LEN+1), ModelConstants.DESIRE_LEN), dtype=np.float32),
-      'traffic_convention': np.zeros((1, ModelConstants.TRAFFIC_CONVENTION_LEN), dtype=np.float32),
-      'lateral_control_params': np.zeros((1, ModelConstants.LATERAL_CONTROL_PARAMS_LEN), dtype=np.float32),
-      'prev_desired_curv': np.zeros((1,(ModelConstants.HISTORY_BUFFER_LEN+1), ModelConstants.PREV_DESIRED_CURV_LEN), dtype=np.float32),
-      'features_buffer': np.zeros((1, ModelConstants.HISTORY_BUFFER_LEN, ModelConstants.FEATURE_LEN), dtype=np.float32),
+    self.inputs = {
+      'desire': np.zeros(ModelConstants.DESIRE_LEN * (ModelConstants.HISTORY_BUFFER_LEN+1), dtype=np.float32),
+      'traffic_convention': np.zeros(ModelConstants.TRAFFIC_CONVENTION_LEN, dtype=np.float32),
+      'lateral_control_params': np.zeros(ModelConstants.LATERAL_CONTROL_PARAMS_LEN, dtype=np.float32),
+      'prev_desired_curv': np.zeros(ModelConstants.PREV_DESIRED_CURV_LEN * (ModelConstants.HISTORY_BUFFER_LEN+1), dtype=np.float32),
+      'features_buffer': np.zeros(ModelConstants.HISTORY_BUFFER_LEN * ModelConstants.FEATURE_LEN, dtype=np.float32),
    }
 
    with open(METADATA_PATH, 'rb') as f:
@@ -84,12 +76,11 @@ class ModelState:
    self.output = np.zeros(net_output_size, dtype=np.float32)
    self.parser = Parser()
 
-    if TICI:
-      self.tensor_inputs = {k: Tensor(v, device='NPY').realize() for k,v in self.numpy_inputs.items()}
-      with open(MODEL_PKL_PATH, "rb") as f:
-        self.model_run = pickle.load(f)
-    else:
-      self.onnx_cpu_runner = make_onnx_cpu_runner(MODEL_PATH)
+    self.model = ModelRunner(MODEL_PATHS, self.output, Runtime.GPU, False, context)
+    self.model.addInput("input_imgs", None)
+    self.model.addInput("big_input_imgs", None)
+    for k,v in self.inputs.items():
+      self.model.addInput(k, v)
 
  def slice_outputs(self, model_outputs: np.ndarray) -> dict[str, np.ndarray]:
    parsed_model_outputs = {k: model_outputs[np.newaxis, v] for k,v in self.output_slices.items()}
@@ -106,30 +97,18 @@ class ModelState:
    self.desire_20Hz[:-1] = self.desire_20Hz[1:]
    self.desire_20Hz[-1] = new_desire
-    self.numpy_inputs['desire'][:] = self.desire_20Hz.reshape((1,25,4,-1)).max(axis=2)
+    self.inputs['desire'][:] = self.desire_20Hz.reshape((25,4,-1)).max(axis=1).flatten()
 
-    self.numpy_inputs['traffic_convention'][:] = inputs['traffic_convention']
-    self.numpy_inputs['lateral_control_params'][:] = inputs['lateral_control_params']
+    self.inputs['traffic_convention'][:] = inputs['traffic_convention']
+    self.inputs['lateral_control_params'][:] = inputs['lateral_control_params']
 
-    input_imgs_cl = self.frame.prepare(buf, transform.flatten())
-    big_input_imgs_cl = self.wide_frame.prepare(wbuf, transform_wide.flatten())
-
-    if TICI:
-      # The imgs tensors are backed by opencl memory, only need init once
-      if 'input_imgs' not in self.tensor_inputs:
-        self.tensor_inputs['input_imgs'] = qcom_tensor_from_opencl_address(input_imgs_cl.mem_address, IMG_INPUT_SHAPE, dtype=dtypes.uint8)
-        self.tensor_inputs['big_input_imgs'] = qcom_tensor_from_opencl_address(big_input_imgs_cl.mem_address, IMG_INPUT_SHAPE, dtype=dtypes.uint8)
-    else:
-      self.numpy_inputs['input_imgs'] = self.frame.buffer_from_cl(input_imgs_cl).reshape(IMG_INPUT_SHAPE)
-      self.numpy_inputs['big_input_imgs'] = self.wide_frame.buffer_from_cl(big_input_imgs_cl).reshape(IMG_INPUT_SHAPE)
+    self.model.setInputBuffer("input_imgs", self.frame.prepare(buf, transform.flatten(), self.model.getCLBuffer("input_imgs")))
+    self.model.setInputBuffer("big_input_imgs", self.wide_frame.prepare(wbuf, transform_wide.flatten(), self.model.getCLBuffer("big_input_imgs")))
 
    if prepare_only:
      return None
 
-    if TICI:
-      self.output = self.model_run(**self.tensor_inputs).numpy().flatten()
-    else:
-      self.output = self.onnx_cpu_runner.run(None, self.numpy_inputs)[0].flatten()
-
+    self.model.execute()
    outputs = self.parser.parse_outputs(self.slice_outputs(self.output))
 
    self.full_features_20Hz[:-1] = self.full_features_20Hz[1:]
@@ -139,9 +118,9 @@ class ModelState:
    self.prev_desired_curv_20hz[-1] = outputs['desired_curvature'][0, :]
 
    idxs = np.arange(-4,-100,-4)[::-1]
-    self.numpy_inputs['features_buffer'][:] = self.full_features_20Hz[idxs]
+    self.inputs['features_buffer'][:] = self.full_features_20Hz[idxs].flatten()
    # TODO model only uses last value now, once that changes we need to input strided action history buffer
-    self.numpy_inputs['prev_desired_curv'][-ModelConstants.PREV_DESIRED_CURV_LEN:] = 0. * self.prev_desired_curv_20hz[-4, :]
+    self.inputs['prev_desired_curv'][-ModelConstants.PREV_DESIRED_CURV_LEN:] = 0. * self.prev_desired_curv_20hz[-4, :]
    return outputs
@@ -313,6 +292,7 @@ def main(demo=False):
      pm.send('modelV2', modelv2_send)
      pm.send('drivingModelData', drivingdata_send)
      pm.send('cameraOdometry', posenet_send)
+    last_vipc_frame_id = meta_main.frame_id

selfdrive/modeld/models/commonmodel.cc
@@ -1,61 +1,58 @@
 #include "selfdrive/modeld/models/commonmodel.h"
 
-#include <cassert>
 #include <cmath>
 #include <cstring>
 
 #include "common/clutil.h"
 
-DrivingModelFrame::DrivingModelFrame(cl_device_id device_id, cl_context context) : ModelFrame(device_id, context) {
+ModelFrame::ModelFrame(cl_device_id device_id, cl_context context) {
   input_frames = std::make_unique<uint8_t[]>(buf_size);
-  input_frames_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, buf_size, NULL, &err));
+
+  q = CL_CHECK_ERR(clCreateCommandQueue(context, device_id, 0, &err));
+  y_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, MODEL_WIDTH * MODEL_HEIGHT, NULL, &err));
+  u_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, (MODEL_WIDTH / 2) * (MODEL_HEIGHT / 2), NULL, &err));
+  v_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, (MODEL_WIDTH / 2) * (MODEL_HEIGHT / 2), NULL, &err));
   img_buffer_20hz_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, 5*frame_size_bytes, NULL, &err));
   region.origin = 4 * frame_size_bytes;
   region.size = frame_size_bytes;
   last_img_cl = CL_CHECK_ERR(clCreateSubBuffer(img_buffer_20hz_cl, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, &region, &err));
 
+  transform_init(&transform, context, device_id);
   loadyuv_init(&loadyuv, context, device_id, MODEL_WIDTH, MODEL_HEIGHT);
-  init_transform(device_id, context, MODEL_WIDTH, MODEL_HEIGHT);
 }
 
-cl_mem* DrivingModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection) {
-  run_transform(yuv_cl, MODEL_WIDTH, MODEL_HEIGHT, frame_width, frame_height, frame_stride, frame_uv_offset, projection);
+uint8_t* ModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3 &projection, cl_mem *output) {
+  transform_queue(&this->transform, q,
+                  yuv_cl, frame_width, frame_height, frame_stride, frame_uv_offset,
+                  y_cl, u_cl, v_cl, MODEL_WIDTH, MODEL_HEIGHT, projection);
 
   for (int i = 0; i < 4; i++) {
     CL_CHECK(clEnqueueCopyBuffer(q, img_buffer_20hz_cl, img_buffer_20hz_cl, (i+1)*frame_size_bytes, i*frame_size_bytes, frame_size_bytes, 0, nullptr, nullptr));
   }
   loadyuv_queue(&loadyuv, q, y_cl, u_cl, v_cl, last_img_cl);
 
-  copy_queue(&loadyuv, q, img_buffer_20hz_cl, input_frames_cl, 0, 0, frame_size_bytes);
-  copy_queue(&loadyuv, q, last_img_cl, input_frames_cl, 0, frame_size_bytes, frame_size_bytes);
-
-  // NOTE: Since thneed is using a different command queue, this clFinish is needed to ensure the image is ready.
-  clFinish(q);
-  return &input_frames_cl;
+  if (output == NULL) {
+    CL_CHECK(clEnqueueReadBuffer(q, img_buffer_20hz_cl, CL_TRUE, 0, frame_size_bytes, &input_frames[0], 0, nullptr, nullptr));
+    CL_CHECK(clEnqueueReadBuffer(q, last_img_cl, CL_TRUE, 0, frame_size_bytes, &input_frames[MODEL_FRAME_SIZE], 0, nullptr, nullptr));
+    clFinish(q);
+    return &input_frames[0];
+  } else {
+    copy_queue(&loadyuv, q, img_buffer_20hz_cl, *output, 0, 0, frame_size_bytes);
+    copy_queue(&loadyuv, q, last_img_cl, *output, 0, frame_size_bytes, frame_size_bytes);
+
+    // NOTE: Since thneed is using a different command queue, this clFinish is needed to ensure the image is ready.
+    clFinish(q);
+    return NULL;
+  }
 }
 
-DrivingModelFrame::~DrivingModelFrame() {
-  deinit_transform();
+ModelFrame::~ModelFrame() {
+  transform_destroy(&transform);
   loadyuv_destroy(&loadyuv);
   CL_CHECK(clReleaseMemObject(img_buffer_20hz_cl));
   CL_CHECK(clReleaseMemObject(last_img_cl));
+  CL_CHECK(clReleaseMemObject(v_cl));
+  CL_CHECK(clReleaseMemObject(u_cl));
+  CL_CHECK(clReleaseMemObject(y_cl));
   CL_CHECK(clReleaseCommandQueue(q));
 }
-
-MonitoringModelFrame::MonitoringModelFrame(cl_device_id device_id, cl_context context) : ModelFrame(device_id, context) {
-  input_frames = std::make_unique<uint8_t[]>(buf_size);
-  input_frame_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, buf_size, NULL, &err));
-
-  init_transform(device_id, context, MODEL_WIDTH, MODEL_HEIGHT);
-}
-
-cl_mem* MonitoringModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection) {
-  run_transform(yuv_cl, MODEL_WIDTH, MODEL_HEIGHT, frame_width, frame_height, frame_stride, frame_uv_offset, projection);
-  clFinish(q);
-  return &y_cl;
-}
-
-MonitoringModelFrame::~MonitoringModelFrame() {
-  deinit_transform();
-  CL_CHECK(clReleaseCommandQueue(q));
-}

selfdrive/modeld/models/commonmodel.h
@@ -2,7 +2,6 @@
 
 #include <cfloat>
 #include <cstdlib>
-#include <cassert>
 #include <memory>
@@ -19,54 +18,9 @@
 class ModelFrame {
 public:
-  ModelFrame(cl_device_id device_id, cl_context context) {
-    q = CL_CHECK_ERR(clCreateCommandQueue(context, device_id, 0, &err));
-  }
-  virtual ~ModelFrame() {}
-  virtual cl_mem* prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection) { return NULL; }
-  uint8_t* buffer_from_cl(cl_mem *in_frames, int buffer_size) {
-    CL_CHECK(clEnqueueReadBuffer(q, *in_frames, CL_TRUE, 0, buffer_size, input_frames.get(), 0, nullptr, nullptr));
-    clFinish(q);
-    return &input_frames[0];
-  }
-
-  int MODEL_WIDTH;
-  int MODEL_HEIGHT;
-  int MODEL_FRAME_SIZE;
-  int buf_size;
-
-protected:
-  cl_mem y_cl, u_cl, v_cl;
-  Transform transform;
-  cl_command_queue q;
-  std::unique_ptr<uint8_t[]> input_frames;
-
-  void init_transform(cl_device_id device_id, cl_context context, int model_width, int model_height) {
-    y_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, model_width * model_height, NULL, &err));
-    u_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, (model_width / 2) * (model_height / 2), NULL, &err));
-    v_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, (model_width / 2) * (model_height / 2), NULL, &err));
-    transform_init(&transform, context, device_id);
-  }
-
-  void deinit_transform() {
-    transform_destroy(&transform);
-    CL_CHECK(clReleaseMemObject(v_cl));
-    CL_CHECK(clReleaseMemObject(u_cl));
-    CL_CHECK(clReleaseMemObject(y_cl));
-  }
-
-  void run_transform(cl_mem yuv_cl, int model_width, int model_height, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection) {
-    transform_queue(&transform, q,
-                    yuv_cl, frame_width, frame_height, frame_stride, frame_uv_offset,
-                    y_cl, u_cl, v_cl, model_width, model_height, projection);
-  }
-};
-
-class DrivingModelFrame : public ModelFrame {
-public:
-  DrivingModelFrame(cl_device_id device_id, cl_context context);
-  ~DrivingModelFrame();
-  cl_mem* prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection);
+  ModelFrame(cl_device_id device_id, cl_context context);
+  ~ModelFrame();
+  uint8_t* prepare(cl_mem yuv_cl, int width, int height, int frame_stride, int frame_uv_offset, const mat3& transform, cl_mem *output);
 
   const int MODEL_WIDTH = 512;
   const int MODEL_HEIGHT = 256;
@@ -75,22 +29,10 @@ public:
   const size_t frame_size_bytes = MODEL_FRAME_SIZE * sizeof(uint8_t);
 
 private:
+  Transform transform;
   LoadYUVState loadyuv;
-  cl_mem img_buffer_20hz_cl, last_img_cl, input_frames_cl;
+  cl_command_queue q;
+  cl_mem y_cl, u_cl, v_cl, img_buffer_20hz_cl, last_img_cl;
   cl_buffer_region region;
+  std::unique_ptr<uint8_t[]> input_frames;
 };
-
-class MonitoringModelFrame : public ModelFrame {
-public:
-  MonitoringModelFrame(cl_device_id device_id, cl_context context);
-  ~MonitoringModelFrame();
-  cl_mem* prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection);
-
-  const int MODEL_WIDTH = 1440;
-  const int MODEL_HEIGHT = 960;
-  const int MODEL_FRAME_SIZE = MODEL_WIDTH * MODEL_HEIGHT;
-  const int buf_size = MODEL_FRAME_SIZE;
-
-private:
-  cl_mem input_frame_cl;
-};

selfdrive/modeld/models/commonmodel.pxd
@@ -14,13 +14,5 @@ cdef extern from "common/clutil.h":
 cdef extern from "selfdrive/modeld/models/commonmodel.h":
   cppclass ModelFrame:
     int buf_size
-    unsigned char * buffer_from_cl(cl_mem*, int);
-    cl_mem * prepare(cl_mem, int, int, int, int, mat3)
-
-  cppclass DrivingModelFrame:
-    int buf_size
-    DrivingModelFrame(cl_device_id, cl_context)
-
-  cppclass MonitoringModelFrame:
-    int buf_size
-    MonitoringModelFrame(cl_device_id, cl_context)
+    ModelFrame(cl_device_id, cl_context)
+    unsigned char * prepare(cl_mem, int, int, int, int, mat3, cl_mem*)

selfdrive/modeld/models/commonmodel_pyx.pyx
@@ -4,12 +4,11 @@
 import numpy as np
 cimport numpy as cnp
 from libc.string cimport memcpy
-from libc.stdint cimport uintptr_t
 
 from msgq.visionipc.visionipc cimport cl_mem
 from msgq.visionipc.visionipc_pyx cimport VisionBuf, CLContext as BaseCLContext
 from .commonmodel cimport CL_DEVICE_TYPE_DEFAULT, cl_get_device_id, cl_create_context
-from .commonmodel cimport mat3, ModelFrame as cppModelFrame, DrivingModelFrame as cppDrivingModelFrame, MonitoringModelFrame as cppMonitoringModelFrame
+from .commonmodel cimport mat3, ModelFrame as cppModelFrame
 
 cdef class CLContext(BaseCLContext):
@@ -24,47 +23,23 @@ cdef class CLMem:
     mem.mem = <cl_mem*> cmem
     return mem
 
-  @property
-  def mem_address(self):
-    return <uintptr_t>(self.mem)
-
-def cl_from_visionbuf(VisionBuf buf):
-  return CLMem.create(<void*>&buf.buf.buf_cl)
-
 cdef class ModelFrame:
   cdef cppModelFrame * frame
-  cdef int buf_size
+
+  def __cinit__(self, CLContext context):
+    self.frame = new cppModelFrame(context.device_id, context.context)
 
   def __dealloc__(self):
     del self.frame
 
-  def prepare(self, VisionBuf buf, float[:] projection):
+  def prepare(self, VisionBuf buf, float[:] projection, CLMem output):
     cdef mat3 cprojection
     memcpy(cprojection.v, &projection[0], 9*sizeof(float))
-    cdef cl_mem * data
-    data = self.frame.prepare(buf.buf.buf_cl, buf.width, buf.height, buf.stride, buf.uv_offset, cprojection)
-    return CLMem.create(data)
-
-  def buffer_from_cl(self, CLMem in_frames):
-    cdef unsigned char * data2
-    data2 = self.frame.buffer_from_cl(in_frames.mem, self.buf_size)
-    return np.asarray(<cnp.uint8_t[:self.buf_size]> data2)
-
-cdef class DrivingModelFrame(ModelFrame):
-  cdef cppDrivingModelFrame * _frame
-
-  def __cinit__(self, CLContext context):
-    self._frame = new cppDrivingModelFrame(context.device_id, context.context)
-    self.frame = <cppModelFrame*>(self._frame)
-    self.buf_size = self._frame.buf_size
-
-cdef class MonitoringModelFrame(ModelFrame):
-  cdef cppMonitoringModelFrame * _frame
-
-  def __cinit__(self, CLContext context):
-    self._frame = new cppMonitoringModelFrame(context.device_id, context.context)
-    self.frame = <cppModelFrame*>(self._frame)
-    self.buf_size = self._frame.buf_size
+    cdef unsigned char * data
+    if output is None:
+      data = self.frame.prepare(buf.buf.buf_cl, buf.width, buf.height, buf.stride, buf.uv_offset, cprojection, NULL)
+    else:
+      data = self.frame.prepare(buf.buf.buf_cl, buf.width, buf.height, buf.stride, buf.uv_offset, cprojection, output.mem)
+    if not data:
+      return None
+    return np.asarray(<cnp.uint8_t[:self.frame.buf_size]> data)

selfdrive/modeld/runners/__init__.py
@@ -0,0 +1,27 @@
import os
from openpilot.system.hardware import TICI
from openpilot.selfdrive.modeld.runners.runmodel_pyx import RunModel, Runtime
assert Runtime

USE_THNEED = int(os.getenv('USE_THNEED', str(int(TICI))))
USE_SNPE = int(os.getenv('USE_SNPE', str(int(TICI))))

class ModelRunner(RunModel):
  THNEED = 'THNEED'
  SNPE = 'SNPE'
  ONNX = 'ONNX'

  def __new__(cls, paths, *args, **kwargs):
    if ModelRunner.THNEED in paths and USE_THNEED:
      from openpilot.selfdrive.modeld.runners.thneedmodel_pyx import ThneedModel as Runner
      runner_type = ModelRunner.THNEED
    elif ModelRunner.SNPE in paths and USE_SNPE:
      from openpilot.selfdrive.modeld.runners.snpemodel_pyx import SNPEModel as Runner
      runner_type = ModelRunner.SNPE
    elif ModelRunner.ONNX in paths:
      from openpilot.selfdrive.modeld.runners.onnxmodel import ONNXModel as Runner
      runner_type = ModelRunner.ONNX
    else:
      raise Exception("Couldn't select a model runner, make sure to pass at least one valid model path")

    return Runner(str(paths[runner_type]), *args, **kwargs)

selfdrive/modeld/runners/onnxmodel.py
@@ -0,0 +1,98 @@
import onnx
import itertools
import os
import sys
import numpy as np
from typing import Any

from openpilot.selfdrive.modeld.runners.runmodel_pyx import RunModel

ORT_TYPES_TO_NP_TYPES = {'tensor(float16)': np.float16, 'tensor(float)': np.float32, 'tensor(uint8)': np.uint8}

def attributeproto_fp16_to_fp32(attr):
  float32_list = np.frombuffer(attr.raw_data, dtype=np.float16)
  attr.data_type = 1
  attr.raw_data = float32_list.astype(np.float32).tobytes()

def convert_fp16_to_fp32(onnx_path_or_bytes):
  if isinstance(onnx_path_or_bytes, bytes):
    model = onnx.load_from_string(onnx_path_or_bytes)
  elif isinstance(onnx_path_or_bytes, str):
    model = onnx.load(onnx_path_or_bytes)

  for i in model.graph.initializer:
    if i.data_type == 10:
      attributeproto_fp16_to_fp32(i)
  for i in itertools.chain(model.graph.input, model.graph.output):
    if i.type.tensor_type.elem_type == 10:
      i.type.tensor_type.elem_type = 1
  for i in model.graph.node:
    if i.op_type == 'Cast' and i.attribute[0].i == 10:
      i.attribute[0].i = 1
    for a in i.attribute:
      if hasattr(a, 't'):
        if a.t.data_type == 10:
          attributeproto_fp16_to_fp32(a.t)
  return model.SerializeToString()

def create_ort_session(path, fp16_to_fp32):
  os.environ["OMP_NUM_THREADS"] = "4"
  os.environ["OMP_WAIT_POLICY"] = "PASSIVE"

  import onnxruntime as ort
  print("Onnx available providers: ", ort.get_available_providers(), file=sys.stderr)
  options = ort.SessionOptions()
  options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_DISABLE_ALL

  provider: str | tuple[str, dict[Any, Any]]
  if 'OpenVINOExecutionProvider' in ort.get_available_providers() and 'ONNXCPU' not in os.environ:
    provider = 'OpenVINOExecutionProvider'
  elif 'CUDAExecutionProvider' in ort.get_available_providers() and 'ONNXCPU' not in os.environ:
    options.intra_op_num_threads = 2
    provider = ('CUDAExecutionProvider', {'cudnn_conv_algo_search': 'EXHAUSTIVE'})
  else:
    options.intra_op_num_threads = 2
    options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
    options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
    provider = 'CPUExecutionProvider'

  model_data = convert_fp16_to_fp32(path) if fp16_to_fp32 else path
  print("Onnx selected provider: ", [provider], file=sys.stderr)
  ort_session = ort.InferenceSession(model_data, options, providers=[provider])
  print("Onnx using ", ort_session.get_providers(), file=sys.stderr)
  return ort_session

class ONNXModel(RunModel):
  def __init__(self, path, output, runtime, use_tf8, cl_context):
    self.inputs = {}
    self.output = output

    self.session = create_ort_session(path, fp16_to_fp32=True)
    self.input_names = [x.name for x in self.session.get_inputs()]
    self.input_shapes = {x.name: [1, *x.shape[1:]] for x in self.session.get_inputs()}
    self.input_dtypes = {x.name: ORT_TYPES_TO_NP_TYPES[x.type] for x in self.session.get_inputs()}

    # run once to initialize CUDA provider
    if "CUDAExecutionProvider" in self.session.get_providers():
      self.session.run(None, {k: np.zeros(self.input_shapes[k], dtype=self.input_dtypes[k]) for k in self.input_names})
    print("ready to run onnx model", self.input_shapes, file=sys.stderr)

  def addInput(self, name, buffer):
    assert name in self.input_names
    self.inputs[name] = buffer

  def setInputBuffer(self, name, buffer):
    assert name in self.inputs
    self.inputs[name] = buffer

  def getCLBuffer(self, name):
    return None

  def execute(self):
    inputs = {k: v.view(self.input_dtypes[k]) for k,v in self.inputs.items()}
    inputs = {k: v.reshape(self.input_shapes[k]).astype(self.input_dtypes[k]) for k,v in inputs.items()}
    outputs = self.session.run(None, inputs)
    assert len(outputs) == 1, "Only single model outputs are supported"
    self.output[:] = outputs[0]
    return self.output

selfdrive/modeld/runners/ort_helpers.py
@@ -1,37 +0,0 @@
import onnx
import onnxruntime as ort
import numpy as np
import itertools

ORT_TYPES_TO_NP_TYPES = {'tensor(float16)': np.float16, 'tensor(float)': np.float32, 'tensor(uint8)': np.uint8}

def attributeproto_fp16_to_fp32(attr):
  float32_list = np.frombuffer(attr.raw_data, dtype=np.float16)
  attr.data_type = 1
  attr.raw_data = float32_list.astype(np.float32).tobytes()

def convert_fp16_to_fp32(model):
  for i in model.graph.initializer:
    if i.data_type == 10:
      attributeproto_fp16_to_fp32(i)
  for i in itertools.chain(model.graph.input, model.graph.output):
    if i.type.tensor_type.elem_type == 10:
      i.type.tensor_type.elem_type = 1
  for i in model.graph.node:
    if i.op_type == 'Cast' and i.attribute[0].i == 10:
      i.attribute[0].i = 1
    for a in i.attribute:
      if hasattr(a, 't'):
        if a.t.data_type == 10:
          attributeproto_fp16_to_fp32(a.t)
  return model.SerializeToString()

def make_onnx_cpu_runner(model_path):
  options = ort.SessionOptions()
  options.intra_op_num_threads = 4
  options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
  options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
  model_data = convert_fp16_to_fp32(onnx.load(model_path))
  return ort.InferenceSession(model_data, options, providers=['CPUExecutionProvider'])

selfdrive/modeld/runners/run.h
@@ -0,0 +1,4 @@
#pragma once
#include "selfdrive/modeld/runners/runmodel.h"
#include "selfdrive/modeld/runners/snpemodel.h"

selfdrive/modeld/runners/runmodel.h
@@ -0,0 +1,49 @@
#pragma once

#include <string>
#include <vector>
#include <memory>
#include <cassert>

#include "common/clutil.h"
#include "common/swaglog.h"

#define USE_CPU_RUNTIME 0
#define USE_GPU_RUNTIME 1
#define USE_DSP_RUNTIME 2

struct ModelInput {
  const std::string name;
  float *buffer;
  int size;

  ModelInput(const std::string _name, float *_buffer, int _size) : name(_name), buffer(_buffer), size(_size) {}
  virtual void setBuffer(float *_buffer, int _size) {
    assert(size == _size || size == 0);
    buffer = _buffer;
    size = _size;
  }
};

class RunModel {
public:
  std::vector<std::unique_ptr<ModelInput>> inputs;

  virtual ~RunModel() {}
  virtual void execute() {}
  virtual void* getCLBuffer(const std::string name) { return nullptr; }
  virtual void addInput(const std::string name, float *buffer, int size) {
    inputs.push_back(std::unique_ptr<ModelInput>(new ModelInput(name, buffer, size)));
  }
  virtual void setInputBuffer(const std::string name, float *buffer, int size) {
    for (auto &input : inputs) {
      if (name == input->name) {
        input->setBuffer(buffer, size);
        return;
      }
    }
    LOGE("Tried to update input `%s` but no input with this name exists", name.c_str());
    assert(false);
  }
};

selfdrive/modeld/runners/runmodel.pxd
@@ -0,0 +1,14 @@
# distutils: language = c++

from libcpp.string cimport string

cdef extern from "selfdrive/modeld/runners/runmodel.h":
  cdef int USE_CPU_RUNTIME
  cdef int USE_GPU_RUNTIME
  cdef int USE_DSP_RUNTIME

  cdef cppclass RunModel:
    void addInput(string, float*, int)
    void setInputBuffer(string, float*, int)
    void * getCLBuffer(string)
    void execute()

selfdrive/modeld/runners/runmodel_pyx.pxd
@@ -0,0 +1,6 @@
# distutils: language = c++

from .runmodel cimport RunModel as cppRunModel

cdef class RunModel:
  cdef cppRunModel * model

selfdrive/modeld/runners/runmodel_pyx.pyx
@@ -0,0 +1,37 @@
# distutils: language = c++
# cython: c_string_encoding=ascii, language_level=3

from libcpp.string cimport string

from .runmodel cimport USE_CPU_RUNTIME, USE_GPU_RUNTIME, USE_DSP_RUNTIME
from selfdrive.modeld.models.commonmodel_pyx cimport CLMem

class Runtime:
  CPU = USE_CPU_RUNTIME
  GPU = USE_GPU_RUNTIME
  DSP = USE_DSP_RUNTIME

cdef class RunModel:
  def __dealloc__(self):
    del self.model

  def addInput(self, string name, float[:] buffer):
    if buffer is not None:
      self.model.addInput(name, &buffer[0], len(buffer))
    else:
      self.model.addInput(name, NULL, 0)

  def setInputBuffer(self, string name, float[:] buffer):
    if buffer is not None:
      self.model.setInputBuffer(name, &buffer[0], len(buffer))
    else:
      self.model.setInputBuffer(name, NULL, 0)

  def getCLBuffer(self, string name):
    cdef void * cl_buf = self.model.getCLBuffer(name)
    if not cl_buf:
      return None
    return CLMem.create(cl_buf)

  def execute(self):
    self.model.execute()

selfdrive/modeld/runners/snpemodel.cc
@@ -0,0 +1,116 @@
#pragma clang diagnostic ignored "-Wexceptions"
#include "selfdrive/modeld/runners/snpemodel.h"
#include <cstring>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "common/util.h"
#include "common/timing.h"
void PrintErrorStringAndExit() {
std::cerr << zdl::DlSystem::getLastErrorString() << std::endl;
std::exit(EXIT_FAILURE);
}
SNPEModel::SNPEModel(const std::string path, float *_output, size_t _output_size, int runtime, bool _use_tf8, cl_context context) {
output = _output;
output_size = _output_size;
use_tf8 = _use_tf8;
#ifdef QCOM2
if (runtime == USE_GPU_RUNTIME) {
snpe_runtime = zdl::DlSystem::Runtime_t::GPU;
} else if (runtime == USE_DSP_RUNTIME) {
snpe_runtime = zdl::DlSystem::Runtime_t::DSP;
} else {
snpe_runtime = zdl::DlSystem::Runtime_t::CPU;
}
assert(zdl::SNPE::SNPEFactory::isRuntimeAvailable(snpe_runtime));
#endif
model_data = util::read_file(path);
assert(model_data.size() > 0);
// load model
std::unique_ptr<zdl::DlContainer::IDlContainer> container = zdl::DlContainer::IDlContainer::open((uint8_t*)model_data.data(), model_data.size());
if (!container) { PrintErrorStringAndExit(); }
LOGW("loaded model with size: %lu", model_data.size());
// create model runner
zdl::SNPE::SNPEBuilder snpe_builder(container.get());
while (!snpe) {
#ifdef QCOM2
snpe = snpe_builder.setOutputLayers({})
.setRuntimeProcessor(snpe_runtime)
.setUseUserSuppliedBuffers(true)
.setPerformanceProfile(zdl::DlSystem::PerformanceProfile_t::HIGH_PERFORMANCE)
.build();
#else
snpe = snpe_builder.setOutputLayers({})
.setUseUserSuppliedBuffers(true)
.setPerformanceProfile(zdl::DlSystem::PerformanceProfile_t::HIGH_PERFORMANCE)
.build();
#endif
if (!snpe) std::cerr << zdl::DlSystem::getLastErrorString() << std::endl;
}
// create output buffer
zdl::DlSystem::UserBufferEncodingFloat ub_encoding_float;
zdl::DlSystem::IUserBufferFactory &ub_factory = zdl::SNPE::SNPEFactory::getUserBufferFactory();
const auto &output_tensor_names_opt = snpe->getOutputTensorNames();
if (!output_tensor_names_opt) throw std::runtime_error("Error obtaining output tensor names");
const auto &output_tensor_names = *output_tensor_names_opt;
assert(output_tensor_names.size() == 1);
const char *output_tensor_name = output_tensor_names.at(0);
const zdl::DlSystem::TensorShape &buffer_shape = snpe->getInputOutputBufferAttributes(output_tensor_name)->getDims();
if (output_size != 0) {
assert(output_size == buffer_shape[1]);
} else {
output_size = buffer_shape[1];
}
std::vector<size_t> output_strides = {output_size * sizeof(float), sizeof(float)};
output_buffer = ub_factory.createUserBuffer(output, output_size * sizeof(float), output_strides, &ub_encoding_float);
output_map.add(output_tensor_name, output_buffer.get());
}
void SNPEModel::addInput(const std::string name, float *buffer, int size) {
const int idx = inputs.size();
const auto &input_tensor_names_opt = snpe->getInputTensorNames();
if (!input_tensor_names_opt) throw std::runtime_error("Error obtaining input tensor names");
const auto &input_tensor_names = *input_tensor_names_opt;
const char *input_tensor_name = input_tensor_names.at(idx);
const bool input_tf8 = use_tf8 && strcmp(input_tensor_name, "input_img") == 0; // TODO: This is a terrible hack, get rid of this name check both here and in onnx_runner.py
LOGW("adding index %d: %s", idx, input_tensor_name);
zdl::DlSystem::UserBufferEncodingFloat ub_encoding_float;
zdl::DlSystem::UserBufferEncodingTf8 ub_encoding_tf8(0, 1./255); // network takes 0-1
zdl::DlSystem::IUserBufferFactory &ub_factory = zdl::SNPE::SNPEFactory::getUserBufferFactory();
zdl::DlSystem::UserBufferEncoding *input_encoding = input_tf8 ? (zdl::DlSystem::UserBufferEncoding*)&ub_encoding_tf8 : (zdl::DlSystem::UserBufferEncoding*)&ub_encoding_float;
const auto &buffer_shape_opt = snpe->getInputDimensions(input_tensor_name);
const zdl::DlSystem::TensorShape &buffer_shape = *buffer_shape_opt;
size_t size_of_input = input_tf8 ? sizeof(uint8_t) : sizeof(float);
std::vector<size_t> strides(buffer_shape.rank());
strides[strides.size() - 1] = size_of_input;
size_t product = 1;
for (size_t i = 0; i < buffer_shape.rank(); i++) product *= buffer_shape[i];
size_t stride = strides[strides.size() - 1];
for (size_t i = buffer_shape.rank() - 1; i > 0; i--) {
stride *= buffer_shape[i];
strides[i-1] = stride;
}
auto input_buffer = ub_factory.createUserBuffer(buffer, product*size_of_input, strides, input_encoding);
input_map.add(input_tensor_name, input_buffer.get());
inputs.push_back(std::unique_ptr<SNPEModelInput>(new SNPEModelInput(name, buffer, size, std::move(input_buffer))));
}
void SNPEModel::execute() {
if (!snpe->execute(input_map, output_map)) {
PrintErrorStringAndExit();
}
}

selfdrive/modeld/runners/snpemodel.h
@@ -0,0 +1,52 @@
#pragma once
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
#include <memory>
#include <string>
#include <utility>
#include <DlContainer/IDlContainer.hpp>
#include <DlSystem/DlError.hpp>
#include <DlSystem/ITensor.hpp>
#include <DlSystem/ITensorFactory.hpp>
#include <DlSystem/IUserBuffer.hpp>
#include <DlSystem/IUserBufferFactory.hpp>
#include <SNPE/SNPE.hpp>
#include <SNPE/SNPEBuilder.hpp>
#include <SNPE/SNPEFactory.hpp>
#include "selfdrive/modeld/runners/runmodel.h"
struct SNPEModelInput : public ModelInput {
std::unique_ptr<zdl::DlSystem::IUserBuffer> snpe_buffer;
SNPEModelInput(const std::string _name, float *_buffer, int _size, std::unique_ptr<zdl::DlSystem::IUserBuffer> _snpe_buffer) : ModelInput(_name, _buffer, _size), snpe_buffer(std::move(_snpe_buffer)) {}
void setBuffer(float *_buffer, int _size) {
ModelInput::setBuffer(_buffer, _size);
assert(snpe_buffer->setBufferAddress(_buffer) == true);
}
};
class SNPEModel : public RunModel {
public:
SNPEModel(const std::string path, float *_output, size_t _output_size, int runtime, bool use_tf8 = false, cl_context context = NULL);
void addInput(const std::string name, float *buffer, int size);
void execute();
private:
std::string model_data;
#ifdef QCOM2
zdl::DlSystem::Runtime_t snpe_runtime;
#endif
// snpe model stuff
std::unique_ptr<zdl::SNPE::SNPE> snpe;
zdl::DlSystem::UserBufferMap input_map;
zdl::DlSystem::UserBufferMap output_map;
std::unique_ptr<zdl::DlSystem::IUserBuffer> output_buffer;
bool use_tf8;
float *output;
size_t output_size;
};

selfdrive/modeld/runners/snpemodel.pxd
@@ -0,0 +1,9 @@
# distutils: language = c++

from libcpp.string cimport string
from msgq.visionipc.visionipc cimport cl_context

cdef extern from "selfdrive/modeld/runners/snpemodel.h":
  cdef cppclass SNPEModel:
    SNPEModel(string, float*, size_t, int, bool, cl_context)

selfdrive/modeld/runners/snpemodel_pyx.pyx
@@ -0,0 +1,17 @@
# distutils: language = c++
# cython: c_string_encoding=ascii, language_level=3

import os
from libcpp cimport bool
from libcpp.string cimport string

from .snpemodel cimport SNPEModel as cppSNPEModel
from selfdrive.modeld.models.commonmodel_pyx cimport CLContext
from selfdrive.modeld.runners.runmodel_pyx cimport RunModel
from selfdrive.modeld.runners.runmodel cimport RunModel as cppRunModel

os.environ['ADSP_LIBRARY_PATH'] = "/data/pythonpath/third_party/snpe/dsp/"

cdef class SNPEModel(RunModel):
  def __cinit__(self, string path, float[:] output, int runtime, bool use_tf8, CLContext context):
    self.model = <cppRunModel *> new cppSNPEModel(path, &output[0], len(output), runtime, use_tf8, context.context)

selfdrive/modeld/runners/thneedmodel.cc
@@ -0,0 +1,58 @@
#include "selfdrive/modeld/runners/thneedmodel.h"

#include <string>

#include "common/swaglog.h"

ThneedModel::ThneedModel(const std::string path, float *_output, size_t _output_size, int runtime, bool luse_tf8, cl_context context) {
  thneed = new Thneed(true, context);
  thneed->load(path.c_str());
  thneed->clexec();

  recorded = false;
  output = _output;
}

void* ThneedModel::getCLBuffer(const std::string name) {
  int index = -1;
  for (int i = 0; i < inputs.size(); i++) {
    if (name == inputs[i]->name) {
      index = i;
      break;
    }
  }

  if (index == -1) {
    LOGE("Tried to get CL buffer for input `%s` but no input with this name exists", name.c_str());
    assert(false);
  }

  if (thneed->input_clmem.size() >= inputs.size()) {
    return &thneed->input_clmem[inputs.size() - index - 1];
  } else {
    return nullptr;
  }
}

void ThneedModel::execute() {
  if (!recorded) {
    thneed->record = true;
    float *input_buffers[inputs.size()];
    for (int i = 0; i < inputs.size(); i++) {
      input_buffers[inputs.size() - i - 1] = inputs[i]->buffer;
    }

    thneed->copy_inputs(input_buffers);
    thneed->clexec();
    thneed->copy_output(output);
    thneed->stop();

    recorded = true;
  } else {
    float *input_buffers[inputs.size()];
    for (int i = 0; i < inputs.size(); i++) {
      input_buffers[inputs.size() - i - 1] = inputs[i]->buffer;
    }
    thneed->execute(input_buffers, output);
  }
}

selfdrive/modeld/runners/thneedmodel.h
@@ -0,0 +1,17 @@
#pragma once

#include <string>

#include "selfdrive/modeld/runners/runmodel.h"
#include "selfdrive/modeld/thneed/thneed.h"

class ThneedModel : public RunModel {
public:
  ThneedModel(const std::string path, float *_output, size_t _output_size, int runtime, bool use_tf8 = false, cl_context context = NULL);
  void *getCLBuffer(const std::string name);
  void execute();
private:
  Thneed *thneed = NULL;
  bool recorded;
  float *output;
};

selfdrive/modeld/runners/thneedmodel.pxd
@@ -0,0 +1,9 @@
# distutils: language = c++

from libcpp.string cimport string
from msgq.visionipc.visionipc cimport cl_context

cdef extern from "selfdrive/modeld/runners/thneedmodel.h":
  cdef cppclass ThneedModel:
    ThneedModel(string, float*, size_t, int, bool, cl_context)

selfdrive/modeld/runners/thneedmodel_pyx.pyx
@@ -0,0 +1,14 @@
# distutils: language = c++
# cython: c_string_encoding=ascii, language_level=3

from libcpp cimport bool
from libcpp.string cimport string

from .thneedmodel cimport ThneedModel as cppThneedModel
from selfdrive.modeld.models.commonmodel_pyx cimport CLContext
from selfdrive.modeld.runners.runmodel_pyx cimport RunModel
from selfdrive.modeld.runners.runmodel cimport RunModel as cppRunModel

cdef class ThneedModel(RunModel):
  def __cinit__(self, string path, float[:] output, int runtime, bool use_tf8, CLContext context):
    self.model = <cppRunModel *> new cppThneedModel(path, &output[0], len(output), runtime, use_tf8, context.context)

selfdrive/modeld/runners/tinygrad_helpers.py
@@ -1,8 +0,0 @@
from tinygrad.tensor import Tensor
from tinygrad.helpers import to_mv

def qcom_tensor_from_opencl_address(opencl_address, shape, dtype):
  cl_buf_desc_ptr = to_mv(opencl_address, 8).cast('Q')[0]
  rawbuf_ptr = to_mv(cl_buf_desc_ptr, 0x100).cast('Q')[20]  # offset 0xA0 is a raw gpu pointer.
  return Tensor.from_blob(rawbuf_ptr, shape, dtype=dtype, device='QCOM')

selfdrive/modeld/thneed/README
@@ -0,0 +1,8 @@
thneed is an SNPE accelerator. I know SNPE is already an accelerator, but sometimes things need to go even faster..
It runs on the local device, and caches a single model run. Then it replays it, but fast.
thneed slices through abstraction layers like a fish.
You need a thneed.
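For orientation, a minimal sketch of how a thneed model is consumed through the ModelRunner facade restored by this revert, condensed from dmonitoringmodeld.py in this same commit; the paths and sizes below are illustrative, not prescriptive.

# Sketch only (assumes an openpilot environment); mirrors the API restored in this commit.
import numpy as np
from pathlib import Path
from openpilot.selfdrive.modeld.runners import ModelRunner, Runtime
from openpilot.selfdrive.modeld.models.commonmodel_pyx import CLContext

MODEL_PATHS = {ModelRunner.THNEED: Path('models/dmonitoring_model.thneed'),
               ModelRunner.ONNX: Path('models/dmonitoring_model.onnx')}
output = np.zeros(84 + 512, dtype=np.float32)   # OUTPUT_SIZE from dmonitoringmodeld.py
calib = np.zeros(3, dtype=np.float32)           # CALIB_LEN

model = ModelRunner(MODEL_PATHS, output, Runtime.GPU, False, CLContext())
model.addInput("input_img", None)               # image buffer is supplied per frame
model.addInput("calib", calib)
# per frame: fill the image buffer, then
#   model.setInputBuffer("input_img", img.view(np.float32))
#   model.execute()  # the first call records the CL command stream; later calls replay it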

selfdrive/modeld/thneed/serialize.cc
@@ -0,0 +1,154 @@
#include <cassert>
#include <set>
#include "third_party/json11/json11.hpp"
#include "common/util.h"
#include "common/clutil.h"
#include "common/swaglog.h"
#include "selfdrive/modeld/thneed/thneed.h"
using namespace json11;
extern map<cl_program, string> g_program_source;
void Thneed::load(const char *filename) {
LOGD("Thneed::load: loading from %s\n", filename);
string buf = util::read_file(filename);
int jsz = *(int *)buf.data();
string jsonerr;
string jj(buf.data() + sizeof(int), jsz);
Json jdat = Json::parse(jj, jsonerr);
map<cl_mem, cl_mem> real_mem;
real_mem[NULL] = NULL;
int ptr = sizeof(int)+jsz;
for (auto &obj : jdat["objects"].array_items()) {
auto mobj = obj.object_items();
int sz = mobj["size"].int_value();
cl_mem clbuf = NULL;
if (mobj["buffer_id"].string_value().size() > 0) {
// image buffer must already be allocated
clbuf = real_mem[*(cl_mem*)(mobj["buffer_id"].string_value().data())];
assert(mobj["needs_load"].bool_value() == false);
} else {
if (mobj["needs_load"].bool_value()) {
clbuf = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE, sz, &buf[ptr], NULL);
if (debug >= 1) printf("loading %p %d @ 0x%X\n", clbuf, sz, ptr);
ptr += sz;
} else {
// TODO: is there a faster way to init zeroed out buffers?
void *host_zeros = calloc(sz, 1);
clbuf = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE, sz, host_zeros, NULL);
free(host_zeros);
}
}
assert(clbuf != NULL);
if (mobj["arg_type"] == "image2d_t" || mobj["arg_type"] == "image1d_t") {
cl_image_desc desc = {0};
desc.image_type = (mobj["arg_type"] == "image2d_t") ? CL_MEM_OBJECT_IMAGE2D : CL_MEM_OBJECT_IMAGE1D_BUFFER;
desc.image_width = mobj["width"].int_value();
desc.image_height = mobj["height"].int_value();
desc.image_row_pitch = mobj["row_pitch"].int_value();
assert(sz == desc.image_height*desc.image_row_pitch);
#ifdef QCOM2
desc.buffer = clbuf;
#else
// TODO: we are creating unused buffers on PC
clReleaseMemObject(clbuf);
#endif
cl_image_format format = {0};
format.image_channel_order = CL_RGBA;
format.image_channel_data_type = mobj["float32"].bool_value() ? CL_FLOAT : CL_HALF_FLOAT;
cl_int errcode;
#ifndef QCOM2
if (mobj["needs_load"].bool_value()) {
clbuf = clCreateImage(context, CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE, &format, &desc, &buf[ptr-sz], &errcode);
} else {
clbuf = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &errcode);
}
#else
clbuf = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &errcode);
#endif
if (clbuf == NULL) {
LOGE("clError: %s create image %zux%zu rp %zu with buffer %p\n", cl_get_error_string(errcode),
desc.image_width, desc.image_height, desc.image_row_pitch, desc.buffer);
}
assert(clbuf != NULL);
}
real_mem[*(cl_mem*)(mobj["id"].string_value().data())] = clbuf;
}
map<string, cl_program> g_programs;
for (const auto &[name, source] : jdat["programs"].object_items()) {
if (debug >= 1) printf("building %s with size %zu\n", name.c_str(), source.string_value().size());
g_programs[name] = cl_program_from_source(context, device_id, source.string_value());
}
for (auto &obj : jdat["inputs"].array_items()) {
auto mobj = obj.object_items();
int sz = mobj["size"].int_value();
cl_mem aa = real_mem[*(cl_mem*)(mobj["buffer_id"].string_value().data())];
input_clmem.push_back(aa);
input_sizes.push_back(sz);
LOGD("Thneed::load: adding input %s with size %d\n", mobj["name"].string_value().data(), sz);
cl_int cl_err;
void *ret = clEnqueueMapBuffer(command_queue, aa, CL_TRUE, CL_MAP_WRITE, 0, sz, 0, NULL, NULL, &cl_err);
if (cl_err != CL_SUCCESS) LOGE("clError: %s map %p %d\n", cl_get_error_string(cl_err), aa, sz);
assert(cl_err == CL_SUCCESS);
inputs.push_back(ret);
}
for (auto &obj : jdat["outputs"].array_items()) {
auto mobj = obj.object_items();
int sz = mobj["size"].int_value();
LOGD("Thneed::load: adding output with size %d\n", sz);
// TODO: support multiple outputs
output = real_mem[*(cl_mem*)(mobj["buffer_id"].string_value().data())];
assert(output != NULL);
}
for (auto &obj : jdat["binaries"].array_items()) {
string name = obj["name"].string_value();
size_t length = obj["length"].int_value();
if (debug >= 1) printf("binary %s with size %zu\n", name.c_str(), length);
g_programs[name] = cl_program_from_binary(context, device_id, (const uint8_t*)&buf[ptr], length);
ptr += length;
}
for (auto &obj : jdat["kernels"].array_items()) {
auto gws = obj["global_work_size"];
auto lws = obj["local_work_size"];
auto kk = shared_ptr<CLQueuedKernel>(new CLQueuedKernel(this));
kk->name = obj["name"].string_value();
kk->program = g_programs[kk->name];
kk->work_dim = obj["work_dim"].int_value();
for (int i = 0; i < kk->work_dim; i++) {
kk->global_work_size[i] = gws[i].int_value();
kk->local_work_size[i] = lws[i].int_value();
}
kk->num_args = obj["num_args"].int_value();
for (int i = 0; i < kk->num_args; i++) {
string arg = obj["args"].array_items()[i].string_value();
int arg_size = obj["args_size"].array_items()[i].int_value();
kk->args_size.push_back(arg_size);
if (arg_size == 8) {
cl_mem val = *(cl_mem*)(arg.data());
val = real_mem[val];
kk->args.push_back(string((char*)&val, sizeof(val)));
} else {
kk->args.push_back(arg);
}
}
kq.push_back(kk);
}
clFinish(command_queue);
}

@ -0,0 +1,133 @@
#pragma once
#ifndef __user
#define __user __attribute__(())
#endif
#include <cstdint>
#include <cstdlib>
#include <memory>
#include <string>
#include <vector>
#include <CL/cl.h>
#include "third_party/linux/include/msm_kgsl.h"
using namespace std;
cl_int thneed_clSetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value);
namespace json11 {
class Json;
}
class Thneed;
class GPUMalloc {
public:
GPUMalloc(int size, int fd);
~GPUMalloc();
void *alloc(int size);
private:
uint64_t base;
int remaining;
};
class CLQueuedKernel {
public:
CLQueuedKernel(Thneed *lthneed) { thneed = lthneed; }
CLQueuedKernel(Thneed *lthneed,
cl_kernel _kernel,
cl_uint _work_dim,
const size_t *_global_work_size,
const size_t *_local_work_size);
cl_int exec();
void debug_print(bool verbose);
int get_arg_num(const char *search_arg_name);
cl_program program;
string name;
cl_uint num_args;
vector<string> arg_names;
vector<string> arg_types;
vector<string> args;
vector<int> args_size;
cl_kernel kernel = NULL;
json11::Json to_json() const;
cl_uint work_dim;
size_t global_work_size[3] = {0};
size_t local_work_size[3] = {0};
private:
Thneed *thneed;
};
class CachedIoctl {
public:
virtual void exec() {}
};
class CachedSync: public CachedIoctl {
public:
CachedSync(Thneed *lthneed, string ldata) { thneed = lthneed; data = ldata; }
void exec();
private:
Thneed *thneed;
string data;
};
class CachedCommand: public CachedIoctl {
public:
CachedCommand(Thneed *lthneed, struct kgsl_gpu_command *cmd);
void exec();
private:
void disassemble(int cmd_index);
struct kgsl_gpu_command cache;
unique_ptr<kgsl_command_object[]> cmds;
unique_ptr<kgsl_command_object[]> objs;
Thneed *thneed;
vector<shared_ptr<CLQueuedKernel> > kq;
};
class Thneed {
public:
Thneed(bool do_clinit=false, cl_context _context = NULL);
void stop();
void execute(float **finputs, float *foutput, bool slow=false);
void wait();
vector<cl_mem> input_clmem;
vector<void *> inputs;
vector<size_t> input_sizes;
cl_mem output = NULL;
cl_context context = NULL;
cl_command_queue command_queue;
cl_device_id device_id;
int context_id;
// protected?
bool record = false;
int debug;
int timestamp;
#ifdef QCOM2
unique_ptr<GPUMalloc> ram;
vector<unique_ptr<CachedIoctl> > cmds;
int fd;
#endif
// all CL kernels
void copy_inputs(float **finputs, bool internal=false);
void copy_output(float *foutput);
cl_int clexec();
vector<shared_ptr<CLQueuedKernel> > kq;
// pending CL kernels
vector<shared_ptr<CLQueuedKernel> > ckq;
// loading
void load(const char *filename);
private:
void clinit();
};

@ -0,0 +1,216 @@
#include "selfdrive/modeld/thneed/thneed.h"
#include <cassert>
#include <cstring>
#include <map>
#include "common/clutil.h"
#include "common/timing.h"
map<pair<cl_kernel, int>, string> g_args;
map<pair<cl_kernel, int>, int> g_args_size;
map<cl_program, string> g_program_source;
void Thneed::stop() {
//printf("Thneed::stop: recorded %lu commands\n", cmds.size());
record = false;
}
void Thneed::clinit() {
device_id = cl_get_device_id(CL_DEVICE_TYPE_DEFAULT);
if (context == NULL) context = CL_CHECK_ERR(clCreateContext(NULL, 1, &device_id, NULL, NULL, &err));
//cl_command_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
cl_command_queue_properties props[3] = {CL_QUEUE_PROPERTIES, 0, 0};
command_queue = CL_CHECK_ERR(clCreateCommandQueueWithProperties(context, device_id, props, &err));
printf("Thneed::clinit done\n");
}
cl_int Thneed::clexec() {
if (debug >= 1) printf("Thneed::clexec: running %lu queued kernels\n", kq.size());
for (auto &k : kq) {
if (record) ckq.push_back(k);
cl_int ret = k->exec();
assert(ret == CL_SUCCESS);
}
return clFinish(command_queue);
}
void Thneed::copy_inputs(float **finputs, bool internal) {
for (int idx = 0; idx < inputs.size(); ++idx) {
if (debug >= 1) printf("copying %lu -- %p -> %p (cl %p)\n", input_sizes[idx], finputs[idx], inputs[idx], input_clmem[idx]);
if (internal) {
// if it's internal, using memcpy is fine since the buffer sync is cached in the ioctl layer
if (finputs[idx] != NULL) memcpy(inputs[idx], finputs[idx], input_sizes[idx]);
} else {
if (finputs[idx] != NULL) CL_CHECK(clEnqueueWriteBuffer(command_queue, input_clmem[idx], CL_TRUE, 0, input_sizes[idx], finputs[idx], 0, NULL, NULL));
}
}
}
void Thneed::copy_output(float *foutput) {
if (output != NULL) {
size_t sz;
clGetMemObjectInfo(output, CL_MEM_SIZE, sizeof(sz), &sz, NULL);
if (debug >= 1) printf("copying %lu for output %p -> %p\n", sz, output, foutput);
CL_CHECK(clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, sz, foutput, 0, NULL, NULL));
} else {
printf("CAUTION: model output is NULL, does it have no outputs?\n");
}
}
// *********** CLQueuedKernel ***********
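// A CLQueuedKernel is a snapshot of one clEnqueueNDRangeKernel call: kernel name,
// program, work sizes and the raw argument bytes captured by thneed_clSetKernelArg,
// enough to re-create and re-enqueue the kernel when a saved model is loaded.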
CLQueuedKernel::CLQueuedKernel(Thneed *lthneed,
cl_kernel _kernel,
cl_uint _work_dim,
const size_t *_global_work_size,
const size_t *_local_work_size) {
thneed = lthneed;
kernel = _kernel;
work_dim = _work_dim;
assert(work_dim <= 3);
for (int i = 0; i < work_dim; i++) {
global_work_size[i] = _global_work_size[i];
local_work_size[i] = _local_work_size[i];
}
char _name[0x100];
clGetKernelInfo(kernel, CL_KERNEL_FUNCTION_NAME, sizeof(_name), _name, NULL);
name = string(_name);
clGetKernelInfo(kernel, CL_KERNEL_NUM_ARGS, sizeof(num_args), &num_args, NULL);
// get args
for (int i = 0; i < num_args; i++) {
char arg_name[0x100] = {0};
clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_NAME, sizeof(arg_name), arg_name, NULL);
arg_names.push_back(string(arg_name));
clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_TYPE_NAME, sizeof(arg_name), arg_name, NULL);
arg_types.push_back(string(arg_name));
args.push_back(g_args[make_pair(kernel, i)]);
args_size.push_back(g_args_size[make_pair(kernel, i)]);
}
// get program
clGetKernelInfo(kernel, CL_KERNEL_PROGRAM, sizeof(program), &program, NULL);
}
int CLQueuedKernel::get_arg_num(const char *search_arg_name) {
for (int i = 0; i < num_args; i++) {
if (arg_names[i] == search_arg_name) return i;
}
printf("failed to find %s in %s\n", search_arg_name, name.c_str());
assert(false);
}
cl_int CLQueuedKernel::exec() {
if (kernel == NULL) {
kernel = clCreateKernel(program, name.c_str(), NULL);
arg_names.clear();
arg_types.clear();
for (int j = 0; j < num_args; j++) {
char arg_name[0x100] = {0};
clGetKernelArgInfo(kernel, j, CL_KERNEL_ARG_NAME, sizeof(arg_name), arg_name, NULL);
arg_names.push_back(string(arg_name));
clGetKernelArgInfo(kernel, j, CL_KERNEL_ARG_TYPE_NAME, sizeof(arg_name), arg_name, NULL);
arg_types.push_back(string(arg_name));
cl_int ret;
if (args[j].size() != 0) {
assert(args[j].size() == args_size[j]);
ret = thneed_clSetKernelArg(kernel, j, args[j].size(), args[j].data());
} else {
ret = thneed_clSetKernelArg(kernel, j, args_size[j], NULL);
}
assert(ret == CL_SUCCESS);
}
}
if (thneed->debug >= 1) {
debug_print(thneed->debug >= 2);
}
return clEnqueueNDRangeKernel(thneed->command_queue,
kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, NULL);
}
void CLQueuedKernel::debug_print(bool verbose) {
printf("%p %56s -- ", kernel, name.c_str());
for (int i = 0; i < work_dim; i++) {
printf("%4zu ", global_work_size[i]);
}
printf(" -- ");
for (int i = 0; i < work_dim; i++) {
printf("%4zu ", local_work_size[i]);
}
printf("\n");
if (verbose) {
for (int i = 0; i < num_args; i++) {
string arg = args[i];
printf(" %s %s", arg_types[i].c_str(), arg_names[i].c_str());
void *arg_value = (void*)arg.data();
int arg_size = arg.size();
if (arg_size == 0) {
printf(" (size) %d", args_size[i]);
} else if (arg_size == 1) {
printf(" = %d", *((char*)arg_value));
} else if (arg_size == 2) {
printf(" = %d", *((short*)arg_value));
} else if (arg_size == 4) {
if (arg_types[i] == "float") {
printf(" = %f", *((float*)arg_value));
} else {
printf(" = %d", *((int*)arg_value));
}
} else if (arg_size == 8) {
cl_mem val = (cl_mem)(*((uintptr_t*)arg_value));
printf(" = %p", val);
if (val != NULL) {
cl_mem_object_type obj_type;
clGetMemObjectInfo(val, CL_MEM_TYPE, sizeof(obj_type), &obj_type, NULL);
if (arg_types[i] == "image2d_t" || arg_types[i] == "image1d_t" || obj_type == CL_MEM_OBJECT_IMAGE2D) {
cl_image_format format;
size_t width, height, depth, array_size, row_pitch, slice_pitch;
cl_mem buf;
clGetImageInfo(val, CL_IMAGE_FORMAT, sizeof(format), &format, NULL);
assert(format.image_channel_order == CL_RGBA);
assert(format.image_channel_data_type == CL_HALF_FLOAT || format.image_channel_data_type == CL_FLOAT);
clGetImageInfo(val, CL_IMAGE_WIDTH, sizeof(width), &width, NULL);
clGetImageInfo(val, CL_IMAGE_HEIGHT, sizeof(height), &height, NULL);
clGetImageInfo(val, CL_IMAGE_ROW_PITCH, sizeof(row_pitch), &row_pitch, NULL);
clGetImageInfo(val, CL_IMAGE_DEPTH, sizeof(depth), &depth, NULL);
clGetImageInfo(val, CL_IMAGE_ARRAY_SIZE, sizeof(array_size), &array_size, NULL);
clGetImageInfo(val, CL_IMAGE_SLICE_PITCH, sizeof(slice_pitch), &slice_pitch, NULL);
assert(depth == 0);
assert(array_size == 0);
assert(slice_pitch == 0);
clGetImageInfo(val, CL_IMAGE_BUFFER, sizeof(buf), &buf, NULL);
size_t sz = 0;
if (buf != NULL) clGetMemObjectInfo(buf, CL_MEM_SIZE, sizeof(sz), &sz, NULL);
printf(" image %zu x %zu rp %zu @ %p buffer %zu", width, height, row_pitch, buf, sz);
} else {
size_t sz;
clGetMemObjectInfo(val, CL_MEM_SIZE, sizeof(sz), &sz, NULL);
printf(" buffer %zu", sz);
}
}
}
printf("\n");
}
}
}
cl_int thneed_clSetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value) {
g_args_size[make_pair(kernel, arg_index)] = arg_size;
if (arg_value != NULL) {
g_args[make_pair(kernel, arg_index)] = string((char*)arg_value, arg_size);
} else {
g_args[make_pair(kernel, arg_index)] = string("");
}
cl_int ret = clSetKernelArg(kernel, arg_index, arg_size, arg_value);
return ret;
}

@ -0,0 +1,32 @@
#include "selfdrive/modeld/thneed/thneed.h"
#include <cassert>
#include "common/clutil.h"
#include "common/timing.h"
Thneed::Thneed(bool do_clinit, cl_context _context) {
context = _context;
if (do_clinit) clinit();
char *thneed_debug_env = getenv("THNEED_DEBUG");
debug = (thneed_debug_env != NULL) ? atoi(thneed_debug_env) : 0;
}
void Thneed::execute(float **finputs, float *foutput, bool slow) {
uint64_t tb, te;
if (debug >= 1) tb = nanos_since_boot();
// ****** copy inputs
copy_inputs(finputs);
// ****** run commands
clexec();
// ****** copy outputs
copy_output(foutput);
if (debug >= 1) {
te = nanos_since_boot();
printf("model exec in %lu us\n", (te-tb)/1000);
}
}

@ -0,0 +1,258 @@
#include "selfdrive/modeld/thneed/thneed.h"
#include <dlfcn.h>
#include <sys/mman.h>
#include <cassert>
#include <cerrno>
#include <cstring>
#include <map>
#include <string>
#include "common/clutil.h"
#include "common/timing.h"
Thneed *g_thneed = NULL;
int g_fd = -1;
void hexdump(uint8_t *d, int len) {
assert((len%4) == 0);
printf(" dumping %p len 0x%x\n", d, len);
for (int i = 0; i < len/4; i++) {
if (i != 0 && (i%0x10) == 0) printf("\n");
printf("%8x ", d[i]);
}
printf("\n");
}
// *********** ioctl interceptor ***********
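// libc's ioctl is wrapped via dlsym(RTLD_NEXT, "ioctl"): while a Thneed object is
// recording, the KGSL GPU command and buffer-sync ioctls issued by the OpenCL driver
// are captured as CachedCommand/CachedSync objects so they can later be replayed directly.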
extern "C" {
int (*my_ioctl)(int filedes, unsigned long request, void *argp) = NULL;
#undef ioctl
int ioctl(int filedes, unsigned long request, void *argp) {
request &= 0xFFFFFFFF; // needed on QCOM2
if (my_ioctl == NULL) my_ioctl = reinterpret_cast<decltype(my_ioctl)>(dlsym(RTLD_NEXT, "ioctl"));
Thneed *thneed = g_thneed;
// save the fd
if (request == IOCTL_KGSL_GPUOBJ_ALLOC) g_fd = filedes;
// note that this runs always, even without a thneed object
if (request == IOCTL_KGSL_DRAWCTXT_CREATE) {
struct kgsl_drawctxt_create *create = (struct kgsl_drawctxt_create *)argp;
create->flags &= ~KGSL_CONTEXT_PRIORITY_MASK;
create->flags |= 6 << KGSL_CONTEXT_PRIORITY_SHIFT; // priority from 1-15, 1 is max priority
printf("IOCTL_KGSL_DRAWCTXT_CREATE: creating context with flags 0x%x\n", create->flags);
}
if (thneed != NULL) {
if (request == IOCTL_KGSL_GPU_COMMAND) {
struct kgsl_gpu_command *cmd = (struct kgsl_gpu_command *)argp;
if (thneed->record) {
thneed->timestamp = cmd->timestamp;
thneed->context_id = cmd->context_id;
thneed->cmds.push_back(unique_ptr<CachedCommand>(new CachedCommand(thneed, cmd)));
}
if (thneed->debug >= 1) {
printf("IOCTL_KGSL_GPU_COMMAND(%2zu): flags: 0x%lx context_id: %u timestamp: %u numcmds: %d numobjs: %d\n",
thneed->cmds.size(),
cmd->flags,
cmd->context_id, cmd->timestamp, cmd->numcmds, cmd->numobjs);
}
} else if (request == IOCTL_KGSL_GPUOBJ_SYNC) {
struct kgsl_gpuobj_sync *cmd = (struct kgsl_gpuobj_sync *)argp;
struct kgsl_gpuobj_sync_obj *objs = (struct kgsl_gpuobj_sync_obj *)(cmd->objs);
if (thneed->debug >= 2) {
printf("IOCTL_KGSL_GPUOBJ_SYNC count:%d ", cmd->count);
for (int i = 0; i < cmd->count; i++) {
printf(" -- offset:0x%lx len:0x%lx id:%d op:%d ", objs[i].offset, objs[i].length, objs[i].id, objs[i].op);
}
printf("\n");
}
if (thneed->record) {
thneed->cmds.push_back(unique_ptr<CachedSync>(new
CachedSync(thneed, string((char *)objs, sizeof(struct kgsl_gpuobj_sync_obj)*cmd->count))));
}
} else if (request == IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID) {
struct kgsl_device_waittimestamp_ctxtid *cmd = (struct kgsl_device_waittimestamp_ctxtid *)argp;
if (thneed->debug >= 1) {
printf("IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID: context_id: %d timestamp: %d timeout: %d\n",
cmd->context_id, cmd->timestamp, cmd->timeout);
}
} else if (request == IOCTL_KGSL_SETPROPERTY) {
if (thneed->debug >= 1) {
struct kgsl_device_getproperty *prop = (struct kgsl_device_getproperty *)argp;
printf("IOCTL_KGSL_SETPROPERTY: 0x%x sizebytes:%zu\n", prop->type, prop->sizebytes);
if (thneed->debug >= 2) {
hexdump((uint8_t *)prop->value, prop->sizebytes);
if (prop->type == KGSL_PROP_PWR_CONSTRAINT) {
struct kgsl_device_constraint *constraint = (struct kgsl_device_constraint *)prop->value;
hexdump((uint8_t *)constraint->data, constraint->size);
}
}
}
} else if (request == IOCTL_KGSL_DRAWCTXT_CREATE || request == IOCTL_KGSL_DRAWCTXT_DESTROY) {
// this happens
} else if (request == IOCTL_KGSL_GPUOBJ_ALLOC || request == IOCTL_KGSL_GPUOBJ_FREE) {
// this happens
} else {
if (thneed->debug >= 1) {
printf("other ioctl %lx\n", request);
}
}
}
int ret = my_ioctl(filedes, request, argp);
// NOTE: This error message goes into stdout and messes up pyenv
// if (ret != 0) printf("ioctl returned %d with errno %d\n", ret, errno);
return ret;
}
}
// *********** GPUMalloc ***********
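// A single up-front KGSL GPU allocation; alloc() hands out 256-byte-aligned chunks
// from it to hold copies of the recorded command and object data.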
GPUMalloc::GPUMalloc(int size, int fd) {
struct kgsl_gpuobj_alloc alloc;
memset(&alloc, 0, sizeof(alloc));
alloc.size = size;
alloc.flags = 0x10000a00;
ioctl(fd, IOCTL_KGSL_GPUOBJ_ALLOC, &alloc);
void *addr = mmap64(NULL, alloc.mmapsize, 0x3, 0x1, fd, alloc.id*0x1000);
assert(addr != MAP_FAILED);
base = (uint64_t)addr;
remaining = size;
}
GPUMalloc::~GPUMalloc() {
// TODO: free the GPU malloced area
}
void *GPUMalloc::alloc(int size) {
void *ret = (void*)base;
size = (size+0xff) & (~0xFF);
assert(size <= remaining);
remaining -= size;
base += size;
return ret;
}
// *********** CachedSync, at the ioctl layer ***********
void CachedSync::exec() {
struct kgsl_gpuobj_sync cmd;
cmd.objs = (uint64_t)data.data();
cmd.obj_len = data.length();
cmd.count = data.length() / sizeof(struct kgsl_gpuobj_sync_obj);
int ret = ioctl(thneed->fd, IOCTL_KGSL_GPUOBJ_SYNC, &cmd);
assert(ret == 0);
}
// *********** CachedCommand, at the ioctl layer ***********
CachedCommand::CachedCommand(Thneed *lthneed, struct kgsl_gpu_command *cmd) {
thneed = lthneed;
assert(cmd->numsyncs == 0);
memcpy(&cache, cmd, sizeof(cache));
if (cmd->numcmds > 0) {
cmds = make_unique<struct kgsl_command_object[]>(cmd->numcmds);
memcpy(cmds.get(), (void *)cmd->cmdlist, sizeof(struct kgsl_command_object)*cmd->numcmds);
cache.cmdlist = (uint64_t)cmds.get();
for (int i = 0; i < cmd->numcmds; i++) {
void *nn = thneed->ram->alloc(cmds[i].size);
memcpy(nn, (void*)cmds[i].gpuaddr, cmds[i].size);
cmds[i].gpuaddr = (uint64_t)nn;
}
}
if (cmd->numobjs > 0) {
objs = make_unique<struct kgsl_command_object[]>(cmd->numobjs);
memcpy(objs.get(), (void *)cmd->objlist, sizeof(struct kgsl_command_object)*cmd->numobjs);
cache.objlist = (uint64_t)objs.get();
for (int i = 0; i < cmd->numobjs; i++) {
void *nn = thneed->ram->alloc(objs[i].size);
memset(nn, 0, objs[i].size);
objs[i].gpuaddr = (uint64_t)nn;
}
}
kq = thneed->ckq;
thneed->ckq.clear();
}
void CachedCommand::exec() {
cache.timestamp = ++thneed->timestamp;
int ret = ioctl(thneed->fd, IOCTL_KGSL_GPU_COMMAND, &cache);
if (thneed->debug >= 1) printf("CachedCommand::exec got %d\n", ret);
if (thneed->debug >= 2) {
for (auto &it : kq) {
it->debug_print(false);
}
}
assert(ret == 0);
}
// *********** Thneed ***********
Thneed::Thneed(bool do_clinit, cl_context _context) {
// TODO: QCOM2 actually requires a different context
//context = _context;
if (do_clinit) clinit();
assert(g_fd != -1);
fd = g_fd;
ram = make_unique<GPUMalloc>(0x80000, fd);
timestamp = -1;
g_thneed = this;
char *thneed_debug_env = getenv("THNEED_DEBUG");
debug = (thneed_debug_env != NULL) ? atoi(thneed_debug_env) : 0;
}
void Thneed::wait() {
struct kgsl_device_waittimestamp_ctxtid wait;
wait.context_id = context_id;
wait.timestamp = timestamp;
wait.timeout = -1;
uint64_t tb = nanos_since_boot();
int wret = ioctl(fd, IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID, &wait);
uint64_t te = nanos_since_boot();
if (debug >= 1) printf("wait %d after %lu us\n", wret, (te-tb)/1000);
}
void Thneed::execute(float **finputs, float *foutput, bool slow) {
uint64_t tb, te;
if (debug >= 1) tb = nanos_since_boot();
// ****** copy inputs
copy_inputs(finputs, true);
// ****** run commands
int i = 0;
for (auto &it : cmds) {
++i;
if (debug >= 1) printf("run %2d @ %7lu us: ", i, (nanos_since_boot()-tb)/1000);
it->exec();
if ((i == cmds.size()) || slow) wait();
}
// ****** copy outputs
copy_output(foutput);
if (debug >= 1) {
te = nanos_since_boot();
printf("model exec in %lu us\n", (te-tb)/1000);
}
}

@ -36,7 +36,7 @@ CPU usage budget
TEST_DURATION = 25
LOG_OFFSET = 8
-MAX_TOTAL_CPU = 275. # total for all 8 cores
+MAX_TOTAL_CPU = 265. # total for all 8 cores
PROCS = {
# Baseline CPU usage by process
"selfdrive.controls.controlsd": 16.0,
@ -50,8 +50,8 @@ PROCS = {
"selfdrive.locationd.paramsd": 9.0,
"./sensord": 7.0,
"selfdrive.controls.radard": 2.0,
-"selfdrive.modeld.modeld": 22.0,
-"selfdrive.modeld.dmonitoringmodeld": 21.0,
+"selfdrive.modeld.modeld": 17.0,
+"selfdrive.modeld.dmonitoringmodeld": 11.0,
"system.hardware.hardwared": 4.0,
"selfdrive.locationd.calibrationd": 2.0,
"selfdrive.locationd.torqued": 5.0,
@ -371,13 +371,14 @@ class TestOnroad:
result += "------------------------------------------------\n"
result += "----------------- Model Timing -----------------\n"
result += "------------------------------------------------\n"
+# TODO: this went up when plannerd cpu usage increased, why?
cfgs = [
-("modelV2", 0.045, 0.035),
-("driverStateV2", 0.045, 0.035),
+("modelV2", 0.050, 0.036),
+("driverStateV2", 0.050, 0.026),
]
for (s, instant_max, avg_max) in cfgs:
ts = [getattr(m, s).modelExecutionTime for m in self.msgs[s]]
-# TODO some tinygrad init happens in first iteration
+# TODO some init can happen in first iteration
ts = ts[1:]
assert max(ts) < instant_max, f"high '{s}' execution time: {max(ts)}"
assert np.mean(ts) < avg_max, f"high avg '{s}' execution time: {np.mean(ts)}"

@ -33,7 +33,7 @@ class Proc:
PROCS = [
Proc(['camerad'], 1.75, msgs=['roadCameraState', 'wideRoadCameraState', 'driverCameraState']),
Proc(['modeld'], 1.12, atol=0.2, msgs=['modelV2']),
-Proc(['dmonitoringmodeld'], 0.6, msgs=['driverStateV2']),
+Proc(['dmonitoringmodeld'], 0.5, msgs=['driverStateV2']),
Proc(['encoderd'], 0.23, msgs=[]),
]

@ -1 +1 @@
-Subproject commit 270bbd36a925d9c612f1eeb7ea0ea4ad83fec41e
+Subproject commit 9dda6d260db0255750bacff61e3cee1e580567e1

@ -652,10 +652,10 @@ name = "gymnasium"
version = "1.0.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
-{ name = "cloudpickle", marker = "platform_machine != 'aarch64' or platform_system != 'Linux'" },
-{ name = "farama-notifications", marker = "platform_machine != 'aarch64' or platform_system != 'Linux'" },
-{ name = "numpy", marker = "platform_machine != 'aarch64' or platform_system != 'Linux'" },
-{ name = "typing-extensions", marker = "platform_machine != 'aarch64' or platform_system != 'Linux'" },
+{ name = "cloudpickle" },
+{ name = "farama-notifications" },
+{ name = "numpy" },
+{ name = "typing-extensions" },
]
sdist = { url = "https://files.pythonhosted.org/packages/4e/12/1047b8fdbfcdce74022048d916e844ad7e6e1114d81d26a7aed657e3a76d/gymnasium-1.0.0.tar.gz", hash = "sha256:9d2b66f30c1b34fe3c2ce7fae65ecf365d0e9982d2b3d860235e773328a3b403", size = 821389 }
wheels = [
@ -962,22 +962,22 @@ name = "metadrive-simulator"
version = "0.4.2.3"
source = { url = "https://github.com/commaai/metadrive/releases/download/MetaDrive-minimal/metadrive_simulator-0.4.2.3-py3-none-any.whl" }
dependencies = [
-{ name = "filelock", marker = "platform_machine != 'aarch64' or platform_system != 'Linux'" },
-{ name = "gymnasium", marker = "platform_machine != 'aarch64' or platform_system != 'Linux'" },
-{ name = "lxml", marker = "platform_machine != 'aarch64' or platform_system != 'Linux'" },
-{ name = "matplotlib", marker = "platform_machine != 'aarch64' or platform_system != 'Linux'" },
-{ name = "numpy", marker = "platform_machine != 'aarch64' or platform_system != 'Linux'" },
-{ name = "opencv-python-headless", marker = "platform_machine != 'aarch64' or platform_system != 'Linux'" },
-{ name = "panda3d", marker = "platform_machine != 'aarch64' or platform_system != 'Linux'" },
-{ name = "panda3d-gltf", marker = "platform_machine != 'aarch64' or platform_system != 'Linux'" },
-{ name = "pillow", marker = "platform_machine != 'aarch64' or platform_system != 'Linux'" },
-{ name = "progressbar", marker = "platform_machine != 'aarch64' or platform_system != 'Linux'" },
-{ name = "psutil", marker = "platform_machine != 'aarch64' or platform_system != 'Linux'" },
-{ name = "pygments", marker = "platform_machine != 'aarch64' or platform_system != 'Linux'" },
-{ name = "requests", marker = "platform_machine != 'aarch64' or platform_system != 'Linux'" },
-{ name = "shapely", marker = "platform_machine != 'aarch64' or platform_system != 'Linux'" },
-{ name = "tqdm", marker = "platform_machine != 'aarch64' or platform_system != 'Linux'" },
-{ name = "yapf", marker = "platform_machine != 'aarch64' or platform_system != 'Linux'" },
+{ name = "filelock" },
+{ name = "gymnasium" },
+{ name = "lxml" },
+{ name = "matplotlib" },
+{ name = "numpy" },
+{ name = "opencv-python-headless" },
+{ name = "panda3d" },
+{ name = "panda3d-gltf" },
+{ name = "pillow" },
+{ name = "progressbar" },
+{ name = "psutil" },
+{ name = "pygments" },
+{ name = "requests" },
+{ name = "shapely" },
+{ name = "tqdm" },
+{ name = "yapf" },
]
wheels = [
{ url = "https://github.com/commaai/metadrive/releases/download/MetaDrive-minimal/metadrive_simulator-0.4.2.3-py3-none-any.whl", hash = "sha256:6242d4e37e6c592d5eb1cadf497637540d3b754b89813a88c50a93c7fc88b02d" },
@ -1234,16 +1234,27 @@ dependencies = [
{ name = "sympy" },
]
wheels = [
-{ url = "https://files.pythonhosted.org/packages/95/8d/2634e2959b34aa8a0037989f4229e9abcfa484e9c228f99633b3241768a6/onnxruntime-1.20.1-cp311-cp311-macosx_13_0_universal2.whl", hash = "sha256:06bfbf02ca9ab5f28946e0f912a562a5f005301d0c419283dc57b3ed7969bb7b", size = 30998725 },
{ url = "https://files.pythonhosted.org/packages/a5/da/c44bf9bd66cd6d9018a921f053f28d819445c4d84b4dd4777271b0fe52a2/onnxruntime-1.20.1-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f6243e34d74423bdd1edf0ae9596dd61023b260f546ee17d701723915f06a9f7", size = 11955227 },
{ url = "https://files.pythonhosted.org/packages/11/ac/4120dfb74c8e45cce1c664fc7f7ce010edd587ba67ac41489f7432eb9381/onnxruntime-1.20.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5eec64c0269dcdb8d9a9a53dc4d64f87b9e0c19801d9321246a53b7eb5a7d1bc", size = 13331703 },
-{ url = "https://files.pythonhosted.org/packages/12/f1/cefacac137f7bb7bfba57c50c478150fcd3c54aca72762ac2c05ce0532c1/onnxruntime-1.20.1-cp311-cp311-win32.whl", hash = "sha256:a19bc6e8c70e2485a1725b3d517a2319603acc14c1f1a017dda0afe6d4665b41", size = 9813977 },
-{ url = "https://files.pythonhosted.org/packages/2c/2d/2d4d202c0bcfb3a4cc2b171abb9328672d7f91d7af9ea52572722c6d8d96/onnxruntime-1.20.1-cp311-cp311-win_amd64.whl", hash = "sha256:8508887eb1c5f9537a4071768723ec7c30c28eb2518a00d0adcd32c89dea3221", size = 11329895 },
-{ url = "https://files.pythonhosted.org/packages/e5/39/9335e0874f68f7d27103cbffc0e235e32e26759202df6085716375c078bb/onnxruntime-1.20.1-cp312-cp312-macosx_13_0_universal2.whl", hash = "sha256:22b0655e2bf4f2161d52706e31f517a0e54939dc393e92577df51808a7edc8c9", size = 31007580 },
{ url = "https://files.pythonhosted.org/packages/c5/9d/a42a84e10f1744dd27c6f2f9280cc3fb98f869dd19b7cd042e391ee2ab61/onnxruntime-1.20.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f1f56e898815963d6dc4ee1c35fc6c36506466eff6d16f3cb9848cea4e8c8172", size = 11952833 },
{ url = "https://files.pythonhosted.org/packages/47/42/2f71f5680834688a9c81becbe5c5bb996fd33eaed5c66ae0606c3b1d6a02/onnxruntime-1.20.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bb71a814f66517a65628c9e4a2bb530a6edd2cd5d87ffa0af0f6f773a027d99e", size = 13333903 },
-{ url = "https://files.pythonhosted.org/packages/c8/f1/aabfdf91d013320aa2fc46cf43c88ca0182860ff15df872b4552254a9680/onnxruntime-1.20.1-cp312-cp312-win32.whl", hash = "sha256:bd386cc9ee5f686ee8a75ba74037750aca55183085bf1941da8efcfe12d5b120", size = 9814562 },
-{ url = "https://files.pythonhosted.org/packages/dd/80/76979e0b744307d488c79e41051117634b956612cc731f1028eb17ee7294/onnxruntime-1.20.1-cp312-cp312-win_amd64.whl", hash = "sha256:19c2d843eb074f385e8bbb753a40df780511061a63f9def1b216bf53860223fb", size = 11331482 },
+]
+[[package]]
+name = "onnxruntime-gpu"
+version = "1.20.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+{ name = "coloredlogs" },
+{ name = "flatbuffers" },
+{ name = "numpy" },
+{ name = "packaging" },
+{ name = "protobuf" },
+{ name = "sympy" },
+]
+wheels = [
+{ url = "https://files.pythonhosted.org/packages/e0/a5/5c2287d61f359c7342e9d59d1e3dd728a982dea85f846c7af305a801c3ca/onnxruntime_gpu-1.20.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1795e8bc6f9a1488a4d51d242edc4232a5ae60ec44ab4d4b0a7c65b3d17fcbff", size = 291519550 },
+{ url = "https://files.pythonhosted.org/packages/91/a8/6984a2fb070be372a866108e3e85c9eb6e8f0378a8567a66967d80befb75/onnxruntime_gpu-1.20.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1951f96cd534c6151721e552606d0d792ea6a4c3e57e2f10eed17cca8105e953", size = 291510989 },
]
[[package]]
@ -1251,7 +1262,7 @@ name = "opencv-python-headless"
version = "4.10.0.84"
source = { registry = "https://pypi.org/simple" }
dependencies = [
-{ name = "numpy", marker = "platform_machine != 'aarch64' or platform_system != 'Linux'" },
+{ name = "numpy" },
]
sdist = { url = "https://files.pythonhosted.org/packages/2f/7e/d20f68a5f1487adf19d74378d349932a386b1ece3be9be9915e5986db468/opencv-python-headless-4.10.0.84.tar.gz", hash = "sha256:f2017c6101d7c2ef8d7bc3b414c37ff7f54d64413a1847d89970b6b7069b4e1a", size = 95117755 }
wheels = [
@ -1280,7 +1291,8 @@ dependencies = [
{ name = "libusb1" },
{ name = "numpy" },
{ name = "onnx" },
-{ name = "onnxruntime" },
+{ name = "onnxruntime", marker = "platform_machine == 'aarch64' and platform_system == 'Linux'" },
+{ name = "onnxruntime-gpu", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" },
{ name = "psutil" },
{ name = "pyaudio" },
{ name = "pycapnp" },
@ -1378,7 +1390,8 @@ requires-dist = [
{ name = "natsort", marker = "extra == 'docs'" },
{ name = "numpy", specifier = "<2.0.0" },
{ name = "onnx", specifier = ">=1.14.0" },
-{ name = "onnxruntime", specifier = ">=1.16.3" },
+{ name = "onnxruntime", marker = "platform_machine == 'aarch64' and platform_system == 'Linux'", specifier = ">=1.16.3" },
+{ name = "onnxruntime-gpu", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'", specifier = ">=1.16.3" },
{ name = "parameterized", marker = "extra == 'dev'", specifier = ">=0.8,<0.9" },
{ name = "pre-commit-hooks", marker = "extra == 'testing'" },
{ name = "psutil" },
@ -1456,8 +1469,8 @@ name = "panda3d-gltf"
version = "0.13"
source = { registry = "https://pypi.org/simple" }
dependencies = [
-{ name = "panda3d", marker = "platform_machine != 'aarch64' or platform_system != 'Linux'" },
-{ name = "panda3d-simplepbr", marker = "platform_machine != 'aarch64' or platform_system != 'Linux'" },
+{ name = "panda3d" },
+{ name = "panda3d-simplepbr" },
]
sdist = { url = "https://files.pythonhosted.org/packages/07/7f/9f18fc3fa843a080acb891af6bcc12262e7bdf1d194a530f7042bebfc81f/panda3d-gltf-0.13.tar.gz", hash = "sha256:d06d373bdd91cf530909b669f43080e599463bbf6d3ef00c3558bad6c6b19675", size = 25573 }
wheels = [
@ -1469,8 +1482,8 @@ name = "panda3d-simplepbr"
version = "0.12.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
-{ name = "panda3d", marker = "platform_machine != 'aarch64' or platform_system != 'Linux'" },
-{ name = "typing-extensions", marker = "platform_machine != 'aarch64' or platform_system != 'Linux'" },
+{ name = "panda3d" },
+{ name = "typing-extensions" },
]
sdist = { url = "https://files.pythonhosted.org/packages/b1/af/505608eef09d7f9b822e69dc7631cd14102650b8fe1b6f60d9562d2788d9/panda3d-simplepbr-0.12.0.tar.gz", hash = "sha256:c71d490afeeb3a90455dcfde1d30c41f321a38742a97d18834e5c31016331ed5", size = 1929980 }
wheels = [
@ -4341,9 +4354,9 @@ name = "pyopencl"
version = "2024.3"
source = { registry = "https://pypi.org/simple" }
dependencies = [
-{ name = "numpy", marker = "platform_machine != 'aarch64' or platform_system != 'Linux'" },
-{ name = "platformdirs", marker = "platform_machine != 'aarch64' or platform_system != 'Linux'" },
-{ name = "pytools", marker = "platform_machine != 'aarch64' or platform_system != 'Linux'" },
+{ name = "numpy" },
+{ name = "platformdirs" },
+{ name = "pytools" },
]
sdist = { url = "https://files.pythonhosted.org/packages/ec/28/4679ea08b84532a67fd2d270c8f87aec64dab9ab99e618927b6a26ea063e/pyopencl-2024.3.tar.gz", hash = "sha256:d5d08de9b0a6d85695caba1769aceae4e7661f06951c507bd1ce8fb7a89e2413", size = 422604 }
wheels = [
@ -4397,7 +4410,7 @@ name = "pyqt5"
version = "5.15.2"
source = { registry = "https://pypi.org/simple" }
dependencies = [
-{ name = "pyqt5-sip", marker = "platform_machine != 'aarch64' or platform_system != 'Linux'" },
+{ name = "pyqt5-sip" },
]
sdist = { url = "https://files.pythonhosted.org/packages/28/6c/640e3f5c734c296a7193079a86842a789edb7988dca39eab44579088a1d1/PyQt5-5.15.2.tar.gz", hash = "sha256:372b08dc9321d1201e4690182697c5e7ffb2e0770e6b4a45519025134b12e4fc", size = 3265445 }
wheels = [
@ -4615,9 +4628,9 @@ name = "pytools"
version = "2024.1.10"
source = { registry = "https://pypi.org/simple" }
dependencies = [
-{ name = "platformdirs", marker = "platform_machine != 'aarch64' or platform_system != 'Linux'" },
-{ name = "siphash24", marker = "platform_machine != 'aarch64' or platform_system != 'Linux'" },
-{ name = "typing-extensions", marker = "platform_machine != 'aarch64' or platform_system != 'Linux'" },
+{ name = "platformdirs" },
+{ name = "siphash24" },
+{ name = "typing-extensions" },
]
sdist = { url = "https://files.pythonhosted.org/packages/ee/0f/56e109c0307f831b5d598ad73976aaaa84b4d0e98da29a642e797eaa940c/pytools-2024.1.10.tar.gz", hash = "sha256:9af6f4b045212c49be32bb31fe19606c478ee4b09631886d05a32459f4ce0a12", size = 81741 }
wheels = [
@ -4806,7 +4819,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/86/29/88c2567bc893c84d88b4c48027367c3562ae69121d568e8a3f3a8d363f4d/ruamel.yaml.clib-0.2.12-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:811ea1594b8a0fb466172c384267a4e5e367298af6b228931f273b111f17ef52", size = 703012 },
{ url = "https://files.pythonhosted.org/packages/11/46/879763c619b5470820f0cd6ca97d134771e502776bc2b844d2adb6e37753/ruamel.yaml.clib-0.2.12-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:cf12567a7b565cbf65d438dec6cfbe2917d3c1bdddfce84a9930b7d35ea59642", size = 704352 },
{ url = "https://files.pythonhosted.org/packages/02/80/ece7e6034256a4186bbe50dee28cd032d816974941a6abf6a9d65e4228a7/ruamel.yaml.clib-0.2.12-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:7dd5adc8b930b12c8fc5b99e2d535a09889941aa0d0bd06f4749e9a9397c71d2", size = 737344 },
-{ url = "https://files.pythonhosted.org/packages/f0/ca/e4106ac7e80efbabdf4bf91d3d32fc424e41418458251712f5672eada9ce/ruamel.yaml.clib-0.2.12-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1492a6051dab8d912fc2adeef0e8c72216b24d57bd896ea607cb90bb0c4981d3", size = 714498 },
{ url = "https://files.pythonhosted.org/packages/67/58/b1f60a1d591b771298ffa0428237afb092c7f29ae23bad93420b1eb10703/ruamel.yaml.clib-0.2.12-cp311-cp311-win32.whl", hash = "sha256:bd0a08f0bab19093c54e18a14a10b4322e1eacc5217056f3c063bd2f59853ce4", size = 100205 },
{ url = "https://files.pythonhosted.org/packages/b4/4f/b52f634c9548a9291a70dfce26ca7ebce388235c93588a1068028ea23fcc/ruamel.yaml.clib-0.2.12-cp311-cp311-win_amd64.whl", hash = "sha256:a274fb2cb086c7a3dea4322ec27f4cb5cc4b6298adb583ab0e211a4682f241eb", size = 118185 },
{ url = "https://files.pythonhosted.org/packages/48/41/e7a405afbdc26af961678474a55373e1b323605a4f5e2ddd4a80ea80f628/ruamel.yaml.clib-0.2.12-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:20b0f8dc160ba83b6dcc0e256846e1a02d044e13f7ea74a3d1d56ede4e48c632", size = 133433 },
@ -4815,7 +4827,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/52/a9/d39f3c5ada0a3bb2870d7db41901125dbe2434fa4f12ca8c5b83a42d7c53/ruamel.yaml.clib-0.2.12-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:749c16fcc4a2b09f28843cda5a193e0283e47454b63ec4b81eaa2242f50e4ccd", size = 706497 },
{ url = "https://files.pythonhosted.org/packages/b0/fa/097e38135dadd9ac25aecf2a54be17ddf6e4c23e43d538492a90ab3d71c6/ruamel.yaml.clib-0.2.12-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:bf165fef1f223beae7333275156ab2022cffe255dcc51c27f066b4370da81e31", size = 698042 },
{ url = "https://files.pythonhosted.org/packages/ec/d5/a659ca6f503b9379b930f13bc6b130c9f176469b73b9834296822a83a132/ruamel.yaml.clib-0.2.12-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:32621c177bbf782ca5a18ba4d7af0f1082a3f6e517ac2a18b3974d4edf349680", size = 745831 },
-{ url = "https://files.pythonhosted.org/packages/db/5d/36619b61ffa2429eeaefaab4f3374666adf36ad8ac6330d855848d7d36fd/ruamel.yaml.clib-0.2.12-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:b82a7c94a498853aa0b272fd5bc67f29008da798d4f93a2f9f289feb8426a58d", size = 715692 },
{ url = "https://files.pythonhosted.org/packages/b1/82/85cb92f15a4231c89b95dfe08b09eb6adca929ef7df7e17ab59902b6f589/ruamel.yaml.clib-0.2.12-cp312-cp312-win32.whl", hash = "sha256:e8c4ebfcfd57177b572e2040777b8abc537cdef58a2120e830124946aa9b42c5", size = 98777 },
{ url = "https://files.pythonhosted.org/packages/d7/8f/c3654f6f1ddb75daf3922c3d8fc6005b1ab56671ad56ffb874d908bfa668/ruamel.yaml.clib-0.2.12-cp312-cp312-win_amd64.whl", hash = "sha256:0467c5965282c62203273b838ae77c0d29d7638c8a4e3a1c8bdd3602c10904e4", size = 115523 },
]
@ -4922,7 +4933,7 @@ name = "shapely"
version = "2.0.6"
source = { registry = "https://pypi.org/simple" }
dependencies = [
-{ name = "numpy", marker = "platform_machine != 'aarch64' or platform_system != 'Linux'" },
+{ name = "numpy" },
]
sdist = { url = "https://files.pythonhosted.org/packages/4a/89/0d20bac88016be35ff7d3c0c2ae64b477908f1b1dfa540c5d69ac7af07fe/shapely-2.0.6.tar.gz", hash = "sha256:997f6159b1484059ec239cacaa53467fd8b5564dabe186cd84ac2944663b0bf6", size = 282361 }
wheels = [
@ -5147,7 +5158,7 @@ name = "yapf"
version = "0.43.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
-{ name = "platformdirs", marker = "platform_machine != 'aarch64' or platform_system != 'Linux'" },
+{ name = "platformdirs" },
]
sdist = { url = "https://files.pythonhosted.org/packages/23/97/b6f296d1e9cc1ec25c7604178b48532fa5901f721bcf1b8d8148b13e5588/yapf-0.43.0.tar.gz", hash = "sha256:00d3aa24bfedff9420b2e0d5d9f5ab6d9d4268e72afbf59bb3fa542781d5218e", size = 254907 }
wheels = [
