commit 1885c9e2ba
99 changed files with 1539 additions and 421 deletions
@@ -1 +1 @@
-Subproject commit 03860ae0b2b8128cae7768e4301d889e627c9275
+Subproject commit 28d458a9af49b38bd0a9052f09fbe927324320fb
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ba3fe3e61853cc1434e3e220f40c8e9d1f1b9bab8458196ba3bea6a10b82c6ed
size 72718099
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bda57c1a66944f5a633ecd739a24d62702c717a234f2fdcc499dfa1d61c3c19e
size 73147489
@@ -1 +1 @@
-Subproject commit c3d3c71aa7a37364814f029778070fcb550c7cd3
+Subproject commit 46a942d6790531cf5b94b14266140e43afcfda3e
@@ -1 +1 @@
-Subproject commit f56ebf5b776b677bf12ec772b0223274dd798999
+Subproject commit 51ccb9fbd266796e1bf6ffda8b93c4119ab09ff4
@@ -1 +1 @@
-#define COMMA_VERSION "0.8.13"
+#define COMMA_VERSION "0.8.14"
@@ -0,0 +1,39 @@
#!/usr/bin/env python3
import argparse
import pandas as pd  # pylint: disable=import-error

import cereal.messaging as messaging


if __name__ == "__main__":
  parser = argparse.ArgumentParser(description="Cabana-like table of bits for your terminal",
                                   formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument("addr", type=str, nargs=1)
  parser.add_argument("bus", type=int, default=0, nargs='?')

  args = parser.parse_args()

  addr = int(args.addr[0], 0)
  can = messaging.sub_sock('can', conflate=False, timeout=None)

  print(f"waiting for {hex(addr)} ({addr}) on bus {args.bus}...")

  latest = None
  while True:
    for msg in messaging.drain_sock(can, wait_for_one=True):
      for m in msg.can:
        if m.address == addr and m.src == args.bus:
          latest = m

    if latest is None:
      continue

    rows = []
    for b in latest.dat:
      r = list(bin(b).lstrip('0b').zfill(8))
      r += [hex(b)]
      rows.append(r)

    df = pd.DataFrame(data=rows)
    table = df.to_markdown(tablefmt='grid')
    print(f"\n\n{hex(addr)} ({addr}) on bus {args.bus}\n{table}")
@@ -0,0 +1,30 @@
#!/usr/bin/env python3
import sys
from subprocess import check_output, CalledProcessError
from panda import Panda
from panda.python.uds import UdsClient, MessageTimeoutError, SESSION_TYPE, DTC_GROUP_TYPE

try:
  check_output(["pidof", "boardd"])
  print("boardd is running, please kill openpilot before running this script! (aborted)")
  sys.exit(1)
except CalledProcessError as e:
  if e.returncode != 1:  # 1 == no process found (boardd not running)
    raise e

panda = Panda()
panda.set_safety_mode(Panda.SAFETY_ELM327)
address = 0x7DF  # functional (broadcast) address
uds_client = UdsClient(panda, address, bus=0, debug=False)
print("extended diagnostic session ...")
try:
  uds_client.diagnostic_session_control(SESSION_TYPE.EXTENDED_DIAGNOSTIC)
except MessageTimeoutError:
  pass  # functional address isn't properly handled so a timeout occurs
print("clear diagnostic info ...")
try:
  uds_client.clear_diagnostic_information(DTC_GROUP_TYPE.ALL)
except MessageTimeoutError:
  pass  # functional address isn't properly handled so a timeout occurs
print("")
print("you may need to power cycle your vehicle now")
@@ -0,0 +1,272 @@
    read_only image2d_t input,
#ifndef DEPTHWISE
    short startPackedInputChannel,
    short numPackedInputChannelsForGroup, short totalNumPackedInputChannels,
    // typo required for API compatibility
    short packedOuputChannelOffset, short totalNumPackedOutputChannels,
#else
    short totalNumPackedChannels,
#endif
    read_only image2d_t weights, __constant float *biases,
    short filterSizeX, short filterSizeY,
    write_only image2d_t output,
    short paddingX, short paddingY, short strideX, short strideY,
#ifdef SUPPORT_DILATION
    short dilationX, short dilationY,
#endif
    short neuron, float a, float b, float min_clamp, float max_clamp,
#ifndef DEPTHWISE
    // note: these are not supported
    __constant float *parameters, __constant float *batchNormBiases,
#endif
    short numOutputColumns
#ifdef SUPPORT_ACCUMULATION
    , short doAccumulate, read_only image2d_t accumulator
#endif
) {

#ifndef NUM_OUTPUTS
#define NUM_OUTPUTS 4
#endif

  // init
  const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
  short packedOutputChannel = get_global_id(0);
  short startOutputColumn = mul24((short)get_global_id(1), NUM_OUTPUTS);
  short outputRow = get_global_id(2);

#ifdef DEPTHWISE
  short totalNumPackedInputChannels = totalNumPackedChannels;
  short totalNumPackedOutputChannels = totalNumPackedChannels;
  short startPackedInputChannel = packedOutputChannel;
#endif

  short startX = mad24(mad24(startOutputColumn, strideX, -paddingX), totalNumPackedInputChannels, startPackedInputChannel);
  short strideWithChannels = mul24(strideX, totalNumPackedInputChannels);

  float4 outputValues[NUM_OUTPUTS];
  for (short i = 0; i < NUM_OUTPUTS; ++i) {
    outputValues[i] = (float4)(0, 0, 0, 0);
  }

  int2 inputLocation;
  inputLocation.y = mad24(outputRow, strideY, -paddingY);

  int2 weightLocation;
  weightLocation.x = 0;
  weightLocation.y = packedOutputChannel;

#ifdef DEPTHWISE

#ifdef SUPPORT_DILATION

  // depthwise convolution
  for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) {
    for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) {
      short dilatedStepX = mul24(totalNumPackedChannels, dilationX);
      inputLocation.x = mad24(rfColumn, dilatedStepX, startX);
      float4 inputValues[4];
      for (short i = 0; i < 4; ++i) {
        inputValues[i] = read_imagef(input, smp, inputLocation);
        inputLocation.x += strideWithChannels;
      }
      float4 weightValues = read_imagef(weights, smp, weightLocation);
      ++weightLocation.x;
      outputValues[0] += inputValues[0] * weightValues;
      outputValues[1] += inputValues[1] * weightValues;
      outputValues[2] += inputValues[2] * weightValues;
      outputValues[3] += inputValues[3] * weightValues;
    }
    inputLocation.y += dilationY;
  }

#else

  // depthwise unstrided convolution
  for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) {
    float4 inputValues[4];
    inputLocation.x = startX;
    for (short i = 1; i < 4; ++i) {
      inputValues[i] = read_imagef(input, smp, inputLocation);
      inputLocation.x += totalNumPackedOutputChannels;
    }
    for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) {
      inputValues[0] = inputValues[1];
      inputValues[1] = inputValues[2];
      inputValues[2] = inputValues[3];
      inputValues[3] = read_imagef(input, smp, inputLocation);
      inputLocation.x += totalNumPackedChannels;
      float4 weightValues = read_imagef(weights, smp, weightLocation);
      ++weightLocation.x;
      outputValues[0] += inputValues[0] * weightValues;
      outputValues[1] += inputValues[1] * weightValues;
      outputValues[2] += inputValues[2] * weightValues;
      outputValues[3] += inputValues[3] * weightValues;
    }
    ++inputLocation.y;
  }

#endif

#elif defined(ONLY_1X1_CONV)

  // 1x1 convolution
  short endPackedInputChannel = startPackedInputChannel + numPackedInputChannelsForGroup;
  for (short packedInputChannel = startPackedInputChannel; packedInputChannel < endPackedInputChannel; ++packedInputChannel) {
    float4 weightValues[4];
    for (short outChIdx = 0; outChIdx < 4; ++outChIdx) {
      weightValues[outChIdx] = read_imagef(weights, smp, weightLocation);
      ++weightLocation.x;
    }

    inputLocation.x = startX + packedInputChannel;
    float4 inputValues[NUM_OUTPUTS];
    for (short i = 0; i < NUM_OUTPUTS; ++i) {
      inputValues[i] = read_imagef(input, smp, inputLocation);
      inputLocation.x += strideWithChannels;
    }

    for (short i = 0; i < NUM_OUTPUTS; ++i) {
      float4 curOutputValues = outputValues[i];
      curOutputValues.x += inputValues[i].x * weightValues[0].x;
      curOutputValues.x += inputValues[i].y * weightValues[0].y;
      curOutputValues.x += inputValues[i].z * weightValues[0].z;
      curOutputValues.x += inputValues[i].w * weightValues[0].w;
      curOutputValues.y += inputValues[i].x * weightValues[1].x;
      curOutputValues.y += inputValues[i].y * weightValues[1].y;
      curOutputValues.y += inputValues[i].z * weightValues[1].z;
      curOutputValues.y += inputValues[i].w * weightValues[1].w;
      curOutputValues.z += inputValues[i].x * weightValues[2].x;
      curOutputValues.z += inputValues[i].y * weightValues[2].y;
      curOutputValues.z += inputValues[i].z * weightValues[2].z;
      curOutputValues.z += inputValues[i].w * weightValues[2].w;
      curOutputValues.w += inputValues[i].x * weightValues[3].x;
      curOutputValues.w += inputValues[i].y * weightValues[3].y;
      curOutputValues.w += inputValues[i].z * weightValues[3].z;
      curOutputValues.w += inputValues[i].w * weightValues[3].w;
      outputValues[i] = curOutputValues;
    }
  }
  packedOutputChannel += packedOuputChannelOffset;

#else

  // normal convolution
  for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) {
    for (short packedInputChannel = 0; packedInputChannel < numPackedInputChannelsForGroup; ++packedInputChannel) {
      short startXForChannel = startX + packedInputChannel;
      for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) {

        float4 weightValues[4];
        for (short outChIdx = 0; outChIdx < 4; ++outChIdx) {
          weightValues[outChIdx] = read_imagef(weights, smp, weightLocation);
          ++weightLocation.x;
        }

#ifdef SUPPORT_DILATION
        short dilatedStepX = mul24(totalNumPackedInputChannels, dilationX);
        inputLocation.x = mad24(rfColumn, dilatedStepX, startXForChannel);
#else
        inputLocation.x = mad24(rfColumn, totalNumPackedInputChannels, startXForChannel);
#endif
        float4 inputValues[NUM_OUTPUTS];
        for (short i = 0; i < NUM_OUTPUTS; ++i) {
          inputValues[i] = read_imagef(input, smp, inputLocation);
          inputLocation.x += strideWithChannels;
        }
        for (short i = 0; i < NUM_OUTPUTS; ++i) {
          float4 curOutputValues = outputValues[i];
          curOutputValues.x += inputValues[i].x * weightValues[0].x;
          curOutputValues.x += inputValues[i].y * weightValues[0].y;
          curOutputValues.x += inputValues[i].z * weightValues[0].z;
          curOutputValues.x += inputValues[i].w * weightValues[0].w;
          curOutputValues.y += inputValues[i].x * weightValues[1].x;
          curOutputValues.y += inputValues[i].y * weightValues[1].y;
          curOutputValues.y += inputValues[i].z * weightValues[1].z;
          curOutputValues.y += inputValues[i].w * weightValues[1].w;
          curOutputValues.z += inputValues[i].x * weightValues[2].x;
          curOutputValues.z += inputValues[i].y * weightValues[2].y;
          curOutputValues.z += inputValues[i].z * weightValues[2].z;
          curOutputValues.z += inputValues[i].w * weightValues[2].w;
          curOutputValues.w += inputValues[i].x * weightValues[3].x;
          curOutputValues.w += inputValues[i].y * weightValues[3].y;
          curOutputValues.w += inputValues[i].z * weightValues[3].z;
          curOutputValues.w += inputValues[i].w * weightValues[3].w;
          outputValues[i] = curOutputValues;
        }
      }
    }
#ifdef SUPPORT_DILATION
    inputLocation.y += dilationY;
#else
    ++inputLocation.y;
#endif
  }
  packedOutputChannel += packedOuputChannelOffset;
#endif

  // bias
  short outputChannel = mul24(packedOutputChannel, 4);
  float4 biasValues = vload4(0, biases + outputChannel);
  for (short i = 0; i < NUM_OUTPUTS; ++i) {
    outputValues[i] += biasValues;
  }

#ifdef SUPPORT_ACCUMULATION
  // accumulate
  if (doAccumulate) {
    int2 outputLocation;
    short outputColumn = startOutputColumn;
    outputLocation.y = outputRow;
    for (short i = 0; i < NUM_OUTPUTS; ++i) {
      outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel);
      if (outputColumn < numOutputColumns) {
        outputValues[i] += read_imagef(accumulator, smp, outputLocation);
      }
      ++outputColumn;
    }
  }
#endif

  // activation
  switch (neuron) {
    case 1:
      for (short i = 0; i < NUM_OUTPUTS; ++i) {
        outputValues[i] = max(outputValues[i], 0.0f);
      }
      break;
    case 2:
      for (short i = 0; i < NUM_OUTPUTS; ++i) {
        outputValues[i] = a * tanh(b * outputValues[i]);
      }
      break;
    case 3:
      for (short i = 0; i < NUM_OUTPUTS; ++i) {
        outputValues[i] = native_recip(1.0f + native_exp(-a * outputValues[i] + b));
      }
      break;
    case 4:
      for (short i = 0; i < NUM_OUTPUTS; ++i) {
        outputValues[i] = max(outputValues[i], min_clamp);
        outputValues[i] = min(outputValues[i], max_clamp);
      }
      break;
    case 5:
      for (short i = 0; i < NUM_OUTPUTS; ++i) {
        outputValues[i] = max(outputValues[i], 0.0f) + a * (native_exp(min(outputValues[i], 0.0f)) - 1.0f);
      }
      break;
  }

  // output
  int2 outputLocation;
  short outputColumn = startOutputColumn;
  outputLocation.y = outputRow;
  for (short i = 0; i < NUM_OUTPUTS; ++i) {
    outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel);
    if (outputColumn < numOutputColumns) {
      write_imagef(output, outputLocation, outputValues[i]);
    }
    ++outputColumn;
  }
}
@@ -0,0 +1,4 @@
#define SUPPORT_DILATION

__kernel void convolution_horizontal_reduced_reads(
#include "convolution_.cl"
@@ -0,0 +1,5 @@
#define ONLY_1X1_CONV
#define SUPPORT_ACCUMULATION

__kernel void convolution_horizontal_reduced_reads_1x1(
#include "convolution_.cl"
@@ -0,0 +1,4 @@
#define NUM_OUTPUTS 5

__kernel void convolution_horizontal_reduced_reads_5_outputs(
#include "convolution_.cl"
@@ -0,0 +1,5 @@
#define DEPTHWISE
#define SUPPORT_DILATION

__kernel void convolution_horizontal_reduced_reads_depthwise(
#include "convolution_.cl"
@@ -0,0 +1,4 @@
#define DEPTHWISE

__kernel void convolution_horizontal_reduced_reads_depthwise_stride_1(
#include "convolution_.cl"
@@ -0,0 +1,266 @@
#include <map>
#include <string>
#include <string.h>
#include <assert.h>
#include "thneed.h"

extern map<cl_program, string> g_program_source;

static int is_same_size_image(cl_mem a, cl_mem b) {
  size_t a_width, a_height, a_depth, a_array_size, a_row_pitch, a_slice_pitch;
  clGetImageInfo(a, CL_IMAGE_WIDTH, sizeof(a_width), &a_width, NULL);
  clGetImageInfo(a, CL_IMAGE_HEIGHT, sizeof(a_height), &a_height, NULL);
  clGetImageInfo(a, CL_IMAGE_DEPTH, sizeof(a_depth), &a_depth, NULL);
  clGetImageInfo(a, CL_IMAGE_ARRAY_SIZE, sizeof(a_array_size), &a_array_size, NULL);
  clGetImageInfo(a, CL_IMAGE_ROW_PITCH, sizeof(a_row_pitch), &a_row_pitch, NULL);
  clGetImageInfo(a, CL_IMAGE_SLICE_PITCH, sizeof(a_slice_pitch), &a_slice_pitch, NULL);

  size_t b_width, b_height, b_depth, b_array_size, b_row_pitch, b_slice_pitch;
  clGetImageInfo(b, CL_IMAGE_WIDTH, sizeof(b_width), &b_width, NULL);
  clGetImageInfo(b, CL_IMAGE_HEIGHT, sizeof(b_height), &b_height, NULL);
  clGetImageInfo(b, CL_IMAGE_DEPTH, sizeof(b_depth), &b_depth, NULL);
  clGetImageInfo(b, CL_IMAGE_ARRAY_SIZE, sizeof(b_array_size), &b_array_size, NULL);
  clGetImageInfo(b, CL_IMAGE_ROW_PITCH, sizeof(b_row_pitch), &b_row_pitch, NULL);
  clGetImageInfo(b, CL_IMAGE_SLICE_PITCH, sizeof(b_slice_pitch), &b_slice_pitch, NULL);

  return (a_width == b_width) && (a_height == b_height) &&
         (a_depth == b_depth) && (a_array_size == b_array_size) &&
         (a_row_pitch == b_row_pitch) && (a_slice_pitch == b_slice_pitch);
}

static cl_mem make_image_like(cl_context context, cl_mem val) {
  cl_image_format format;
  size_t width, height, row_pitch;
  clGetImageInfo(val, CL_IMAGE_FORMAT, sizeof(format), &format, NULL);
  assert(format.image_channel_order == CL_RGBA);
  assert(format.image_channel_data_type == CL_HALF_FLOAT);
  clGetImageInfo(val, CL_IMAGE_WIDTH, sizeof(width), &width, NULL);
  clGetImageInfo(val, CL_IMAGE_HEIGHT, sizeof(height), &height, NULL);
  clGetImageInfo(val, CL_IMAGE_ROW_PITCH, sizeof(row_pitch), &row_pitch, NULL);

  cl_image_desc desc = {0};
  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
  desc.image_width = width;
  desc.image_height = height;
  desc.image_row_pitch = row_pitch;

  cl_mem buf = clCreateBuffer(context, CL_MEM_READ_WRITE, row_pitch*height, NULL, NULL);
  assert(buf != NULL);
  desc.buffer = buf;

  cl_int err;
  cl_mem tmp = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
  //printf("got %d for image %zux%zu %zu\n", err, width, height, row_pitch);
  assert(tmp != NULL);

  return tmp;
}

// convolution_horizontal_reduced_reads_1x1 is 66% of the model runtime
// make that faster and the model gets faster

// this cuts ~2 ms off the model runtime right now
int Thneed::optimize() {
  const char *kernel_path = getenv("KERNEL_PATH");
  if (!kernel_path) { kernel_path = "/data/openpilot/selfdrive/modeld/thneed/kernels"; printf("no KERNEL_PATH set, defaulting to %s\n", kernel_path); }

  // load custom kernels
  map<string, cl_program> g_programs;
  for (auto &k : kq) {
    // replace program?
    if (g_programs.find(k->name) == g_programs.end()) {
      char fn[0x100];
      snprintf(fn, sizeof(fn), "%s/%s.cl", kernel_path, k->name.c_str());
      FILE *g = fopen(fn, "rb");
      if (g != NULL) {
        char *src[0x10000];
        const char *srcs[1]; srcs[0] = (const char *)src;
        memset(src, 0, sizeof(src));
        size_t length = fread(src, 1, sizeof(src), g);
        fclose(g);

        printf("building kernel %s\n", k->name.c_str());
        k->program = clCreateProgramWithSource(context, 1, srcs, &length, NULL);
        char options[0x100];
        snprintf(options, sizeof(options)-1, "-I %s", kernel_path);
        int err = clBuildProgram(k->program, 1, &device_id, options, NULL, NULL);

        if (err != 0) {
          printf("got err %d\n", err);
          size_t err_length;
          char buffer[2048];
          clGetProgramBuildInfo(k->program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &err_length);
          buffer[err_length] = '\0';
          printf("%s\n", buffer);
        }
        assert(err == 0);

        // save in cache
        g_programs[k->name] = k->program;
        g_program_source[k->program] = string((char *)src, length);
      } else {
        g_programs[k->name] = NULL;
      }
    } else {
      // cached replacement
      if (g_programs[k->name] != NULL) {
        k->program = g_programs[k->name];
      }
    }

    // hack in accumulator to convolution_horizontal_reduced_reads_1x1
    if (k->name == "convolution_horizontal_reduced_reads_1x1") {
      k->arg_names.push_back("doAccumulate");
      short doAccumulate = 0;
      k->args.push_back(string((char *)&doAccumulate, sizeof(doAccumulate)));
      k->args_size.push_back(2);
      k->arg_names.push_back("accumulator");
      k->args.push_back(k->args[k->get_arg_num("output")]);
      k->args_size.push_back(8);
      k->num_args += 2;
    }

    // assert that parameters + batchNormBiases are not used
    // since they aren't supported in custom replacement kernels
    if (k->name == "convolution_horizontal_reduced_reads_1x1" ||
        k->name == "convolution_horizontal_reduced_reads" ||
        k->name == "convolution_horizontal_reduced_reads_5_outputs") {
      string p1 = k->args[k->get_arg_num("parameters")];
      string p2 = k->args[k->get_arg_num("batchNormBiases")];
      assert(p1.length() == 8 && *((uint64_t*)p1.data()) == 0);
      assert(p2.length() == 8 && *((uint64_t*)p2.data()) == 0);
    }
  }

  // optimizer
  size_t start_size;
  do {
    start_size = kq.size();

    // get optimizations
    map<string, string> replacements;
    for (int i = 0; i < kq.size(); i++) {
      // fusing elementwise_sum + activate_image will save 3 enqueues

      // delete useless copy layers
      // saves ~0.7 ms
      if (kq[i]->name == "concatenation" || kq[i]->name == "flatten") {
        string in = kq[i]->args[kq[i]->get_arg_num("input")];
        string out = kq[i]->args[kq[i]->get_arg_num("output")];
        if (is_same_size_image(*(cl_mem*)in.data(), *(cl_mem*)out.data())) {
          cl_mem tmp = make_image_like(context, *(cl_mem *)in.data());
          replacements[in] = string((char *)&tmp, sizeof(tmp));
          replacements[out] = string((char *)&tmp, sizeof(tmp));

          kq.erase(kq.begin()+i); --i;
        }
      }

      // NOTE: if activations/accumulation are done in the wrong order, this will be wrong

      // fuse activations into convs and fc_Wtx
      // saves ~1.5 ms
      // NOTE: this changes the outputs because of rounding, should be better now!
      if (i != 0 && kq[i]->name == "activate_image") {
        if (kq[i-1]->name == "convolution_horizontal_reduced_reads_1x1" ||
            kq[i-1]->name == "convolution_horizontal_reduced_reads_5_outputs" ||
            kq[i-1]->name == "convolution_horizontal_reduced_reads" ||
            kq[i-1]->name == "convolution_horizontal_reduced_reads_depthwise" ||
            kq[i-1]->name == "convolution_horizontal_reduced_reads_depthwise_stride_1" ||
            kq[i-1]->name == "fc_Wtx") {
          string lastout = kq[i-1]->args[kq[i-1]->get_arg_num("output")];
          string in = kq[i]->args[kq[i]->get_arg_num("input")];
          string out = kq[i]->args[kq[i]->get_arg_num("output")];

          if (lastout == in) {
            short neuron = *(int*)kq[i]->args[kq[i]->get_arg_num("neuron")].data();
            assert(neuron <= 5);

            // ELU isn't supported in fc_Wtx
            assert(!(kq[i-1]->name == "fc_Wtx" && neuron == 5));

            kq[i-1]->args[kq[i-1]->get_arg_num("neuron")] = string((char *)&neuron, sizeof(neuron));

            cl_mem tmp = make_image_like(context, *(cl_mem *)lastout.data());
            replacements[in] = string((char *)&tmp, sizeof(tmp));
            replacements[out] = string((char *)&tmp, sizeof(tmp));

            kq.erase(kq.begin()+i); --i;
          }
        }
      }

      // fuse accumulation into convs and fc_Wtx
      if (i != 0 && kq[i]->name == "elementwise_sum") {
        if (kq[i-1]->name == "convolution_horizontal_reduced_reads_1x1" ||
            kq[i-1]->name == "fc_Wtx") {
          string lastout = kq[i-1]->args[kq[i-1]->get_arg_num("output")];
          string a = kq[i]->args[kq[i]->get_arg_num("a")];
          string b = kq[i]->args[kq[i]->get_arg_num("b")];
          string out = kq[i]->args[kq[i]->get_arg_num("output")];

          if (lastout == a) {
            kq[i-1]->args[kq[i-1]->get_arg_num("accumulator")] = b;
          } else if (lastout == b) {
            kq[i-1]->args[kq[i-1]->get_arg_num("accumulator")] = a;
          } else {
            continue;
          }

          cl_mem tmp = make_image_like(context, *(cl_mem *)lastout.data());
          replacements[lastout] = string((char *)&tmp, sizeof(tmp));
          replacements[out] = string((char *)&tmp, sizeof(tmp));

          short doAccumulate = 1;
          kq[i-1]->args[kq[i-1]->get_arg_num("doAccumulate")] = string((char *)&doAccumulate, sizeof(doAccumulate));

          kq.erase(kq.begin()+i); --i;
        }
      }
    }

    // remap inputs and outputs, and clear the kernels
    for (int i = 0; i < kq.size(); i++) {
      kq[i]->kernel = NULL;
      for (int j = 0; j < kq[i]->num_args; j++) {
        if (replacements.find(kq[i]->args[j]) != replacements.end()) {
          kq[i]->args[j] = replacements[kq[i]->args[j]];
        }
      }
    }

    printf("optimize %lu -> %lu\n", start_size, kq.size());
  } while (kq.size() != start_size);

  size_t work_group_size = 0;
  clGetDeviceInfo(device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(work_group_size), &work_group_size, NULL);
  printf("max work group size %lu\n", work_group_size);

  // local work group optimizer
  for (auto &k : kq) {
    // only do it for convs, since others might share memory
    if (k->name.rfind("convolution_", 0) == 0) {
      int best = -1;
      if (k->local_work_size[0] * k->local_work_size[1] * k->local_work_size[2] < work_group_size/2) {
        uint64_t base_time = k->benchmark();
        uint64_t best_time = base_time;
        for (int i = 0; i < 3; i++) {
          k->local_work_size[i] *= 2;
          uint64_t this_time = k->benchmark();
          if (this_time < best_time) {
            best = i;
            best_time = this_time;
          }
          k->local_work_size[i] /= 2;
        }
        if (best != -1) {
          k->local_work_size[best] *= 2;
          //printf("%s %.2f ms doubled %d to %.2f ms\n", k->name.c_str(), base_time/1e6, best, best_time/1e6);
        }
      }
    }
  }

  return 0;
}
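The fusion passes above work purely by rewriting kernel arguments: when an activate_image (or elementwise_sum) immediately follows a supported conv/fc kernel and consumes its output, the activation's neuron value (or the other summand as accumulator) is written into the preceding kernel's args, the intermediate image is remapped via the replacements map, and the extra enqueue is dropped; the do/while repeats until the queue stops shrinking. A toy Python model of the activation fusion, using a made-up Kernel record rather than the C++ types, and simplified to rewire the conv's output directly instead of allocating a fresh image via make_image_like:

# sketch: simplified model of the activation-fusion pass over the kernel queue
from dataclasses import dataclass, field

@dataclass
class Kernel:
  name: str
  args: dict = field(default_factory=dict)  # arg name -> value; buffers are plain strings here

CONVS = {"convolution_horizontal_reduced_reads_1x1", "fc_Wtx"}  # subset, for illustration

def fuse_activations(kq):
  changed = True
  while changed:            # keep passing until the queue stops shrinking
    changed, out = False, []
    for k in kq:
      if (out and k.name == "activate_image" and out[-1].name in CONVS
          and out[-1].args.get("output") == k.args.get("input")):
        out[-1].args["neuron"] = k.args["neuron"]  # fold the activation into the conv
        out[-1].args["output"] = k.args["output"]  # conv now writes where the activation wrote
        changed = True
      else:
        out.append(k)
    kq = out
  return kq

kq = [Kernel("convolution_horizontal_reduced_reads_1x1", {"output": "buf0", "neuron": 0}),
      Kernel("activate_image", {"input": "buf0", "output": "buf1", "neuron": 1})]
print([k.name for k in fuse_activations(kq)])  # ['convolution_horizontal_reduced_reads_1x1']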
@@ -1 +1 @@
-7d3ad941bc4ba4c923af7a1d7b48544bfc0d3e13
+19720e79b1c5136a882efd689651d9044e2e2007
@@ -1 +1 @@
-0c4da879ace9c1517c2324b35da7ff05a4744dd9
+67c8f283858998b75ac28879e1350a589a968e5d