From cb0b7375b728d1b6e92db68c9ba55f0f54c09a3f Mon Sep 17 00:00:00 2001
From: HaraldSchafer <harald.the.engineer@gmail.com>
Date: Wed, 5 Oct 2022 21:43:38 -0700
Subject: [PATCH] Rocket Launcher Model (#25963)

* 1456d261-d232-4654-8885-4d9fde883894/440 6b7d7cec-ead8-40f3-86cc-86d52c9b03fe/300

* compute only 9 tokens: 1456d261-d232-4654-8885-4d9fde883894/440 6b7d7cec-ead8-40f3-86cc-86d52c9b03fe/300

* tinygrad: cleanup gather

* 1456d261-d232-4654-8885-4d9fde883894/440 6b7d7cec-ead8-40f3-86cc-86d52c9b03fe/700

* empty commit for tests

* bump tinygrad

* dont use tinygrad matmul for now

* bump tinygrad

* 1456d261-d232-4654-8885-4d9fde883894/440 e63ab895-2222-4abd-a9a5-af86bb70e260/700

* float16 1456d261-d232-4654-8885-4d9fde883894/440 e63ab895-2222-4abd-a9a5-af86bb70e260/700

* increase steer rate cost

* Revert "increase steer rate cost"

This reverts commit 74ce9ab9be7ef17ecfec931f96851b12f37f2336.

* fork tinygrad

* empty commit for tests

* basics

* Kinda works

* new lat

* new tuning

* Move LATMPCN so scons compiles

* Update long weights

* Add tinygrad optim

* Update model ref

* update weights

* Update ref

* Try

* Error message for field ignore

* update model regf

* ref commit

* Fix onnx test

Co-authored-by: Yassine Yousfi <yyousfi1@binghamton.edu>
---
 .gitmodules                                   |  2 +-
 release/files_common                          |  1 +
 selfdrive/controls/lib/drive_helpers.py       |  6 --
 .../controls/lib/lateral_mpc_lib/lat_mpc.py   | 63 ++++++++++---------
 selfdrive/controls/lib/lateral_planner.py     | 18 +++---
 .../lib/longitudinal_mpc_lib/long_mpc.py      |  2 +-
 .../controls/lib/longitudinal_planner.py      |  6 +-
 selfdrive/controls/tests/test_lateral_mpc.py  |  5 +-
 selfdrive/modeld/SConscript                   |  4 +-
 selfdrive/modeld/models/driving.cc            | 16 +++--
 selfdrive/modeld/models/driving.h             | 15 ++++-
 selfdrive/modeld/models/supercombo.onnx       |  4 +-
 selfdrive/modeld/runners/onnx_runner.py       | 13 ++--
 selfdrive/modeld/thneed/thneed_common.cc      |  2 +-
 selfdrive/test/process_replay/compare_logs.py |  2 +-
 selfdrive/test/process_replay/model_replay.py |  4 +-
 .../process_replay/model_replay_ref_commit    |  2 +-
 selfdrive/test/process_replay/ref_commit      |  2 +-
 tinygrad_repo                                 |  2 +-
 19 files changed, 93 insertions(+), 76 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index 26f93ef164..544c95c943 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -18,4 +18,4 @@
   url = ../../commaai/body.git
 [submodule "tinygrad"]
   path = tinygrad_repo
-  url = https://github.com/geohot/tinygrad.git
+  url = ../../commaai/tinygrad.git
diff --git a/release/files_common b/release/files_common
index 0c341eb3f6..07ffaf8501 100644
--- a/release/files_common
+++ b/release/files_common
@@ -574,3 +574,4 @@ tinygrad_repo/tinygrad/ops.py
 tinygrad_repo/tinygrad/shapetracker.py
 tinygrad_repo/tinygrad/tensor.py
 tinygrad_repo/tinygrad/nn/__init__.py
+tinygrad_repo/tinygrad/nn/optim.py
diff --git a/selfdrive/controls/lib/drive_helpers.py b/selfdrive/controls/lib/drive_helpers.py
index e8f9585a6f..f81cd0c40c 100644
--- a/selfdrive/controls/lib/drive_helpers.py
+++ b/selfdrive/controls/lib/drive_helpers.py
@@ -33,12 +33,6 @@ CRUISE_INTERVAL_SIGN = {
 }
 
 
-class MPC_COST_LAT:
-  PATH = 1.0
-  HEADING = 1.0
-  STEER_RATE = 1.0
-
-
 def apply_deadzone(error, deadzone):
   if error > deadzone:
     error -= deadzone
diff --git a/selfdrive/controls/lib/lateral_mpc_lib/lat_mpc.py b/selfdrive/controls/lib/lateral_mpc_lib/lat_mpc.py
index c0e7358160..07efad73c9 100755
--- a/selfdrive/controls/lib/lateral_mpc_lib/lat_mpc.py
+++ b/selfdrive/controls/lib/lateral_mpc_lib/lat_mpc.py
@@ -5,7 +5,6 @@ import numpy as np
 from casadi import SX, vertcat, sin, cos
 
 from common.realtime import sec_since_boot
-from selfdrive.controls.lib.drive_helpers import LAT_MPC_N as N
 from selfdrive.modeld.constants import T_IDXS
 
 if __name__ == '__main__':  # generating code
@@ -18,6 +17,9 @@ EXPORT_DIR = os.path.join(LAT_MPC_DIR, "c_generated_code")
 JSON_FILE = os.path.join(LAT_MPC_DIR, "acados_ocp_lat.json")
 X_DIM = 4
 P_DIM = 2
+N = 16
+COST_E_DIM = 3
+COST_DIM = COST_E_DIM + 1
 MODEL_NAME = 'lat'
 ACADOS_SOLVER_TYPE = 'SQP_RTI'
 
@@ -29,8 +31,8 @@ def gen_lat_model():
   x_ego = SX.sym('x_ego')
   y_ego = SX.sym('y_ego')
   psi_ego = SX.sym('psi_ego')
-  curv_ego = SX.sym('curv_ego')
-  model.x = vertcat(x_ego, y_ego, psi_ego, curv_ego)
+  psi_rate_ego = SX.sym('psi_rate_ego')
+  model.x = vertcat(x_ego, y_ego, psi_ego, psi_rate_ego)
 
   # parameters
   v_ego = SX.sym('v_ego')
@@ -38,22 +40,22 @@ def gen_lat_model():
   model.p = vertcat(v_ego, rotation_radius)
 
   # controls
-  curv_rate = SX.sym('curv_rate')
-  model.u = vertcat(curv_rate)
+  psi_accel_ego = SX.sym('psi_accel_ego')
+  model.u = vertcat(psi_accel_ego)
 
   # xdot
   x_ego_dot = SX.sym('x_ego_dot')
   y_ego_dot = SX.sym('y_ego_dot')
   psi_ego_dot = SX.sym('psi_ego_dot')
-  curv_ego_dot = SX.sym('curv_ego_dot')
+  psi_rate_ego_dot = SX.sym('psi_rate_ego_dot')
 
-  model.xdot = vertcat(x_ego_dot, y_ego_dot, psi_ego_dot, curv_ego_dot)
+  model.xdot = vertcat(x_ego_dot, y_ego_dot, psi_ego_dot, psi_rate_ego_dot)
 
   # dynamics model
-  f_expl = vertcat(v_ego * cos(psi_ego) - rotation_radius * sin(psi_ego) * (v_ego * curv_ego),
-                   v_ego * sin(psi_ego) + rotation_radius * cos(psi_ego) * (v_ego * curv_ego),
-                   v_ego * curv_ego,
-                   curv_rate)
+  f_expl = vertcat(v_ego * cos(psi_ego) - rotation_radius * sin(psi_ego) * psi_rate_ego,
+                   v_ego * sin(psi_ego) + rotation_radius * cos(psi_ego) * psi_rate_ego,
+                   psi_rate_ego,
+                   psi_accel_ego)
   model.f_impl_expr = model.xdot - f_expl
   model.f_expl_expr = f_expl
   return model
@@ -72,26 +74,28 @@ def gen_lat_ocp():
   ocp.cost.cost_type = 'NONLINEAR_LS'
   ocp.cost.cost_type_e = 'NONLINEAR_LS'
 
-  Q = np.diag([0.0, 0.0])
-  QR = np.diag([0.0, 0.0, 0.0])
+  Q = np.diag(np.zeros(COST_E_DIM))
+  QR = np.diag(np.zeros(COST_DIM))
 
   ocp.cost.W = QR
   ocp.cost.W_e = Q
 
-  y_ego, psi_ego = ocp.model.x[1], ocp.model.x[2]
-  curv_rate = ocp.model.u[0]
+  y_ego, psi_ego, psi_rate_ego = ocp.model.x[1], ocp.model.x[2], ocp.model.x[3]
+  psi_rate_ego_dot = ocp.model.u[0]
   v_ego = ocp.model.p[0]
 
   ocp.parameter_values = np.zeros((P_DIM, ))
 
-  ocp.cost.yref = np.zeros((3, ))
-  ocp.cost.yref_e = np.zeros((2, ))
+  ocp.cost.yref = np.zeros((COST_DIM, ))
+  ocp.cost.yref_e = np.zeros((COST_E_DIM, ))
   # TODO hacky weights to keep behavior the same
   ocp.model.cost_y_expr = vertcat(y_ego,
-                                  ((v_ego +5.0) * psi_ego),
-                                  ((v_ego + 5.0) * 4.0 * curv_rate))
+                                  ((v_ego + 5.0) * psi_ego),
+                                  ((v_ego + 5.0) * psi_rate_ego),
+                                  ((v_ego + 5.0) * psi_rate_ego_dot))
   ocp.model.cost_y_expr_e = vertcat(y_ego,
-                                    ((v_ego +5.0) * psi_ego))
+                                  ((v_ego + 5.0) * psi_ego),
+                                  ((v_ego + 5.0) * psi_rate_ego))
 
   # set constraints
   ocp.constraints.constr_type = 'BGH'
@@ -124,10 +128,10 @@ class LateralMpc():
   def reset(self, x0=np.zeros(X_DIM)):
     self.x_sol = np.zeros((N+1, X_DIM))
     self.u_sol = np.zeros((N, 1))
-    self.yref = np.zeros((N+1, 3))
+    self.yref = np.zeros((N+1, COST_DIM))
     for i in range(N):
       self.solver.cost_set(i, "yref", self.yref[i])
-    self.solver.cost_set(N, "yref", self.yref[N][:2])
+    self.solver.cost_set(N, "yref", self.yref[N][:COST_E_DIM])
 
     # Somehow needed for stable init
     for i in range(N+1):
@@ -140,14 +144,13 @@ class LateralMpc():
     self.solve_time = 0.0
     self.cost = 0
 
-  def set_weights(self, path_weight, heading_weight, steer_rate_weight):
-    W = np.asfortranarray(np.diag([path_weight, heading_weight, steer_rate_weight]))
+  def set_weights(self, path_weight, heading_weight, yaw_rate_weight, yaw_accel_cost):
+    W = np.asfortranarray(np.diag([path_weight, heading_weight, yaw_rate_weight, yaw_accel_cost]))
     for i in range(N):
       self.solver.cost_set(i, 'W', W)
-    #TODO hacky weights to keep behavior the same
-    self.solver.cost_set(N, 'W', (3/20.)*W[:2,:2])
+    self.solver.cost_set(N, 'W', W[:COST_E_DIM,:COST_E_DIM])
 
-  def run(self, x0, p, y_pts, heading_pts, curv_rate_pts):
+  def run(self, x0, p, y_pts, heading_pts, yaw_rate_pts):
     x0_cp = np.copy(x0)
     p_cp = np.copy(p)
     self.solver.constraints_set(0, "lbx", x0_cp)
@@ -155,13 +158,13 @@ class LateralMpc():
     self.yref[:,0] = y_pts
     v_ego = p_cp[0]
     # rotation_radius = p_cp[1]
-    self.yref[:,1] = heading_pts*(v_ego+5.0)
-    self.yref[:,2] = curv_rate_pts * (v_ego+5.0) * 4.0
+    self.yref[:,1] = heading_pts * (v_ego+5.0)
+    self.yref[:,2] = yaw_rate_pts * (v_ego+5.0)
     for i in range(N):
       self.solver.cost_set(i, "yref", self.yref[i])
       self.solver.set(i, "p", p_cp)
     self.solver.set(N, "p", p_cp)
-    self.solver.cost_set(N, "yref", self.yref[N][:2])
+    self.solver.cost_set(N, "yref", self.yref[N][:COST_E_DIM])
 
     t = sec_since_boot()
     self.solution_status = self.solver.solve()
diff --git a/selfdrive/controls/lib/lateral_planner.py b/selfdrive/controls/lib/lateral_planner.py
index 2ad5f784d7..48c98e7350 100644
--- a/selfdrive/controls/lib/lateral_planner.py
+++ b/selfdrive/controls/lib/lateral_planner.py
@@ -3,7 +3,8 @@ from common.realtime import sec_since_boot, DT_MDL
 from common.numpy_fast import interp
 from system.swaglog import cloudlog
 from selfdrive.controls.lib.lateral_mpc_lib.lat_mpc import LateralMpc
-from selfdrive.controls.lib.drive_helpers import CONTROL_N, MPC_COST_LAT, LAT_MPC_N
+from selfdrive.controls.lib.lateral_mpc_lib.lat_mpc import N as LAT_MPC_N
+from selfdrive.controls.lib.drive_helpers import CONTROL_N
 from selfdrive.controls.lib.desire_helper import DesireHelper
 import cereal.messaging as messaging
 from cereal import log
@@ -23,7 +24,7 @@ class LateralPlanner:
 
     self.path_xyz = np.zeros((TRAJECTORY_SIZE, 3))
     self.plan_yaw = np.zeros((TRAJECTORY_SIZE,))
-    self.plan_curv_rate = np.zeros((TRAJECTORY_SIZE,))
+    self.plan_yaw_rate = np.zeros((TRAJECTORY_SIZE,))
     self.t_idxs = np.arange(TRAJECTORY_SIZE)
     self.y_pts = np.zeros(TRAJECTORY_SIZE)
 
@@ -44,6 +45,7 @@ class LateralPlanner:
       self.path_xyz = np.column_stack([md.position.x, md.position.y, md.position.z])
       self.t_idxs = np.array(md.position.t)
       self.plan_yaw = np.array(md.orientation.z)
+      self.plan_yaw_rate = np.array(md.orientationRate.z)
 
     # Lane change logic
     desire_state = md.meta.desireState
@@ -55,24 +57,24 @@ class LateralPlanner:
 
     d_path_xyz = self.path_xyz
     # Heading cost is useful at low speed, otherwise end of plan can be off-heading
-    heading_cost = interp(v_ego, [5.0, 10.0], [MPC_COST_LAT.HEADING, 0.15])
-    self.lat_mpc.set_weights(MPC_COST_LAT.PATH, heading_cost, MPC_COST_LAT.STEER_RATE)
+    heading_cost = interp(v_ego, [5.0, 10.0], [1.0, 0.15])
+    self.lat_mpc.set_weights(1.0, heading_cost, 0.0, .075)
 
     y_pts = np.interp(v_ego * self.t_idxs[:LAT_MPC_N + 1], np.linalg.norm(d_path_xyz, axis=1), d_path_xyz[:, 1])
     heading_pts = np.interp(v_ego * self.t_idxs[:LAT_MPC_N + 1], np.linalg.norm(self.path_xyz, axis=1), self.plan_yaw)
-    curv_rate_pts = np.interp(v_ego * self.t_idxs[:LAT_MPC_N + 1], np.linalg.norm(self.path_xyz, axis=1), self.plan_curv_rate)
+    yaw_rate_pts = np.interp(v_ego * self.t_idxs[:LAT_MPC_N + 1], np.linalg.norm(self.path_xyz, axis=1), self.plan_yaw_rate)
     self.y_pts = y_pts
 
     assert len(y_pts) == LAT_MPC_N + 1
     assert len(heading_pts) == LAT_MPC_N + 1
-    assert len(curv_rate_pts) == LAT_MPC_N + 1
+    assert len(yaw_rate_pts) == LAT_MPC_N + 1
     lateral_factor = max(0, self.factor1 - (self.factor2 * v_ego**2))
     p = np.array([v_ego, lateral_factor])
     self.lat_mpc.run(self.x0,
                      p,
                      y_pts,
                      heading_pts,
-                     curv_rate_pts)
+                     yaw_rate_pts)
     # init state for next
     # mpc.u_sol is the desired curvature rate given x0 curv state.
     # with x0[3] = measured_curvature, this would be the actual desired rate.
@@ -103,7 +105,7 @@ class LateralPlanner:
     lateralPlan.modelMonoTime = sm.logMonoTime['modelV2']
     lateralPlan.dPathPoints = self.y_pts.tolist()
     lateralPlan.psis = self.lat_mpc.x_sol[0:CONTROL_N, 2].tolist()
-    lateralPlan.curvatures = self.lat_mpc.x_sol[0:CONTROL_N, 3].tolist()
+    lateralPlan.curvatures = (self.lat_mpc.x_sol[0:CONTROL_N, 3]/sm['carState'].vEgo).tolist()
     lateralPlan.curvatureRates = [float(x) for x in self.lat_mpc.u_sol[0:CONTROL_N - 1]] + [0.0]
 
     lateralPlan.mpcSolutionValid = bool(plan_solution_valid)
diff --git a/selfdrive/controls/lib/longitudinal_mpc_lib/long_mpc.py b/selfdrive/controls/lib/longitudinal_mpc_lib/long_mpc.py
index 695222ed4d..2aab4b71af 100644
--- a/selfdrive/controls/lib/longitudinal_mpc_lib/long_mpc.py
+++ b/selfdrive/controls/lib/longitudinal_mpc_lib/long_mpc.py
@@ -253,7 +253,7 @@ class LongitudinalMpc:
       cost_weights = [X_EGO_OBSTACLE_COST, X_EGO_COST, V_EGO_COST, A_EGO_COST, a_change_cost, J_EGO_COST]
       constraint_cost_weights = [LIMIT_COST, LIMIT_COST, LIMIT_COST, DANGER_ZONE_COST]
     elif self.mode == 'blended':
-      cost_weights = [0., 0.2, 0.25, 1.0, 0.0, 1.0]
+      cost_weights = [0., 0.1, 0.2, 5.0, 0.0, 1.0]
       constraint_cost_weights = [LIMIT_COST, LIMIT_COST, LIMIT_COST, 50.0]
     else:
       raise NotImplementedError(f'Planner mode {self.mode} not recognized in planner cost set')
diff --git a/selfdrive/controls/lib/longitudinal_planner.py b/selfdrive/controls/lib/longitudinal_planner.py
index a0363536ca..018136e6f2 100755
--- a/selfdrive/controls/lib/longitudinal_planner.py
+++ b/selfdrive/controls/lib/longitudinal_planner.py
@@ -58,7 +58,6 @@ class LongitudinalPlanner:
 
     self.a_desired = init_a
     self.v_desired_filter = FirstOrderFilter(init_v, 2.0, DT_MDL)
-    self.t_uniform = np.arange(0.0, T_IDXS_MPC[-1] + 0.5, 0.5)
 
     self.v_desired_trajectory = np.zeros(CONTROL_N)
     self.a_desired_trajectory = np.zeros(CONTROL_N)
@@ -76,10 +75,7 @@ class LongitudinalPlanner:
       x = np.interp(T_IDXS_MPC, T_IDXS, model_msg.position.x)
       v = np.interp(T_IDXS_MPC, T_IDXS, model_msg.velocity.x)
       a = np.interp(T_IDXS_MPC, T_IDXS, model_msg.acceleration.x)
-      # Uniform interp so gradient is less noisy
-      a_sparse = np.interp(self.t_uniform, T_IDXS, model_msg.acceleration.x)
-      j_sparse = np.gradient(a_sparse, self.t_uniform)
-      j = np.interp(T_IDXS_MPC, self.t_uniform, j_sparse)
+      j = np.zeros(len(T_IDXS_MPC))
     else:
       x = np.zeros(len(T_IDXS_MPC))
       v = np.zeros(len(T_IDXS_MPC))
diff --git a/selfdrive/controls/tests/test_lateral_mpc.py b/selfdrive/controls/tests/test_lateral_mpc.py
index 4864dbdc06..9b986c053d 100644
--- a/selfdrive/controls/tests/test_lateral_mpc.py
+++ b/selfdrive/controls/tests/test_lateral_mpc.py
@@ -1,7 +1,8 @@
 import unittest
 import numpy as np
 from selfdrive.controls.lib.lateral_mpc_lib.lat_mpc import LateralMpc
-from selfdrive.controls.lib.drive_helpers import LAT_MPC_N, CAR_ROTATION_RADIUS
+from selfdrive.controls.lib.drive_helpers import CAR_ROTATION_RADIUS
+from selfdrive.controls.lib.lateral_mpc_lib.lat_mpc import N as LAT_MPC_N
 
 
 def run_mpc(lat_mpc=None, v_ref=30., x_init=0., y_init=0., psi_init=0., curvature_init=0.,
@@ -9,7 +10,7 @@ def run_mpc(lat_mpc=None, v_ref=30., x_init=0., y_init=0., psi_init=0., curvatur
 
   if lat_mpc is None:
     lat_mpc = LateralMpc()
-  lat_mpc.set_weights(1., 1., 1.)
+  lat_mpc.set_weights(1., 1., 0.0, 1.)
 
   y_pts = poly_shift * np.ones(LAT_MPC_N + 1)
   heading_pts = np.zeros(LAT_MPC_N + 1)
diff --git a/selfdrive/modeld/SConscript b/selfdrive/modeld/SConscript
index 246f8c2941..5c02e2b15f 100644
--- a/selfdrive/modeld/SConscript
+++ b/selfdrive/modeld/SConscript
@@ -71,9 +71,9 @@ if use_thneed and arch == "larch64" or GetOption('pc_thneed'):
   fn = File("models/supercombo").abspath
 
   if GetOption('pc_thneed'):
-    cmd = f"cd {Dir('#').abspath}/tinygrad_repo && NATIVE_EXPLOG=1 OPTWG=1 UNSAFE_FLOAT4=1 DEBUGCL=1 python3 openpilot/compile.py {fn}.onnx {fn}.thneed"
+    cmd = f"cd {Dir('#').abspath}/tinygrad_repo && GPU=1 NATIVE_EXPLOG=1 OPTWG=1 UNSAFE_FLOAT4=1 DEBUGCL=1 python3 openpilot/compile.py {fn}.onnx {fn}.thneed"
   else:
-    cmd = f"cd {Dir('#').abspath}/tinygrad_repo && FLOAT16=1 PYOPENCL_NO_CACHE=1 MATMUL=1 NATIVE_EXPLOG=1 OPTWG=1 UNSAFE_FLOAT4=1 DEBUGCL=1 python3 openpilot/compile.py {fn}.onnx {fn}.thneed"
+    cmd = f"cd {Dir('#').abspath}/tinygrad_repo && FLOAT16=1 MATMUL=1 PYOPENCL_NO_CACHE=1 NATIVE_EXPLOG=1 OPTWG=1 UNSAFE_FLOAT4=1 DEBUGCL=1 python3 openpilot/compile.py {fn}.onnx {fn}.thneed"
 
   # is there a better way then listing all of tinygrad?
   lenv.Command(fn + ".thneed", [fn + ".onnx",
diff --git a/selfdrive/modeld/models/driving.cc b/selfdrive/modeld/models/driving.cc
index 8d02eb6b2f..3316e6d114 100644
--- a/selfdrive/modeld/models/driving.cc
+++ b/selfdrive/modeld/models/driving.cc
@@ -41,11 +41,11 @@ void model_init(ModelState* s, cl_device_id device_id, cl_context context) {
    &s->output[0], NET_OUTPUT_SIZE, USE_GPU_RUNTIME, true, false, context);
 
 #ifdef TEMPORAL
-  s->m->addRecurrent(&s->output[OUTPUT_SIZE], TEMPORAL_SIZE);
+  s->m->addRecurrent(&s->feature_buffer[0], TEMPORAL_SIZE);
 #endif
 
 #ifdef DESIRE
-  s->m->addDesire(s->pulse_desire, DESIRE_LEN);
+  s->m->addDesire(s->pulse_desire, DESIRE_LEN*(HISTORY_BUFFER_LEN+1));
 #endif
 
 #ifdef TRAFFIC_CONVENTION
@@ -56,18 +56,20 @@ void model_init(ModelState* s, cl_device_id device_id, cl_context context) {
 ModelOutput* model_eval_frame(ModelState* s, VisionBuf* buf, VisionBuf* wbuf,
                               const mat3 &transform, const mat3 &transform_wide, float *desire_in, bool is_rhd, bool prepare_only) {
 #ifdef DESIRE
+  std::memmove(&s->pulse_desire[0], &s->pulse_desire[DESIRE_LEN], sizeof(float) * DESIRE_LEN*HISTORY_BUFFER_LEN);
   if (desire_in != NULL) {
     for (int i = 1; i < DESIRE_LEN; i++) {
       // Model decides when action is completed
       // so desire input is just a pulse triggered on rising edge
       if (desire_in[i] - s->prev_desire[i] > .99) {
-        s->pulse_desire[i] = desire_in[i];
+        s->pulse_desire[DESIRE_LEN*(HISTORY_BUFFER_LEN-1)+i] = desire_in[i];
       } else {
-        s->pulse_desire[i] = 0.0;
+        s->pulse_desire[DESIRE_LEN*(HISTORY_BUFFER_LEN-1)+i] = 0.0;
       }
       s->prev_desire[i] = desire_in[i];
     }
   }
+LOGT("Desire enqueued");
 #endif
 
   int rhd_idx = is_rhd;
@@ -92,6 +94,12 @@ ModelOutput* model_eval_frame(ModelState* s, VisionBuf* buf, VisionBuf* wbuf,
   s->m->execute();
   LOGT("Execution finished");
 
+  #ifdef TEMPORAL
+    std::memmove(&s->feature_buffer[0], &s->feature_buffer[FEATURE_LEN], sizeof(float) * FEATURE_LEN*(HISTORY_BUFFER_LEN-1));
+    std::memcpy(&s->feature_buffer[FEATURE_LEN*(HISTORY_BUFFER_LEN-1)], &s->output[OUTPUT_SIZE], sizeof(float) * FEATURE_LEN);
+    LOGT("Features enqueued");
+  #endif
+
   return (ModelOutput*)&s->output;
 }
 
diff --git a/selfdrive/modeld/models/driving.h b/selfdrive/modeld/models/driving.h
index e2ee812e44..90767384eb 100644
--- a/selfdrive/modeld/models/driving.h
+++ b/selfdrive/modeld/models/driving.h
@@ -16,6 +16,8 @@
 #include "selfdrive/modeld/models/commonmodel.h"
 #include "selfdrive/modeld/runners/run.h"
 
+constexpr int FEATURE_LEN = 2048;
+constexpr int HISTORY_BUFFER_LEN = 99;
 constexpr int DESIRE_LEN = 8;
 constexpr int DESIRE_PRED_LEN = 4;
 constexpr int TRAFFIC_CONVENTION_LEN = 2;
@@ -233,6 +235,11 @@ struct ModelOutputMeta {
 };
 static_assert(sizeof(ModelOutputMeta) == sizeof(ModelOutputDesireProb) + sizeof(float) + (sizeof(ModelOutputDisengageProb)*DISENGAGE_LEN) + (sizeof(ModelOutputBlinkerProb)*BLINKER_LEN) + (sizeof(ModelOutputDesireProb)*DESIRE_PRED_LEN));
 
+struct ModelOutputFeatures {
+  std::array<float, FEATURE_LEN> feature;
+};
+static_assert(sizeof(ModelOutputFeatures) == (sizeof(float)*FEATURE_LEN));
+
 struct ModelOutput {
   const ModelOutputPlans plans;
   const ModelOutputLaneLines lane_lines;
@@ -244,22 +251,24 @@ struct ModelOutput {
 };
 
 constexpr int OUTPUT_SIZE = sizeof(ModelOutput) / sizeof(float);
+
 #ifdef TEMPORAL
-  constexpr int TEMPORAL_SIZE = 512;
+  constexpr int TEMPORAL_SIZE = HISTORY_BUFFER_LEN * FEATURE_LEN;
 #else
   constexpr int TEMPORAL_SIZE = 0;
 #endif
-constexpr int NET_OUTPUT_SIZE = OUTPUT_SIZE + TEMPORAL_SIZE;
+constexpr int NET_OUTPUT_SIZE = OUTPUT_SIZE + FEATURE_LEN;
 
 // TODO: convert remaining arrays to std::array and update model runners
 struct ModelState {
   ModelFrame *frame = nullptr;
   ModelFrame *wide_frame = nullptr;
+  std::array<float, HISTORY_BUFFER_LEN * FEATURE_LEN> feature_buffer = {};
   std::array<float, NET_OUTPUT_SIZE> output = {};
   std::unique_ptr<RunModel> m;
 #ifdef DESIRE
   float prev_desire[DESIRE_LEN] = {};
-  float pulse_desire[DESIRE_LEN] = {};
+  float pulse_desire[DESIRE_LEN*(HISTORY_BUFFER_LEN+1)] = {};
 #endif
 #ifdef TRAFFIC_CONVENTION
   float traffic_convention[TRAFFIC_CONVENTION_LEN] = {};
diff --git a/selfdrive/modeld/models/supercombo.onnx b/selfdrive/modeld/models/supercombo.onnx
index 7b11edbe08..37edc36c02 100644
--- a/selfdrive/modeld/models/supercombo.onnx
+++ b/selfdrive/modeld/models/supercombo.onnx
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:50c7fc8565ac69a4b9a0de122e961326820e78bf13659255a89d0ed04be030d5
-size 95167481
+oid sha256:30f30bc1251c03db135564ecbf7dc0bc96cbb07be0ebd3691edd8d555dc087fa
+size 58539693
diff --git a/selfdrive/modeld/runners/onnx_runner.py b/selfdrive/modeld/runners/onnx_runner.py
index ac7cc68814..d4a11a7c0b 100755
--- a/selfdrive/modeld/runners/onnx_runner.py
+++ b/selfdrive/modeld/runners/onnx_runner.py
@@ -9,6 +9,8 @@ os.environ["OMP_WAIT_POLICY"] = "PASSIVE"
 
 import onnxruntime as ort # pylint: disable=import-error
 
+ORT_TYPES_TO_NP_TYPES = {'tensor(float16)': np.float16, 'tensor(float)': np.float32, 'tensor(uint8)': np.uint8}
+
 def read(sz, tf8=False):
   dd = []
   gt = 0
@@ -18,7 +20,7 @@ def read(sz, tf8=False):
     assert(len(st) > 0)
     dd.append(st)
     gt += len(st)
-  r = np.frombuffer(b''.join(dd), dtype=np.uint8 if tf8 else np.float32).astype(np.float32)
+  r = np.frombuffer(b''.join(dd), dtype=np.uint8 if tf8 else np.float32)
   if tf8:
     r = r / 255.
   return r
@@ -29,22 +31,23 @@ def write(d):
 def run_loop(m, tf8_input=False):
   ishapes = [[1]+ii.shape[1:] for ii in m.get_inputs()]
   keys = [x.name for x in m.get_inputs()]
+  itypes = [ORT_TYPES_TO_NP_TYPES[x.type] for x in m.get_inputs()]
 
   # run once to initialize CUDA provider
   if "CUDAExecutionProvider" in m.get_providers():
-    m.run(None, dict(zip(keys, [np.zeros(shp, dtype=np.float32) for shp in ishapes])))
+    m.run(None, dict(zip(keys, [np.zeros(shp, dtype=itp) for shp, itp in zip(ishapes, itypes)])))
 
   print("ready to run onnx model", keys, ishapes, file=sys.stderr)
   while 1:
     inputs = []
-    for k, shp in zip(keys, ishapes):
+    for k, shp, itp in zip(keys, ishapes, itypes):
       ts = np.product(shp)
       #print("reshaping %s with offset %d" % (str(shp), offset), file=sys.stderr)
-      inputs.append(read(ts, (k=='input_img' and tf8_input)).reshape(shp))
+      inputs.append(read(ts, (k=='input_img' and tf8_input)).reshape(shp).astype(itp))
     ret = m.run(None, dict(zip(keys, inputs)))
     #print(ret, file=sys.stderr)
     for r in ret:
-      write(r)
+      write(r.astype(np.float32))
 
 
 if __name__ == "__main__":
diff --git a/selfdrive/modeld/thneed/thneed_common.cc b/selfdrive/modeld/thneed/thneed_common.cc
index 21170b13a6..ecdf1237e3 100644
--- a/selfdrive/modeld/thneed/thneed_common.cc
+++ b/selfdrive/modeld/thneed/thneed_common.cc
@@ -12,7 +12,7 @@ map<pair<cl_kernel, int>, int> g_args_size;
 map<cl_program, string> g_program_source;
 
 void Thneed::stop() {
-  printf("Thneed::stop: recorded %lu commands\n", cmds.size());
+  //printf("Thneed::stop: recorded %lu commands\n", cmds.size());
   record = false;
 }
 
diff --git a/selfdrive/test/process_replay/compare_logs.py b/selfdrive/test/process_replay/compare_logs.py
index bf6daf5fed..c14956b1b2 100755
--- a/selfdrive/test/process_replay/compare_logs.py
+++ b/selfdrive/test/process_replay/compare_logs.py
@@ -41,7 +41,7 @@ def remove_ignored_fields(msg, ignore):
       elif isinstance(v, numbers.Number):
         val = 0
       else:
-        raise NotImplementedError
+        raise NotImplementedError('Error ignoring field')
       setattr(attr, keys[-1], val)
   return msg.as_reader()
 
diff --git a/selfdrive/test/process_replay/model_replay.py b/selfdrive/test/process_replay/model_replay.py
index ccd89bea9a..ecfacbf5d2 100755
--- a/selfdrive/test/process_replay/model_replay.py
+++ b/selfdrive/test/process_replay/model_replay.py
@@ -22,7 +22,7 @@ from tools.lib.logreader import LogReader
 
 TEST_ROUTE = "4cf7a6ad03080c90|2021-09-29--13-46-36"
 SEGMENT = 0
-MAX_FRAMES = 10 if PC else 1300
+MAX_FRAMES = 100 if PC else 1300
 
 SEND_EXTRA_INPUTS = bool(os.getenv("SEND_EXTRA_INPUTS", "0"))
 
@@ -174,7 +174,7 @@ if __name__ == "__main__":
         'driverStateV2.dspExecutionTime'
       ]
       # TODO this tolerance is absurdly large
-      tolerance = 5e-1 if PC else None
+      tolerance = 2.0 if PC else None
       results: Any = {TEST_ROUTE: {}}
       log_paths: Any = {TEST_ROUTE: {"models": {'ref': BASE_URL + log_fn, 'new': log_fn}}}
       results[TEST_ROUTE]["models"] = compare_logs(cmp_log, log_msgs, tolerance=tolerance, ignore_fields=ignore)
diff --git a/selfdrive/test/process_replay/model_replay_ref_commit b/selfdrive/test/process_replay/model_replay_ref_commit
index 958d3da14d..9727ef3d17 100644
--- a/selfdrive/test/process_replay/model_replay_ref_commit
+++ b/selfdrive/test/process_replay/model_replay_ref_commit
@@ -1 +1 @@
-c40319a454840d8a2196ec1227d27b536ee14375
+008c0a622b7471c6234690d668f76bcb5dc8d999
diff --git a/selfdrive/test/process_replay/ref_commit b/selfdrive/test/process_replay/ref_commit
index d4d16b726b..0e3cdf797f 100644
--- a/selfdrive/test/process_replay/ref_commit
+++ b/selfdrive/test/process_replay/ref_commit
@@ -1 +1 @@
-1989d8c2a5de94faa3756b7d10fc94e6c063afa5
\ No newline at end of file
+e0ffcae8def2fd9c82c547d1f257d4f06a48a3c3
diff --git a/tinygrad_repo b/tinygrad_repo
index 2e9b7637b3..82ca9c6666 160000
--- a/tinygrad_repo
+++ b/tinygrad_repo
@@ -1 +1 @@
-Subproject commit 2e9b7637b3c3c8895fda9f964215db3a35fe3441
+Subproject commit 82ca9c66664b5f3739e1881f62cb94b72cf0b1fa