test_onroad: enforce CPU usage test for all procs (#28024)

* cleanup * add missing procs * ugh not systemd * floor * unused * handle laikad * continue --------- Co-authored-by: Comma Device <device@comma.ai> old-commit-hash: 64df41c9af
2 years ago · a25d2f95de
parent 56b63e12bf
commit a25d2f95de
1 changed files with 80 additions and 51 deletions
--- a/selfdrive/test/test_onroad.py
+++ b/selfdrive/test/test_onroad.py
@ -1,4 +1,5 @@
 #!/usr/bin/env python3
+import math
 import json
 import os
 import subprocess
@ -6,6 +7,7 @@ import time
 import numpy as np
 import unittest
 from collections import Counter, defaultdict
+from functools import cached_property
 from pathlib import Path

 from cereal import car
@ -45,6 +47,15 @@ PROCS = {
  "./ubloxd": 0.02,
  "selfdrive.tombstoned": 0,
  "./logcatd": 0,
+  "system.micd": 10.0,
+  "system.timezoned": 0,
+  "system.sensord.pigeond": 6.0,
+  "selfdrive.boardd.pandad": 0,
+  "selfdrive.statsd": 0.4,
+  "selfdrive.navd.navd": 0.4,
+  "system.loggerd.uploader": 30.0,
+  "system.loggerd.deleter": 0.1,
+  "selfdrive.locationd.laikad": None,  # TODO: laikad cpu usage is sporadic
 }

 TIMINGS = {
@ -71,48 +82,6 @@ def cputime_total(ct):
  return ct.cpuUser + ct.cpuSystem + ct.cpuChildrenUser + ct.cpuChildrenSystem


-def check_cpu_usage(proclogs):
-  result = "\n"
-  result += "------------------------------------------------\n"
-  result += "------------------ CPU Usage -------------------\n"
-  result += "------------------------------------------------\n"
-
-  plogs_by_proc = defaultdict(list)
-  for pl in proclogs:
-    for x in pl.procLog.procs:
-      if len(x.cmdline) > 0:
-        n = list(x.cmdline)[0]
-        plogs_by_proc[n].append(x)
-
-  print(plogs_by_proc.keys())
-
-  r = True
-  dt = (proclogs[-1].logMonoTime - proclogs[0].logMonoTime) / 1e9
-  for proc_name, expected_cpu in PROCS.items():
-    err = ""
-    cpu_usage = 0.
-    x = plogs_by_proc[proc_name]
-    if len(x) > 2:
-      cpu_time = cputime_total(x[-1]) - cputime_total(x[0])
-      cpu_usage = cpu_time / dt * 100.
-      if cpu_usage > max(expected_cpu * 1.15, expected_cpu + 5.0):
-        # cpu usage is high while playing sounds
-        if not (proc_name == "./_soundd" and cpu_usage < 65.):
-          err = "using more CPU than normal"
-      elif cpu_usage < min(expected_cpu * 0.65, max(expected_cpu - 1.0, 0.0)):
-        err = "using less CPU than normal"
-    else:
-      err = "NO METRICS FOUND"
-
-    result += f"{proc_name.ljust(35)}  {cpu_usage:5.2f}% ({expected_cpu:5.2f}%) {err}\n"
-    if len(err) > 0:
-      r = False
-
-  result += "------------------------------------------------\n"
-  print(result)
-  return r
-
-
 class TestOnroad(unittest.TestCase):

  @classmethod
@ -120,8 +89,8 @@ class TestOnroad(unittest.TestCase):
    if "DEBUG" in os.environ:
      segs = filter(lambda x: os.path.exists(os.path.join(x, "rlog")), Path(ROOT).iterdir())
      segs = sorted(segs, key=lambda x: x.stat().st_mtime)
-      print(segs[-2])
-      cls.lr = list(LogReader(os.path.join(segs[-2], "rlog")))
+      print(segs[-3])
+      cls.lr = list(LogReader(os.path.join(segs[-3], "rlog")))
      return

    # setup env
@ -177,6 +146,21 @@ class TestOnroad(unittest.TestCase):
    # use the second segment by default as it's the first full segment
    cls.lr = list(LogReader(os.path.join(str(cls.segments[1]), "rlog")))

+  @cached_property
+  def service_msgs(self):
+    msgs = defaultdict(list)
+    for m in self.lr:
+      msgs[m.which()].append(m)
+    return msgs
+
+  def test_service_frequencies(self):
+    for s, msgs in self.service_msgs.items():
+      if s in ('initData', 'sentinel'):
+        continue
+
+      with self.subTest(service=s):
+        assert len(msgs) >= math.floor(service_list[s].frequency*55)
+
  def test_cloudlog_size(self):
    msgs = [m for m in self.lr if m.which() == 'logMessage']

@ -193,7 +177,7 @@ class TestOnroad(unittest.TestCase):
    result += "-------------- UI Draw Timing ------------------\n"
    result += "------------------------------------------------\n"

-    ts = [m.uiDebug.drawTimeMillis for m in self.lr if m.which() == 'uiDebug']
+    ts = [m.uiDebug.drawTimeMillis for m in self.service_msgs['uiDebug']]
    result += f"min  {min(ts):.2f}ms\n"
    result += f"max  {max(ts):.2f}ms\n"
    result += f"std  {np.std(ts):.2f}ms\n"
@ -201,15 +185,60 @@ class TestOnroad(unittest.TestCase):
    result += "------------------------------------------------\n"
    print(result)

-    self.assertGreater(len(ts), 20*50, "insufficient samples")
    #self.assertLess(max(ts), 30.)
    self.assertLess(np.mean(ts), 10.)
    #self.assertLess(np.std(ts), 5.)

  def test_cpu_usage(self):
-    proclogs = [m for m in self.lr if m.which() == 'procLog']
-    self.assertGreater(len(proclogs), service_list['procLog'].frequency * 45, "insufficient samples")
-    cpu_ok = check_cpu_usage(proclogs)
+    result = "\n"
+    result += "------------------------------------------------\n"
+    result += "------------------ CPU Usage -------------------\n"
+    result += "------------------------------------------------\n"
+
+    plogs_by_proc = defaultdict(list)
+    for pl in self.service_msgs['procLog']:
+      for x in pl.procLog.procs:
+        if len(x.cmdline) > 0:
+          n = list(x.cmdline)[0]
+          plogs_by_proc[n].append(x)
+    print(plogs_by_proc.keys())
+
+    cpu_ok = True
+    dt = (self.service_msgs['procLog'][-1].logMonoTime - self.service_msgs['procLog'][0].logMonoTime) / 1e9
+    for proc_name, expected_cpu in PROCS.items():
+
+      err = ""
+      cpu_usage = 0.
+      x = plogs_by_proc[proc_name]
+      if len(x) > 2:
+        cpu_time = cputime_total(x[-1]) - cputime_total(x[0])
+        cpu_usage = cpu_time / dt * 100.
+
+        if expected_cpu is None:
+          result += f"{proc_name.ljust(35)}  {cpu_usage:5.2f}% ({expected_cpu}) SKIPPED\n"
+          continue
+        elif cpu_usage > max(expected_cpu * 1.15, expected_cpu + 5.0):
+          # cpu usage is high while playing sounds
+          if not (proc_name == "./_soundd" and cpu_usage < 65.):
+            err = "using more CPU than normal"
+        elif cpu_usage < min(expected_cpu * 0.65, max(expected_cpu - 1.0, 0.0)):
+          err = "using less CPU than normal"
+      else:
+        err = "NO METRICS FOUND"
+
+      result += f"{proc_name.ljust(35)}  {cpu_usage:5.2f}% ({expected_cpu:5.2f}%) {err}\n"
+      if len(err) > 0:
+        cpu_ok = False
+
+    # Ensure there's no missing procs
+    all_procs = set([p.name for p in self.service_msgs['managerState'][0].managerState.processes if p.shouldBeRunning])
+    for p in all_procs:
+      with self.subTest(proc=p):
+        assert any(p in pp for pp in PROCS.keys()), f"Expected CPU usage missing for {p}"
+
+    result += "------------------------------------------------\n"
+    print(result)
+
    self.assertTrue(cpu_ok)

  def test_camera_processing_time(self):
@ -234,7 +263,7 @@ class TestOnroad(unittest.TestCase):

    cfgs = [("lateralPlan", 0.05, 0.05), ("longitudinalPlan", 0.05, 0.05)]
    for (s, instant_max, avg_max) in cfgs:
-      ts = [getattr(getattr(m, s), "solverExecutionTime") for m in self.lr if m.which() == s]
+      ts = [getattr(getattr(m, s), "solverExecutionTime") for m in self.service_msgs[s]]
      self.assertLess(max(ts), instant_max, f"high '{s}' execution time: {max(ts)}")
      self.assertLess(np.mean(ts), avg_max, f"high avg '{s}' execution time: {np.mean(ts)}")
      result += f"'{s}' execution time: min  {min(ts):.5f}s\n"
@ -254,7 +283,7 @@ class TestOnroad(unittest.TestCase):
      ("driverStateV2", 0.050, 0.026),
    ]
    for (s, instant_max, avg_max) in cfgs:
-      ts = [getattr(getattr(m, s), "modelExecutionTime") for m in self.lr if m.which() == s]
+      ts = [getattr(getattr(m, s), "modelExecutionTime") for m in self.service_msgs[s]]
      self.assertLess(max(ts), instant_max, f"high '{s}' execution time: {max(ts)}")
      self.assertLess(np.mean(ts), avg_max, f"high avg '{s}' execution time: {np.mean(ts)}")
      result += f"'{s}' execution time: min  {min(ts):.5f}s\n"