From 247c153a35fd521f34a8529a29fa7e79c67b7009 Mon Sep 17 00:00:00 2001 From: Adeeb Shihadeh Date: Fri, 29 Apr 2022 13:36:25 -0700 Subject: [PATCH] controlsd: better alert precedence for system malfunctions (#24366) * controlsd: better alert precedence for system malfunctions * down to 20 * move that * check can flags * update refs * update refs old-commit-hash: be748c0d6a1a931a26b4ec741bbba71d4269b821 --- selfdrive/controls/controlsd.py | 75 +++++++++++++----------- selfdrive/test/process_replay/ref_commit | 2 +- 2 files changed, 42 insertions(+), 35 deletions(-) diff --git a/selfdrive/controls/controlsd.py b/selfdrive/controls/controlsd.py index 958de45995..6dff3bb2e1 100755 --- a/selfdrive/controls/controlsd.py +++ b/selfdrive/controls/controlsd.py @@ -59,7 +59,7 @@ ENABLED_STATES = (State.preEnabled, *ACTIVE_STATES) class Controls: def __init__(self, sm=None, pm=None, can_sock=None, CI=None): - config_realtime_process(4 if TICI else 3, Priority.CTRL_HIGH) + config_realtime_process(4, Priority.CTRL_HIGH) # Setup sockets self.pm = pm @@ -73,7 +73,7 @@ class Controls: self.can_sock = can_sock if can_sock is None: - can_timeout = None if os.environ.get('NO_CAN_TIMEOUT', False) else 100 + can_timeout = None if os.environ.get('NO_CAN_TIMEOUT', False) else 20 self.can_sock = messaging.sub_sock('can', timeout=can_timeout) if TICI: @@ -162,7 +162,7 @@ class Controls: self.last_functional_fan_frame = 0 self.events_prev = [] self.current_alert_types = [ET.PERMANENT] - self.logged_comm_issue = False + self.logged_comm_issue = None self.button_timers = {ButtonEvent.Type.decelCruise: 0, ButtonEvent.Type.accelCruise: 0} self.last_actuators = car.CarControl.Actuators.new_message() @@ -213,11 +213,17 @@ class Controls: self.events.add(EventName.pedalPressedPreEnable if self.disengage_on_accelerator else EventName.gasPressedOverride) - self.events.add_from_msg(CS.events) - if not self.CP.notCar: self.events.add_from_msg(self.sm['driverMonitoringState'].events) + # Handle car events. Ignore when CAN is invalid + if CS.canTimeout: + self.events.add(EventName.canBusMissing) + elif not CS.canValid: + self.events.add(EventName.canError) + else: + self.events.add_from_msg(CS.events) + # Create events for temperature, disk space, and memory if self.sm['deviceState'].thermalStatus >= ThermalStatus.red: self.events.add(EventName.overheat) @@ -225,7 +231,7 @@ class Controls: # under 7% of space free no enable allowed self.events.add(EventName.outOfSpace) # TODO: make tici threshold the same - if self.sm['deviceState'].memoryUsagePercent > (90 if TICI else 65) and not SIMULATION: + if self.sm['deviceState'].memoryUsagePercent > 90 and not SIMULATION: self.events.add(EventName.lowMemory) # TODO: enable this once loggerd CPU usage is more reasonable @@ -234,7 +240,7 @@ class Controls: # self.events.add(EventName.highCpuUsage) # Alert if fan isn't spinning for 5 seconds - if self.sm['peripheralState'].pandaType in (PandaType.uno, PandaType.dos): + if self.sm['peripheralState'].pandaType == PandaType.dos: if self.sm['peripheralState'].fanSpeedRpm == 0 and self.sm['deviceState'].fanSpeedPercentDesired > 50: if (self.sm.frame - self.last_functional_fan_frame) * DT_CTRL > 5.0: self.events.add(EventName.fanMalfunction) @@ -264,11 +270,6 @@ class Controls: LaneChangeState.laneChangeFinishing): self.events.add(EventName.laneChange) - if CS.canTimeout: - self.events.add(EventName.canBusMissing) - elif not CS.canValid: - self.events.add(EventName.canError) - for i, pandaState in enumerate(self.sm['pandaStates']): # All pandas must match the list of safetyConfigs, and if outside this list, must be silent or noOutput if i < len(self.CP.safetyConfigs): @@ -284,15 +285,29 @@ class Controls: if log.PandaState.FaultType.relayMalfunction in pandaState.faults: self.events.add(EventName.relayMalfunction) - # Check for HW or system issues + # Handle HW and system malfunctions + # Order is very intentional here. Be careful when modifying this. + num_events = len(self.events) + + not_running = {p.name for p in self.sm['managerState'].processes if not p.running and p.shouldBeRunning} + if self.sm.rcv_frame['managerState'] and (not_running - IGNORE_PROCESSES): + self.events.add(EventName.processNotRunning) + else: + if not SIMULATION and not self.rk.lagging: + if not self.sm.all_alive(self.camera_packets): + self.events.add(EventName.cameraMalfunction) + elif not self.sm.all_freq_ok(self.camera_packets): + self.events.add(EventName.cameraFrameRate) if self.rk.lagging: self.events.add(EventName.controlsdLagging) - elif len(self.sm['radarState'].radarErrors): + if len(self.sm['radarState'].radarErrors): self.events.add(EventName.radarFault) - elif not self.sm.valid['pandaStates']: + if not self.sm.valid['pandaStates']: self.events.add(EventName.usbError) - elif not self.sm.all_checks() or self.can_rcv_error: + # generic catch-all. ideally, a more specific event should be added above instead + no_system_errors = len(self.events) != num_events + if (not self.sm.all_checks() or self.can_rcv_error) and no_system_errors and CS.canValid and not CS.canTimeout: if not self.sm.all_alive(): self.events.add(EventName.commIssue) elif not self.sm.all_freq_ok(): @@ -300,14 +315,17 @@ class Controls: else: # invalid or can_rcv_error. self.events.add(EventName.commIssue) - if not self.logged_comm_issue: - invalid = [s for s, valid in self.sm.valid.items() if not valid] - not_alive = [s for s, alive in self.sm.alive.items() if not alive] - not_freq_ok = [s for s, freq_ok in self.sm.freq_ok.items() if not freq_ok] - cloudlog.event("commIssue", invalid=invalid, not_alive=not_alive, not_freq_ok=not_freq_ok, can_error=self.can_rcv_error, error=True) - self.logged_comm_issue = True + logs = { + 'invalid': [s for s, valid in self.sm.valid.items() if not valid], + 'not_alive': [s for s, alive in self.sm.alive.items() if not alive], + 'not_freq_ok': [s for s, freq_ok in self.sm.freq_ok.items() if not freq_ok], + 'can_error': self.can_rcv_error, + } + if logs != self.logged_comm_issue: + cloudlog.event("commIssue", error=True, **logs) + self.logged_comm_issue = logs else: - self.logged_comm_issue = False + self.logged_comm_issue = None if not self.sm['liveParameters'].valid: self.events.add(EventName.vehicleModelInvalid) @@ -354,22 +372,11 @@ class Controls: # Not show in first 1 km to allow for driving out of garage. This event shows after 5 minutes self.events.add(EventName.noGps) - if not self.rk.lagging: - if not self.sm.all_alive(self.camera_packets): - self.events.add(EventName.cameraMalfunction) - elif not self.sm.all_freq_ok(self.camera_packets): - self.events.add(EventName.cameraFrameRate) - if self.sm['modelV2'].frameDropPerc > 20: self.events.add(EventName.modeldLagging) if self.sm['liveLocationKalman'].excessiveResets: self.events.add(EventName.localizerMalfunction) - # Check if all manager processes are running - not_running = {p.name for p in self.sm['managerState'].processes if not p.running and p.shouldBeRunning} - if self.sm.rcv_frame['managerState'] and (not_running - IGNORE_PROCESSES): - self.events.add(EventName.processNotRunning) - # Only allow engagement with brake pressed when stopped behind another stopped car speeds = self.sm['longitudinalPlan'].speeds if len(speeds) > 1: diff --git a/selfdrive/test/process_replay/ref_commit b/selfdrive/test/process_replay/ref_commit index 39c8add0e1..b870f98460 100644 --- a/selfdrive/test/process_replay/ref_commit +++ b/selfdrive/test/process_replay/ref_commit @@ -1 +1 @@ -e37c2ea1447363af1403e8260450c30e2d862101 \ No newline at end of file +d17ecea1f071007193a8a1e1ececf78c96b9deac \ No newline at end of file