controlsd: better alert precedence for system malfunctions (#24366)

* controlsd: better alert precedence for system malfunctions

* down to 20

* move that

* check can flags

* update refs

* update refs
old-commit-hash: be748c0d6a
taco
Adeeb Shihadeh 3 years ago committed by GitHub
parent 2d6ae65ea7
commit 247c153a35
  1. 75
      selfdrive/controls/controlsd.py
  2. 2
      selfdrive/test/process_replay/ref_commit

@ -59,7 +59,7 @@ ENABLED_STATES = (State.preEnabled, *ACTIVE_STATES)
class Controls:
def __init__(self, sm=None, pm=None, can_sock=None, CI=None):
config_realtime_process(4 if TICI else 3, Priority.CTRL_HIGH)
config_realtime_process(4, Priority.CTRL_HIGH)
# Setup sockets
self.pm = pm
@ -73,7 +73,7 @@ class Controls:
self.can_sock = can_sock
if can_sock is None:
can_timeout = None if os.environ.get('NO_CAN_TIMEOUT', False) else 100
can_timeout = None if os.environ.get('NO_CAN_TIMEOUT', False) else 20
self.can_sock = messaging.sub_sock('can', timeout=can_timeout)
if TICI:
@ -162,7 +162,7 @@ class Controls:
self.last_functional_fan_frame = 0
self.events_prev = []
self.current_alert_types = [ET.PERMANENT]
self.logged_comm_issue = False
self.logged_comm_issue = None
self.button_timers = {ButtonEvent.Type.decelCruise: 0, ButtonEvent.Type.accelCruise: 0}
self.last_actuators = car.CarControl.Actuators.new_message()
@ -213,11 +213,17 @@ class Controls:
self.events.add(EventName.pedalPressedPreEnable if self.disengage_on_accelerator else
EventName.gasPressedOverride)
self.events.add_from_msg(CS.events)
if not self.CP.notCar:
self.events.add_from_msg(self.sm['driverMonitoringState'].events)
# Handle car events. Ignore when CAN is invalid
if CS.canTimeout:
self.events.add(EventName.canBusMissing)
elif not CS.canValid:
self.events.add(EventName.canError)
else:
self.events.add_from_msg(CS.events)
# Create events for temperature, disk space, and memory
if self.sm['deviceState'].thermalStatus >= ThermalStatus.red:
self.events.add(EventName.overheat)
@ -225,7 +231,7 @@ class Controls:
# under 7% of space free no enable allowed
self.events.add(EventName.outOfSpace)
# TODO: make tici threshold the same
if self.sm['deviceState'].memoryUsagePercent > (90 if TICI else 65) and not SIMULATION:
if self.sm['deviceState'].memoryUsagePercent > 90 and not SIMULATION:
self.events.add(EventName.lowMemory)
# TODO: enable this once loggerd CPU usage is more reasonable
@ -234,7 +240,7 @@ class Controls:
# self.events.add(EventName.highCpuUsage)
# Alert if fan isn't spinning for 5 seconds
if self.sm['peripheralState'].pandaType in (PandaType.uno, PandaType.dos):
if self.sm['peripheralState'].pandaType == PandaType.dos:
if self.sm['peripheralState'].fanSpeedRpm == 0 and self.sm['deviceState'].fanSpeedPercentDesired > 50:
if (self.sm.frame - self.last_functional_fan_frame) * DT_CTRL > 5.0:
self.events.add(EventName.fanMalfunction)
@ -264,11 +270,6 @@ class Controls:
LaneChangeState.laneChangeFinishing):
self.events.add(EventName.laneChange)
if CS.canTimeout:
self.events.add(EventName.canBusMissing)
elif not CS.canValid:
self.events.add(EventName.canError)
for i, pandaState in enumerate(self.sm['pandaStates']):
# All pandas must match the list of safetyConfigs, and if outside this list, must be silent or noOutput
if i < len(self.CP.safetyConfigs):
@ -284,15 +285,29 @@ class Controls:
if log.PandaState.FaultType.relayMalfunction in pandaState.faults:
self.events.add(EventName.relayMalfunction)
# Check for HW or system issues
# Handle HW and system malfunctions
# Order is very intentional here. Be careful when modifying this.
num_events = len(self.events)
not_running = {p.name for p in self.sm['managerState'].processes if not p.running and p.shouldBeRunning}
if self.sm.rcv_frame['managerState'] and (not_running - IGNORE_PROCESSES):
self.events.add(EventName.processNotRunning)
else:
if not SIMULATION and not self.rk.lagging:
if not self.sm.all_alive(self.camera_packets):
self.events.add(EventName.cameraMalfunction)
elif not self.sm.all_freq_ok(self.camera_packets):
self.events.add(EventName.cameraFrameRate)
if self.rk.lagging:
self.events.add(EventName.controlsdLagging)
elif len(self.sm['radarState'].radarErrors):
if len(self.sm['radarState'].radarErrors):
self.events.add(EventName.radarFault)
elif not self.sm.valid['pandaStates']:
if not self.sm.valid['pandaStates']:
self.events.add(EventName.usbError)
elif not self.sm.all_checks() or self.can_rcv_error:
# generic catch-all. ideally, a more specific event should be added above instead
no_system_errors = len(self.events) != num_events
if (not self.sm.all_checks() or self.can_rcv_error) and no_system_errors and CS.canValid and not CS.canTimeout:
if not self.sm.all_alive():
self.events.add(EventName.commIssue)
elif not self.sm.all_freq_ok():
@ -300,14 +315,17 @@ class Controls:
else: # invalid or can_rcv_error.
self.events.add(EventName.commIssue)
if not self.logged_comm_issue:
invalid = [s for s, valid in self.sm.valid.items() if not valid]
not_alive = [s for s, alive in self.sm.alive.items() if not alive]
not_freq_ok = [s for s, freq_ok in self.sm.freq_ok.items() if not freq_ok]
cloudlog.event("commIssue", invalid=invalid, not_alive=not_alive, not_freq_ok=not_freq_ok, can_error=self.can_rcv_error, error=True)
self.logged_comm_issue = True
logs = {
'invalid': [s for s, valid in self.sm.valid.items() if not valid],
'not_alive': [s for s, alive in self.sm.alive.items() if not alive],
'not_freq_ok': [s for s, freq_ok in self.sm.freq_ok.items() if not freq_ok],
'can_error': self.can_rcv_error,
}
if logs != self.logged_comm_issue:
cloudlog.event("commIssue", error=True, **logs)
self.logged_comm_issue = logs
else:
self.logged_comm_issue = False
self.logged_comm_issue = None
if not self.sm['liveParameters'].valid:
self.events.add(EventName.vehicleModelInvalid)
@ -354,22 +372,11 @@ class Controls:
# Not show in first 1 km to allow for driving out of garage. This event shows after 5 minutes
self.events.add(EventName.noGps)
if not self.rk.lagging:
if not self.sm.all_alive(self.camera_packets):
self.events.add(EventName.cameraMalfunction)
elif not self.sm.all_freq_ok(self.camera_packets):
self.events.add(EventName.cameraFrameRate)
if self.sm['modelV2'].frameDropPerc > 20:
self.events.add(EventName.modeldLagging)
if self.sm['liveLocationKalman'].excessiveResets:
self.events.add(EventName.localizerMalfunction)
# Check if all manager processes are running
not_running = {p.name for p in self.sm['managerState'].processes if not p.running and p.shouldBeRunning}
if self.sm.rcv_frame['managerState'] and (not_running - IGNORE_PROCESSES):
self.events.add(EventName.processNotRunning)
# Only allow engagement with brake pressed when stopped behind another stopped car
speeds = self.sm['longitudinalPlan'].speeds
if len(speeds) > 1:

@ -1 +1 @@
e37c2ea1447363af1403e8260450c30e2d862101
d17ecea1f071007193a8a1e1ececf78c96b9deac
Loading…
Cancel
Save