Alert when updated consistently fails (#2013)

* alert when update fails more than 10 times

* bring over offroad alert refactor from other branch

* and we have tests

* use it in snapshot

* bump apk

* don't show exceptions on release branches

* only write when changed

* why does delete use so much cpu

* clean that up

* little more
old-commit-hash: 8e63f06540
commatwo_master
Adeeb Shihadeh 5 years ago committed by GitHub
parent 0097b87b62
commit 017b084154
  1. 4
      apk/ai.comma.plus.offroad.apk
  2. 2
      common/params.py
  3. 9
      selfdrive/camerad/snapshot/snapshot.py
  4. 24
      selfdrive/controls/lib/alertmanager.py
  5. 5
      selfdrive/controls/lib/alerts_offroad.json
  6. 34
      selfdrive/controls/tests/test_alerts.py
  7. 72
      selfdrive/thermald/thermald.py
  8. 11
      selfdrive/updated.py

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1 version https://git-lfs.github.com/spec/v1
oid sha256:64e31350a4138675cb39703d070fec787c011fcbdee3fbae1cbbb21ce4ceb6be oid sha256:a198491887ed6029bffdf7f4dc28c4f9a6ba5f9d2235710fc11a1378893491d7
size 13701989 size 13702777

@ -80,6 +80,7 @@ keys = {
"IsUploadRawEnabled": [TxType.PERSISTENT], "IsUploadRawEnabled": [TxType.PERSISTENT],
"LastAthenaPingTime": [TxType.PERSISTENT], "LastAthenaPingTime": [TxType.PERSISTENT],
"LastUpdateTime": [TxType.PERSISTENT], "LastUpdateTime": [TxType.PERSISTENT],
"LastUpdateException": [TxType.PERSISTENT],
"LimitSetSpeed": [TxType.PERSISTENT], "LimitSetSpeed": [TxType.PERSISTENT],
"LimitSetSpeedNeural": [TxType.PERSISTENT], "LimitSetSpeedNeural": [TxType.PERSISTENT],
"LiveParameters": [TxType.PERSISTENT], "LiveParameters": [TxType.PERSISTENT],
@ -108,6 +109,7 @@ keys = {
"Offroad_InvalidTime": [TxType.CLEAR_ON_MANAGER_START], "Offroad_InvalidTime": [TxType.CLEAR_ON_MANAGER_START],
"Offroad_IsTakingSnapshot": [TxType.CLEAR_ON_MANAGER_START], "Offroad_IsTakingSnapshot": [TxType.CLEAR_ON_MANAGER_START],
"Offroad_NeosUpdate": [TxType.CLEAR_ON_MANAGER_START], "Offroad_NeosUpdate": [TxType.CLEAR_ON_MANAGER_START],
"Offroad_UpdateFailed": [TxType.CLEAR_ON_MANAGER_START],
} }

@ -1,6 +1,5 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import os import os
import json
import signal import signal
import subprocess import subprocess
import time import time
@ -8,9 +7,7 @@ from PIL import Image
from common.basedir import BASEDIR from common.basedir import BASEDIR
from common.params import Params from common.params import Params
from selfdrive.camerad.snapshot.visionipc import VisionIPC from selfdrive.camerad.snapshot.visionipc import VisionIPC
from selfdrive.controls.lib.alertmanager import set_offroad_alert
with open(BASEDIR + "/selfdrive/controls/lib/alerts_offroad.json") as json_file:
OFFROAD_ALERTS = json.load(json_file)
def jpeg_write(fn, dat): def jpeg_write(fn, dat):
@ -26,7 +23,7 @@ def snapshot():
return None return None
params.put("IsTakingSnapshot", "1") params.put("IsTakingSnapshot", "1")
params.put("Offroad_IsTakingSnapshot", json.dumps(OFFROAD_ALERTS["Offroad_IsTakingSnapshot"])) set_offroad_alert("Offroad_IsTakingSnapshot", True)
time.sleep(2.0) # Give thermald time to read the param, or if just started give camerad time to start time.sleep(2.0) # Give thermald time to read the param, or if just started give camerad time to start
# Check if camerad is already started # Check if camerad is already started
@ -64,7 +61,7 @@ def snapshot():
proc.communicate() proc.communicate()
params.put("IsTakingSnapshot", "0") params.put("IsTakingSnapshot", "0")
params.delete("Offroad_IsTakingSnapshot") set_offroad_alert("Offroad_IsTakingSnapshot", False)
return ret return ret

@ -1,14 +1,34 @@
import os
import copy
import json
from cereal import car, log from cereal import car, log
from common.basedir import BASEDIR
from common.params import Params
from common.realtime import DT_CTRL from common.realtime import DT_CTRL
from selfdrive.swaglog import cloudlog from selfdrive.swaglog import cloudlog
import copy
AlertSize = log.ControlsState.AlertSize AlertSize = log.ControlsState.AlertSize
AlertStatus = log.ControlsState.AlertStatus AlertStatus = log.ControlsState.AlertStatus
VisualAlert = car.CarControl.HUDControl.VisualAlert VisualAlert = car.CarControl.HUDControl.VisualAlert
AudibleAlert = car.CarControl.HUDControl.AudibleAlert AudibleAlert = car.CarControl.HUDControl.AudibleAlert
with open(os.path.join(BASEDIR, "selfdrive/controls/lib/alerts_offroad.json")) as f:
OFFROAD_ALERTS = json.load(f)
def set_offroad_alert(alert, show_alert, extra_text=None):
  """Store or remove the offroad alert kept under the param key `alert`.

  When `show_alert` is truthy, the entry of OFFROAD_ALERTS named `alert` is
  serialized to JSON and written to Params under that same key. If
  `extra_text` is not None it is appended to the entry's 'text' field on a
  separate dict, so the shared OFFROAD_ALERTS table is left unmodified.
  When `show_alert` is falsy the param is deleted instead.
  """
  if not show_alert:
    Params().delete(alert)
    return

  entry = OFFROAD_ALERTS[alert]
  if extra_text is not None:
    # build a new dict rather than editing the shared definition in place
    entry = {**entry, 'text': entry['text'] + extra_text}
  Params().put(alert, json.dumps(entry))
class AlertManager(): class AlertManager():
def __init__(self): def __init__(self):

@ -16,6 +16,11 @@
"text": "Connect to internet to check for updates. openpilot won't engage until it connects to internet to check for updates.", "text": "Connect to internet to check for updates. openpilot won't engage until it connects to internet to check for updates.",
"severity": 1 "severity": 1
}, },
"Offroad_UpdateFailed": {
"text": "Unable to download updates\n",
"severity": 1,
"_comment": "Append the command and error to the text."
},
"Offroad_PandaFirmwareMismatch": { "Offroad_PandaFirmwareMismatch": {
"text": "Unexpected panda firmware version. System won't start. Reboot your device to reflash panda.", "text": "Unexpected panda firmware version. System won't start. Reboot your device to reflash panda.",
"severity": 1 "severity": 1

@ -1,16 +1,27 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import json
import os import os
import unittest import unittest
import random
from PIL import Image, ImageDraw, ImageFont from PIL import Image, ImageDraw, ImageFont
from cereal import log, car from cereal import log, car
from common.basedir import BASEDIR from common.basedir import BASEDIR
from common.params import Params
from selfdrive.controls.lib.events import Alert, EVENTS from selfdrive.controls.lib.events import Alert, EVENTS
from selfdrive.controls.lib.alertmanager import set_offroad_alert
AlertSize = log.ControlsState.AlertSize AlertSize = log.ControlsState.AlertSize
OFFROAD_ALERTS_PATH = os.path.join(BASEDIR, "selfdrive/controls/lib/alerts_offroad.json")
class TestAlerts(unittest.TestCase): class TestAlerts(unittest.TestCase):
@classmethod
def setUpClass(cls):
  """Parse the offroad alert definitions once and keep them on the class."""
  with open(OFFROAD_ALERTS_PATH) as alerts_file:
    cls.offroad_alerts = json.load(alerts_file)
def test_events_defined(self): def test_events_defined(self):
# Ensure all events in capnp schema are defined in events.py # Ensure all events in capnp schema are defined in events.py
events = car.CarEvent.EventName.schema.enumerants events = car.CarEvent.EventName.schema.enumerants
@ -60,6 +71,29 @@ class TestAlerts(unittest.TestCase):
msg = "type: %s msg: %s" % (alert.alert_type, txt) msg = "type: %s msg: %s" % (alert.alert_type, txt)
self.assertLessEqual(w, max_text_width, msg=msg) self.assertLessEqual(w, max_text_width, msg=msg)
def test_offroad_alerts(self):
  """Each defined offroad alert can be set and then cleared via set_offroad_alert."""
  params = Params()
  for name, alert in self.offroad_alerts.items():
    # setting the alert must store exactly the JSON-serialized definition
    set_offroad_alert(name, True)
    # assertEqual (not assertTrue(a == b)) so a failure shows both values
    self.assertEqual(json.dumps(alert), params.get(name, encoding='utf8'))

    # clearing the alert must remove the param entirely
    set_offroad_alert(name, False)
    self.assertIsNone(params.get(name))
def test_offroad_alerts_extra_text(self):
  """Extra text passed to set_offroad_alert is appended to the stored alert text."""
  params = Params()
  alert_names = list(self.offroad_alerts)  # loop-invariant, build once
  for i in range(50):
    # pick an alert and set it with i characters of extra text
    name = random.choice(alert_names)
    alert = self.offroad_alerts[name]
    extra = "a" * i
    set_offroad_alert(name, True, extra_text=extra)
    try:
      written_txt = json.loads(params.get(name, encoding='utf8'))['text']
      # assertEqual so a mismatch reports the expected and written text
      self.assertEqual(alert['text'] + extra, written_txt)
    finally:
      # clear the alert again so the test leaves no alert params behind,
      # matching what test_offroad_alerts does
      set_offroad_alert(name, False)
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()

@ -1,20 +1,18 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import os import os
import json
import copy
import datetime import datetime
import psutil import psutil
from smbus2 import SMBus from smbus2 import SMBus
from cereal import log from cereal import log
from common.android import ANDROID, get_network_type, get_network_strength from common.android import ANDROID, get_network_type, get_network_strength
from common.basedir import BASEDIR
from common.params import Params, put_nonblocking from common.params import Params, put_nonblocking
from common.realtime import sec_since_boot, DT_TRML from common.realtime import sec_since_boot, DT_TRML
from common.numpy_fast import clip, interp from common.numpy_fast import clip, interp
from common.filter_simple import FirstOrderFilter from common.filter_simple import FirstOrderFilter
from selfdrive.version import terms_version, training_version from selfdrive.version import terms_version, training_version, get_git_branch
from selfdrive.swaglog import cloudlog from selfdrive.swaglog import cloudlog
import cereal.messaging as messaging import cereal.messaging as messaging
from selfdrive.controls.lib.alertmanager import set_offroad_alert
from selfdrive.loggerd.config import get_available_percent from selfdrive.loggerd.config import get_available_percent
from selfdrive.pandad import get_expected_signature from selfdrive.pandad import get_expected_signature
from selfdrive.thermald.power_monitoring import PowerMonitoring, get_battery_capacity, get_battery_status, \ from selfdrive.thermald.power_monitoring import PowerMonitoring, get_battery_capacity, get_battery_status, \
@ -35,10 +33,6 @@ LEON = False
last_eon_fan_val = None last_eon_fan_val = None
with open(BASEDIR + "/selfdrive/controls/lib/alerts_offroad.json") as json_file:
OFFROAD_ALERTS = json.load(json_file)
def read_tz(x, clip=True): def read_tz(x, clip=True):
if not ANDROID: if not ANDROID:
# we don't monitor thermal on PC # we don't monitor thermal on PC
@ -171,6 +165,7 @@ def thermald_thread():
thermal_status_prev = ThermalStatus.green thermal_status_prev = ThermalStatus.green
usb_power = True usb_power = True
usb_power_prev = True usb_power_prev = True
current_branch = get_git_branch()
network_type = NetworkType.none network_type = NetworkType.none
network_strength = NetworkStrength.unknown network_strength = NetworkStrength.unknown
@ -179,7 +174,7 @@ def thermald_thread():
cpu_temp_filter = FirstOrderFilter(0., CPU_TEMP_TAU, DT_TRML) cpu_temp_filter = FirstOrderFilter(0., CPU_TEMP_TAU, DT_TRML)
health_prev = None health_prev = None
fw_version_match_prev = True fw_version_match_prev = True
current_connectivity_alert = None current_update_alert = None
time_valid_prev = True time_valid_prev = True
should_start_prev = False should_start_prev = False
handle_fan = None handle_fan = None
@ -300,9 +295,9 @@ def thermald_thread():
# show invalid date/time alert # show invalid date/time alert
time_valid = now.year >= 2019 time_valid = now.year >= 2019
if time_valid and not time_valid_prev: if time_valid and not time_valid_prev:
params.delete("Offroad_InvalidTime") set_offroad_alert("Offroad_InvalidTime", False)
if not time_valid and time_valid_prev: if not time_valid and time_valid_prev:
put_nonblocking("Offroad_InvalidTime", json.dumps(OFFROAD_ALERTS["Offroad_InvalidTime"])) set_offroad_alert("Offroad_InvalidTime", True)
time_valid_prev = time_valid time_valid_prev = time_valid
# Show update prompt # Show update prompt
@ -314,24 +309,37 @@ def thermald_thread():
update_failed_count = params.get("UpdateFailedCount") update_failed_count = params.get("UpdateFailedCount")
update_failed_count = 0 if update_failed_count is None else int(update_failed_count) update_failed_count = 0 if update_failed_count is None else int(update_failed_count)
last_update_exception = params.get("LastUpdateException", encoding='utf8')
if dt.days > DAYS_NO_CONNECTIVITY_MAX and update_failed_count > 1: if update_failed_count > 15 and last_update_exception is not None:
if current_connectivity_alert != "expired": if current_branch in ["release2", "dashcam"]:
current_connectivity_alert = "expired" extra_text = "Ensure the software is correctly installed"
params.delete("Offroad_ConnectivityNeededPrompt") else:
put_nonblocking("Offroad_ConnectivityNeeded", json.dumps(OFFROAD_ALERTS["Offroad_ConnectivityNeeded"])) extra_text = last_update_exception
if current_update_alert != "update" + extra_text:
current_update_alert = "update" + extra_text
set_offroad_alert("Offroad_ConnectivityNeeded", False)
set_offroad_alert("Offroad_ConnectivityNeededPrompt", False)
set_offroad_alert("Offroad_UpdateFailed", True, extra_text=extra_text)
elif dt.days > DAYS_NO_CONNECTIVITY_MAX and update_failed_count > 1:
if current_update_alert != "expired":
current_update_alert = "expired"
set_offroad_alert("Offroad_UpdateFailed", False)
set_offroad_alert("Offroad_ConnectivityNeededPrompt", False)
set_offroad_alert("Offroad_ConnectivityNeeded", True)
elif dt.days > DAYS_NO_CONNECTIVITY_PROMPT: elif dt.days > DAYS_NO_CONNECTIVITY_PROMPT:
remaining_time = str(max(DAYS_NO_CONNECTIVITY_MAX - dt.days, 0)) remaining_time = str(max(DAYS_NO_CONNECTIVITY_MAX - dt.days, 0))
if current_connectivity_alert != "prompt" + remaining_time: if current_update_alert != "prompt" + remaining_time:
current_connectivity_alert = "prompt" + remaining_time current_update_alert = "prompt" + remaining_time
alert_connectivity_prompt = copy.copy(OFFROAD_ALERTS["Offroad_ConnectivityNeededPrompt"]) set_offroad_alert("Offroad_UpdateFailed", False)
alert_connectivity_prompt["text"] += remaining_time + " days." set_offroad_alert("Offroad_ConnectivityNeeded", False)
params.delete("Offroad_ConnectivityNeeded") set_offroad_alert("Offroad_ConnectivityNeededPrompt", True, extra_text=f"{remaining_time} days.")
put_nonblocking("Offroad_ConnectivityNeededPrompt", json.dumps(alert_connectivity_prompt)) elif current_update_alert is not None:
elif current_connectivity_alert is not None: current_update_alert = None
current_connectivity_alert = None set_offroad_alert("Offroad_UpdateFailed", False)
params.delete("Offroad_ConnectivityNeeded") set_offroad_alert("Offroad_ConnectivityNeeded", False)
params.delete("Offroad_ConnectivityNeededPrompt") set_offroad_alert("Offroad_ConnectivityNeededPrompt", False)
do_uninstall = params.get("DoUninstall") == b"1" do_uninstall = params.get("DoUninstall") == b"1"
accepted_terms = params.get("HasAcceptedTerms") == terms_version accepted_terms = params.get("HasAcceptedTerms") == terms_version
@ -361,19 +369,19 @@ def thermald_thread():
should_start = should_start and (not is_taking_snapshot) and (not is_viewing_driver) should_start = should_start and (not is_taking_snapshot) and (not is_viewing_driver)
if fw_version_match and not fw_version_match_prev: if fw_version_match and not fw_version_match_prev:
params.delete("Offroad_PandaFirmwareMismatch") set_offroad_alert("Offroad_PandaFirmwareMismatch", False)
if not fw_version_match and fw_version_match_prev: if not fw_version_match and fw_version_match_prev:
put_nonblocking("Offroad_PandaFirmwareMismatch", json.dumps(OFFROAD_ALERTS["Offroad_PandaFirmwareMismatch"])) set_offroad_alert("Offroad_PandaFirmwareMismatch", True)
# if any CPU gets above 107 or the battery gets above 63, kill all processes # if any CPU gets above 107 or the battery gets above 63, kill all processes
# controls will warn with CPU above 95 or battery above 60 # controls will warn with CPU above 95 or battery above 60
if thermal_status >= ThermalStatus.danger: if thermal_status >= ThermalStatus.danger:
should_start = False should_start = False
if thermal_status_prev < ThermalStatus.danger: if thermal_status_prev < ThermalStatus.danger:
put_nonblocking("Offroad_TemperatureTooHigh", json.dumps(OFFROAD_ALERTS["Offroad_TemperatureTooHigh"])) set_offroad_alert("Offroad_TemperatureTooHigh", True)
else: else:
if thermal_status_prev >= ThermalStatus.danger: if thermal_status_prev >= ThermalStatus.danger:
params.delete("Offroad_TemperatureTooHigh") set_offroad_alert("Offroad_TemperatureTooHigh", False)
if should_start: if should_start:
if not should_start_prev: if not should_start_prev:
@ -411,9 +419,9 @@ def thermald_thread():
thermal_sock.send(msg.to_bytes()) thermal_sock.send(msg.to_bytes())
if usb_power_prev and not usb_power: if usb_power_prev and not usb_power:
put_nonblocking("Offroad_ChargeDisabled", json.dumps(OFFROAD_ALERTS["Offroad_ChargeDisabled"])) set_offroad_alert("Offroad_ChargeDisabled", True)
elif usb_power and not usb_power_prev: elif usb_power and not usb_power_prev:
params.delete("Offroad_ChargeDisabled") set_offroad_alert("Offroad_ChargeDisabled", False)
thermal_status_prev = thermal_status thermal_status_prev = thermal_status
usb_power_prev = usb_power usb_power_prev = usb_power

@ -254,7 +254,6 @@ def main():
update_failed_count = 0 update_failed_count = 0
overlay_initialized = False overlay_initialized = False
while not wait_helper.shutdown: while not wait_helper.shutdown:
update_failed_count += 1
wait_helper.ready_event.clear() wait_helper.ready_event.clear()
# Check for internet every 30s # Check for internet every 30s
@ -265,6 +264,8 @@ def main():
continue continue
# Attempt an update # Attempt an update
exception = None
update_failed_count += 1
try: try:
# Re-create the overlay if BASEDIR/.git has changed since we created the overlay # Re-create the overlay if BASEDIR/.git has changed since we created the overlay
if overlay_initialized: if overlay_initialized:
@ -293,11 +294,17 @@ def main():
output=e.output, output=e.output,
returncode=e.returncode returncode=e.returncode
) )
exception = e
overlay_initialized = False overlay_initialized = False
except Exception: except Exception as e:
cloudlog.exception("uncaught updated exception, shouldn't happen") cloudlog.exception("uncaught updated exception, shouldn't happen")
exception = e
params.put("UpdateFailedCount", str(update_failed_count)) params.put("UpdateFailedCount", str(update_failed_count))
if exception is None:
params.delete("LastUpdateException")
else:
params.put("LastUpdateException", f"command failed: {exception.cmd}\n{exception.output}")
# Wait 10 minutes between update attempts # Wait 10 minutes between update attempts
wait_helper.sleep(60*10) wait_helper.sleep(60*10)

Loading…
Cancel
Save