From 017b084154ad0e3e00d77630fb21008ddc2d8684 Mon Sep 17 00:00:00 2001 From: Adeeb Shihadeh Date: Tue, 11 Aug 2020 16:23:57 -0700 Subject: [PATCH] Alert when updated consistently fails (#2013) * alert when update fails more than 10 times * bring over offroad alert refactor from other branch * and we have tests * use it in snapshot * bump apk * don't show exceptions on release branches * only write when changed * why does delete use so much cpu * clean that up * little more old-commit-hash: 8e63f065400ea1868f077560ae89b2c17f5523f8 --- apk/ai.comma.plus.offroad.apk | 4 +- common/params.py | 2 + selfdrive/camerad/snapshot/snapshot.py | 9 +-- selfdrive/controls/lib/alertmanager.py | 24 ++++++- selfdrive/controls/lib/alerts_offroad.json | 5 ++ .../tests/{test_events.py => test_alerts.py} | 34 +++++++++ selfdrive/thermald/thermald.py | 72 ++++++++++--------- selfdrive/updated.py | 11 ++- 8 files changed, 117 insertions(+), 44 deletions(-) rename selfdrive/controls/tests/{test_events.py => test_alerts.py} (67%) diff --git a/apk/ai.comma.plus.offroad.apk b/apk/ai.comma.plus.offroad.apk index 414e4cf5c1..75f3b219b6 100644 --- a/apk/ai.comma.plus.offroad.apk +++ b/apk/ai.comma.plus.offroad.apk @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:64e31350a4138675cb39703d070fec787c011fcbdee3fbae1cbbb21ce4ceb6be -size 13701989 +oid sha256:a198491887ed6029bffdf7f4dc28c4f9a6ba5f9d2235710fc11a1378893491d7 +size 13702777 diff --git a/common/params.py b/common/params.py index 985390faa9..e106705022 100755 --- a/common/params.py +++ b/common/params.py @@ -80,6 +80,7 @@ keys = { "IsUploadRawEnabled": [TxType.PERSISTENT], "LastAthenaPingTime": [TxType.PERSISTENT], "LastUpdateTime": [TxType.PERSISTENT], + "LastUpdateException": [TxType.PERSISTENT], "LimitSetSpeed": [TxType.PERSISTENT], "LimitSetSpeedNeural": [TxType.PERSISTENT], "LiveParameters": [TxType.PERSISTENT], @@ -108,6 +109,7 @@ keys = { "Offroad_InvalidTime": [TxType.CLEAR_ON_MANAGER_START], "Offroad_IsTakingSnapshot": [TxType.CLEAR_ON_MANAGER_START], "Offroad_NeosUpdate": [TxType.CLEAR_ON_MANAGER_START], + "Offroad_UpdateFailed": [TxType.CLEAR_ON_MANAGER_START], } diff --git a/selfdrive/camerad/snapshot/snapshot.py b/selfdrive/camerad/snapshot/snapshot.py index 4ba10bf4be..317618c0b2 100755 --- a/selfdrive/camerad/snapshot/snapshot.py +++ b/selfdrive/camerad/snapshot/snapshot.py @@ -1,6 +1,5 @@ #!/usr/bin/env python3 import os -import json import signal import subprocess import time @@ -8,9 +7,7 @@ from PIL import Image from common.basedir import BASEDIR from common.params import Params from selfdrive.camerad.snapshot.visionipc import VisionIPC - -with open(BASEDIR + "/selfdrive/controls/lib/alerts_offroad.json") as json_file: - OFFROAD_ALERTS = json.load(json_file) +from selfdrive.controls.lib.alertmanager import set_offroad_alert def jpeg_write(fn, dat): @@ -26,7 +23,7 @@ def snapshot(): return None params.put("IsTakingSnapshot", "1") - params.put("Offroad_IsTakingSnapshot", json.dumps(OFFROAD_ALERTS["Offroad_IsTakingSnapshot"])) + set_offroad_alert("Offroad_IsTakingSnapshot", True) time.sleep(2.0) # Give thermald time to read the param, or if just started give camerad time to start # Check if camerad is already started @@ -64,7 +61,7 @@ def snapshot(): proc.communicate() params.put("IsTakingSnapshot", "0") - params.delete("Offroad_IsTakingSnapshot") + set_offroad_alert("Offroad_IsTakingSnapshot", False) return ret diff --git a/selfdrive/controls/lib/alertmanager.py b/selfdrive/controls/lib/alertmanager.py index 42daff420f..870d8d4b7e 100644 --- a/selfdrive/controls/lib/alertmanager.py +++ b/selfdrive/controls/lib/alertmanager.py @@ -1,14 +1,34 @@ +import os +import copy +import json + from cereal import car, log +from common.basedir import BASEDIR +from common.params import Params from common.realtime import DT_CTRL from selfdrive.swaglog import cloudlog -import copy - AlertSize = log.ControlsState.AlertSize AlertStatus = log.ControlsState.AlertStatus VisualAlert = car.CarControl.HUDControl.VisualAlert AudibleAlert = car.CarControl.HUDControl.AudibleAlert + +with open(os.path.join(BASEDIR, "selfdrive/controls/lib/alerts_offroad.json")) as f: + OFFROAD_ALERTS = json.load(f) + + +def set_offroad_alert(alert, show_alert, extra_text=None): + if show_alert: + a = OFFROAD_ALERTS[alert] + if extra_text is not None: + a = copy.copy(OFFROAD_ALERTS[alert]) + a['text'] += extra_text + Params().put(alert, json.dumps(a)) + else: + Params().delete(alert) + + class AlertManager(): def __init__(self): diff --git a/selfdrive/controls/lib/alerts_offroad.json b/selfdrive/controls/lib/alerts_offroad.json index cebb1a2c26..7cf5d8229b 100644 --- a/selfdrive/controls/lib/alerts_offroad.json +++ b/selfdrive/controls/lib/alerts_offroad.json @@ -16,6 +16,11 @@ "text": "Connect to internet to check for updates. openpilot won't engage until it connects to internet to check for updates.", "severity": 1 }, + "Offroad_UpdateFailed": { + "text": "Unable to download updates\n", + "severity": 1, + "_comment": "Append the command and error to the text." + }, "Offroad_PandaFirmwareMismatch": { "text": "Unexpected panda firmware version. System won't start. Reboot your device to reflash panda.", "severity": 1 diff --git a/selfdrive/controls/tests/test_events.py b/selfdrive/controls/tests/test_alerts.py similarity index 67% rename from selfdrive/controls/tests/test_events.py rename to selfdrive/controls/tests/test_alerts.py index ef41eea91e..5f3a962cfa 100755 --- a/selfdrive/controls/tests/test_events.py +++ b/selfdrive/controls/tests/test_alerts.py @@ -1,16 +1,27 @@ #!/usr/bin/env python3 +import json import os import unittest +import random from PIL import Image, ImageDraw, ImageFont from cereal import log, car from common.basedir import BASEDIR +from common.params import Params from selfdrive.controls.lib.events import Alert, EVENTS +from selfdrive.controls.lib.alertmanager import set_offroad_alert AlertSize = log.ControlsState.AlertSize +OFFROAD_ALERTS_PATH = os.path.join(BASEDIR, "selfdrive/controls/lib/alerts_offroad.json") class TestAlerts(unittest.TestCase): + + @classmethod + def setUpClass(cls): + with open(OFFROAD_ALERTS_PATH) as f: + cls.offroad_alerts = json.loads(f.read()) + def test_events_defined(self): # Ensure all events in capnp schema are defined in events.py events = car.CarEvent.EventName.schema.enumerants @@ -60,6 +71,29 @@ class TestAlerts(unittest.TestCase): msg = "type: %s msg: %s" % (alert.alert_type, txt) self.assertLessEqual(w, max_text_width, msg=msg) + def test_offroad_alerts(self): + params = Params() + for a in self.offroad_alerts: + # set the alert + alert = self.offroad_alerts[a] + set_offroad_alert(a, True) + self.assertTrue(json.dumps(alert) == params.get(a, encoding='utf8')) + + # then delete it + set_offroad_alert(a, False) + self.assertTrue(params.get(a) is None) + + def test_offroad_alerts_extra_text(self): + params = Params() + for i in range(50): + # set the alert + a = random.choice(list(self.offroad_alerts)) + alert = self.offroad_alerts[a] + set_offroad_alert(a, True, extra_text="a"*i) + + expected_txt = alert['text'] + "a"*i + written_txt = json.loads(params.get(a, encoding='utf8'))['text'] + self.assertTrue(expected_txt == written_txt) if __name__ == "__main__": unittest.main() diff --git a/selfdrive/thermald/thermald.py b/selfdrive/thermald/thermald.py index 65587ca9c9..db944bdd67 100755 --- a/selfdrive/thermald/thermald.py +++ b/selfdrive/thermald/thermald.py @@ -1,20 +1,18 @@ #!/usr/bin/env python3 import os -import json -import copy import datetime import psutil from smbus2 import SMBus from cereal import log from common.android import ANDROID, get_network_type, get_network_strength -from common.basedir import BASEDIR from common.params import Params, put_nonblocking from common.realtime import sec_since_boot, DT_TRML from common.numpy_fast import clip, interp from common.filter_simple import FirstOrderFilter -from selfdrive.version import terms_version, training_version +from selfdrive.version import terms_version, training_version, get_git_branch from selfdrive.swaglog import cloudlog import cereal.messaging as messaging +from selfdrive.controls.lib.alertmanager import set_offroad_alert from selfdrive.loggerd.config import get_available_percent from selfdrive.pandad import get_expected_signature from selfdrive.thermald.power_monitoring import PowerMonitoring, get_battery_capacity, get_battery_status, \ @@ -35,10 +33,6 @@ LEON = False last_eon_fan_val = None -with open(BASEDIR + "/selfdrive/controls/lib/alerts_offroad.json") as json_file: - OFFROAD_ALERTS = json.load(json_file) - - def read_tz(x, clip=True): if not ANDROID: # we don't monitor thermal on PC @@ -171,6 +165,7 @@ def thermald_thread(): thermal_status_prev = ThermalStatus.green usb_power = True usb_power_prev = True + current_branch = get_git_branch() network_type = NetworkType.none network_strength = NetworkStrength.unknown @@ -179,7 +174,7 @@ def thermald_thread(): cpu_temp_filter = FirstOrderFilter(0., CPU_TEMP_TAU, DT_TRML) health_prev = None fw_version_match_prev = True - current_connectivity_alert = None + current_update_alert = None time_valid_prev = True should_start_prev = False handle_fan = None @@ -300,9 +295,9 @@ def thermald_thread(): # show invalid date/time alert time_valid = now.year >= 2019 if time_valid and not time_valid_prev: - params.delete("Offroad_InvalidTime") + set_offroad_alert("Offroad_InvalidTime", False) if not time_valid and time_valid_prev: - put_nonblocking("Offroad_InvalidTime", json.dumps(OFFROAD_ALERTS["Offroad_InvalidTime"])) + set_offroad_alert("Offroad_InvalidTime", True) time_valid_prev = time_valid # Show update prompt @@ -314,24 +309,37 @@ def thermald_thread(): update_failed_count = params.get("UpdateFailedCount") update_failed_count = 0 if update_failed_count is None else int(update_failed_count) + last_update_exception = params.get("LastUpdateException", encoding='utf8') - if dt.days > DAYS_NO_CONNECTIVITY_MAX and update_failed_count > 1: - if current_connectivity_alert != "expired": - current_connectivity_alert = "expired" - params.delete("Offroad_ConnectivityNeededPrompt") - put_nonblocking("Offroad_ConnectivityNeeded", json.dumps(OFFROAD_ALERTS["Offroad_ConnectivityNeeded"])) + if update_failed_count > 15 and last_update_exception is not None: + if current_branch in ["release2", "dashcam"]: + extra_text = "Ensure the software is correctly installed" + else: + extra_text = last_update_exception + + if current_update_alert != "update" + extra_text: + current_update_alert = "update" + extra_text + set_offroad_alert("Offroad_ConnectivityNeeded", False) + set_offroad_alert("Offroad_ConnectivityNeededPrompt", False) + set_offroad_alert("Offroad_UpdateFailed", True, extra_text=extra_text) + elif dt.days > DAYS_NO_CONNECTIVITY_MAX and update_failed_count > 1: + if current_update_alert != "expired": + current_update_alert = "expired" + set_offroad_alert("Offroad_UpdateFailed", False) + set_offroad_alert("Offroad_ConnectivityNeededPrompt", False) + set_offroad_alert("Offroad_ConnectivityNeeded", True) elif dt.days > DAYS_NO_CONNECTIVITY_PROMPT: remaining_time = str(max(DAYS_NO_CONNECTIVITY_MAX - dt.days, 0)) - if current_connectivity_alert != "prompt" + remaining_time: - current_connectivity_alert = "prompt" + remaining_time - alert_connectivity_prompt = copy.copy(OFFROAD_ALERTS["Offroad_ConnectivityNeededPrompt"]) - alert_connectivity_prompt["text"] += remaining_time + " days." - params.delete("Offroad_ConnectivityNeeded") - put_nonblocking("Offroad_ConnectivityNeededPrompt", json.dumps(alert_connectivity_prompt)) - elif current_connectivity_alert is not None: - current_connectivity_alert = None - params.delete("Offroad_ConnectivityNeeded") - params.delete("Offroad_ConnectivityNeededPrompt") + if current_update_alert != "prompt" + remaining_time: + current_update_alert = "prompt" + remaining_time + set_offroad_alert("Offroad_UpdateFailed", False) + set_offroad_alert("Offroad_ConnectivityNeeded", False) + set_offroad_alert("Offroad_ConnectivityNeededPrompt", True, extra_text=f"{remaining_time} days.") + elif current_update_alert is not None: + current_update_alert = None + set_offroad_alert("Offroad_UpdateFailed", False) + set_offroad_alert("Offroad_ConnectivityNeeded", False) + set_offroad_alert("Offroad_ConnectivityNeededPrompt", False) do_uninstall = params.get("DoUninstall") == b"1" accepted_terms = params.get("HasAcceptedTerms") == terms_version @@ -361,19 +369,19 @@ def thermald_thread(): should_start = should_start and (not is_taking_snapshot) and (not is_viewing_driver) if fw_version_match and not fw_version_match_prev: - params.delete("Offroad_PandaFirmwareMismatch") + set_offroad_alert("Offroad_PandaFirmwareMismatch", False) if not fw_version_match and fw_version_match_prev: - put_nonblocking("Offroad_PandaFirmwareMismatch", json.dumps(OFFROAD_ALERTS["Offroad_PandaFirmwareMismatch"])) + set_offroad_alert("Offroad_PandaFirmwareMismatch", True) # if any CPU gets above 107 or the battery gets above 63, kill all processes # controls will warn with CPU above 95 or battery above 60 if thermal_status >= ThermalStatus.danger: should_start = False if thermal_status_prev < ThermalStatus.danger: - put_nonblocking("Offroad_TemperatureTooHigh", json.dumps(OFFROAD_ALERTS["Offroad_TemperatureTooHigh"])) + set_offroad_alert("Offroad_TemperatureTooHigh", True) else: if thermal_status_prev >= ThermalStatus.danger: - params.delete("Offroad_TemperatureTooHigh") + set_offroad_alert("Offroad_TemperatureTooHigh", False) if should_start: if not should_start_prev: @@ -411,9 +419,9 @@ def thermald_thread(): thermal_sock.send(msg.to_bytes()) if usb_power_prev and not usb_power: - put_nonblocking("Offroad_ChargeDisabled", json.dumps(OFFROAD_ALERTS["Offroad_ChargeDisabled"])) + set_offroad_alert("Offroad_ChargeDisabled", True) elif usb_power and not usb_power_prev: - params.delete("Offroad_ChargeDisabled") + set_offroad_alert("Offroad_ChargeDisabled", False) thermal_status_prev = thermal_status usb_power_prev = usb_power diff --git a/selfdrive/updated.py b/selfdrive/updated.py index 59ec7d8802..f93d9855cc 100755 --- a/selfdrive/updated.py +++ b/selfdrive/updated.py @@ -254,7 +254,6 @@ def main(): update_failed_count = 0 overlay_initialized = False while not wait_helper.shutdown: - update_failed_count += 1 wait_helper.ready_event.clear() # Check for internet every 30s @@ -265,6 +264,8 @@ def main(): continue # Attempt an update + exception = None + update_failed_count += 1 try: # Re-create the overlay if BASEDIR/.git has changed since we created the overlay if overlay_initialized: @@ -293,11 +294,17 @@ def main(): output=e.output, returncode=e.returncode ) + exception = e overlay_initialized = False - except Exception: + except Exception as e: cloudlog.exception("uncaught updated exception, shouldn't happen") + exception = e params.put("UpdateFailedCount", str(update_failed_count)) + if exception is None: + params.delete("LastUpdateException") + else: + params.put("LastUpdateException", f"command failed: {exception.cmd}\n{exception.output}") # Wait 10 minutes between update attempts wait_helper.sleep(60*10)