Alert when updated consistently fails (#2013)

* alert when update fails more than 10 times

* bring over offroad alert refactor from other branch

* and we have tests

* use it in snapshot

* bump apk

* don't show exceptions on release branches

* only write when changed

* why does delete use so much cpu

* clean that up

* little more
pull/2014/head^2
Adeeb Shihadeh 5 years ago committed by GitHub
parent d946638fce
commit 8e63f06540
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 4
      apk/ai.comma.plus.offroad.apk
  2. 2
      common/params.py
  3. 9
      selfdrive/camerad/snapshot/snapshot.py
  4. 24
      selfdrive/controls/lib/alertmanager.py
  5. 5
      selfdrive/controls/lib/alerts_offroad.json
  6. 34
      selfdrive/controls/tests/test_alerts.py
  7. 72
      selfdrive/thermald/thermald.py
  8. 11
      selfdrive/updated.py

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:64e31350a4138675cb39703d070fec787c011fcbdee3fbae1cbbb21ce4ceb6be
size 13701989
oid sha256:a198491887ed6029bffdf7f4dc28c4f9a6ba5f9d2235710fc11a1378893491d7
size 13702777

@ -80,6 +80,7 @@ keys = {
"IsUploadRawEnabled": [TxType.PERSISTENT],
"LastAthenaPingTime": [TxType.PERSISTENT],
"LastUpdateTime": [TxType.PERSISTENT],
"LastUpdateException": [TxType.PERSISTENT],
"LimitSetSpeed": [TxType.PERSISTENT],
"LimitSetSpeedNeural": [TxType.PERSISTENT],
"LiveParameters": [TxType.PERSISTENT],
@ -108,6 +109,7 @@ keys = {
"Offroad_InvalidTime": [TxType.CLEAR_ON_MANAGER_START],
"Offroad_IsTakingSnapshot": [TxType.CLEAR_ON_MANAGER_START],
"Offroad_NeosUpdate": [TxType.CLEAR_ON_MANAGER_START],
"Offroad_UpdateFailed": [TxType.CLEAR_ON_MANAGER_START],
}

@ -1,6 +1,5 @@
#!/usr/bin/env python3
import os
import json
import signal
import subprocess
import time
@ -8,9 +7,7 @@ from PIL import Image
from common.basedir import BASEDIR
from common.params import Params
from selfdrive.camerad.snapshot.visionipc import VisionIPC
with open(BASEDIR + "/selfdrive/controls/lib/alerts_offroad.json") as json_file:
OFFROAD_ALERTS = json.load(json_file)
from selfdrive.controls.lib.alertmanager import set_offroad_alert
def jpeg_write(fn, dat):
@ -26,7 +23,7 @@ def snapshot():
return None
params.put("IsTakingSnapshot", "1")
params.put("Offroad_IsTakingSnapshot", json.dumps(OFFROAD_ALERTS["Offroad_IsTakingSnapshot"]))
set_offroad_alert("Offroad_IsTakingSnapshot", True)
time.sleep(2.0) # Give thermald time to read the param, or if just started give camerad time to start
# Check if camerad is already started
@ -64,7 +61,7 @@ def snapshot():
proc.communicate()
params.put("IsTakingSnapshot", "0")
params.delete("Offroad_IsTakingSnapshot")
set_offroad_alert("Offroad_IsTakingSnapshot", False)
return ret

@ -1,14 +1,34 @@
import os
import copy
import json
from cereal import car, log
from common.basedir import BASEDIR
from common.params import Params
from common.realtime import DT_CTRL
from selfdrive.swaglog import cloudlog
import copy
AlertSize = log.ControlsState.AlertSize
AlertStatus = log.ControlsState.AlertStatus
VisualAlert = car.CarControl.HUDControl.VisualAlert
AudibleAlert = car.CarControl.HUDControl.AudibleAlert
with open(os.path.join(BASEDIR, "selfdrive/controls/lib/alerts_offroad.json")) as f:
OFFROAD_ALERTS = json.load(f)
def set_offroad_alert(alert, show_alert, extra_text=None):
if show_alert:
a = OFFROAD_ALERTS[alert]
if extra_text is not None:
a = copy.copy(OFFROAD_ALERTS[alert])
a['text'] += extra_text
Params().put(alert, json.dumps(a))
else:
Params().delete(alert)
class AlertManager():
def __init__(self):

@ -16,6 +16,11 @@
"text": "Connect to internet to check for updates. openpilot won't engage until it connects to internet to check for updates.",
"severity": 1
},
"Offroad_UpdateFailed": {
"text": "Unable to download updates\n",
"severity": 1,
"_comment": "Append the command and error to the text."
},
"Offroad_PandaFirmwareMismatch": {
"text": "Unexpected panda firmware version. System won't start. Reboot your device to reflash panda.",
"severity": 1

@ -1,16 +1,27 @@
#!/usr/bin/env python3
import json
import os
import unittest
import random
from PIL import Image, ImageDraw, ImageFont
from cereal import log, car
from common.basedir import BASEDIR
from common.params import Params
from selfdrive.controls.lib.events import Alert, EVENTS
from selfdrive.controls.lib.alertmanager import set_offroad_alert
AlertSize = log.ControlsState.AlertSize
OFFROAD_ALERTS_PATH = os.path.join(BASEDIR, "selfdrive/controls/lib/alerts_offroad.json")
class TestAlerts(unittest.TestCase):
@classmethod
def setUpClass(cls):
with open(OFFROAD_ALERTS_PATH) as f:
cls.offroad_alerts = json.loads(f.read())
def test_events_defined(self):
# Ensure all events in capnp schema are defined in events.py
events = car.CarEvent.EventName.schema.enumerants
@ -60,6 +71,29 @@ class TestAlerts(unittest.TestCase):
msg = "type: %s msg: %s" % (alert.alert_type, txt)
self.assertLessEqual(w, max_text_width, msg=msg)
def test_offroad_alerts(self):
params = Params()
for a in self.offroad_alerts:
# set the alert
alert = self.offroad_alerts[a]
set_offroad_alert(a, True)
self.assertTrue(json.dumps(alert) == params.get(a, encoding='utf8'))
# then delete it
set_offroad_alert(a, False)
self.assertTrue(params.get(a) is None)
def test_offroad_alerts_extra_text(self):
params = Params()
for i in range(50):
# set the alert
a = random.choice(list(self.offroad_alerts))
alert = self.offroad_alerts[a]
set_offroad_alert(a, True, extra_text="a"*i)
expected_txt = alert['text'] + "a"*i
written_txt = json.loads(params.get(a, encoding='utf8'))['text']
self.assertTrue(expected_txt == written_txt)
if __name__ == "__main__":
unittest.main()

@ -1,20 +1,18 @@
#!/usr/bin/env python3
import os
import json
import copy
import datetime
import psutil
from smbus2 import SMBus
from cereal import log
from common.android import ANDROID, get_network_type, get_network_strength
from common.basedir import BASEDIR
from common.params import Params, put_nonblocking
from common.realtime import sec_since_boot, DT_TRML
from common.numpy_fast import clip, interp
from common.filter_simple import FirstOrderFilter
from selfdrive.version import terms_version, training_version
from selfdrive.version import terms_version, training_version, get_git_branch
from selfdrive.swaglog import cloudlog
import cereal.messaging as messaging
from selfdrive.controls.lib.alertmanager import set_offroad_alert
from selfdrive.loggerd.config import get_available_percent
from selfdrive.pandad import get_expected_signature
from selfdrive.thermald.power_monitoring import PowerMonitoring, get_battery_capacity, get_battery_status, \
@ -35,10 +33,6 @@ LEON = False
last_eon_fan_val = None
with open(BASEDIR + "/selfdrive/controls/lib/alerts_offroad.json") as json_file:
OFFROAD_ALERTS = json.load(json_file)
def read_tz(x, clip=True):
if not ANDROID:
# we don't monitor thermal on PC
@ -171,6 +165,7 @@ def thermald_thread():
thermal_status_prev = ThermalStatus.green
usb_power = True
usb_power_prev = True
current_branch = get_git_branch()
network_type = NetworkType.none
network_strength = NetworkStrength.unknown
@ -179,7 +174,7 @@ def thermald_thread():
cpu_temp_filter = FirstOrderFilter(0., CPU_TEMP_TAU, DT_TRML)
health_prev = None
fw_version_match_prev = True
current_connectivity_alert = None
current_update_alert = None
time_valid_prev = True
should_start_prev = False
handle_fan = None
@ -300,9 +295,9 @@ def thermald_thread():
# show invalid date/time alert
time_valid = now.year >= 2019
if time_valid and not time_valid_prev:
params.delete("Offroad_InvalidTime")
set_offroad_alert("Offroad_InvalidTime", False)
if not time_valid and time_valid_prev:
put_nonblocking("Offroad_InvalidTime", json.dumps(OFFROAD_ALERTS["Offroad_InvalidTime"]))
set_offroad_alert("Offroad_InvalidTime", True)
time_valid_prev = time_valid
# Show update prompt
@ -314,24 +309,37 @@ def thermald_thread():
update_failed_count = params.get("UpdateFailedCount")
update_failed_count = 0 if update_failed_count is None else int(update_failed_count)
last_update_exception = params.get("LastUpdateException", encoding='utf8')
if dt.days > DAYS_NO_CONNECTIVITY_MAX and update_failed_count > 1:
if current_connectivity_alert != "expired":
current_connectivity_alert = "expired"
params.delete("Offroad_ConnectivityNeededPrompt")
put_nonblocking("Offroad_ConnectivityNeeded", json.dumps(OFFROAD_ALERTS["Offroad_ConnectivityNeeded"]))
if update_failed_count > 15 and last_update_exception is not None:
if current_branch in ["release2", "dashcam"]:
extra_text = "Ensure the software is correctly installed"
else:
extra_text = last_update_exception
if current_update_alert != "update" + extra_text:
current_update_alert = "update" + extra_text
set_offroad_alert("Offroad_ConnectivityNeeded", False)
set_offroad_alert("Offroad_ConnectivityNeededPrompt", False)
set_offroad_alert("Offroad_UpdateFailed", True, extra_text=extra_text)
elif dt.days > DAYS_NO_CONNECTIVITY_MAX and update_failed_count > 1:
if current_update_alert != "expired":
current_update_alert = "expired"
set_offroad_alert("Offroad_UpdateFailed", False)
set_offroad_alert("Offroad_ConnectivityNeededPrompt", False)
set_offroad_alert("Offroad_ConnectivityNeeded", True)
elif dt.days > DAYS_NO_CONNECTIVITY_PROMPT:
remaining_time = str(max(DAYS_NO_CONNECTIVITY_MAX - dt.days, 0))
if current_connectivity_alert != "prompt" + remaining_time:
current_connectivity_alert = "prompt" + remaining_time
alert_connectivity_prompt = copy.copy(OFFROAD_ALERTS["Offroad_ConnectivityNeededPrompt"])
alert_connectivity_prompt["text"] += remaining_time + " days."
params.delete("Offroad_ConnectivityNeeded")
put_nonblocking("Offroad_ConnectivityNeededPrompt", json.dumps(alert_connectivity_prompt))
elif current_connectivity_alert is not None:
current_connectivity_alert = None
params.delete("Offroad_ConnectivityNeeded")
params.delete("Offroad_ConnectivityNeededPrompt")
if current_update_alert != "prompt" + remaining_time:
current_update_alert = "prompt" + remaining_time
set_offroad_alert("Offroad_UpdateFailed", False)
set_offroad_alert("Offroad_ConnectivityNeeded", False)
set_offroad_alert("Offroad_ConnectivityNeededPrompt", True, extra_text=f"{remaining_time} days.")
elif current_update_alert is not None:
current_update_alert = None
set_offroad_alert("Offroad_UpdateFailed", False)
set_offroad_alert("Offroad_ConnectivityNeeded", False)
set_offroad_alert("Offroad_ConnectivityNeededPrompt", False)
do_uninstall = params.get("DoUninstall") == b"1"
accepted_terms = params.get("HasAcceptedTerms") == terms_version
@ -361,19 +369,19 @@ def thermald_thread():
should_start = should_start and (not is_taking_snapshot) and (not is_viewing_driver)
if fw_version_match and not fw_version_match_prev:
params.delete("Offroad_PandaFirmwareMismatch")
set_offroad_alert("Offroad_PandaFirmwareMismatch", False)
if not fw_version_match and fw_version_match_prev:
put_nonblocking("Offroad_PandaFirmwareMismatch", json.dumps(OFFROAD_ALERTS["Offroad_PandaFirmwareMismatch"]))
set_offroad_alert("Offroad_PandaFirmwareMismatch", True)
# if any CPU gets above 107 or the battery gets above 63, kill all processes
# controls will warn with CPU above 95 or battery above 60
if thermal_status >= ThermalStatus.danger:
should_start = False
if thermal_status_prev < ThermalStatus.danger:
put_nonblocking("Offroad_TemperatureTooHigh", json.dumps(OFFROAD_ALERTS["Offroad_TemperatureTooHigh"]))
set_offroad_alert("Offroad_TemperatureTooHigh", True)
else:
if thermal_status_prev >= ThermalStatus.danger:
params.delete("Offroad_TemperatureTooHigh")
set_offroad_alert("Offroad_TemperatureTooHigh", False)
if should_start:
if not should_start_prev:
@ -411,9 +419,9 @@ def thermald_thread():
thermal_sock.send(msg.to_bytes())
if usb_power_prev and not usb_power:
put_nonblocking("Offroad_ChargeDisabled", json.dumps(OFFROAD_ALERTS["Offroad_ChargeDisabled"]))
set_offroad_alert("Offroad_ChargeDisabled", True)
elif usb_power and not usb_power_prev:
params.delete("Offroad_ChargeDisabled")
set_offroad_alert("Offroad_ChargeDisabled", False)
thermal_status_prev = thermal_status
usb_power_prev = usb_power

@ -254,7 +254,6 @@ def main():
update_failed_count = 0
overlay_initialized = False
while not wait_helper.shutdown:
update_failed_count += 1
wait_helper.ready_event.clear()
# Check for internet every 30s
@ -265,6 +264,8 @@ def main():
continue
# Attempt an update
exception = None
update_failed_count += 1
try:
# Re-create the overlay if BASEDIR/.git has changed since we created the overlay
if overlay_initialized:
@ -293,11 +294,17 @@ def main():
output=e.output,
returncode=e.returncode
)
exception = e
overlay_initialized = False
except Exception:
except Exception as e:
cloudlog.exception("uncaught updated exception, shouldn't happen")
exception = e
params.put("UpdateFailedCount", str(update_failed_count))
if exception is None:
params.delete("LastUpdateException")
else:
params.put("LastUpdateException", f"command failed: {exception.cmd}\n{exception.output}")
# Wait 10 minutes between update attempts
wait_helper.sleep(60*10)

Loading…
Cancel
Save