From 722a440eb6102777fb97f6ae8936d852709a66a5 Mon Sep 17 00:00:00 2001 From: Adeeb Shihadeh Date: Tue, 22 Sep 2020 12:09:41 -0700 Subject: [PATCH] Improved updater robustness (#2046) * git ping * cleanup overlay init * separate update available check from fetch * cleanup setting params * only fetch neos update on android * move that * type hints * lightweight update check with git ls-remote * git fetch dry run * cleanup --- selfdrive/updated.py | 261 +++++++++++++++++++++++-------------------- 1 file changed, 142 insertions(+), 119 deletions(-) diff --git a/selfdrive/updated.py b/selfdrive/updated.py index c107944fa7..01083d1076 100755 --- a/selfdrive/updated.py +++ b/selfdrive/updated.py @@ -32,13 +32,14 @@ import fcntl import time import threading from pathlib import Path +from typing import List, Tuple, Optional +from common.hardware import ANDROID from common.basedir import BASEDIR from common.params import Params from selfdrive.swaglog import cloudlog from selfdrive.controls.lib.alertmanager import set_offroad_alert -TEST_IP = os.getenv("UPDATER_TEST_IP", "8.8.8.8") LOCK_FILE = os.getenv("UPDATER_LOCK_FILE", "/tmp/safe_staging_overlay.lock") STAGING_ROOT = os.getenv("UPDATER_STAGING_ROOT", "/data/safe_staging") @@ -60,7 +61,7 @@ class WaitTimeHelper: signal.signal(signal.SIGINT, self.graceful_shutdown) signal.signal(signal.SIGHUP, self.update_now) - def graceful_shutdown(self, signum, frame): + def graceful_shutdown(self, signum: int, frame) -> None: # umount -f doesn't appear effective in avoiding "device busy" on NEOS, # so don't actually die until the next convenient opportunity in main(). cloudlog.info("caught SIGINT/SIGTERM, dismounting overlay at next opportunity") @@ -73,35 +74,42 @@ class WaitTimeHelper: self.shutdown = True self.ready_event.set() - def update_now(self, signum, frame): + def update_now(self, signum: int, frame) -> None: cloudlog.info("caught SIGHUP, running update check immediately") self.ready_event.set() - def sleep(self, t): + def sleep(self, t: float) -> None: self.ready_event.wait(timeout=t) -def run(cmd, cwd=None, low_priority=False): +def run(cmd: List[str], cwd: Optional[str] = None, low_priority: bool = False): if low_priority: cmd = ["nice", "-n", "19"] + cmd return subprocess.check_output(cmd, cwd=cwd, stderr=subprocess.STDOUT, encoding='utf8') -def set_consistent_flag(consistent): - os.system("sync") +def set_consistent_flag(consistent: bool) -> None: + os.sync() consistent_file = Path(os.path.join(FINALIZED, ".overlay_consistent")) if consistent: consistent_file.touch() elif not consistent and consistent_file.exists(): consistent_file.unlink() - os.system("sync") + os.sync() -def set_update_available_params(new_version): +def set_params(new_version: bool, failed_count: int, exception: Optional[str]) -> None: params = Params() - t = datetime.datetime.utcnow().isoformat() - params.put("LastUpdateTime", t.encode('utf8')) + params.put("UpdateFailedCount", str(failed_count)) + if failed_count == 0: + t = datetime.datetime.utcnow().isoformat() + params.put("LastUpdateTime", t.encode('utf8')) + + if exception is None: + params.delete("LastUpdateException") + else: + params.put("LastUpdateException", exception) if new_version: try: @@ -114,13 +122,7 @@ def set_update_available_params(new_version): params.put("UpdateAvailable", "1") -def dismount_ovfs(): - if os.path.ismount(OVERLAY_MERGED): - cloudlog.error("unmounting existing overlay") - run(["umount", "-l", OVERLAY_MERGED]) - - -def setup_git_options(cwd): +def setup_git_options(cwd: str) -> None: # We sync FS object atimes (which NEOS doesn't use) and mtimes, but ctimes # are outside user control. Make sure Git is set up to ignore system ctimes, # because they change when we make hard links during finalize. Otherwise, @@ -134,66 +136,128 @@ def setup_git_options(cwd): ("core.checkStat", "minimal"), ] for option, value in git_cfg: - try: - ret = run(["git", "config", "--get", option], cwd) - config_ok = ret.strip() == value - except subprocess.CalledProcessError: - config_ok = False + run(["git", "config", option, value], cwd) + + +def dismount_overlay() -> None: + if os.path.ismount(OVERLAY_MERGED): + cloudlog.info("unmounting existing overlay") + run(["umount", "-l", OVERLAY_MERGED]) + - if not config_ok: - cloudlog.info(f"Setting git '{option}' to '{value}'") - run(["git", "config", option, value], cwd) +def init_overlay() -> None: + overlay_init_file = Path(os.path.join(BASEDIR, ".overlay_init")) + + # Re-create the overlay if BASEDIR/.git has changed since we created the overlay + if overlay_init_file.is_file(): + git_dir_path = os.path.join(BASEDIR, ".git") + new_files = run(["find", git_dir_path, "-newer", str(overlay_init_file)]) + if not len(new_files.splitlines()): + # A valid overlay already exists + return + else: + cloudlog.info(".git directory changed, recreating overlay") -def init_ovfs(): cloudlog.info("preparing new safe staging area") - Params().put("UpdateAvailable", "0") + Params().put("UpdateAvailable", "0") set_consistent_flag(False) - - dismount_ovfs() + dismount_overlay() if os.path.isdir(STAGING_ROOT): shutil.rmtree(STAGING_ROOT) - for dirname in [STAGING_ROOT, OVERLAY_UPPER, OVERLAY_METADATA, OVERLAY_MERGED, FINALIZED]: + for dirname in [STAGING_ROOT, OVERLAY_UPPER, OVERLAY_METADATA, OVERLAY_MERGED]: os.mkdir(dirname, 0o755) - if not os.lstat(BASEDIR).st_dev == os.lstat(OVERLAY_MERGED).st_dev: + if os.lstat(BASEDIR).st_dev != os.lstat(OVERLAY_MERGED).st_dev: raise RuntimeError("base and overlay merge directories are on different filesystems; not valid for overlay FS!") - # Remove consistent flag from current BASEDIR so it's not copied over - if os.path.isfile(os.path.join(BASEDIR, ".overlay_consistent")): - os.remove(os.path.join(BASEDIR, ".overlay_consistent")) - # Leave a timestamped canary in BASEDIR to check at startup. The device clock # should be correct by the time we get here. If the init file disappears, or # critical mtimes in BASEDIR are newer than .overlay_init, continue.sh can # assume that BASEDIR has used for local development or otherwise modified, # and skips the update activation attempt. - Path(os.path.join(BASEDIR, ".overlay_init")).touch() + consistent_file = Path(os.path.join(BASEDIR, ".overlay_consistent")) + if consistent_file.is_file(): + consistent_file.unlink() + overlay_init_file.touch() - os.system("sync") + os.sync() overlay_opts = f"lowerdir={BASEDIR},upperdir={OVERLAY_UPPER},workdir={OVERLAY_METADATA}" run(["mount", "-t", "overlay", "-o", overlay_opts, "none", OVERLAY_MERGED]) -def finalize_from_ovfs(): +def finalize_update() -> None: """Take the current OverlayFS merged view and finalize a copy outside of OverlayFS, ready to be swapped-in at BASEDIR. Copy using shutil.copytree""" # Remove the update ready flag and any old updates cloudlog.info("creating finalized version of the overlay") set_consistent_flag(False) - shutil.rmtree(FINALIZED) # Copy the merged overlay view and set the update ready flag + if os.path.exists(FINALIZED): + shutil.rmtree(FINALIZED) shutil.copytree(OVERLAY_MERGED, FINALIZED, symlinks=True) + set_consistent_flag(True) cloudlog.info("done finalizing overlay") -def attempt_update(wait_helper): - cloudlog.info("attempting git update inside staging overlay") +def handle_neos_update(wait_helper: WaitTimeHelper) -> None: + with open(NEOS_VERSION, "r") as f: + cur_neos = f.read().strip() + + updated_neos = run(["bash", "-c", r"unset REQUIRED_NEOS_VERSION && source launch_env.sh && \ + echo -n $REQUIRED_NEOS_VERSION"], OVERLAY_MERGED).strip() + + cloudlog.info(f"NEOS version check: {cur_neos} vs {updated_neos}") + if cur_neos == updated_neos: + return + + cloudlog.info(f"Beginning background download for NEOS {updated_neos}") + set_offroad_alert("Offroad_NeosUpdate", True) + + updater_path = os.path.join(OVERLAY_MERGED, "installer/updater/updater") + update_manifest = f"file://{OVERLAY_MERGED}/installer/updater/update.json" + + neos_downloaded = False + start_time = time.monotonic() + # Try to download for one day + while not neos_downloaded and not wait_helper.shutdown and \ + (time.monotonic() - start_time < 60*60*24): + wait_helper.ready_event.clear() + try: + run([updater_path, "bgcache", update_manifest], OVERLAY_MERGED, low_priority=True) + neos_downloaded = True + except subprocess.CalledProcessError: + cloudlog.info("NEOS background download failed, retrying") + wait_helper.sleep(120) + + # If the download failed, we'll show the alert again when we retry + set_offroad_alert("Offroad_NeosUpdate", False) + if not neos_downloaded: + raise Exception("Failed to download NEOS update") + cloudlog.info(f"NEOS background download successful, took {time.monotonic() - start_time} seconds") + + +def check_git_fetch_result(fetch_txt): + err_msg = "Failed to add the host to the list of known hosts (/data/data/com.termux/files/home/.ssh/known_hosts).\n" + return len(fetch_txt) > 0 and (fetch_txt != err_msg) + + +def check_for_update() -> Tuple[bool, bool]: + setup_git_options(OVERLAY_MERGED) + try: + git_fetch_output = run(["git", "fetch", "--dry-run"], OVERLAY_MERGED, low_priority=True) + return True, check_git_fetch_result(git_fetch_output) + except subprocess.CalledProcessError: + return False, False + + +def fetch_update(wait_helper: WaitTimeHelper) -> bool: + cloudlog.info("attempting git fetch inside staging overlay") setup_git_options(OVERLAY_MERGED) @@ -203,9 +267,7 @@ def attempt_update(wait_helper): cur_hash = run(["git", "rev-parse", "HEAD"], OVERLAY_MERGED).rstrip() upstream_hash = run(["git", "rev-parse", "@{u}"], OVERLAY_MERGED).rstrip() new_version = cur_hash != upstream_hash - - err_msg = "Failed to add the host to the list of known hosts (/data/data/com.termux/files/home/.ssh/known_hosts).\n" - git_fetch_result = len(git_fetch_output) > 0 and (git_fetch_output != err_msg) + git_fetch_result = check_git_fetch_result(git_fetch_output) cloudlog.info("comparing %s to %s" % (cur_hash, upstream_hash)) if new_version or git_fetch_result: @@ -221,48 +283,15 @@ def attempt_update(wait_helper): ] cloudlog.info("git reset success: %s", '\n'.join(r)) - # Download the accompanying NEOS version if it doesn't match the current version - with open(NEOS_VERSION, "r") as f: - cur_neos = f.read().strip() - - updated_neos = run(["bash", "-c", r"unset REQUIRED_NEOS_VERSION && source launch_env.sh && \ - echo -n $REQUIRED_NEOS_VERSION"], OVERLAY_MERGED).strip() - - cloudlog.info(f"NEOS version check: {cur_neos} vs {updated_neos}") - if cur_neos != updated_neos: - cloudlog.info(f"Beginning background download for NEOS {updated_neos}") - - set_offroad_alert("Offroad_NeosUpdate", True) - updater_path = os.path.join(OVERLAY_MERGED, "installer/updater/updater") - update_manifest = f"file://{OVERLAY_MERGED}/installer/updater/update.json" - - neos_downloaded = False - start_time = time.monotonic() - # Try to download for one day - while (time.monotonic() - start_time < 60*60*24) and not wait_helper.shutdown: - wait_helper.ready_event.clear() - try: - run([updater_path, "bgcache", update_manifest], OVERLAY_MERGED, low_priority=True) - neos_downloaded = True - break - except subprocess.CalledProcessError: - cloudlog.info("NEOS background download failed, retrying") - wait_helper.sleep(120) - - # If the download failed, we'll show the alert again when we retry - set_offroad_alert("Offroad_NeosUpdate", False) - if not neos_downloaded: - raise Exception("Failed to download NEOS update") - - cloudlog.info(f"NEOS background download successful, took {time.monotonic() - start_time} seconds") + if ANDROID: + handle_neos_update(wait_helper) # Create the finalized, ready-to-swap update - finalize_from_ovfs() + finalize_update() cloudlog.info("openpilot update successful!") else: cloudlog.info("nothing new from git at this time") - set_update_available_params(new_version) return new_version @@ -272,7 +301,7 @@ def main(): if params.get("DisableUpdates") == b"1": raise RuntimeError("updates are disabled by the DisableUpdates param") - if os.geteuid() != 0: + if ANDROID and os.geteuid() != 0: raise RuntimeError("updated must be launched as root!") # Set low io priority @@ -290,45 +319,45 @@ def main(): wait_helper = WaitTimeHelper(proc) wait_helper.sleep(30) + first_run = True + last_fetch_time = 0 update_failed_count = 0 - update_available = False - overlay_initialized = False + + # Run the update loop + # * every 1m, do a lightweight internet/update check + # * every 10m, do a full git fetch while not wait_helper.shutdown: + update_now = wait_helper.ready_event.is_set() wait_helper.ready_event.clear() - # Check for internet every 30s + # Don't run updater while onroad or if the time's wrong time_wrong = datetime.datetime.utcnow().year < 2019 - ping_failed = os.system(f"ping -W 4 -c 1 {TEST_IP}") != 0 - if ping_failed or time_wrong: + is_onroad = params.get("IsOffroad") != b"1" + if is_onroad or time_wrong: wait_helper.sleep(30) + cloudlog.info("not running updater, not offroad") continue # Attempt an update exception = None + new_version = False update_failed_count += 1 try: - # Re-create the overlay if BASEDIR/.git has changed since we created the overlay - if overlay_initialized: - overlay_init_fn = os.path.join(BASEDIR, ".overlay_init") - git_dir_path = os.path.join(BASEDIR, ".git") - new_files = run(["find", git_dir_path, "-newer", overlay_init_fn]) - - if len(new_files.splitlines()): - cloudlog.info(".git directory changed, recreating overlay") - overlay_initialized = False - - if not overlay_initialized: - init_ovfs() - overlay_initialized = True - - if params.get("IsOffroad") == b"1": - update_available = attempt_update(wait_helper) or update_available + init_overlay() + + internet_ok, update_available = check_for_update() + if internet_ok and not update_available: update_failed_count = 0 - if not update_available and os.path.isdir(NEOSUPDATE_DIR): - shutil.rmtree(NEOSUPDATE_DIR) - else: - cloudlog.info("not running updater, openpilot running") + # Fetch updates at most every 10 minutes + if internet_ok and (update_now or time.monotonic() - last_fetch_time > 60*10): + new_version = fetch_update(wait_helper) + update_failed_count = 0 + last_fetch_time = time.monotonic() + + if first_run and not new_version and os.path.isdir(NEOSUPDATE_DIR): + shutil.rmtree(NEOSUPDATE_DIR) + first_run = False except subprocess.CalledProcessError as e: cloudlog.event( "update process failed", @@ -336,21 +365,15 @@ def main(): output=e.output, returncode=e.returncode ) - exception = e - overlay_initialized = False - except Exception: + exception = f"command failed: {e.cmd}\n{e.output}" + except Exception as e: cloudlog.exception("uncaught updated exception, shouldn't happen") + exception = str(e) - params.put("UpdateFailedCount", str(update_failed_count)) - if exception is None: - params.delete("LastUpdateException") - else: - params.put("LastUpdateException", f"command failed: {exception.cmd}\n{exception.output}") - - # Wait 10 minutes between update attempts - wait_helper.sleep(60*10) + set_params(new_version, update_failed_count, exception) + wait_helper.sleep(60) - dismount_ovfs() + dismount_overlay() if __name__ == "__main__": main()