android health daemon (#21965)
* androidd
* three strikes
* better logging
* persistent
* add to release files
* cleanup
* no cpu usage
* colon
Co-authored-by: Comma Device <device@comma.ai>
old-commit-hash: c8ca56dddd
commatwo_master
parent
e9cd2a5561
commit
67a7abb492
4 changed files with 89 additions and 2 deletions
@ -0,0 +1,78 @@ |
||||
#!/usr/bin/env python3 |
||||
import os |
||||
import time |
||||
import psutil |
||||
from typing import Optional |
||||
|
||||
from common.realtime import set_core_affinity, set_realtime_priority |
||||
from selfdrive.swaglog import cloudlog |
||||
|
||||
|
||||
# Modem crash count that is tolerated; main() acts once the count goes above it.
MAX_MODEM_CRASHES = 3

# sysfs directory holding the modem "state" and "crash_count" files read below.
MODEM_PATH = "/sys/devices/soc/2080000.qcom,mss/subsys5"

# First cmdline entry of every Android process whose presence is tracked.
WATCHED_PROCS = [
    "zygote",
    "zygote64",
    "/system/bin/servicemanager",
    "/system/bin/surfaceflinger",
]
||||
|
||||
|
||||
def get_modem_crash_count() -> Optional[int]:
    """Read the modem crash counter from sysfs.

    Returns:
        The integer parsed from ``<MODEM_PATH>/crash_count``, or ``None`` when
        the file cannot be read or parsed (the failure is logged via cloudlog).
    """
    try:
        crash_count_path = os.path.join(MODEM_PATH, "crash_count")
        with open(crash_count_path) as crash_file:
            raw_count = crash_file.read()
        return int(raw_count)
    except Exception:
        # Best effort: report the problem and let the caller skip this sample.
        cloudlog.exception("Error reading modem crash count")
    return None
||||
|
||||
def get_modem_state() -> str:
    """Read the modem state string from sysfs.

    Returns:
        The whitespace-stripped contents of ``<MODEM_PATH>/state``, or an empty
        string when the file cannot be read (the failure is logged via cloudlog).
    """
    try:
        state_path = os.path.join(MODEM_PATH, "state")
        with open(state_path) as state_file:
            contents = state_file.read()
        return contents.strip()
    except Exception:
        # Best effort: report the problem and fall back to an empty state.
        cloudlog.exception("Error reading modem state")
    return ""
||||
|
||||
def _scan_watched_procs():
    """Map each WATCHED_PROCS entry to its running psutil.Process, or None if absent."""
    found = {name: None for name in WATCHED_PROCS}
    for proc in psutil.process_iter(attrs=['cmdline']):
        # psutil stores None for the cmdline when it could not be retrieved
        # (e.g. access denied) and an empty list when the process has none;
        # calling len() on None would raise and take the daemon down.
        cmdline = proc.info['cmdline']
        if not cmdline:
            continue
        if cmdline[0] in found:
            found[cmdline[0]] = proc
    return found


def main():
    """Run the Android health monitoring loop; never returns.

    Once per second the loop:
      * rescans the watched Android processes whenever one of them is missing
        or no longer running, and logs an event for every entry whose process
        differs from the previously recorded one;
      * logs modem state changes read from sysfs;
      * logs increases of the modem crash counter;
      * once the crash counter exceeds MAX_MODEM_CRASHES, writes "put" to the
        msm_subsys modem debugfs node a single time (logged as "killing modem").
    """
    # NOTE(review): presumably pins the daemon to core 1 at realtime priority 1;
    # confirm against common.realtime.
    set_core_affinity(1)
    set_realtime_priority(1)

    procs = {}
    crash_count = 0
    modem_killed = False
    modem_state = "ONLINE"
    while True:
        # check critical android services
        if not procs or any(p is None or not p.is_running() for p in procs.values()):
            cur = _scan_watched_procs()

            # nothing to compare against on the very first scan
            if procs:
                for name in WATCHED_PROCS:
                    if cur[name] != procs[name]:
                        cloudlog.event("android service pid changed", proc=name, cur=cur[name], prev=procs[name])
            procs.update(cur)

        # check modem state
        state = get_modem_state()
        if state != modem_state and not modem_killed:
            cloudlog.event("modem state changed", state=state)
        modem_state = state

        # check modem crashes
        cnt = get_modem_crash_count()
        if cnt is not None:
            if cnt > crash_count:
                cloudlog.event("modem crash", count=cnt)
            crash_count = cnt

        # handle excessive modem crashes
        if crash_count > MAX_MODEM_CRASHES and not modem_killed:
            cloudlog.event("killing modem")
            with open("/sys/kernel/debug/msm_subsys/modem", "w") as f:
                f.write("put")
            # only attempt this once per daemon lifetime
            modem_killed = True

        time.sleep(1)


if __name__ == "__main__":
    main()
Loading…
Reference in new issue