thermald: move dbus and other slow calls into thread (#23525)

* split thermald in threads

* small cleanup

* type

* already done that

* add none check

* fix sleep

* shut down on exception
old-commit-hash: 850a2307d6
commatwo_master
Willem Melching 3 years ago committed by GitHub
parent 651df33062
commit 5cb4528fc3
  1. 162
      selfdrive/thermald/thermald.py

@ -1,28 +1,31 @@
#!/usr/bin/env python3
import datetime
import os
import queue
import threading
import time
from collections import OrderedDict, namedtuple
from pathlib import Path
from typing import Dict, NoReturn, Optional, Tuple
from collections import namedtuple, OrderedDict
from typing import Dict, Optional, Tuple
import psutil
from smbus2 import SMBus
import cereal.messaging as messaging
from cereal import log
from common.dict_helpers import strip_deprecated_keys
from common.filter_simple import FirstOrderFilter
from common.numpy_fast import interp
from common.params import Params
from common.realtime import DT_TRML, sec_since_boot
from common.dict_helpers import strip_deprecated_keys
from selfdrive.controls.lib.alertmanager import set_offroad_alert
from selfdrive.controls.lib.pid import PIController
from selfdrive.hardware import EON, TICI, PC, HARDWARE
from selfdrive.hardware import EON, HARDWARE, PC, TICI
from selfdrive.loggerd.config import get_available_percent
from selfdrive.statsd import statlog
from selfdrive.swaglog import cloudlog
from selfdrive.thermald.power_monitoring import PowerMonitoring
from selfdrive.version import terms_version, training_version
from selfdrive.statsd import statlog
ThermalStatus = log.DeviceState.ThermalStatus
NetworkType = log.DeviceState.NetworkType
@ -30,8 +33,10 @@ NetworkStrength = log.DeviceState.NetworkStrength
CURRENT_TAU = 15. # 15s time constant
TEMP_TAU = 5. # 5s time constant
DISCONNECT_TIMEOUT = 5. # wait 5 seconds before going offroad after disconnect so you get an alert
PANDA_STATES_TIMEOUT = int(1000 * 2.5 * DT_TRML) # 2.5x the expected pandaState frequency
ThermalBand = namedtuple("ThermalBand", ['min_temp', 'max_temp'])
HardwareState = namedtuple("HardwareState", ['network_type', 'network_strength', 'network_info', 'nvme_temps', 'modem_temps'])
# List of thermal bands. We will stay within this region as long as we are within the bounds.
# When exiting the bounds, we'll jump to the lower or higher band. Bands are ordered in the dict.
@ -152,13 +157,50 @@ def set_offroad_alert_if_changed(offroad_alert: str, show_alert: bool, extra_tex
set_offroad_alert(offroad_alert, show_alert, extra_text)
def thermald_thread() -> NoReturn:
def hw_state_thread(end_event, hw_queue):
"""Handles non critical hardware state, and sends over queue"""
count = 0
registered_count = 0
pm = messaging.PubMaster(['deviceState'])
while not end_event.is_set():
# these are expensive calls. update every 10s
if (count % int(10. / DT_TRML)) == 0:
try:
network_type = HARDWARE.get_network_type()
hw_state = HardwareState(
network_type=network_type,
network_strength=HARDWARE.get_network_strength(network_type),
network_info=HARDWARE.get_network_info(),
nvme_temps=HARDWARE.get_nvme_temperatures(),
modem_temps=HARDWARE.get_modem_temperatures(),
)
try:
hw_queue.put_nowait(hw_state)
except queue.Full:
pass
if TICI and (hw_state.network_info is not None) and (hw_state.network_info.get('state', None) == "REGISTERED"):
registered_count += 1
else:
registered_count = 0
if registered_count > 10:
cloudlog.warning(f"Modem stuck in registered state {hw_state.network_info}. nmcli conn up lte")
os.system("nmcli conn up lte")
registered_count = 0
except Exception:
cloudlog.exception("Error getting network status")
count += 1
time.sleep(DT_TRML)
pandaState_timeout = int(1000 * 2.5 * DT_TRML) # 2.5x the expected pandaState frequency
pandaState_sock = messaging.sub_sock('pandaStates', timeout=pandaState_timeout)
sm = messaging.SubMaster(["peripheralState", "gpsLocationExternal", "controlsState"])
def thermald_thread(end_event, hw_queue):
pm = messaging.PubMaster(['deviceState'])
sm = messaging.SubMaster(["peripheralState", "gpsLocationExternal", "controlsState", "pandaStates"], poll=["pandaStates"])
fan_speed = 0
count = 0
@ -175,12 +217,13 @@ def thermald_thread() -> NoReturn:
thermal_status = ThermalStatus.green
usb_power = True
network_type = NetworkType.none
network_strength = NetworkStrength.unknown
network_info = None
registered_count = 0
nvme_temps = None
modem_temps = None
last_hw_state = HardwareState(
network_type=NetworkType.none,
network_strength=NetworkStrength.unknown,
network_info=None,
nvme_temps=[],
modem_temps=[],
)
current_filter = FirstOrderFilter(0., CURRENT_TAU, DT_TRML)
temp_filter = FirstOrderFilter(0., TEMP_TAU, DT_TRML)
@ -199,16 +242,16 @@ def thermald_thread() -> NoReturn:
# TODO: use PI controller for UNO
controller = PIController(k_p=0, k_i=2e-3, neg_limit=-80, pos_limit=0, rate=(1 / DT_TRML))
while True:
pandaStates = messaging.recv_sock(pandaState_sock, wait=True)
while not end_event.is_set():
sm.update(PANDA_STATES_TIMEOUT)
sm.update(0)
pandaStates = sm['pandaStates']
peripheralState = sm['peripheralState']
msg = read_thermal(thermal_config)
if pandaStates is not None and len(pandaStates.pandaStates) > 0:
pandaState = pandaStates.pandaStates[0]
if sm.updated['pandaStates'] and len(pandaStates) > 0:
pandaState = pandaStates[0]
if pandaState.pandaType != log.PandaState.PandaType.unknown:
onroad_conditions["ignition"] = pandaState.ignitionLine or pandaState.ignitionCan
@ -231,44 +274,23 @@ def thermald_thread() -> NoReturn:
setup_eon_fan()
handle_fan = handle_fan_eon
# these are expensive calls. update every 10s
if (count % int(10. / DT_TRML)) == 0:
try:
network_type = HARDWARE.get_network_type()
network_strength = HARDWARE.get_network_strength(network_type)
network_info = HARDWARE.get_network_info() # pylint: disable=assignment-from-none
nvme_temps = HARDWARE.get_nvme_temperatures()
modem_temps = HARDWARE.get_modem_temperatures()
if TICI and (network_info.get('state', None) == "REGISTERED"):
registered_count += 1
else:
registered_count = 0
if registered_count > 10:
cloudlog.warning(f"Modem stuck in registered state {network_info}. nmcli conn up lte")
os.system("nmcli conn up lte")
registered_count = 0
except Exception:
cloudlog.exception("Error getting network status")
try:
last_hw_state = hw_queue.get_nowait()
except queue.Empty:
pass
msg.deviceState.freeSpacePercent = get_available_percent(default=100.0)
msg.deviceState.memoryUsagePercent = int(round(psutil.virtual_memory().percent))
msg.deviceState.cpuUsagePercent = [int(round(n)) for n in psutil.cpu_percent(percpu=True)]
msg.deviceState.gpuUsagePercent = int(round(HARDWARE.get_gpu_usage_percent()))
msg.deviceState.networkType = network_type
msg.deviceState.networkStrength = network_strength
if network_info is not None:
msg.deviceState.networkInfo = network_info
if nvme_temps is not None:
msg.deviceState.nvmeTempC = nvme_temps
for i, temp in enumerate(nvme_temps):
statlog.gauge(f"nvme_temperature{i}", temp)
if modem_temps is not None:
msg.deviceState.modemTempC = modem_temps
for i, temp in enumerate(modem_temps):
statlog.gauge(f"modem_temperature{i}", temp)
msg.deviceState.networkType = last_hw_state.network_type
msg.deviceState.networkStrength = last_hw_state.network_strength
if last_hw_state.network_info is not None:
msg.deviceState.networkInfo = last_hw_state.network_info
msg.deviceState.nvmeTempC = last_hw_state.nvme_temps
msg.deviceState.modemTempC = last_hw_state.modem_temps
msg.deviceState.screenBrightnessPercent = HARDWARE.get_screen_brightness()
msg.deviceState.batteryPercent = HARDWARE.get_battery_capacity()
@ -392,7 +414,7 @@ def thermald_thread() -> NoReturn:
should_start_prev = should_start
startup_conditions_prev = startup_conditions.copy()
# log more stats
# Log to statsd
statlog.gauge("free_space_percent", msg.deviceState.freeSpacePercent)
statlog.gauge("gpu_usage_percent", msg.deviceState.gpuUsagePercent)
statlog.gauge("memory_usage_percent", msg.deviceState.memoryUsagePercent)
@ -406,6 +428,10 @@ def thermald_thread() -> NoReturn:
statlog.gauge("ambient_temperature", msg.deviceState.ambientTempC)
for i, temp in enumerate(msg.deviceState.pmicTempC):
statlog.gauge(f"pmic{i}_temperature", temp)
for i, temp in enumerate(last_hw_state.nvme_temps):
statlog.gauge(f"nvme_temperature{i}", temp)
for i, temp in enumerate(last_hw_state.modem_temps):
statlog.gauge(f"modem_temperature{i}", temp)
statlog.gauge("fan_speed_percent_desired", msg.deviceState.fanSpeedPercentDesired)
statlog.gauge("screen_brightness_percent", msg.deviceState.screenBrightnessPercent)
@ -416,7 +442,7 @@ def thermald_thread() -> NoReturn:
cloudlog.event("STATUS_PACKET",
count=count,
pandaStates=(strip_deprecated_keys(pandaStates.to_dict()) if pandaStates else None),
pandaStates=[strip_deprecated_keys(p.to_dict()) for p in pandaStates],
peripheralState=strip_deprecated_keys(peripheralState.to_dict()),
location=(strip_deprecated_keys(sm["gpsLocationExternal"].to_dict()) if sm.alive["gpsLocationExternal"] else None),
deviceState=strip_deprecated_keys(msg.to_dict()))
@ -424,8 +450,28 @@ def thermald_thread() -> NoReturn:
count += 1
def main() -> NoReturn:
thermald_thread()
def main():
hw_queue = queue.Queue(maxsize=1)
end_event = threading.Event()
threads = [
threading.Thread(target=hw_state_thread, args=(end_event, hw_queue)),
threading.Thread(target=thermald_thread, args=(end_event, hw_queue)),
]
for t in threads:
t.start()
try:
while True:
time.sleep(1)
if not all(t.is_alive() for t in threads):
break
finally:
end_event.set()
for t in threads:
t.join()
if __name__ == "__main__":

Loading…
Cancel
Save