Skip to content

Commit c1a83bd

Browse files
committed
hw-mgmt: thermal control: fix signal handling for clean shutdown
If TC received an "exit" signal, it tried to log and shut down from inside the signal handler. That is unsafe and can deadlock or crash. If TC was interrupted during startup (e.g. while waiting for config), it could then try to use things that were not set up yet and crash. Fix: the signal handler now only records which signal was received and tells the main program to exit. All logging and proper shutdown happen in the main thread after the main loop ends. Bug:4837925 Signed-off-by: Oleksandr Shamray <oleksandrs@nvidia.com>
1 parent 193927e commit c1a83bd

File tree

2 files changed

+65
-32
lines changed

2 files changed

+65
-32
lines changed

usr/usr/bin/hw_management_thermal_control.py

Lines changed: 32 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
# pylint: disable=line-too-long
33
# pylint: disable=C0103
44
########################################################################
5-
# Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES.
5+
# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
66
#
77
# Redistribution and use in source and binary forms, with or without
88
# modification, are permitted provided that the following conditions are met:
@@ -390,8 +390,11 @@ class CONST(object):
390390

391391
ASIC_CONF_DEFAULT = {"1": {"pwm_control": False, "fan_control": False}}
392392

393+
_sig_condition_name = "-"
393394

394395
# ----------------------------------------------------------------------
396+
397+
395398
def str2bool(val):
396399
"""
397400
@summary:
@@ -2671,21 +2674,24 @@ def __init__(self, cmd_arg, tc_logger):
26712674
signal.signal(signal.SIGINT, self.sig_handler)
26722675
signal.signal(signal.SIGHUP, self.sig_handler)
26732676
self.exit = Event()
2674-
self.exit_flag = False
26752677

26762678
self.load_configuration()
26772679
if not str2bool(self.sys_config.get("platform_support", 1)):
26782680
self.log.notice("Platform Board:'{}', SKU:'{}' is not supported.".format(self.board_type, self.sku), 1)
26792681
self.log.notice("Set TC to idle.")
26802682
while True:
26812683
self.exit.wait(60)
2684+
if self.exit.is_set():
2685+
return
26822686

26832687
if not self.is_pwm_exists():
26842688
self.log.notice("Missing PWM control (probably ASIC driver not loaded). PWM control is requiured for TC run\nWaiting for ASIC init", 1)
26852689
while not self.is_pwm_exists():
26862690
self.log.notice("Wait...")
26872691
self.exit.wait(10)
26882692
self.log.notice("PWM control activated", 1)
2693+
if self.exit.is_set():
2694+
return
26892695

26902696
# Set PWM to the default state while we are waiting for system configuration
26912697
self.log.notice("Set FAN PWM {}".format(self.pwm_target), 1)
@@ -2703,19 +2709,25 @@ def __init__(self, cmd_arg, tc_logger):
27032709
else:
27042710
self.log.info("Set PWM successful")
27052711
break
2712+
if self.exit.is_set():
2713+
return
27062714

27072715
if not self.is_fan_tacho_init():
27082716
self.log.notice("Missing FAN tacho (probably ASIC not inited yet). FANs is requiured for TC run\nWaiting for ASIC init", 1)
27092717
while not self.is_fan_tacho_init():
27102718
self.log.notice("Wait...")
27112719
self.exit.wait(10)
2720+
if self.exit.is_set():
2721+
return
27122722

27132723
self.log.notice("Mellanox thermal control is waiting for configuration ({} sec).".format(CONST.THERMAL_WAIT_FOR_CONFIG), 1)
27142724
timeout = current_milli_time() + 1000 * CONST.THERMAL_WAIT_FOR_CONFIG
27152725
while timeout > current_milli_time():
27162726
if not self.write_pwm(self.pwm_target):
27172727
self.log.info("Set PWM failed. Possible SDK is not started")
27182728
self.exit.wait(2)
2729+
if self.exit.is_set():
2730+
return
27192731

27202732
self._collect_hw_info()
27212733
self.amb_tmp = CONST.TEMP_INIT_VAL_DEF
@@ -3216,15 +3228,12 @@ def sig_handler(self, sig, *_):
32163228
Signal handler for termination signals
32173229
"""
32183230
if sig in [signal.SIGTERM, signal.SIGINT, signal.SIGHUP]:
3219-
self.exit_flag = True
3220-
self.log.close_tc_log_handler()
3221-
if self.sys_config.get("platform_support", 1):
3222-
self.stop(reason="SIG {}".format(sig))
3223-
3224-
self.log.notice("Thermal control stopped", 1)
3225-
self.log.logger_emit = False
3226-
self.log.stop()
3227-
os._exit(0)
3231+
global _sig_condition_name
3232+
try:
3233+
_sig_condition_name = signal.Signals(sig).name
3234+
except (ValueError, AttributeError):
3235+
_sig_condition_name = str(sig)
3236+
self.exit.set()
32283237

32293238
# ----------------------------------------------------------------------
32303239
def load_configuration(self):
@@ -3599,7 +3608,7 @@ def run(self):
35993608
self.log.notice("********************************", 1)
36003609
module_scan_timeout = 0
36013610
# main loop
3602-
while not self.exit.is_set() or not self.exit_flag:
3611+
while not self.exit.is_set():
36033612
try:
36043613
log_level = int(self.read_file(CONST.LOG_LEVEL_FILENAME))
36053614
if log_level != self.cmd_arg["verbosity"]:
@@ -3837,9 +3846,17 @@ class RawTextArgumentDefaultsHelpFormatter(
38373846
thermal_management = None
38383847
try:
38393848
thermal_management = ThermalManagement(args, logger)
3840-
thermal_management.init()
3841-
thermal_management.start(reason="init")
3842-
thermal_management.run()
3849+
if not thermal_management.exit.is_set():
3850+
thermal_management.init()
3851+
thermal_management.start(reason="init")
3852+
thermal_management.run()
3853+
3854+
logger.notice("Thermal control stopped by signal {}".format(_sig_condition_name), 1)
3855+
if (thermal_management is not None and
3856+
hasattr(thermal_management, 'sys_config') and
3857+
thermal_management.sys_config.get("platform_support", 1)):
3858+
thermal_management.stop(reason="SIG {}".format(_sig_condition_name))
3859+
38433860
except BaseException as e:
38443861
logger.info(traceback.format_exc())
38453862
if thermal_management:

usr/usr/bin/hw_management_thermal_control_2_5.py

Lines changed: 33 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
# pylint: disable=line-too-long
33
# pylint: disable=C0103
44
########################################################################
5-
# Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES.
5+
# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
66
#
77
# Redistribution and use in source and binary forms, with or without
88
# modification, are permitted provided that the following conditions are met:
@@ -32,7 +32,6 @@
3232
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
3333
# POSSIBILITY OF SUCH DAMAGE.
3434
#
35-
3635
"""
3736
Created on Apr 08, 2025
3837
@@ -401,8 +400,11 @@ class CONST(object):
401400

402401
ASIC_CONF_DEFAULT = {"1": {"pwm_control": False, "fan_control": False}}
403402

403+
_sig_condition_name = "-"
404404

405405
# ----------------------------------------------------------------------
406+
407+
406408
def str2bool(val):
407409
"""
408410
@summary:
@@ -2946,21 +2948,24 @@ def __init__(self, cmd_arg, tc_logger):
29462948
signal.signal(signal.SIGINT, self.sig_handler)
29472949
signal.signal(signal.SIGHUP, self.sig_handler)
29482950
self.exit = Event()
2949-
self.exit_flag = False
29502951

29512952
self.load_configuration()
29522953
if not str2bool(self.sys_config.get("platform_support", 1)):
29532954
self.log.notice("Platform Board:'{}', SKU:'{}' is not supported.".format(self.board_type, self.sku), 1)
29542955
self.log.notice("Set TC to idle.")
29552956
while True:
29562957
self.exit.wait(60)
2958+
if self.exit.is_set():
2959+
return
29572960

29582961
if not self.is_pwm_exists():
29592962
self.log.notice("Missing PWM control (probably ASIC driver not loaded). PWM control is requiured for TC run\nWaiting for ASIC init", 1)
29602963
while not self.is_pwm_exists():
29612964
self.log.notice("Wait...")
29622965
self.exit.wait(10)
29632966
self.log.notice("PWM control activated", 1)
2967+
if self.exit.is_set():
2968+
return
29642969

29652970
pwm_update_period = get_dict_val_by_path(self.sys_config, [CONST.SYS_CONF_GENERAL_CONFIG_PARAM, CONST.SYS_CONF_PWM_UPDATE_PERIOD_PARAM])
29662971
if pwm_update_period:
@@ -2985,19 +2990,25 @@ def __init__(self, cmd_arg, tc_logger):
29852990
else:
29862991
self.log.info("Set PWM successful")
29872992
break
2993+
if self.exit.is_set():
2994+
return
29882995

29892996
if not self.is_fan_tacho_init():
29902997
self.log.notice("Missing FAN tacho (probably ASIC not inited yet). FANs is requiured for TC run\nWaiting for ASIC init", 1)
29912998
while not self.is_fan_tacho_init():
29922999
self.log.notice("Wait...")
29933000
self.exit.wait(10)
3001+
if self.exit.is_set():
3002+
return
29943003

29953004
self.log.notice("Mellanox thermal control is waiting for configuration ({} sec).".format(CONST.THERMAL_WAIT_FOR_CONFIG), 1)
29963005
timeout = current_milli_time() + 1000 * CONST.THERMAL_WAIT_FOR_CONFIG
29973006
while timeout > current_milli_time():
29983007
if not self.write_pwm(self.pwm_target):
29993008
self.log.info("Set PWM failed. Possible SDK is not started")
30003009
self.exit.wait(2)
3010+
if self.exit.is_set():
3011+
return
30013012

30023013
self._collect_hw_info()
30033014
self.amb_tmp = CONST.TEMP_INIT_VAL_DEF
@@ -3533,15 +3544,12 @@ def sig_handler(self, sig, *_):
35333544
Signal handler for termination signals
35343545
"""
35353546
if sig in [signal.SIGTERM, signal.SIGINT, signal.SIGHUP]:
3536-
self.exit_flag = True
3537-
self.log.close_tc_log_handler()
3538-
if self.sys_config.get("platform_support", 1):
3539-
self.stop(reason="SIG {}".format(sig))
3540-
3541-
self.log.notice("Thermal control stopped", 1)
3542-
self.log.logger_emit = False
3543-
self.log.stop()
3544-
os._exit(0)
3547+
global _sig_condition_name
3548+
try:
3549+
_sig_condition_name = signal.Signals(sig).name
3550+
except (ValueError, AttributeError):
3551+
_sig_condition_name = str(sig)
3552+
self.exit.set()
35453553

35463554
# ----------------------------------------------------------------------
35473555
def load_configuration(self):
@@ -3919,7 +3927,7 @@ def run(self):
39193927
self.log.notice("********************************", 1)
39203928
module_scan_timeout = 0
39213929
# main loop
3922-
while not self.exit.is_set() or not self.exit_flag:
3930+
while not self.exit.is_set():
39233931
try:
39243932
log_level = int(self.read_file(CONST.LOG_LEVEL_FILENAME))
39253933
if log_level != self.cmd_arg["verbosity"]:
@@ -4155,12 +4163,20 @@ class RawTextArgumentDefaultsHelpFormatter(
41554163
thermal_management = None
41564164
try:
41574165
thermal_management = ThermalManagement(args, logger)
4158-
thermal_management.init()
4159-
thermal_management.start(reason="init")
4160-
thermal_management.run()
4166+
if not thermal_management.exit.is_set():
4167+
thermal_management.init()
4168+
thermal_management.start(reason="init")
4169+
thermal_management.run()
4170+
4171+
logger.notice("Thermal control stopped by signal {}".format(_sig_condition_name), 1)
4172+
if (thermal_management is not None and
4173+
hasattr(thermal_management, 'sys_config') and
4174+
thermal_management.sys_config.get("platform_support", 1)):
4175+
thermal_management.stop(reason="SIG {}".format(_sig_condition_name))
4176+
41614177
except BaseException as e:
41624178
logger.info(traceback.format_exc())
4163-
if thermal_management:
4179+
if thermal_management is not None:
41644180
thermal_management.stop(reason="crash ({})".format(str(e)))
41654181
sys.exit(1)
41664182

0 commit comments

Comments
 (0)