Skip to content

Commit 81b8f6c

Browse files
committed
hw-mgmt: thermal control: fix signal handling for clean shutdown
When TC received an exit signal, it tried to log and shut down from inside the signal handler. That is unsafe and can deadlock or crash. If TC was interrupted during startup (e.g. while waiting for config), it could then try to use things that were not set up yet and crash. Fix: the signal handler now only records which signal was received and tells the main program to exit. All logging and the proper shutdown happen in the main thread after the main loop ends. Bug:4837925 Signed-off-by: Oleksandr Shamray <oleksandrs@nvidia.com>
1 parent ca4fba8 commit 81b8f6c

File tree

2 files changed

+80
-87
lines changed

2 files changed

+80
-87
lines changed

usr/usr/bin/hw_management_thermal_control.py

Lines changed: 40 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -1,36 +1,15 @@
11
#!/usr/bin/python
2-
# pylint: disable=line-too-long
3-
# pylint: disable=C0103
4-
########################################################################
5-
# Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES.
62
#
7-
# Redistribution and use in source and binary forms, with or without
8-
# modification, are permitted provided that the following conditions are met:
3+
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
4+
# Copyright (c) 2020-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
5+
# SPDX-License-Identifier: LicenseRef-NvidiaProprietary
96
#
10-
# 1. Redistributions of source code must retain the above copyright
11-
# notice, this list of conditions and the following disclaimer.
12-
# 2. Redistributions in binary form must reproduce the above copyright
13-
# notice, this list of conditions and the following disclaimer in the
14-
# documentation and/or other materials provided with the distribution.
15-
# 3. Neither the names of the copyright holders nor the names of its
16-
# contributors may be used to endorse or promote products derived from
17-
# this software without specific prior written permission.
18-
#
19-
# Alternatively, this software may be distributed under the terms of the
20-
# GNU General Public License ("GPL") version 2 as published by the Free
21-
# Software Foundation.
22-
#
23-
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
24-
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25-
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26-
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
27-
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
28-
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
29-
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
30-
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
31-
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
32-
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
33-
# POSSIBILITY OF SUCH DAMAGE.
7+
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
8+
# property and proprietary rights in and to this material, related
9+
# documentation and any modifications thereto. Any use, reproduction,
10+
# disclosure or distribution of this material and related documentation
11+
# without an express license agreement from NVIDIA CORPORATION or
12+
# its affiliates is strictly prohibited.
3413
#
3514

3615
"""
@@ -415,6 +394,8 @@ class CONST:
415394
gmemory_snapshot = None
416395
gmemory_snapshot_profiler = ObjectSnapshot(max_depth=16)
417396

397+
_sig_condition_name = "-"
398+
418399
# ----------------------------------------------------------------------
419400

420401

@@ -2875,19 +2856,22 @@ def __init__(self, cmd_arg, tc_logger):
28752856
signal.signal(signal.SIGINT, self.sig_handler)
28762857
signal.signal(signal.SIGHUP, self.sig_handler)
28772858
self.exit = threading.Event()
2878-
self.exit_flag = False
28792859

28802860
if not str2bool(self.sys_config.get("platform_support", 1)):
28812861
self.log.notice("Platform Board:'{}', SKU:'{}' is not supported.".format(self.board_type, self.sku), repeat=1)
28822862
self.log.notice("Set TC to idle.")
28832863
while True:
28842864
self.exit.wait(60)
2865+
if self.exit.is_set():
2866+
return
28852867

28862868
if not self.is_pwm_exists():
28872869
self.log.notice("Missing PWM control (probably ASIC driver not loaded). PWM control is required for TC run\nWaiting for ASIC init", repeat=1)
28882870
while not self.is_pwm_exists():
28892871
self.log.notice("Wait...")
28902872
self.exit.wait(10)
2873+
if self.exit.is_set():
2874+
return
28912875
self.log.notice("PWM control activated", repeat=1)
28922876

28932877
self.attention_fans_lst = get_dict_val_by_path(self.sys_config, [CONST.SYS_CONF_GENERAL_CONFIG_PARAM, CONST.SYS_CONF_FAN_STEADY_ATTENTION_ITEMS])
@@ -2918,19 +2902,25 @@ def __init__(self, cmd_arg, tc_logger):
29182902
else:
29192903
self.log.info("Set PWM successful")
29202904
break
2905+
if self.exit.is_set():
2906+
return
29212907

29222908
if not self.is_fan_tacho_init():
29232909
self.log.notice("Missing FAN tacho (probably ASIC not initialized yet). FANs is required for TC run\nWaiting for ASIC init", repeat=1)
29242910
while not self.is_fan_tacho_init():
29252911
self.log.notice("Wait...")
29262912
self.exit.wait(10)
2913+
if self.exit.is_set():
2914+
return
29272915

29282916
self.log.notice("Nvidia thermal control is waiting for configuration ({} sec).".format(CONST.THERMAL_WAIT_FOR_CONFIG), repeat=1)
29292917
timeout = current_milli_time() + 1000 * CONST.THERMAL_WAIT_FOR_CONFIG
29302918
while timeout > current_milli_time():
29312919
if not self.write_pwm(self.pwm_target):
29322920
self.log.info("Set PWM failed. Possible SDK is not started")
29332921
self.exit.wait(2)
2922+
if self.exit.is_set():
2923+
return
29342924

29352925
self._collect_hw_info()
29362926
self.amb_tmp = CONST.TEMP_INIT_VAL_DEF
@@ -3509,13 +3499,12 @@ def sig_handler(self, sig, *_):
35093499
Signal handler for termination signals
35103500
"""
35113501
if sig in [signal.SIGTERM, signal.SIGINT, signal.SIGHUP]:
3512-
self.exit_flag = True
3513-
self.log.syslog_log(self.log.NOTICE, "Thermal control stopped by signal {}".format(sig))
3514-
self.log.stop()
3515-
if self.sys_config.get("platform_support", 1):
3516-
self.stop(reason="SIG {}".format(sig))
3517-
3518-
os._exit(0)
3502+
global _sig_condition_name
3503+
try:
3504+
_sig_condition_name = signal.Signals(sig).name
3505+
except (ValueError, AttributeError):
3506+
_sig_condition_name = str(sig)
3507+
self.exit.set()
35193508

35203509
# ----------------------------------------------------------------------
35213510
def load_configuration(self):
@@ -3918,7 +3907,7 @@ def run(self):
39183907
gmemory_snapshot = None
39193908

39203909
# main loop
3921-
while not self.exit.is_set() or not self.exit_flag:
3910+
while not self.exit.is_set():
39223911
try:
39233912
log_level = int(self.read_file(CONST.LOG_LEVEL_FILENAME))
39243913
if log_level != self.cmd_arg["verbosity"]:
@@ -4237,13 +4226,20 @@ class RawTextArgumentDefaultsHelpFormatter(
42374226
thermal_management = None
42384227
try:
42394228
thermal_management = ThermalManagement(args, logger)
4240-
thermal_management.init()
4241-
thermal_management.start(reason="init")
4242-
thermal_management.run()
4229+
if not thermal_management.exit.is_set():
4230+
thermal_management.init()
4231+
thermal_management.start(reason="init")
4232+
thermal_management.run()
4233+
4234+
logger.notice("Thermal control stopped by signal {}".format(_sig_condition_name), repeat=1)
4235+
if (thermal_management is not None and
4236+
hasattr(thermal_management, 'sys_config') and
4237+
thermal_management.sys_config.get("platform_support", 1)):
4238+
thermal_management.stop(reason="SIG {}".format(_sig_condition_name))
4239+
42434240
except Exception as e:
42444241
logger.info(traceback.format_exc())
4245-
if thermal_management:
4242+
if thermal_management is not None:
42464243
thermal_management.stop(reason="crash ({})".format(str(e)))
42474244
sys.exit(1)
4248-
42494245
sys.exit(0)

usr/usr/bin/hw_management_thermal_control_2_5.py

Lines changed: 40 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1,36 +1,15 @@
11
#!/usr/bin/python
2-
# pylint: disable=line-too-long
3-
# pylint: disable=C0103
4-
########################################################################
5-
# Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES.
62
#
7-
# Redistribution and use in source and binary forms, with or without
8-
# modification, are permitted provided that the following conditions are met:
3+
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
4+
# Copyright (c) 2020-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
5+
# SPDX-License-Identifier: LicenseRef-NvidiaProprietary
96
#
10-
# 1. Redistributions of source code must retain the above copyright
11-
# notice, this list of conditions and the following disclaimer.
12-
# 2. Redistributions in binary form must reproduce the above copyright
13-
# notice, this list of conditions and the following disclaimer in the
14-
# documentation and/or other materials provided with the distribution.
15-
# 3. Neither the names of the copyright holders nor the names of its
16-
# contributors may be used to endorse or promote products derived from
17-
# this software without specific prior written permission.
18-
#
19-
# Alternatively, this software may be distributed under the terms of the
20-
# GNU General Public License ("GPL") version 2 as published by the Free
21-
# Software Foundation.
22-
#
23-
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
24-
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25-
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26-
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
27-
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
28-
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
29-
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
30-
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
31-
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
32-
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
33-
# POSSIBILITY OF SUCH DAMAGE.
7+
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
8+
# property and proprietary rights in and to this material, related
9+
# documentation and any modifications thereto. Any use, reproduction,
10+
# disclosure or distribution of this material and related documentation
11+
# without an express license agreement from NVIDIA CORPORATION or
12+
# its affiliates is strictly prohibited.
3413
#
3514

3615
"""
@@ -463,6 +442,8 @@ class CONST:
463442
gmemory_snapshot = None
464443
gmemory_snapshot_profiler = ObjectSnapshot(max_depth=16)
465444

445+
_sig_condition_name = "-"
446+
466447
# ----------------------------------------------------------------------
467448

468449

@@ -3240,19 +3221,22 @@ def __init__(self, cmd_arg, tc_logger):
32403221
signal.signal(signal.SIGINT, self.sig_handler)
32413222
signal.signal(signal.SIGHUP, self.sig_handler)
32423223
self.exit = threading.Event()
3243-
self.exit_flag = False
32443224

32453225
if not str2bool(self.sys_config.get("platform_support", 1)):
32463226
self.log.notice("Platform Board:'{}', SKU:'{}' is not supported.".format(self.board_type, self.sku), repeat=1)
32473227
self.log.notice("Set TC to idle.")
32483228
while True:
32493229
self.exit.wait(60)
3230+
if self.exit.is_set():
3231+
return
32503232

32513233
if not self.is_pwm_exists():
32523234
self.log.notice("Missing PWM control (probably ASIC driver not loaded). PWM control is required for TC run\nWaiting for ASIC init", repeat=1)
32533235
while not self.is_pwm_exists():
32543236
self.log.notice("Wait...")
32553237
self.exit.wait(10)
3238+
if self.exit.is_set():
3239+
return
32563240
self.log.notice("PWM control activated", repeat=1)
32573241

32583242
self.attention_fans_lst = get_dict_val_by_path(self.sys_config, [CONST.SYS_CONF_GENERAL_CONFIG_PARAM, CONST.SYS_CONF_FAN_STEADY_ATTENTION_ITEMS])
@@ -3290,19 +3274,25 @@ def __init__(self, cmd_arg, tc_logger):
32903274
else:
32913275
self.log.info("Set PWM successful")
32923276
break
3277+
if self.exit.is_set():
3278+
return
32933279

32943280
if not self.is_fan_tacho_init():
32953281
self.log.notice("Missing FAN tacho (probably ASIC not initialized yet). FANs is required for TC run\nWaiting for ASIC init", repeat=1)
32963282
while not self.is_fan_tacho_init():
32973283
self.log.notice("Wait...")
32983284
self.exit.wait(10)
3285+
if self.exit.is_set():
3286+
return
32993287

33003288
self.log.notice("Nvidia thermal control is waiting for configuration ({} sec).".format(CONST.THERMAL_WAIT_FOR_CONFIG), repeat=1)
33013289
timeout = current_milli_time() + 1000 * CONST.THERMAL_WAIT_FOR_CONFIG
33023290
while timeout > current_milli_time():
33033291
if not self.write_pwm(self.pwm_target):
33043292
self.log.info("Set PWM failed. Possible SDK is not started")
33053293
self.exit.wait(2)
3294+
if self.exit.is_set():
3295+
return
33063296

33073297
self._collect_hw_info()
33083298
self.amb_tmp = CONST.TEMP_INIT_VAL_DEF
@@ -3935,13 +3925,12 @@ def sig_handler(self, sig, *_):
39353925
Signal handler for termination signals
39363926
"""
39373927
if sig in [signal.SIGTERM, signal.SIGINT, signal.SIGHUP]:
3938-
self.exit_flag = True
3939-
self.log.syslog_log(self.log.NOTICE, "Thermal control stopped by signal {}".format(sig))
3940-
self.log.stop()
3941-
if self.sys_config.get("platform_support", 1):
3942-
self.stop(reason="SIG {}".format(sig))
3943-
3944-
os._exit(0)
3928+
global _sig_condition_name
3929+
try:
3930+
_sig_condition_name = signal.Signals(sig).name
3931+
except (ValueError, AttributeError):
3932+
_sig_condition_name = str(sig)
3933+
self.exit.set()
39453934

39463935
# ----------------------------------------------------------------------
39473936
def load_user_configuration(self, user_config_file_name):
@@ -4373,7 +4362,7 @@ def run(self):
43734362
gmemory_snapshot = None
43744363

43754364
# main loop
4376-
while not self.exit.is_set() or not self.exit_flag:
4365+
while not self.exit.is_set():
43774366
try:
43784367
log_level = int(self.read_file(CONST.LOG_LEVEL_FILENAME))
43794368
if log_level != self.cmd_arg["verbosity"]:
@@ -4692,12 +4681,20 @@ class RawTextArgumentDefaultsHelpFormatter(
46924681
thermal_management = None
46934682
try:
46944683
thermal_management = ThermalManagement(args, logger)
4695-
thermal_management.init()
4696-
thermal_management.start(reason="init")
4697-
thermal_management.run()
4684+
if not thermal_management.exit.is_set():
4685+
thermal_management.init()
4686+
thermal_management.start(reason="init")
4687+
thermal_management.run()
4688+
4689+
logger.notice("Thermal control stopped by signal {}".format(_sig_condition_name), repeat=1)
4690+
if (thermal_management is not None and
4691+
hasattr(thermal_management, 'sys_config') and
4692+
thermal_management.sys_config.get("platform_support", 1)):
4693+
thermal_management.stop(reason="SIG {}".format(_sig_condition_name))
4694+
46984695
except Exception as e:
46994696
logger.info(traceback.format_exc())
4700-
if thermal_management:
4697+
if thermal_management is not None:
47014698
thermal_management.stop(reason="crash ({})".format(str(e)))
47024699
sys.exit(1)
47034700

0 commit comments

Comments
 (0)