Skip to content

Commit cdc474f

Browse files
committed
Stabilize the DPU kernel panic and memory exhaustion tests
We should check that the DPUs are offline before checking that they are online after a kernel panic or memory exhaustion. Otherwise, the DPU online check could pass even before the DPUs have rebooted, and the later critical services check will fail. Signed-off-by: Cong Hou <congh@nvidia.com>
1 parent ed96ef8 commit cdc474f

File tree

2 files changed

+26
-1
lines changed

2 files changed

+26
-1
lines changed

tests/smartswitch/common/device_utils_dpu.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,25 @@ def check_dpu_module_status(duthost, power_status, dpu_name):
225225
return False
226226

227227

228+
def check_dpus_module_status(duthost, dpu_list, power_status, wait_timeout=30):
    """
    Check module status of given DPU list

    Each DPU is polled in turn through check_dpu_module_status until it
    reports the expected status or wait_timeout expires, so the timeout
    applies to every DPU separately rather than to the list as a whole.

    Args:
        duthost : Host handle
        dpu_list: List of DPUs to be checked
        power_status: status to be checked (on/off)
        wait_timeout: timeout passed to wait_until for each DPU
    Raises:
        Whatever pytest_assert raises when a DPU does not reach
        power_status within wait_timeout.
    """
    logging.info("Check module status of DPUs")
    # Long timeouts poll every 10; short ones poll roughly three times.
    # Clamp to 1 so that a timeout below 3 does not produce an interval
    # of 0, which would make wait_until poll with no pause at all.
    if wait_timeout > 20:
        wait_interval = 10
    else:
        wait_interval = max(1, wait_timeout // 3)

    for dpu_name in dpu_list:
        reached_status = wait_until(wait_timeout, wait_interval, 0,
                                    check_dpu_module_status, duthost,
                                    power_status, dpu_name)
        pytest_assert(reached_status,
                      f"DPU {dpu_name} is not {power_status}")
245+
246+
228247
def check_dpu_reboot_cause(duthost, dpu_name, reason):
229248
"""
230249
Check reboot cause of all DPU modules

tests/smartswitch/platform_tests/test_reload_dpu.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from tests.common.reboot import reboot, REBOOT_TYPE_COLD, SONIC_SSH_PORT, SONIC_SSH_REGEX
1212
from tests.smartswitch.common.device_utils_dpu import check_dpu_link_and_status,\
1313
pre_test_check, post_test_switch_check, post_test_dpus_check,\
14-
dpus_shutdown_and_check, dpus_startup_and_check,\
14+
dpus_shutdown_and_check, dpus_startup_and_check, check_dpus_module_status,\
1515
num_dpu_modules, check_dpus_are_not_pingable, check_dpus_reboot_cause # noqa: F401
1616
from tests.common.platform.device_utils import platform_api_conn, start_platform_api_service # noqa: F401,F403
1717
from tests.smartswitch.common.reboot import perform_reboot
@@ -216,6 +216,9 @@ def test_dpu_status_post_dpu_kernel_panic(duthosts, dpuhosts,
216216

217217
logging.info("Starting UP the DPUs")
218218
dpus_startup_and_check(duthost, dpu_on_list, num_dpu_modules)
219+
else:
220+
logging.info("Check DPUs are offline")
221+
check_dpus_module_status(duthost, dpu_on_list, "off")
219222

220223
logging.info("Executing post test dpu check")
221224
post_test_dpus_check(duthost, dpuhosts,
@@ -267,6 +270,9 @@ def test_dpu_check_post_dpu_mem_exhaustion(duthosts, dpuhosts,
267270

268271
logging.info("Starting UP the DPUs")
269272
dpus_startup_and_check(duthost, dpu_on_list, num_dpu_modules)
273+
else:
274+
logging.info("Check DPUs are offline")
275+
check_dpus_module_status(duthost, dpu_on_list, "off")
270276

271277
logging.info("Executing post test dpu check")
272278
post_test_dpus_check(duthost, dpuhosts, dpu_on_list, ip_address_list,

0 commit comments

Comments
 (0)