Skip to content

Commit 3c691ff

Browse files
ggoklani and Gaurav Goklani authored
test: Added an RDMA validation testcase that checks the expected RDMA enablement and Azure RDMA persistent naming setup. (#77)
Purpose: automatically verify that the role correctly enables RDMA in waagent, installs the RDMA userland tools, and (on Azure with systemd) configures and maintains the Azure persistent RDMA naming services. How to run: execute {{ __hpc_azure_tests_dir }}/test-rdma.sh after the role completes. Expected result: exit status 0 with “Test Passed …” lines; a non-zero exit status with a “Failed: …” line explaining the missing or failed prerequisite. Also moved the "Create Azure HPC resource directories" task to the beginning to avoid a path-not-found issue in other tasks. Signed-off-by: Gaurav Goklani <ggoklani@redhat.com> Co-authored-by: Gaurav Goklani <ggoklani@ggoklani-thinkpadt14gen4.punetw6.csb>
1 parent 74cac79 commit 3c691ff

File tree

2 files changed

+121
-14
lines changed

2 files changed

+121
-14
lines changed

files/rdma/test-rdma.sh

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
#!/usr/bin/env bash
# SPDX-License-Identifier: MIT
#
# RDMA Validation Script
# Usage: test-rdma.sh
#
# Checks, in order:
#   1. /etc/waagent.conf contains the exact line "OS.EnableRDMA=y".
#   2. The RDMA userland tool ibv_devinfo is available in PATH.
#   3. On Azure hosts running systemd only: the Azure persistent RDMA naming
#      scripts, unit files and udev rule exist, both units are enabled, the
#      naming unit is not in the "failed" state and the monitor unit is active.
#
# Exit status: 0 when every check passes (or is skipped), 1 on the first
# failed check, which is reported as a "Failed: ..." line.

# This is test code, and some operations are expected to fail. Hence we can't
# use set -e to automatically exit the script if something fails.
set -u
12+
#######################################
# Report a failed check and terminate the script.
# Arguments: $@ - human-readable reason; all arguments are joined with spaces.
#                 "$*" is used instead of "$1" so that a call without any
#                 argument still prints "Failed: " rather than aborting on an
#                 unbound variable under `set -u`.
# Outputs:   "Failed: <reason>" on stdout, like the "Test Passed" lines.
# Returns:   never returns; exits the shell with status 1.
#######################################
fail() {
  printf 'Failed: %s\n' "$*"
  exit 1
}
17+
18+
#######################################
# Assert that a path exists.
# Arguments: $1 - path to check
# Outputs:   nothing on success; on failure, whatever fail() prints
# Returns:   0 if the path exists; otherwise calls fail(), which exits.
# Note:      uses -e, so any existing path type (regular file, directory,
#            symlink to an existing target, ...) satisfies the check.
#######################################
require_file() {
  local path="$1"
  [[ -e "$path" ]] || fail "missing file: $path"
}
22+
23+
#######################################
# Assert that a path is an executable regular file.
# Arguments: $1 - path to check
# Outputs:   nothing on success; on failure, whatever fail() prints
# Returns:   0 if the path is a regular file (or a symlink to one) with
#            execute permission; otherwise calls fail(), which exits.
# Note:      -x alone is also true for a searchable directory, so -f is
#            checked as well to make sure the path really is a script/binary.
#######################################
require_executable() {
  local path="$1"
  [[ -f "$path" && -x "$path" ]] || fail "not executable: $path"
}
27+
28+
#######################################
# Assert that a command name can be resolved by the shell.
# Arguments: $1 - command name
# Outputs:   nothing on success; on failure, whatever fail() prints
# Returns:   0 if `command -v` resolves the name; otherwise calls fail(),
#            which exits.
# Note:      `command -v` also succeeds for shell builtins, functions and
#            aliases, not only for executables found in PATH.
#######################################
require_cmd() {
  local cmd="$1"
  command -v "$cmd" >/dev/null 2>&1 || fail "missing command in PATH: $cmd"
}
32+
33+
#######################################
# Print the system vendor string reported through DMI.
# Arguments: $1 - optional path of the vendor file to read; defaults to
#                 /sys/class/dmi/id/sys_vendor
# Outputs:   the file's contents on stdout, or an empty line if the file is
#            not readable or reading it fails
# Returns:   0
#######################################
sys_vendor() {
  local vendor_file="${1:-/sys/class/dmi/id/sys_vendor}"
  if [[ -r "$vendor_file" ]]; then
    # Fall back to the empty answer if the read itself fails, so callers
    # always get exactly one well-defined value to compare against.
    cat -- "$vendor_file" 2>/dev/null || echo ""
  else
    echo ""
  fi
}
40+
41+
#######################################
# Tell whether the running system is managed by systemd.
# Arguments: none
# Outputs:   none
# Returns:   0 if systemd is detected, non-zero otherwise
#######################################
is_systemd() {
  # /run/systemd/system is the check documented for sd_booted(3). Testing it
  # first keeps detection working on hosts where `ps` is not installed, which
  # would otherwise be reported as "not systemd" and skip the unit checks.
  [[ -d /run/systemd/system ]] && return 0
  # Fallback: inspect the command name of PID 1.
  [[ "$(ps -p 1 -o comm= 2>/dev/null || true)" == "systemd" ]]
}
44+
45+
#######################################
# Run all RDMA validation checks in order, stopping at the first failure.
# Arguments: none used
# Outputs:   a "Testing ..." header and a "Test Passed: ..." line per check
#            group on stdout; a failing check prints via fail()
# Returns:   0 when all checks pass or the Azure/systemd-specific checks are
#            skipped; a failing check exits the script through fail()
#######################################
main() {
  echo
  echo "Testing waagent RDMA flag"
  require_file /etc/waagent.conf
  # -F: fixed string, -x: the whole line must match, -q: no output.
  grep -Fxq "OS.EnableRDMA=y" /etc/waagent.conf \
    || fail "expected 'OS.EnableRDMA=y' in /etc/waagent.conf"
  echo "Test Passed: waagent RDMA flag is set"

  echo
  echo "Testing RDMA userland tools"
  require_cmd ibv_devinfo
  echo "Test Passed: RDMA tools are present (ibv_devinfo)"

  # Azure persistent RDMA naming artifacts/services (Azure only)
  if [[ "$(sys_vendor)" != "Microsoft Corporation" ]]; then
    echo
    echo "Testing Azure persistent RDMA naming (skip: not Azure)"
    echo "Test Passed: not running on Azure; Azure persistent RDMA naming checks skipped"
    return 0
  fi

  # The remaining checks inspect systemd units, so they only apply when
  # systemd is running.
  if ! is_systemd; then
    echo
    echo "Testing Azure persistent RDMA naming (skip: not systemd)"
    echo "Test Passed: not running systemd; systemd unit checks skipped"
    return 0
  fi

  echo
  echo "Testing Azure persistent RDMA naming artifacts"
  require_executable /usr/sbin/azure_persistent_rdma_naming.sh
  require_executable /usr/sbin/azure_persistent_rdma_naming_monitor.sh
  require_file /etc/systemd/system/azure_persistent_rdma_naming.service
  require_file /etc/systemd/system/azure_persistent_rdma_naming_monitor.service
  require_file /etc/udev/rules.d/99-azure-persistent-rdma-naming.rules
  echo "Test Passed: Azure persistent RDMA naming artifacts exist"

  echo
  echo "Testing Azure persistent RDMA naming services"
  require_cmd systemctl
  systemctl is-enabled azure_persistent_rdma_naming.service >/dev/null 2>&1 \
    || fail "azure_persistent_rdma_naming.service not enabled"
  systemctl is-enabled azure_persistent_rdma_naming_monitor.service >/dev/null 2>&1 \
    || fail "azure_persistent_rdma_naming_monitor.service not enabled"

  # azure_persistent_rdma_naming.service is Type=oneshot, so it may not remain
  # "active" after it runs. Treat "failed" as an error; other states are OK.
  local naming_state
  naming_state="$(systemctl is-failed azure_persistent_rdma_naming.service 2>/dev/null || true)"
  if [[ "$naming_state" == "failed" ]]; then
    fail "azure_persistent_rdma_naming.service is in failed state"
  fi

  # Monitor service should be continuously running.
  systemctl is-active azure_persistent_rdma_naming_monitor.service >/dev/null 2>&1 \
    || fail "azure_persistent_rdma_naming_monitor.service not active"
  echo "Test Passed: Azure persistent RDMA naming services look healthy"
}
97+
98+
main "$@"
99+

tasks/main.yml

Lines changed: 22 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -313,6 +313,20 @@
313313
ternary('ansible.posix.rhel_rpm_ostree', omit) }}"
314314
register: __hpc_azure_packages_install
315315
until: __hpc_azure_packages_install is success
316+
# Creates the role's Azure HPC directories (resource dir and its bin/
# subdirectory, tools, tests and runtime dirs), owned by root with mode 0755.
# Placed early in the task list so that later tasks which write into these
# directories do not fail with a path-not-found error.
- name: Create Azure HPC resource directories
  file:
    path: "{{ item }}"
    state: directory
    owner: root
    group: root
    mode: '0755'
  loop:
    - "{{ __hpc_azure_resource_dir }}"
    - "{{ __hpc_azure_resource_dir }}/bin"
    - "{{ __hpc_azure_tools_dir }}"
    - "{{ __hpc_azure_tests_dir }}"
    - "{{ __hpc_azure_runtime_dir }}"
316330

317331
- name: Install NVidia driver
318332
# Note that currently the role supports only Microsoft Azure
@@ -539,6 +553,14 @@
539553
state: started
540554
daemon_reload: "{{ __hpc_azure_persistent_rdma_naming_monitor_unit.changed | d(false) }}"
541555

# Copies the RDMA validation script from the role's files (rdma/test-rdma.sh)
# into the tests directory as a root-owned executable (mode 0755), so it can
# be run there after the role completes.
- name: Install RDMA validation script
  copy:
    src: rdma/test-rdma.sh
    dest: "{{ __hpc_azure_tests_dir }}/test-rdma.sh"
    owner: root
    group: root
    mode: "0755"
563+
542564
- name: Install common OpenMPI packages
543565
when: hpc_install_system_openmpi or hpc_build_openmpi_w_nvidia_gpu_support
544566
package:
@@ -959,20 +981,6 @@
959981
mode: '0644'
960982
notify: Reload udev
961983

962-
- name: Create Azure HPC resource directories
963-
file:
964-
path: "{{ item }}"
965-
state: directory
966-
owner: root
967-
group: root
968-
mode: '0755'
969-
loop:
970-
- "{{ __hpc_azure_resource_dir }}"
971-
- "{{ __hpc_azure_resource_dir }}/bin"
972-
- "{{ __hpc_azure_tools_dir }}"
973-
- "{{ __hpc_azure_tests_dir }}"
974-
- "{{ __hpc_azure_runtime_dir }}"
975-
976984
- name: Install SKU Customisation scripts and services
977985
when: hpc_sku_customisation
978986
block:

0 commit comments

Comments
 (0)