Skip to content

Commit 55e2dd1

Browse files
committed
rhwa nhc: add NHC & SNR sudden-loss system test
Add a test suite that runs through the following scenario. A healthy OpenShift cluster with a MNO topology experiences the sudden shutdown of a worker node that is running a stateful application (a simple Pod writing to a PV). The application is evicted and rescheduled to another healthy node in order to provide business continuity. Assisted-by: Claude
1 parent 8b70abc commit 55e2dd1

12 files changed

Lines changed: 1273 additions & 3 deletions

File tree

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,11 @@
11
---
22
# RHWA default configurations.
3-
...
3+
4+
nhc_target_worker: "" # e.g. "openshift-worker-0.ocp.example.org"
5+
nhc_failover_workers: [] # e.g. ["openshift-worker-1.ocp.example.org"]
6+
nhc_storage_class: "" # e.g. "ocs-external-storagecluster-ceph-rbd"
7+
nhc_app_image: "" # e.g. "registry.ocp.example.org:5000/test/ubi-minimal:latest"
8+
nhc_target_worker_bmc:
9+
address: "" # e.g. "10.1.0.2"
10+
username: "" # e.g. "example-user"
11+
password: "" # e.g. "example-pass"

tests/rhwa/internal/rhwaconfig/rhwaconfig.go

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
package rhwaconfig
22

33
import (
4+
"encoding/json"
45
"log"
56
"os"
67
"path/filepath"
78
"runtime"
9+
"strings"
810

911
"github.com/kelseyhightower/envconfig"
1012
"github.com/rh-ecosystem-edge/eco-gotests/tests/internal/config"
@@ -16,9 +18,33 @@ const (
1618
PathToDefaultRhwaParamsFile = "./default.yaml"
1719
)
1820

21+
// BMCDetails holds BMC connection details for a single node.
type BMCDetails struct {
	Address  string `yaml:"address" json:"address"`
	Username string `yaml:"username" json:"username"`
	Password string `yaml:"password" json:"password"`
}

// Decode implements the envconfig.Decoder interface to parse a JSON string
// from an environment variable into BMCDetails.
func (b *BMCDetails) Decode(value string) error {
	// A blank or whitespace-only variable means "not configured": leave the
	// receiver at its zero value and report success rather than a parse error.
	if len(strings.TrimSpace(value)) == 0 {
		return nil
	}

	return json.Unmarshal([]byte(value), b)
}
37+
1938
// RHWAConfig type keeps rhwa configuration.
type RHWAConfig struct {
	*config.GeneralConfig

	// NHC/SNR sudden-loss test configuration.

	// TargetWorker is the FQDN of the worker node that the test powers off.
	TargetWorker string `yaml:"nhc_target_worker" envconfig:"ECO_RHWA_NHC_TARGET_WORKER"`
	// FailoverWorkers lists worker FQDNs eligible to receive the rescheduled
	// application (comma-separated when set via the environment variable).
	FailoverWorkers []string `yaml:"nhc_failover_workers" envconfig:"ECO_RHWA_NHC_FAILOVER_WORKERS"`
	// StorageClass is the StorageClass name used for the test PVC.
	StorageClass string `yaml:"nhc_storage_class" envconfig:"ECO_RHWA_NHC_STORAGE_CLASS"`
	// AppImage is the container image for the stateful test application.
	AppImage string `yaml:"nhc_app_image" envconfig:"ECO_RHWA_NHC_APP_IMAGE"`
	// TargetWorkerBMC holds BMC connection details for the target worker; as an
	// environment variable it is a JSON object (parsed by BMCDetails.Decode).
	TargetWorkerBMC BMCDetails `yaml:"nhc_target_worker_bmc" envconfig:"ECO_RHWA_NHC_TARGET_WORKER_BMC"`
}
2349

2450
// NewRHWAConfig returns instance of RHWA config type.
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
package rhwaparams
2+
3+
import (
4+
"k8s.io/apimachinery/pkg/runtime/schema"
5+
)
6+
7+
var (
	// SnrGVR is the GroupVersionResource for SelfNodeRemediation resources
	// (self-node-remediation.medik8s.io/v1alpha1, resource "selfnoderemediations").
	SnrGVR = schema.GroupVersionResource{
		Group:    "self-node-remediation.medik8s.io",
		Version:  "v1alpha1",
		Resource: "selfnoderemediations",
	}
)

tests/rhwa/nhc-operator/README.md

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
# RHWA Team - Node Health Check Operator
2+
3+
## Overview
4+
5+
NHC operator tests validate that the Node Health Check (NHC) and Self Node Remediation (SNR) operators
6+
work together to detect unhealthy nodes and remediate them by fencing and evicting stateful workloads
7+
to healthy nodes.
8+
9+
The first test scenario is **Sudden loss of a node**: a healthy MNO cluster experiences the unexpected
10+
shutdown of a worker node running a stateful application. The NHC operator detects the node failure,
11+
creates a `SelfNodeRemediation` resource, and the SNR operator applies an `out-of-service` taint to
12+
fence the node. Kubernetes then force-evicts the stateful pod and reschedules it on a healthy node,
13+
reattaching its persistent storage.
14+
15+
### Prerequisites for running these tests:
16+
17+
The test suite is designed to run on an OCP cluster version 4.19+ with the following components
18+
and configuration.
19+
20+
It has been run successfully on these OCP versions:
21+
- 4.19
22+
23+
It has been tested on bare-metal nodes. For virtualised infrastructure, a virtual BMC must be used,
24+
such as:
25+
26+
- sushy-emulator (from the sushy project) — exposes a Redfish API that maps to libvirt VM power
27+
operations
28+
- VirtualBMC (vbmc) — maps IPMI commands to libvirt, though the test uses Redfish not IPMI
29+
30+
With the sushy-emulator running on the hypervisor, the ECO_RHWA_NHC_TARGET_WORKER_BMC environment
31+
variable must point at the sushy endpoint,
32+
e.g. `{"address":"hypervisor:8000","username":"admin","password":"password"}`. The VMs must have a
33+
watchdog device configured (e.g. i6300esb in libvirt), or set `isSoftwareRebootEnabled: true` as a
34+
fallback.
35+
36+
#### Cluster topology
37+
38+
* A Multi-Node OpenShift (MNO) cluster with **bare-metal** or **virtualised** worker nodes
39+
* At least **2 worker nodes** that will be used by the test (a target node and one or more
40+
failover nodes). The test labels the target node with `node-role.kubernetes.io/appworker`
41+
first to guarantee initial pod placement, then labels the failover nodes after the app is
42+
deployed. All labels are removed at the end
43+
* The target worker node must have **BMC/Redfish** (or iLO/IPMI) access for power control.
44+
The test powers it off via BMC to simulate sudden power loss and powers it back on at the end
45+
46+
The test observes the full remediation lifecycle:
47+
48+
1. Node `Ready` condition transitions to `Unknown` (~40s after power-off)
49+
2. NHC detects the unhealthy condition and creates a `SelfNodeRemediation` CR (~60s after condition change)
50+
3. SNR fences the node with an `out-of-service` taint (~180s after `safeTimeToAssumeNodeRebootedSeconds`)
51+
4. The stateful pod is evicted and rescheduled on a healthy node
52+
5. The PVC is reattached and the pod becomes Ready on the new node
53+
6. The node is powered back on via BMC and returns to `Ready` state
54+
55+
#### Operators
56+
57+
* **Node Health Check operator** (namespace: `openshift-workload-availability`)
58+
* **Self Node Remediation operator** (installed by NHC as the default remediation provider)
59+
60+
#### Operator configuration
61+
62+
* A `SelfNodeRemediationTemplate` CR with `remediationStrategy: OutOfServiceTaint`
63+
* A `NodeHealthCheck` CR (named `nhc-worker-self`) configured with:
64+
* A `selector` matching the worker nodes monitored by NHC (e.g. `node-role.kubernetes.io/worker`).
65+
The selector must match the target and failover nodes
66+
* `minHealthy` set to a value that is **still satisfied** when one node goes down.
67+
For example, with 4 workers under NHC, use `75%` — losing 1 node leaves 3/4 = 75% healthy,
68+
which meets the threshold. If `minHealthy` is too high (e.g. `90%` with 4 nodes requires
69+
all 4 healthy), NHC will not remediate
70+
* `unhealthyConditions` with `duration: 60s` for `Ready` in `False` and `Unknown` status
71+
* A `remediationTemplate` pointing to the `SelfNodeRemediationTemplate` above
72+
* A `SelfNodeRemediationConfig` CR with `safeTimeToAssumeNodeRebootedSeconds: 180`
73+
74+
The [Telco Reference CRs](https://github.com/openshift-kni/telco-reference/)
75+
can provide an up-to-date configuration and values for the settings above.
76+
77+
#### Storage
78+
79+
* A **StorageClass** capable of dynamically provisioning `ReadWriteOnce` PersistentVolumes
80+
(e.g. NFS-based). The test creates a 1Gi PVC for the stateful application. The storage
81+
must support volume reattachment to a different node after the original node is fenced
82+
* The test verifies `VolumeAttachment` resources for CSI-backed storage. For non-CSI storage
83+
(e.g. NFS), this check is skipped — the PVC being Bound and the pod Running on the new node
84+
is sufficient verification
85+
86+
#### Container image
87+
88+
* A container image accessible from the cluster (e.g. `ubi-minimal`). In disconnected
89+
environments, mirror it to the local registry. The test uses this image to run a simple
90+
heartbeat loop as the stateful application
91+
92+
### Test suites:
93+
94+
| Name | Description |
95+
|------|-------------|
96+
| [sudden-node-loss](tests/sudden-node-loss.go) | Powers off a worker node via BMC and verifies NHC/SNR remediation and pod rescheduling |
97+
98+
### Internal pkgs
99+
100+
| Name | Description |
101+
|------|-------------|
102+
| [nhcparams](internal/nhcparams/const.go) | Constants, labels, timeouts, and reporter configuration for NHC tests |
103+
104+
### Inputs
105+
106+
Environment variables for test configuration:
107+
108+
- `ECO_RHWA_NHC_TARGET_WORKER`: FQDN of the worker node to power off (must match the BMC address)
109+
- `ECO_RHWA_NHC_FAILOVER_WORKERS`: comma-separated list of worker FQDNs eligible for pod rescheduling
110+
- `ECO_RHWA_NHC_STORAGE_CLASS`: StorageClass name for the test PVC (e.g. `standard`)
111+
- `ECO_RHWA_NHC_APP_IMAGE`: container image for the stateful test application
112+
- `ECO_RHWA_NHC_TARGET_WORKER_BMC`: JSON object with BMC connection details, e.g. `{"address":"10.1.29.13","username":"user","password":"pass"}`
113+
114+
Please refer to the project README for a list of global inputs - [How to run](../../../README.md#how-to-run)
115+
116+
### Running NHC Test Suites
117+
118+
```bash
119+
# export KUBECONFIG=</path/to/kubeconfig>
120+
# export ECO_RHWA_NHC_TARGET_WORKER=openshift-worker-0.example.com
121+
# export ECO_RHWA_NHC_FAILOVER_WORKERS=openshift-worker-1.example.com
122+
# export ECO_RHWA_NHC_STORAGE_CLASS=standard
123+
# export ECO_RHWA_NHC_APP_IMAGE=registry.example.com:5000/test/ubi-minimal:latest
124+
# export ECO_RHWA_NHC_TARGET_WORKER_BMC='{"address":"10.1.29.13","username":"admin","password":"secret"}'
125+
# make run-tests
126+
```
127+
128+
**Note on timeouts:** The `go test` command must use `-timeout` greater than the ginkgo timeout
129+
(e.g. `-timeout=30m` with `-ginkgo.timeout=20m`). If `go test` uses its default of 10 minutes,
130+
the Go test harness will kill the process before ginkgo can complete the test and run cleanup
131+
(AfterAll), which includes powering the node back on.
132+
133+
**Expected duration:** A full sudden-node-loss run typically takes **11–15 minutes** end-to-end,
134+
broken down as follows (observed on a 4-worker bare-metal cluster with `unhealthyConditions.duration=60s`
135+
and `safeTimeToAssumeNodeRebootedSeconds=180`):
136+
137+
| Phase | Typical duration | Notes |
138+
|-------|-----------------|-------|
139+
| Step 3: Deploy app & verify placement | ~10s | PVC binding + pod scheduling |
140+
| Step 4: Power off node & detect failure | ~50s | ~40s for kubelet heartbeat timeout |
141+
| Step 5: NHC marks unhealthy & creates SNR | ~60s | Matches `unhealthyConditions.duration` |
142+
| Step 6: SNR fences node (out-of-service taint) | 3–5 min | 180s fence timer + SNR waits for all pods on the dead node to finish terminating; system pods like `dns-default` can extend this |
143+
| Step 7: Verify rescheduling | < 1s | Pod is rescheduled as soon as taint is applied |
144+
| AfterAll: Power on node & wait for Ready | ~5 min | Bare metal boot + kubelet registration |
145+
146+
Step 6 is the most variable: after the 180s `safeTimeToAssumeNodeRebootedSeconds` timer expires,
147+
the SNR operator waits for all terminating pods on the fenced node to complete deletion before
148+
marking fencing as complete. System pods (e.g. `dns-default`, `ingress-canary`) on an unreachable
149+
node can take several additional minutes to terminate, pushing Step 6 to 5–8 minutes in the worst
150+
case. Combined with the AfterAll node recovery, the total can reach ~17–20 minutes, which is why
151+
the ginkgo timeout is set to 20 minutes and the Go test timeout to 30 minutes.
152+

tests/rhwa/nhc-operator/internal/nhcparams/const.go

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,31 @@ package nhcparams
33
const (
	// Label represents the nhc operator label that can be used for test case selection.
	Label = "nhc"

	// LabelSuddenLoss is the label for the sudden-loss test scenario.
	LabelSuddenLoss = "sudden-loss"

	// NHCResourceName is the name of the NodeHealthCheck CR.
	NHCResourceName = "nhc-worker-self"

	// AppNamespace is the namespace for the stateful test application.
	AppNamespace = "stateful-app-test"

	// AppName is the name of the stateful test deployment.
	AppName = "stateful-app"

	// AppLabelKey is the label key for the stateful test application.
	AppLabelKey = "app"

	// AppLabelValue is the label value for the stateful test application.
	AppLabelValue = "stateful-app"

	// AppWorkerLabel is the node label used to select worker nodes for the test app.
	AppWorkerLabel = "node-role.kubernetes.io/appworker"

	// PVCName is the name of the PersistentVolumeClaim for the test app.
	PVCName = "app-data"

	// PVCSize is the requested capacity of the PVC for the test app.
	PVCSize = "1Gi"
)
Lines changed: 59 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,66 @@
11
package nhcparams
22

3-
import "github.com/rh-ecosystem-edge/eco-gotests/tests/rhwa/internal/rhwaparams"
3+
import (
4+
"time"
5+
6+
"github.com/openshift-kni/k8sreporter"
7+
"github.com/rh-ecosystem-edge/eco-gotests/tests/rhwa/internal/rhwaparams"
8+
corev1 "k8s.io/api/core/v1"
9+
"k8s.io/apimachinery/pkg/runtime/schema"
10+
)
411

512
var (
	// Labels represents the range of labels that can be used for test case selection.
	Labels = []string{rhwaparams.Label, Label}

	// OperatorDeploymentName represents the NHC deployment name.
	OperatorDeploymentName = "node-healthcheck-controller-manager"

	// OperatorControllerPodLabel is how the controller pod is labeled.
	OperatorControllerPodLabel = "node-healthcheck-operator"

	// ReporterNamespacesToDump tells the reporter which namespaces to collect logs from.
	ReporterNamespacesToDump = map[string]string{
		rhwaparams.RhwaOperatorNs: rhwaparams.RhwaOperatorNs,
		AppNamespace:              AppNamespace,
	}

	// ReporterCRDsToDump tells the reporter which CRs to dump.
	ReporterCRDsToDump = []k8sreporter.CRData{
		{Cr: &corev1.PodList{}},
	}

	// NhcGVR is the GroupVersionResource for NodeHealthCheck resources
	// (remediation.medik8s.io/v1alpha1, resource "nodehealthchecks").
	NhcGVR = schema.GroupVersionResource{
		Group:    "remediation.medik8s.io",
		Version:  "v1alpha1",
		Resource: "nodehealthchecks",
	}

	// NodeReadyTimeout is how long to wait for a node Ready condition change.
	NodeReadyTimeout = 2 * time.Minute

	// NHCObserveTimeout is how long to wait for NHC to mark a node unhealthy.
	NHCObserveTimeout = 3 * time.Minute

	// SNRFenceTimeout is how long to wait for SNR to fence the node.
	SNRFenceTimeout = 5 * time.Minute

	// RescheduleTimeout is how long to wait for the pod to reschedule.
	RescheduleTimeout = 5 * time.Minute

	// DeploymentTimeout is how long to wait for a deployment to become ready.
	DeploymentTimeout = 5 * time.Minute

	// DeletionTimeout is how long to wait for a deletion to apply.
	DeletionTimeout = 5 * time.Minute

	// NodeRecoveryTimeout is how long to wait for a node to become Ready after power-on.
	NodeRecoveryTimeout = 25 * time.Minute

	// PollingInterval is the default polling interval for Eventually blocks.
	PollingInterval = 10 * time.Second

	// BMCTimeout is the Redfish operation timeout.
	BMCTimeout = 6 * time.Minute
)
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
package nhc
2+
3+
import (
4+
"runtime"
5+
"testing"
6+
7+
. "github.com/onsi/ginkgo/v2"
8+
. "github.com/onsi/gomega"
9+
"github.com/rh-ecosystem-edge/eco-goinfra/pkg/reportxml"
10+
"github.com/rh-ecosystem-edge/eco-gotests/tests/internal/reporter"
11+
. "github.com/rh-ecosystem-edge/eco-gotests/tests/rhwa/internal/rhwainittools"
12+
"github.com/rh-ecosystem-edge/eco-gotests/tests/rhwa/nhc-operator/internal/nhcparams"
13+
_ "github.com/rh-ecosystem-edge/eco-gotests/tests/rhwa/nhc-operator/tests"
14+
)
15+
16+
// currentFile is the absolute path of this source file, used to derive report paths.
var _, currentFile, _, _ = runtime.Caller(0)
17+
18+
func TestNHC(t *testing.T) {
19+
_, reporterConfig := GinkgoConfiguration()
20+
reporterConfig.JUnitReport = RHWAConfig.GetJunitReportPath(currentFile)
21+
22+
RegisterFailHandler(Fail)
23+
RunSpecs(t, "NHC", Label(nhcparams.Labels...), reporterConfig)
24+
}
25+
26+
// Dump the namespaces and CRs listed in nhcparams whenever a spec fails.
var _ = JustAfterEach(func() {
	reporter.ReportIfFailed(
		CurrentSpecReport(), currentFile, nhcparams.ReporterNamespacesToDump, nhcparams.ReporterCRDsToDump)
})

// Write the report-xml suite report once all specs have finished.
var _ = ReportAfterSuite("", func(report Report) {
	reportxml.Create(
		report, RHWAConfig.GetReportPath(), RHWAConfig.TCPrefix)
})

0 commit comments

Comments
 (0)