rh-ecosystem-edge
diff --git a/‎tests/rhwa/internal/rhwaconfig/default.yaml‎
Lines changed: 9 additions & 1 deletion b/‎tests/rhwa/internal/rhwaconfig/default.yaml‎
Lines changed: 9 additions & 1 deletion
diff --git a/‎tests/rhwa/internal/rhwaconfig/rhwaconfig.go‎
Lines changed: 26 additions & 0 deletions b/‎tests/rhwa/internal/rhwaconfig/rhwaconfig.go‎
Lines changed: 26 additions & 0 deletions
diff --git a/‎tests/rhwa/internal/rhwaparams/rhwavars.go‎
Lines changed: 14 additions & 0 deletions b/‎tests/rhwa/internal/rhwaparams/rhwavars.go‎
Lines changed: 14 additions & 0 deletions
diff --git a/‎tests/rhwa/nhc-operator/README.md‎
Lines changed: 152 additions & 0 deletions b/‎tests/rhwa/nhc-operator/README.md‎
Lines changed: 152 additions & 0 deletions
diff --git a/‎tests/rhwa/nhc-operator/internal/nhcparams/const.go‎
Lines changed: 27 additions & 0 deletions b/‎tests/rhwa/nhc-operator/internal/nhcparams/const.go‎
Lines changed: 27 additions & 0 deletions
diff --git a/‎tests/rhwa/nhc-operator/internal/nhcparams/nhcvars.go‎
Lines changed: 59 additions & 1 deletion b/‎tests/rhwa/nhc-operator/internal/nhcparams/nhcvars.go‎
Lines changed: 59 additions & 1 deletion
diff --git a/‎tests/rhwa/nhc-operator/nhc_suite_test.go‎
Lines changed: 34 additions & 0 deletions b/‎tests/rhwa/nhc-operator/nhc_suite_test.go‎
Lines changed: 34 additions & 0 deletions
@@ -1,3 +1,11 @@
 ---
 # RHWA default configurations.
-...
+
+nhc_target_worker: ""       # e.g. "openshift-worker-0.ocp.example.org"
+nhc_failover_workers: []    # e.g. ["openshift-worker-1.ocp.example.org"]
+nhc_storage_class: ""       # e.g. "ocs-external-storagecluster-ceph-rbd"
+nhc_app_image: ""           # e.g. "registry.ocp.example.org:5000/test/ubi-minimal:latest"
+nhc_target_worker_bmc:
+  address: ""               # e.g. "10.1.0.2"
+  username: ""              # e.g. "example-user"
+  password: ""              # e.g. "example-pass"
@@ -1,10 +1,12 @@
 package rhwaconfig
 
 import (
+	"encoding/json"
 	"log"
 	"os"
 	"path/filepath"
 	"runtime"
+	"strings"
 
 	"github.com/kelseyhightower/envconfig"
 	"github.com/rh-ecosystem-edge/eco-gotests/tests/internal/config"
@@ -16,9 +18,33 @@ const (
 	PathToDefaultRhwaParamsFile = "./default.yaml"
 )
 
+// BMCDetails holds the BMC address and login credentials for a single node.
+type BMCDetails struct {
+	Address  string `yaml:"address" json:"address"`
+	Username string `yaml:"username" json:"username"`
+	Password string `yaml:"password" json:"password"`
+}
+
+// Decode implements the envconfig.Decoder interface by unmarshalling a JSON object string into b.
+// An empty or whitespace-only value is a no-op: b is left unchanged and nil is returned.
+func (b *BMCDetails) Decode(value string) error {
+	if strings.TrimSpace(value) == "" {
+		return nil
+	}
+
+	return json.Unmarshal([]byte(value), b)
+}
+
 // RHWAConfig type keeps rhwa configuration.
 type RHWAConfig struct {
 	*config.GeneralConfig
+
+	// NHC/SNR sudden-loss test settings, each tagged with its YAML key and ECO_RHWA_NHC_* environment variable.
+	TargetWorker    string     `yaml:"nhc_target_worker" envconfig:"ECO_RHWA_NHC_TARGET_WORKER"`
+	FailoverWorkers []string   `yaml:"nhc_failover_workers" envconfig:"ECO_RHWA_NHC_FAILOVER_WORKERS"`
+	StorageClass    string     `yaml:"nhc_storage_class" envconfig:"ECO_RHWA_NHC_STORAGE_CLASS"`
+	AppImage        string     `yaml:"nhc_app_image" envconfig:"ECO_RHWA_NHC_APP_IMAGE"`
+	TargetWorkerBMC BMCDetails `yaml:"nhc_target_worker_bmc" envconfig:"ECO_RHWA_NHC_TARGET_WORKER_BMC"`
 }
 
 // NewRHWAConfig returns instance of RHWA config type.
 
@@ -0,0 +1,14 @@
+package rhwaparams
+
+import (
+	"k8s.io/apimachinery/pkg/runtime/schema"
+)
+
+var (
+	// SnrGVR identifies SelfNodeRemediation resources: group self-node-remediation.medik8s.io, version v1alpha1.
+	SnrGVR = schema.GroupVersionResource{
+		Group:    "self-node-remediation.medik8s.io",
+		Version:  "v1alpha1",
+		Resource: "selfnoderemediations",
+	}
+)
@@ -0,0 +1,152 @@
+# RHWA Team - Node Health Check Operator
+
+## Overview
+
+NHC operator tests validate that the Node Health Check (NHC) and Self Node Remediation (SNR) operators
+work together to detect unhealthy nodes and remediate them by fencing and evicting stateful workloads
+to healthy nodes.
+
+The first test scenario is **Sudden loss of a node**: a healthy MNO cluster experiences the unexpected
+shutdown of a worker node running a stateful application. The NHC operator detects the node failure,
+creates a `SelfNodeRemediation` resource, and the SNR operator applies an `out-of-service` taint to
+fence the node. Kubernetes then force-evicts the stateful pod and reschedules it on a healthy node,
+reattaching its persistent storage.
+
+### Prerequisites for running these tests:
+
+The test suite is designed to run on an OCP cluster version 4.19+ with the following components
+and configuration.
+
+It has been run successfully on these OCP versions:
+- 4.19
+
+It has been tested on bare-metal nodes. For virtualised infrastructure, a virtual BMC must be used,
+such as:
+
+  - sushy-emulator (from the sushy project) — exposes a Redfish API that maps to libvirt VM power
+    operations
+  - VirtualBMC (vbmc) — maps IPMI commands to libvirt; note that the test uses Redfish, not IPMI
+
+With the sushy-emulator running on the hypervisor, the `ECO_RHWA_NHC_TARGET_WORKER_BMC` environment
+variable must point at the sushy endpoint
+(e.g. `{"address":"hypervisor:8000","username":"admin","password":"password"}`). The VMs must have a
+watchdog device configured (e.g. i6300esb in libvirt); as a fallback, set
+`isSoftwareRebootEnabled: true` in the `SelfNodeRemediationConfig` CR.
+
+#### Cluster topology
+
+* A Multi-Node OpenShift (MNO) cluster with **bare-metal** or **virtualised** worker nodes
+* At least **2 worker nodes** that will be used by the test (a target node and one or more
+  failover nodes). The test labels the target node with `node-role.kubernetes.io/appworker`
+  first to guarantee initial pod placement, then labels the failover nodes after the app is
+  deployed. All labels are removed at the end
+* The target worker node must have **BMC/Redfish** (or iLO/IPMI) access for power control.
+  The test powers it off via BMC to simulate sudden power loss and powers it back on at the end
+
+The test observes the full remediation lifecycle:
+
+1. Node `Ready` condition transitions to `Unknown` (~40s after power-off)
+2. NHC detects the unhealthy condition and creates a `SelfNodeRemediation` CR (~60s after condition change)
+3. SNR fences the node with an `out-of-service` taint (once the ~180s `safeTimeToAssumeNodeRebootedSeconds` timer expires)
+4. The stateful pod is evicted and rescheduled on a healthy node
+5. The PVC is reattached and the pod becomes Ready on the new node
+6. The node is powered back on via BMC and returns to `Ready` state
+
+#### Operators
+
+* **Node Health Check operator** (namespace: `openshift-workload-availability`)
+* **Self Node Remediation operator** (installed as default remediation provider by NHC)
+
+#### Operator configuration
+
+* A `SelfNodeRemediationTemplate` CR with `remediationStrategy: OutOfServiceTaint`
+* A `NodeHealthCheck` CR (named `nhc-worker-self`) configured with:
+  * A `selector` matching the worker nodes monitored by NHC (e.g. `node-role.kubernetes.io/worker`).
+    The selector must match the target and failover nodes
+  * `minHealthy` set to a value that is **still satisfied** when one node goes down.
+    For example, with 4 workers under NHC, use `75%` — losing 1 node leaves 3/4 = 75% healthy,
+    which meets the threshold. If `minHealthy` is too high (e.g. `90%` with 4 nodes requires
+    all 4 healthy), NHC will not remediate
+  * `unhealthyConditions` with `duration: 60s` for `Ready` in `False` and `Unknown` status
+  * A `remediationTemplate` pointing to the `SelfNodeRemediationTemplate` above
+* A `SelfNodeRemediationConfig` CR with `safeTimeToAssumeNodeRebootedSeconds: 180`
+
+The [Telco Reference CRs](https://github.com/openshift-kni/telco-reference/)
+can provide an up-to-date configuration and values for the settings above.
+
+#### Storage
+
+* A **StorageClass** capable of dynamically provisioning `ReadWriteOnce` PersistentVolumes
+  (e.g. NFS-based). The test creates a 1Gi PVC for the stateful application. The storage
+  must support volume reattachment to a different node after the original node is fenced
+* The test verifies `VolumeAttachment` resources for CSI-backed storage. For non-CSI storage
+  (e.g. NFS), this check is skipped — the PVC being Bound and the pod Running on the new node
+  is sufficient verification
+
+#### Container image
+
+* A container image accessible from the cluster (e.g. `ubi-minimal`). In disconnected
+  environments, mirror it to the local registry. The test uses this image to run a simple
+  heartbeat loop as the stateful application
+
+### Test suites:
+
+| Name | Description |
+|------|-------------|
+| [sudden-node-loss](tests/sudden-node-loss.go) | Powers off a worker node via BMC and verifies NHC/SNR remediation and pod rescheduling |
+
+### Internal pkgs
+
+| Name | Description |
+|------|-------------|
+| [nhcparams](internal/nhcparams/const.go) | Constants, labels, timeouts, and reporter configuration for NHC tests |
+
+### Inputs
+
+Environment variables for test configuration:
+
+- `ECO_RHWA_NHC_TARGET_WORKER`: FQDN of the worker node to power off (must be the node whose BMC is given in `ECO_RHWA_NHC_TARGET_WORKER_BMC`)
+- `ECO_RHWA_NHC_FAILOVER_WORKERS`: comma-separated list of worker FQDNs eligible for pod rescheduling
+- `ECO_RHWA_NHC_STORAGE_CLASS`: StorageClass name for the test PVC (e.g. `standard`)
+- `ECO_RHWA_NHC_APP_IMAGE`: container image for the stateful test application
+- `ECO_RHWA_NHC_TARGET_WORKER_BMC`: JSON object with BMC connection details, e.g. `{"address":"10.1.29.13","username":"user","password":"pass"}`
+
+Please refer to the project README for a list of global inputs - [How to run](../../../README.md#how-to-run)
+
+### Running NHC Test Suites
+
+```bash
+# export KUBECONFIG=</path/to/kubeconfig>
+# export ECO_RHWA_NHC_TARGET_WORKER=openshift-worker-0.example.com
+# export ECO_RHWA_NHC_FAILOVER_WORKERS=openshift-worker-1.example.com
+# export ECO_RHWA_NHC_STORAGE_CLASS=standard
+# export ECO_RHWA_NHC_APP_IMAGE=registry.example.com:5000/test/ubi-minimal:latest
+# export ECO_RHWA_NHC_TARGET_WORKER_BMC='{"address":"10.1.29.13","username":"admin","password":"secret"}'
+# make run-tests
+```
+
+**Note on timeouts:** The `go test` command must use `-timeout` greater than the ginkgo timeout
+(e.g. `-timeout=30m` with `-ginkgo.timeout=20m`). If `go test` uses its default of 10 minutes,
+the Go test harness will kill the process before ginkgo can complete the test and run cleanup
+(AfterAll), which includes powering the node back on.
+
+**Expected duration:** A full sudden-node-loss run typically takes **11–15 minutes** end-to-end,
+broken down as follows (observed on a 4-worker bare-metal cluster with `unhealthyConditions.duration=60s`
+and `safeTimeToAssumeNodeRebootedSeconds=180`):
+
+| Phase | Typical duration | Notes |
+|-------|-----------------|-------|
+| Step 3: Deploy app & verify placement | ~10s | PVC binding + pod scheduling |
+| Step 4: Power off node & detect failure | ~50s | ~40s for kubelet heartbeat timeout |
+| Step 5: NHC marks unhealthy & creates SNR | ~60s | Matches `unhealthyConditions.duration` |
+| Step 6: SNR fences node (out-of-service taint) | 3–5 min | 180s fence timer + SNR waits for all pods on the dead node to finish terminating; system pods like `dns-default` can extend this |
+| Step 7: Verify rescheduling | < 1s | Pod is rescheduled as soon as taint is applied |
+| AfterAll: Power on node & wait for Ready | ~5 min | Bare metal boot + kubelet registration |
+
+Step 6 is the most variable: after the 180s `safeTimeToAssumeNodeRebootedSeconds` timer expires,
+the SNR operator waits for all terminating pods on the fenced node to complete deletion before
+marking fencing as complete. System pods (e.g. `dns-default`, `ingress-canary`) on an unreachable
+node can take several additional minutes to terminate, pushing Step 6 to 5–8 minutes in the worst
+case. Combined with the AfterAll node recovery, the total can reach ~17–20 minutes, which is why
+the ginkgo timeout is set to 20 minutes and the Go test timeout to 30 minutes.
+
@@ -3,4 +3,31 @@ package nhcparams
 const (
 	// Label represents nhc operator label that can be used for test cases selection.
 	Label = "nhc"
+
+	// LabelSuddenLoss is the label that selects the sudden-loss test scenario.
+	LabelSuddenLoss = "sudden-loss"
+
+	// NHCResourceName is the name of the NodeHealthCheck CR used by the tests.
+	NHCResourceName = "nhc-worker-self"
+
+	// AppNamespace is the namespace that hosts the stateful test application.
+	AppNamespace = "stateful-app-test"
+
+	// AppName is the name of the stateful test application's deployment.
+	AppName = "stateful-app"
+
+	// AppLabelKey is the key of the label that identifies the stateful test application.
+	AppLabelKey = "app"
+
+	// AppLabelValue is the value of the label that identifies the stateful test application.
+	AppLabelValue = "stateful-app"
+
+	// AppWorkerLabel is the node label that marks workers eligible to run the test app.
+	AppWorkerLabel = "node-role.kubernetes.io/appworker"
+
+	// PVCName is the name of the test app's PersistentVolumeClaim.
+	PVCName = "app-data"
+
+	// PVCSize is the storage size requested by the test app's PVC.
+	PVCSize = "1Gi"
 )
@@ -1,8 +1,66 @@
 package nhcparams
 
-import "github.com/rh-ecosystem-edge/eco-gotests/tests/rhwa/internal/rhwaparams"
+import (
+	"time"
+
+	"github.com/openshift-kni/k8sreporter"
+	"github.com/rh-ecosystem-edge/eco-gotests/tests/rhwa/internal/rhwaparams"
+	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/runtime/schema"
+)
 
 var (
 	// Labels represents the range of labels that can be used for test cases selection.
 	Labels = []string{rhwaparams.Label, Label}
+
+	// OperatorDeploymentName is the name of the NHC operator deployment.
+	OperatorDeploymentName = "node-healthcheck-controller-manager"
+
+	// OperatorControllerPodLabel is how the controller pod is labeled.
+	OperatorControllerPodLabel = "node-healthcheck-operator"
+
+	// ReporterNamespacesToDump lists the namespaces the reporter collects logs from.
+	ReporterNamespacesToDump = map[string]string{
+		rhwaparams.RhwaOperatorNs: rhwaparams.RhwaOperatorNs,
+		AppNamespace:              AppNamespace,
+	}
+
+	// ReporterCRDsToDump lists the resource kinds the reporter dumps.
+	ReporterCRDsToDump = []k8sreporter.CRData{
+		{Cr: &corev1.PodList{}},
+	}
+
+	// NhcGVR identifies NodeHealthCheck resources: group remediation.medik8s.io, version v1alpha1.
+	NhcGVR = schema.GroupVersionResource{
+		Group:    "remediation.medik8s.io",
+		Version:  "v1alpha1",
+		Resource: "nodehealthchecks",
+	}
+
+	// NodeReadyTimeout is how long to wait for a node's Ready condition to change.
+	NodeReadyTimeout = 2 * time.Minute
+
+	// NHCObserveTimeout is how long to wait for NHC to mark a node unhealthy.
+	NHCObserveTimeout = 3 * time.Minute
+
+	// SNRFenceTimeout is how long to wait for SNR to fence the node.
+	SNRFenceTimeout = 5 * time.Minute
+
+	// RescheduleTimeout is how long to wait for the pod to be rescheduled.
+	RescheduleTimeout = 5 * time.Minute
+
+	// DeploymentTimeout is how long to wait for a deployment to become ready.
+	DeploymentTimeout = 5 * time.Minute
+
+	// DeletionTimeout is how long to wait for a deletion to take effect.
+	DeletionTimeout = 5 * time.Minute
+
+	// NodeRecoveryTimeout is how long to wait for a node to become Ready after power-on.
+	NodeRecoveryTimeout = 25 * time.Minute
+
+	// PollingInterval is the default polling interval for Eventually blocks.
+	PollingInterval = 10 * time.Second
+
+	// BMCTimeout is the timeout for a Redfish operation.
+	BMCTimeout = 6 * time.Minute
 )
@@ -0,0 +1,34 @@
+package nhc
+
+import (
+	"runtime"
+	"testing"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+	"github.com/rh-ecosystem-edge/eco-goinfra/pkg/reportxml"
+	"github.com/rh-ecosystem-edge/eco-gotests/tests/internal/reporter"
+	. "github.com/rh-ecosystem-edge/eco-gotests/tests/rhwa/internal/rhwainittools"
+	"github.com/rh-ecosystem-edge/eco-gotests/tests/rhwa/nhc-operator/internal/nhcparams"
+	_ "github.com/rh-ecosystem-edge/eco-gotests/tests/rhwa/nhc-operator/tests"
+)
+
+var _, currentFile, _, _ = runtime.Caller(0) // path of this source file; passed to the report helpers below
+
+func TestNHC(t *testing.T) {
+	_, reporterConfig := GinkgoConfiguration() // suite config is discarded; only the reporter config is adjusted
+	reporterConfig.JUnitReport = RHWAConfig.GetJunitReportPath(currentFile)
+
+	RegisterFailHandler(Fail) // route Gomega assertion failures to Ginkgo's Fail
+	RunSpecs(t, "NHC", Label(nhcparams.Labels...), reporterConfig)
+}
+
+var _ = JustAfterEach(func() { // registered at package load; hands each spec's report to the reporter
+	reporter.ReportIfFailed(
+		CurrentSpecReport(), currentFile, nhcparams.ReporterNamespacesToDump, nhcparams.ReporterCRDsToDump)
+})
+
+var _ = ReportAfterSuite("", func(report Report) { // passes the suite report to reportxml.Create
+	reportxml.Create(
+		report, RHWAConfig.GetReportPath(), RHWAConfig.TCPrefix)
+})