Skip to content

Commit 8e5c0b2

Browse files
committed
rhwa nhc: add NHC & SNR sudden-loss system test
Add a test suite that runs through the following scenario. A healthy OpenShift cluster with an MNO topology experiences the sudden shutdown of a worker node that is running a stateful application (a simple Pod writing to a PV). The application is evicted and rescheduled to another healthy node in order to provide business continuity. Assisted-by: Claude
1 parent 8b70abc commit 8e5c0b2

10 files changed

Lines changed: 1000 additions & 3 deletions

File tree

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,11 @@
11
---
22
# RHWA default configurations.
3-
...
3+
4+
nhc_target_worker: "" # e.g. "openshift-worker-0.ocp.example.org"
5+
nhc_failover_workers: [] # e.g. ["openshift-worker-1.ocp.example.org"]
6+
nhc_storage_class: "" # e.g. "ocs-external-storagecluster-ceph-rbd"
7+
nhc_app_image: "" # e.g. "registry.ocp.example.org:5000/test/ubi-minimal:latest"
8+
nhc_target_worker_bmc:
9+
address: "" # e.g. "10.1.0.2"
10+
username: "" # e.g. "example-user"
11+
password: "" # e.g. "example-pass"

tests/rhwa/internal/rhwaconfig/rhwaconfig.go

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
package rhwaconfig
22

33
import (
4+
"encoding/json"
45
"log"
56
"os"
67
"path/filepath"
78
"runtime"
9+
"strings"
810

911
"github.com/kelseyhightower/envconfig"
1012
"github.com/rh-ecosystem-edge/eco-gotests/tests/internal/config"
@@ -16,9 +18,33 @@ const (
1618
PathToDefaultRhwaParamsFile = "./default.yaml"
1719
)
1820

21+
// BMCDetails holds BMC connection details for a single node.
type BMCDetails struct {
	// Address is the BMC address, e.g. "10.1.0.2".
	Address string `yaml:"address" json:"address"`
	// Username is the BMC login user.
	Username string `yaml:"username" json:"username"`
	// Password is the BMC login password.
	Password string `yaml:"password" json:"password"`
}

// Decode implements the envconfig.Decoder interface to parse a JSON string
// from an environment variable into BMCDetails.
//
// An empty or whitespace-only value is a no-op, so values already present in
// the receiver are kept. Keys present in the JSON overlay the matching fields;
// fields whose keys are absent are left untouched. On error the receiver is
// left exactly as it was.
func (b *BMCDetails) Decode(value string) error {
	if strings.TrimSpace(value) == "" {
		return nil
	}

	if b == nil {
		// A nil receiver cannot hold a result; let json.Unmarshal report the
		// invalid target as an error instead of dereferencing nil below.
		return json.Unmarshal([]byte(value), b)
	}

	// json.Unmarshal keeps filling the remaining fields after a type mismatch
	// and only then returns the error. Decoding into a copy and committing on
	// success prevents a failed decode from leaving b half-overwritten.
	decoded := *b
	if err := json.Unmarshal([]byte(value), &decoded); err != nil {
		return err
	}

	*b = decoded

	return nil
}
37+
1938
// RHWAConfig type keeps rhwa configuration.
2039
type RHWAConfig struct {
2140
*config.GeneralConfig
41+
42+
// NHC/SNR sudden-loss test configuration.
43+
TargetWorker string `yaml:"nhc_target_worker" envconfig:"ECO_RHWA_NHC_TARGET_WORKER"`
44+
FailoverWorkers []string `yaml:"nhc_failover_workers" envconfig:"ECO_RHWA_NHC_FAILOVER_WORKERS"`
45+
StorageClass string `yaml:"nhc_storage_class" envconfig:"ECO_RHWA_NHC_STORAGE_CLASS"`
46+
AppImage string `yaml:"nhc_app_image" envconfig:"ECO_RHWA_NHC_APP_IMAGE"`
47+
TargetWorkerBMC BMCDetails `yaml:"nhc_target_worker_bmc" envconfig:"ECO_RHWA_NHC_TARGET_WORKER_BMC"`
2248
}
2349

2450
// NewRHWAConfig returns instance of RHWA config type.

tests/rhwa/nhc-operator/internal/nhcparams/const.go

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,37 @@ package nhcparams
33
const (
	// Label represents nhc operator label that can be used for test cases selection.
	Label = "nhc"

	// LabelSuddenLoss is the label for the sudden-loss test scenario.
	LabelSuddenLoss = "sudden-loss"

	// NHCResourceName is the name of the NodeHealthCheck CR.
	NHCResourceName = "nhc-worker-self"

	// SNRDeploymentName is the name of the SNR operator deployment.
	SNRDeploymentName = "self-node-remediation-controller-manager"

	// AppNamespace is the namespace for the stateful test application.
	AppNamespace = "stateful-app-test"

	// AppName is the name of the stateful test deployment.
	AppName = "stateful-app"

	// AppLabelKey is the label key for the stateful test application.
	AppLabelKey = "app"

	// AppLabelValue is the label value for the stateful test application.
	AppLabelValue = "stateful-app"

	// AppWorkerLabel is the node label used to select worker nodes for the test app.
	AppWorkerLabel = "node-role.kubernetes.io/appworker"

	// PVCName is the name of the PersistentVolumeClaim for the test app.
	PVCName = "app-data"

	// PVCSize is the size of the PVC for the test app.
	PVCSize = "1Gi"

	// OutOfServiceTaintKey is the taint key applied by SNR to fence a node.
	OutOfServiceTaintKey = "node.kubernetes.io/out-of-service"
)
Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,52 @@
11
package nhcparams
22

3-
import "github.com/rh-ecosystem-edge/eco-gotests/tests/rhwa/internal/rhwaparams"
3+
import (
4+
"time"
5+
6+
"github.com/openshift-kni/k8sreporter"
7+
"github.com/rh-ecosystem-edge/eco-gotests/tests/rhwa/internal/rhwaparams"
8+
corev1 "k8s.io/api/core/v1"
9+
)
410

511
var (
612
// Labels represents the range of labels that can be used for test cases selection.
713
Labels = []string{rhwaparams.Label, Label}
14+
15+
// OperatorDeploymentName represents NHC deployment name.
16+
OperatorDeploymentName = "node-healthcheck-controller-manager"
17+
18+
// OperatorControllerPodLabel is how the controller pod is labeled.
19+
OperatorControllerPodLabel = "node-healthcheck-operator"
20+
21+
// ReporterNamespacesToDump tells to the reporter from where to collect logs.
22+
ReporterNamespacesToDump = map[string]string{
23+
rhwaparams.RhwaOperatorNs: rhwaparams.RhwaOperatorNs,
24+
AppNamespace: AppNamespace,
25+
}
26+
27+
// ReporterCRDsToDump tells to the reporter what CRs to dump.
28+
ReporterCRDsToDump = []k8sreporter.CRData{
29+
{Cr: &corev1.PodList{}},
30+
}
31+
32+
// NodeReadyTimeout is how long to wait for a node Ready condition change.
33+
NodeReadyTimeout = 2 * time.Minute
34+
35+
// NHCObserveTimeout is how long to wait for NHC to mark a node unhealthy.
36+
NHCObserveTimeout = 3 * time.Minute
37+
38+
// SNRFenceTimeout is how long to wait for SNR to fence the node.
39+
SNRFenceTimeout = 5 * time.Minute
40+
41+
// RescheduleTimeout is how long to wait for the pod to reschedule.
42+
RescheduleTimeout = 5 * time.Minute
43+
44+
// NodeRecoveryTimeout is how long to wait for a node to become Ready after power-on.
45+
NodeRecoveryTimeout = 25 * time.Minute
46+
47+
// PollingInterval is the default polling interval for Eventually blocks.
48+
PollingInterval = 10 * time.Second
49+
50+
// BMCTimeout is the Redfish operation timeout.
51+
BMCTimeout = 6 * time.Minute
852
)
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
package nhc
2+
3+
import (
4+
"runtime"
5+
"testing"
6+
7+
. "github.com/onsi/ginkgo/v2"
8+
. "github.com/onsi/gomega"
9+
"github.com/rh-ecosystem-edge/eco-goinfra/pkg/reportxml"
10+
"github.com/rh-ecosystem-edge/eco-gotests/tests/internal/reporter"
11+
. "github.com/rh-ecosystem-edge/eco-gotests/tests/rhwa/internal/rhwainittools"
12+
"github.com/rh-ecosystem-edge/eco-gotests/tests/rhwa/nhc-operator/internal/nhcparams"
13+
_ "github.com/rh-ecosystem-edge/eco-gotests/tests/rhwa/nhc-operator/tests"
14+
)
15+
16+
// currentFile is the file path reported by runtime.Caller(0) for this
// package-level initializer, i.e. the path of this source file.
var _, currentFile, _, _ = runtime.Caller(0)
17+
18+
func TestNHC(t *testing.T) {
19+
_, reporterConfig := GinkgoConfiguration()
20+
reporterConfig.JUnitReport = RHWAConfig.GetJunitReportPath(currentFile)
21+
22+
RegisterFailHandler(Fail)
23+
RunSpecs(t, "NHC", Label(nhcparams.Labels...), reporterConfig)
24+
}
25+
26+
var _ = JustAfterEach(func() {
27+
reporter.ReportIfFailed(
28+
CurrentSpecReport(), currentFile, nhcparams.ReporterNamespacesToDump, nhcparams.ReporterCRDsToDump)
29+
})
30+
31+
var _ = ReportAfterSuite("", func(report Report) {
32+
reportxml.Create(
33+
report, RHWAConfig.GetReportPath(), RHWAConfig.TCPrefix)
34+
})
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
package tests
2+
3+
import (
4+
"fmt"
5+
6+
. "github.com/onsi/ginkgo/v2"
7+
. "github.com/onsi/gomega"
8+
9+
"github.com/rh-ecosystem-edge/eco-goinfra/pkg/deployment"
10+
"github.com/rh-ecosystem-edge/eco-goinfra/pkg/pod"
11+
12+
. "github.com/rh-ecosystem-edge/eco-gotests/tests/rhwa/internal/rhwainittools"
13+
"github.com/rh-ecosystem-edge/eco-gotests/tests/rhwa/internal/rhwaparams"
14+
"github.com/rh-ecosystem-edge/eco-gotests/tests/rhwa/nhc-operator/internal/nhcparams"
15+
16+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
17+
)
18+
19+
var _ = Describe(
20+
"NHC Post Deployment tests",
21+
Ordered,
22+
ContinueOnFailure,
23+
Label(nhcparams.Label), func() {
24+
BeforeAll(func() {
25+
By("Get NHC deployment object")
26+
nhcDeployment, err := deployment.Pull(
27+
APIClient, nhcparams.OperatorDeploymentName, rhwaparams.RhwaOperatorNs)
28+
Expect(err).ToNot(HaveOccurred(), "Failed to get NHC deployment")
29+
30+
By("Verify NHC deployment is Ready")
31+
Expect(nhcDeployment.IsReady(rhwaparams.DefaultTimeout)).To(BeTrue(), "NHC deployment is not Ready")
32+
})
33+
34+
It("Verify Node Health Check Operator pod is running", func() {
35+
listOptions := metav1.ListOptions{
36+
LabelSelector: fmt.Sprintf("app.kubernetes.io/name=%s", nhcparams.OperatorControllerPodLabel),
37+
}
38+
_, err := pod.WaitForAllPodsInNamespaceRunning(
39+
APIClient,
40+
rhwaparams.RhwaOperatorNs,
41+
rhwaparams.DefaultTimeout,
42+
listOptions,
43+
)
44+
Expect(err).ToNot(HaveOccurred(), "Pod is not ready")
45+
})
46+
})

0 commit comments

Comments
 (0)