Skip to content

Commit fbe3998

Browse files
committed
Check node-agent DaemonSet status instead of listing pods
Replace HasRunningPods (which listed all node-agent pods and iterated to find a running one) with IsReady, which does a single Get on the node-agent DaemonSet and checks status.numberReady > 0. This is a cheaper check and more semantically appropriate since we only need to know the DaemonSet is healthy, not inspect individual pods.

Signed-off-by: Joseph <jvaikath@redhat.com>
1 parent 8d559de commit fbe3998

4 files changed

Lines changed: 87 additions & 80 deletions

File tree

pkg/controller/backup_controller.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -496,14 +496,14 @@ func (b *backupReconciler) prepareBackupRequest(ctx context.Context, backup *vel
496496
}
497497
}
498498

499-
// validate that node-agent pods are running when snapshot data movement with the
500-
// built-in data mover is requested. Without this, the DataUpload CR will be created
501-
// but never processed (the DataUpload controller runs inside node-agent), causing the
502-
// backup to hang until itemOperationTimeout expires.
499+
// validate that the node-agent daemonset is ready when snapshot data movement with
500+
// the built-in data mover is requested. Without this, the DataUpload CR will be
501+
// created but never processed (the DataUpload controller runs inside node-agent),
502+
// causing the backup to hang until itemOperationTimeout expires.
503503
if boolptr.IsSetToTrue(request.Spec.SnapshotMoveData) && datamover.IsBuiltInUploader(request.Spec.DataMover) {
504-
if err := nodeagent.HasRunningPods(ctx, request.Namespace, b.kbClient); err != nil {
504+
if err := nodeagent.IsReady(ctx, request.Namespace, b.kbClient); err != nil {
505505
request.Status.ValidationErrors = append(request.Status.ValidationErrors,
506-
"no running node-agent pods found; the built-in data mover requires node-agent to be deployed")
506+
fmt.Sprintf("node-agent is not ready: %v; the built-in data mover requires node-agent to be deployed and running", err))
507507
}
508508
}
509509

pkg/controller/backup_controller_test.go

Lines changed: 50 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ import (
3333
"github.com/stretchr/testify/assert"
3434
"github.com/stretchr/testify/mock"
3535
"github.com/stretchr/testify/require"
36-
corev1api "k8s.io/api/core/v1"
36+
appsv1api "k8s.io/api/apps/v1"
3737
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3838
"k8s.io/apimachinery/pkg/runtime"
3939
"k8s.io/apimachinery/pkg/types"
@@ -497,27 +497,43 @@ func TestPrepareBackupRequest_NodeAgentValidation(t *testing.T) {
497497
now, err := time.Parse(time.RFC1123Z, time.RFC1123Z)
498498
require.NoError(t, err)
499499

500-
nodeAgentPod := builder.ForPod(velerov1api.DefaultNamespace, "node-agent-abc").
501-
Labels(map[string]string{"role": "node-agent"}).
502-
NodeName("worker-1").
503-
Phase(corev1api.PodRunning).
504-
Result()
500+
nodeAgentDS := &appsv1api.DaemonSet{
501+
ObjectMeta: metav1.ObjectMeta{
502+
Namespace: velerov1api.DefaultNamespace,
503+
Name: "node-agent",
504+
},
505+
Status: appsv1api.DaemonSetStatus{NumberReady: 3},
506+
}
507+
508+
nodeAgentDSNotReady := &appsv1api.DaemonSet{
509+
ObjectMeta: metav1.ObjectMeta{
510+
Namespace: velerov1api.DefaultNamespace,
511+
Name: "node-agent",
512+
},
513+
Status: appsv1api.DaemonSetStatus{NumberReady: 0},
514+
}
505515

506516
tests := []struct {
507-
name string
508-
backup *velerov1api.Backup
509-
objs []runtime.Object
510-
expectedValidationError string
517+
name string
518+
backup *velerov1api.Backup
519+
objs []runtime.Object
520+
expectNodeAgent bool
511521
}{
512522
{
513-
name: "snapshotMoveData with no node-agent pods",
514-
backup: defaultBackup().SnapshotMoveData(true).Result(),
515-
expectedValidationError: "no running node-agent pods found; the built-in data mover requires node-agent to be deployed",
523+
name: "snapshotMoveData with no node-agent daemonset",
524+
backup: defaultBackup().SnapshotMoveData(true).Result(),
525+
expectNodeAgent: true,
516526
},
517527
{
518-
name: "snapshotMoveData with running node-agent pod",
528+
name: "snapshotMoveData with node-agent daemonset not ready",
529+
backup: defaultBackup().SnapshotMoveData(true).Result(),
530+
objs: []runtime.Object{nodeAgentDSNotReady},
531+
expectNodeAgent: true,
532+
},
533+
{
534+
name: "snapshotMoveData with ready node-agent daemonset",
519535
backup: defaultBackup().SnapshotMoveData(true).Result(),
520-
objs: []runtime.Object{nodeAgentPod},
536+
objs: []runtime.Object{nodeAgentDS},
521537
},
522538
{
523539
name: "snapshotMoveData with custom data mover skips check",
@@ -550,8 +566,15 @@ func TestPrepareBackupRequest_NodeAgentValidation(t *testing.T) {
550566
res := c.prepareBackupRequest(ctx, test.backup, logger)
551567
defer res.WorkerPool.Stop()
552568

553-
if test.expectedValidationError != "" {
554-
assert.Contains(t, res.Status.ValidationErrors, test.expectedValidationError)
569+
if test.expectNodeAgent {
570+
hasNodeAgentError := false
571+
for _, e := range res.Status.ValidationErrors {
572+
if strings.Contains(e, "node-agent") {
573+
hasNodeAgentError = true
574+
break
575+
}
576+
}
577+
assert.True(t, hasNodeAgentError, "expected a node-agent validation error")
555578
} else {
556579
for _, e := range res.Status.ValidationErrors {
557580
assert.NotContains(t, e, "node-agent")
@@ -740,14 +763,16 @@ func TestProcessBackupCompletions(t *testing.T) {
740763
now = now.Local()
741764
timestamp := metav1.NewTime(now)
742765

743-
// Node-agent pod is needed for all test cases so that the node-agent
766+
// Node-agent daemonset is needed for all test cases so that the node-agent
744767
// validation in prepareBackupRequest does not reject backups that use
745768
// snapshot data movement.
746-
nodeAgentPod := builder.ForPod(velerov1api.DefaultNamespace, "node-agent-abc").
747-
Labels(map[string]string{"role": "node-agent"}).
748-
NodeName("worker-1").
749-
Phase(corev1api.PodRunning).
750-
Result()
769+
nodeAgentDS := &appsv1api.DaemonSet{
770+
ObjectMeta: metav1.ObjectMeta{
771+
Namespace: velerov1api.DefaultNamespace,
772+
Name: "node-agent",
773+
},
774+
Status: appsv1api.DaemonSetStatus{NumberReady: 3},
775+
}
751776

752777
tests := []struct {
753778
name string
@@ -1593,15 +1618,15 @@ func TestProcessBackupCompletions(t *testing.T) {
15931618
builder.ForVolumeSnapshotContent("testVSC").ObjectMeta(builder.WithLabels(velerov1api.BackupNameLabel, "backup-1")).VolumeSnapshotClassName("testClass").Status(&snapshotv1api.VolumeSnapshotContentStatus{
15941619
SnapshotHandle: &snapshotHandle,
15951620
}).Result(),
1596-
nodeAgentPod,
1621+
nodeAgentDS,
15971622
)
15981623
} else {
15991624
fakeClient = velerotest.NewFakeControllerRuntimeClient(t,
16001625
builder.ForVolumeSnapshotClass("testClass").Driver("testDriver").Result(),
16011626
builder.ForVolumeSnapshotContent("testVSC").ObjectMeta(builder.WithLabels(velerov1api.BackupNameLabel, "backup-1")).VolumeSnapshotClassName("testClass").Status(&snapshotv1api.VolumeSnapshotContentStatus{
16021627
SnapshotHandle: &snapshotHandle,
16031628
}).Result(),
1604-
nodeAgentPod,
1629+
nodeAgentDS,
16051630
)
16061631
}
16071632

pkg/nodeagent/node_agent.go

Lines changed: 9 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import (
2222
"fmt"
2323

2424
"github.com/pkg/errors"
25+
appsv1api "k8s.io/api/apps/v1"
2526
corev1api "k8s.io/api/core/v1"
2627
apierrors "k8s.io/apimachinery/pkg/api/errors"
2728
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -80,29 +81,18 @@ func KbClientIsRunningInNode(ctx context.Context, namespace string, nodeName str
8081
return isRunningInNode(ctx, namespace, nodeName, nil, kubeClient)
8182
}
8283

83-
// HasRunningPods checks if any node agent pod is running in the namespace through controller client. If not, return the error found.
84-
func HasRunningPods(ctx context.Context, namespace string, crClient ctrlclient.Client) error {
85-
pods := new(corev1api.PodList)
86-
parsedSelector, err := labels.Parse(fmt.Sprintf("role=%s", nodeAgentRole))
87-
if err != nil {
88-
return errors.Wrap(err, "fail to parse selector")
84+
// IsReady checks whether the node-agent daemonset has at least one ready pod by inspecting the DaemonSet status.
85+
func IsReady(ctx context.Context, namespace string, crClient ctrlclient.Client) error {
86+
ds := new(appsv1api.DaemonSet)
87+
if err := crClient.Get(ctx, ctrlclient.ObjectKey{Namespace: namespace, Name: daemonSet}, ds); err != nil {
88+
return errors.Wrap(err, "failed to get node-agent daemonset")
8989
}
9090

91-
err = crClient.List(ctx, pods, &ctrlclient.ListOptions{
92-
LabelSelector: parsedSelector,
93-
Namespace: namespace,
94-
})
95-
if err != nil {
96-
return errors.Wrap(err, "failed to list node-agent pods")
97-
}
98-
99-
for i := range pods.Items {
100-
if kube.IsPodRunning(&pods.Items[i]) == nil {
101-
return nil
102-
}
91+
if ds.Status.NumberReady == 0 {
92+
return errors.New("node-agent daemonset has no ready pods")
10393
}
10494

105-
return errors.New("no running node-agent pods found")
95+
return nil
10696
}
10797

10898
// IsRunningInNode checks if the node agent pod is running properly in a specified node through controller client. If not, return the error found

pkg/nodeagent/node_agent_test.go

Lines changed: 22 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -213,17 +213,18 @@ func TestIsRunningInNode(t *testing.T) {
213213
}
214214
}
215215

216-
func TestHasRunningPods(t *testing.T) {
216+
func TestIsReady(t *testing.T) {
217217
scheme := runtime.NewScheme()
218-
corev1api.AddToScheme(scheme)
218+
appsv1api.AddToScheme(scheme)
219219

220-
nonNodeAgentPod := builder.ForPod("fake-ns", "fake-pod").Result()
221-
nodeAgentPodNotRunning := builder.ForPod("fake-ns", "fake-pod-pending").Labels(map[string]string{"role": "node-agent"}).Result()
222-
nodeAgentPodRunning := builder.ForPod("fake-ns", "fake-pod-running").
223-
Labels(map[string]string{"role": "node-agent"}).
224-
Phase(corev1api.PodRunning).
225-
NodeName("fake-node").
226-
Result()
220+
dsNotReady := &appsv1api.DaemonSet{
221+
ObjectMeta: metav1.ObjectMeta{Namespace: "fake-ns", Name: "node-agent"},
222+
Status: appsv1api.DaemonSetStatus{NumberReady: 0},
223+
}
224+
dsReady := &appsv1api.DaemonSet{
225+
ObjectMeta: metav1.ObjectMeta{Namespace: "fake-ns", Name: "node-agent"},
226+
Status: appsv1api.DaemonSetStatus{NumberReady: 3},
227+
}
227228

228229
tests := []struct {
229230
name string
@@ -232,44 +233,35 @@ func TestHasRunningPods(t *testing.T) {
232233
expectErr string
233234
}{
234235
{
235-
name: "no pods at all",
236-
namespace: "fake-ns",
237-
expectErr: "no running node-agent pods found",
238-
},
239-
{
240-
name: "only non-node-agent pods",
236+
name: "daemonset not found",
241237
namespace: "fake-ns",
242-
kubeClientObj: []runtime.Object{
243-
nonNodeAgentPod,
244-
},
245-
expectErr: "no running node-agent pods found",
238+
expectErr: "failed to get node-agent daemonset: daemonsets.apps \"node-agent\" not found",
246239
},
247240
{
248-
name: "node-agent pods exist but none running",
241+
name: "daemonset exists but no ready pods",
249242
namespace: "fake-ns",
250243
kubeClientObj: []runtime.Object{
251-
nodeAgentPodNotRunning,
244+
dsNotReady,
252245
},
253-
expectErr: "no running node-agent pods found",
246+
expectErr: "node-agent daemonset has no ready pods",
254247
},
255248
{
256-
name: "at least one running node-agent pod",
249+
name: "daemonset has ready pods",
257250
namespace: "fake-ns",
258251
kubeClientObj: []runtime.Object{
259-
nodeAgentPodNotRunning,
260-
nodeAgentPodRunning,
252+
dsReady,
261253
},
262254
},
263255
}
264256

265257
for _, test := range tests {
266258
t.Run(test.name, func(t *testing.T) {
267-
fakeClientBuilder := clientFake.NewClientBuilder()
268-
fakeClientBuilder = fakeClientBuilder.WithScheme(scheme)
269-
270-
fakeClient := fakeClientBuilder.WithRuntimeObjects(test.kubeClientObj...).Build()
259+
fakeClient := clientFake.NewClientBuilder().
260+
WithScheme(scheme).
261+
WithRuntimeObjects(test.kubeClientObj...).
262+
Build()
271263

272-
err := HasRunningPods(t.Context(), test.namespace, fakeClient)
264+
err := IsReady(t.Context(), test.namespace, fakeClient)
273265
if test.expectErr == "" {
274266
assert.NoError(t, err)
275267
} else {

0 commit comments

Comments (0)