Skip to content

Commit ab90453

Browse files
committed
Add node-agent pod check to DataMover backup path
When SnapshotMoveData is enabled but no node-agent pods are running, DataUpload CRs sit unprocessed until timeout. This adds a pre-flight check in the PVC backup item action that verifies running node-agent pods exist before creating DataUpload CRs, mirroring the check FSB already performs. Cleans up the VolumeSnapshot on failure to prevent orphaned resources. Signed-off-by: Joseph <jvaikath@redhat.com>
1 parent 37abfb4 commit ab90453

4 files changed

Lines changed: 122 additions & 7 deletions

File tree

pkg/backup/actions/csi/pvc_action.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ import (
4949
veleroclient "github.com/vmware-tanzu/velero/pkg/client"
5050
"github.com/vmware-tanzu/velero/pkg/kuberesource"
5151
"github.com/vmware-tanzu/velero/pkg/label"
52+
"github.com/vmware-tanzu/velero/pkg/nodeagent"
5253
plugincommon "github.com/vmware-tanzu/velero/pkg/plugin/framework/common"
5354
"github.com/vmware-tanzu/velero/pkg/plugin/utils/volumehelper"
5455
"github.com/vmware-tanzu/velero/pkg/plugin/velero"
@@ -384,6 +385,12 @@ func (p *pvcBackupItemAction) Execute(
384385

385386
dataUploadLog.Info("Starting data upload of backup")
386387

388+
if err := nodeagent.HasRunningPods(context.Background(), backup.Namespace, p.crClient); err != nil {
389+
dataUploadLog.WithError(err).Error("failed to check for running node-agent pods")
390+
csi.CleanupVolumeSnapshot(vs, p.crClient, p.log)
391+
return nil, nil, "", nil, errors.Wrap(err, "snapshot data movement requires a running node-agent daemonset; ensure node-agent is deployed and running")
392+
}
393+
387394
dataUpload, err := createDataUpload(
388395
context.Background(),
389396
backup,

pkg/backup/actions/csi/pvc_action_test.go

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ func TestExecute(t *testing.T) {
9494
expectedDataUpload *velerov2alpha1.DataUpload
9595
expectedPVC *corev1api.PersistentVolumeClaim
9696
resourcePolicy *corev1api.ConfigMap
97+
extraObjects []runtime.Object
9798
failVSCreate bool
9899
skipVSReadyUpdate bool // New flag to control VS readiness
99100
}{
@@ -122,12 +123,24 @@ func TestExecute(t *testing.T) {
122123
expectErr: true, // Expect an error, but the exact message can vary
123124
},
124125
{
125-
name: "Test SnapshotMoveData",
126+
name: "Fail SnapshotMoveData when no node-agent pods",
126127
backup: builder.ForBackup("velero", "test").SnapshotMoveData(true).CSISnapshotTimeout(1 * time.Minute).Result(),
127128
pvc: builder.ForPersistentVolumeClaim("velero", "testPVC").VolumeName("testPV").StorageClass("testSC").Phase(corev1api.ClaimBound).Result(),
128129
pv: builder.ForPersistentVolume("testPV").CSI("hostpath", "testVolume").Result(),
129130
sc: builder.ForStorageClass("testSC").Provisioner("hostpath").Result(),
130131
vsClass: builder.ForVolumeSnapshotClass("testVSClass").Driver("hostpath").ObjectMeta(builder.WithLabels(velerov1api.VolumeSnapshotClassSelectorLabel, "")).Result(),
132+
expectedErr: errors.New("snapshot data movement requires a running node-agent daemonset; ensure node-agent is deployed and running: no running node-agent pods found"),
133+
},
134+
{
135+
name: "Test SnapshotMoveData",
136+
backup: builder.ForBackup("velero", "test").SnapshotMoveData(true).CSISnapshotTimeout(1 * time.Minute).Result(),
137+
pvc: builder.ForPersistentVolumeClaim("velero", "testPVC").VolumeName("testPV").StorageClass("testSC").Phase(corev1api.ClaimBound).Result(),
138+
pv: builder.ForPersistentVolume("testPV").CSI("hostpath", "testVolume").Result(),
139+
sc: builder.ForStorageClass("testSC").Provisioner("hostpath").Result(),
140+
vsClass: builder.ForVolumeSnapshotClass("testVSClass").Driver("hostpath").ObjectMeta(builder.WithLabels(velerov1api.VolumeSnapshotClassSelectorLabel, "")).Result(),
141+
extraObjects: []runtime.Object{
142+
builder.ForPod("velero", "node-agent-pod").Labels(map[string]string{"role": "node-agent"}).Phase(corev1api.PodRunning).NodeName("test-node").Result(),
143+
},
131144
operationID: ".",
132145
expectedDataUpload: &velerov2alpha1.DataUpload{
133146
TypeMeta: metav1.TypeMeta{
@@ -167,12 +180,15 @@ func TestExecute(t *testing.T) {
167180
},
168181
},
169182
{
170-
name: "Verify PVC is modified as expected",
171-
backup: builder.ForBackup("velero", "test").SnapshotMoveData(true).CSISnapshotTimeout(1 * time.Minute).Result(),
172-
pvc: builder.ForPersistentVolumeClaim("velero", "testPVC").VolumeName("testPV").StorageClass("testSC").Phase(corev1api.ClaimBound).Result(),
173-
pv: builder.ForPersistentVolume("testPV").CSI("hostpath", "testVolume").Result(),
174-
sc: builder.ForStorageClass("testSC").Provisioner("hostpath").Result(),
175-
vsClass: builder.ForVolumeSnapshotClass("tescVSClass").Driver("hostpath").ObjectMeta(builder.WithLabels(velerov1api.VolumeSnapshotClassSelectorLabel, "")).Result(),
183+
name: "Verify PVC is modified as expected",
184+
backup: builder.ForBackup("velero", "test").SnapshotMoveData(true).CSISnapshotTimeout(1 * time.Minute).Result(),
185+
pvc: builder.ForPersistentVolumeClaim("velero", "testPVC").VolumeName("testPV").StorageClass("testSC").Phase(corev1api.ClaimBound).Result(),
186+
pv: builder.ForPersistentVolume("testPV").CSI("hostpath", "testVolume").Result(),
187+
sc: builder.ForStorageClass("testSC").Provisioner("hostpath").Result(),
188+
vsClass: builder.ForVolumeSnapshotClass("tescVSClass").Driver("hostpath").ObjectMeta(builder.WithLabels(velerov1api.VolumeSnapshotClassSelectorLabel, "")).Result(),
189+
extraObjects: []runtime.Object{
190+
builder.ForPod("velero", "node-agent-pod").Labels(map[string]string{"role": "node-agent"}).Phase(corev1api.PodRunning).NodeName("test-node").Result(),
191+
},
176192
operationID: ".",
177193
expectedPVC: builder.ForPersistentVolumeClaim("velero", "testPVC").
178194
ObjectMeta(builder.WithAnnotations(velerov1api.MustIncludeAdditionalItemAnnotation, "true", velerov1api.DataUploadNameAnnotation, "velero/"),
@@ -210,6 +226,7 @@ func TestExecute(t *testing.T) {
210226
if tc.resourcePolicy != nil {
211227
objects = append(objects, tc.resourcePolicy)
212228
}
229+
objects = append(objects, tc.extraObjects...)
213230

214231
var crClient crclient.Client
215232
if tc.failVSCreate {

pkg/nodeagent/node_agent.go

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,31 @@ func KbClientIsRunningInNode(ctx context.Context, namespace string, nodeName str
8080
return isRunningInNode(ctx, namespace, nodeName, nil, kubeClient)
8181
}
8282

83+
// HasRunningPods checks if any node agent pod is running in the namespace through controller client. If not, return the error found.
84+
func HasRunningPods(ctx context.Context, namespace string, crClient ctrlclient.Client) error {
85+
pods := new(corev1api.PodList)
86+
parsedSelector, err := labels.Parse(fmt.Sprintf("role=%s", nodeAgentRole))
87+
if err != nil {
88+
return errors.Wrap(err, "fail to parse selector")
89+
}
90+
91+
err = crClient.List(ctx, pods, &ctrlclient.ListOptions{
92+
LabelSelector: parsedSelector,
93+
Namespace: namespace,
94+
})
95+
if err != nil {
96+
return errors.Wrap(err, "failed to list node-agent pods")
97+
}
98+
99+
for i := range pods.Items {
100+
if kube.IsPodRunning(&pods.Items[i]) == nil {
101+
return nil
102+
}
103+
}
104+
105+
return errors.New("no running node-agent pods found")
106+
}
107+
83108
// IsRunningInNode checks if the node agent pod is running properly in a specified node through controller client. If not, return the error found
84109
func IsRunningInNode(ctx context.Context, namespace string, nodeName string, crClient ctrlclient.Client) error {
85110
return isRunningInNode(ctx, namespace, nodeName, crClient, nil)

pkg/nodeagent/node_agent_test.go

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,72 @@ func TestIsRunningInNode(t *testing.T) {
213213
}
214214
}
215215

216+
func TestHasRunningPods(t *testing.T) {
217+
scheme := runtime.NewScheme()
218+
corev1api.AddToScheme(scheme)
219+
220+
nonNodeAgentPod := builder.ForPod("fake-ns", "fake-pod").Result()
221+
nodeAgentPodNotRunning := builder.ForPod("fake-ns", "fake-pod-pending").Labels(map[string]string{"role": "node-agent"}).Result()
222+
nodeAgentPodRunning := builder.ForPod("fake-ns", "fake-pod-running").
223+
Labels(map[string]string{"role": "node-agent"}).
224+
Phase(corev1api.PodRunning).
225+
NodeName("fake-node").
226+
Result()
227+
228+
tests := []struct {
229+
name string
230+
kubeClientObj []runtime.Object
231+
namespace string
232+
expectErr string
233+
}{
234+
{
235+
name: "no pods at all",
236+
namespace: "fake-ns",
237+
expectErr: "no running node-agent pods found",
238+
},
239+
{
240+
name: "only non-node-agent pods",
241+
namespace: "fake-ns",
242+
kubeClientObj: []runtime.Object{
243+
nonNodeAgentPod,
244+
},
245+
expectErr: "no running node-agent pods found",
246+
},
247+
{
248+
name: "node-agent pods exist but none running",
249+
namespace: "fake-ns",
250+
kubeClientObj: []runtime.Object{
251+
nodeAgentPodNotRunning,
252+
},
253+
expectErr: "no running node-agent pods found",
254+
},
255+
{
256+
name: "at least one running node-agent pod",
257+
namespace: "fake-ns",
258+
kubeClientObj: []runtime.Object{
259+
nodeAgentPodNotRunning,
260+
nodeAgentPodRunning,
261+
},
262+
},
263+
}
264+
265+
for _, test := range tests {
266+
t.Run(test.name, func(t *testing.T) {
267+
fakeClientBuilder := clientFake.NewClientBuilder()
268+
fakeClientBuilder = fakeClientBuilder.WithScheme(scheme)
269+
270+
fakeClient := fakeClientBuilder.WithRuntimeObjects(test.kubeClientObj...).Build()
271+
272+
err := HasRunningPods(t.Context(), test.namespace, fakeClient)
273+
if test.expectErr == "" {
274+
assert.NoError(t, err)
275+
} else {
276+
assert.EqualError(t, err, test.expectErr)
277+
}
278+
})
279+
}
280+
}
281+
216282
func TestGetPodSpec(t *testing.T) {
217283
podSpec := corev1api.PodSpec{
218284
NodeName: "fake-node",

0 commit comments

Comments
 (0)