Skip to content

Commit a9031eb

Browse files
authored
Merge pull request #8626 from Lyndon-Li/repo-maintainance-for-windows-2
Repo maintenance for Windows
2 parents 5b1738a + 0a4b05c commit a9031eb

7 files changed

Lines changed: 340 additions & 150 deletions

File tree

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Fix issue #8419: support running the repo maintenance job on Windows nodes

pkg/cmd/cli/repomantenance/maintenance.go

Lines changed: 28 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import (
2626
"github.com/vmware-tanzu/velero/pkg/util/logging"
2727

2828
repokey "github.com/vmware-tanzu/velero/pkg/repository/keys"
29+
"github.com/vmware-tanzu/velero/pkg/repository/maintenance"
2930
repomanager "github.com/vmware-tanzu/velero/pkg/repository/manager"
3031
)
3132

@@ -78,17 +79,7 @@ func (o *Options) Run(f velerocli.Factory) {
7879
}()
7980

8081
if pruneError != nil {
81-
logger.WithError(pruneError).Error("An error occurred when running repo prune")
82-
terminationLogFile, err := os.Create("/dev/termination-log")
83-
if err != nil {
84-
logger.WithError(err).Error("Failed to create termination log file")
85-
return
86-
}
87-
defer terminationLogFile.Close()
88-
89-
if _, errWrite := terminationLogFile.WriteString(fmt.Sprintf("An error occurred: %v", err)); errWrite != nil {
90-
logger.WithError(errWrite).Error("Failed to write error to termination log file")
91-
}
82+
os.Stdout.WriteString(fmt.Sprintf("%s%v", maintenance.TerminationLogIndicator, pruneError))
9283
}
9384
}
9485

@@ -163,22 +154,38 @@ func (o *Options) runRepoPrune(f velerocli.Factory, namespace string, logger log
163154
return err
164155
}
165156

166-
manager, err := initRepoManager(namespace, cli, kubeClient, logger)
167-
if err != nil {
168-
return err
157+
var repo *velerov1api.BackupRepository
158+
retry := 10
159+
for {
160+
repo, err = repository.GetBackupRepository(context.Background(), cli, namespace,
161+
repository.BackupRepositoryKey{
162+
VolumeNamespace: o.RepoName,
163+
BackupLocation: o.BackupStorageLocation,
164+
RepositoryType: o.RepoType,
165+
}, true)
166+
if err == nil {
167+
break
168+
}
169+
170+
retry--
171+
if retry == 0 {
172+
break
173+
}
174+
175+
logger.WithError(err).Warn("Failed to retrieve backup repo, need retry")
176+
177+
time.Sleep(time.Second)
169178
}
170179

171-
// backupRepository
172-
repo, err := repository.GetBackupRepository(context.Background(), cli, namespace,
173-
repository.BackupRepositoryKey{
174-
VolumeNamespace: o.RepoName,
175-
BackupLocation: o.BackupStorageLocation,
176-
RepositoryType: o.RepoType,
177-
}, true)
178180
if err != nil {
179181
return errors.Wrap(err, "failed to get backup repository")
180182
}
181183

184+
manager, err := initRepoManager(namespace, cli, kubeClient, logger)
185+
if err != nil {
186+
return err
187+
}
188+
182189
err = manager.PruneRepo(repo)
183190
if err != nil {
184191
return errors.Wrap(err, "failed to prune repo")

pkg/controller/backup_repository_controller_test.go

Lines changed: 20 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -131,10 +131,15 @@ func waitMaintenanceJobCompleteFail(client.Client, context.Context, string, stri
131131
}
132132

133133
func waitMaintenanceJobCompleteFunc(now time.Time, result velerov1api.BackupRepositoryMaintenanceResult, message string) func(client.Client, context.Context, string, string, logrus.FieldLogger) (velerov1api.BackupRepositoryMaintenanceStatus, error) {
134+
completionTimeStamp := &metav1.Time{Time: now.Add(time.Hour)}
135+
if result == velerov1api.BackupRepositoryMaintenanceFailed {
136+
completionTimeStamp = nil
137+
}
138+
134139
return func(client.Client, context.Context, string, string, logrus.FieldLogger) (velerov1api.BackupRepositoryMaintenanceStatus, error) {
135140
return velerov1api.BackupRepositoryMaintenanceStatus{
136141
StartTimestamp: &metav1.Time{Time: now},
137-
CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour)},
142+
CompleteTimestamp: completionTimeStamp,
138143
Result: result,
139144
Message: message,
140145
}, nil
@@ -316,10 +321,9 @@ func TestRunMaintenanceIfDue(t *testing.T) {
316321
Result: velerov1api.BackupRepositoryMaintenanceSucceeded,
317322
},
318323
{
319-
StartTimestamp: &metav1.Time{Time: now},
320-
CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour)},
321-
Result: velerov1api.BackupRepositoryMaintenanceFailed,
322-
Message: "fake-maintenance-message",
324+
StartTimestamp: &metav1.Time{Time: now},
325+
Result: velerov1api.BackupRepositoryMaintenanceFailed,
326+
Message: "fake-maintenance-message",
323327
},
324328
},
325329
},
@@ -893,7 +897,7 @@ func TestUpdateRepoMaintenanceHistory(t *testing.T) {
893897
{
894898
name: "full history",
895899
backupRepo: backupRepoWithFullHistory,
896-
result: velerov1api.BackupRepositoryMaintenanceFailed,
900+
result: velerov1api.BackupRepositoryMaintenanceSucceeded,
897901
expectedHistory: []velerov1api.BackupRepositoryMaintenanceStatus{
898902
{
899903
StartTimestamp: &metav1.Time{Time: standardTime.Add(-time.Hour * 22)},
@@ -915,7 +919,7 @@ func TestUpdateRepoMaintenanceHistory(t *testing.T) {
915919
{
916920
name: "over full history",
917921
backupRepo: backupRepoWithOverFullHistory,
918-
result: velerov1api.BackupRepositoryMaintenanceFailed,
922+
result: velerov1api.BackupRepositoryMaintenanceSucceeded,
919923
expectedHistory: []velerov1api.BackupRepositoryMaintenanceStatus{
920924
{
921925
StartTimestamp: &metav1.Time{Time: standardTime.Add(-time.Hour * 20)},
@@ -1127,7 +1131,7 @@ func TestConsolidateHistory(t *testing.T) {
11271131
{
11281132
StartTimestamp: &metav1.Time{Time: now.Add(time.Hour)},
11291133
CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 2)},
1130-
Result: velerov1api.BackupRepositoryMaintenanceFailed,
1134+
Result: velerov1api.BackupRepositoryMaintenanceSucceeded,
11311135
Message: "fake-maintenance-message-2",
11321136
},
11331137
},
@@ -1149,7 +1153,7 @@ func TestConsolidateHistory(t *testing.T) {
11491153
{
11501154
StartTimestamp: &metav1.Time{Time: now.Add(time.Hour)},
11511155
CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 2)},
1152-
Result: velerov1api.BackupRepositoryMaintenanceFailed,
1156+
Result: velerov1api.BackupRepositoryMaintenanceSucceeded,
11531157
Message: "fake-maintenance-message-2",
11541158
},
11551159
{
@@ -1172,7 +1176,7 @@ func TestConsolidateHistory(t *testing.T) {
11721176
{
11731177
StartTimestamp: &metav1.Time{Time: now.Add(time.Hour)},
11741178
CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 2)},
1175-
Result: velerov1api.BackupRepositoryMaintenanceFailed,
1179+
Result: velerov1api.BackupRepositoryMaintenanceSucceeded,
11761180
Message: "fake-maintenance-message-2",
11771181
},
11781182
},
@@ -1194,7 +1198,7 @@ func TestConsolidateHistory(t *testing.T) {
11941198
{
11951199
StartTimestamp: &metav1.Time{Time: now.Add(time.Hour)},
11961200
CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 2)},
1197-
Result: velerov1api.BackupRepositoryMaintenanceFailed,
1201+
Result: velerov1api.BackupRepositoryMaintenanceSucceeded,
11981202
Message: "fake-maintenance-message-2",
11991203
},
12001204
{
@@ -1223,7 +1227,7 @@ func TestConsolidateHistory(t *testing.T) {
12231227
{
12241228
StartTimestamp: &metav1.Time{Time: now.Add(time.Hour)},
12251229
CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 2)},
1226-
Result: velerov1api.BackupRepositoryMaintenanceFailed,
1230+
Result: velerov1api.BackupRepositoryMaintenanceSucceeded,
12271231
Message: "fake-maintenance-message-2",
12281232
},
12291233
{
@@ -1237,7 +1241,7 @@ func TestConsolidateHistory(t *testing.T) {
12371241
{
12381242
StartTimestamp: &metav1.Time{Time: now.Add(time.Hour)},
12391243
CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 2)},
1240-
Result: velerov1api.BackupRepositoryMaintenanceFailed,
1244+
Result: velerov1api.BackupRepositoryMaintenanceSucceeded,
12411245
Message: "fake-maintenance-message-2",
12421246
},
12431247
{
@@ -1257,7 +1261,7 @@ func TestConsolidateHistory(t *testing.T) {
12571261
{
12581262
StartTimestamp: &metav1.Time{Time: now.Add(time.Hour)},
12591263
CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 2)},
1260-
Result: velerov1api.BackupRepositoryMaintenanceFailed,
1264+
Result: velerov1api.BackupRepositoryMaintenanceSucceeded,
12611265
Message: "fake-maintenance-message-2",
12621266
},
12631267
{
@@ -1339,13 +1343,13 @@ func TestGetLastMaintenanceTimeFromHistory(t *testing.T) {
13391343
history: []velerov1api.BackupRepositoryMaintenanceStatus{
13401344
{
13411345
StartTimestamp: &metav1.Time{Time: now},
1342-
Result: velerov1api.BackupRepositoryMaintenanceSucceeded,
1346+
Result: velerov1api.BackupRepositoryMaintenanceFailed,
13431347
Message: "fake-maintenance-message",
13441348
},
13451349
{
13461350
StartTimestamp: &metav1.Time{Time: now.Add(time.Hour)},
13471351
CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 2)},
1348-
Result: velerov1api.BackupRepositoryMaintenanceFailed,
1352+
Result: velerov1api.BackupRepositoryMaintenanceSucceeded,
13491353
Message: "fake-maintenance-message-2",
13501354
},
13511355
{

pkg/repository/maintenance/maintenance.go

Lines changed: 50 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import (
2222
"fmt"
2323
"math"
2424
"sort"
25+
"strings"
2526
"time"
2627

2728
"github.com/pkg/errors"
@@ -35,6 +36,7 @@ import (
3536
"sigs.k8s.io/controller-runtime/pkg/client"
3637

3738
velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
39+
"github.com/vmware-tanzu/velero/pkg/util"
3840
"github.com/vmware-tanzu/velero/pkg/util/kube"
3941

4042
appsv1 "k8s.io/api/apps/v1"
@@ -47,6 +49,7 @@ import (
4749
const (
4850
RepositoryNameLabel = "velero.io/repo-name"
4951
GlobalKeyForRepoMaintenanceJobCM = "global"
52+
TerminationLogIndicator = "Repo maintenance error: "
5053
)
5154

5255
type JobConfigs struct {
@@ -147,24 +150,37 @@ func getResultFromJob(cli client.Client, job *batchv1.Job) (string, error) {
147150
}
148151

149152
if len(podList.Items) == 0 {
150-
return "", fmt.Errorf("no pod found for job %s", job.Name)
153+
return "", errors.Errorf("no pod found for job %s", job.Name)
151154
}
152155

153156
// we only have one maintenance pod for the job
154157
pod := podList.Items[0]
155158

156159
statuses := pod.Status.ContainerStatuses
157160
if len(statuses) == 0 {
158-
return "", fmt.Errorf("no container statuses found for job %s", job.Name)
161+
return "", errors.Errorf("no container statuses found for job %s", job.Name)
159162
}
160163

161164
// we only have one maintenance container
162165
terminated := statuses[0].State.Terminated
163166
if terminated == nil {
164-
return "", fmt.Errorf("container for job %s is not terminated", job.Name)
167+
return "", errors.Errorf("container for job %s is not terminated", job.Name)
165168
}
166169

167-
return terminated.Message, nil
170+
if terminated.Message == "" {
171+
return "", nil
172+
}
173+
174+
idx := strings.Index(terminated.Message, TerminationLogIndicator)
175+
if idx == -1 {
176+
return "", errors.New("error to locate repo maintenance error indicator from termination message")
177+
}
178+
179+
if idx+len(TerminationLogIndicator) >= len(terminated.Message) {
180+
return "", errors.New("nothing after repo maintenance error indicator in termination message")
181+
}
182+
183+
return terminated.Message[idx+len(TerminationLogIndicator):], nil
168184
}
169185

170186
// getJobConfig is called to get the Maintenance Job Config for the
@@ -331,7 +347,7 @@ func WaitAllJobsComplete(ctx context.Context, cli client.Client, repo *velerov1a
331347
if job.Status.Failed > 0 {
332348
if msg, err := getResultFromJob(cli, job); err != nil {
333349
log.WithError(err).Warnf("Failed to get result of maintenance job %s", job.Name)
334-
message = "Repo maintenance failed but result is not retrieveable"
350+
message = fmt.Sprintf("Repo maintenance failed but result is not retrieveable, err: %v", err)
335351
} else {
336352
message = msg
337353
}
@@ -434,6 +450,16 @@ func buildJob(cli client.Client, ctx context.Context, repo *velerov1api.BackupRe
434450
return nil, errors.Wrap(err, "failed to parse resource requirements for maintenance job")
435451
}
436452

453+
podLabels := map[string]string{
454+
RepositoryNameLabel: repo.Name,
455+
}
456+
457+
for _, k := range util.ThirdPartyLabels {
458+
if v := veleroutil.GetVeleroServerLabelValue(deployment, k); v != "" {
459+
podLabels[k] = v
460+
}
461+
}
462+
437463
// Set arguments
438464
args := []string{"repo-maintenance"}
439465
args = append(args, fmt.Sprintf("--repo-name=%s", repo.Spec.VolumeNamespace))
@@ -455,10 +481,8 @@ func buildJob(cli client.Client, ctx context.Context, repo *velerov1api.BackupRe
455481
BackoffLimit: new(int32), // Never retry
456482
Template: v1.PodTemplateSpec{
457483
ObjectMeta: metav1.ObjectMeta{
458-
Name: "velero-repo-maintenance-pod",
459-
Labels: map[string]string{
460-
RepositoryNameLabel: repo.Name,
461-
},
484+
Name: "velero-repo-maintenance-pod",
485+
Labels: podLabels,
462486
},
463487
Spec: v1.PodSpec{
464488
Containers: []v1.Container{
@@ -468,17 +492,26 @@ func buildJob(cli client.Client, ctx context.Context, repo *velerov1api.BackupRe
468492
Command: []string{
469493
"/velero",
470494
},
471-
Args: args,
472-
ImagePullPolicy: v1.PullIfNotPresent,
473-
Env: envVars,
474-
EnvFrom: envFromSources,
475-
VolumeMounts: volumeMounts,
476-
Resources: resources,
495+
Args: args,
496+
ImagePullPolicy: v1.PullIfNotPresent,
497+
Env: envVars,
498+
EnvFrom: envFromSources,
499+
VolumeMounts: volumeMounts,
500+
Resources: resources,
501+
TerminationMessagePolicy: v1.TerminationMessageFallbackToLogsOnError,
477502
},
478503
},
479504
RestartPolicy: v1.RestartPolicyNever,
480505
Volumes: volumes,
481506
ServiceAccountName: serviceAccount,
507+
Tolerations: []v1.Toleration{
508+
{
509+
Key: "os",
510+
Operator: "Equal",
511+
Effect: "NoSchedule",
512+
Value: "windows",
513+
},
514+
},
482515
},
483516
},
484517
},
@@ -489,22 +522,6 @@ func buildJob(cli client.Client, ctx context.Context, repo *velerov1api.BackupRe
489522
job.Spec.Template.Spec.Affinity = affinity
490523
}
491524

492-
if tolerations := veleroutil.GetTolerationsFromVeleroServer(deployment); tolerations != nil {
493-
job.Spec.Template.Spec.Tolerations = tolerations
494-
}
495-
496-
if nodeSelector := veleroutil.GetNodeSelectorFromVeleroServer(deployment); nodeSelector != nil {
497-
job.Spec.Template.Spec.NodeSelector = nodeSelector
498-
}
499-
500-
if labels := veleroutil.GetVeleroServerLables(deployment); len(labels) > 0 {
501-
job.Spec.Template.Labels = labels
502-
}
503-
504-
if annotations := veleroutil.GetVeleroServerAnnotations(deployment); len(annotations) > 0 {
505-
job.Spec.Template.Annotations = annotations
506-
}
507-
508525
return job, nil
509526
}
510527

@@ -516,8 +533,8 @@ func composeStatusFromJob(job *batchv1.Job, message string) velerov1api.BackupRe
516533

517534
return velerov1api.BackupRepositoryMaintenanceStatus{
518535
Result: result,
519-
StartTimestamp: &metav1.Time{Time: job.CreationTimestamp.Time},
520-
CompleteTimestamp: &metav1.Time{Time: job.Status.CompletionTime.Time},
536+
StartTimestamp: &job.CreationTimestamp,
537+
CompleteTimestamp: job.Status.CompletionTime,
521538
Message: message,
522539
}
523540
}

0 commit comments

Comments
 (0)