Skip to content

Commit 56888b5

Browse files
committed
restore: log restore retries to job messages table
Currently, when a restore job is stuck in a retry loop, no indication is given to the user that errors were encountered and the job is retrying (aside from within the logs). This commit teaches the restore retrier to log errors to the job messages table, throttled to once every 5 minutes to avoid flooding the table during a hot retry loop. Epic: CRDB-50823 Fixes: cockroachdb#149787, cockroachdb#148033 Release note (general change): Restore jobs now log errors on retry to the job messages table.
1 parent a685bfb commit 56888b5

File tree

5 files changed

+186
-31
lines changed

5 files changed

+186
-31
lines changed

pkg/backup/restore_job.go

Lines changed: 67 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ import (
6363
"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
6464
"github.com/cockroachdb/cockroach/pkg/sql/sqlclustersettings"
6565
"github.com/cockroachdb/cockroach/pkg/sql/stats"
66+
"github.com/cockroachdb/cockroach/pkg/util"
6667
bulkutil "github.com/cockroachdb/cockroach/pkg/util/bulk"
6768
"github.com/cockroachdb/cockroach/pkg/util/ctxgroup"
6869
"github.com/cockroachdb/cockroach/pkg/util/hlc"
@@ -91,20 +92,31 @@ var (
9192
settings.WithVisibility(settings.Reserved),
9293
settings.PositiveDuration,
9394
)
95+
96+
restoreRetryLogRate = settings.RegisterDurationSetting(
97+
settings.ApplicationLevel,
98+
"restore.retry_log_rate",
99+
"maximum rate at which retryable restore errors are logged to the job messages table",
100+
5*time.Minute,
101+
settings.WithVisibility(settings.Reserved),
102+
settings.PositiveDuration,
103+
)
94104
)
95105

96-
// restoreStatsInsertBatchSize is an arbitrarily chosen value of the number of
97-
// tables we process in a single txn when restoring their table statistics.
98-
const restoreStatsInsertBatchSize = 10
106+
const (
107+
// restoreStatsInsertBatchSize is an arbitrarily chosen value of the number of
108+
// tables we process in a single txn when restoring their table statistics.
109+
restoreStatsInsertBatchSize = 10
99110

100-
// maxRestoreRetryFastFail is the maximum number of times we will retry without
101-
// seeing any progress before fast-failing the restore job.
102-
const maxRestoreRetryFastFail = 5
111+
// maxRestoreRetryFastFail is the maximum number of times we will retry before
112+
// exceeding the restoreRetryProgressThreshold.
113+
maxRestoreRetryFastFail = 5
103114

104-
// restoreRetryProgressThreshold is the fraction of the job that must
105-
// be _exceeded_ before we no longer fast fail the restore job after hitting the
106-
// maxRestoreRetryFastFail threshold.
107-
const restoreRetryProgressThreshold = 0
115+
// restoreRetryProgressThreshold is the fraction of the job that must
116+
// be _exceeded_ before we no longer fast fail the restore job after hitting the
117+
// maxRestoreRetryFastFail threshold.
118+
restoreRetryProgressThreshold = 0
119+
)
108120

109121
var restoreStatsInsertionConcurrency = settings.RegisterIntSetting(
110122
settings.ApplicationLevel,
@@ -198,10 +210,12 @@ func restoreWithRetry(
198210
// We want to retry a restore if there are transient failures (i.e. worker nodes
199211
// dying), so if we receive a retryable error, re-plan and retry the restore.
200212
retryOpts, progThreshold := getRetryOptionsAndProgressThreshold(execCtx)
213+
logRate := restoreRetryLogRate.Get(&execCtx.ExecCfg().Settings.SV)
214+
logThrottler := util.Every(logRate)
201215
var (
202-
res roachpb.RowCount
203-
err error
204-
currPersistedSpans, prevPersistedSpans jobspb.RestoreFrontierEntries
216+
res roachpb.RowCount
217+
err error
218+
prevPersistedSpans jobspb.RestoreFrontierEntries
205219
)
206220
for r := retry.StartWithCtx(ctx, retryOpts); r.Next(); {
207221
res, err = restore(
@@ -237,16 +251,19 @@ func restoreWithRetry(
237251

238252
log.Warningf(ctx, "encountered retryable error: %+v", err)
239253

240-
// Check if retry counter should be reset if progress was made.
241-
currPersistedSpans = resumer.job.
242-
Progress().Details.(*jobspb.Progress_Restore).Restore.Checkpoint
243-
if !currPersistedSpans.Equal(prevPersistedSpans) {
244-
// If the previous persisted spans are different than the current, it
245-
// implies that further progress has been persisted.
246-
r.Reset()
247-
log.Infof(ctx, "restored frontier has advanced since last retry, resetting retry counter")
254+
if logThrottler.ShouldProcess(timeutil.Now()) {
255+
// We throttle the logging of errors to the jobs messages table to avoid
256+
// flooding the table during the hot loop of a retry.
257+
if err := execCtx.ExecCfg().InternalDB.Txn(ctx, func(ctx context.Context, txn isql.Txn) error {
258+
return resumer.job.Messages().Record(
259+
ctx, txn, "error", fmt.Sprintf("restore encountered error: %v", err),
260+
)
261+
}); err != nil {
262+
log.Warningf(ctx, "failed to record job error message: %v", err)
263+
}
248264
}
249-
prevPersistedSpans = currPersistedSpans
265+
266+
prevPersistedSpans = maybeResetRetry(ctx, resumer, &r, prevPersistedSpans)
250267

251268
// Fail fast if no progress has been made after a certain number of retries.
252269
if r.CurrentAttempt() >= maxRestoreRetryFastFail &&
@@ -268,6 +285,7 @@ func restoreWithRetry(
268285
if err != nil {
269286
return res, jobs.MarkPauseRequestError(errors.Wrap(err, "exhausted retries"))
270287
}
288+
271289
return res, nil
272290
}
273291

@@ -281,8 +299,9 @@ func getRetryOptionsAndProgressThreshold(execCtx sql.JobExecContext) (retry.Opti
281299
// event that some progress has been made.
282300
maxDuration := restoreRetryMaxDuration.Get(&execCtx.ExecCfg().Settings.SV)
283301
retryOpts := retry.Options{
284-
MaxBackoff: 5 * time.Minute,
285-
MaxDuration: maxDuration,
302+
InitialBackoff: 50 * time.Millisecond,
303+
MaxBackoff: 5 * time.Minute,
304+
MaxDuration: maxDuration,
286305
}
287306
var progThreshold float32 = restoreRetryProgressThreshold
288307
if knobs := execCtx.ExecCfg().BackupRestoreTestingKnobs; knobs != nil {
@@ -297,6 +316,26 @@ func getRetryOptionsAndProgressThreshold(execCtx sql.JobExecContext) (retry.Opti
297316
return retryOpts, progThreshold
298317
}
299318

319+
// maybeResetRetry checks on the progress of the restore job and resets the
320+
// retry loop if progress has been made. It returns the latest progress.
321+
func maybeResetRetry(
322+
ctx context.Context,
323+
resumer *restoreResumer,
324+
rt *retry.Retry,
325+
prevProgress jobspb.RestoreFrontierEntries,
326+
) jobspb.RestoreFrontierEntries {
327+
// Check if retry counter should be reset if progress was made.
328+
var currProgress jobspb.RestoreFrontierEntries = resumer.job.
329+
Progress().Details.(*jobspb.Progress_Restore).Restore.Checkpoint
330+
if !currProgress.Equal(prevProgress) {
331+
// If the previous persisted spans are different than the current, it
332+
// implies that further progress has been persisted.
333+
rt.Reset()
334+
log.Infof(ctx, "restored frontier has advanced since last retry, resetting retry counter")
335+
}
336+
return currProgress
337+
}
338+
300339
type storeByLocalityKV map[string]cloudpb.ExternalStorage
301340

302341
func makeBackupLocalityMap(
@@ -384,10 +423,13 @@ func restore(
384423
restoreCheckpoint := job.Progress().Details.(*jobspb.Progress_Restore).Restore.Checkpoint
385424
requiredSpans := dataToRestore.getSpans()
386425
progressTracker, err := makeProgressTracker(
426+
job,
427+
execCtx.ExecCfg(),
387428
requiredSpans,
388429
restoreCheckpoint,
389430
restoreCheckpointMaxBytes.Get(&execCtx.ExecCfg().Settings.SV),
390-
endTime)
431+
endTime,
432+
)
391433
if err != nil {
392434
return emptyRowCount, err
393435
}

pkg/backup/restore_progress.go

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,11 @@ import (
99
"context"
1010

1111
"github.com/cockroachdb/cockroach/pkg/backup/backuppb"
12+
"github.com/cockroachdb/cockroach/pkg/jobs"
1213
"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
1314
"github.com/cockroachdb/cockroach/pkg/roachpb"
1415
"github.com/cockroachdb/cockroach/pkg/settings"
16+
"github.com/cockroachdb/cockroach/pkg/sql"
1517
"github.com/cockroachdb/cockroach/pkg/sql/execinfrapb"
1618
"github.com/cockroachdb/cockroach/pkg/util/hlc"
1719
"github.com/cockroachdb/cockroach/pkg/util/log"
@@ -36,6 +38,9 @@ var restoreCheckpointMaxBytes = settings.RegisterByteSizeSetting(
3638
var completedSpanTime = hlc.MaxTimestamp
3739

3840
type progressTracker struct {
41+
job *jobs.Job
42+
execCfg *sql.ExecutorConfig
43+
3944
// nextRequiredSpanKey maps a required span endkey to the subsequent requiredSpan's startKey.
4045
nextRequiredSpanKey map[string]roachpb.Key
4146

@@ -56,12 +61,13 @@ type progressTracker struct {
5661
}
5762

5863
func makeProgressTracker(
64+
job *jobs.Job,
65+
execCfg *sql.ExecutorConfig,
5966
requiredSpans roachpb.Spans,
6067
persistedSpans []jobspb.RestoreProgress_FrontierEntry,
6168
maxBytes int64,
6269
endTime hlc.Timestamp,
6370
) (*progressTracker, error) {
64-
6571
var (
6672
checkpointFrontier spanUtils.Frontier
6773
err error
@@ -76,13 +82,17 @@ func makeProgressTracker(
7682
nextRequiredSpanKey[requiredSpans[i].EndKey.String()] = requiredSpans[i+1].Key
7783
}
7884

79-
pt := &progressTracker{}
85+
pt := &progressTracker{
86+
job: job,
87+
execCfg: execCfg,
88+
nextRequiredSpanKey: nextRequiredSpanKey,
89+
maxBytes: maxBytes,
90+
endTime: endTime,
91+
}
8092
pt.mu.checkpointFrontier = checkpointFrontier
81-
pt.nextRequiredSpanKey = nextRequiredSpanKey
82-
pt.maxBytes = maxBytes
83-
pt.endTime = endTime
8493
return pt, nil
8594
}
95+
8696
func (pt *progressTracker) close() {
8797
pt.mu.Lock()
8898
defer pt.mu.Unlock()

pkg/backup/restore_progress_test.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,9 @@ func TestProgressTracker(t *testing.T) {
8282
},
8383
} {
8484
restoreTime := hlc.Timestamp{}
85-
pt, err := makeProgressTracker(requiredSpans, persistedSpans, 0, restoreTime)
85+
pt, err := makeProgressTracker(
86+
nil /* job */, &execCfg, requiredSpans, persistedSpans, 0, restoreTime,
87+
)
8688
require.NoError(t, err, "step %d", i)
8789

8890
done, err := pt.ingestUpdate(ctx, mockUpdate(step.update, step.completeUpTo))

pkg/backup/restore_test.go

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import (
1313

1414
"github.com/cockroachdb/cockroach/pkg/backup/backuptestutils"
1515
"github.com/cockroachdb/cockroach/pkg/base"
16+
"github.com/cockroachdb/cockroach/pkg/cloud/nodelocal"
1617
"github.com/cockroachdb/cockroach/pkg/jobs"
1718
"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
1819
"github.com/cockroachdb/cockroach/pkg/sql"
@@ -206,3 +207,91 @@ func TestRestoreRetryFastFails(t *testing.T) {
206207
require.Equal(t, expFastFailAttempts, attempts)
207208
})
208209
}
210+
211+
func TestRestoreJobMessages(t *testing.T) {
212+
defer leaktest.AfterTest(t)()
213+
defer log.Scope(t).Close(t)
214+
215+
mu := struct {
216+
syncutil.Mutex
217+
retryCount int
218+
}{}
219+
// allowSuccess is a channel that will be closed when we want to allow the
220+
// restore job to succeed.
221+
allowSuccess := make(chan struct{})
222+
223+
testKnobs := &sql.BackupRestoreTestingKnobs{
224+
RestoreDistSQLRetryPolicy: &retry.Options{
225+
InitialBackoff: time.Microsecond,
226+
Multiplier: 1,
227+
MaxBackoff: time.Microsecond,
228+
// We will be allowing the restore job to succeed after a few job messages
229+
// are logged, so we just need MaxDuration to be long enough that it won't
230+
// be hit.
231+
MaxDuration: 5 * time.Minute,
232+
},
233+
RunBeforeRestoreFlow: func() error {
234+
mu.Lock()
235+
defer mu.Unlock()
236+
237+
if mu.retryCount < maxRestoreRetryFastFail {
238+
// Have not consumed all retries before a fast fail.
239+
mu.retryCount++
240+
return syscall.ECONNRESET
241+
}
242+
243+
return nil
244+
},
245+
RunAfterRestoreFlow: func() error {
246+
mu.Lock()
247+
defer mu.Unlock()
248+
249+
select {
250+
case <-allowSuccess:
251+
return nil
252+
default:
253+
mu.retryCount++
254+
return syscall.ECONNRESET
255+
}
256+
},
257+
}
258+
var params base.TestClusterArgs
259+
params.ServerArgs.Knobs.BackupRestore = testKnobs
260+
261+
const numAccounts = 10
262+
_, sqlDB, tmpDir, cleanupFn := backuptestutils.StartBackupRestoreTestCluster(
263+
t, singleNode, backuptestutils.WithParams(params), backuptestutils.WithBank(numAccounts),
264+
)
265+
defer cleanupFn()
266+
defer nodelocal.ReplaceNodeLocalForTesting(tmpDir)()
267+
268+
sqlDB.Exec(t, "SET CLUSTER SETTING restore.retry_log_rate = '100ms'")
269+
sqlDB.Exec(t, "BACKUP DATABASE data INTO 'nodelocal://1/backup'")
270+
271+
var restoreJobID jobspb.JobID
272+
sqlDB.QueryRow(
273+
t, `RESTORE DATABASE data FROM LATEST IN 'nodelocal://1/backup'
274+
WITH detached, new_db_name = 'restored_data_'`,
275+
).Scan(&restoreJobID)
276+
277+
// Allow the restore job to fail a few times to log some error messages.
278+
time.AfterFunc(2*time.Second, func() {
279+
close(allowSuccess)
280+
})
281+
jobutils.WaitForJobToHaveStatus(t, sqlDB, restoreJobID, jobs.StateSucceeded)
282+
283+
var numErrMessages int
284+
sqlDB.QueryRow(
285+
t, `SELECT count(*) FROM system.job_message WHERE job_id = $1 AND kind = $2`,
286+
restoreJobID, "error",
287+
).Scan(&numErrMessages)
288+
require.Greater(t, numErrMessages, 1)
289+
// Depending on if the test is run under stress or not, we may log more
290+
// messages, so we just check that we log fewer than 50 messages. If there
291+
// were no throttling, there would be significantly more messages logged
292+
// (100+ without stress), so this is sufficient.
293+
require.Less(t, numErrMessages, 50)
294+
295+
finalStatusMsg := jobutils.GetJobStatusMessage(t, sqlDB, restoreJobID)
296+
require.Empty(t, finalStatusMsg, "status message should be cleared on job completion")
297+
}

pkg/testutils/jobutils/jobs_verification.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,18 @@ func WaitForJobToHaveStatus(
100100
}
101101
}
102102

103+
// GetJobStatusMessage retrieves the current status message for a job.
104+
func GetJobStatusMessage(
105+
t testing.TB, db *sqlutils.SQLRunner, jobID jobspb.JobID,
106+
) jobs.StatusMessage {
107+
t.Helper()
108+
statuses := db.QueryStr(t, "SELECT status FROM system.job_status WHERE job_id = $1", jobID)
109+
if len(statuses) == 0 {
110+
return ""
111+
}
112+
return jobs.StatusMessage(statuses[0][0])
113+
}
114+
103115
// BulkOpResponseFilter creates a blocking response filter for the responses
104116
// related to bulk IO/backup/restore/import: Export, Import and AddSSTable. See
105117
// discussion on RunJob for where this might be useful.

0 commit comments

Comments
 (0)