@@ -63,6 +63,7 @@ import (
6363 "github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
6464 "github.com/cockroachdb/cockroach/pkg/sql/sqlclustersettings"
6565 "github.com/cockroachdb/cockroach/pkg/sql/stats"
66+ "github.com/cockroachdb/cockroach/pkg/util"
6667 bulkutil "github.com/cockroachdb/cockroach/pkg/util/bulk"
6768 "github.com/cockroachdb/cockroach/pkg/util/ctxgroup"
6869 "github.com/cockroachdb/cockroach/pkg/util/hlc"
@@ -91,20 +92,31 @@ var (
9192 settings .WithVisibility (settings .Reserved ),
9293 settings .PositiveDuration ,
9394 )
95+
96+ restoreRetryLogRate = settings .RegisterDurationSetting (
97+ settings .ApplicationLevel ,
98+ "restore.retry_log_rate" ,
99+ "maximum rate at which retryable restore errors are logged to the job messages table" ,
100+ 5 * time .Minute ,
101+ settings .WithVisibility (settings .Reserved ),
102+ settings .PositiveDuration ,
103+ )
94104)
95105
const (
	// restoreStatsInsertBatchSize is an arbitrarily chosen value of the number of
	// tables we process in a single txn when restoring their table statistics.
	restoreStatsInsertBatchSize = 10

	// maxRestoreRetryFastFail is the maximum number of retry attempts allowed
	// before the restore job fast-fails, unless the job's progress has exceeded
	// restoreRetryProgressThreshold.
	maxRestoreRetryFastFail = 5

	// restoreRetryProgressThreshold is the fraction of the job that must
	// be _exceeded_ before we no longer fast fail the restore job after hitting the
	// maxRestoreRetryFastFail threshold.
	restoreRetryProgressThreshold = 0
)
108120
109121var restoreStatsInsertionConcurrency = settings .RegisterIntSetting (
110122 settings .ApplicationLevel ,
@@ -198,10 +210,12 @@ func restoreWithRetry(
198210 // We want to retry a restore if there are transient failures (i.e. worker nodes
199211 // dying), so if we receive a retryable error, re-plan and retry the restore.
200212 retryOpts , progThreshold := getRetryOptionsAndProgressThreshold (execCtx )
213+ logRate := restoreRetryLogRate .Get (& execCtx .ExecCfg ().Settings .SV )
214+ logThrottler := util .Every (logRate )
201215 var (
202- res roachpb.RowCount
203- err error
204- currPersistedSpans , prevPersistedSpans jobspb.RestoreFrontierEntries
216+ res roachpb.RowCount
217+ err error
218+ prevPersistedSpans jobspb.RestoreFrontierEntries
205219 )
206220 for r := retry .StartWithCtx (ctx , retryOpts ); r .Next (); {
207221 res , err = restore (
@@ -237,16 +251,19 @@ func restoreWithRetry(
237251
238252 log .Warningf (ctx , "encountered retryable error: %+v" , err )
239253
240- // Check if retry counter should be reset if progress was made.
241- currPersistedSpans = resumer .job .
242- Progress ().Details .(* jobspb.Progress_Restore ).Restore .Checkpoint
243- if ! currPersistedSpans .Equal (prevPersistedSpans ) {
244- // If the previous persisted spans are different than the current, it
245- // implies that further progress has been persisted.
246- r .Reset ()
247- log .Infof (ctx , "restored frontier has advanced since last retry, resetting retry counter" )
254+ if logThrottler .ShouldProcess (timeutil .Now ()) {
255+ // We throttle the logging of errors to the jobs messages table to avoid
256+ // flooding the table during the hot loop of a retry.
257+ if err := execCtx .ExecCfg ().InternalDB .Txn (ctx , func (ctx context.Context , txn isql.Txn ) error {
258+ return resumer .job .Messages ().Record (
259+ ctx , txn , "error" , fmt .Sprintf ("restore encountered error: %v" , err ),
260+ )
261+ }); err != nil {
262+ log .Warningf (ctx , "failed to record job error message: %v" , err )
263+ }
248264 }
249- prevPersistedSpans = currPersistedSpans
265+
266+ prevPersistedSpans = maybeResetRetry (ctx , resumer , & r , prevPersistedSpans )
250267
251268 // Fail fast if no progress has been made after a certain number of retries.
252269 if r .CurrentAttempt () >= maxRestoreRetryFastFail &&
@@ -268,6 +285,7 @@ func restoreWithRetry(
268285 if err != nil {
269286 return res , jobs .MarkPauseRequestError (errors .Wrap (err , "exhausted retries" ))
270287 }
288+
271289 return res , nil
272290}
273291
@@ -281,8 +299,9 @@ func getRetryOptionsAndProgressThreshold(execCtx sql.JobExecContext) (retry.Opti
281299 // event that some progress has been made.
282300 maxDuration := restoreRetryMaxDuration .Get (& execCtx .ExecCfg ().Settings .SV )
283301 retryOpts := retry.Options {
284- MaxBackoff : 5 * time .Minute ,
285- MaxDuration : maxDuration ,
302+ InitialBackoff : 50 * time .Millisecond ,
303+ MaxBackoff : 5 * time .Minute ,
304+ MaxDuration : maxDuration ,
286305 }
287306 var progThreshold float32 = restoreRetryProgressThreshold
288307 if knobs := execCtx .ExecCfg ().BackupRestoreTestingKnobs ; knobs != nil {
@@ -297,6 +316,26 @@ func getRetryOptionsAndProgressThreshold(execCtx sql.JobExecContext) (retry.Opti
297316 return retryOpts , progThreshold
298317}
299318
319+ // maybeResetRetry checks on the progress of the restore job and resets the
320+ // retry loop if progress has been made. It returns the latest progress.
321+ func maybeResetRetry (
322+ ctx context.Context ,
323+ resumer * restoreResumer ,
324+ rt * retry.Retry ,
325+ prevProgress jobspb.RestoreFrontierEntries ,
326+ ) jobspb.RestoreFrontierEntries {
327+ // Check if retry counter should be reset if progress was made.
328+ var currProgress jobspb.RestoreFrontierEntries = resumer .job .
329+ Progress ().Details .(* jobspb.Progress_Restore ).Restore .Checkpoint
330+ if ! currProgress .Equal (prevProgress ) {
331+ // If the previous persisted spans are different than the current, it
332+ // implies that further progress has been persisted.
333+ rt .Reset ()
334+ log .Infof (ctx , "restored frontier has advanced since last retry, resetting retry counter" )
335+ }
336+ return currProgress
337+ }
338+
300339type storeByLocalityKV map [string ]cloudpb.ExternalStorage
301340
302341func makeBackupLocalityMap (
@@ -384,10 +423,13 @@ func restore(
384423 restoreCheckpoint := job .Progress ().Details .(* jobspb.Progress_Restore ).Restore .Checkpoint
385424 requiredSpans := dataToRestore .getSpans ()
386425 progressTracker , err := makeProgressTracker (
426+ job ,
427+ execCtx .ExecCfg (),
387428 requiredSpans ,
388429 restoreCheckpoint ,
389430 restoreCheckpointMaxBytes .Get (& execCtx .ExecCfg ().Settings .SV ),
390- endTime )
431+ endTime ,
432+ )
391433 if err != nil {
392434 return emptyRowCount , err
393435 }
0 commit comments