@@ -116,7 +116,7 @@ public CloudFetchDownloader(
116116 /// <param name="resultFetcher">The result fetcher that manages URLs.</param>
117117 /// <param name="maxParallelDownloads">Maximum parallel downloads.</param>
118118 /// <param name="isLz4Compressed">Whether results are LZ4 compressed.</param>
119- /// <param name="maxRetries">Maximum retry attempts (0 = no limit, use timeout only).</param>
119+ /// <param name="maxRetries">Maximum retry attempts: -1 = no limit (use timeout only), 0 = no retries (single attempt), positive = maximum number of retries after the initial attempt.</param>
120120 /// <param name="retryTimeoutSeconds">Time budget for retries in seconds (optional, default 300).</param>
121121 /// <param name="retryDelayMs">Initial delay between retries in ms (optional, default 500).</param>
122122 internal CloudFetchDownloader (
@@ -492,30 +492,16 @@ await _activityTracer.TraceActivityAsync(async activity =>
492492 ] ) ;
493493
494494 // Retry logic with time-budget approach and exponential backoff with jitter.
495- // Similar to RetryHttpHandler: keeps retrying until the time budget is exhausted,
495+ // Keeps retrying until the time budget (real elapsed time) is exhausted,
496496 // rather than a fixed retry count. This gives transient issues (firewall, proxy 502,
497497 // connection drops) enough time to resolve.
498498 int currentBackoffMs = _retryDelayMs ;
499- int totalRetryWaitMs = 0 ;
499+ long retryTimeoutMs = Math . Min ( ( long ) _retryTimeoutSeconds , int . MaxValue / 1000L ) * 1000L ;
500500 int attemptCount = 0 ;
501501 Exception ? lastException = null ;
502502
503503 while ( ! cancellationToken . IsCancellationRequested )
504504 {
505- // Check if we've exceeded the max retry count (if set)
506- if ( _maxRetries >= 0 && attemptCount >= _maxRetries )
507- {
508- activity ? . AddEvent ( "cloudfetch.download_max_retries_exceeded" , [
509- new ( "offset" , downloadResult . StartRowOffset ) ,
510- new ( "sanitized_url" , SanitizeUrl ( url ) ) ,
511- new ( "total_attempts" , attemptCount ) ,
512- new ( "max_retries" , _maxRetries ) ,
513- new ( "last_error" , lastException ? . GetType ( ) . Name ?? "none" ) ,
514- new ( "last_error_message" , lastException ? . Message ?? "none" )
515- ] ) ;
516- break ;
517- }
518-
519505 attemptCount ++ ;
520506 try
521507 {
@@ -594,36 +580,43 @@ await _activityTracer.TraceActivityAsync(async activity =>
594580 {
595581 lastException = ex ;
596582
597- // Exponential backoff with jitter (80-120% of base)
598- int waitMs = ( int ) Math . Max ( 100 , currentBackoffMs * ( 0.8 + new Random ( ) . NextDouble ( ) * 0.4 ) ) ;
583+ // Check if we've exceeded the max retry count (if set)
584+ // -1 = unlimited, 0 = no retries (single attempt), >0 = max retry attempts
585+ if ( _maxRetries >= 0 && attemptCount > _maxRetries )
586+ {
587+ activity ? . AddEvent ( "cloudfetch.download_max_retries_exceeded" , [
588+ new ( "offset" , downloadResult . StartRowOffset ) ,
589+ new ( "sanitized_url" , SanitizeUrl ( url ) ) ,
590+ new ( "total_attempts" , attemptCount ) ,
591+ new ( "max_retries" , _maxRetries )
592+ ] ) ;
593+ break ;
594+ }
599595
600- // Check if we would exceed the retry time budget
601- int retryTimeoutMs = _retryTimeoutSeconds * 1000 ;
602- if ( retryTimeoutMs > 0 && totalRetryWaitMs + waitMs > retryTimeoutMs )
596+ // Check if we've exceeded the time budget (real elapsed time)
597+ if ( retryTimeoutMs > 0 && stopwatch . ElapsedMilliseconds >= retryTimeoutMs )
603598 {
604599 activity ? . AddEvent ( "cloudfetch.download_retry_timeout_exceeded" , [
605600 new ( "offset" , downloadResult . StartRowOffset ) ,
606601 new ( "sanitized_url" , SanitizeUrl ( url ) ) ,
607602 new ( "total_attempts" , attemptCount ) ,
608- new ( "total_retry_wait_ms " , totalRetryWaitMs ) ,
603+ new ( "elapsed_ms " , stopwatch . ElapsedMilliseconds ) ,
609604 new ( "retry_timeout_seconds" , _retryTimeoutSeconds ) ,
610- new ( "last_error" , ex . GetType ( ) . Name ) ,
611- new ( "last_error_message" , ex . Message )
605+ new ( "last_error" , ex . GetType ( ) . Name )
612606 ] ) ;
613- break ; // Exceeded time budget
607+ break ;
614608 }
615609
616- totalRetryWaitMs += waitMs ;
610+ // Exponential backoff with jitter (80-120% of base)
611+ int waitMs = ( int ) Math . Max ( 100 , currentBackoffMs * ( 0.8 + new Random ( ) . NextDouble ( ) * 0.4 ) ) ;
617612
618613 activity ? . AddEvent ( "cloudfetch.download_retry" , [
619- new ( "error.context" , "cloudfetch.download_retry" ) ,
620614 new ( "offset" , downloadResult . StartRowOffset ) ,
621615 new ( "sanitized_url" , SanitizeUrl ( url ) ) ,
622616 new ( "attempt" , attemptCount ) ,
617+ new ( "elapsed_ms" , stopwatch . ElapsedMilliseconds ) ,
623618 new ( "retry_timeout_seconds" , _retryTimeoutSeconds ) ,
624- new ( "total_retry_wait_ms" , totalRetryWaitMs ) ,
625619 new ( "error_type" , ex . GetType ( ) . Name ) ,
626- new ( "error_message" , ex . Message ) ,
627620 new ( "backoff_ms" , waitMs )
628621 ] ) ;
629622
@@ -639,7 +632,6 @@ await _activityTracer.TraceActivityAsync(async activity =>
639632 new ( "offset" , downloadResult . StartRowOffset ) ,
640633 new ( "sanitized_url" , sanitizedUrl ) ,
641634 new ( "total_attempts" , attemptCount ) ,
642- new ( "total_retry_wait_ms" , totalRetryWaitMs ) ,
643635 new ( "elapsed_time_ms" , stopwatch . ElapsedMilliseconds )
644636 ] ) ;
645637
@@ -649,7 +641,7 @@ await _activityTracer.TraceActivityAsync(async activity =>
649641 ? $ "max_retries: { _maxRetries } , timeout: { _retryTimeoutSeconds } s"
650642 : $ "timeout: { _retryTimeoutSeconds } s";
651643 throw new InvalidOperationException (
652- $ "Failed to download file from { sanitizedUrl } after { attemptCount } attempts over { totalRetryWaitMs / 1000 } s ({ retryLimits } ).",
644+ $ "Failed to download file from { sanitizedUrl } after { attemptCount } attempts over { stopwatch . ElapsedMilliseconds / 1000 } s ({ retryLimits } ).",
653645 lastException ) ;
654646 }
655647
0 commit comments