diff --git a/Microsoft.Azure.Cosmos/src/MetadataRetryHelper.cs b/Microsoft.Azure.Cosmos/src/MetadataRetryHelper.cs
new file mode 100644
index 0000000000..24e5c801cf
--- /dev/null
+++ b/Microsoft.Azure.Cosmos/src/MetadataRetryHelper.cs
@@ -0,0 +1,246 @@
+//------------------------------------------------------------
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//------------------------------------------------------------
+
+namespace Microsoft.Azure.Cosmos
+{
+ using System;
+ using System.Runtime.ExceptionServices;
+ using System.Threading;
+ using System.Threading.Tasks;
+ using Microsoft.Azure.Cosmos.Core.Trace;
+ using Microsoft.Azure.Documents;
+
+ /// <summary>
+ /// Retry helper for internal metadata (control-plane) reads such as
+ /// ClientCollectionCache.ReadCollectionAsync.
+ ///
+ /// The shared BackoffRetryUtility evaluates the caller's CancellationToken
+ /// before consulting ShouldRetryAsync. If the
+ /// caller's timeout trips during the control-plane HTTP timeout escalation
+ /// (0.5s → 5s → 30s ≈ 36s) for an unhealthy region, the cross-region failover
+ /// that ClientRetryPolicy would otherwise execute is preempted and the
+ /// operation surfaces an OperationCanceledException.
+ ///
+ /// This helper runs a bounded retry loop that always consults the retry policy
+ /// on exception and, when the caller's token has already been cancelled, grants
+ /// a small bounded grace window so the cross-region failover attempt can run.
+ /// The grace window is intentionally short — the goal is best-effort availability,
+ /// not unbounded timeout extension. If the retry attempt does not complete within
+ /// the grace window, the original exception is rethrown.
+ ///
+ /// Note on the grace bound: the grace CancellationTokenSource controls
+ /// when the grace attempt may START (via ThrowIfCancellationRequested and
+ /// propagation into the operation lambda). If the underlying operation does not
+ /// observe the grace token (e.g. the store model ignores cancellation), the in-flight
+ /// call may exceed the grace window. Callers relying on a strict upper bound should
+ /// ensure the operation honors its CancellationToken.
+ /// </summary>
+ internal static class MetadataRetryHelper
+ {
+ /// <summary>
+ /// Maximum additional time granted to a metadata read after the caller's
+ /// cancellation token has tripped so that cross-region retry can execute.
+ /// </summary>
+ internal static readonly TimeSpan DefaultCrossRegionRetryGrace = TimeSpan.FromSeconds(10);
+
+ /// <summary>
+ /// Defensive upper bound on the number of attempts the helper makes within a single
+ /// ExecuteAsync invocation. ClientRetryPolicy and peers already bound their own retry
+ /// counts, but a misconfigured policy that always returns ShouldRetry=true would
+ /// otherwise spin this loop indefinitely.
+ /// </summary>
+ private const int MaxAttemptsHardCap = 20;
+
+ internal static Task<TResult> ExecuteAsync<TResult>(
+ Func<CancellationToken, Task<TResult>> operation,
+ IDocumentClientRetryPolicy retryPolicy,
+ CancellationToken cancellationToken)
+ {
+ return ExecuteAsync(operation, retryPolicy, DefaultCrossRegionRetryGrace, cancellationToken);
+ }
+
+ internal static async Task<TResult> ExecuteAsync<TResult>(
+ Func<CancellationToken, Task<TResult>> operation,
+ IDocumentClientRetryPolicy retryPolicy,
+ TimeSpan crossRegionRetryGrace,
+ CancellationToken cancellationToken)
+ {
+ if (operation == null)
+ {
+ throw new ArgumentNullException(nameof(operation));
+ }
+
+ if (retryPolicy == null)
+ {
+ throw new ArgumentNullException(nameof(retryPolicy));
+ }
+
+ if (crossRegionRetryGrace < TimeSpan.Zero)
+ {
+ throw new ArgumentOutOfRangeException(
+ nameof(crossRegionRetryGrace),
+ "Cross-region retry grace must not be negative.");
+ }
+
+ // Contract: the operation lambda MUST honor the CancellationToken passed to it
+ // and MUST NOT close over any outer CancellationToken. On a grace retry attempt
+ // this will be a fresh, bounded-lifetime token decoupled from the caller's
+ // (already cancelled) token. Closing over the outer token re-introduces the
+ // defect this helper is designed to fix.
+ bool graceAttempted = false;
+ int attemptCount = 0;
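+ // Loop shape: (1) run the operation; (2) on exception, ask the retry policy;
+ // (3) while the caller's token is still live, honor any backoff and go around again;
+ // (4) once the caller's token has tripped, fall through to the single bounded
+ // grace attempt below and never loop again.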
+ while (true)
+ {
+ if (++attemptCount > MaxAttemptsHardCap)
+ {
+ DefaultTrace.TraceError(
+ "MetadataRetryHelper: exceeded hard attempt cap ({0}). Surfacing last exception.",
+ MaxAttemptsHardCap);
+ throw new InvalidOperationException(
+ $"MetadataRetryHelper exceeded the defensive attempt cap of {MaxAttemptsHardCap}. " +
+ "This indicates a misconfigured retry policy that returns ShouldRetry=true indefinitely.");
+ }
+
+ ExceptionDispatchInfo capturedException;
+ try
+ {
+ return await operation(cancellationToken).ConfigureAwait(false);
+ }
+ catch (Exception ex)
+ {
+ capturedException = ExceptionDispatchInfo.Capture(ex);
+ }
+
+ // Always consult the retry policy BEFORE honoring caller cancellation.
+ // This is the correctness fix: BackoffRetryUtility honors cancellation
+ // first, which silently swallows cross-region failover decisions for
+ // metadata reads when the caller's timeout trips during the HTTP
+ // timeout policy escalation.
+ ShouldRetryResult shouldRetry;
+ try
+ {
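+ // CancellationToken.None: the failover decision itself must not be preempted
+ // by the caller's (possibly already cancelled) token.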
+ shouldRetry = await retryPolicy
+ .ShouldRetryAsync(capturedException.SourceException, CancellationToken.None)
+ .ConfigureAwait(false);
+ }
+ catch (Exception policyException)
+ {
+ DefaultTrace.TraceWarning(
+ "MetadataRetryHelper: retry policy threw while evaluating exception {0}: {1}",
+ capturedException.SourceException.GetType().Name,
+ policyException.Message);
+ capturedException.Throw();
+ throw; // unreachable
+ }
+
+ if (shouldRetry == null || !shouldRetry.ShouldRetry)
+ {
+ // Honor ShouldRetryResult.ExceptionToThrow if the policy has specified a
+ // wrapper/translated exception (matches BackoffRetryUtility.ThrowIfDoneTrying).
+ // When the policy hands back the same reference as the original exception,
+ // rethrow via the captured ExceptionDispatchInfo so the original stack trace
+ // is preserved (mirrors ShouldRetryResult.ThrowIfDoneTrying).
+ if (shouldRetry?.ExceptionToThrow != null)
+ {
+ if (shouldRetry.ExceptionToThrow == capturedException.SourceException)
+ {
+ capturedException.Throw();
+ }
+
+ throw shouldRetry.ExceptionToThrow;
+ }
+
+ capturedException.Throw();
+ }
+
+ // If the caller has not cancelled, honor any backoff and continue the loop.
+ if (!cancellationToken.IsCancellationRequested)
+ {
+ if (shouldRetry.BackoffTime > TimeSpan.Zero)
+ {
+ try
+ {
+ await Task.Delay(shouldRetry.BackoffTime, cancellationToken).ConfigureAwait(false);
+ }
+ catch (OperationCanceledException)
+ {
+ // Caller cancelled during backoff. Fall through — we will
+ // attempt one bounded cross-region retry below.
+ }
+ }
+
+ if (!cancellationToken.IsCancellationRequested)
+ {
+ continue;
+ }
+ }
+
+ // Caller token is cancelled. Grant a single bounded grace window for
+ // a cross-region retry so availability-critical failover is not
+ // silently preempted. The grace token is passed to the operation so
+ // the underlying HTTP call is not itself preempted by the caller's
+ // already-cancelled token. Subsequent cancellations collapse to the
+ // original exception so we do not extend the caller's timeout unbounded.
+ if (graceAttempted || crossRegionRetryGrace <= TimeSpan.Zero)
+ {
+ DefaultTrace.TraceInformation(
+ "MetadataRetryHelper: caller token cancelled; cross-region retry grace already used or disabled. Surfacing original exception.");
+ capturedException.Throw();
+ }
+
+ graceAttempted = true;
+ DefaultTrace.TraceVerbose(
+ "MetadataRetryHelper: caller token cancelled; granting {0}ms grace for one cross-region metadata retry.",
+ (int)crossRegionRetryGrace.TotalMilliseconds);
+
+ using (CancellationTokenSource graceCts = new CancellationTokenSource(crossRegionRetryGrace))
+ {
+ try
+ {
+ return await ExecuteSingleAttemptWithGraceAsync(
+ operation,
+ shouldRetry.BackoffTime,
+ graceCts.Token).ConfigureAwait(false);
+ }
+ catch (OperationCanceledException) when (graceCts.IsCancellationRequested)
+ {
+ // Grace window expired before the cross-region attempt completed.
+ // Surface the original failure rather than the grace-timeout.
+ DefaultTrace.TraceWarning(
+ "MetadataRetryHelper: grace window ({0}ms) expired during cross-region retry. Surfacing original exception.",
+ (int)crossRegionRetryGrace.TotalMilliseconds);
+ capturedException.Throw();
+ throw; // unreachable
+ }
+ catch (Exception graceException)
+ {
+ // Cross-region attempt itself failed. Surface the ORIGINAL exception so
+ // callers see the pre-failover failure mode, not the grace-region failure.
+ DefaultTrace.TraceWarning(
+ "MetadataRetryHelper: grace-region retry failed with {0}: {1}. Surfacing original exception.",
+ graceException.GetType().Name,
+ graceException.Message);
+ capturedException.Throw();
+ throw; // unreachable
+ }
+ }
+ }
+ }
+
+ private static async Task<TResult> ExecuteSingleAttemptWithGraceAsync<TResult>(
+ Func<CancellationToken, Task<TResult>> operation,
+ TimeSpan backoff,
+ CancellationToken graceToken)
+ {
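+ // Any policy-requested backoff is awaited against the grace token, so the wait
+ // itself cannot outlive the grace window; the attempt then runs on the same token.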
+ if (backoff > TimeSpan.Zero)
+ {
+ await Task.Delay(backoff, graceToken).ConfigureAwait(false);
+ }
+
+ graceToken.ThrowIfCancellationRequested();
+ return await operation(graceToken).ConfigureAwait(false);
+ }
+ }
+}
diff --git a/Microsoft.Azure.Cosmos/src/Routing/ClientCollectionCache.cs b/Microsoft.Azure.Cosmos/src/Routing/ClientCollectionCache.cs
index cc8b5a1bb6..b05167053b 100644
--- a/Microsoft.Azure.Cosmos/src/Routing/ClientCollectionCache.cs
+++ b/Microsoft.Azure.Cosmos/src/Routing/ClientCollectionCache.cs
@@ -53,15 +53,16 @@ protected override Task<ContainerProperties> GetByRidAsync(string apiVersion,
cancellationToken.ThrowIfCancellationRequested();
IDocumentClientRetryPolicy retryPolicyInstance = new ClearingSessionContainerClientRetryPolicy(
this.sessionContainer, this.retryPolicy.GetRequestPolicy());
- return TaskHelper.InlineIfPossible(
- () => this.ReadCollectionAsync(
- PathsHelper.GeneratePath(ResourceType.Collection, collectionRid, false),
+ return TaskHelper.RunInlineIfNeededAsync(
+ () => MetadataRetryHelper.ExecuteAsync(
+ (ct) => this.ReadCollectionAsync(
+ PathsHelper.GeneratePath(ResourceType.Collection, collectionRid, false),
+ retryPolicyInstance,
+ trace,
+ clientSideRequestStatistics,
+ ct),
retryPolicyInstance,
- trace,
- clientSideRequestStatistics,
- cancellationToken),
- retryPolicyInstance,
- cancellationToken);
+ cancellationToken));
}
protected override Task<ContainerProperties> GetByNameAsync(string apiVersion,
@@ -73,11 +74,12 @@ protected override Task<ContainerProperties> GetByNameAsync(string apiVersion,
cancellationToken.ThrowIfCancellationRequested();
IDocumentClientRetryPolicy retryPolicyInstance = new ClearingSessionContainerClientRetryPolicy(
this.sessionContainer, this.retryPolicy.GetRequestPolicy());
- return TaskHelper.InlineIfPossible(
- () => this.ReadCollectionAsync(
- resourceAddress, retryPolicyInstance, trace, clientSideRequestStatistics, cancellationToken),
- retryPolicyInstance,
- cancellationToken);
+ return TaskHelper.RunInlineIfNeededAsync(
+ () => MetadataRetryHelper.ExecuteAsync(
+ (ct) => this.ReadCollectionAsync(
+ resourceAddress, retryPolicyInstance, trace, clientSideRequestStatistics, ct),
+ retryPolicyInstance,
+ cancellationToken));
}
internal override Task<ContainerProperties> ResolveByNameAsync(
diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/MetadataRetryHelperTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/MetadataRetryHelperTests.cs
new file mode 100644
index 0000000000..a513a22af6
--- /dev/null
+++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/MetadataRetryHelperTests.cs
@@ -0,0 +1,608 @@
+//------------------------------------------------------------
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//------------------------------------------------------------
+
+namespace Microsoft.Azure.Cosmos.Tests
+{
+ using System;
+ using System.Net;
+ using System.Threading;
+ using System.Threading.Tasks;
+ using Microsoft.Azure.Documents;
+ using Microsoft.VisualStudio.TestTools.UnitTesting;
+ using Moq;
+
+ /// <summary>
+ /// Unit tests for MetadataRetryHelper.
+ ///
+ /// These tests reproduce the control-plane metadata retry defect described in
+ /// PR #5787 — when a caller's cancellation token trips during the control-plane
+ /// HTTP timeout escalation against an unhealthy region, the cross-region retry
+ /// that would otherwise execute is preempted.
+ ///
+ /// Each test is written so that the pre-fix implementation (TaskHelper.InlineIfPossible
+ /// → BackoffRetryUtility) would FAIL the assertion, while the fixed implementation
+ /// (MetadataRetryHelper) passes.
+ /// </summary>
+ [TestClass]
+ public class MetadataRetryHelperTests
+ {
+ [TestMethod]
+ [Owner("ntripician")]
+ public async Task ExecuteAsync_SucceedsFirstAttempt_NoRetry()
+ {
+ Mock<IDocumentClientRetryPolicy> policy = new Mock<IDocumentClientRetryPolicy>();
+ int attempts = 0;
+
+ int result = await MetadataRetryHelper.ExecuteAsync(
+ (_) =>
+ {
+ attempts++;
+ return Task.FromResult(42);
+ },
+ policy.Object,
+ CancellationToken.None);
+
+ Assert.AreEqual(42, result);
+ Assert.AreEqual(1, attempts);
+ policy.Verify(
+ p => p.ShouldRetryAsync(It.IsAny<Exception>(), It.IsAny<CancellationToken>()),
+ Times.Never);
+ }
+
+ /// <summary>
+ /// Exercises the classic retry loop: exception, policy says retry, second attempt
+ /// succeeds. No cancellation involved.
+ /// </summary>
+ [TestMethod]
+ [Owner("ntripician")]
+ public async Task ExecuteAsync_RetriesOnTransient_WhenPolicyAllows()
+ {
+ Mock<IDocumentClientRetryPolicy> policy = new Mock<IDocumentClientRetryPolicy>();
+ policy
+ .Setup(p => p.ShouldRetryAsync(It.IsAny<Exception>(), It.IsAny<CancellationToken>()))
+ .ReturnsAsync(ShouldRetryResult.RetryAfter(TimeSpan.Zero));
+
+ int attempts = 0;
+
+ int result = await MetadataRetryHelper.ExecuteAsync(
+ (_) =>
+ {
+ attempts++;
+ if (attempts == 1)
+ {
+ throw new DocumentClientException(
+ "transient",
+ HttpStatusCode.ServiceUnavailable,
+ SubStatusCodes.Unknown);
+ }
+
+ return Task.FromResult(7);
+ },
+ policy.Object,
+ CancellationToken.None);
+
+ Assert.AreEqual(7, result);
+ Assert.AreEqual(2, attempts);
+ }
+
+ /// <summary>
+ /// REPRO / FIX VERIFICATION: the caller's cancellation token is already cancelled
+ /// at the time the first attempt throws a 503. The retry policy says "retry"
+ /// (cross-region failover). The fix must still execute one cross-region attempt
+ /// against the detached grace token.
+ ///
+ /// Pre-fix behavior (BackoffRetryUtility): throws OperationCanceledException
+ /// without consulting the policy, or consults the policy but then immediately
+ /// honors the cancelled token before the retry attempt. Either way, attempts == 1.
+ ///
+ /// Post-fix behavior (MetadataRetryHelper): attempts == 2 and the second attempt
+ /// returns successfully.
+ /// </summary>
+ [TestMethod]
+ [Owner("ntripician")]
+ public async Task ExecuteAsync_CrossRegionRetryExecutes_EvenWhenCallerTokenCancelled()
+ {
+ Mock<IDocumentClientRetryPolicy> policy = new Mock<IDocumentClientRetryPolicy>();
+ policy
+ .Setup(p => p.ShouldRetryAsync(It.IsAny<Exception>(), It.IsAny<CancellationToken>()))
+ .ReturnsAsync(ShouldRetryResult.RetryAfter(TimeSpan.Zero));
+
+ using CancellationTokenSource cts = new CancellationTokenSource();
+ cts.Cancel();
+
+ int attempts = 0;
+ CancellationToken secondAttemptToken = default;
+
+ int result = await MetadataRetryHelper.ExecuteAsync(
+ (ct) =>
+ {
+ attempts++;
+ if (attempts == 1)
+ {
+ throw new DocumentClientException(
+ "503 from unhealthy region",
+ HttpStatusCode.ServiceUnavailable,
+ SubStatusCodes.Unknown);
+ }
+
+ // Capture the token observed by the grace attempt so the detached-token
+ // contract can be asserted below.
+ secondAttemptToken = ct;
+ return Task.FromResult(100);
+ },
+ policy.Object,
+ MetadataRetryHelper.DefaultCrossRegionRetryGrace,
+ cts.Token);
+
+ Assert.AreEqual(100, result, "Cross-region retry should have executed despite cancelled caller token.");
+ Assert.AreEqual(2, attempts, "Exactly one cross-region retry attempt should run on the grace token.");
+
+ // Pin the detached-grace-token contract: the grace attempt MUST NOT receive the
+ // caller's already-cancelled token. A future refactor that accidentally passes
+ // cancellationToken (instead of graceCts.Token) would silently reintroduce the
+ // defect this helper was built to fix — this assertion catches that regression.
+ Assert.IsFalse(
+ secondAttemptToken.IsCancellationRequested,
+ "Grace attempt must receive a fresh, non-cancelled token decoupled from the caller's cancelled token.");
+ Assert.AreNotEqual(
+ cts.Token,
+ secondAttemptToken,
+ "Grace attempt token must not be the caller's cancellation token.");
+ }
+
+ /// <summary>
+ /// Verifies the bounded grace: if the caller's token is cancelled AND the cross-region
+ /// retry attempt itself also fails with a retriable error, the helper does not loop
+ /// indefinitely. It grants at most one grace window and then surfaces the original
+ /// exception (not the grace-timeout exception).
+ /// </summary>
+ [TestMethod]
+ [Owner("ntripician")]
+ public async Task ExecuteAsync_GraceIsBounded_SurfacesOriginalExceptionOnSecondFailure()
+ {
+ Mock<IDocumentClientRetryPolicy> policy = new Mock<IDocumentClientRetryPolicy>();
+ policy
+ .Setup(p => p.ShouldRetryAsync(It.IsAny<Exception>(), It.IsAny<CancellationToken>()))
+ .ReturnsAsync(ShouldRetryResult.RetryAfter(TimeSpan.Zero));
+
+ using CancellationTokenSource cts = new CancellationTokenSource();
+ cts.Cancel();
+
+ int attempts = 0;
+
+ DocumentClientException thrown = await Assert.ThrowsExceptionAsync<DocumentClientException>(
+ () => MetadataRetryHelper.ExecuteAsync<int>(
+ (_) =>
+ {
+ attempts++;
+ throw new DocumentClientException(
+ $"503 attempt {attempts}",
+ HttpStatusCode.ServiceUnavailable,
+ SubStatusCodes.Unknown);
+ },
+ policy.Object,
+ MetadataRetryHelper.DefaultCrossRegionRetryGrace,
+ cts.Token));
+
+ Assert.AreEqual(2, attempts, "Exactly one grace-window retry should run; then original is surfaced.");
+ StringAssert.Contains(thrown.Message, "503 attempt 1");
+ }
+
+ /// <summary>
+ /// Caller token cancelled and the retry policy says "no retry" (e.g. 404, throttling exhausted).
+ /// The helper must surface the original exception, not OperationCanceledException.
+ /// </summary>
+ [TestMethod]
+ [Owner("ntripician")]
+ public async Task ExecuteAsync_PolicyDeniesRetry_SurfacesOriginalExceptionEvenWhenCancelled()
+ {
+ Mock<IDocumentClientRetryPolicy> policy = new Mock<IDocumentClientRetryPolicy>();
+ policy
+ .Setup(p => p.ShouldRetryAsync(It.IsAny<Exception>(), It.IsAny<CancellationToken>()))
+ .ReturnsAsync(ShouldRetryResult.NoRetry());
+
+ using CancellationTokenSource cts = new CancellationTokenSource();
+ cts.Cancel();
+
+ int attempts = 0;
+
+ DocumentClientException thrown = await Assert.ThrowsExceptionAsync<DocumentClientException>(
+ () => MetadataRetryHelper.ExecuteAsync<int>(
+ (_) =>
+ {
+ attempts++;
+ throw new DocumentClientException(
+ "404",
+ HttpStatusCode.NotFound,
+ SubStatusCodes.Unknown);
+ },
+ policy.Object,
+ MetadataRetryHelper.DefaultCrossRegionRetryGrace,
+ cts.Token));
+
+ Assert.AreEqual(1, attempts);
+ StringAssert.Contains(thrown.Message, "404");
+ }
+
+ /// <summary>
+ /// If the grace window is zero and the caller token is cancelled, the helper must
+ /// not attempt a cross-region retry. Original exception is surfaced on the first failure.
+ /// </summary>
+ [TestMethod]
+ [Owner("ntripician")]
+ public async Task ExecuteAsync_ZeroGrace_DoesNotRetryAfterCancellation()
+ {
+ Mock<IDocumentClientRetryPolicy> policy = new Mock<IDocumentClientRetryPolicy>();
+ policy
+ .Setup(p => p.ShouldRetryAsync(It.IsAny<Exception>(), It.IsAny<CancellationToken>()))
+ .ReturnsAsync(ShouldRetryResult.RetryAfter(TimeSpan.Zero));
+
+ using CancellationTokenSource cts = new CancellationTokenSource();
+ cts.Cancel();
+
+ int attempts = 0;
+
+ DocumentClientException thrown = await Assert.ThrowsExceptionAsync<DocumentClientException>(
+ () => MetadataRetryHelper.ExecuteAsync<int>(
+ (_) =>
+ {
+ attempts++;
+ throw new DocumentClientException(
+ "503",
+ HttpStatusCode.ServiceUnavailable,
+ SubStatusCodes.Unknown);
+ },
+ policy.Object,
+ TimeSpan.Zero,
+ cts.Token));
+
+ Assert.AreEqual(1, attempts);
+ StringAssert.Contains(thrown.Message, "503");
+ }
+
+ /// <summary>
+ /// If the grace window itself expires while the cross-region attempt is still in flight,
+ /// the helper surfaces the ORIGINAL underlying exception (not the grace-timeout OCE).
+ /// This prevents leaking internal cancellation semantics to callers.
+ /// </summary>
+ [TestMethod]
+ [Owner("ntripician")]
+ public async Task ExecuteAsync_GraceExpires_SurfacesOriginalException()
+ {
+ Mock<IDocumentClientRetryPolicy> policy = new Mock<IDocumentClientRetryPolicy>();
+ policy
+ .Setup(p => p.ShouldRetryAsync(It.IsAny<Exception>(), It.IsAny<CancellationToken>()))
+ .ReturnsAsync(ShouldRetryResult.RetryAfter(TimeSpan.Zero));
+
+ using CancellationTokenSource cts = new CancellationTokenSource();
+ cts.Cancel();
+
+ int attempts = 0;
+
+ DocumentClientException thrown = await Assert.ThrowsExceptionAsync<DocumentClientException>(
+ () => MetadataRetryHelper.ExecuteAsync(
+ async (ct) =>
+ {
+ attempts++;
+ if (attempts == 1)
+ {
+ throw new DocumentClientException(
+ "original 503",
+ HttpStatusCode.ServiceUnavailable,
+ SubStatusCodes.Unknown);
+ }
+
+ // Second attempt: observes the grace token and completes deterministically
+ // only when the grace CTS fires. Using TaskCompletionSource + ct.Register
+ // avoids the long Task.Delay worst-case on slow CI runners.
+ TaskCompletionSource<int> tcs = new TaskCompletionSource<int>(
+ TaskCreationOptions.RunContinuationsAsynchronously);
+ using (ct.Register(() => tcs.TrySetCanceled(ct)))
+ {
+ return await tcs.Task.ConfigureAwait(false);
+ }
+ },
+ policy.Object,
+ TimeSpan.FromMilliseconds(100),
+ cts.Token));
+
+ Assert.AreEqual(2, attempts);
+ StringAssert.Contains(thrown.Message, "original 503");
+ }
+
+ /// <summary>
+ /// Exercises the mid-backoff cancellation → grace transition. The caller's token
+ /// is live at the start of the first attempt (so the outer IsCancellationRequested
+ /// check on the backoff path is false), then trips during Task.Delay. The
+ /// OperationCanceledException from the delay is caught and the inner
+ /// IsCancellationRequested check routes execution into the bounded grace
+ /// window — which then succeeds.
+ ///
+ /// This is the most realistic production trigger for the grace path (caller's
+ /// ~36s timeout fires while the helper waits out a retry backoff after a 503 from
+ /// an unhealthy region). All other cancellation tests use a pre-cancelled token,
+ /// which short-circuits the outer check and never exercises the catch block or
+ /// the second cancellation check. A refactor that drops either guard would
+ /// regress one of these branches without any other test catching it.
+ /// </summary>
+ [TestMethod]
+ [Owner("ntripician")]
+ public async Task ExecuteAsync_CancellationDuringBackoff_TransitionsToGraceRetry()
+ {
+ Mock<IDocumentClientRetryPolicy> policy = new Mock<IDocumentClientRetryPolicy>();
+ policy
+ .Setup(p => p.ShouldRetryAsync(It.IsAny<Exception>(), It.IsAny<CancellationToken>()))
+ .ReturnsAsync(ShouldRetryResult.RetryAfter(TimeSpan.FromSeconds(5)));
+
+ using CancellationTokenSource cts = new CancellationTokenSource();
+ int attempts = 0;
+ CancellationToken graceAttemptToken = default;
+
+ int result = await MetadataRetryHelper.ExecuteAsync(
+ (ct) =>
+ {
+ attempts++;
+ if (attempts == 1)
+ {
+ // Token is live now (outer IsCancellationRequested check on the
+ // backoff path will be false). Schedule a cancellation that fires
+ // during Task.Delay(5s, cancellationToken) — this is the production
+ // case where the caller's timeout trips mid-backoff.
+ cts.CancelAfter(TimeSpan.FromMilliseconds(50));
+ throw new DocumentClientException(
+ "503 from unhealthy region",
+ HttpStatusCode.ServiceUnavailable,
+ SubStatusCodes.Unknown);
+ }
+
+ graceAttemptToken = ct;
+ return Task.FromResult(99);
+ },
+ policy.Object,
+ MetadataRetryHelper.DefaultCrossRegionRetryGrace,
+ cts.Token);
+
+ Assert.AreEqual(99, result, "Mid-backoff cancellation must transition to grace retry, not surface OCE.");
+ Assert.AreEqual(2, attempts, "Exactly one grace-window retry should run after mid-backoff cancellation.");
+
+ // The grace attempt must receive a fresh token decoupled from the caller's
+ // already-cancelled token. Same contract pinned by
+ // ExecuteAsync_CrossRegionRetryExecutes_EvenWhenCallerTokenCancelled, but
+ // re-asserted here because this path reaches the grace branch via the
+ // OperationCanceledException catch rather than the pre-cancelled outer check.
+ Assert.IsFalse(
+ graceAttemptToken.IsCancellationRequested,
+ "Grace attempt must receive a fresh, non-cancelled token after mid-backoff cancellation.");
+ Assert.AreNotEqual(
+ cts.Token,
+ graceAttemptToken,
+ "Grace attempt token must not be the caller's cancellation token.");
+ }
+
+ /// <summary>
+ /// COMPANION REPRO: exercises the legacy TaskHelper.InlineIfPossible +
+ /// BackoffRetryUtility path that ClientCollectionCache used prior to the
+ /// fix. It demonstrates that, under the same conditions where the fixed
+ /// MetadataRetryHelper succeeds with a cross-region retry
+ /// (ExecuteAsync_CrossRegionRetryExecutes_EvenWhenCallerTokenCancelled),
+ /// the legacy path throws OperationCanceledException without ever
+ /// executing the cross-region attempt that the retry policy requested.
+ ///
+ /// This test is the canonical proof of Defect A and ensures that if anyone
+ /// reverts the fix by calling TaskHelper.InlineIfPossible for a metadata
+ /// read in ClientCollectionCache, the bug surfaces here.
+ /// </summary>
+ [TestMethod]
+ [Owner("ntripician")]
+ public async Task LegacyBackoffRetryUtility_CrossRegionRetryIsPreempted_ByCancelledCallerToken()
+ {
+ Mock<IDocumentClientRetryPolicy> policy = new Mock<IDocumentClientRetryPolicy>();
+ policy
+ .Setup(p => p.ShouldRetryAsync(It.IsAny<Exception>(), It.IsAny<CancellationToken>()))
+ .ReturnsAsync(ShouldRetryResult.RetryAfter(TimeSpan.Zero));
+
+ using CancellationTokenSource cts = new CancellationTokenSource();
+ cts.Cancel();
+
+ int attempts = 0;
+
+ await Assert.ThrowsExceptionAsync<OperationCanceledException>(
+ () => TaskHelper.InlineIfPossible(
+ () =>
+ {
+ attempts++;
+ if (attempts == 1)
+ {
+ throw new DocumentClientException(
+ "503 from unhealthy region",
+ HttpStatusCode.ServiceUnavailable,
+ SubStatusCodes.Unknown);
+ }
+
+ return Task.FromResult(100);
+ },
+ policy.Object,
+ cts.Token));
+
+ Assert.IsTrue(
+ attempts <= 1,
+ $"Legacy BackoffRetryUtility path preempts cross-region retry once caller token is cancelled — " +
+ $"expected at most one attempt (possibly zero if ThrowIfCancellationRequested fires first), got {attempts}. " +
+ $"If this starts asserting on attempts==2, someone improved the shared retry utility to consult the policy " +
+ $"before honoring cancellation — in that case the bespoke MetadataRetryHelper may be removable.");
+ }
+
+ /// <summary>
+ /// Regression guard: when the very first operation attempt throws
+ /// OperationCanceledException (caller token already cancelled and the
+ /// inner HTTP call honors it immediately), the helper must still consult the retry
+ /// policy before surfacing the OCE. If the policy returns NoRetry, the OCE is
+ /// surfaced. This test pins the "policy is consulted FIRST" invariant.
+ /// </summary>
+ [TestMethod]
+ [Owner("ntripician")]
+ public async Task ExecuteAsync_FirstAttemptOCE_PolicyIsConsultedBeforeSurfacing()
+ {
+ Mock<IDocumentClientRetryPolicy> policy = new Mock<IDocumentClientRetryPolicy>();
+ policy
+ .Setup(p => p.ShouldRetryAsync(It.IsAny<Exception>(), It.IsAny<CancellationToken>()))
+ .ReturnsAsync(ShouldRetryResult.NoRetry());
+
+ using CancellationTokenSource cts = new CancellationTokenSource();
+ cts.Cancel();
+
+ int attempts = 0;
+
+ await Assert.ThrowsExceptionAsync<OperationCanceledException>(
+ () => MetadataRetryHelper.ExecuteAsync(
+ (ct) =>
+ {
+ attempts++;
+ ct.ThrowIfCancellationRequested();
+ return Task.FromResult(42);
+ },
+ policy.Object,
+ cts.Token));
+
+ Assert.AreEqual(1, attempts);
+ policy.Verify(
+ p => p.ShouldRetryAsync(It.IsAny<Exception>(), It.IsAny<CancellationToken>()),
+ Times.AtLeastOnce,
+ "Retry policy must be consulted even on first-attempt OCE. If not, the helper's core invariant is broken.");
+ }
+
+ /// <summary>
+ /// Regression guard for finding #5: when the retry policy returns ShouldRetryResult.NoRetry
+ /// carrying a policy-specified translated exception, the helper must surface
+ /// THAT exception (matching BackoffRetryUtility.ThrowIfDoneTrying), not the
+ /// original captured exception. If this invariant is broken, retry policies that rewrite
+ /// error types (e.g. NonRetriableInvalidPartitionExceptionRetryPolicy translating
+ /// a gone/invalid-partition into a NotFoundException) would silently diverge in
+ /// behavior when wired through MetadataRetryHelper.
+ /// </summary>
+ [TestMethod]
+ [Owner("ntripician")]
+ public async Task ExecuteAsync_PolicySpecifiesExceptionToThrow_SurfacesTranslatedException()
+ {
+ InvalidOperationException translated = new InvalidOperationException("policy-translated");
+ Mock<IDocumentClientRetryPolicy> policy = new Mock<IDocumentClientRetryPolicy>();
+ policy
+ .Setup(p => p.ShouldRetryAsync(It.IsAny<Exception>(), It.IsAny<CancellationToken>()))
+ .ReturnsAsync(ShouldRetryResult.NoRetry(translated));
+
+ int attempts = 0;
+
+ InvalidOperationException thrown = await Assert.ThrowsExceptionAsync<InvalidOperationException>(
+ () => MetadataRetryHelper.ExecuteAsync<int>(
+ (_) =>
+ {
+ attempts++;
+ throw new DocumentClientException(
+ "original 503",
+ HttpStatusCode.ServiceUnavailable,
+ SubStatusCodes.Unknown);
+ },
+ policy.Object,
+ CancellationToken.None));
+
+ Assert.AreEqual(1, attempts);
+ Assert.AreSame(
+ translated,
+ thrown,
+ "Helper must surface ShouldRetryResult.ExceptionToThrow (policy-translated), not the original exception.");
+ }
+
+ /// <summary>
+ /// Negative grace windows are invalid and must be rejected at the API boundary rather
+ /// than silently collapsed to "disabled". This prevents subtle misuse.
+ /// </summary>
+ [TestMethod]
+ [Owner("ntripician")]
+ public async Task ExecuteAsync_NegativeGrace_Throws()
+ {
+ Mock<IDocumentClientRetryPolicy> policy = new Mock<IDocumentClientRetryPolicy>();
+
+ await Assert.ThrowsExceptionAsync<ArgumentOutOfRangeException>(
+ () => MetadataRetryHelper.ExecuteAsync(
+ (ct) => Task.FromResult(1),
+ policy.Object,
+ TimeSpan.FromSeconds(-1),
+ CancellationToken.None));
+ }
+
+ /// <summary>
+ /// Regression guard for the defensive MaxAttemptsHardCap. A misconfigured retry
+ /// policy that always returns ShouldRetry=true would otherwise
+ /// spin indefinitely. The helper must terminate with an InvalidOperationException
+ /// after the hard cap (20) is exceeded.
+ /// </summary>
+ [TestMethod]
+ [Owner("ntripician")]
+ public async Task ExecuteAsync_ExceedsMaxAttemptsHardCap_ThrowsInvalidOperationException()
+ {
+ Mock<IDocumentClientRetryPolicy> policy = new Mock<IDocumentClientRetryPolicy>();
+ policy
+ .Setup(p => p.ShouldRetryAsync(It.IsAny<Exception>(), It.IsAny<CancellationToken>()))
+ .ReturnsAsync(ShouldRetryResult.RetryAfter(TimeSpan.Zero));
+
+ int attempts = 0;
+
+ InvalidOperationException thrown = await Assert.ThrowsExceptionAsync<InvalidOperationException>(
+ () => MetadataRetryHelper.ExecuteAsync<int>(
+ (_) =>
+ {
+ attempts++;
+ throw new DocumentClientException(
+ "always fail",
+ HttpStatusCode.ServiceUnavailable,
+ SubStatusCodes.Unknown);
+ },
+ policy.Object,
+ CancellationToken.None));
+
+ Assert.AreEqual(20, attempts, "Helper must stop at the defensive attempt cap of 20.");
+ StringAssert.Contains(thrown.Message, "defensive attempt cap");
+ }
+
+ /// <summary>
+ /// Regression guard: when the retry policy itself throws while evaluating
+ /// ShouldRetryAsync, the helper must surface the ORIGINAL operation
+ /// exception (with its stack trace preserved) rather than the policy's
+ /// internal failure. The policy error is logged but should not be visible
+ /// to callers — diagnosing a metadata read failure should never be
+ /// distorted by a buggy retry policy.
+ /// </summary>
+ [TestMethod]
+ [Owner("ntripician")]
+ public async Task ExecuteAsync_PolicyThrowsDuringShouldRetry_SurfacesOriginalException()
+ {
+ Mock<IDocumentClientRetryPolicy> policy = new Mock<IDocumentClientRetryPolicy>();
+ policy
+ .Setup(p => p.ShouldRetryAsync(It.IsAny<Exception>(), It.IsAny<CancellationToken>()))
+ .ThrowsAsync(new InvalidOperationException("policy internal error"));
+
+ int attempts = 0;
+
+ DocumentClientException thrown = await Assert.ThrowsExceptionAsync<DocumentClientException>(
+ () => MetadataRetryHelper.ExecuteAsync<int>(
+ (_) =>
+ {
+ attempts++;
+ throw new DocumentClientException(
+ "original 503",
+ HttpStatusCode.ServiceUnavailable,
+ SubStatusCodes.Unknown);
+ },
+ policy.Object,
+ CancellationToken.None));
+
+ Assert.AreEqual(1, attempts);
+ StringAssert.Contains(
+ thrown.Message,
+ "original 503",
+ "Helper must surface the ORIGINAL operation exception, not the policy's internal failure.");
+ }
+ }
+}
+