From d86c9f650f591ca37347b87b88139218ae78bd07 Mon Sep 17 00:00:00 2001 From: Nick Tripician Date: Wed, 22 Apr 2026 13:05:37 -0700 Subject: [PATCH 1/6] Metadata retry: consult retry policy before honoring caller cancellation ClientCollectionCache's cold-path ReadCollection metadata call goes through TaskHelper.InlineIfPossible -> BackoffRetryUtility.ExecuteAsync, which evaluates the caller's CancellationToken before invoking IDocumentClientRetryPolicy.ShouldRetryAsync. When the control-plane HTTP timeout policy burns 0.5s/5s/30s against an unhealthy region and the caller's ~36.5s timeout trips, the cross-region failover that ClientRetryPolicy would otherwise execute is silently preempted and the operation surfaces OperationCanceledException instead of retrying to the next preferred region. Introduces MetadataRetryHelper with a bespoke retry loop that: - Always consults the retry policy on exception, before honoring the caller token. - Grants a single bounded grace window (default 10s) when the caller token is already cancelled so availability-critical cross-region failover can execute. - Surfaces the original exception (not the grace-timeout OCE) if the grace attempt expires or fails. Wires the helper into ClientCollectionCache.GetByRidAsync / GetByNameAsync. PartitionKeyRangeCache is unaffected (its call does not pass a caller cancellation token into BackoffRetryUtility). Adds MetadataRetryHelperTests including a companion regression test (LegacyBackoffRetryUtility_CrossRegionRetryIsPreempted_ByCancelledCallerToken) that pins the preempted behavior of the shared utility so that if it is ever fixed upstream the bespoke helper becomes removable. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../src/MetadataRetryHelper.cs | 185 ++++++++++ .../src/Routing/ClientCollectionCache.cs | 12 +- .../MetadataRetryHelperTests.cs | 347 ++++++++++++++++++ 3 files changed, 538 insertions(+), 6 deletions(-) create mode 100644 Microsoft.Azure.Cosmos/src/MetadataRetryHelper.cs create mode 100644 Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/MetadataRetryHelperTests.cs diff --git a/Microsoft.Azure.Cosmos/src/MetadataRetryHelper.cs b/Microsoft.Azure.Cosmos/src/MetadataRetryHelper.cs new file mode 100644 index 0000000000..0a31570244 --- /dev/null +++ b/Microsoft.Azure.Cosmos/src/MetadataRetryHelper.cs @@ -0,0 +1,185 @@ +//------------------------------------------------------------ +// Copyright (c) Microsoft Corporation. All rights reserved. +//------------------------------------------------------------ + +namespace Microsoft.Azure.Cosmos +{ + using System; + using System.Runtime.ExceptionServices; + using System.Threading; + using System.Threading.Tasks; + using Microsoft.Azure.Cosmos.Core.Trace; + using Microsoft.Azure.Documents; + + /// + /// Retry helper for internal metadata (control-plane) reads such as + /// .ReadCollectionAsync. + /// + /// The shared BackoffRetryUtility evaluates the caller's + /// before consulting ShouldRetryAsync. If the + /// caller's timeout trips during the control-plane HTTP timeout escalation + /// (0.5s → 5s → 30s ≈ 36s) for an unhealthy region, the cross-region failover + /// that ClientRetryPolicy would otherwise execute is preempted and the + /// operation surfaces an . + /// + /// This helper runs a bounded retry loop that always consults the retry policy + /// on exception and, when the caller's token has already been cancelled, grants + /// a small bounded grace window so the cross-region failover attempt can run. + /// The grace window is intentionally short — the goal is best-effort availability, + /// not unbounded timeout extension. If the retry attempt does not complete within + /// the grace window, the original exception is rethrown. + /// + internal static class MetadataRetryHelper + { + /// + /// Maximum additional time granted to a metadata read after the caller's + /// cancellation token has tripped so that cross-region retry can execute. + /// + internal static readonly TimeSpan DefaultCrossRegionRetryGrace = TimeSpan.FromSeconds(10); + + public static Task ExecuteAsync( + Func> operation, + IDocumentClientRetryPolicy retryPolicy, + CancellationToken cancellationToken) + { + return ExecuteAsync(operation, retryPolicy, DefaultCrossRegionRetryGrace, cancellationToken); + } + + public static async Task ExecuteAsync( + Func> operation, + IDocumentClientRetryPolicy retryPolicy, + TimeSpan crossRegionRetryGrace, + CancellationToken cancellationToken) + { + if (operation == null) + { + throw new ArgumentNullException(nameof(operation)); + } + + if (retryPolicy == null) + { + throw new ArgumentNullException(nameof(retryPolicy)); + } + + bool graceTokenUsed = false; + while (true) + { + ExceptionDispatchInfo capturedException; + try + { + return await operation(cancellationToken).ConfigureAwait(false); + } + catch (Exception ex) + { + capturedException = ExceptionDispatchInfo.Capture(ex); + } + + // Always consult the retry policy BEFORE honoring caller cancellation. + // This is the correctness fix: BackoffRetryUtility honors cancellation + // first, which silently swallows cross-region failover decisions for + // metadata reads when the caller's timeout trips during the HTTP + // timeout policy escalation. + ShouldRetryResult shouldRetry; + try + { + shouldRetry = await retryPolicy + .ShouldRetryAsync(capturedException.SourceException, CancellationToken.None) + .ConfigureAwait(false); + } + catch (Exception policyException) + { + DefaultTrace.TraceWarning( + "MetadataRetryHelper: retry policy threw while evaluating exception {0}: {1}", + capturedException.SourceException.GetType().Name, + policyException.Message); + capturedException.Throw(); + throw; // unreachable + } + + if (shouldRetry == null || !shouldRetry.ShouldRetry) + { + capturedException.Throw(); + } + + // If the caller has not cancelled, honor any backoff and continue the loop. + if (!cancellationToken.IsCancellationRequested) + { + if (shouldRetry.BackoffTime > TimeSpan.Zero) + { + try + { + await Task.Delay(shouldRetry.BackoffTime, cancellationToken).ConfigureAwait(false); + } + catch (OperationCanceledException) + { + // Caller cancelled during backoff. Fall through — we will + // attempt one bounded cross-region retry below. + } + } + + if (!cancellationToken.IsCancellationRequested) + { + continue; + } + } + + // Caller token is cancelled. Grant a single bounded grace window for + // a cross-region retry so availability-critical failover is not + // silently preempted. The grace token is passed to the operation so + // the underlying HTTP call is not itself preempted by the caller's + // already-cancelled token. Subsequent cancellations collapse to the + // original exception so we do not extend the caller's timeout unbounded. + if (graceTokenUsed || crossRegionRetryGrace <= TimeSpan.Zero) + { + DefaultTrace.TraceInformation( + "MetadataRetryHelper: caller token cancelled; cross-region retry grace already used or disabled. Surfacing original exception."); + capturedException.Throw(); + } + + graceTokenUsed = true; + DefaultTrace.TraceInformation( + "MetadataRetryHelper: caller token cancelled; granting {0}ms grace for one cross-region metadata retry.", + (int)crossRegionRetryGrace.TotalMilliseconds); + + using (CancellationTokenSource graceCts = new CancellationTokenSource(crossRegionRetryGrace)) + { + try + { + return await ExecuteSingleAttemptWithGraceAsync( + operation, + shouldRetry.BackoffTime, + graceCts.Token).ConfigureAwait(false); + } + catch (OperationCanceledException) when (graceCts.IsCancellationRequested) + { + // Grace window expired before the cross-region attempt completed. + // Surface the original failure rather than the grace-timeout. + capturedException.Throw(); + throw; // unreachable + } + catch (Exception) + { + // Cross-region attempt itself failed. Surface the ORIGINAL exception so + // callers see the pre-failover failure mode, not the grace-region failure. + capturedException.Throw(); + throw; // unreachable + } + } + } + } + + private static async Task ExecuteSingleAttemptWithGraceAsync( + Func> operation, + TimeSpan backoff, + CancellationToken graceToken) + { + if (backoff > TimeSpan.Zero) + { + await Task.Delay(backoff, graceToken).ConfigureAwait(false); + } + + graceToken.ThrowIfCancellationRequested(); + return await operation(graceToken).ConfigureAwait(false); + } + } +} diff --git a/Microsoft.Azure.Cosmos/src/Routing/ClientCollectionCache.cs b/Microsoft.Azure.Cosmos/src/Routing/ClientCollectionCache.cs index cc8b5a1bb6..f674a675a3 100644 --- a/Microsoft.Azure.Cosmos/src/Routing/ClientCollectionCache.cs +++ b/Microsoft.Azure.Cosmos/src/Routing/ClientCollectionCache.cs @@ -53,13 +53,13 @@ protected override Task GetByRidAsync(string apiVersion, cancellationToken.ThrowIfCancellationRequested(); IDocumentClientRetryPolicy retryPolicyInstance = new ClearingSessionContainerClientRetryPolicy( this.sessionContainer, this.retryPolicy.GetRequestPolicy()); - return TaskHelper.InlineIfPossible( - () => this.ReadCollectionAsync( + return MetadataRetryHelper.ExecuteAsync( + (ct) => this.ReadCollectionAsync( PathsHelper.GeneratePath(ResourceType.Collection, collectionRid, false), retryPolicyInstance, trace, clientSideRequestStatistics, - cancellationToken), + ct), retryPolicyInstance, cancellationToken); } @@ -73,9 +73,9 @@ protected override Task GetByNameAsync(string apiVersion, cancellationToken.ThrowIfCancellationRequested(); IDocumentClientRetryPolicy retryPolicyInstance = new ClearingSessionContainerClientRetryPolicy( this.sessionContainer, this.retryPolicy.GetRequestPolicy()); - return TaskHelper.InlineIfPossible( - () => this.ReadCollectionAsync( - resourceAddress, retryPolicyInstance, trace, clientSideRequestStatistics, cancellationToken), + return MetadataRetryHelper.ExecuteAsync( + (ct) => this.ReadCollectionAsync( + resourceAddress, retryPolicyInstance, trace, clientSideRequestStatistics, ct), retryPolicyInstance, cancellationToken); } diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/MetadataRetryHelperTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/MetadataRetryHelperTests.cs new file mode 100644 index 0000000000..e04ca29c5c --- /dev/null +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/MetadataRetryHelperTests.cs @@ -0,0 +1,347 @@ +//------------------------------------------------------------ +// Copyright (c) Microsoft Corporation. All rights reserved. +//------------------------------------------------------------ + +namespace Microsoft.Azure.Cosmos.Tests +{ + using System; + using System.Net; + using System.Threading; + using System.Threading.Tasks; + using Microsoft.Azure.Documents; + using Microsoft.VisualStudio.TestTools.UnitTesting; + using Moq; + + /// + /// Unit tests for . + /// + /// These tests reproduce the control-plane metadata retry defect described in + /// PR #5787 — when a caller's cancellation token trips during the control-plane + /// HTTP timeout escalation against an unhealthy region, the cross-region retry + /// that would otherwise execute is preempted. + /// + /// Each test is written so that the pre-fix implementation (TaskHelper.InlineIfPossible + /// → BackoffRetryUtility) would FAIL the assertion, while the fixed implementation + /// (MetadataRetryHelper) passes. + /// + [TestClass] + public class MetadataRetryHelperTests + { + [TestMethod] + [Owner("ntripician")] + public async Task ExecuteAsync_SucceedsFirstAttempt_NoRetry() + { + Mock policy = new Mock(); + int attempts = 0; + + int result = await MetadataRetryHelper.ExecuteAsync( + (_) => + { + attempts++; + return Task.FromResult(42); + }, + policy.Object, + CancellationToken.None); + + Assert.AreEqual(42, result); + Assert.AreEqual(1, attempts); + policy.Verify( + p => p.ShouldRetryAsync(It.IsAny(), It.IsAny()), + Times.Never); + } + + /// + /// Exercises the classic retry loop: exception, policy says retry, second attempt + /// succeeds. No cancellation involved. + /// + [TestMethod] + [Owner("ntripician")] + public async Task ExecuteAsync_RetriesOnTransient_WhenPolicyAllows() + { + Mock policy = new Mock(); + policy + .SetupSequence(p => p.ShouldRetryAsync(It.IsAny(), It.IsAny())) + .ReturnsAsync(ShouldRetryResult.RetryAfter(TimeSpan.Zero)) + .ReturnsAsync(ShouldRetryResult.NoRetry()); + + int attempts = 0; + + int result = await MetadataRetryHelper.ExecuteAsync( + (_) => + { + attempts++; + if (attempts == 1) + { + throw new DocumentClientException( + "transient", + HttpStatusCode.ServiceUnavailable, + SubStatusCodes.Unknown); + } + + return Task.FromResult(7); + }, + policy.Object, + CancellationToken.None); + + Assert.AreEqual(7, result); + Assert.AreEqual(2, attempts); + } + + /// + /// REPRO / FIX VERIFICATION: the caller's cancellation token is already cancelled + /// at the time the first attempt throws a 503. The retry policy says "retry" + /// (cross-region failover). The fix must still execute one cross-region attempt + /// against the detached grace token. + /// + /// Pre-fix behavior (BackoffRetryUtility): throws OperationCanceledException + /// without consulting the policy, or consults the policy but then immediately + /// honors the cancelled token before the retry attempt. Either way, attempts == 1. + /// + /// Post-fix behavior (MetadataRetryHelper): attempts == 2 and the second attempt + /// returns successfully. + /// + [TestMethod] + [Owner("ntripician")] + public async Task ExecuteAsync_CrossRegionRetryExecutes_EvenWhenCallerTokenCancelled() + { + Mock policy = new Mock(); + policy + .Setup(p => p.ShouldRetryAsync(It.IsAny(), It.IsAny())) + .ReturnsAsync(ShouldRetryResult.RetryAfter(TimeSpan.Zero)); + + using CancellationTokenSource cts = new CancellationTokenSource(); + cts.Cancel(); + + int attempts = 0; + + int result = await MetadataRetryHelper.ExecuteAsync( + (_) => + { + attempts++; + if (attempts == 1) + { + throw new DocumentClientException( + "503 from unhealthy region", + HttpStatusCode.ServiceUnavailable, + SubStatusCodes.Unknown); + } + + return Task.FromResult(100); + }, + policy.Object, + MetadataRetryHelper.DefaultCrossRegionRetryGrace, + cts.Token); + + Assert.AreEqual(100, result, "Cross-region retry should have executed despite cancelled caller token."); + Assert.AreEqual(2, attempts, "Exactly one cross-region retry attempt should run on the grace token."); + } + + /// + /// Verifies the bounded grace: if the caller's token is cancelled AND the cross-region + /// retry attempt itself also fails with a retriable error, the helper does not loop + /// indefinitely. It grants at most one grace window and then surfaces the original + /// exception (not the grace-timeout exception). + /// + [TestMethod] + [Owner("ntripician")] + public async Task ExecuteAsync_GraceIsBounded_SurfacesOriginalExceptionOnSecondFailure() + { + Mock policy = new Mock(); + policy + .Setup(p => p.ShouldRetryAsync(It.IsAny(), It.IsAny())) + .ReturnsAsync(ShouldRetryResult.RetryAfter(TimeSpan.Zero)); + + using CancellationTokenSource cts = new CancellationTokenSource(); + cts.Cancel(); + + int attempts = 0; + + DocumentClientException thrown = await Assert.ThrowsExceptionAsync( + () => MetadataRetryHelper.ExecuteAsync( + (_) => + { + attempts++; + throw new DocumentClientException( + $"503 attempt {attempts}", + HttpStatusCode.ServiceUnavailable, + SubStatusCodes.Unknown); + }, + policy.Object, + MetadataRetryHelper.DefaultCrossRegionRetryGrace, + cts.Token)); + + Assert.AreEqual(2, attempts, "Exactly one grace-window retry should run; then original is surfaced."); + StringAssert.Contains(thrown.Message, "503 attempt 1"); + } + + /// + /// Caller token cancelled and the retry policy says "no retry" (e.g. 404, throttling exhausted). + /// The helper must surface the original exception, not OperationCanceledException. + /// + [TestMethod] + [Owner("ntripician")] + public async Task ExecuteAsync_PolicyDeniesRetry_SurfacesOriginalExceptionEvenWhenCancelled() + { + Mock policy = new Mock(); + policy + .Setup(p => p.ShouldRetryAsync(It.IsAny(), It.IsAny())) + .ReturnsAsync(ShouldRetryResult.NoRetry()); + + using CancellationTokenSource cts = new CancellationTokenSource(); + cts.Cancel(); + + int attempts = 0; + + DocumentClientException thrown = await Assert.ThrowsExceptionAsync( + () => MetadataRetryHelper.ExecuteAsync( + (_) => + { + attempts++; + throw new DocumentClientException( + "404", + HttpStatusCode.NotFound, + SubStatusCodes.Unknown); + }, + policy.Object, + MetadataRetryHelper.DefaultCrossRegionRetryGrace, + cts.Token)); + + Assert.AreEqual(1, attempts); + StringAssert.Contains(thrown.Message, "404"); + } + + /// + /// If the grace window is zero and the caller token is cancelled, the helper must + /// not attempt a cross-region retry. Original exception is surfaced on the first failure. + /// + [TestMethod] + [Owner("ntripician")] + public async Task ExecuteAsync_ZeroGrace_DoesNotRetryAfterCancellation() + { + Mock policy = new Mock(); + policy + .Setup(p => p.ShouldRetryAsync(It.IsAny(), It.IsAny())) + .ReturnsAsync(ShouldRetryResult.RetryAfter(TimeSpan.Zero)); + + using CancellationTokenSource cts = new CancellationTokenSource(); + cts.Cancel(); + + int attempts = 0; + + DocumentClientException thrown = await Assert.ThrowsExceptionAsync( + () => MetadataRetryHelper.ExecuteAsync( + (_) => + { + attempts++; + throw new DocumentClientException( + "503", + HttpStatusCode.ServiceUnavailable, + SubStatusCodes.Unknown); + }, + policy.Object, + TimeSpan.Zero, + cts.Token)); + + Assert.AreEqual(1, attempts); + StringAssert.Contains(thrown.Message, "503"); + } + + /// + /// If the grace window itself expires while the cross-region attempt is still in flight, + /// the helper surfaces the ORIGINAL underlying exception (not the grace-timeout OCE). + /// This prevents leaking internal cancellation semantics to callers. + /// + [TestMethod] + [Owner("ntripician")] + public async Task ExecuteAsync_GraceExpires_SurfacesOriginalException() + { + Mock policy = new Mock(); + policy + .Setup(p => p.ShouldRetryAsync(It.IsAny(), It.IsAny())) + .ReturnsAsync(ShouldRetryResult.RetryAfter(TimeSpan.Zero)); + + using CancellationTokenSource cts = new CancellationTokenSource(); + cts.Cancel(); + + int attempts = 0; + + DocumentClientException thrown = await Assert.ThrowsExceptionAsync( + () => MetadataRetryHelper.ExecuteAsync( + async (ct) => + { + attempts++; + if (attempts == 1) + { + throw new DocumentClientException( + "original 503", + HttpStatusCode.ServiceUnavailable, + SubStatusCodes.Unknown); + } + + // Second attempt: observes the grace token and will throw OCE when it expires. + await Task.Delay(TimeSpan.FromSeconds(30), ct); + return 0; + }, + policy.Object, + TimeSpan.FromMilliseconds(100), + cts.Token)); + + Assert.AreEqual(2, attempts); + StringAssert.Contains(thrown.Message, "original 503"); + } + + /// + /// COMPANION REPRO: exercises the legacy TaskHelper.InlineIfPossible + + /// BackoffRetryUtility path that ClientCollectionCache used prior to the + /// fix. It demonstrates that, under the same conditions where the fixed + /// succeeds with a cross-region retry + /// (), + /// the legacy path throws without ever + /// executing the cross-region attempt that the retry policy requested. + /// + /// This test is the canonical proof of Defect A and ensures that if anyone + /// reverts the fix by calling TaskHelper.InlineIfPossible for a metadata + /// read in ClientCollectionCache, the bug surfaces here. + /// + [TestMethod] + [Owner("ntripician")] + public async Task LegacyBackoffRetryUtility_CrossRegionRetryIsPreempted_ByCancelledCallerToken() + { + Mock policy = new Mock(); + policy + .Setup(p => p.ShouldRetryAsync(It.IsAny(), It.IsAny())) + .ReturnsAsync(ShouldRetryResult.RetryAfter(TimeSpan.Zero)); + + using CancellationTokenSource cts = new CancellationTokenSource(); + cts.Cancel(); + + int attempts = 0; + + await Assert.ThrowsExceptionAsync( + () => TaskHelper.InlineIfPossible( + () => + { + attempts++; + if (attempts == 1) + { + throw new DocumentClientException( + "503 from unhealthy region", + HttpStatusCode.ServiceUnavailable, + SubStatusCodes.Unknown); + } + + return Task.FromResult(100); + }, + policy.Object, + cts.Token)); + + Assert.IsTrue( + attempts <= 1, + $"Legacy BackoffRetryUtility path preempts cross-region retry once caller token is cancelled — " + + $"expected at most one attempt (possibly zero if ThrowIfCancellationRequested fires first), got {attempts}. " + + $"If this starts asserting on attempts==2, someone improved the shared retry utility to consult the policy " + + $"before honoring cancellation — in that case the bespoke MetadataRetryHelper may be removable."); + } + } +} + From 19d5a765a4b28bac5e89c1f78be9e6a731208be8 Mon Sep 17 00:00:00 2001 From: Nick Tripician Date: Wed, 22 Apr 2026 13:11:08 -0700 Subject: [PATCH 2/6] Address review round 1: operation-token contract, first-attempt-OCE test, negative grace validation - Document contract in MetadataRetryHelper that the operation lambda must honor the passed CancellationToken and not close over outer tokens. - Reject negative crossRegionRetryGrace with ArgumentOutOfRangeException. - Add regression test that pins 'policy consulted before surfacing OCE' even when the very first attempt throws OperationCanceledException. - Add test covering ArgumentOutOfRangeException for negative grace. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../src/MetadataRetryHelper.cs | 12 ++++ .../MetadataRetryHelperTests.cs | 57 +++++++++++++++++++ 2 files changed, 69 insertions(+) diff --git a/Microsoft.Azure.Cosmos/src/MetadataRetryHelper.cs b/Microsoft.Azure.Cosmos/src/MetadataRetryHelper.cs index 0a31570244..a3b9d697aa 100644 --- a/Microsoft.Azure.Cosmos/src/MetadataRetryHelper.cs +++ b/Microsoft.Azure.Cosmos/src/MetadataRetryHelper.cs @@ -61,6 +61,18 @@ public static async Task ExecuteAsync( throw new ArgumentNullException(nameof(retryPolicy)); } + if (crossRegionRetryGrace < TimeSpan.Zero) + { + throw new ArgumentOutOfRangeException( + nameof(crossRegionRetryGrace), + "Cross-region retry grace must not be negative."); + } + + // Contract: the operation lambda MUST honor the CancellationToken passed to it + // and MUST NOT close over any outer CancellationToken. On a grace retry attempt + // this will be a fresh, bounded-lifetime token decoupled from the caller's + // (already cancelled) token. Closing over the outer token re-introduces the + // defect this helper is designed to fix. bool graceTokenUsed = false; while (true) { diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/MetadataRetryHelperTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/MetadataRetryHelperTests.cs index e04ca29c5c..c46c150431 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/MetadataRetryHelperTests.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/MetadataRetryHelperTests.cs @@ -342,6 +342,63 @@ await Assert.ThrowsExceptionAsync( $"If this starts asserting on attempts==2, someone improved the shared retry utility to consult the policy " + $"before honoring cancellation — in that case the bespoke MetadataRetryHelper may be removable."); } + + /// + /// Regression guard: when the very first operation attempt throws + /// (caller token already cancelled and the + /// inner HTTP call honors it immediately), the helper must still consult the retry + /// policy before surfacing the OCE. If the policy returns NoRetry, the OCE is + /// surfaced. This test pins the "policy is consulted FIRST" invariant. + /// + [TestMethod] + [Owner("ntripician")] + public async Task ExecuteAsync_FirstAttemptOCE_PolicyIsConsultedBeforeSurfacing() + { + Mock policy = new Mock(); + policy + .Setup(p => p.ShouldRetryAsync(It.IsAny(), It.IsAny())) + .ReturnsAsync(ShouldRetryResult.NoRetry()); + + using CancellationTokenSource cts = new CancellationTokenSource(); + cts.Cancel(); + + int attempts = 0; + + await Assert.ThrowsExceptionAsync( + () => MetadataRetryHelper.ExecuteAsync( + (ct) => + { + attempts++; + ct.ThrowIfCancellationRequested(); + return Task.FromResult(42); + }, + policy.Object, + cts.Token)); + + Assert.AreEqual(1, attempts); + policy.Verify( + p => p.ShouldRetryAsync(It.IsAny(), It.IsAny()), + Times.AtLeastOnce, + "Retry policy must be consulted even on first-attempt OCE. If not, the helper's core invariant is broken."); + } + + /// + /// Negative grace windows are invalid and must be rejected at the API boundary rather + /// than silently collapsed to "disabled". This prevents subtle misuse. + /// + [TestMethod] + [Owner("ntripician")] + public async Task ExecuteAsync_NegativeGrace_Throws() + { + Mock policy = new Mock(); + + await Assert.ThrowsExceptionAsync( + () => MetadataRetryHelper.ExecuteAsync( + (ct) => Task.FromResult(1), + policy.Object, + TimeSpan.FromSeconds(-1), + CancellationToken.None)); + } } } From 6c51bc34258ec69d855740ba5f62e3790cb9430b Mon Sep 17 00:00:00 2001 From: Nalu Tripician <27316859+NaluTripician@users.noreply.github.com> Date: Thu, 23 Apr 2026 12:28:33 -0700 Subject: [PATCH 3/6] Address deep-review round 3: honor ExceptionToThrow, SyncContext guard, attempt cap, test hardening MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - MetadataRetryHelper: honor ShouldRetryResult.ExceptionToThrow when policy specifies a translated exception (matches BackoffRetryUtility contract). - MetadataRetryHelper: add MaxAttemptsHardCap (20) defensive bound so a misconfigured retry policy cannot spin the loop indefinitely. - MetadataRetryHelper: TraceWarning on grace-window expiration and grace-region failure to improve cross-region outage debuggability. - MetadataRetryHelper: rename graceTokenUsed -> graceAttempted; document that the grace CTS controls when the attempt STARTS, not total duration. - MetadataRetryHelper: tighten accessibility of ExecuteAsync overloads (public -> internal on internal class). - ClientCollectionCache.GetByRidAsync / GetByNameAsync: wrap MetadataRetryHelper.ExecuteAsync in TaskHelper.RunInlineIfNeededAsync to preserve the SynchronizationContext guard that TaskHelper.InlineIfPossible previously provided for .NET Framework sync-over-async call chains. - Tests: add detached-grace-token contract assertion in ExecuteAsync_CrossRegionRetryExecutes_EvenWhenCallerTokenCancelled; pins the invariant that the grace attempt receives a fresh, non-cancelled token decoupled from the caller's cancelled token. - Tests: replace Task.Delay(30s, ct) with TaskCompletionSource + ct.Register in ExecuteAsync_GraceExpires_SurfacesOriginalException — deterministic and removes 30s worst-case cliff on slow CI runners. - Tests: add ExecuteAsync_PolicySpecifiesExceptionToThrow regression test covering the new ExceptionToThrow code path. - Tests: remove dead SetupSequence NoRetry() stub in ExecuteAsync_RetriesOnTransient_WhenPolicyAllows; simplify to single Setup. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../src/MetadataRetryHelper.cs | 53 +++++++++++-- .../src/Routing/ClientCollectionCache.cs | 28 +++---- .../MetadataRetryHelperTests.cs | 76 +++++++++++++++++-- 3 files changed, 131 insertions(+), 26 deletions(-) diff --git a/Microsoft.Azure.Cosmos/src/MetadataRetryHelper.cs b/Microsoft.Azure.Cosmos/src/MetadataRetryHelper.cs index a3b9d697aa..1e2fb2c200 100644 --- a/Microsoft.Azure.Cosmos/src/MetadataRetryHelper.cs +++ b/Microsoft.Azure.Cosmos/src/MetadataRetryHelper.cs @@ -28,6 +28,13 @@ namespace Microsoft.Azure.Cosmos /// The grace window is intentionally short — the goal is best-effort availability, /// not unbounded timeout extension. If the retry attempt does not complete within /// the grace window, the original exception is rethrown. + /// + /// Note on the grace bound: the grace controls + /// when the grace attempt may START (via ThrowIfCancellationRequested and + /// propagation into the operation lambda). If the underlying operation does not + /// observe the grace token (e.g. the store model ignores cancellation), the in-flight + /// call may exceed the grace window. Callers relying on a strict upper bound should + /// ensure the operation honors its . /// internal static class MetadataRetryHelper { @@ -37,7 +44,16 @@ internal static class MetadataRetryHelper /// internal static readonly TimeSpan DefaultCrossRegionRetryGrace = TimeSpan.FromSeconds(10); - public static Task ExecuteAsync( + /// + /// Defensive upper bound on the number of attempts the helper makes within a single + /// + /// invocation. and peers already bound their own retry + /// counts, but a misconfigured policy that always returns ShouldRetry=true would + /// otherwise spin this loop indefinitely. + /// + private const int MaxAttemptsHardCap = 20; + + internal static Task ExecuteAsync( Func> operation, IDocumentClientRetryPolicy retryPolicy, CancellationToken cancellationToken) @@ -45,7 +61,7 @@ public static Task ExecuteAsync( return ExecuteAsync(operation, retryPolicy, DefaultCrossRegionRetryGrace, cancellationToken); } - public static async Task ExecuteAsync( + internal static async Task ExecuteAsync( Func> operation, IDocumentClientRetryPolicy retryPolicy, TimeSpan crossRegionRetryGrace, @@ -73,9 +89,20 @@ public static async Task ExecuteAsync( // this will be a fresh, bounded-lifetime token decoupled from the caller's // (already cancelled) token. Closing over the outer token re-introduces the // defect this helper is designed to fix. - bool graceTokenUsed = false; + bool graceAttempted = false; + int attemptCount = 0; while (true) { + if (++attemptCount > MaxAttemptsHardCap) + { + DefaultTrace.TraceError( + "MetadataRetryHelper: exceeded hard attempt cap ({0}). Surfacing last exception.", + MaxAttemptsHardCap); + throw new InvalidOperationException( + $"MetadataRetryHelper exceeded the defensive attempt cap of {MaxAttemptsHardCap}. " + + "This indicates a misconfigured retry policy that returns ShouldRetry=true indefinitely."); + } + ExceptionDispatchInfo capturedException; try { @@ -110,6 +137,13 @@ public static async Task ExecuteAsync( if (shouldRetry == null || !shouldRetry.ShouldRetry) { + // Honor ShouldRetryResult.ExceptionToThrow if the policy has specified a + // wrapper/translated exception (matches BackoffRetryUtility.ThrowIfDoneTrying). + if (shouldRetry?.ExceptionToThrow != null) + { + throw shouldRetry.ExceptionToThrow; + } + capturedException.Throw(); } @@ -141,14 +175,14 @@ public static async Task ExecuteAsync( // the underlying HTTP call is not itself preempted by the caller's // already-cancelled token. Subsequent cancellations collapse to the // original exception so we do not extend the caller's timeout unbounded. - if (graceTokenUsed || crossRegionRetryGrace <= TimeSpan.Zero) + if (graceAttempted || crossRegionRetryGrace <= TimeSpan.Zero) { DefaultTrace.TraceInformation( "MetadataRetryHelper: caller token cancelled; cross-region retry grace already used or disabled. Surfacing original exception."); capturedException.Throw(); } - graceTokenUsed = true; + graceAttempted = true; DefaultTrace.TraceInformation( "MetadataRetryHelper: caller token cancelled; granting {0}ms grace for one cross-region metadata retry.", (int)crossRegionRetryGrace.TotalMilliseconds); @@ -166,13 +200,20 @@ public static async Task ExecuteAsync( { // Grace window expired before the cross-region attempt completed. // Surface the original failure rather than the grace-timeout. + DefaultTrace.TraceWarning( + "MetadataRetryHelper: grace window ({0}ms) expired during cross-region retry. Surfacing original exception.", + (int)crossRegionRetryGrace.TotalMilliseconds); capturedException.Throw(); throw; // unreachable } - catch (Exception) + catch (Exception graceException) { // Cross-region attempt itself failed. Surface the ORIGINAL exception so // callers see the pre-failover failure mode, not the grace-region failure. + DefaultTrace.TraceWarning( + "MetadataRetryHelper: grace-region retry failed with {0}: {1}. Surfacing original exception.", + graceException.GetType().Name, + graceException.Message); capturedException.Throw(); throw; // unreachable } diff --git a/Microsoft.Azure.Cosmos/src/Routing/ClientCollectionCache.cs b/Microsoft.Azure.Cosmos/src/Routing/ClientCollectionCache.cs index f674a675a3..b05167053b 100644 --- a/Microsoft.Azure.Cosmos/src/Routing/ClientCollectionCache.cs +++ b/Microsoft.Azure.Cosmos/src/Routing/ClientCollectionCache.cs @@ -53,15 +53,16 @@ protected override Task GetByRidAsync(string apiVersion, cancellationToken.ThrowIfCancellationRequested(); IDocumentClientRetryPolicy retryPolicyInstance = new ClearingSessionContainerClientRetryPolicy( this.sessionContainer, this.retryPolicy.GetRequestPolicy()); - return MetadataRetryHelper.ExecuteAsync( - (ct) => this.ReadCollectionAsync( - PathsHelper.GeneratePath(ResourceType.Collection, collectionRid, false), + return TaskHelper.RunInlineIfNeededAsync( + () => MetadataRetryHelper.ExecuteAsync( + (ct) => this.ReadCollectionAsync( + PathsHelper.GeneratePath(ResourceType.Collection, collectionRid, false), + retryPolicyInstance, + trace, + clientSideRequestStatistics, + ct), retryPolicyInstance, - trace, - clientSideRequestStatistics, - ct), - retryPolicyInstance, - cancellationToken); + cancellationToken)); } protected override Task GetByNameAsync(string apiVersion, @@ -73,11 +74,12 @@ protected override Task GetByNameAsync(string apiVersion, cancellationToken.ThrowIfCancellationRequested(); IDocumentClientRetryPolicy retryPolicyInstance = new ClearingSessionContainerClientRetryPolicy( this.sessionContainer, this.retryPolicy.GetRequestPolicy()); - return MetadataRetryHelper.ExecuteAsync( - (ct) => this.ReadCollectionAsync( - resourceAddress, retryPolicyInstance, trace, clientSideRequestStatistics, ct), - retryPolicyInstance, - cancellationToken); + return TaskHelper.RunInlineIfNeededAsync( + () => MetadataRetryHelper.ExecuteAsync( + (ct) => this.ReadCollectionAsync( + resourceAddress, retryPolicyInstance, trace, clientSideRequestStatistics, ct), + retryPolicyInstance, + cancellationToken)); } internal override Task ResolveByNameAsync( diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/MetadataRetryHelperTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/MetadataRetryHelperTests.cs index c46c150431..adb8cda495 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/MetadataRetryHelperTests.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/MetadataRetryHelperTests.cs @@ -60,9 +60,8 @@ public async Task ExecuteAsync_RetriesOnTransient_WhenPolicyAllows() { Mock policy = new Mock(); policy - .SetupSequence(p => p.ShouldRetryAsync(It.IsAny(), It.IsAny())) - .ReturnsAsync(ShouldRetryResult.RetryAfter(TimeSpan.Zero)) - .ReturnsAsync(ShouldRetryResult.NoRetry()); + .Setup(p => p.ShouldRetryAsync(It.IsAny(), It.IsAny())) + .ReturnsAsync(ShouldRetryResult.RetryAfter(TimeSpan.Zero)); int attempts = 0; @@ -113,9 +112,10 @@ public async Task ExecuteAsync_CrossRegionRetryExecutes_EvenWhenCallerTokenCance cts.Cancel(); int attempts = 0; + CancellationToken secondAttemptToken = default; int result = await MetadataRetryHelper.ExecuteAsync( - (_) => + (ct) => { attempts++; if (attempts == 1) @@ -126,6 +126,9 @@ public async Task ExecuteAsync_CrossRegionRetryExecutes_EvenWhenCallerTokenCance SubStatusCodes.Unknown); } + // Capture the token observed by the grace attempt so the detached-token + // contract can be asserted below. + secondAttemptToken = ct; return Task.FromResult(100); }, policy.Object, @@ -134,6 +137,18 @@ public async Task ExecuteAsync_CrossRegionRetryExecutes_EvenWhenCallerTokenCance Assert.AreEqual(100, result, "Cross-region retry should have executed despite cancelled caller token."); Assert.AreEqual(2, attempts, "Exactly one cross-region retry attempt should run on the grace token."); + + // Pin the detached-grace-token contract: the grace attempt MUST NOT receive the + // caller's already-cancelled token. A future refactor that accidentally passes + // cancellationToken (instead of graceCts.Token) would silently reintroduce the + // defect this helper was built to fix — this assertion catches that regression. + Assert.IsFalse( + secondAttemptToken.IsCancellationRequested, + "Grace attempt must receive a fresh, non-cancelled token decoupled from the caller's cancelled token."); + Assert.AreNotEqual( + cts.Token, + secondAttemptToken, + "Grace attempt token must not be the caller's cancellation token."); } /// @@ -278,9 +293,15 @@ public async Task ExecuteAsync_GraceExpires_SurfacesOriginalException() SubStatusCodes.Unknown); } - // Second attempt: observes the grace token and will throw OCE when it expires. - await Task.Delay(TimeSpan.FromSeconds(30), ct); - return 0; + // Second attempt: observes the grace token and completes deterministically + // only when the grace CTS fires. Using TaskCompletionSource + ct.Register + // avoids the long Task.Delay worst-case on slow CI runners. + TaskCompletionSource tcs = new TaskCompletionSource( + TaskCreationOptions.RunContinuationsAsynchronously); + using (ct.Register(() => tcs.TrySetCanceled(ct))) + { + return await tcs.Task.ConfigureAwait(false); + } }, policy.Object, TimeSpan.FromMilliseconds(100), @@ -382,6 +403,47 @@ await Assert.ThrowsExceptionAsync( "Retry policy must be consulted even on first-attempt OCE. If not, the helper's core invariant is broken."); } + /// + /// Regression guard for finding #5: when + /// is invoked with a policy-specified translated exception, the helper must surface + /// THAT exception (matching BackoffRetryUtility.ThrowIfDoneTrying), not the + /// original captured exception. If this invariant is broken, retry policies that rewrite + /// error types (e.g. NonRetriableInvalidPartitionExceptionRetryPolicy translating + /// a gone/invalid-partition into a NotFoundException) would silently diverge in + /// behavior when wired through . + /// + [TestMethod] + [Owner("ntripician")] + public async Task ExecuteAsync_PolicySpecifiesExceptionToThrow_SurfacesTranslatedException() + { + InvalidOperationException translated = new InvalidOperationException("policy-translated"); + Mock policy = new Mock(); + policy + .Setup(p => p.ShouldRetryAsync(It.IsAny(), It.IsAny())) + .ReturnsAsync(ShouldRetryResult.NoRetry(translated)); + + int attempts = 0; + + InvalidOperationException thrown = await Assert.ThrowsExceptionAsync( + () => MetadataRetryHelper.ExecuteAsync( + (_) => + { + attempts++; + throw new DocumentClientException( + "original 503", + HttpStatusCode.ServiceUnavailable, + SubStatusCodes.Unknown); + }, + policy.Object, + CancellationToken.None)); + + Assert.AreEqual(1, attempts); + Assert.AreSame( + translated, + thrown, + "Helper must surface ShouldRetryResult.ExceptionToThrow (policy-translated), not the original exception."); + } + /// /// Negative grace windows are invalid and must be rejected at the API boundary rather /// than silently collapsed to "disabled". This prevents subtle misuse. From 5bdae9f4fe485c94cd1e31b7e3ef3fef78af4b32 Mon Sep 17 00:00:00 2001 From: Nalu Tripician <27316859+NaluTripician@users.noreply.github.com> Date: Mon, 27 Apr 2026 16:44:48 -0700 Subject: [PATCH 4/6] Metadata Retry: Addresses review feedback - Preserve original stack trace when ShouldRetryResult.ExceptionToThrow references the same instance as the captured exception (mirrors ShouldRetryResult.ThrowIfDoneTrying in BackoffRetryUtility). - Add test for MaxAttemptsHardCap defensive guard. - Add test for ShouldRetryAsync-throws path surfacing the original operation exception. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../src/MetadataRetryHelper.cs | 8 ++ .../MetadataRetryHelperTests.cs | 73 +++++++++++++++++++ 2 files changed, 81 insertions(+) diff --git a/Microsoft.Azure.Cosmos/src/MetadataRetryHelper.cs b/Microsoft.Azure.Cosmos/src/MetadataRetryHelper.cs index 1e2fb2c200..d50bb02d4f 100644 --- a/Microsoft.Azure.Cosmos/src/MetadataRetryHelper.cs +++ b/Microsoft.Azure.Cosmos/src/MetadataRetryHelper.cs @@ -139,8 +139,16 @@ internal static async Task ExecuteAsync( { // Honor ShouldRetryResult.ExceptionToThrow if the policy has specified a // wrapper/translated exception (matches BackoffRetryUtility.ThrowIfDoneTrying). + // When the policy hands back the same reference as the original exception, + // rethrow via the captured ExceptionDispatchInfo so the original stack trace + // is preserved (mirrors ShouldRetryResult.ThrowIfDoneTrying). if (shouldRetry?.ExceptionToThrow != null) { + if (shouldRetry.ExceptionToThrow == capturedException.SourceException) + { + capturedException.Throw(); + } + throw shouldRetry.ExceptionToThrow; } diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/MetadataRetryHelperTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/MetadataRetryHelperTests.cs index adb8cda495..304494ca97 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/MetadataRetryHelperTests.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/MetadataRetryHelperTests.cs @@ -461,6 +461,79 @@ await Assert.ThrowsExceptionAsync( TimeSpan.FromSeconds(-1), CancellationToken.None)); } + + /// + /// Regression guard for the defensive MaxAttemptsHardCap. A misconfigured retry + /// policy that always returns would otherwise + /// spin indefinitely. The helper must terminate with an + /// after the hard cap (20) is exceeded. + /// + [TestMethod] + [Owner("ntripician")] + public async Task ExecuteAsync_ExceedsMaxAttemptsHardCap_ThrowsInvalidOperationException() + { + Mock policy = new Mock(); + policy + .Setup(p => p.ShouldRetryAsync(It.IsAny(), It.IsAny())) + .ReturnsAsync(ShouldRetryResult.RetryAfter(TimeSpan.Zero)); + + int attempts = 0; + + InvalidOperationException thrown = await Assert.ThrowsExceptionAsync( + () => MetadataRetryHelper.ExecuteAsync( + (_) => + { + attempts++; + throw new DocumentClientException( + "always fail", + HttpStatusCode.ServiceUnavailable, + SubStatusCodes.Unknown); + }, + policy.Object, + CancellationToken.None)); + + Assert.AreEqual(20, attempts, "Helper must stop at the defensive attempt cap of 20."); + StringAssert.Contains(thrown.Message, "defensive attempt cap"); + } + + /// + /// Regression guard: when the retry policy itself throws while evaluating + /// ShouldRetryAsync, the helper must surface the ORIGINAL operation + /// exception (with its stack trace preserved) rather than the policy's + /// internal failure. The policy error is logged but should not be visible + /// to callers — diagnosing a metadata read failure should never be + /// distorted by a buggy retry policy. + /// + [TestMethod] + [Owner("ntripician")] + public async Task ExecuteAsync_PolicyThrowsDuringShouldRetry_SurfacesOriginalException() + { + Mock policy = new Mock(); + policy + .Setup(p => p.ShouldRetryAsync(It.IsAny(), It.IsAny())) + .ThrowsAsync(new InvalidOperationException("policy internal error")); + + int attempts = 0; + + DocumentClientException thrown = await Assert.ThrowsExceptionAsync( + () => MetadataRetryHelper.ExecuteAsync( + (_) => + { + attempts++; + throw new DocumentClientException( + "original 503", + HttpStatusCode.ServiceUnavailable, + SubStatusCodes.Unknown); + }, + policy.Object, + CancellationToken.None)); + + Assert.AreEqual(1, attempts); + StringAssert.Contains( + thrown.Message, + "original 503", + "Helper must surface the ORIGINAL operation exception, not the policy's internal failure."); + } } } From 306e025a1330d9109141f92025fab6234891df06 Mon Sep 17 00:00:00 2001 From: Nalu Tripician <27316859+NaluTripician@users.noreply.github.com> Date: Thu, 30 Apr 2026 12:12:39 -0700 Subject: [PATCH 5/6] MetadataRetryHelper: Adds mid-backoff cancellation grace transition test Covers the production trigger for the grace path: caller's token is live at attempt start and trips during Task.Delay(backoff, cancellationToken), exercising the OperationCanceledException catch and the second IsCancellationRequested check (lines 159-178). Existing cancellation tests use a pre-cancelled token and never reach this branch. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../MetadataRetryHelperTests.cs | 69 +++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/MetadataRetryHelperTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/MetadataRetryHelperTests.cs index 304494ca97..a513a22af6 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/MetadataRetryHelperTests.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/MetadataRetryHelperTests.cs @@ -311,6 +311,75 @@ public async Task ExecuteAsync_GraceExpires_SurfacesOriginalException() StringAssert.Contains(thrown.Message, "original 503"); } + /// + /// Exercises the mid-backoff cancellation → grace transition. The caller's token + /// is live at the start of the first attempt (so the outer IsCancellationRequested + /// check on the backoff path is false), then trips during Task.Delay. The + /// OperationCanceledException from the delay is caught and the inner + /// IsCancellationRequested check routes execution into the bounded grace + /// window — which then succeeds. + /// + /// This is the most realistic production trigger for the grace path (caller's + /// ~36s timeout fires while the helper waits out a retry backoff after a 503 from + /// an unhealthy region). All other cancellation tests use a pre-cancelled token, + /// which short-circuits the outer check and never exercises the catch block or + /// the second cancellation check. A refactor that drops either guard would + /// regress one of these branches without any other test catching it. + /// + [TestMethod] + [Owner("ntripician")] + public async Task ExecuteAsync_CancellationDuringBackoff_TransitionsToGraceRetry() + { + Mock policy = new Mock(); + policy + .Setup(p => p.ShouldRetryAsync(It.IsAny(), It.IsAny())) + .ReturnsAsync(ShouldRetryResult.RetryAfter(TimeSpan.FromSeconds(5))); + + using CancellationTokenSource cts = new CancellationTokenSource(); + int attempts = 0; + CancellationToken graceAttemptToken = default; + + int result = await MetadataRetryHelper.ExecuteAsync( + (ct) => + { + attempts++; + if (attempts == 1) + { + // Token is live now (outer IsCancellationRequested check on the + // backoff path will be false). Schedule a cancellation that fires + // during Task.Delay(5s, cancellationToken) — this is the production + // case where the caller's timeout trips mid-backoff. + cts.CancelAfter(TimeSpan.FromMilliseconds(50)); + throw new DocumentClientException( + "503 from unhealthy region", + HttpStatusCode.ServiceUnavailable, + SubStatusCodes.Unknown); + } + + graceAttemptToken = ct; + return Task.FromResult(99); + }, + policy.Object, + MetadataRetryHelper.DefaultCrossRegionRetryGrace, + cts.Token); + + Assert.AreEqual(99, result, "Mid-backoff cancellation must transition to grace retry, not surface OCE."); + Assert.AreEqual(2, attempts, "Exactly one grace-window retry should run after mid-backoff cancellation."); + + // The grace attempt must receive a fresh token decoupled from the caller's + // already-cancelled token. Same contract pinned by + // ExecuteAsync_CrossRegionRetryExecutes_EvenWhenCallerTokenCancelled, but + // re-asserted here because this path reaches the grace branch via the + // OperationCanceledException catch rather than the pre-cancelled outer check. + Assert.IsFalse( + graceAttemptToken.IsCancellationRequested, + "Grace attempt must receive a fresh, non-cancelled token after mid-backoff cancellation."); + Assert.AreNotEqual( + cts.Token, + graceAttemptToken, + "Grace attempt token must not be the caller's cancellation token."); + } + /// /// COMPANION REPRO: exercises the legacy TaskHelper.InlineIfPossible + /// BackoffRetryUtility path that ClientCollectionCache used prior to the From 29844bdf18a79c2cf5a719f871ada1b5b511569c Mon Sep 17 00:00:00 2001 From: Nalu Tripician <27316859+NaluTripician@users.noreply.github.com> Date: Fri, 1 May 2026 09:40:46 -0700 Subject: [PATCH 6/6] MetadataRetryHelper: Downgrades grace-grant log to TraceVerbose Addresses review feedback from @kundadebdatta: the entry-event 'granting Nms grace' message is now TraceVerbose since it is a mid-flow event. Terminal paths (grace already used / surfacing original exception, grace expired, grace attempt failed) remain at TraceInformation/TraceWarning so they remain in default logs during incident triage. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- Microsoft.Azure.Cosmos/src/MetadataRetryHelper.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Microsoft.Azure.Cosmos/src/MetadataRetryHelper.cs b/Microsoft.Azure.Cosmos/src/MetadataRetryHelper.cs index d50bb02d4f..24e5c801cf 100644 --- a/Microsoft.Azure.Cosmos/src/MetadataRetryHelper.cs +++ b/Microsoft.Azure.Cosmos/src/MetadataRetryHelper.cs @@ -191,7 +191,7 @@ internal static async Task ExecuteAsync( } graceAttempted = true; - DefaultTrace.TraceInformation( + DefaultTrace.TraceVerbose( "MetadataRetryHelper: caller token cancelled; granting {0}ms grace for one cross-region metadata retry.", (int)crossRegionRetryGrace.TotalMilliseconds);