diff --git a/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs b/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs index 3e81c62d0d..4ca272bca5 100644 --- a/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs +++ b/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs @@ -23,6 +23,7 @@ internal sealed class ClientRetryPolicy : IDocumentClientRetryPolicy private const int RetryIntervalInMS = 1000; // Once we detect failover wait for 1 second before retrying request. private const int MaxRetryCount = 120; private const int MaxServiceUnavailableRetryCount = 1; + private const int MaxSessionTokenRetryCount = 2; private readonly IDocumentClientRetryPolicy throttlingRetry; private readonly GlobalEndpointManager globalEndpointManager; @@ -331,14 +332,6 @@ private async Task ShouldRetryInternalAsync( if (statusCode == HttpStatusCode.NotFound && subStatusCode == SubStatusCodes.ReadSessionNotAvailable) { -#if !INTERNAL - // Only set the hub region processing header for single master accounts - // Set header only after the first retry attempt fails with 404/1002 - if (!this.canUseMultipleWriteLocations && this.sessionTokenRetryCount >= 1) - { - this.addHubRegionProcessingOnlyHeader = true; - } -#endif return this.ShouldRetryOnSessionNotAvailable(this.documentServiceRequest); } @@ -456,12 +449,29 @@ private ShouldRetryResult ShouldRetryOnSessionNotAvailable(DocumentServiceReques } else { +#if !INTERNAL + // Only set the hub region processing header for single master accounts. + // Set header after the second consecutive 404/1002 (count >= 2 means both + // the initial request and the first retry to the write region have failed). + if (this.sessionTokenRetryCount >= MaxSessionTokenRetryCount) + { + this.addHubRegionProcessingOnlyHeader = true; + } + + if (this.sessionTokenRetryCount > MaxSessionTokenRetryCount) + { + // Hub region header was set at count == MaxSessionTokenRetryCount and the + // request was retried with it. If the hub still returns 404/1002, stop. + return ShouldRetryResult.NoRetry(); + } +#else if (this.sessionTokenRetryCount > 1) { - // When cannot use multiple write locations, then don't retry the request if - // we have already tried this request on the write location + // When cannot use multiple write locations, then don't retry the request if + // we have already tried this request on the write location. return ShouldRetryResult.NoRetry(); } +#endif else { this.retryContext = new RetryContext diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.EmulatorTests/CosmosItemTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.EmulatorTests/CosmosItemTests.cs index 5b01f886c3..363fa2accb 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.EmulatorTests/CosmosItemTests.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.EmulatorTests/CosmosItemTests.cs @@ -4319,12 +4319,14 @@ private static async Task GivenItemAsyncWhenMissingMemberHandlingIsErrorThenExpe [TestMethod] [Owner("aavasthy")] - [Description("Forces two consecutive 404/1002 responses from the gateway and verifies ClientRetryPolicy sets the hub region header flag after the first retry fails.")] + [Description("Forces three consecutive 404/1002 responses from the gateway and verifies ClientRetryPolicy " + + "sets the hub region header flag after the second 404/1002 and sends it on the third attempt.")] public async Task ReadItemAsync_ShouldAddHubHeader_OnRetryAfter_404_1002() { int requestCount = 0; int return404Count = 0; - const int maxReturn404 = 2; // Return 404/1002 twice + const int maxReturn404 = 3; // Return 404/1002 three times: initial + retry to write region + retry with hub header + bool hubHeaderPresentOnThirdRequest = false; // Created HTTP handler to intercept requests HttpClientHandlerHelper httpHandler = new HttpClientHandlerHelper @@ -4338,15 +4340,23 @@ public async Task ReadItemAsync_ShouldAddHubHeader_OnRetryAfter_404_1002() { requestCount++; - // Header should NOT be present on first retry (2nd request) - if (requestCount == 2 && - request.Headers.TryGetValues(HubRegionHeader, out IEnumerable firstRetryValues) && - firstRetryValues.Any()) + // Header should NOT be present on first two requests + if (requestCount <= 2 && + request.Headers.TryGetValues(HubRegionHeader, out IEnumerable earlyRetryValues) && + earlyRetryValues.Any()) { - Assert.Fail("Header should NOT be present on first retry attempt."); + Assert.Fail($"Header should NOT be present on request {requestCount}."); } - // Return fake 404/1002 for first two requests + // Header MUST be present on third request (after two consecutive 404/1002 failures) + if (requestCount == 3) + { + hubHeaderPresentOnThirdRequest = + request.Headers.TryGetValues(HubRegionHeader, out IEnumerable hubValues) + && hubValues.Any(v => v == bool.TrueString); + } + + // Return fake 404/1002 for the configured number of requests if (return404Count < maxReturn404) { return404Count++; @@ -4399,8 +4409,7 @@ public async Task ReadItemAsync_ShouldAddHubHeader_OnRetryAfter_404_1002() try { - // This should trigger 404/1002 twice - // In single-region emulator, after first retry fails with 404/1002, it won't retry again + // This should trigger 404/1002 three times then NoRetry ItemResponse response = await customContainer.ReadItemAsync( testItem.id, new Cosmos.PartitionKey(testItem.pk)); @@ -4409,17 +4418,18 @@ public async Task ReadItemAsync_ShouldAddHubHeader_OnRetryAfter_404_1002() } catch (CosmosException ex) { - // Expected: After first retry fails with 404/1002, single master won't retry again + // Expected: After third 404/1002 (with hub header), single master stops retrying Assert.AreEqual(HttpStatusCode.NotFound, ex.StatusCode); Assert.AreEqual((int)SubStatusCodes.ReadSessionNotAvailable, ex.SubStatusCode); } // Verify the expected behavior: - // 1. Initial request (requestCount = 1) fails with 404/1002 - // 2. First retry (requestCount = 2) fails with 404/1002 - // 3. No more retries because single master + no additional regions - Assert.AreEqual(2, requestCount, $"Expected exactly 2 requests (initial + 1 retry) for single-region emulator, but got {requestCount}"); - Assert.AreEqual(2, return404Count, "Both requests should have returned 404/1002"); + // 1. Initial request (requestCount = 1) fails with 404/1002 → no hub header + // 2. First retry to write region (requestCount = 2) fails with 404/1002 → no hub header, flag set + // 3. Second retry with hub header (requestCount = 3) fails with 404/1002 → hub header present → NoRetry + Assert.AreEqual(3, requestCount, $"Expected exactly 3 requests (initial + retry to write region + retry with hub header) for single-region emulator, but got {requestCount}"); + Assert.AreEqual(3, return404Count, "All three requests should have returned 404/1002"); + Assert.IsTrue(hubHeaderPresentOnThirdRequest, "Hub region header must be present on the third request (after two consecutive 404/1002 failures)."); } private async Task AutoGenerateIdPatternTest(Cosmos.PartitionKey pk, T itemWithoutId) diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ClientRetryPolicyTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ClientRetryPolicyTests.cs index b595711070..de4deb7ca1 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ClientRetryPolicyTests.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ClientRetryPolicyTests.cs @@ -466,43 +466,16 @@ public async Task ClientRetryPolicy_HubRegionHeader_AddedOn404_1002_BasedOnAccou if (isSingleMaster) { - // For single master, after one retry fails with 404/1002, it won't retry further - // But the header flag should be set for any potential future retries due to other errors - Assert.IsFalse(shouldRetry.ShouldRetry, "Single master should not retry again after first 404/1002 retry fails."); - - // The header flag should be set even though no more 404/1002 retries will happen - // This ensures if the request is retried for a different reason (e.g., 503), it will have the header - } - else - { - // Multi-master can retry across multiple regions - Assert.IsTrue(shouldRetry.ShouldRetry, "Multi-master should continue retrying on 404/1002."); - } - - // For single master: Verify header would be added if request is retried for other reasons (e.g., 503) - // For multi-master: Verify header is NOT added even on subsequent retries - if (isSingleMaster) - { - // Simulate a 503 error to trigger another retry - DocumentClientException serviceUnavailableException = new DocumentClientException( - message: "Simulated 503 ServiceUnavailable", - innerException: null, - statusCode: HttpStatusCode.ServiceUnavailable, - substatusCode: SubStatusCodes.Unknown, - requestUri: request.RequestContext.LocationEndpointToRoute, - responseHeaders: new DictionaryNameValueCollection()); - - shouldRetry = await retryPolicy.ShouldRetryAsync(serviceUnavailableException, CancellationToken.None); - - if (shouldRetry.ShouldRetry) - { - // Now verify the header is present on this retry triggered by 503 - retryPolicy.OnBeforeSendRequest(request); - headerValues = request.Headers.GetValues(HubRegionHeader); - Assert.IsNotNull(headerValues, "Header should be present on retry after 404/1002 flag was set."); - Assert.AreEqual(1, headerValues.Length, "Header should have exactly one value."); - Assert.AreEqual(bool.TrueString, headerValues[0], "Header value should be 'True'."); - } + // For single master, after the second 404/1002, the hub region header flag has been set + // and the retry policy should allow one more retry so the request can be sent with the header. + Assert.IsTrue(shouldRetry.ShouldRetry, "Single master should retry once more after second 404/1002 so the hub region header is sent."); + + // Verify the header is now present on the retry + retryPolicy.OnBeforeSendRequest(request); + headerValues = request.Headers.GetValues(HubRegionHeader); + Assert.IsNotNull(headerValues, "Hub region header should be present on retry after second 404/1002."); + Assert.AreEqual(1, headerValues.Length, "Header should have exactly one value."); + Assert.AreEqual(bool.TrueString, headerValues[0], "Header value should be 'True'."); } else { @@ -528,7 +501,177 @@ public async Task ClientRetryPolicy_HubRegionHeader_AddedOn404_1002_BasedOnAccou } } } - } + } + + /// + /// End-to-end test for the hub region discovery flow on a single-master account (Direct mode): + /// 1st request → 404/1002 (no hub header) → retry to write region + /// 2nd request → 404/1002 (no hub header) → hub header flag set, retry + /// 3rd request → assert hub header present → 403/3 from non-hub → retry + /// 4th request → assert hub header present → 200 success + /// + [TestMethod] + public async Task ClientRetryPolicy_HubRegionDiscovery_EndToEnd_DirectMode() + { + // Arrange: single-master, endpoint discovery enabled + const bool enableEndpointDiscovery = true; + + using GlobalEndpointManager endpointManager = this.Initialize( + useMultipleWriteLocations: false, + enableEndpointDiscovery: enableEndpointDiscovery, + isPreferredLocationsListEmpty: false, + enforceSingleMasterSingleWriteLocation: true); + + ClientRetryPolicy retryPolicy = new ClientRetryPolicy( + endpointManager, + this.partitionKeyRangeLocationCache, + new RetryOptions(), + enableEndpointDiscovery, + isThinClientEnabled: false); + + DocumentServiceRequest request = this.CreateRequest(isReadRequest: true, isMasterResourceType: false); + + // ---- Step 1: First request attempt ---- + retryPolicy.OnBeforeSendRequest(request); + Assert.IsNull( + request.Headers.GetValues(HubRegionHeader), + "Hub region header should NOT be present on the initial request."); + + // Simulate 1st 404/1002 + ShouldRetryResult shouldRetry = await retryPolicy.ShouldRetryAsync( + new DocumentClientException( + message: "1st 404/1002", + innerException: null, + statusCode: HttpStatusCode.NotFound, + substatusCode: SubStatusCodes.ReadSessionNotAvailable, + requestUri: request.RequestContext.LocationEndpointToRoute, + responseHeaders: new DictionaryNameValueCollection()), + CancellationToken.None); + + Assert.IsTrue(shouldRetry.ShouldRetry, "Should retry after first 404/1002."); + + // ---- Step 2: Retry routed to write region ---- + retryPolicy.OnBeforeSendRequest(request); + Assert.IsNull( + request.Headers.GetValues(HubRegionHeader), + "Hub region header should NOT be present on the first retry (routed to write region)."); + + // Simulate 2nd 404/1002 + shouldRetry = await retryPolicy.ShouldRetryAsync( + new DocumentClientException( + message: "2nd 404/1002", + innerException: null, + statusCode: HttpStatusCode.NotFound, + substatusCode: SubStatusCodes.ReadSessionNotAvailable, + requestUri: request.RequestContext.LocationEndpointToRoute, + responseHeaders: new DictionaryNameValueCollection()), + CancellationToken.None); + + Assert.IsTrue(shouldRetry.ShouldRetry, "Should retry after second 404/1002 (hub header flag now set)."); + + // ---- Step 3: Retry with hub region header → gets 403/3 ---- + retryPolicy.OnBeforeSendRequest(request); + string[] headerValues = request.Headers.GetValues(HubRegionHeader); + Assert.IsNotNull(headerValues, "Hub region header MUST be present on the retry after two consecutive 404/1002 errors."); + Assert.AreEqual(1, headerValues.Length, "Hub region header should have exactly one value."); + Assert.AreEqual(bool.TrueString, headerValues[0], "Hub region header value should be 'True'."); + + // Simulate 403/3 (WriteForbidden) — this happens when the request reaches a non-hub region + shouldRetry = await retryPolicy.ShouldRetryAsync( + new DocumentClientException( + message: "403/3 WriteForbidden from non-hub region", + innerException: null, + statusCode: HttpStatusCode.Forbidden, + substatusCode: SubStatusCodes.WriteForbidden, + requestUri: request.RequestContext.LocationEndpointToRoute, + responseHeaders: new DictionaryNameValueCollection()), + CancellationToken.None); + + Assert.IsTrue(shouldRetry.ShouldRetry, "Should retry after 403/3 to continue hub region discovery."); + + // ---- Step 4: Retry still carries hub header → 200 success ---- + retryPolicy.OnBeforeSendRequest(request); + headerValues = request.Headers.GetValues(HubRegionHeader); + Assert.IsNotNull(headerValues, "Hub region header MUST persist through 403/3 retries."); + Assert.AreEqual(bool.TrueString, headerValues[0], "Hub region header value should remain 'True'."); + } + + /// + /// Verifies that once the hub region header is set (after two consecutive 404/1002), + /// it persists through subsequent retries triggered by other retriable errors + /// (503 ServiceUnavailable, 408 RequestTimeout) and that the normal preferred-region + /// cycling continues with the header attached. + /// + [TestMethod] + public async Task ClientRetryPolicy_HubRegionHeader_PersistsThroughRetriableErrors() + { + const bool enableEndpointDiscovery = true; + + using GlobalEndpointManager endpointManager = this.Initialize( + useMultipleWriteLocations: false, + enableEndpointDiscovery: enableEndpointDiscovery, + isPreferredLocationsListEmpty: false, + enforceSingleMasterSingleWriteLocation: true); + + ClientRetryPolicy retryPolicy = new ClientRetryPolicy( + endpointManager, + this.partitionKeyRangeLocationCache, + new RetryOptions(), + enableEndpointDiscovery, + isThinClientEnabled: false); + + DocumentServiceRequest request = this.CreateRequest(isReadRequest: true, isMasterResourceType: false); + + // ---- 1st 404/1002 ---- + retryPolicy.OnBeforeSendRequest(request); + ShouldRetryResult shouldRetry = await retryPolicy.ShouldRetryAsync( + new DocumentClientException( + message: "1st 404/1002", + innerException: null, + statusCode: HttpStatusCode.NotFound, + substatusCode: SubStatusCodes.ReadSessionNotAvailable, + requestUri: request.RequestContext.LocationEndpointToRoute, + responseHeaders: new DictionaryNameValueCollection()), + CancellationToken.None); + Assert.IsTrue(shouldRetry.ShouldRetry); + + // ---- 2nd 404/1002 → hub header flag gets set ---- + retryPolicy.OnBeforeSendRequest(request); + shouldRetry = await retryPolicy.ShouldRetryAsync( + new DocumentClientException( + message: "2nd 404/1002", + innerException: null, + statusCode: HttpStatusCode.NotFound, + substatusCode: SubStatusCodes.ReadSessionNotAvailable, + requestUri: request.RequestContext.LocationEndpointToRoute, + responseHeaders: new DictionaryNameValueCollection()), + CancellationToken.None); + Assert.IsTrue(shouldRetry.ShouldRetry, "Should retry so the hub header can be sent."); + + // ---- 3rd request: hub header should be present ---- + retryPolicy.OnBeforeSendRequest(request); + string[] headerValues = request.Headers.GetValues(HubRegionHeader); + Assert.IsNotNull(headerValues, "Hub region header must be present after two 404/1002 failures."); + Assert.AreEqual(bool.TrueString, headerValues[0]); + + // ---- Now simulate a retriable 503 ServiceUnavailable error ---- + shouldRetry = await retryPolicy.ShouldRetryAsync( + new DocumentClientException( + message: "503 ServiceUnavailable after hub header set", + innerException: null, + statusCode: HttpStatusCode.ServiceUnavailable, + substatusCode: SubStatusCodes.Unknown, + requestUri: request.RequestContext.LocationEndpointToRoute, + responseHeaders: new DictionaryNameValueCollection()), + CancellationToken.None); + Assert.IsTrue(shouldRetry.ShouldRetry, "Should retry on 503 ServiceUnavailable."); + + // ---- 4th request: hub header must STILL be present after 503 retry ---- + retryPolicy.OnBeforeSendRequest(request); + headerValues = request.Headers.GetValues(HubRegionHeader); + Assert.IsNotNull(headerValues, "Hub region header must persist through 503 retry."); + Assert.AreEqual(bool.TrueString, headerValues[0], "Hub region header value should remain 'True'."); + } private async Task ValidateConnectTimeoutTriggersClientRetryPolicyAsync( bool isReadRequest, diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/LocationCacheTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/LocationCacheTests.cs index b8f85d4bef..087f76f0d6 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/LocationCacheTests.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/LocationCacheTests.cs @@ -245,6 +245,13 @@ await BackoffRetryUtility.ExecuteAsync( Uri expectedEndpoint = new Uri(this.databaseAccount.WriteLocationsInternal[0].Endpoint); Assert.AreEqual(expectedEndpoint, request.RequestContext.LocationEndpointToRoute); } + else if (retryCount == 2) + { + // Third request is the retry with the hub region header set. + // It still routes to the write endpoint (index=0, preferred=false). + Uri expectedEndpoint = new Uri(this.databaseAccount.WriteLocationsInternal[0].Endpoint); + Assert.AreEqual(expectedEndpoint, request.RequestContext.LocationEndpointToRoute); + } else { Assert.Fail(); @@ -268,7 +275,7 @@ await BackoffRetryUtility.ExecuteAsync( catch (NotFoundException) { DefaultTrace.TraceInformation("Received expected notFoundException"); - Assert.AreEqual(2, retryCount); + Assert.AreEqual(3, retryCount); } } }