Skip to content

Commit 562d2d5

Browse files
[Internal] PPAF: Fixes Hub region header never sent due to premature NoRetry on single-master 404/1002. (#5792)
# Pull Request Template ## Description # PR Description ## Summary Fixes a bug in `ClientRetryPolicy.ShouldRetryOnSessionNotAvailable` where the hub region processing header (`x-ms-cosmos-hub-region-processing-only`) was set but **never actually sent** to the backend on single-master accounts. The retry threshold `sessionTokenRetryCount > 1` caused `NoRetry` immediately after the header flag was set, preventing the retry that would carry the header. ## Fix Changed `sessionTokenRetryCount > 1` → `sessionTokenRetryCount > 2`. ### Corrected Flow | Attempt | `count` before method | Guard sets header? | `count++` | Check `count > 2` | Result | Header on request? | |---------|----------------------|-------------------|-----------|-------------------|--------|--------------------| | 1st request | — | — | — | — | Sent to preferred read region | ❌ No | | 1st 404/1002 | 0 | No (`0 >= 1` = false) | → 1 | `1 > 2` = false | ✅ Retry to write region (index=0) | ❌ No | | 2nd 404/1002 | 1 | **Yes** (`1 >= 1` = true) | → 2 | `2 > 2` = false | ✅ Retry to write region (index=0) | ✅ **Yes** | | 3rd 404/1002 | 2 | Yes (already true) | → 3 | `3 > 2` = true | ⛔ NoRetry — hub confirmed not available | ✅ Yes | ### End-to-End Hub Region Discovery (Happy Path with 403/3) | Step | Request | Response | Hub Header? | What happens | |------|---------|----------|-------------|-------------| | 1 | Read to preferred region | 404/1002 | ❌ | Session not available → retry to write region | | 2 | Read to write region | 404/1002 | ❌ | Write region also returns session not available → header flag set → retry | | 3 | Read to write region | 403/3 | ✅ | Non-hub region rejects hub-only request → PPAF marks endpoint unavailable → retry to next region | | 4 | Read to next region | 200 OK | ✅ | Hub region found, request succeeds, hub cached for partition | ## Type of change Please delete options that are not relevant. - [X] Bug fix (non-breaking change which fixes an issue) ## Closing issues To automatically close an issue: closes #IssueNumber --------- Co-authored-by: Debdatta Kunda <87335885+kundadebdatta@users.noreply.github.com>
1 parent 377c5b5 commit 562d2d5

4 files changed

Lines changed: 235 additions & 65 deletions

File tree

Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs

Lines changed: 20 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ internal sealed class ClientRetryPolicy : IDocumentClientRetryPolicy
2323
private const int RetryIntervalInMS = 1000; // Once we detect failover wait for 1 second before retrying request.
2424
private const int MaxRetryCount = 120;
2525
private const int MaxServiceUnavailableRetryCount = 1;
26+
private const int MaxSessionTokenRetryCount = 2;
2627

2728
private readonly IDocumentClientRetryPolicy throttlingRetry;
2829
private readonly GlobalEndpointManager globalEndpointManager;
@@ -331,14 +332,6 @@ private async Task<ShouldRetryResult> ShouldRetryInternalAsync(
331332

332333
if (statusCode == HttpStatusCode.NotFound && subStatusCode == SubStatusCodes.ReadSessionNotAvailable)
333334
{
334-
#if !INTERNAL
335-
// Only set the hub region processing header for single master accounts
336-
// Set header only after the first retry attempt fails with 404/1002
337-
if (!this.canUseMultipleWriteLocations && this.sessionTokenRetryCount >= 1)
338-
{
339-
this.addHubRegionProcessingOnlyHeader = true;
340-
}
341-
#endif
342335
return this.ShouldRetryOnSessionNotAvailable(this.documentServiceRequest);
343336
}
344337

@@ -456,12 +449,29 @@ private ShouldRetryResult ShouldRetryOnSessionNotAvailable(DocumentServiceReques
456449
}
457450
else
458451
{
452+
#if !INTERNAL
453+
// Only set the hub region processing header for single master accounts.
454+
// Set header after the second consecutive 404/1002 (count >= 2 means both
455+
// the initial request and the first retry to the write region have failed).
456+
if (this.sessionTokenRetryCount >= MaxSessionTokenRetryCount)
457+
{
458+
this.addHubRegionProcessingOnlyHeader = true;
459+
}
460+
461+
if (this.sessionTokenRetryCount > MaxSessionTokenRetryCount)
462+
{
463+
// Hub region header was set at count == MaxSessionTokenRetryCount and the
464+
// request was retried with it. If the hub still returns 404/1002, stop.
465+
return ShouldRetryResult.NoRetry();
466+
}
467+
#else
459468
if (this.sessionTokenRetryCount > 1)
460469
{
461-
// When cannot use multiple write locations, then don't retry the request if
462-
// we have already tried this request on the write location
470+
// When cannot use multiple write locations, then don't retry the request if
471+
// we have already tried this request on the write location.
463472
return ShouldRetryResult.NoRetry();
464473
}
474+
#endif
465475
else
466476
{
467477
this.retryContext = new RetryContext

Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.EmulatorTests/CosmosItemTests.cs

Lines changed: 26 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -4319,12 +4319,14 @@ private static async Task GivenItemAsyncWhenMissingMemberHandlingIsErrorThenExpe
43194319

43204320
[TestMethod]
43214321
[Owner("aavasthy")]
4322-
[Description("Forces two consecutive 404/1002 responses from the gateway and verifies ClientRetryPolicy sets the hub region header flag after the first retry fails.")]
4322+
[Description("Forces three consecutive 404/1002 responses from the gateway and verifies ClientRetryPolicy " +
4323+
"sets the hub region header flag after the second 404/1002 and sends it on the third attempt.")]
43234324
public async Task ReadItemAsync_ShouldAddHubHeader_OnRetryAfter_404_1002()
43244325
{
43254326
int requestCount = 0;
43264327
int return404Count = 0;
4327-
const int maxReturn404 = 2; // Return 404/1002 twice
4328+
const int maxReturn404 = 3; // Return 404/1002 three times: initial + retry to write region + retry with hub header
4329+
bool hubHeaderPresentOnThirdRequest = false;
43284330

43294331
// Created HTTP handler to intercept requests
43304332
HttpClientHandlerHelper httpHandler = new HttpClientHandlerHelper
@@ -4338,15 +4340,23 @@ public async Task ReadItemAsync_ShouldAddHubHeader_OnRetryAfter_404_1002()
43384340
{
43394341
requestCount++;
43404342

4341-
// Header should NOT be present on first retry (2nd request)
4342-
if (requestCount == 2 &&
4343-
request.Headers.TryGetValues(HubRegionHeader, out IEnumerable<string> firstRetryValues) &&
4344-
firstRetryValues.Any())
4343+
// Header should NOT be present on first two requests
4344+
if (requestCount <= 2 &&
4345+
request.Headers.TryGetValues(HubRegionHeader, out IEnumerable<string> earlyRetryValues) &&
4346+
earlyRetryValues.Any())
43454347
{
4346-
Assert.Fail("Header should NOT be present on first retry attempt.");
4348+
Assert.Fail($"Header should NOT be present on request {requestCount}.");
43474349
}
43484350

4349-
// Return fake 404/1002 for first two requests
4351+
// Header MUST be present on third request (after two consecutive 404/1002 failures)
4352+
if (requestCount == 3)
4353+
{
4354+
hubHeaderPresentOnThirdRequest =
4355+
request.Headers.TryGetValues(HubRegionHeader, out IEnumerable<string> hubValues)
4356+
&& hubValues.Any(v => v == bool.TrueString);
4357+
}
4358+
4359+
// Return fake 404/1002 for the configured number of requests
43504360
if (return404Count < maxReturn404)
43514361
{
43524362
return404Count++;
@@ -4399,8 +4409,7 @@ public async Task ReadItemAsync_ShouldAddHubHeader_OnRetryAfter_404_1002()
43994409

44004410
try
44014411
{
4402-
// This should trigger 404/1002 twice
4403-
// In single-region emulator, after first retry fails with 404/1002, it won't retry again
4412+
// This should trigger 404/1002 three times then NoRetry
44044413
ItemResponse<ToDoActivity> response = await customContainer.ReadItemAsync<ToDoActivity>(
44054414
testItem.id,
44064415
new Cosmos.PartitionKey(testItem.pk));
@@ -4409,17 +4418,18 @@ public async Task ReadItemAsync_ShouldAddHubHeader_OnRetryAfter_404_1002()
44094418
}
44104419
catch (CosmosException ex)
44114420
{
4412-
// Expected: After first retry fails with 404/1002, single master won't retry again
4421+
// Expected: After third 404/1002 (with hub header), single master stops retrying
44134422
Assert.AreEqual(HttpStatusCode.NotFound, ex.StatusCode);
44144423
Assert.AreEqual((int)SubStatusCodes.ReadSessionNotAvailable, ex.SubStatusCode);
44154424
}
44164425

44174426
// Verify the expected behavior:
4418-
// 1. Initial request (requestCount = 1) fails with 404/1002
4419-
// 2. First retry (requestCount = 2) fails with 404/1002
4420-
// 3. No more retries because single master + no additional regions
4421-
Assert.AreEqual(2, requestCount, $"Expected exactly 2 requests (initial + 1 retry) for single-region emulator, but got {requestCount}");
4422-
Assert.AreEqual(2, return404Count, "Both requests should have returned 404/1002");
4427+
// 1. Initial request (requestCount = 1) fails with 404/1002 → no hub header
4428+
// 2. First retry to write region (requestCount = 2) fails with 404/1002 → no hub header, flag set
4429+
// 3. Second retry with hub header (requestCount = 3) fails with 404/1002 → hub header present → NoRetry
4430+
Assert.AreEqual(3, requestCount, $"Expected exactly 3 requests (initial + retry to write region + retry with hub header) for single-region emulator, but got {requestCount}");
4431+
Assert.AreEqual(3, return404Count, "All three requests should have returned 404/1002");
4432+
Assert.IsTrue(hubHeaderPresentOnThirdRequest, "Hub region header must be present on the third request (after two consecutive 404/1002 failures).");
44234433
}
44244434

44254435
private async Task<T> AutoGenerateIdPatternTest<T>(Cosmos.PartitionKey pk, T itemWithoutId)

0 commit comments

Comments
 (0)