Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
7e9b945
Add header for 404/1002 retry requests.
aavasthy Oct 14, 2025
7786e45
Merge branch 'master' into users/aavasthy/404_1002
aavasthy Oct 14, 2025
d85ed0a
Merge branch 'master' into users/aavasthy/404_1002
aavasthy Oct 15, 2025
163be31
Merge branch 'master' into users/aavasthy/404_1002
aavasthy Nov 4, 2025
e523341
Update direct package and retry header code
aavasthy Nov 4, 2025
822b8e6
Merge branch 'master' into users/aavasthy/404_1002
aavasthy Nov 4, 2025
1ad3fef
Merge branch 'master' into users/aavasthy/404_1002
aavasthy Nov 5, 2025
1f63898
Merge branch 'master' into users/aavasthy/404_1002
aavasthy Nov 6, 2025
2d90cb1
Resolve merge conflicts.
aavasthy Nov 18, 2025
b1fa2b2
Merge branch 'master' into users/aavasthy/404_1002
aavasthy Dec 13, 2025
35e0eeb
Code clean up
aavasthy Dec 16, 2025
d51b105
Merge branch 'master' into users/aavasthy/404_1002
aavasthy Dec 18, 2025
f622ebd
Add not to be used internally check for hubregion header.
aavasthy Dec 18, 2025
a15e97a
Merge branch 'master' into users/aavasthy/404_1002
aavasthy Dec 18, 2025
961288f
Add not to be used internally check for hubregion header.
aavasthy Dec 19, 2025
b8facfd
Merge with master
aavasthy Dec 19, 2025
1fbd6c0
Merge branch 'master' into users/aavasthy/404_1002
aavasthy Dec 19, 2025
7668f2a
Merge branch 'master' into users/aavasthy/404_1002
aavasthy Dec 30, 2025
692484c
Made property volatile
aavasthy Jan 6, 2026
9dd4ec0
Merge branch 'master' into users/aavasthy/404_1002
aavasthy Jan 6, 2026
66d481b
Update retry logic and add header for all subsequent request.
aavasthy Jan 9, 2026
5b541e1
Merge with master
aavasthy Jan 9, 2026
7936ada
Correct formatting
aavasthy Jan 9, 2026
6bffa82
Update the check to work only for single master.
aavasthy Jan 22, 2026
728e61e
Merge branch 'master' into users/aavasthy/404_1002
aavasthy Jan 23, 2026
b7b3a37
Update retry header logic
aavasthy Jan 23, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 26 additions & 10 deletions Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,10 @@ internal sealed class ClientRetryPolicy : IDocumentClientRetryPolicy
private bool isMultiMasterWriteRequest;
private Uri locationEndpoint;
private RetryContext retryContext;
private DocumentServiceRequest documentServiceRequest;
private DocumentServiceRequest documentServiceRequest;
#if !INTERNAL
private volatile bool addHubRegionProcessingOnlyHeader;
#endif

public ClientRetryPolicy(
GlobalEndpointManager globalEndpointManager,
Expand Down Expand Up @@ -222,8 +225,14 @@ public void OnBeforeSendRequest(DocumentServiceRequest request)
// set location-based routing directive based on request retry context
request.RequestContext.RouteToLocation(this.retryContext.RetryLocationIndex, this.retryContext.RetryRequestOnPreferredLocations);
}
}

}
#if !INTERNAL
// If previous attempt failed with 404/1002, add the hub-region-processing-only header to all subsequent retry attempts
if (this.addHubRegionProcessingOnlyHeader)
{
request.Headers[HttpConstants.HttpHeaders.ShouldProcessOnlyInHubRegion] = bool.TrueString;
}
#endif
// Resolve the endpoint for the request and pin the resolution to the resolved endpoint
// This enables marking the endpoint unavailability on endpoint failover/unreachability
this.locationEndpoint = this.isThinClientEnabled
Expand Down Expand Up @@ -318,15 +327,22 @@ private async Task<ShouldRetryResult> ShouldRetryInternalAsync(
markBothReadAndWriteAsUnavailable: false,
forceRefresh: false,
retryOnPreferredLocations: true);
}

if (statusCode == HttpStatusCode.NotFound
&& subStatusCode == SubStatusCodes.ReadSessionNotAvailable)
}

if (statusCode == HttpStatusCode.NotFound && subStatusCode == SubStatusCodes.ReadSessionNotAvailable)
{
#if !INTERNAL
// Only set the hub region processing header for single master accounts
// Set header only after the first retry attempt fails with 404/1002
if (!this.canUseMultipleWriteLocations && this.sessionTokenRetryCount >= 1)
{
this.addHubRegionProcessingOnlyHeader = true;
}
#endif
return this.ShouldRetryOnSessionNotAvailable(this.documentServiceRequest);
}
// Received 503 due to client connect timeout or Gateway
}
// Received 503 due to client connect timeout or Gateway
if (statusCode == HttpStatusCode.ServiceUnavailable)
{
return this.TryMarkEndpointUnavailableForPkRangeAndRetryOnServiceUnavailable(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ namespace Microsoft.Azure.Cosmos.SDK.EmulatorTests
using System.Threading.Tasks;
using Microsoft.Azure.Cosmos;
using Microsoft.Azure.Cosmos.Diagnostics;
using Microsoft.Azure.Cosmos.Handlers;
using Microsoft.Azure.Cosmos.Json;
using Microsoft.Azure.Cosmos.Query.Core.ExecutionContext;
using Microsoft.Azure.Cosmos.Query.Core.QueryClient;
Expand All @@ -39,7 +40,8 @@ public class CosmosItemTests : BaseCosmosClientHelper
{
private Container Container = null;
private ContainerProperties containerSettings = null;


private const string HubRegionHeader = "x-ms-cosmos-hub-region-processing-only";
private static readonly string nonPartitionItemId = "fixed-Container-Item";
private static readonly string undefinedPartitionItemId = "undefined-partition-Item";

Expand Down Expand Up @@ -4315,7 +4317,112 @@ private static async Task GivenItemAsyncWhenMissingMemberHandlingIsErrorThenExpe

JsonConvert.DefaultSettings = () => default;
}
}
}

/// <summary>
/// Answers the first two document read requests with a fabricated 404/1002
/// (ReadSessionNotAvailable) response and verifies that:
/// the hub region header is absent on the first retry (second request),
/// the read surfaces a <see cref="CosmosException"/> carrying 404/1002, and
/// exactly two document read requests were sent.
/// </summary>
/// <remarks>
/// The header check is recorded inside the HTTP callback and asserted on the test
/// thread afterwards. Throwing an assertion from inside the message handler would
/// push the failure through the client's request pipeline, where it could be wrapped
/// or handled before it reaches the test.
/// NOTE(review): this test never observes a request that carries the header, because
/// only two requests are expected to be sent — confirm that is the intended coverage.
/// </remarks>
[TestMethod]
[Owner("aavasthy")]
[Description("Forces two consecutive 404/1002 responses from the gateway and verifies ClientRetryPolicy sets the hub region header flag after the first retry fails.")]
public async Task ReadItemAsync_ShouldAddHubHeader_OnRetryAfter_404_1002()
{
    int requestCount = 0;
    int return404Count = 0;
    const int maxReturn404 = 2; // Return 404/1002 twice

    // Set by the HTTP callback when the first retry unexpectedly carries the header.
    bool hubHeaderSeenOnFirstRetry = false;

    // HTTP handler that intercepts outgoing requests
    HttpClientHandlerHelper httpHandler = new HttpClientHandlerHelper
    {
        RequestCallBack = (request, cancellationToken) =>
        {
            // Track all document read requests
            if (request.Method == HttpMethod.Get &&
                request.RequestUri != null &&
                request.RequestUri.AbsolutePath.Contains("/docs/"))
            {
                requestCount++;

                // Header should NOT be present on first retry (2nd request).
                // Only record the violation here; it is asserted after the read completes.
                if (requestCount == 2 &&
                    request.Headers.TryGetValues(HubRegionHeader, out IEnumerable<string> firstRetryValues) &&
                    firstRetryValues.Any())
                {
                    hubHeaderSeenOnFirstRetry = true;
                }

                // Return fake 404/1002 for first two requests
                if (return404Count < maxReturn404)
                {
                    return404Count++;

                    var errorResponse = new
                    {
                        code = "NotFound",
                        message = "Message: {\"Errors\":[\"Resource Not Found. Learn more: https://aka.ms/cosmosdb-tsg-not-found\"]}\r\nActivityId: " + Guid.NewGuid() + ", Request URI: " + request.RequestUri,
                        additionalErrorInfo = ""
                    };

                    HttpResponseMessage notFoundResponse = new HttpResponseMessage(HttpStatusCode.NotFound)
                    {
                        Content = new StringContent(
                            JsonConvert.SerializeObject(errorResponse),
                            Encoding.UTF8,
                            "application/json"
                        )
                    };

                    // Add the substatus header for ReadSessionNotAvailable
                    notFoundResponse.Headers.Add("x-ms-substatus", "1002");
                    notFoundResponse.Headers.Add("x-ms-activity-id", Guid.NewGuid().ToString());
                    notFoundResponse.Headers.Add("x-ms-request-charge", "1.0");

                    return Task.FromResult(notFoundResponse);
                }
            }

            // No fabricated response for this request.
            return Task.FromResult<HttpResponseMessage>(null);
        }
    };

    CosmosClientOptions clientOptions = new CosmosClientOptions
    {
        ConnectionMode = ConnectionMode.Gateway,
        ConsistencyLevel = Cosmos.ConsistencyLevel.Session,
        HttpClientFactory = () => new HttpClient(httpHandler),
        MaxRetryAttemptsOnRateLimitedRequests = 9,
        MaxRetryWaitTimeOnRateLimitedRequests = TimeSpan.FromSeconds(30)
    };

    using CosmosClient customClient = TestCommon.CreateCosmosClient(clientOptions);

    Container customContainer = customClient.GetContainer(this.database.Id, this.Container.Id);

    // Create a test item first (through the regular container, so it is not intercepted)
    ToDoActivity testItem = ToDoActivity.CreateRandomToDoActivity();
    await this.Container.CreateItemAsync(testItem, new Cosmos.PartitionKey(testItem.pk));

    try
    {
        // This should trigger 404/1002 twice
        // In single-region emulator, after first retry fails with 404/1002, it won't retry again
        await customContainer.ReadItemAsync<ToDoActivity>(
            testItem.id,
            new Cosmos.PartitionKey(testItem.pk));

        Assert.Fail("Expected CosmosException due to consecutive 404/1002 failures.");
    }
    catch (CosmosException ex)
    {
        // Expected: After first retry fails with 404/1002, single master won't retry again
        Assert.AreEqual(HttpStatusCode.NotFound, ex.StatusCode);
        Assert.AreEqual((int)SubStatusCodes.ReadSessionNotAvailable, ex.SubStatusCode);
    }

    // Asserted here, on the test thread, rather than inside the HTTP callback.
    Assert.IsFalse(hubHeaderSeenOnFirstRetry, "Header should NOT be present on first retry attempt.");

    // Verify the expected behavior:
    // 1. Initial request (requestCount = 1) fails with 404/1002
    // 2. First retry (requestCount = 2) fails with 404/1002
    // 3. No more retries because single master + no additional regions
    Assert.AreEqual(2, requestCount, $"Expected exactly 2 requests (initial + 1 retry) for single-region emulator, but got {requestCount}");
    Assert.AreEqual(2, return404Count, "Both requests should have returned 404/1002");
}

private async Task<T> AutoGenerateIdPatternTest<T>(Cosmos.PartitionKey pk, T itemWithoutId)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@ public sealed class ClientRetryPolicyTests
{
private static Uri Location1Endpoint = new Uri("https://location1.documents.azure.com");
private static Uri Location2Endpoint = new Uri("https://location2.documents.azure.com");


private const string HubRegionHeader = "x-ms-cosmos-hub-region-processing-only";
private ReadOnlyCollection<string> preferredLocations;
private AccountProperties databaseAccount;
private GlobalPartitionEndpointManager partitionKeyRangeLocationCache;
Expand Down Expand Up @@ -400,6 +401,134 @@ public async Task ClientRetryPolicy_NoRetry_MultiMaster_Write_NoPreferredLocatio
{
await this.ValidateConnectTimeoutTriggersClientRetryPolicyAsync(isReadRequest: false, useMultipleWriteLocations: true, usesPreferredLocations: false, true);
}

/// <summary>
/// Drives a <see cref="ClientRetryPolicy"/> through two consecutive 404/1002
/// (ReadSessionNotAvailable) failures and checks when the hub region header shows up.
/// Single master: the header is absent on the initial attempt and on the first retry;
/// if a later 503 is retried, that retry must carry the header.
/// Multi-master: the header must not appear on any of the attempts exercised here.
/// </summary>
[TestMethod]
[DataRow(true, true, DisplayName = "Read request on single master - Hub region header added after first retry fails")]
[DataRow(false, true, DisplayName = "Write request on single master - Hub region header added after first retry fails")]
[DataRow(true, false, DisplayName = "Read request on multi-master - Hub region header NOT added")]
[DataRow(false, false, DisplayName = "Write request on multi-master - Hub region header NOT added")]
public async Task ClientRetryPolicy_HubRegionHeader_AddedOn404_1002_BasedOnAccountType(bool isReadRequest, bool isSingleMaster)
{
    // Arrange
    const bool enableEndpointDiscovery = true;

    using GlobalEndpointManager endpointManager = this.Initialize(
        useMultipleWriteLocations: !isSingleMaster,
        enableEndpointDiscovery: enableEndpointDiscovery,
        isPreferredLocationsListEmpty: false,
        enforceSingleMasterSingleWriteLocation: isSingleMaster);

    ClientRetryPolicy retryPolicy = new ClientRetryPolicy(
        endpointManager,
        this.partitionKeyRangeLocationCache,
        new RetryOptions(),
        enableEndpointDiscovery,
        isThinClientEnabled: false);

    DocumentServiceRequest request = this.CreateRequest(isReadRequest: isReadRequest, isMasterResourceType: false);

    // Builds a simulated failure against whatever endpoint the request is routed to
    // at the moment of the call.
    DocumentClientException SimulateFailure(string message, HttpStatusCode statusCode, SubStatusCodes subStatusCode)
    {
        return new DocumentClientException(
            message: message,
            innerException: null,
            statusCode: statusCode,
            substatusCode: subStatusCode,
            requestUri: request.RequestContext.LocationEndpointToRoute,
            responseHeaders: new DictionaryNameValueCollection());
    }

    // Initial attempt: nothing has failed yet, so the header must be absent.
    retryPolicy.OnBeforeSendRequest(request);
    string[] headerValues = request.Headers.GetValues(HubRegionHeader);
    Assert.IsNull(headerValues, "Header should not exist on initial request before any 404/1002 error.");

    // First 404/1002.
    DocumentClientException firstSessionFailure = SimulateFailure(
        "Simulated 404/1002 ReadSessionNotAvailable",
        HttpStatusCode.NotFound,
        SubStatusCodes.ReadSessionNotAvailable);

    ShouldRetryResult shouldRetry = await retryPolicy.ShouldRetryAsync(firstSessionFailure, CancellationToken.None);
    Assert.IsTrue(shouldRetry.ShouldRetry, "Should retry on 404/1002.");

    // First retry: the header must still be absent.
    retryPolicy.OnBeforeSendRequest(request);
    headerValues = request.Headers.GetValues(HubRegionHeader);
    Assert.IsNull(headerValues, "Header should NOT be present on first retry attempt (before it fails).");

    // The first retry fails with 404/1002 as well.
    DocumentClientException secondSessionFailure = SimulateFailure(
        "Simulated 404/1002 ReadSessionNotAvailable on first retry",
        HttpStatusCode.NotFound,
        SubStatusCodes.ReadSessionNotAvailable);

    shouldRetry = await retryPolicy.ShouldRetryAsync(secondSessionFailure, CancellationToken.None);

    if (isSingleMaster)
    {
        // Single master gives up on 404/1002 after one failed retry.
        Assert.IsFalse(shouldRetry.ShouldRetry, "Single master should not retry again after first 404/1002 retry fails.");

        // A different error (503) may still cause a retry; if it does, that retry
        // must carry the header.
        DocumentClientException serviceUnavailableFailure = SimulateFailure(
            "Simulated 503 ServiceUnavailable",
            HttpStatusCode.ServiceUnavailable,
            SubStatusCodes.Unknown);

        shouldRetry = await retryPolicy.ShouldRetryAsync(serviceUnavailableFailure, CancellationToken.None);

        if (!shouldRetry.ShouldRetry)
        {
            // No further attempt is made, so there is no request to inspect.
            return;
        }

        retryPolicy.OnBeforeSendRequest(request);
        headerValues = request.Headers.GetValues(HubRegionHeader);
        Assert.IsNotNull(headerValues, "Header should be present on retry after 404/1002 flag was set.");
        Assert.AreEqual(1, headerValues.Length, "Header should have exactly one value.");
        Assert.AreEqual(bool.TrueString, headerValues[0], "Header value should be 'True'.");
        return;
    }

    // Multi-master keeps retrying across regions and must never get the header.
    Assert.IsTrue(shouldRetry.ShouldRetry, "Multi-master should continue retrying on 404/1002.");

    for (int retryAttempt = 2; retryAttempt <= 3; retryAttempt++)
    {
        if (!shouldRetry.ShouldRetry)
        {
            // The decision is only updated inside this loop, so once the policy
            // stops retrying there is nothing further to check.
            break;
        }

        retryPolicy.OnBeforeSendRequest(request);
        headerValues = request.Headers.GetValues(HubRegionHeader);
        Assert.IsNull(headerValues, $"Header should NOT be present on retry attempt {retryAttempt} for multi-master account.");

        // Even attempts fail with 503, odd attempts with 404/1002.
        bool failWithServiceUnavailable = retryAttempt % 2 == 0;
        DocumentClientException nextFailure = SimulateFailure(
            $"Simulated error on retry {retryAttempt}",
            failWithServiceUnavailable ? HttpStatusCode.ServiceUnavailable : HttpStatusCode.NotFound,
            failWithServiceUnavailable ? SubStatusCodes.Unknown : SubStatusCodes.ReadSessionNotAvailable);

        shouldRetry = await retryPolicy.ShouldRetryAsync(nextFailure, CancellationToken.None);
    }
}

private async Task ValidateConnectTimeoutTriggersClientRetryPolicyAsync(
bool isReadRequest,
Expand Down
Loading