Skip to content

Commit 6eb976b

Browse files
[Internal] Thin Client Integration: Adds support for Per Partition Automatic Failover and Per Partition Circuit Breaker. (#5258)
# Pull Request Template

## Description

This PR introduces partition-level failover (Per Partition Automatic Failover for writes, aka PPAF, and Per Partition Circuit Breaker, aka PPCB) for `ThinClient` mode. The changes are mainly in `ThinClientStoreModel`, which leverages the `GlobalPartitionEndpointManager` to apply the partition-level override for the next account or preferred regions.

A thorough DR drill was conducted for PPAF on the thin proxy accounts, for both Strong and Session consistency levels. Below are the results captured after the DR drill:

- **Account Name:** dkunda-tc-strong-account-0701
- **Consistency Level:** Strong
- **Environment:** Test14
- **Drill Start Time:** 2025-07-05T18:49:17.414Z
- **Drill End Time:** 2025-07-05T23:00:17.414Z
- **Accounts Hosted In:** Compute Gateway / Thin Client Federation

## Write Failover Analysis

<img width="979" height="240" alt="image" src="https://github.com/user-attachments/assets/e24611a5-2447-4335-b62f-453681d8ce21" />

_[PPAF: The above image shows the write workload successfully failing over from West US to East Asia and failing back on the reverse path.]_

## Read Failover Analysis

<img width="979" height="240" alt="image" src="https://github.com/user-attachments/assets/1c4a6e29-40ae-4c98-bbfb-6b4a7906b814" />

_[PPCB: The above image shows the read workload successfully failing over from West US to East Asia and failing back on the reverse path.]_

## Query Failover Analysis

<img width="979" height="240" alt="image" src="https://github.com/user-attachments/assets/0d04bfc3-2f81-4df7-a989-8ab1b287f062" />

_[PPCB: The above image shows the query workload successfully failing over from West US to East Asia and failing back on the reverse path.]_

## Type of change

Please delete options that are not relevant.
- [X] New feature (non-breaking change which adds functionality)

## Closing issues

To automatically close an issue: closes #5247

---------

Co-authored-by: Debdatta Kunda <dkunda@microsoft.com>
Co-authored-by: Debdatta Kunda <87335885+kundadebdatta@users.noreply.github.com>
1 parent f869cce commit 6eb976b

8 files changed

Lines changed: 488 additions & 24 deletions

File tree

Microsoft.Azure.Cosmos/src/DocumentClient.cs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1115,7 +1115,8 @@ private async Task<bool> GetInitializationTaskAsync(IStoreClientFactory storeCli
11151115
(Cosmos.ConsistencyLevel)this.accountServiceConfiguration.DefaultConsistencyLevel,
11161116
this.eventSource,
11171117
this.serializerSettings,
1118-
this.httpClient);
1118+
this.httpClient,
1119+
isPartitionLevelFailoverEnabled: this.ConnectionPolicy.EnablePartitionLevelFailover || this.ConnectionPolicy.EnablePartitionLevelCircuitBreaker);
11191120

11201121
thinClientStoreModel.SetCaches(this.partitionKeyRangeCache, this.collectionCache);
11211122

Microsoft.Azure.Cosmos/src/GatewayStoreModel.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -368,7 +368,7 @@ internal static async Task<Tuple<bool, string>> TryResolveSessionTokenAsync(
368368
return new Tuple<bool, string>(false, null);
369369
}
370370

371-
private static async Task<Tuple<bool, PartitionKeyRange>> TryResolvePartitionKeyRangeAsync(
371+
protected static async Task<Tuple<bool, PartitionKeyRange>> TryResolvePartitionKeyRangeAsync(
372372
DocumentServiceRequest request,
373373
ISessionContainer sessionContainer,
374374
PartitionKeyRangeCache partitionKeyRangeCache,

Microsoft.Azure.Cosmos/src/Routing/GlobalPartitionEndpointManagerCore.cs

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,9 @@ public override bool TryMarkEndpointUnavailableForPartitionKeyRange(
169169
{
170170
// For multi master write accounts, since all the regions are treated as write regions, the next locations to fail over
171171
// will be the preferred read regions that are configured in the application preferred regions in the CosmosClientOptions.
172-
ReadOnlyCollection<Uri> nextLocations = this.globalEndpointManager.ReadEndpoints;
172+
ReadOnlyCollection<Uri> nextLocations = ConfigurationManager.IsThinClientEnabled(defaultValue: false) && ThinClientStoreModel.IsOperationSupportedByThinClient(request)
173+
? this.globalEndpointManager.ThinClientReadEndpoints
174+
: this.globalEndpointManager.ReadEndpoints;
173175

174176
return this.TryAddOrUpdatePartitionFailoverInfoAndMoveToNextLocation(
175177
partitionKeyRange,
@@ -181,7 +183,9 @@ public override bool TryMarkEndpointUnavailableForPartitionKeyRange(
181183
else if (this.IsRequestEligibleForPerPartitionAutomaticFailover(request))
182184
{
183185
// For any single master write accounts, the next locations to fail over will be the read regions configured at the account level.
184-
ReadOnlyCollection<Uri> nextLocations = this.globalEndpointManager.AccountReadEndpoints;
186+
ReadOnlyCollection<Uri> nextLocations = ConfigurationManager.IsThinClientEnabled(defaultValue: false) && ThinClientStoreModel.IsOperationSupportedByThinClient(request)
187+
? this.globalEndpointManager.ThinClientReadEndpoints
188+
: this.globalEndpointManager.AccountReadEndpoints;
185189

186190
return this.TryAddOrUpdatePartitionFailoverInfoAndMoveToNextLocation(
187191
partitionKeyRange,

Microsoft.Azure.Cosmos/src/ThinClientStoreClient.cs

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@ namespace Microsoft.Azure.Cosmos
1212
using System.Threading.Tasks;
1313
using Microsoft.Azure.Cosmos.Core.Trace;
1414
using Microsoft.Azure.Cosmos.Routing;
15-
using Microsoft.Azure.Cosmos.Tracing;
1615
using Microsoft.Azure.Documents;
1716
using Newtonsoft.Json;
1817
using static Microsoft.Azure.Cosmos.ThinClientTransportSerializer;
@@ -23,17 +22,21 @@ namespace Microsoft.Azure.Cosmos
2322
/// </summary>
2423
internal class ThinClientStoreClient : GatewayStoreClient
2524
{
25+
private readonly bool isPartitionLevelFailoverEnabled;
2626
private readonly ObjectPool<BufferProviderWrapper> bufferProviderWrapperPool;
2727

2828
public ThinClientStoreClient(
2929
CosmosHttpClient httpClient,
3030
ICommunicationEventSource eventSource,
31-
JsonSerializerSettings serializerSettings = null)
31+
JsonSerializerSettings serializerSettings = null,
32+
bool isPartitionLevelFailoverEnabled = false)
3233
: base(httpClient,
3334
eventSource,
34-
serializerSettings)
35+
serializerSettings,
36+
isPartitionLevelFailoverEnabled)
3537
{
36-
this.bufferProviderWrapperPool = new ObjectPool<BufferProviderWrapper>(() => new BufferProviderWrapper());
38+
this.bufferProviderWrapperPool = new ObjectPool<BufferProviderWrapper>(() => new BufferProviderWrapper());
39+
this.isPartitionLevelFailoverEnabled = isPartitionLevelFailoverEnabled;
3740
}
3841

3942
public override async Task<DocumentServiceResponse> InvokeAsync(

Microsoft.Azure.Cosmos/src/ThinClientStoreModel.cs

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ namespace Microsoft.Azure.Cosmos
1919
/// </summary>
2020
internal class ThinClientStoreModel : GatewayStoreModel
2121
{
22+
private readonly GlobalPartitionEndpointManager globalPartitionEndpointManager;
23+
private readonly bool isPartitionLevelFailoverEnabled;
2224
private ThinClientStoreClient thinClientStoreClient;
2325

2426
public ThinClientStoreModel(
@@ -28,19 +30,27 @@ public ThinClientStoreModel(
2830
ConsistencyLevel defaultConsistencyLevel,
2931
DocumentClientEventSource eventSource,
3032
JsonSerializerSettings serializerSettings,
31-
CosmosHttpClient httpClient)
33+
CosmosHttpClient httpClient,
34+
bool isPartitionLevelFailoverEnabled = false)
3235
: base(endpointManager,
3336
sessionContainer,
3437
defaultConsistencyLevel,
3538
eventSource,
3639
serializerSettings,
3740
httpClient,
38-
globalPartitionEndpointManager)
41+
globalPartitionEndpointManager,
42+
isPartitionLevelFailoverEnabled)
3943
{
4044
this.thinClientStoreClient = new ThinClientStoreClient(
4145
httpClient,
4246
eventSource,
43-
serializerSettings);
47+
serializerSettings,
48+
isPartitionLevelFailoverEnabled);
49+
50+
this.isPartitionLevelFailoverEnabled = isPartitionLevelFailoverEnabled;
51+
this.globalPartitionEndpointManager = globalPartitionEndpointManager;
52+
this.globalPartitionEndpointManager.SetBackgroundConnectionPeriodicRefreshTask(
53+
base.MarkEndpointsToHealthyAsync);
4454
}
4555

4656
public override async Task<DocumentServiceResponse> ProcessMessageAsync(
@@ -63,14 +73,30 @@ await GatewayStoreModel.ApplySessionTokenAsync(
6373
DocumentServiceResponse response;
6474
try
6575
{
66-
Uri physicalAddress = ThinClientStoreClient.IsFeedRequest(request.OperationType) ? base.GetFeedUri(request) : base.GetEntityUri(request);
6776
if (request.ResourceType.Equals(ResourceType.Document) && base.endpointManager.TryGetLocationForGatewayDiagnostics(
6877
request.RequestContext.LocationEndpointToRoute,
6978
out string regionName))
7079
{
7180
request.RequestContext.RegionName = regionName;
7281
}
7382

83+
// This is applicable for both per partition automatic failover and per partition circuit breaker.
84+
if (this.isPartitionLevelFailoverEnabled
85+
&& !ReplicatedResourceClient.IsMasterResource(request.ResourceType)
86+
&& request.ResourceType.IsPartitioned())
87+
{
88+
(bool isSuccess, PartitionKeyRange partitionKeyRange) = await GatewayStoreModel.TryResolvePartitionKeyRangeAsync(
89+
request: request,
90+
sessionContainer: this.sessionContainer,
91+
partitionKeyRangeCache: this.partitionKeyRangeCache,
92+
clientCollectionCache: this.clientCollectionCache,
93+
refreshCache: false);
94+
95+
request.RequestContext.ResolvedPartitionKeyRange = partitionKeyRange;
96+
this.globalPartitionEndpointManager.TryAddPartitionLevelLocationOverride(request);
97+
}
98+
99+
Uri physicalAddress = ThinClientStoreClient.IsFeedRequest(request.OperationType) ? base.GetFeedUri(request) : base.GetEntityUri(request);
74100
AccountProperties properties = await this.GetDatabaseAccountPropertiesAsync();
75101
response = await this.thinClientStoreClient.InvokeAsync(
76102
request,

0 commit comments

Comments
 (0)