Skip to content

Commit ee83a58

Browse files
kundadebdattaNaluTripicianFabianMeiswinkelPilchiedibahlfi
authored
[Hotfix] CrossRegionHedging: Fixes NullReference Exception Bug (#4956)
# Pull Request Template ## Description This PR cherry-picks the following commits: - #4706 - #4869 ## Type of change Please delete options that are not relevant. - [x] Bug fix (non-breaking change which fixes an issue) - [x] New feature (non-breaking change which adds functionality) ## Closing issues To automatically close an issue: closes #IssueNumber --------- Co-authored-by: Nalu Tripician <27316859+NaluTripician@users.noreply.github.com> Co-authored-by: Fabian Meiswinkel <fabian@meiswinkel.com> Co-authored-by: Kevin Pilch <kevinpi@microsoft.com> Co-authored-by: dibahlfi <106994927+dibahlfi@users.noreply.github.com> Co-authored-by: Arooshi Avasthy <113193425+aavasthy@users.noreply.github.com> Co-authored-by: Kiran Kumar Kolli <kirankk@microsoft.com>
1 parent e9fa019 commit ee83a58

10 files changed

Lines changed: 657 additions & 170 deletions

File tree

Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,7 @@ public void OnBeforeSendRequest(DocumentServiceRequest request)
193193
this.canUseMultipleWriteLocations = this.globalEndpointManager.CanUseMultipleWriteLocations(request);
194194
this.documentServiceRequest = request;
195195
this.isMultiMasterWriteRequest = !this.isReadRequest
196-
&& (this.globalEndpointManager?.CanSupportMultipleWriteLocations(request) ?? false);
196+
&& (this.globalEndpointManager?.CanSupportMultipleWriteLocations(request.ResourceType, request.OperationType) ?? false);
197197

198198
// clear previous location-based routing directive
199199
request.RequestContext.ClearRouteToLocation();

Microsoft.Azure.Cosmos/src/DocumentClient.cs

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,6 @@ internal partial class DocumentClient : IDisposable, IAuthorizationTokenProvider
115115

116116
private readonly bool IsLocalQuorumConsistency = false;
117117
private readonly bool isReplicaAddressValidationEnabled;
118-
private readonly AvailabilityStrategy availabilityStrategy;
119118

120119
//Fault Injection
121120
private readonly IChaosInterceptorFactory chaosInterceptorFactory;
@@ -441,7 +440,6 @@ internal DocumentClient(Uri serviceEndpoint,
441440
/// <param name="cosmosClientId"></param>
442441
/// <param name="remoteCertificateValidationCallback">This delegate responsible for validating the third party certificate. </param>
443442
/// <param name="cosmosClientTelemetryOptions">This is distributed tracing flag</param>
444-
/// <param name="availabilityStrategy">This is the availability strategy for the client</param>"
445443
/// <param name="chaosInterceptorFactory">This is the chaos interceptor used for fault injection</param>
446444
/// <remarks>
447445
/// The service endpoint can be obtained from the Azure Management Portal.
@@ -471,7 +469,6 @@ internal DocumentClient(Uri serviceEndpoint,
471469
string cosmosClientId = null,
472470
RemoteCertificateValidationCallback remoteCertificateValidationCallback = null,
473471
CosmosClientTelemetryOptions cosmosClientTelemetryOptions = null,
474-
AvailabilityStrategy availabilityStrategy = null,
475472
IChaosInterceptorFactory chaosInterceptorFactory = null)
476473
{
477474
if (sendingRequestEventArgs != null)
@@ -495,7 +492,6 @@ internal DocumentClient(Uri serviceEndpoint,
495492
this.transportClientHandlerFactory = transportClientHandlerFactory;
496493
this.IsLocalQuorumConsistency = isLocalQuorumConsistency;
497494
this.initTaskCache = new AsyncCacheNonBlocking<string, bool>(cancellationToken: this.cancellationTokenSource.Token);
498-
this.availabilityStrategy = availabilityStrategy;
499495
this.chaosInterceptorFactory = chaosInterceptorFactory;
500496
this.chaosInterceptor = chaosInterceptorFactory?.CreateInterceptor(this);
501497

Microsoft.Azure.Cosmos/src/Resource/ClientContextCore.cs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,6 @@ internal static CosmosClientContext Create(
8484
cosmosClientId: cosmosClient.Id,
8585
remoteCertificateValidationCallback: ClientContextCore.SslCustomValidationCallBack(clientOptions.GetServerCertificateCustomValidationCallback()),
8686
cosmosClientTelemetryOptions: clientOptions.CosmosClientTelemetryOptions,
87-
availabilityStrategy: clientOptions.AvailabilityStrategy,
8887
chaosInterceptorFactory: clientOptions.ChaosInterceptorFactory);
8988

9089
return ClientContextCore.Create(

Microsoft.Azure.Cosmos/src/Routing/AvailabilityStrategy/AvailabilityStrategy.cs

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,11 +39,17 @@ internal static AvailabilityStrategy DisabledStrategy()
3939
/// </summary>
4040
/// <param name="threshold"> how long before SDK begins hedging</param>
4141
/// <param name="thresholdStep">Period of time between first hedge and next hedging attempts</param>
42+
/// <param name="enableMultiWriteRegionHedge">Whether hedging for write requests on accounts with multi-region writes are enabled
43+
/// Note that this does come with the caveat that there will be more 409 / 412 errors thrown by the SDK.
44+
/// This is expected and applications that adopt this feature should be prepared to handle these exceptions.
45+
/// Application might not be able to be deterministic on Create vs Replace in the case of Upsert Operations</param>
4246
/// <returns>something</returns>
43-
public static AvailabilityStrategy CrossRegionHedgingStrategy(TimeSpan threshold,
44-
TimeSpan? thresholdStep)
47+
public static AvailabilityStrategy CrossRegionHedgingStrategy(
48+
TimeSpan threshold,
49+
TimeSpan? thresholdStep,
50+
bool enableMultiWriteRegionHedge = false)
4551
{
46-
return new CrossRegionHedgingAvailabilityStrategy(threshold, thresholdStep);
52+
return new CrossRegionHedgingAvailabilityStrategy(threshold, thresholdStep, enableMultiWriteRegionHedge);
4753
}
4854
}
4955
}

Microsoft.Azure.Cosmos/src/Routing/AvailabilityStrategy/CrossRegionHedgingAvailabilityStrategy.cs

Lines changed: 30 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -36,14 +36,24 @@ internal class CrossRegionHedgingAvailabilityStrategy : AvailabilityStrategyInte
3636
/// </summary>
3737
public TimeSpan ThresholdStep { get; private set; }
3838

39+
/// <summary>
40+
/// Whether hedging for write requests on accounts with multi-region writes is enabled.
41+
/// Note that this does come with the caveat that there will be more 409 / 412 errors thrown by the SDK.
42+
/// This is expected and applications that adopt this feature should be prepared to handle these exceptions.
43+
/// Application might not be able to be deterministic on Create vs Replace in the case of Upsert Operations
44+
/// </summary>
45+
public bool EnableMultiWriteRegionHedge { get; private set; }
46+
3947
/// <summary>
4048
/// Constructor for hedging availability strategy
4149
/// </summary>
4250
/// <param name="threshold"></param>
4351
/// <param name="thresholdStep"></param>
52+
/// <param name="enableMultiWriteRegionHedge"></param>
4453
public CrossRegionHedgingAvailabilityStrategy(
4554
TimeSpan threshold,
46-
TimeSpan? thresholdStep)
55+
TimeSpan? thresholdStep,
56+
bool enableMultiWriteRegionHedge = false)
4757
{
4858
if (threshold <= TimeSpan.Zero)
4959
{
@@ -57,6 +67,7 @@ public CrossRegionHedgingAvailabilityStrategy(
5767

5868
this.Threshold = threshold;
5969
this.ThresholdStep = thresholdStep ?? TimeSpan.FromMilliseconds(-1);
70+
this.EnableMultiWriteRegionHedge = enableMultiWriteRegionHedge;
6071
}
6172

6273
/// <inheritdoc/>
@@ -70,18 +81,24 @@ internal override bool Enabled()
7081
/// This availability strategy can only be used if the request is a read-only request on a document request.
7182
/// </summary>
7283
/// <param name="request"></param>
84+
/// <param name="client"></param>
7385
/// <returns>whether the request should be a hedging request.</returns>
74-
internal bool ShouldHedge(RequestMessage request)
86+
internal bool ShouldHedge(RequestMessage request, CosmosClient client)
7587
{
7688
//Only use availability strategy for document point operations
7789
if (request.ResourceType != ResourceType.Document)
7890
{
7991
return false;
8092
}
8193

82-
//check to see if it is a not a read-only request
94+
//check to see if it is a not a read-only request/ if multimaster writes are enabled
8395
if (!OperationTypeExtensions.IsReadOperation(request.OperationType))
8496
{
97+
if (this.EnableMultiWriteRegionHedge
98+
&& client.DocumentClient.GlobalEndpointManager.CanSupportMultipleWriteLocations(request.ResourceType, request.OperationType))
99+
{
100+
return true;
101+
}
85102
return false;
86103
}
87104

@@ -102,7 +119,7 @@ internal override async Task<ResponseMessage> ExecuteAvailabilityStrategyAsync(
102119
RequestMessage request,
103120
CancellationToken cancellationToken)
104121
{
105-
if (!this.ShouldHedge(request)
122+
if (!this.ShouldHedge(request, client)
106123
|| client.DocumentClient.GlobalEndpointManager.ReadEndpoints.Count == 1)
107124
{
108125
return await sender(request, cancellationToken);
@@ -113,7 +130,7 @@ internal override async Task<ResponseMessage> ExecuteAvailabilityStrategyAsync(
113130
using (CancellationTokenSource cancellationTokenSource = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken))
114131
{
115132
using (CloneableStream clonedBody = (CloneableStream)(request.Content == null
116-
? null//new CloneableStream(new MemoryStream())
133+
? null
117134
: await StreamExtension.AsClonableStreamAsync(request.Content)))
118135
{
119136
IReadOnlyCollection<string> hedgeRegions = client.DocumentClient.GlobalEndpointManager
@@ -143,7 +160,8 @@ internal override async Task<ResponseMessage> ExecuteAvailabilityStrategyAsync(
143160
request,
144161
hedgeRegions.ElementAt(requestNumber),
145162
cancellationToken,
146-
cancellationTokenSource);
163+
cancellationTokenSource,
164+
trace);
147165

148166
requestTasks.Add(primaryRequest);
149167
}
@@ -262,7 +280,8 @@ private async Task<HedgingResponse> CloneAndSendAsync(
262280
clonedRequest,
263281
region,
264282
cancellationToken,
265-
cancellationTokenSource);
283+
cancellationTokenSource,
284+
trace);
266285
}
267286
}
268287

@@ -271,7 +290,8 @@ private async Task<HedgingResponse> RequestSenderAndResultCheckAsync(
271290
RequestMessage request,
272291
string hedgedRegion,
273292
CancellationToken cancellationToken,
274-
CancellationTokenSource cancellationTokenSource)
293+
CancellationTokenSource cancellationTokenSource,
294+
ITrace trace)
275295
{
276296
try
277297
{
@@ -288,9 +308,9 @@ private async Task<HedgingResponse> RequestSenderAndResultCheckAsync(
288308

289309
return new HedgingResponse(false, response, hedgedRegion);
290310
}
291-
catch (OperationCanceledException) when (cancellationTokenSource.IsCancellationRequested)
311+
catch (OperationCanceledException oce ) when (cancellationTokenSource.IsCancellationRequested)
292312
{
293-
return new HedgingResponse(false, null, hedgedRegion);
313+
throw new CosmosOperationCanceledException(oce, trace);
294314
}
295315
catch (Exception ex)
296316
{

Microsoft.Azure.Cosmos/src/Routing/GlobalEndpointManager.cs

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -546,14 +546,17 @@ public virtual async Task RefreshLocationAsync(bool forceRefresh = false)
546546
/// Determines whether the current configuration and state of the service allow for supporting multiple write locations.
547547
/// This method returns True is the AvailableWriteLocations in LocationCache is more than 1. Otherwise, it returns False.
548548
/// </summary>
549-
/// <param name="request">The document service request for which the write location support is being evaluated.</param>
550-
/// <returns>A boolean flag indicating if the available write locations are more than one.</returns>
551-
public bool CanSupportMultipleWriteLocations(DocumentServiceRequest request)
549+
/// <param name="resourceType"> resource type of the request</param>
550+
/// <param name="operationType"> operation type of the request</param>
551+
/// <returns>A boolean flag indicating if the available write locations are more than one.</returns>
552+
public bool CanSupportMultipleWriteLocations(
553+
ResourceType resourceType,
554+
OperationType operationType)
552555
{
553556
return this.locationCache.CanUseMultipleWriteLocations()
554557
&& this.locationCache.GetAvailableAccountLevelWriteLocations()?.Count > 1
555-
&& (request.ResourceType == ResourceType.Document ||
556-
(request.ResourceType == ResourceType.StoredProcedure && request.OperationType == OperationType.Execute));
558+
&& (resourceType == ResourceType.Document ||
559+
(resourceType == ResourceType.StoredProcedure && operationType == OperationType.Execute));
557560
}
558561

559562
#pragma warning disable VSTHRD100 // Avoid async void methods

Microsoft.Azure.Cosmos/src/Routing/IGlobalEndpointManager.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,6 @@ internal interface IGlobalEndpointManager : IDisposable
3737

3838
ReadOnlyDictionary<string, Uri> GetAvailableReadEndpointsByLocation();
3939

40-
bool CanSupportMultipleWriteLocations(DocumentServiceRequest request);
40+
bool CanSupportMultipleWriteLocations(ResourceType resourceType, OperationType operationType);
4141
}
4242
}

0 commit comments

Comments
 (0)