Skip to content

Commit 28af01d

Browse files
authored
Merge branch 'master' into users/aavasthy/AADAudience
2 parents f85b3fd + 80924ef commit 28af01d

4 files changed

Lines changed: 88 additions & 78 deletions

File tree

Microsoft.Azure.Cosmos/src/Handler/AbstractRetryHandler.cs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,10 @@ public override async Task<ResponseMessage> SendAsync(
5353

5454
throw;
5555
}
56+
catch (OperationCanceledException ex)
57+
{
58+
throw new CosmosOperationCanceledException(ex, request.Trace);
59+
}
5660
finally
5761
{
5862
request.OnBeforeSendRequestActions -= retryPolicyInstance.OnBeforeSendRequest;

Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.EmulatorTests/ClientConfigurationDiagnosticTest.cs

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -154,10 +154,9 @@ public async Task VerifyDiagnosticsOrderTest()
154154
}
155155
catch (CosmosOperationCanceledException oce)
156156
{
157-
IReadOnlyList<ITrace> children = ((CosmosTraceDiagnostics)oce.Diagnostics).Value.Children;
158-
ITrace exceptionChild = children[^1];
159-
Assert.AreEqual("CosmosOperationCanceledException", exceptionChild.Name);
160-
Assert.IsNotNull(exceptionChild.Data["Operation Cancelled Exception"]);
157+
//check that the exception child exists in the trace diagnostics
158+
Assert.IsTrue(oce.Diagnostics.ToString().Contains("CosmosOperationCanceledException"));
159+
Assert.IsTrue(oce.Diagnostics.ToString().Contains("Operation Cancelled Exception"));
161160
}
162161
}
163162
}

Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.EmulatorTests/ClientTests.cs

Lines changed: 71 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -1027,7 +1027,7 @@ public async Task CreateItemDuringTimeoutTest()
10271027
.Build(),
10281028
result:
10291029
FaultInjectionResultBuilder.GetResultBuilder(FaultInjectionServerErrorType.SendDelay)
1030-
.WithDelay(TimeSpan.FromSeconds(100))
1030+
.WithDelay(TimeSpan.FromSeconds(20))
10311031
.Build())
10321032
.Build();
10331033

@@ -1039,7 +1039,7 @@ public async Task CreateItemDuringTimeoutTest()
10391039
{
10401040
ConsistencyLevel = Cosmos.ConsistencyLevel.Session,
10411041
FaultInjector = faultInjector,
1042-
RequestTimeout = TimeSpan.FromSeconds(2)
1042+
RequestTimeout = TimeSpan.FromSeconds(1)
10431043

10441044
};
10451045

@@ -1049,11 +1049,13 @@ public async Task CreateItemDuringTimeoutTest()
10491049
CosmosClient cosmosClient = TestCommon.CreateCosmosClient(clientOptions: cosmosClientOptions);
10501050

10511051
db = await cosmosClient.CreateDatabaseIfNotExistsAsync("TimeoutFaultTest");
1052-
Container container = await db.CreateContainerIfNotExistsAsync("TimeoutFaultContainer", "/pk");
1053-
1054-
// Act.
1055-
// Simulate a aggressive timeout scenario by performing 3 writes which will all timeout due to fault injection rule.
1056-
for (int i = 0; i < 3; i++)
1052+
Container container = await db.CreateContainerIfNotExistsAsync("TimeoutFaultContainer", "/pk");
1053+
1054+
bool isTimeoutExceptionThrown = false;
1055+
1056+
// Act.
1057+
// Simulate an aggressive timeout scenario by performing 2 writes, which will all time out due to the fault injection rule.
1058+
for (int i = 0; i < 2; i++)
10571059
{
10581060
try
10591061
{
@@ -1062,73 +1064,69 @@ public async Task CreateItemDuringTimeoutTest()
10621064
}
10631065
catch (CosmosException exx)
10641066
{
1065-
Assert.AreEqual(HttpStatusCode.RequestTimeout, exx.StatusCode);
1067+
Assert.AreEqual(HttpStatusCode.RequestTimeout, exx.StatusCode);
1068+
isTimeoutExceptionThrown = true;
10661069
}
1067-
}
1068-
1069-
//Assert that the old channel that is now made unhealthy by the timeouts and a new healthy channel is available for next requests.
1070-
1071-
1072-
// Get all the channels that are under TransportClient -> ChannelDictionary -> Channels.
1073-
IStoreClientFactory factory = (IStoreClientFactory)cosmosClient.DocumentClient.GetType()
1074-
.GetField("storeClientFactory", BindingFlags.NonPublic | BindingFlags.Instance)
1075-
.GetValue(cosmosClient.DocumentClient);
1076-
StoreClientFactory storeClientFactory = (StoreClientFactory)factory;
1077-
1078-
TransportClient client = (TransportClient)storeClientFactory.GetType()
1079-
.GetField("transportClient", BindingFlags.NonPublic | BindingFlags.Instance)
1080-
.GetValue(storeClientFactory);
1081-
Documents.Rntbd.TransportClient transportClient = (Documents.Rntbd.TransportClient)client;
1082-
1083-
Documents.Rntbd.ChannelDictionary channelDict = (Documents.Rntbd.ChannelDictionary)transportClient.GetType()
1084-
.GetField("channelDictionary", BindingFlags.NonPublic | BindingFlags.Instance)
1085-
.GetValue(transportClient);
1086-
ConcurrentDictionary<Documents.Rntbd.ServerKey, Documents.Rntbd.IChannel> allChannels = (ConcurrentDictionary<Documents.Rntbd.ServerKey, Documents.Rntbd.IChannel>)channelDict.GetType()
1087-
.GetField("channels", BindingFlags.NonPublic | BindingFlags.Instance)
1088-
.GetValue(channelDict);
1089-
1090-
//Assert that the old channel that is now made unhealthy by the timeouts.
1091-
//Get the channel by channelDict -> LoadBalancingChannel -> LoadBalancingPartition -> LbChannelState -> IChannel.
1092-
Documents.Rntbd.LoadBalancingChannel loadBalancingUnhealthyChannel = (Documents.Rntbd.LoadBalancingChannel)allChannels[allChannels.Keys.ElementAt(1)];
1093-
Documents.Rntbd.LoadBalancingPartition loadBalancingPartitionUnHealthy = (Documents.Rntbd.LoadBalancingPartition)loadBalancingUnhealthyChannel.GetType()
1094-
.GetField("singlePartition", BindingFlags.NonPublic | BindingFlags.Instance)
1095-
.GetValue(loadBalancingUnhealthyChannel);
1096-
1097-
Assert.IsNotNull(loadBalancingPartitionUnHealthy);
1098-
1099-
List<Documents.Rntbd.LbChannelState> openChannelsUnhealthy = (List<Documents.Rntbd.LbChannelState>)loadBalancingPartitionUnHealthy.GetType()
1100-
.GetField("openChannels", BindingFlags.NonPublic | BindingFlags.Instance)
1101-
.GetValue(loadBalancingPartitionUnHealthy);
1102-
Assert.AreEqual(1, openChannelsUnhealthy.Count);
1103-
1104-
foreach (Documents.Rntbd.LbChannelState channelState in openChannelsUnhealthy)
1105-
{
1106-
Documents.Rntbd.IChannel channel = (Documents.Rntbd.IChannel)openChannelsUnhealthy[0].GetType()
1107-
.GetField("channel", BindingFlags.NonPublic | BindingFlags.Instance)
1108-
.GetValue(channelState);
1109-
Assert.IsFalse(channel.Healthy);
1110-
}
1111-
1112-
//Assert that the new channel which is healthy. Picking the first channel from the allChannels dictionary as the new channel.
1113-
Documents.Rntbd.LoadBalancingChannel loadBalancingChannel = (Documents.Rntbd.LoadBalancingChannel)allChannels[allChannels.Keys.First()];
1114-
Documents.Rntbd.LoadBalancingPartition loadBalancingPartition = (Documents.Rntbd.LoadBalancingPartition)loadBalancingChannel.GetType()
1115-
.GetField("singlePartition", BindingFlags.NonPublic | BindingFlags.Instance)
1116-
.GetValue(loadBalancingChannel);
1117-
1118-
Assert.IsNotNull(loadBalancingPartition);
1119-
1120-
List<Documents.Rntbd.LbChannelState> openChannels = (List<Documents.Rntbd.LbChannelState>)loadBalancingPartition.GetType()
1121-
.GetField("openChannels", BindingFlags.NonPublic | BindingFlags.Instance)
1122-
.GetValue(loadBalancingPartition);
1123-
Assert.AreEqual(1, openChannels.Count);
1124-
1125-
foreach (Documents.Rntbd.LbChannelState channelState in openChannels)
1126-
{
1127-
Documents.Rntbd.IChannel channel = (Documents.Rntbd.IChannel)openChannels[0].GetType()
1128-
.GetField("channel", BindingFlags.NonPublic | BindingFlags.Instance)
1129-
.GetValue(channelState);
1130-
Assert.IsTrue(channel.Healthy);
1131-
}
1070+
}
1071+
1072+
Assert.IsTrue(isTimeoutExceptionThrown, "Timeout exception should be thrown for all the 3 writes due to fault injection rule.");
1073+
1074+
//Assert that the old channel has been made unhealthy by the timeouts and that a new healthy channel is available for subsequent requests.
1075+
1076+
// Get all the channels that are under TransportClient -> ChannelDictionary -> Channels.
1077+
IStoreClientFactory storeClientFactAbstract = (IStoreClientFactory)cosmosClient.DocumentClient.GetType()
1078+
.GetField("storeClientFactory", BindingFlags.NonPublic | BindingFlags.Instance)
1079+
.GetValue(cosmosClient.DocumentClient);
1080+
StoreClientFactory storeClientFactory = (StoreClientFactory)storeClientFactAbstract;
1081+
1082+
TransportClient transportClient = (TransportClient)storeClientFactory.GetType()
1083+
.GetField("transportClient", BindingFlags.NonPublic | BindingFlags.Instance)
1084+
.GetValue(storeClientFactory);
1085+
1086+
Documents.Rntbd.TransportClient rntbdTransportClient = (Documents.Rntbd.TransportClient)transportClient;
1087+
1088+
Documents.Rntbd.ChannelDictionary channelDict = (Documents.Rntbd.ChannelDictionary)rntbdTransportClient.GetType()
1089+
.GetField("channelDictionary", BindingFlags.NonPublic | BindingFlags.Instance)
1090+
.GetValue(rntbdTransportClient);
1091+
ConcurrentDictionary<Documents.Rntbd.ServerKey, Documents.Rntbd.IChannel> allChannels = (ConcurrentDictionary<Documents.Rntbd.ServerKey, Documents.Rntbd.IChannel>)channelDict.GetType()
1092+
.GetField("channels", BindingFlags.NonPublic | BindingFlags.Instance)
1093+
.GetValue(channelDict);
1094+
1095+
Assert.IsTrue(allChannels.Count > 1, "There should be at least 2 channels, one healthy and one unhealthy channel.");
1096+
1097+
bool unHealthyChannelFound = false;
1098+
bool healthyChannelFound = false;
1099+
foreach (Documents.Rntbd.ServerKey ch in allChannels.Keys)
1100+
{
1101+
Documents.Rntbd.LoadBalancingChannel lbChannel = (Documents.Rntbd.LoadBalancingChannel)allChannels[ch];
1102+
1103+
Documents.Rntbd.LoadBalancingPartition lbPartition = (Documents.Rntbd.LoadBalancingPartition)lbChannel.GetType()
1104+
.GetField("singlePartition", BindingFlags.NonPublic | BindingFlags.Instance)
1105+
.GetValue(lbChannel);
1106+
1107+
Assert.IsNotNull(lbPartition);
1108+
List<Documents.Rntbd.LbChannelState> openChs = (List<Documents.Rntbd.LbChannelState>)lbPartition.GetType()
1109+
.GetField("openChannels", BindingFlags.NonPublic | BindingFlags.Instance)
1110+
.GetValue(lbPartition);
1111+
1112+
foreach (Documents.Rntbd.LbChannelState channelState in openChs)
1113+
{
1114+
Documents.Rntbd.IChannel channel = (Documents.Rntbd.IChannel)channelState.GetType()
1115+
.GetField("channel", BindingFlags.NonPublic | BindingFlags.Instance)
1116+
.GetValue(channelState);
1117+
if (!channelState.DeepHealthy)
1118+
{
1119+
unHealthyChannelFound = true;
1120+
} else
1121+
{
1122+
healthyChannelFound = true;
1123+
}
1124+
}
1125+
}
1126+
1127+
1128+
Assert.IsTrue(unHealthyChannelFound, "An unhealthy Channel/Connection should have been found due to the repeated timeouts plus aggressive connection timeout policy");
1129+
Assert.IsTrue(healthyChannelFound, "A healthy Channel/Connection should have been found due to the timeouts causing the the old channel to be closed and new one created for future use");
11321130
}
11331131
finally
11341132
{

changelog.md

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
2424
- [5180](https://github.com/Azure/azure-cosmos-dotnet-v3/pull/5180) Query: Adds public PopulateQueryAdvice capability
2525
- [5215](https://github.com/Azure/azure-cosmos-dotnet-v3/pull/5215) Client Encryption: Adds support for latest Cosmos package and bumps up Encryption package for nuget release
2626
- [5157](https://github.com/Azure/azure-cosmos-dotnet-v3/pull/5157) Query: Adds support for LINQ extension method for VectorDistance
27+
> This also includes a Direct Package version update to `3.39.1` in PR [#5241](https://github.com/Azure/azure-cosmos-dotnet-v3/pull/5241) which includes the following:
28+
- Rntbd Health Check Improvements Part 3: Enables Aggressive Timeout Detection By Default.
29+
- Introduces the East US 3 region in the SDK.
2730

2831
#### Fixed
2932

@@ -99,7 +102,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
99102

100103
#### Fixed
101104

102-
- [5108](https://github.com/Azure/azure-cosmos-dotnet-v3/pull/5108) Metadata requests: Fixes bug where certain metadata requests are not retried with a client cold start with only query requests
105+
- [5108](https://github.com/Azure/azure-cosmos-dotnet-v3/pull/5108) Metadata requests: Fixes bug where certain metadata requests are not retried with a client cold start with only query requests.
106+
107+
#### Added
108+
109+
- [5108](https://github.com/Azure/azure-cosmos-dotnet-v3/pull/5108) ClientRetryPolicy: Adds Cross Regional Retry on `Gone` (410) with `LeaseNotFound` (1022) sub status code.
110+
- [5108](https://github.com/Azure/azure-cosmos-dotnet-v3/pull/5108) ClientRetryPolicy: Adds Cross Regional Retry for read requests on `InternalServerError` (500) status code.
111+
- [5108](https://github.com/Azure/azure-cosmos-dotnet-v3/pull/5108) ClientRetryPolicy: Adds Retry on the Preferred Regions on endpoint failures.
103112

104113
### <a name="3.49.0-preview.0"/> [3.49.0-preview.0](https://www.nuget.org/packages/Microsoft.Azure.Cosmos/3.49.0-preview.0) - 2025-3-21
105114

0 commit comments

Comments
 (0)