Skip to content

Commit 0be9384

Browse files
HttpTimeoutPolicy Improvements Phase 2: Refactors Code to Separate out Retry Policy Timeouts for point-reads and non-point-reads on PPAF (#5482)
# Pull Request Template ## Description This change fixes the aggressive retry timeouts for requests when PPAF is enabled. This change contains distinct retry policy timeouts for the respective operations and allows us to manage the timeouts separately. ## Type of change Please delete options that are not relevant. - [x] Bug fix (non-breaking change which fixes an issue) ## Closing issues To automatically close an issue: closes #5484 --------- Co-authored-by: Debdatta Kunda <dkunda@microsoft.com> Co-authored-by: Debdatta Kunda <87335885+kundadebdatta@users.noreply.github.com>
1 parent 2a84d9a commit 0be9384

4 files changed

Lines changed: 217 additions & 29 deletions

File tree

Microsoft.Azure.Cosmos/src/HttpClient/HttpTimeoutPolicy.cs

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -55,9 +55,16 @@ public static HttpTimeoutPolicy GetTimeoutPolicy(
5555
// Data Plane Reads.
5656
else if (documentServiceRequest.IsReadOnlyRequest)
5757
{
58-
return isPartitionLevelFailoverEnabled
59-
? HttpTimeoutPolicyForPartitionFailover.InstanceShouldThrow503OnTimeout
60-
: HttpTimeoutPolicyDefault.InstanceShouldThrow503OnTimeout;
58+
if (isPartitionLevelFailoverEnabled)
59+
{
60+
return documentServiceRequest.OperationType == OperationType.Read
61+
? HttpTimeoutPolicyForPartitionFailover.InstanceShouldThrow503OnTimeoutForPointReads
62+
: HttpTimeoutPolicyForPartitionFailover.InstanceShouldThrow503OnTimeoutForNonPointReads;
63+
}
64+
else
65+
{
66+
return HttpTimeoutPolicyDefault.InstanceShouldThrow503OnTimeout;
67+
}
6168
}
6269
}
6370

Microsoft.Azure.Cosmos/src/HttpClient/HttpTimeoutPolicyForPartitionFailover.cs

Lines changed: 21 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -9,37 +9,47 @@ namespace Microsoft.Azure.Cosmos
99

1010
internal sealed class HttpTimeoutPolicyForPartitionFailover : HttpTimeoutPolicy
1111
{
12-
public static readonly HttpTimeoutPolicy Instance = new HttpTimeoutPolicyForPartitionFailover(false);
13-
public static readonly HttpTimeoutPolicy InstanceShouldThrow503OnTimeout = new HttpTimeoutPolicyForPartitionFailover(true);
14-
public bool shouldThrow503OnTimeout;
12+
public static readonly HttpTimeoutPolicy InstanceShouldThrow503OnTimeoutForNonPointReads = new HttpTimeoutPolicyForPartitionFailover(isPointRead: false);
13+
public static readonly HttpTimeoutPolicy InstanceShouldThrow503OnTimeoutForPointReads = new HttpTimeoutPolicyForPartitionFailover(isPointRead: true);
14+
private readonly bool isPointRead;
1515
private static readonly string Name = nameof(HttpTimeoutPolicyDefault);
1616

17-
private HttpTimeoutPolicyForPartitionFailover(bool shouldThrow503OnTimeout)
17+
private HttpTimeoutPolicyForPartitionFailover(bool isPointRead)
1818
{
19-
this.shouldThrow503OnTimeout = shouldThrow503OnTimeout;
19+
this.isPointRead = isPointRead;
2020
}
2121

22-
private readonly IReadOnlyList<(TimeSpan requestTimeout, TimeSpan delayForNextRequest)> TimeoutsAndDelays = new List<(TimeSpan requestTimeout, TimeSpan delayForNextRequest)>()
22+
// Timeouts and delays are based on the following rationale:
23+
// For point reads: 3 attempts with timeouts of 1s, 6s, and 6s respectively.
24+
// For non-point reads: 3 attempts with timeouts of 6s, 6s, and 10s respectively.
25+
private readonly IReadOnlyList<(TimeSpan requestTimeout, TimeSpan delayForNextRequest)> TimeoutsAndDelaysForPointReads = new List<(TimeSpan requestTimeout, TimeSpan delayForNextRequest)>()
2326
{
24-
(TimeSpan.FromSeconds(.5), TimeSpan.Zero),
25-
(TimeSpan.FromSeconds(.5), TimeSpan.Zero),
2627
(TimeSpan.FromSeconds(1), TimeSpan.Zero),
28+
(TimeSpan.FromSeconds(6), TimeSpan.Zero),
29+
(TimeSpan.FromSeconds(6), TimeSpan.Zero),
30+
};
31+
32+
private readonly IReadOnlyList<(TimeSpan requestTimeout, TimeSpan delayForNextRequest)> TimeoutsAndDelaysForNonPointReads = new List<(TimeSpan requestTimeout, TimeSpan delayForNextRequest)>()
33+
{
34+
(TimeSpan.FromSeconds(6), TimeSpan.Zero),
35+
(TimeSpan.FromSeconds(6), TimeSpan.Zero),
36+
(TimeSpan.FromSeconds(10), TimeSpan.Zero),
2737
};
2838

2939
public override string TimeoutPolicyName => HttpTimeoutPolicyForPartitionFailover.Name;
3040

31-
public override int TotalRetryCount => this.TimeoutsAndDelays.Count;
41+
public override int TotalRetryCount => this.isPointRead ? this.TimeoutsAndDelaysForPointReads.Count : this.TimeoutsAndDelaysForNonPointReads.Count;
3242

3343
public override IEnumerator<(TimeSpan requestTimeout, TimeSpan delayForNextRequest)> GetTimeoutEnumerator()
3444
{
35-
return this.TimeoutsAndDelays.GetEnumerator();
45+
return this.isPointRead ? this.TimeoutsAndDelaysForPointReads.GetEnumerator() : this.TimeoutsAndDelaysForNonPointReads.GetEnumerator();
3646
}
3747

3848
public override bool ShouldRetryBasedOnResponse(HttpMethod requestHttpMethod, HttpResponseMessage responseMessage)
3949
{
4050
return false;
4151
}
4252

43-
public override bool ShouldThrow503OnTimeout => this.shouldThrow503OnTimeout;
53+
public override bool ShouldThrow503OnTimeout => true;
4454
}
4555
}

Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.EmulatorTests/CosmosItemIntegrationTests.cs

Lines changed: 139 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
{
33
using System;
44
using System.Collections.Generic;
5+
using System.Diagnostics;
56
using System.IO;
67
using System.Linq;
78
using System.Net;
@@ -12,7 +13,7 @@
1213
using System.Threading;
1314
using System.Threading.Tasks;
1415
using Microsoft.Azure.Cosmos.Diagnostics;
15-
using Microsoft.Azure.Cosmos.FaultInjection;
16+
using Microsoft.Azure.Cosmos.FaultInjection;
1617
using Microsoft.VisualStudio.TestTools.UnitTesting;
1718
using Newtonsoft.Json.Linq;
1819
using static Microsoft.Azure.Cosmos.Routing.GlobalPartitionEndpointManagerCore;
@@ -80,7 +81,8 @@ public void TestCleanup()
8081
finally
8182
{
8283
//Do not delete the resources (except MM Write test object), georeplication is slow and we want to reuse the resources
83-
this.client?.Dispose();
84+
this.client?.Dispose();
85+
Environment.SetEnvironmentVariable(ConfigurationManager.StalePartitionUnavailabilityRefreshIntervalInSeconds, null);
8486
}
8587
}
8688

@@ -467,7 +469,7 @@ await this.container.DeleteItemAsync<CosmosIntegrationTestObject>(
467469
}
468470

469471
[TestMethod]
470-
[TestCategory("MultiRegion")]
472+
[TestCategory("MultiRegion")]
471473
[DataRow(ConnectionMode.Direct, "15", "10", DisplayName = "Direct Mode - Scenario when the total iteration count is 15 and circuit breaker consecutive failure threshold is set to 10.")]
472474
[DataRow(ConnectionMode.Direct, "25", "20", DisplayName = "Direct Mode - Scenario when the total iteration count is 25 and circuit breaker consecutive failure threshold is set to 20.")]
473475
[DataRow(ConnectionMode.Direct, "35", "30", DisplayName = "Direct Mode - Scenario when the total iteration count is 35 and circuit breaker consecutive failure threshold is set to 30.")]
@@ -602,7 +604,7 @@ public async Task ReadItemAsync_WithCircuitBreakerEnabledAndSingleMasterAccountA
602604
}
603605

604606
[TestMethod]
605-
[TestCategory("MultiRegion")]
607+
[TestCategory("MultiRegion")]
606608
[DataRow(ConnectionMode.Direct, DisplayName ="Direct Mode")]
607609
[DataRow(ConnectionMode.Gateway, DisplayName = "Gateway Mode")]
608610
[Owner("nalutripician")]
@@ -721,15 +723,14 @@ public async Task ReadItemAsync_WithCircuitBreakerEnabledAndTimeoutCounterOverwr
721723
}
722724
finally
723725
{
724-
Environment.SetEnvironmentVariable(ConfigurationManager.PartitionLevelCircuitBreakerEnabled, null);
725-
Environment.SetEnvironmentVariable(ConfigurationManager.CircuitBreakerConsecutiveFailureCountForReads, null);
726-
726+
Environment.SetEnvironmentVariable(ConfigurationManager.PartitionLevelCircuitBreakerEnabled, null);
727+
Environment.SetEnvironmentVariable(ConfigurationManager.CircuitBreakerTimeoutCounterResetWindowInMinutes, null);
727728
await this.TryDeleteItems(itemsList);
728729
}
729730
}
730731

731732
[TestMethod]
732-
[TestCategory("MultiRegion")]
733+
[TestCategory("MultiRegion")]
733734
[Owner("dkunda")]
734735
[Timeout(70000)]
735736
public async Task ReadItemAsync_WithCircuitBreakerEnabledAndSingleMasterAccountAndServiceUnavailableReceivedFromTwoRegions_ShouldApplyPartitionLevelOverrideToThridRegion()
@@ -893,7 +894,7 @@ public async Task ReadItemAsync_WithCircuitBreakerEnabledAndSingleMasterAccountA
893894
}
894895

895896
[TestMethod]
896-
[TestCategory("MultiRegion")]
897+
[TestCategory("MultiRegion")]
897898
[Owner("dkunda")]
898899
[Timeout(70000)]
899900
public async Task ReadItemAsync_WithNoPreferredRegionsAndCircuitBreakerEnabledAndSingleMasterAccountAndServiceUnavailableReceived_ShouldApplyPartitionLevelOverride()
@@ -1011,7 +1012,7 @@ public async Task ReadItemAsync_WithNoPreferredRegionsAndCircuitBreakerEnabledAn
10111012

10121013
[TestMethod]
10131014
[Owner("dkunda")]
1014-
[TestCategory("MultiRegion")]
1015+
[TestCategory("MultiRegion")]
10151016
[Timeout(70000)]
10161017
public async Task ReadItemAsync_WithCircuitBreakerDisabledAndSingleMasterAccountAndServiceUnavailableReceived_ShouldNotApplyPartitionLevelOverride()
10171018
{
@@ -1095,14 +1096,12 @@ public async Task ReadItemAsync_WithCircuitBreakerDisabledAndSingleMasterAccount
10951096
finally
10961097
{
10971098
Environment.SetEnvironmentVariable(ConfigurationManager.PartitionLevelCircuitBreakerEnabled, null);
1098-
Environment.SetEnvironmentVariable(ConfigurationManager.CircuitBreakerConsecutiveFailureCountForReads, null);
1099-
11001099
await this.TryDeleteItems(itemsList);
11011100
}
11021101
}
11031102

11041103
[TestMethod]
1105-
[Owner("dkunda")]
1104+
[Owner("dkunda")]
11061105
[TestCategory("MultiRegion")]
11071106
[Timeout(70000)]
11081107
public async Task CreateItemAsync_WithCircuitBreakerEnabledAndSingleMasterAccountAndServiceUnavailableReceived_ShouldNotApplyPartitionLevelOverride()
@@ -1182,7 +1181,7 @@ public async Task CreateItemAsync_WithCircuitBreakerEnabledAndSingleMasterAccoun
11821181

11831182
[TestMethod]
11841183
[Owner("dkunda")]
1185-
[TestCategory("MultiMaster")]
1184+
[TestCategory("MultiMaster")]
11861185
[DataRow(ConnectionMode.Direct, "15", "10", DisplayName = "Direct Mode - Scenario whtn the total iteration count is 15 and circuit breaker consecutive failure threshold is set to 10.")]
11871186
[DataRow(ConnectionMode.Direct, "25", "20", DisplayName = "Direct Mode - Scenario whtn the total iteration count is 25 and circuit breaker consecutive failure threshold is set to 20.")]
11881187
[DataRow(ConnectionMode.Direct, "35", "30", DisplayName = "Direct Mode - Scenario whtn the total iteration count is 35 and circuit breaker consecutive failure threshold is set to 30.")]
@@ -2259,7 +2258,132 @@ public async Task ClinetOverrides0msRequestTimeoutValueForPPAF()
22592258
Assert.IsNotNull(strat);
22602259
Assert.AreNotEqual(0, strat.Threshold);
22612260
}
2262-
2261+
2262+
[TestMethod]
2263+
[TestCategory("MultiRegion")]
2264+
[Owner("pkolluri")]
2265+
[Timeout(70000)]
2266+
public async Task QueryItemAsync_WithCircuitBreakerEnabledMultiRegionAndServiceResponseDelay_ShouldFailOverToNextRegionAsync()
2267+
{
2268+
// Arrange.
2269+
Environment.SetEnvironmentVariable(ConfigurationManager.PartitionLevelCircuitBreakerEnabled, "True");
2270+
Environment.SetEnvironmentVariable(ConfigurationManager.CircuitBreakerConsecutiveFailureCountForReads, "1");
2271+
2272+
// Enabling fault injection rule to simulate a 503 service unavailable scenario.
2273+
string serviceResponseDelayRuleId = "response-delay-rule-" + Guid.NewGuid().ToString();
2274+
FaultInjectionRule serviceResponseDelayRuleFromRegion1 = new FaultInjectionRuleBuilder(
2275+
id: serviceResponseDelayRuleId,
2276+
condition:
2277+
new FaultInjectionConditionBuilder()
2278+
.WithOperationType(FaultInjectionOperationType.QueryItem)
2279+
.WithConnectionType(FaultInjectionConnectionType.Gateway)
2280+
.WithRegion(region1)
2281+
.Build(),
2282+
result:
2283+
FaultInjectionResultBuilder.GetResultBuilder(FaultInjectionServerErrorType.ResponseDelay)
2284+
.WithDelay(TimeSpan.FromSeconds(70))
2285+
.Build())
2286+
.Build();
2287+
2288+
serviceResponseDelayRuleFromRegion1.Disable();
2289+
2290+
List<FaultInjectionRule> rules = new List<FaultInjectionRule> { serviceResponseDelayRuleFromRegion1};
2291+
FaultInjector faultInjector = new FaultInjector(rules);
2292+
2293+
List<string> preferredRegions = new List<string> { region1, region2, region3 };
2294+
CosmosClientOptions cosmosClientOptions = new CosmosClientOptions()
2295+
{
2296+
ConsistencyLevel = ConsistencyLevel.Session,
2297+
FaultInjector = faultInjector,
2298+
ApplicationPreferredRegions = preferredRegions,
2299+
ConnectionMode = ConnectionMode.Gateway,
2300+
};
2301+
2302+
List<CosmosIntegrationTestObject> itemsList = new()
2303+
{
2304+
new() { Id = "smTestId2", Pk = "smpk1" },
2305+
};
2306+
2307+
try
2308+
{
2309+
using CosmosClient cosmosClient = new(connectionString: this.connectionString, clientOptions: cosmosClientOptions);
2310+
Database database = cosmosClient.GetDatabase(MultiRegionSetupHelpers.dbName);
2311+
Container container = database.GetContainer(MultiRegionSetupHelpers.containerName);
2312+
2313+
// Act and Assert.
2314+
await this.TryCreateItems(itemsList);
2315+
2316+
//Must Ensure the data is replicated to all regions
2317+
await Task.Delay(3000);
2318+
2319+
bool isRegion1Available = true;
2320+
bool isRegion2Available = true;
2321+
2322+
int thresholdCounter = 0;
2323+
int totalIterations = 7;
2324+
int ppcbDefaultThreshold = 1;
2325+
int firstRegionServiceUnavailableAttempt = 1;
2326+
2327+
for (int attemptCount = 1; attemptCount <= totalIterations; attemptCount++)
2328+
{
2329+
try
2330+
{
2331+
string sqlQueryText = $"SELECT * FROM c WHERE c.id = '{itemsList[0].Id}'";
2332+
using FeedIterator<CosmosIntegrationTestObject> feedIterator = container.GetItemQueryIterator<CosmosIntegrationTestObject>(sqlQueryText, requestOptions: new QueryRequestOptions());
2333+
2334+
while (feedIterator.HasMoreResults)
2335+
{
2336+
FeedResponse<CosmosIntegrationTestObject> response = await feedIterator.ReadNextAsync();
2337+
Assert.AreEqual(System.Net.HttpStatusCode.OK, response.StatusCode);
2338+
IReadOnlyList<(string regionName, Uri uri)> contactedRegionMapping = response.Diagnostics.GetContactedRegions();
2339+
HashSet<string> contactedRegions = new(contactedRegionMapping.Select(r => r.regionName));
2340+
2341+
if (isRegion1Available && isRegion2Available)
2342+
{
2343+
Assert.IsTrue(contactedRegions.Count == 1, "Assert that, when no failure happened, the query request is being served from region 1.");
2344+
Assert.IsTrue(contactedRegions.Contains(region1));
2345+
2346+
// Simulating service unavailable on region 1.
2347+
if (attemptCount == firstRegionServiceUnavailableAttempt)
2348+
{
2349+
isRegion1Available = false;
2350+
serviceResponseDelayRuleFromRegion1.Enable();
2351+
}
2352+
}
2353+
else if (isRegion2Available)
2354+
{
2355+
if (thresholdCounter <= ppcbDefaultThreshold)
2356+
{
2357+
Assert.IsTrue(contactedRegions.Count == 2, "Asserting that when the query request succeeds before the consecutive failure count reaches the threshold, the partition didn't fail over to the next region, and the request was retried.");
2358+
Assert.IsTrue(contactedRegions.Contains(region1) && contactedRegions.Contains(region2), "Asserting that both region 1 and region 2 were contacted.");
2359+
thresholdCounter++;
2360+
}
2361+
else
2362+
{
2363+
Assert.IsTrue(contactedRegions.Count == 1, "Asserting that when the consecutive failure count reaches the threshold, the partition was failed over to the next region, and the subsequent query request/s were successful on the next region");
2364+
}
2365+
}
2366+
}
2367+
}
2368+
catch (CosmosException ce)
2369+
{
2370+
Assert.Fail("Query operation should succeed with successful failover to next region." + ce.Diagnostics.ToString());
2371+
}
2372+
catch (Exception ex)
2373+
{
2374+
Assert.Fail($"Unhandled Exception was thrown during Query operation call. Message: {ex.Message}");
2375+
}
2376+
}
2377+
}
2378+
finally
2379+
{
2380+
Environment.SetEnvironmentVariable(ConfigurationManager.PartitionLevelCircuitBreakerEnabled, null);
2381+
Environment.SetEnvironmentVariable(ConfigurationManager.CircuitBreakerConsecutiveFailureCountForReads, null);
2382+
2383+
await this.TryDeleteItems(itemsList);
2384+
}
2385+
}
2386+
22632387
private async Task TryCreateItems(List<CosmosIntegrationTestObject> testItems)
22642388
{
22652389
foreach (CosmosIntegrationTestObject item in testItems)

Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/CosmosHttpClientCoreTests.cs

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -592,6 +592,53 @@ await TestScenarioAsync(
592592
expectedNumberOfRetrys: 3);
593593
}
594594

595+
[TestMethod]
596+
public void HttpTimeoutPolicyForParitionFailoverForQueries()
597+
{
598+
HttpTimeoutPolicy httpTimeoutPolicyForQuery = HttpTimeoutPolicy.GetTimeoutPolicy(
599+
documentServiceRequest: CosmosHttpClientCoreTests.CreateDocumentServiceRequestByOperation(ResourceType.Document, OperationType.Query),
600+
isPartitionLevelFailoverEnabled: true,
601+
isThinClientEnabled: false);
602+
IEnumerator<(TimeSpan requestTimeout, TimeSpan delayForNextRequest)> availableRetries = httpTimeoutPolicyForQuery.GetTimeoutEnumerator();
603+
604+
int count = 0;
605+
while (availableRetries.MoveNext())
606+
{
607+
if (count <= 1)
608+
{
609+
Assert.AreEqual(new TimeSpan(0,0,6), availableRetries.Current.requestTimeout);
610+
}
611+
else if (count == 2)
612+
{
613+
Assert.AreEqual(new TimeSpan(0, 0, 10), availableRetries.Current.requestTimeout);
614+
}
615+
count++;
616+
}
617+
}
618+
619+
[TestMethod]
620+
public void HttpTimeoutPolicyForParitionFailoverForReads()
621+
{
622+
HttpTimeoutPolicy httpTimeoutPolicyForQuery = HttpTimeoutPolicy.GetTimeoutPolicy(
623+
documentServiceRequest: CosmosHttpClientCoreTests.CreateDocumentServiceRequestByOperation(ResourceType.Document, OperationType.Read),
624+
isPartitionLevelFailoverEnabled: true,
625+
isThinClientEnabled: false);
626+
IEnumerator<(TimeSpan requestTimeout, TimeSpan delayForNextRequest)> availableRetries = httpTimeoutPolicyForQuery.GetTimeoutEnumerator();
627+
628+
int count = 0;
629+
while (availableRetries.MoveNext())
630+
{
631+
if (count == 0)
632+
{
633+
Assert.AreEqual(new TimeSpan(0, 0, 1), availableRetries.Current.requestTimeout);
634+
}
635+
else if (count == 1 || count ==2 )
636+
{
637+
Assert.AreEqual(new TimeSpan(0, 0, 6), availableRetries.Current.requestTimeout);
638+
}
639+
count++;
640+
}
641+
}
595642

596643
private static DocumentServiceRequest CreateDocumentServiceRequestByOperation(
597644
ResourceType resourceType,

0 commit comments

Comments
 (0)