diff --git a/Microsoft.Azure.Cosmos/src/CosmosClientOptions.cs b/Microsoft.Azure.Cosmos/src/CosmosClientOptions.cs
index 7508bb1620..b8916711bb 100644
--- a/Microsoft.Azure.Cosmos/src/CosmosClientOptions.cs
+++ b/Microsoft.Azure.Cosmos/src/CosmosClientOptions.cs
@@ -55,6 +55,12 @@ public class CosmosClientOptions
private const string ConnectionStringDisableServerCertificateValidation = "DisableServerCertificateValidation";
private const ApiType DefaultApiType = ApiType.None;
+
+ ///
+ /// Default thresholds for PPAF request hedging.
+ ///
+ private const int DefaultHedgingThresholdInMilliseconds = 1000;
+ private const int DefaultHedgingThresholdStepInMilliseconds = 500;
///
/// Default request timeout
@@ -75,7 +81,7 @@ public class CosmosClientOptions
private Func httpClientFactory;
private string applicationName;
private IFaultInjector faultInjector;
- private bool isCustomSerializerProvided;
+ private bool isCustomSerializerProvided;
///
/// Creates a new CosmosClientOptions
@@ -90,7 +96,7 @@ public CosmosClientOptions()
this.ApiType = CosmosClientOptions.DefaultApiType;
this.CustomHandlers = new Collection();
this.CosmosClientTelemetryOptions = new CosmosClientTelemetryOptions();
- this.SessionRetryOptions = new SessionRetryOptions();
+ this.SessionRetryOptions = new SessionRetryOptions();
}
///
@@ -760,10 +766,14 @@ bool EnableRemoteRegionPreferredForSessionRetry
get => this.SessionRetryOptions.RemoteRegionPreferred;
set => this.SessionRetryOptions.RemoteRegionPreferred = value;
}
-
+
///
- /// Enable partition key level failover
- ///
+ /// Gets or sets a value indicating whether partition-level failover is enabled. When this feature is enabled,
+ /// the SDK by default applies a cross-region hedging strategy with a default threshold of 1 seconds.
+ /// If an availability strategy is provided explicitly, then it will be honored, and the default policy wouldn't be applied. Note that
+ /// the default availability strategy can be opted out by setting as the availability strategy in
+ /// cosmos client options.
+ ///
internal bool EnablePartitionLevelFailover { get; set; } = ConfigurationManager.IsPartitionLevelFailoverEnabled(defaultValue: false);
///
@@ -1013,8 +1023,8 @@ internal CosmosClientOptions Clone()
internal virtual ConnectionPolicy GetConnectionPolicy(int clientId)
{
this.ValidateDirectTCPSettings();
- this.ValidateLimitToEndpointSettings();
- this.ValidatePartitionLevelFailoverSettings();
+ this.ValidateLimitToEndpointSettings();
+ this.InitializePartitionLevelFailoverWithDefaultHedging();
ConnectionPolicy connectionPolicy = new ConnectionPolicy()
{
@@ -1191,16 +1201,6 @@ private void ValidateLimitToEndpointSettings()
}
}
- private void ValidatePartitionLevelFailoverSettings()
- {
- if (this.EnablePartitionLevelFailover
- && string.IsNullOrEmpty(this.ApplicationRegion)
- && (this.ApplicationPreferredRegions is null || this.ApplicationPreferredRegions.Count == 0))
- {
- throw new ArgumentException($"{nameof(this.ApplicationPreferredRegions)} or {nameof(this.ApplicationRegion)} is required when {nameof(this.EnablePartitionLevelFailover)} is enabled.");
- }
- }
-
private void ValidateDirectTCPSettings()
{
string settingName = string.Empty;
@@ -1262,6 +1262,21 @@ internal UserAgentContainer CreateUserAgentContainerWithFeatures(int clientId)
suffix: this.GetUserAgentSuffix());
}
+ internal void InitializePartitionLevelFailoverWithDefaultHedging()
+ {
+ if (this.EnablePartitionLevelFailover
+ && this.AvailabilityStrategy == null)
+ {
+ // The default threshold is the minimum value of 1 second and a fraction (currently it's half) of
+ // the request timeout value provided by the end customer.
+ double defaultThresholdInMillis = Math.Min(CosmosClientOptions.DefaultHedgingThresholdInMilliseconds, this.RequestTimeout.TotalMilliseconds / 2);
+
+ this.AvailabilityStrategy = AvailabilityStrategy.CrossRegionHedgingStrategy(
+ threshold: TimeSpan.FromMilliseconds(defaultThresholdInMillis),
+ thresholdStep: TimeSpan.FromMilliseconds(CosmosClientOptions.DefaultHedgingThresholdStepInMilliseconds));
+ }
+ }
+
internal string GetUserAgentSuffix()
{
int featureFlag = 0;
diff --git a/Microsoft.Azure.Cosmos/src/Routing/GlobalPartitionEndpointManagerCore.cs b/Microsoft.Azure.Cosmos/src/Routing/GlobalPartitionEndpointManagerCore.cs
index 1e8881dbd5..3818fe70ad 100644
--- a/Microsoft.Azure.Cosmos/src/Routing/GlobalPartitionEndpointManagerCore.cs
+++ b/Microsoft.Azure.Cosmos/src/Routing/GlobalPartitionEndpointManagerCore.cs
@@ -44,9 +44,9 @@ internal sealed class GlobalPartitionEndpointManagerCore : GlobalPartitionEndpoi
private readonly int partitionUnavailabilityDurationInSeconds = ConfigurationManager.GetAllowedPartitionUnavailabilityDurationInSeconds(5);
///
- /// A readonly integer containing the partition failback refresh interval in seconds. The default value is 60 seconds.
+ /// A readonly integer containing the partition failback refresh interval in seconds. The default value is 5 minutes.
///
- private readonly int backgroundConnectionInitTimeIntervalInSeconds = ConfigurationManager.GetStalePartitionUnavailabilityRefreshIntervalInSeconds(60);
+ private readonly int backgroundConnectionInitTimeIntervalInSeconds = ConfigurationManager.GetStalePartitionUnavailabilityRefreshIntervalInSeconds(300);
///
/// A readonly boolean flag used to determine if partition level failover is enabled.
diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.EmulatorTests/CosmosItemIntegrationTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.EmulatorTests/CosmosItemIntegrationTests.cs
index 8a79051ca8..45551f0bef 100644
--- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.EmulatorTests/CosmosItemIntegrationTests.cs
+++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.EmulatorTests/CosmosItemIntegrationTests.cs
@@ -674,6 +674,123 @@ public async Task ReadItemAsync_WithCircuitBreakerEnabledAndSingleMasterAccountA
}
}
+ [TestMethod]
+ [TestCategory("MultiRegion")]
+ [Owner("dkunda")]
+ [Timeout(70000)]
+ public async Task ReadItemAsync_WithNoPreferredRegionsAndCircuitBreakerEnabledAndSingleMasterAccountAndServiceUnavailableReceived_ShouldApplyPartitionLevelOverride()
+ {
+ // Arrange.
+ Environment.SetEnvironmentVariable(ConfigurationManager.PartitionLevelCircuitBreakerEnabled, "True");
+ Environment.SetEnvironmentVariable(ConfigurationManager.CircuitBreakerConsecutiveFailureCountForReads, "10");
+
+ // Enabling fault injection rule to simulate a 503 service unavailable scenario.
+ string serviceUnavailableRuleId = "503-rule-" + Guid.NewGuid().ToString();
+ FaultInjectionRule serviceUnavailableRule = new FaultInjectionRuleBuilder(
+ id: serviceUnavailableRuleId,
+ condition:
+ new FaultInjectionConditionBuilder()
+ .WithOperationType(FaultInjectionOperationType.ReadItem)
+ .WithRegion(region1)
+ .Build(),
+ result:
+ FaultInjectionResultBuilder.GetResultBuilder(FaultInjectionServerErrorType.ServiceUnavailable)
+ .WithDelay(TimeSpan.FromMilliseconds(10))
+ .Build())
+ .Build();
+
+ List rules = new List { serviceUnavailableRule };
+ FaultInjector faultInjector = new FaultInjector(rules);
+
+ CosmosClientOptions cosmosClientOptions = new CosmosClientOptions()
+ {
+ ConnectionMode = ConnectionMode.Direct,
+ ConsistencyLevel = ConsistencyLevel.Session,
+ FaultInjector = faultInjector,
+ RequestTimeout = TimeSpan.FromSeconds(5),
+ };
+
+ List itemsList = new()
+ {
+ new() { Id = "smTestId1", Pk = "smpk1" },
+ };
+
+ try
+ {
+ using CosmosClient cosmosClient = new(connectionString: this.connectionString, clientOptions: cosmosClientOptions);
+ Database database = cosmosClient.GetDatabase(MultiRegionSetupHelpers.dbName);
+ Container container = database.GetContainer(MultiRegionSetupHelpers.containerName);
+
+ // Act and Assert.
+ await this.TryCreateItems(itemsList);
+
+ //Must Ensure the data is replicated to all regions
+ await Task.Delay(3000);
+
+ int consecutiveFailureCount = 10;
+ int totalIterations = 15;
+
+ for (int attemptCount = 1; attemptCount <= totalIterations; attemptCount++)
+ {
+ try
+ {
+ ItemResponse readResponse = await container.ReadItemAsync(
+ id: itemsList[0].Id,
+ partitionKey: new PartitionKey(itemsList[0].Pk));
+
+ IReadOnlyList<(string regionName, Uri uri)> contactedRegionMapping = readResponse.Diagnostics.GetContactedRegions();
+ HashSet contactedRegions = new(contactedRegionMapping.Select(r => r.regionName));
+
+ Assert.AreEqual(
+ expected: HttpStatusCode.OK,
+ actual: readResponse.StatusCode);
+
+ Assert.IsNotNull(contactedRegions);
+
+ PartitionKeyRangeFailoverInfo failoverInfo = TestCommon.GetFailoverInfoForFirstPartitionUsingReflection(
+ globalPartitionEndpointManager: cosmosClient.ClientContext.DocumentClient.PartitionKeyRangeLocation,
+ isReadOnlyOrMultiMaster: true);
+
+ if (attemptCount > consecutiveFailureCount + 1)
+ {
+ Assert.IsTrue(contactedRegions.Count == 1, "Asserting that when the consecutive failure count reaches the threshold, the partition was failed over to the next region, and the subsequent read request/s were successful on the next region.");
+ Assert.IsTrue(contactedRegions.Contains(region2));
+ Assert.AreEqual(this.readRegionsMapping[region2], failoverInfo.Current);
+ }
+ else
+ {
+ Assert.IsTrue(contactedRegions.Count == 2, "Asserting that when the read request succeeds before the consecutive failure count reaches the threshold, the partition didn't over to the next region, and the request was retried on the next region.");
+ Assert.IsTrue(contactedRegions.Contains(region1) && contactedRegions.Contains(region2));
+
+ if (attemptCount > consecutiveFailureCount)
+ {
+ Assert.AreEqual(this.readRegionsMapping[region2], failoverInfo.Current);
+ }
+ else
+ {
+ Assert.AreEqual(this.readRegionsMapping[region1], failoverInfo.Current);
+ }
+ }
+ }
+ catch (CosmosException)
+ {
+ Assert.Fail("Read Item operation should succeed.");
+ }
+ catch (Exception ex)
+ {
+ Assert.Fail($"Unhandled Exception was thrown during ReadItemAsync call. Message: {ex.Message}");
+ }
+ }
+ }
+ finally
+ {
+ Environment.SetEnvironmentVariable(ConfigurationManager.PartitionLevelCircuitBreakerEnabled, null);
+ Environment.SetEnvironmentVariable(ConfigurationManager.CircuitBreakerConsecutiveFailureCountForReads, null);
+
+ await this.TryDeleteItems(itemsList);
+ }
+ }
+
[TestMethod]
[Owner("dkunda")]
[TestCategory("MultiRegion")]
@@ -1120,6 +1237,105 @@ public async Task CreateAndReadItemAsync_WithCircuitBreakerEnabledAndMultiMaster
}
}
+ [TestMethod]
+ [Owner("dkunda")]
+ [TestCategory("MultiRegion")]
+ [Timeout(70000)]
+ [DataRow(true, DisplayName = "Test scenario when PPAF is enabled at client level.")]
+ [DataRow(false, DisplayName = "Test scenario when PPAF is disabled at client level.")]
+ public async Task ReadItemAsync_WithPPAFEnabledAndSingleMasterAccountWithResponseDelay_ShouldHedgeRequestToMultipleRegions(
+ bool enablePartitionLevelFailover)
+ {
+ // Arrange.
+ if (enablePartitionLevelFailover)
+ {
+ Environment.SetEnvironmentVariable(ConfigurationManager.PartitionLevelFailoverEnabled, "True");
+ }
+
+ // Enabling fault injection rule to simulate a 503 service unavailable scenario.
+ string serviceUnavailableRuleId = "503-rule-" + Guid.NewGuid().ToString();
+ FaultInjectionRule serviceUnavailableRule = new FaultInjectionRuleBuilder(
+ id: serviceUnavailableRuleId,
+ condition:
+ new FaultInjectionConditionBuilder()
+ .WithOperationType(FaultInjectionOperationType.ReadItem)
+ .WithRegion(region1)
+ .Build(),
+ result:
+ FaultInjectionResultBuilder.GetResultBuilder(FaultInjectionServerErrorType.ResponseDelay)
+ .WithDelay(TimeSpan.FromMilliseconds(3000))
+ .Build())
+ .Build();
+
+ List rules = new List { serviceUnavailableRule };
+ FaultInjector faultInjector = new FaultInjector(rules);
+
+ List preferredRegions = new List { region1, region2, region3 };
+ CosmosClientOptions cosmosClientOptions = new CosmosClientOptions()
+ {
+ ConsistencyLevel = ConsistencyLevel.Session,
+ FaultInjector = faultInjector,
+ RequestTimeout = TimeSpan.FromSeconds(5),
+ ApplicationPreferredRegions = preferredRegions,
+ };
+
+ List itemsList = new()
+ {
+ new() { Id = "smTestId1", Pk = "smpk1" },
+ };
+
+ try
+ {
+ using CosmosClient cosmosClient = new(connectionString: this.connectionString, clientOptions: cosmosClientOptions);
+ Database database = cosmosClient.GetDatabase(MultiRegionSetupHelpers.dbName);
+ Container container = database.GetContainer(MultiRegionSetupHelpers.containerName);
+
+ // Act and Assert.
+ await this.TryCreateItems(itemsList);
+
+ //Must Ensure the data is replicated to all regions
+ await Task.Delay(3000);
+
+ ItemResponse readResponse = await container.ReadItemAsync(
+ id: itemsList[0].Id,
+ partitionKey: new PartitionKey(itemsList[0].Pk));
+
+ IReadOnlyList<(string regionName, Uri uri)> contactedRegionMapping = readResponse.Diagnostics.GetContactedRegions();
+ HashSet contactedRegions = new(contactedRegionMapping.Select(r => r.regionName));
+
+ Assert.AreEqual(
+ expected: HttpStatusCode.OK,
+ actual: readResponse.StatusCode);
+
+ CosmosTraceDiagnostics traceDiagnostic = readResponse.Diagnostics as CosmosTraceDiagnostics;
+ Assert.IsNotNull(traceDiagnostic);
+
+ traceDiagnostic.Value.Data.TryGetValue("Hedge Context", out object hedgeContext);
+
+ if (cosmosClientOptions.EnablePartitionLevelFailover)
+ {
+ Assert.IsNotNull(hedgeContext);
+ List hedgedRegions = ((IEnumerable)hedgeContext).ToList();
+
+ Assert.IsTrue(hedgedRegions.Count > 1, "Since the first region is not available, the request should atleast hedge to the next region.");
+ Assert.IsTrue(hedgedRegions.Contains(region1) && (hedgedRegions.Contains(region2) || hedgedRegions.Contains(region3)));
+ }
+ else
+ {
+ Assert.IsNull(hedgeContext);
+ }
+
+ Assert.IsNotNull(contactedRegions);
+ Assert.IsTrue(contactedRegions.Count == 1, "Asserting that when the read request succeeds on any region, given that there were no availability loss.");
+ }
+ finally
+ {
+ Environment.SetEnvironmentVariable(ConfigurationManager.PartitionLevelFailoverEnabled, null);
+
+ await this.TryDeleteItems(itemsList);
+ }
+ }
+
private async Task TryCreateItems(List testItems)
{
foreach (CosmosIntegrationTestObject item in testItems)
diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/CosmosClientOptionsUnitTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/CosmosClientOptionsUnitTests.cs
index 6c8fee598e..246f6914f8 100644
--- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/CosmosClientOptionsUnitTests.cs
+++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/CosmosClientOptionsUnitTests.cs
@@ -227,13 +227,14 @@ public void VerifyCosmosConfigurationPropertiesGetUpdated()
///
/// Test to validate that when the partition level failover is enabled with the preferred regions list is missing, then the client
- /// initialization should throw an argument exception and fail. This should hold true for both environment variable and CosmosClientOptions.
+ /// initialization should succeed. This should hold true for both environment variable and CosmosClientOptions.
///
[TestMethod]
[Owner("dkunda")]
[DataRow(true, DisplayName = "Validate that when environment variable is used to enable PPAF, the outcome of the test should be same.")]
[DataRow(false, DisplayName = "Validate that when CosmosClientOptions is used to enable PPAF, the outcome of the test should be same.")]
- public void CosmosClientOptions_WhenPartitionLevelFailoverEnabledAndPreferredRegionsNotSet_ShouldThrowArgumentException(bool useEnvironmentVariable)
+ public void CosmosClientOptions_WhenPartitionLevelFailoverEnabledAndPreferredRegionsNotSet_ShouldInitializeCosmosClientSuccessfully(
+ bool useEnvironmentVariable)
{
try
{
@@ -282,11 +283,10 @@ public void CosmosClientOptions_WhenPartitionLevelFailoverEnabledAndPreferredReg
.WithPartitionLevelFailoverEnabled();
}
- ArgumentException exception = Assert.ThrowsException(() => cosmosClientBuilder.Build());
+ CosmosClient cosmosClient = cosmosClientBuilder.Build();
- Assert.AreEqual(
- expected: "ApplicationPreferredRegions or ApplicationRegion is required when EnablePartitionLevelFailover is enabled.",
- actual: exception.Message);
+ Assert.IsNotNull(cosmosClient,
+ message: "ApplicationPreferredRegions or ApplicationRegion is no longer mandatory fields, hence the client initialization should succeed.");
}
finally
{
@@ -1246,7 +1246,6 @@ public void PPAFClientAppRegionAndAppPreferredRegionTest()
}
[TestMethod]
- [ExpectedException(typeof(ArgumentException))]
public void PPAFClientNoRegionsTest()
{
CosmosClientOptions cosmosClientOptions = new CosmosClientOptions
@@ -1254,7 +1253,9 @@ public void PPAFClientNoRegionsTest()
EnablePartitionLevelFailover = true
};
- _ = new CosmosClient(ConnectionString, cosmosClientOptions);
+ CosmosClient cosmosClient = new(ConnectionString, cosmosClientOptions);
+
+ Assert.IsNotNull(cosmosClient);
}
private class TestWebProxy : IWebProxy
diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/PartitionKeyRangeFailoverTests/GlobalPartitionEndpointManagerTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/PartitionKeyRangeFailoverTests/GlobalPartitionEndpointManagerTests.cs
index f1d483888a..114428297b 100644
--- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/PartitionKeyRangeFailoverTests/GlobalPartitionEndpointManagerTests.cs
+++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/PartitionKeyRangeFailoverTests/GlobalPartitionEndpointManagerTests.cs
@@ -200,11 +200,17 @@ public async Task CreateItemAsync_WithPreferredRegionsAndServiceUnavailableForFi
///
/// Test to validate that when the partition level failover is enabled with the preferred regions list is missing, then the client
- /// initialization should throw an argument exception and fail.
+ /// initialization should succeed without throwing any argument exception.
///
[TestMethod]
- public void CreateItemAsync_WithNoPreferredRegionsAndServiceUnavailable_ShouldThrowArgumentException()
+ [DataRow(true, DisplayName = "Validate that when an explict availability strategy is provided, the same will be honored by bypassing the default one, when PPAF is enabled.")]
+ [DataRow(false, DisplayName = "Validate that when no explict availability strategy is provided, a default availability strategy will be applied, when PPAF is enabled.")]
+ public void CreateItemAsync_WithNoPreferredRegionsAndServiceUnavailable_ShouldNotThrowArgumentException(
+ bool isExplictAvailabilityStrategyProvided)
{
+ TimeSpan explictAvailabilityStrategyThreshold = TimeSpan.FromMilliseconds(2000);
+ TimeSpan explictAvailabilityStrategyThresholdStep = TimeSpan.FromMilliseconds(500);
+
GlobalPartitionEndpointManagerTests.SetupAccountAndCacheOperations(
out string secondaryRegionNameForUri,
out string globalEndpoint,
@@ -247,14 +253,39 @@ public void CreateItemAsync_WithNoPreferredRegionsAndServiceUnavailable_ShouldTh
TransportClientHandlerFactory = (original) => mockTransport.Object,
};
- ArgumentException exception = Assert.ThrowsException(() => new CosmosClient(
- globalEndpoint,
- Convert.ToBase64String(Encoding.UTF8.GetBytes(Guid.NewGuid().ToString())),
- cosmosClientOptions));
+ if (isExplictAvailabilityStrategyProvided)
+ {
+ cosmosClientOptions.AvailabilityStrategy = AvailabilityStrategy.CrossRegionHedgingStrategy(
+ threshold: explictAvailabilityStrategyThreshold,
+ thresholdStep: explictAvailabilityStrategyThresholdStep);
+ }
+
+ CosmosClient cosmosClient = new CosmosClient(
+ globalEndpoint,
+ Convert.ToBase64String(Encoding.UTF8.GetBytes(Guid.NewGuid().ToString())),
+ cosmosClientOptions);
+
+ Assert.IsNotNull(cosmosClient,
+ message: "ApplicationPreferredRegions or ApplicationRegion is no longer mandatory fields, hence the client initialization should succeed.");
+
+ Assert.IsNotNull(cosmosClient.ClientOptions.AvailabilityStrategy);
+
+ CrossRegionHedgingAvailabilityStrategy crossRegionHedgingStrategy = (CrossRegionHedgingAvailabilityStrategy)cosmosClient.ClientOptions.AvailabilityStrategy;
+
+ Assert.IsNotNull(crossRegionHedgingStrategy);
- Assert.AreEqual(
- expected: "ApplicationPreferredRegions or ApplicationRegion is required when EnablePartitionLevelFailover is enabled.",
- actual: exception.Message);
+ if (isExplictAvailabilityStrategyProvided)
+ {
+ // Explict availability strategy values.
+ Assert.AreEqual(explictAvailabilityStrategyThreshold, crossRegionHedgingStrategy.Threshold);
+ Assert.AreEqual(explictAvailabilityStrategyThresholdStep, crossRegionHedgingStrategy.ThresholdStep);
+ }
+ else
+ {
+ // Default availability strategy values.
+ Assert.AreEqual(TimeSpan.FromMilliseconds(1000), crossRegionHedgingStrategy.Threshold);
+ Assert.AreEqual(TimeSpan.FromMilliseconds(500), crossRegionHedgingStrategy.ThresholdStep);
+ }
}
[TestMethod]