-
Notifications
You must be signed in to change notification settings - Fork 533
[Per Partition Automatic Failover] Enable PPAF Dynamically Using Targeted Event-Based Updates with Thread-Safe Operations #5326
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
a7d02e0
45ae5f2
6941697
395c4bd
47dcb90
526d0d8
5dc92c8
9f329f8
e610a29
49b3135
ec9fd26
2cd6627
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1431,6 +1431,17 @@ public void Dispose() | |
| this.initTaskCache = null; | ||
| } | ||
|
|
||
| if (this.accountServiceConfiguration != null) | ||
| { | ||
| this.accountServiceConfiguration.Dispose(); | ||
| this.accountServiceConfiguration = null; | ||
| } | ||
|
|
||
| if (this.PartitionKeyRangeLocation is IDisposable disposablePartitionManager) | ||
| { | ||
| disposablePartitionManager.Dispose(); | ||
| } | ||
|
|
||
| DefaultTrace.TraceInformation("DocumentClient with id {0} disposed.", this.traceId); | ||
| DefaultTrace.Flush(); | ||
|
|
||
|
|
@@ -6843,6 +6854,9 @@ private async Task InitializeGatewayConfigurationReaderAsync() | |
|
|
||
| this.accountServiceConfiguration = new CosmosAccountServiceConfiguration(accountReader.InitializeReaderAsync); | ||
|
|
||
| // Subscribe to account properties changes for dynamic PPAF updates | ||
| this.accountServiceConfiguration.OnEnablePartitionLevelFailoverChanged += this.HandleEnablePartitionLevelFailoverChanged; | ||
|
|
||
| await this.accountServiceConfiguration.InitializeAsync(); | ||
| AccountProperties accountProperties = this.accountServiceConfiguration.AccountProperties; | ||
| this.UseMultipleWriteLocations = this.ConnectionPolicy.UseMultipleWriteLocations && accountProperties.EnableMultipleWriteLocations; | ||
|
|
@@ -6875,21 +6889,129 @@ internal string GetUserAgentFeatures() | |
| return featureFlag == 0 ? string.Empty : $"F{featureFlag:X}"; | ||
| } | ||
|
|
||
| internal void InitializePartitionLevelFailoverWithDefaultHedging() | ||
| { | ||
| if (this.ConnectionPolicy.EnablePartitionLevelFailover | ||
| && this.ConnectionPolicy.AvailabilityStrategy == null) | ||
| { | ||
| // The default threshold is the minimum value of 1 second and a fraction (currently it's half) of | ||
| // the request timeout value provided by the end customer. | ||
| double defaultThresholdInMillis = Math.Min( | ||
| DocumentClient.DefaultHedgingThresholdInMilliseconds, | ||
| this.ConnectionPolicy.RequestTimeout.TotalMilliseconds / 2); | ||
|
|
||
| this.ConnectionPolicy.AvailabilityStrategy = AvailabilityStrategy.CrossRegionHedgingStrategy( | ||
| threshold: TimeSpan.FromMilliseconds(defaultThresholdInMillis), | ||
| thresholdStep: TimeSpan.FromMilliseconds(DocumentClient.DefaultHedgingThresholdStepInMilliseconds)); | ||
| } | ||
| internal void InitializePartitionLevelFailoverWithDefaultHedging() | ||
| { | ||
| if (this.ConnectionPolicy.EnablePartitionLevelFailover | ||
| && this.ConnectionPolicy.AvailabilityStrategy == null) | ||
| { | ||
| // The default threshold is the minimum value of 1 second and a fraction (currently it's half) of | ||
| // the request timeout value provided by the end customer. | ||
| double defaultThresholdInMillis = Math.Min( | ||
| DocumentClient.DefaultHedgingThresholdInMilliseconds, | ||
| this.ConnectionPolicy.RequestTimeout.TotalMilliseconds / 2); | ||
|
|
||
| this.ConnectionPolicy.AvailabilityStrategy = AvailabilityStrategy.CrossRegionHedgingStrategy( | ||
| threshold: TimeSpan.FromMilliseconds(defaultThresholdInMillis), | ||
| thresholdStep: TimeSpan.FromMilliseconds(DocumentClient.DefaultHedgingThresholdStepInMilliseconds)); | ||
| } | ||
| } | ||
|
|
||
| /// <summary> | ||
| /// Handles dynamic changes to the EnablePartitionLevelFailover flag from account properties refresh | ||
| /// </summary> | ||
| /// <param name="newEnablePartitionLevelFailover">The new value of the EnablePartitionLevelFailover flag</param> | ||
| private void HandleEnablePartitionLevelFailoverChanged(bool? newEnablePartitionLevelFailover) | ||
| { | ||
| try | ||
| { | ||
| // Only update if client-level override is not disabled | ||
| if (this.ConnectionPolicy.DisablePartitionLevelFailoverClientLevelOverride) | ||
| { | ||
| DefaultTrace.TraceInformation("DocumentClient: PPAF change ignored due to client-level override disabled"); | ||
| return; | ||
| } | ||
|
|
||
| bool previousValue = this.ConnectionPolicy.EnablePartitionLevelFailover; | ||
| bool newValue = newEnablePartitionLevelFailover ?? false; | ||
|
|
||
| if (previousValue == newValue) | ||
| { | ||
| // No actual change in effective value | ||
| return; | ||
| } | ||
|
|
||
| DefaultTrace.TraceInformation( | ||
| "DocumentClient: Updating EnablePartitionLevelFailover from {0} to {1}", | ||
| previousValue, | ||
| newValue); | ||
|
|
||
| // Update the connection policy | ||
| this.ConnectionPolicy.EnablePartitionLevelFailover = newValue; | ||
|
|
||
| // Update circuit breaker enablement | ||
| this.ConnectionPolicy.EnablePartitionLevelCircuitBreaker |= newValue; | ||
|
|
||
| // Update availability strategy for read hedging | ||
| this.UpdateAvailabilityStrategyForPPAF(newValue); | ||
|
|
||
| // Update the GlobalPartitionEndpointManager | ||
| this.UpdateGlobalPartitionEndpointManager(); | ||
|
|
||
| DefaultTrace.TraceInformation("DocumentClient: Successfully updated PPAF configuration dynamically"); | ||
| } | ||
| catch (Exception ex) | ||
| { | ||
| DefaultTrace.TraceError("DocumentClient: Error handling PPAF change: {0}", ex.Message); | ||
| } | ||
| } | ||
|
|
||
| /// <summary> | ||
| /// Updates the availability strategy based on PPAF enablement | ||
| /// </summary> | ||
| /// <param name="enablePPAF">Whether PPAF is enabled</param> | ||
| private void UpdateAvailabilityStrategyForPPAF(bool enablePPAF) | ||
| { | ||
| if (enablePPAF && this.ConnectionPolicy.AvailabilityStrategy == null) | ||
| { | ||
| // Enable default hedging when PPAF is enabled and no explicit strategy is set | ||
| double defaultThresholdInMillis = Math.Min( | ||
| DocumentClient.DefaultHedgingThresholdInMilliseconds, | ||
| this.ConnectionPolicy.RequestTimeout.TotalMilliseconds / 2); | ||
|
|
||
| this.ConnectionPolicy.AvailabilityStrategy = AvailabilityStrategy.CrossRegionHedgingStrategy( | ||
| threshold: TimeSpan.FromMilliseconds(defaultThresholdInMillis), | ||
| thresholdStep: TimeSpan.FromMilliseconds(DocumentClient.DefaultHedgingThresholdStepInMilliseconds)); | ||
|
|
||
| DefaultTrace.TraceInformation("DocumentClient: Enabled default hedging strategy for PPAF"); | ||
| } | ||
| // Note: We don't disable hedging when PPAF is disabled, as the user might have set it explicitly | ||
| } | ||
|
|
||
| /// <summary> | ||
| /// Updates the GlobalPartitionEndpointManager based on current PPAF and circuit breaker settings | ||
| /// </summary> | ||
| private void UpdateGlobalPartitionEndpointManager() | ||
| { | ||
| // Create new GlobalPartitionEndpointManager instance with updated settings | ||
| GlobalPartitionEndpointManager newPartitionKeyRangeLocation = | ||
| this.ConnectionPolicy.EnablePartitionLevelFailover | ||
| || this.ConnectionPolicy.EnablePartitionLevelCircuitBreaker | ||
| ? new GlobalPartitionEndpointManagerCore( | ||
| this.GlobalEndpointManager, | ||
| this.ConnectionPolicy.EnablePartitionLevelFailover, | ||
| this.ConnectionPolicy.EnablePartitionLevelCircuitBreaker, | ||
| this.isThinClientEnabled) | ||
| : GlobalPartitionEndpointManagerNoOp.Instance; | ||
|
|
||
| // Dispose the old instance if it's disposable | ||
| if (this.PartitionKeyRangeLocation is IDisposable disposableOldManager) | ||
| { | ||
| disposableOldManager.Dispose(); | ||
| } | ||
|
|
||
| // Update the partition key range location | ||
| this.PartitionKeyRangeLocation = newPartitionKeyRangeLocation; | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could it cause any thread contention ? Can we use
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixed thread contention issue by using |
||
|
|
||
| // Update retry policy with new partition key range location | ||
| this.retryPolicy = new RetryPolicy( | ||
| globalEndpointManager: this.GlobalEndpointManager, | ||
| connectionPolicy: this.ConnectionPolicy, | ||
| partitionKeyRangeLocationCache: this.PartitionKeyRangeLocation, | ||
| isThinClientEnabled: this.isThinClientEnabled); | ||
|
|
||
| this.ResetSessionTokenRetryPolicy = this.retryPolicy; | ||
|
|
||
| DefaultTrace.TraceInformation("DocumentClient: Updated GlobalPartitionEndpointManager for dynamic PPAF change"); | ||
| } | ||
|
|
||
| internal void CaptureSessionToken(DocumentServiceRequest request, DocumentServiceResponse response) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -6,14 +6,44 @@ namespace Microsoft.Azure.Cosmos | |
| { | ||
| using System; | ||
| using System.Collections.Generic; | ||
| using System.Threading; | ||
| using System.Threading.Tasks; | ||
| using Microsoft.Azure.Cosmos.Core.Trace; | ||
| using Microsoft.Azure.Documents; | ||
|
|
||
| internal class CosmosAccountServiceConfiguration : IServiceConfigurationReader | ||
| internal class CosmosAccountServiceConfiguration : IServiceConfigurationReader, IDisposable | ||
| { | ||
| private Func<Task<AccountProperties>> accountPropertiesTaskFunc { get; } | ||
| private readonly Func<Task<AccountProperties>> accountPropertiesTaskFunc; | ||
| private readonly int refreshIntervalInSeconds; | ||
| private readonly object refreshLock = new object(); | ||
| private readonly CancellationTokenSource cancellationTokenSource = new CancellationTokenSource(); | ||
|
|
||
| private AccountProperties accountProperties; | ||
| private bool isBackgroundRefreshActive = false; | ||
| private Task backgroundRefreshTask; | ||
|
|
||
| internal AccountProperties AccountProperties | ||
| { | ||
| get | ||
| { | ||
| lock (this.refreshLock) | ||
| { | ||
| return this.accountProperties; | ||
| } | ||
| } | ||
| private set | ||
| { | ||
| lock (this.refreshLock) | ||
| { | ||
| this.accountProperties = value; | ||
| } | ||
| } | ||
| } | ||
|
|
||
| internal AccountProperties AccountProperties { get; private set; } | ||
| /// <summary> | ||
| /// Event that is raised when account properties are refreshed and PPAF enablement status changes | ||
| /// </summary> | ||
| internal event Action<bool?> OnEnablePartitionLevelFailoverChanged; | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we really need this event anymore ? Can we not leverage the event
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You're absolutely right! I've removed the redundant Changes made:
This eliminates the intermediate event layer and provides a cleaner, more direct flow: The functionality remains the same but with reduced complexity and fewer event subscriptions. (commit: 395c4bd) |
||
|
|
||
| public CosmosAccountServiceConfiguration(Func<Task<AccountProperties>> accountPropertiesTaskFunc) | ||
| { | ||
|
|
@@ -23,6 +53,7 @@ public CosmosAccountServiceConfiguration(Func<Task<AccountProperties>> accountPr | |
| } | ||
|
|
||
| this.accountPropertiesTaskFunc = accountPropertiesTaskFunc; | ||
| this.refreshIntervalInSeconds = ConfigurationManager.GetAccountPropertiesRefreshIntervalInSeconds(300); // Default 5 minutes | ||
| } | ||
|
|
||
| public IDictionary<string, object> QueryEngineConfiguration => this.AccountProperties.QueryEngineConfiguration; | ||
|
|
@@ -58,6 +89,135 @@ public async Task InitializeAsync() | |
| if (this.AccountProperties == null) | ||
| { | ||
| this.AccountProperties = await this.accountPropertiesTaskFunc(); | ||
| this.InitializeBackgroundRefresh(); | ||
| } | ||
| } | ||
|
|
||
| /// <summary> | ||
| /// Initializes and starts the background account properties refresh task | ||
| /// </summary> | ||
| private void InitializeBackgroundRefresh() | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There is already a background task in Let's try to leverage that instead of creating a new one.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You're absolutely right! I've refactored the implementation to leverage the existing background task in Changes made:
This approach is much cleaner as it:
The functionality remains the same - dynamic PPAF updates when account properties change - but now uses a single, centralized background refresh task. (commit: 6941697) |
||
| { | ||
| if (this.cancellationTokenSource.IsCancellationRequested) | ||
| { | ||
| return; | ||
| } | ||
|
|
||
| if (this.isBackgroundRefreshActive) | ||
| { | ||
| return; | ||
| } | ||
|
|
||
| lock (this.refreshLock) | ||
| { | ||
| if (this.isBackgroundRefreshActive) | ||
| { | ||
| return; | ||
| } | ||
|
|
||
| this.isBackgroundRefreshActive = true; | ||
| } | ||
|
|
||
| try | ||
| { | ||
| this.backgroundRefreshTask = this.InitiateAccountPropertiesRefreshLoopAsync(); | ||
| } | ||
| catch | ||
| { | ||
| this.isBackgroundRefreshActive = false; | ||
| throw; | ||
| } | ||
| } | ||
|
|
||
| /// <summary> | ||
| /// Runs a continuous loop with a delay to refresh the account properties periodically. | ||
| /// The loop will break when a cancellation is requested. | ||
| /// </summary> | ||
| #pragma warning disable VSTHRD100 // Avoid async void methods | ||
| private async Task InitiateAccountPropertiesRefreshLoopAsync() | ||
| #pragma warning restore VSTHRD100 // Avoid async void methods | ||
| { | ||
| while (!this.cancellationTokenSource.IsCancellationRequested) | ||
| { | ||
| try | ||
| { | ||
| await Task.Delay( | ||
| TimeSpan.FromSeconds(this.refreshIntervalInSeconds), | ||
| this.cancellationTokenSource.Token); | ||
|
|
||
| if (this.cancellationTokenSource.IsCancellationRequested) | ||
| { | ||
| break; | ||
| } | ||
|
|
||
| DefaultTrace.TraceInformation("CosmosAccountServiceConfiguration: Refreshing account properties."); | ||
| await this.RefreshAccountPropertiesAsync(); | ||
| } | ||
| catch (Exception ex) | ||
| { | ||
| if (this.cancellationTokenSource.IsCancellationRequested && (ex is OperationCanceledException || ex is ObjectDisposedException)) | ||
| { | ||
| break; | ||
| } | ||
|
|
||
| DefaultTrace.TraceCritical("CosmosAccountServiceConfiguration: Failed to refresh account properties. Exception: {0}", ex.Message); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| /// <summary> | ||
| /// Refreshes the account properties and notifies listeners if EnablePartitionLevelFailover changes | ||
| /// </summary> | ||
| private async Task RefreshAccountPropertiesAsync() | ||
| { | ||
| try | ||
| { | ||
| bool? previousEnablePartitionLevelFailover = this.AccountProperties?.EnablePartitionLevelFailover; | ||
| AccountProperties newProperties = await this.accountPropertiesTaskFunc(); | ||
|
|
||
| bool? newEnablePartitionLevelFailover = newProperties?.EnablePartitionLevelFailover; | ||
|
|
||
| // Check if PPAF enablement status has changed | ||
| if (previousEnablePartitionLevelFailover != newEnablePartitionLevelFailover) | ||
| { | ||
| DefaultTrace.TraceInformation( | ||
| "CosmosAccountServiceConfiguration: EnablePartitionLevelFailover changed from {0} to {1}", | ||
| previousEnablePartitionLevelFailover, | ||
| newEnablePartitionLevelFailover); | ||
|
|
||
| // Update the properties first | ||
| this.AccountProperties = newProperties; | ||
|
|
||
| // Notify listeners about the change | ||
| this.OnEnablePartitionLevelFailoverChanged?.Invoke(newEnablePartitionLevelFailover); | ||
| } | ||
| else | ||
| { | ||
| // Update properties even if PPAF status didn't change, as other properties might have changed | ||
| this.AccountProperties = newProperties; | ||
| } | ||
| } | ||
| catch (Exception ex) | ||
| { | ||
| DefaultTrace.TraceError("CosmosAccountServiceConfiguration: Error refreshing account properties: {0}", ex.Message); | ||
| } | ||
| } | ||
|
|
||
| /// <summary> | ||
| /// Disposes the background refresh task and cancellation token | ||
| /// </summary> | ||
| public void Dispose() | ||
| { | ||
| this.cancellationTokenSource?.Cancel(); | ||
| this.cancellationTokenSource?.Dispose(); | ||
|
|
||
| try | ||
| { | ||
| this.backgroundRefreshTask?.Wait(TimeSpan.FromSeconds(5)); | ||
| } | ||
| catch (Exception ex) | ||
| { | ||
| DefaultTrace.TraceError("CosmosAccountServiceConfiguration: Error disposing background refresh task: {0}", ex.Message); | ||
| } | ||
| } | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please update the user agent features here as well using
this.ConnectionPolicy.UserAgentContainer.AppendFeaturesThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added the user agent features update using
this.ConnectionPolicy.UserAgentContainer.AppendFeatures(this.GetUserAgentFeatures())when PPAF configuration changes dynamically. This ensures the user agent reflects the current PPAF state after dynamic updates. (commit: 49b3135)