Skip to content
Closed
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
181 changes: 164 additions & 17 deletions Microsoft.Azure.Cosmos/src/DocumentClient.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1416,6 +1416,8 @@ public void Dispose()

if (this.GlobalEndpointManager != null)
{
// Unsubscribe from account properties refresh events
this.GlobalEndpointManager.OnAccountPropertiesRefreshed -= this.HandleAccountPropertiesRefreshed;
this.GlobalEndpointManager.Dispose();
this.GlobalEndpointManager = null;
}
Expand All @@ -1431,6 +1433,16 @@ public void Dispose()
this.initTaskCache = null;
}

if (this.accountServiceConfiguration != null)
{
this.accountServiceConfiguration = null;
}

if (this.PartitionKeyRangeLocation is IDisposable disposablePartitionManager)
{
disposablePartitionManager.Dispose();
}

DefaultTrace.TraceInformation("DocumentClient with id {0} disposed.", this.traceId);
DefaultTrace.Flush();

Expand Down Expand Up @@ -6843,10 +6855,16 @@ private async Task InitializeGatewayConfigurationReaderAsync()

this.accountServiceConfiguration = new CosmosAccountServiceConfiguration(accountReader.InitializeReaderAsync);

await this.accountServiceConfiguration.InitializeAsync();
AccountProperties accountProperties = this.accountServiceConfiguration.AccountProperties;
// Subscribe to account properties changes for dynamic PPAF updates
this.accountServiceConfiguration.OnEnablePartitionLevelFailoverChanged += this.HandleEnablePartitionLevelFailoverChanged;

await this.accountServiceConfiguration.InitializeAsync();
AccountProperties accountProperties = this.accountServiceConfiguration.AccountProperties;
this.UseMultipleWriteLocations = this.ConnectionPolicy.UseMultipleWriteLocations && accountProperties.EnableMultipleWriteLocations;
this.GlobalEndpointManager.InitializeAccountPropertiesAndStartBackgroundRefresh(accountProperties);

// Subscribe to GlobalEndpointManager account properties refresh events to update CosmosAccountServiceConfiguration
this.GlobalEndpointManager.OnAccountPropertiesRefreshed += this.HandleAccountPropertiesRefreshed;
}

internal string GetUserAgentFeatures()
Expand Down Expand Up @@ -6875,21 +6893,150 @@ internal string GetUserAgentFeatures()
return featureFlag == 0 ? string.Empty : $"F{featureFlag:X}";
}

internal void InitializePartitionLevelFailoverWithDefaultHedging()
{
if (this.ConnectionPolicy.EnablePartitionLevelFailover
&& this.ConnectionPolicy.AvailabilityStrategy == null)
{
// The default threshold is the minimum value of 1 second and a fraction (currently it's half) of
// the request timeout value provided by the end customer.
double defaultThresholdInMillis = Math.Min(
DocumentClient.DefaultHedgingThresholdInMilliseconds,
this.ConnectionPolicy.RequestTimeout.TotalMilliseconds / 2);

this.ConnectionPolicy.AvailabilityStrategy = AvailabilityStrategy.CrossRegionHedgingStrategy(
threshold: TimeSpan.FromMilliseconds(defaultThresholdInMillis),
thresholdStep: TimeSpan.FromMilliseconds(DocumentClient.DefaultHedgingThresholdStepInMilliseconds));
}
internal void InitializePartitionLevelFailoverWithDefaultHedging()
{
if (this.ConnectionPolicy.EnablePartitionLevelFailover
&& this.ConnectionPolicy.AvailabilityStrategy == null)
{
// The default threshold is the minimum value of 1 second and a fraction (currently it's half) of
// the request timeout value provided by the end customer.
double defaultThresholdInMillis = Math.Min(
DocumentClient.DefaultHedgingThresholdInMilliseconds,
this.ConnectionPolicy.RequestTimeout.TotalMilliseconds / 2);

this.ConnectionPolicy.AvailabilityStrategy = AvailabilityStrategy.CrossRegionHedgingStrategy(
threshold: TimeSpan.FromMilliseconds(defaultThresholdInMillis),
thresholdStep: TimeSpan.FromMilliseconds(DocumentClient.DefaultHedgingThresholdStepInMilliseconds));
}
}

/// <summary>
/// Handles dynamic changes to the EnablePartitionLevelFailover flag from account properties refresh
/// </summary>
/// <param name="newEnablePartitionLevelFailover">The new value of the EnablePartitionLevelFailover flag</param>
private void HandleEnablePartitionLevelFailoverChanged(bool? newEnablePartitionLevelFailover)
{
try
{
// Only update if client-level override is not disabled
if (this.ConnectionPolicy.DisablePartitionLevelFailoverClientLevelOverride)
{
DefaultTrace.TraceInformation("DocumentClient: PPAF change ignored due to client-level override disabled");
return;
}

bool previousValue = this.ConnectionPolicy.EnablePartitionLevelFailover;
bool newValue = newEnablePartitionLevelFailover ?? false;

if (previousValue == newValue)
{
// No actual change in effective value
return;
}

DefaultTrace.TraceInformation(
"DocumentClient: Updating EnablePartitionLevelFailover from {0} to {1}",
previousValue,
newValue);

// Update the connection policy
this.ConnectionPolicy.EnablePartitionLevelFailover = newValue;

// Update circuit breaker enablement
this.ConnectionPolicy.EnablePartitionLevelCircuitBreaker |= newValue;

// Update availability strategy for read hedging
this.UpdateAvailabilityStrategyForPPAF(newValue);

// Update the GlobalPartitionEndpointManager
this.UpdateGlobalPartitionEndpointManager();

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please update the user agent features here as well using this.ConnectionPolicy.UserAgentContainer.AppendFeatures

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added the user agent features update using this.ConnectionPolicy.UserAgentContainer.AppendFeatures(this.GetUserAgentFeatures()) when PPAF configuration changes dynamically. This ensures the user agent reflects the current PPAF state after dynamic updates. (commit: 49b3135)

DefaultTrace.TraceInformation("DocumentClient: Successfully updated PPAF configuration dynamically");
}
catch (Exception ex)
{
DefaultTrace.TraceError("DocumentClient: Error handling PPAF change: {0}", ex.Message);
}
}

/// <summary>
/// Updates the availability strategy based on PPAF enablement
/// </summary>
/// <param name="enablePPAF">Whether PPAF is enabled</param>
private void UpdateAvailabilityStrategyForPPAF(bool enablePPAF)
{
if (enablePPAF && this.ConnectionPolicy.AvailabilityStrategy == null)
{
// Enable default hedging when PPAF is enabled and no explicit strategy is set
double defaultThresholdInMillis = Math.Min(
DocumentClient.DefaultHedgingThresholdInMilliseconds,
this.ConnectionPolicy.RequestTimeout.TotalMilliseconds / 2);

this.ConnectionPolicy.AvailabilityStrategy = AvailabilityStrategy.CrossRegionHedgingStrategy(
threshold: TimeSpan.FromMilliseconds(defaultThresholdInMillis),
thresholdStep: TimeSpan.FromMilliseconds(DocumentClient.DefaultHedgingThresholdStepInMilliseconds));

DefaultTrace.TraceInformation("DocumentClient: Enabled default hedging strategy for PPAF");
}
// Note: We don't disable hedging when PPAF is disabled, as the user might have set it explicitly
}

/// <summary>
/// Updates the GlobalPartitionEndpointManager based on current PPAF and circuit breaker settings
/// </summary>
private void UpdateGlobalPartitionEndpointManager()
{
// Create new GlobalPartitionEndpointManager instance with updated settings
GlobalPartitionEndpointManager newPartitionKeyRangeLocation =
this.ConnectionPolicy.EnablePartitionLevelFailover
|| this.ConnectionPolicy.EnablePartitionLevelCircuitBreaker
? new GlobalPartitionEndpointManagerCore(
this.GlobalEndpointManager,
this.ConnectionPolicy.EnablePartitionLevelFailover,
this.ConnectionPolicy.EnablePartitionLevelCircuitBreaker,
this.isThinClientEnabled)
: GlobalPartitionEndpointManagerNoOp.Instance;

// Dispose the old instance if it's disposable
if (this.PartitionKeyRangeLocation is IDisposable disposableOldManager)
{
disposableOldManager.Dispose();
}

// Update the partition key range location
this.PartitionKeyRangeLocation = newPartitionKeyRangeLocation;
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could it cause any thread contention ? Can we use Interlocked.CompareExchange to alter the object ?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed thread contention issue by using Interlocked.Exchange for atomic updates to PartitionKeyRangeLocation. Changed from auto-property to field with property accessor and used Interlocked.Exchange(ref this.partitionKeyRangeLocation, newPartitionKeyRangeLocation) to atomically update the reference while avoiding race conditions during concurrent access. (commit: ec9fd26)


// Update retry policy with new partition key range location
this.retryPolicy = new RetryPolicy(
globalEndpointManager: this.GlobalEndpointManager,
connectionPolicy: this.ConnectionPolicy,
partitionKeyRangeLocationCache: this.PartitionKeyRangeLocation,
isThinClientEnabled: this.isThinClientEnabled);

this.ResetSessionTokenRetryPolicy = this.retryPolicy;

DefaultTrace.TraceInformation("DocumentClient: Updated GlobalPartitionEndpointManager for dynamic PPAF change");
}

/// <summary>
/// Handles account properties refresh events from GlobalEndpointManager
/// Updates the CosmosAccountServiceConfiguration with new account properties
/// </summary>
/// <param name="accountProperties">The refreshed account properties</param>
private void HandleAccountPropertiesRefreshed(AccountProperties accountProperties)
{
try
{
DefaultTrace.TraceInformation("DocumentClient: Received account properties refresh from GlobalEndpointManager");

// Update the CosmosAccountServiceConfiguration with the new account properties
// This will trigger the OnEnablePartitionLevelFailoverChanged event if PPAF value changed
this.accountServiceConfiguration?.UpdateAccountProperties(accountProperties);
}
catch (Exception ex)
{
DefaultTrace.TraceError("DocumentClient: Error handling account properties refresh: {0}", ex.Message);
}
}

internal void CaptureSessionToken(DocumentServiceRequest request, DocumentServiceResponse response)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,11 @@ internal class CosmosAccountServiceConfiguration : IServiceConfigurationReader

internal AccountProperties AccountProperties { get; private set; }

/// <summary>
/// Event that is raised when account properties are refreshed and PPAF enablement status changes
/// </summary>
internal event Action<bool?> OnEnablePartitionLevelFailoverChanged;
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we really need this event anymore ? Can we not leverage the event OnAccountPropertiesRefreshed from GlobalEndpointManager directly ?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You're absolutely right! I've removed the redundant OnEnablePartitionLevelFailoverChanged event and simplified the architecture to leverage the existing OnAccountPropertiesRefreshed event from GlobalEndpointManager directly.

Changes made:

  • Removed OnEnablePartitionLevelFailoverChanged event from CosmosAccountServiceConfiguration
  • Enhanced DocumentClient.HandleAccountPropertiesRefreshed() to directly compare previous and new PPAF values and handle changes
  • Simplified CosmosAccountServiceConfiguration.UpdateAccountProperties() to only update properties without event firing
  • Updated unit tests to reflect the new direct architecture

This eliminates the intermediate event layer and provides a cleaner, more direct flow:
GlobalEndpointManager.OnAccountPropertiesRefreshedDocumentClient.HandleAccountPropertiesRefreshed() → direct PPAF change handling

The functionality remains the same but with reduced complexity and fewer event subscriptions. (commit: 395c4bd)


public CosmosAccountServiceConfiguration(Func<Task<AccountProperties>> accountPropertiesTaskFunc)
{
if (accountPropertiesTaskFunc == null)
Expand Down Expand Up @@ -60,5 +65,29 @@ public async Task InitializeAsync()
this.AccountProperties = await this.accountPropertiesTaskFunc();
}
}

/// <summary>
/// Updates the account properties and notifies listeners if EnablePartitionLevelFailover changes
/// This method is called by GlobalEndpointManager when account properties are refreshed
/// </summary>
internal void UpdateAccountProperties(AccountProperties newProperties)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not needed. Let's get this removed.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removed the UpdateAccountProperties method and updated the event handler to focus only on PPAF handling without updating accountServiceConfiguration. Also simplified the tests accordingly. (commit: e610a29)

{
if (newProperties == null)
{
return;
}

bool? previousEnablePartitionLevelFailover = this.AccountProperties?.EnablePartitionLevelFailover;
bool? newEnablePartitionLevelFailover = newProperties.EnablePartitionLevelFailover;

// Update the properties first
this.AccountProperties = newProperties;

// Check if PPAF enablement status has changed and notify listeners
if (previousEnablePartitionLevelFailover != newEnablePartitionLevelFailover)
{
this.OnEnablePartitionLevelFailoverChanged?.Invoke(newEnablePartitionLevelFailover);
}
}
}
}
24 changes: 16 additions & 8 deletions Microsoft.Azure.Cosmos/src/Routing/GlobalEndpointManager.cs
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,11 @@ internal class GlobalEndpointManager : IGlobalEndpointManager
private bool isBackgroundAccountRefreshActive = false;
private DateTime LastBackgroundRefreshUtc = DateTime.MinValue;

/// <summary>
/// Event that is raised when account properties are refreshed and PPAF enablement status changes
/// </summary>
internal event Action<AccountProperties>? OnAccountPropertiesRefreshed;
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's call this as OnEnablePartitionLevelFailoverConfigChanged and invoke the event from GlobalEndpointManager only when the PPAF flag is updated.

Update the DocumentClient implementation accordingly.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've renamed the event to OnEnablePartitionLevelFailoverConfigChanged and updated the implementation to only invoke the event when the PPAF flag actually changes. The GlobalEndpointManager now tracks the previous PPAF value and compares it during each refresh, only triggering the event when there's an actual change. Updated the DocumentClient implementation accordingly to use the new event name and simplified the handler since the comparison is now done in GlobalEndpointManager. (commit: 9f329f8)


public GlobalEndpointManager(
IDocumentClientInternal owner,
ConnectionPolicy connectionPolicy,
Expand Down Expand Up @@ -763,17 +768,20 @@ private async Task RefreshDatabaseAccountInternalAsync(bool forceRefresh)
{
this.LastBackgroundRefreshUtc = DateTime.UtcNow;
AccountProperties accountProperties = await this.GetDatabaseAccountAsync(true);
GlobalEndpointManager.ParseThinClientLocationsFromAdditionalProperties(accountProperties);

GlobalEndpointManager.ParseThinClientLocationsFromAdditionalProperties(accountProperties);
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this really a change ? Please fix the code diff.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed the code diff by reverting the file to original state and reapplying only the necessary changes. Removed the unnecessary whitespace-only change that was appearing in the diff. (commit: 526d0d8)


this.locationCache.OnDatabaseAccountRead(accountProperties);

}
catch (Exception ex)
{
DefaultTrace.TraceWarning("Failed to refresh database account with exception: {0}. Activity Id: '{1}'",
ex.Message,
System.Diagnostics.Trace.CorrelationManager.ActivityId);
// Raise event to notify about account properties refresh
this.OnAccountPropertiesRefreshed?.Invoke(accountProperties);

}
catch (Exception ex)
{
DefaultTrace.TraceWarning("Failed to refresh database account with exception: {0}. Activity Id: '{1}'",
ex.Message,
System.Diagnostics.Trace.CorrelationManager.ActivityId);
}
finally
{
Expand Down
24 changes: 23 additions & 1 deletion Microsoft.Azure.Cosmos/src/Util/ConfigurationManager.cs
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,13 @@ internal static class ConfigurationManager
/// <summary>
/// Environment variable name to enable thin client mode.
/// </summary>
internal static readonly string ThinClientModeEnabled = "AZURE_COSMOS_THIN_CLIENT_ENABLED";
internal static readonly string ThinClientModeEnabled = "AZURE_COSMOS_THIN_CLIENT_ENABLED";
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These are not diffs. There are no changes. Please fix these diffs and any other places applicable within the scope of this PR.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed the spurious diffs in ConfigurationManager.cs and GlobalEndpointManager.cs by reverting to original state and reapplying only the necessary changes without whitespace modifications. (commit: 5dc92c8)


/// <summary>
/// A read-only string containing the environment variable name for capturing the account properties refresh interval
/// in seconds. The default value for this interval is 300 seconds (5 minutes).
/// </summary>
internal static readonly string AccountPropertiesRefreshIntervalInSeconds = "AZURE_COSMOS_ACCOUNT_PROPERTIES_REFRESH_INTERVAL_IN_SECONDS";
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remove this as well.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removed the empty line that was left after removing the AccountPropertiesRefreshIntervalInSeconds constant. (commit: 526d0d8)


/// <summary>
/// Environment variable to override AAD scope.
Expand Down Expand Up @@ -297,6 +303,22 @@ public static double GetCircuitBreakerTimeoutCounterResetWindowInMinutes(
defaultValue: defaultValue);
}

/// <summary>
/// Gets the account properties refresh interval in seconds.
/// The user can set the respective environment variable 'AZURE_COSMOS_ACCOUNT_PROPERTIES_REFRESH_INTERVAL_IN_SECONDS'
/// to override the value.
/// </summary>
/// <param name="defaultValue">An integer containing the default value for the refresh interval in seconds.</param>
/// <returns>An integer representing the refresh interval in seconds.</returns>
public static int GetAccountPropertiesRefreshIntervalInSeconds(
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is no usage as we got rid of the redundant code for account refresh. Please remove this.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You're correct! I've removed the unused GetAccountPropertiesRefreshIntervalInSeconds method and the associated AccountPropertiesRefreshIntervalInSeconds constant since they're no longer needed after refactoring to use the existing GlobalEndpointManager background task. (commit: 47dcb90)

int defaultValue)
{
return ConfigurationManager
.GetEnvironmentVariable(
variable: ConfigurationManager.AccountPropertiesRefreshIntervalInSeconds,
defaultValue: defaultValue);
}

/// <summary>
/// Gets the boolean value indicating whether optimistic direct execution is enabled based on the environment variable override.
/// </summary>
Expand Down
Loading
Loading