Metadata Retry: Adds Cross-Region Operation-level Retry for Metadata Request Failures #5780
```diff
@@ -5,13 +5,25 @@
 namespace Microsoft.Azure.Cosmos.ChangeFeed.Bootstrapping
 {
     using System;
+    using System.Net.Http;
     using System.Threading.Tasks;
     using Microsoft.Azure.Cosmos.ChangeFeed.FeedManagement;
     using Microsoft.Azure.Cosmos.ChangeFeed.LeaseManagement;
     using Microsoft.Azure.Cosmos.Core.Trace;

     internal sealed class BootstrapperCore : Bootstrapper
     {
+        /// <summary>
+        /// Maximum number of times <see cref="InitializeAsync"/> will retry when
+        /// <see cref="PartitionSynchronizer.CreateMissingLeasesAsync"/> fails with a
+        /// regional error (e.g., <see cref="CosmosException"/> with 503 or
+        /// <see cref="HttpRequestException"/>). The retry is useful because
+        /// <see cref="MetadataRequestThrottleRetryPolicy"/> marks the failing
+        /// endpoint unavailable before propagating the error, so the next attempt
+        /// will be routed to a different region.
+        /// </summary>
+        internal const int MaxInitializationRetries = 3;
+
         internal static readonly TimeSpan DefaultSleepTime = TimeSpan.FromSeconds(15);
         internal static readonly TimeSpan DefaultLockTime = TimeSpan.FromSeconds(30);
@@ -50,6 +62,8 @@ public BootstrapperCore(PartitionSynchronizer synchronizer, DocumentServiceLease
         public override async Task InitializeAsync()
         {
+            int retryCount = 0;
+
             while (true)
             {
                 bool initialized = await this.leaseStore.IsInitializedAsync().ConfigureAwait(false);
@@ -73,6 +87,39 @@ public override async Task InitializeAsync()
                     await this.synchronizer.CreateMissingLeasesAsync().ConfigureAwait(false);
                     await this.leaseStore.MarkInitializedAsync().ConfigureAwait(false);
                 }
+                catch (CosmosException ex) when (retryCount < MaxInitializationRetries)
+                {
+                    // MetadataRequestThrottleRetryPolicy has already marked the
+                    // failing endpoint unavailable, so the next iteration will
+                    // route to a different region.
+                    retryCount++;
+                    DefaultTrace.TraceWarning(
+                        "BootstrapperCore: Regional failure during initialization "
+                        + "(StatusCode: {0}, SubStatusCode: {1}). "
+                        + "Attempt {2} of {3}. Retrying after {4}.",
+                        ex.StatusCode,
+                        ex.SubStatusCode,
+                        retryCount,
+                        MaxInitializationRetries,
+                        this.sleepTime);
+
+                    await Task.Delay(this.sleepTime).ConfigureAwait(false);
+                    continue;
+                }
```
Member

🟢 Suggestion — The review rules require verifying cancellation token propagation on all async methods. Both new `Task.Delay(this.sleepTime)` calls ignore cancellation, and the `InitializeAsync` override exposes no token to flow through, so a host shutting the processor down still waits out the full retry sleep. Consider adding a `CancellationToken` parameter and passing it to the delays.
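A minimal, self-contained sketch of the suggested shape — the `CancellationToken` parameter is hypothetical, since the current `InitializeAsync` override takes none:

```csharp
using System;
using System.Threading;
using System.Threading.Tasks;

class CancellationSketch
{
    // Hypothetical signature: the real InitializeAsync takes no token today.
    // With one, the 15-second retry sleep can be abandoned as soon as the
    // caller shuts down instead of always running to completion.
    static async Task InitializeAsync(TimeSpan sleepTime, CancellationToken cancellationToken)
    {
        await Task.Delay(sleepTime, cancellationToken).ConfigureAwait(false);
    }

    static async Task Main()
    {
        // Simulate a host shutdown 100 ms into a 15-second retry sleep.
        using var cts = new CancellationTokenSource(TimeSpan.FromMilliseconds(100));
        try
        {
            await InitializeAsync(TimeSpan.FromSeconds(15), cts.Token);
        }
        catch (OperationCanceledException)
        {
            Console.WriteLine("Retry delay cancelled promptly.");
        }
    }
}
```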
```diff
+                catch (HttpRequestException ex) when (retryCount < MaxInitializationRetries)
+                {
+                    retryCount++;
+                    DefaultTrace.TraceWarning(
+                        "BootstrapperCore: HttpRequestException during initialization: {0}. "
+                        + "Attempt {1} of {2}. Retrying after {3}.",
+                        ex.Message,
+                        retryCount,
+                        MaxInitializationRetries,
+                        this.sleepTime);
+
+                    await Task.Delay(this.sleepTime).ConfigureAwait(false);
+                    continue;
+                }
                 finally
                 {
                     if (isLockAcquired)
```
```diff
@@ -105,6 +105,21 @@ private static async Task<ResponseMessage> ExecuteHttpRequestAsync(
                     throw;
                 }
             }
+            catch (CosmosException cosmosException)
+            {
+                // Metadata requests (e.g., pkranges) that fail with a regional
+                // error throw CosmosException from within the pipeline. Without
+                // this catch, the exception escapes the retry loop and is caught
+                // by the outer catch in SendAsync, which converts it to a
+                // ResponseMessage without consulting the retry policy.
+                // By catching it here, ClientRetryPolicy can evaluate the
+                // failure and retry the entire operation in another region.
+                result = await callShouldRetryException(cosmosException, cancellationToken);
+                if (!result.ShouldRetry)
+                {
+                    throw;
+                }
+            }
```
Member

🟡 Recommendation — Missing test coverage for the most critical change in this PR

This new `catch (CosmosException)` in the HTTP retry loop is the core behavioral change: without it, a `CosmosException` thrown from inside the pipeline bypasses the retry policy entirely. However, there is no test exercising this path.

Suggestion: Add a test that throws a regional `CosmosException` from the request pipeline and asserts that the retry callback is consulted and the operation is retried. This is especially important because a regression here would silently disable the cross-region failover this PR introduces.
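A self-contained sketch of the behavior such a test would pin down — `RetryLoopSketch` and its delegates are stand-ins for the SDK's internal pipeline types, not real APIs:

```csharp
using System;
using System.Threading;
using System.Threading.Tasks;

static class RetryLoopSketch
{
    // Mirrors the shape of the fix: exceptions thrown by the request are fed
    // to the retry callback instead of escaping the loop.
    static async Task<string> ExecuteAsync(
        Func<Task<string>> request,
        Func<Exception, CancellationToken, Task<bool>> shouldRetry,
        CancellationToken cancellationToken)
    {
        while (true)
        {
            try
            {
                return await request();
            }
            catch (Exception ex)
            {
                if (!await shouldRetry(ex, cancellationToken))
                {
                    throw;
                }
            }
        }
    }

    static async Task Main()
    {
        int attempts = 0;
        string result = await ExecuteAsync(
            request: () => ++attempts < 2
                ? throw new Exception("503 from region 1")  // first attempt fails
                : Task.FromResult("ok from region 2"),      // retry succeeds
            shouldRetry: (ex, ct) => Task.FromResult(true), // policy consulted: retry
            CancellationToken.None);

        Console.WriteLine($"attempts={attempts}, result={result}"); // attempts=2
    }
}
```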
```diff
 
             TimeSpan backoffTime = result.BackoffTime;
             if (backoffTime != TimeSpan.Zero)
```
```diff
@@ -6,6 +6,7 @@ namespace Microsoft.Azure.Cosmos
 {
     using System;
     using System.Net;
+    using System.Net.Http;
     using System.Threading;
     using System.Threading.Tasks;
     using Microsoft.Azure.Cosmos.Core.Trace;
@@ -14,6 +15,9 @@ namespace Microsoft.Azure.Cosmos
     /// <summary>
     /// Metadata Request Throttle Retry Policy is combination of endpoint change retry + throttling retry.
+    /// On regional failures the policy marks the endpoint unavailable and retries on the next
+    /// preferred region. Once all regions have been attempted, the exception propagates to the
+    /// operation-level retry policy (e.g. <see cref="ClientRetryPolicy"/>) for cross-region failover.
     /// </summary>
     internal sealed class MetadataRequestThrottleRetryPolicy : IDocumentClientRetryPolicy
     {
@@ -43,8 +47,8 @@ internal sealed class MetadataRequestThrottleRetryPolicy : IDocumentClientRetryP
         private readonly int maxUnavailableEndpointRetryCount;
 
         /// <summary>
-        /// An instance of <see cref="Uri"/> containing the location endpoint where the partition key
-        /// range http request will be sent over.
+        /// An instance of <see cref="MetadataRetryContext"/> containing the location index
+        /// and preferred-location flag used to route the next retry attempt.
         /// </summary>
         private MetadataRetryContext retryContext;
@@ -53,6 +57,13 @@ internal sealed class MetadataRequestThrottleRetryPolicy : IDocumentClientRetryP
         /// </summary>
         private int unavailableEndpointRetryCount;
 
+        /// <summary>
+        /// The resolved location endpoint for the current attempt. Used to mark
+        /// the endpoint as unavailable in the <see cref="IGlobalEndpointManager"/> when
+        /// a regional failure is detected.
+        /// </summary>
+        private Uri locationEndpoint;
```
Member

💬 Observation — Design: marking the endpoint unavailable affects ALL reads, not just metadata

This is the correct trade-off: if a region is failing for metadata reads (partition key range lookups), it's very likely failing for data reads too. The impact is soft (deprioritized, not blocked) and self-healing. This aligns with how the operation-level `ClientRetryPolicy` already treats regional failures. Also confirmed: double-marking the endpoint (from both this policy and the operation-level policy) is safe.
```diff
+
         /// <summary>
         /// The request being sent to the service.
         /// </summary>
```
```diff
@@ -124,17 +135,36 @@ public Task<ShouldRetryResult> ShouldRetryAsync(
                     clientException.GetSubStatus(),
                     exception, cancellationToken);
             }
-            else
-            {
-                DefaultTrace.TraceInformation("MetadataRequestThrottleRetryPolicy: Evaluating retry for Exception of type: {0}, Message: {1}, ResourceType {2}, CollectionName {3}, ResourceID {4}",
-                    exception.GetType().Name,
-                    exception.Message,
-                    this.request.ResourceType,
-                    this.request.CollectionName,
-                    this.request.ResourceId);
-
-                return this.throttlingRetryPolicy.ShouldRetryAsync(exception, cancellationToken);
-            }
+
+            if (exception is HttpRequestException)
+            {
+                DefaultTrace.TraceWarning("MetadataRequestThrottleRetryPolicy: HttpRequestException received. Marking endpoint {0} unavailable. ResourceType {1}, CollectionName {2}, ResourceID {3}.",
+                    this.locationEndpoint,
+                    this.request?.ResourceType,
+                    this.request?.CollectionName,
+                    this.request?.ResourceId);
+
+                return Task.FromResult(this.HandleRegionalFailure());
+            }
+
+            if (exception is OperationCanceledException && !cancellationToken.IsCancellationRequested)
+            {
+                DefaultTrace.TraceWarning("MetadataRequestThrottleRetryPolicy: Non-user OperationCanceledException received. Marking endpoint {0} unavailable. ResourceType {1}, CollectionName {2}, ResourceID {3}.",
+                    this.locationEndpoint,
+                    this.request?.ResourceType,
+                    this.request?.CollectionName,
+                    this.request?.ResourceId);
+
+                return Task.FromResult(this.HandleRegionalFailure());
+            }
```
Member

🟢 Suggestion — Novel pattern: treating a non-user `OperationCanceledException` as a regional failure

This is a new behavioral pattern not present in any sibling retry policy. The rationale here (transport timeout = region issue) is reasonable for metadata requests, since a timeout on a metadata read is a strong signal of regional unavailability. But this creates a behavioral divergence between the metadata-level and operation-level policies interpreting the same exception type.
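The distinction the new guard relies on can be reproduced with plain `HttpClient`: a transport timeout surfaces as `TaskCanceledException` (a subclass of `OperationCanceledException`) while the caller's token stays uncancelled. A self-contained demo (the URL is just a placeholder):

```csharp
using System;
using System.Net.Http;
using System.Threading;
using System.Threading.Tasks;

class TimeoutVsUserCancellation
{
    static async Task Main()
    {
        using var userCts = new CancellationTokenSource();   // never cancelled by the "user"
        using var client = new HttpClient { Timeout = TimeSpan.FromMilliseconds(1) };

        try
        {
            await client.GetAsync("https://example.com/", userCts.Token);
        }
        catch (OperationCanceledException)
        {
            // Transport timeout, not user intent: exactly the case the guard
            // (exception is OperationCanceledException && !cancellationToken.IsCancellationRequested)
            // classifies as a regional failure.
            Console.WriteLine($"User token cancelled? {userCts.Token.IsCancellationRequested}"); // False
        }
    }
}
```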
```diff
+
+            DefaultTrace.TraceInformation("MetadataRequestThrottleRetryPolicy: Evaluating retry for Exception of type: {0}, Message: {1}, ResourceType {2}, CollectionName {3}, ResourceID {4}",
+                exception.GetType().Name,
+                exception.Message,
+                this.request.ResourceType,
+                this.request.CollectionName,
+                this.request.ResourceId);
+
+            return this.throttlingRetryPolicy.ShouldRetryAsync(exception, cancellationToken);
         }
```
```diff
@@ -154,10 +184,17 @@ private Task<ShouldRetryResult> ShouldRetryInternalAsync(
                 || (statusCode == HttpStatusCode.Gone && subStatus == SubStatusCodes.LeaseNotFound)
                 || (statusCode == HttpStatusCode.Forbidden && subStatus == SubStatusCodes.DatabaseAccountNotFound))
             {
-                if (this.IncrementRetryIndexOnUnavailableEndpointForMetadataRead())
-                {
-                    return Task.FromResult(ShouldRetryResult.RetryAfter(TimeSpan.Zero));
-                }
+                DefaultTrace.TraceWarning(
+                    "MetadataRequestThrottleRetryPolicy: Regional failure detected (StatusCode: {0}, SubStatusCode: {1}). "
+                    + "Marking endpoint {2} unavailable. ResourceType {3}, CollectionName {4}, ResourceID {5}.",
+                    statusCode,
+                    subStatus,
+                    this.locationEndpoint,
+                    this.request?.ResourceType,
+                    this.request?.CollectionName,
+                    this.request?.ResourceId);
+
+                return Task.FromResult(this.HandleRegionalFailure());
             }
 
             return this.throttlingRetryPolicy.ShouldRetryAsync(exception, cancellationToken);
```
```diff
@@ -196,10 +233,17 @@ private Task<ShouldRetryResult> ShouldRetryInternalAsync(
                 || (statusCode == HttpStatusCode.Gone && subStatus == SubStatusCodes.LeaseNotFound)
                 || (statusCode == HttpStatusCode.Forbidden && subStatus == SubStatusCodes.DatabaseAccountNotFound))
             {
-                if (this.IncrementRetryIndexOnUnavailableEndpointForMetadataRead())
-                {
-                    return Task.FromResult(ShouldRetryResult.RetryAfter(TimeSpan.Zero));
-                }
+                DefaultTrace.TraceWarning(
+                    "MetadataRequestThrottleRetryPolicy: Regional failure detected in response (StatusCode: {0}, SubStatusCode: {1}). "
+                    + "Marking endpoint {2} unavailable. ResourceType {3}, CollectionName {4}, ResourceID {5}.",
+                    statusCode,
+                    subStatus,
+                    this.locationEndpoint,
+                    this.request?.ResourceType,
+                    this.request?.CollectionName,
+                    this.request?.ResourceId);
+
+                return Task.FromResult(this.HandleRegionalFailure());
             }
 
             return this.throttlingRetryPolicy.ShouldRetryAsync(responseMessage, cancellationToken);
```
```diff
@@ -219,21 +263,61 @@ public void OnBeforeSendRequest(DocumentServiceRequest request)
                 this.retryContext.RetryLocationIndex,
                 this.retryContext.RetryRequestOnPreferredLocations);
 
-            Uri metadataLocationEndpoint = this.globalEndpointManager.ResolveServiceEndpoint(request);
+            this.locationEndpoint = this.globalEndpointManager.ResolveServiceEndpoint(request);
 
             DefaultTrace.TraceInformation("MetadataRequestThrottleRetryPolicy: Routing the metadata request to: {0} for operation type: {1} and resource type: {2} for collection: {3} with collection rid {4}.",
-                metadataLocationEndpoint,
+                this.locationEndpoint,
                 request.OperationType,
                 request.ResourceType,
                 request.CollectionName,
                 request.ResourceId);
-            request.RequestContext.RouteToLocation(metadataLocationEndpoint);
+            request.RequestContext.RouteToLocation(this.locationEndpoint);
         }
 
+        /// <summary>
+        /// Marks the current endpoint as unavailable and attempts to increment the
+        /// retry location index so the next attempt targets a different region.
+        /// </summary>
+        /// <returns>
+        /// <see cref="ShouldRetryResult"/> with <c>ShouldRetry = true</c> if there are still
+        /// regions left to try; <see cref="ShouldRetryResult"/> with <c>ShouldRetry = false</c> otherwise,
+        /// allowing the exception to propagate to the operation-level retry policy.
+        /// </returns>
+        private ShouldRetryResult HandleRegionalFailure()
+        {
+            this.MarkEndpointUnavailable();
+
+            if (this.IncrementRetryIndexOnUnavailableEndpointForMetadataRead())
+            {
+                return ShouldRetryResult.RetryAfter(TimeSpan.Zero);
+            }
+
+            return ShouldRetryResult.NoRetry();
+        }
```
Member

💬 Observation — Clean consolidation with a subtle behavioral change

One subtle behavioral change worth noting: previously, when `IncrementRetryIndexOnUnavailableEndpointForMetadataRead()` returned false, the failure still fell through to the throttling retry policy; `HandleRegionalFailure()` now returns `NoRetry()` directly. This is functionally equivalent today because the throttling policy only retries 429s, and these regional status codes are not 429s.

Not blocking — just calling it out for awareness.
```diff
+        /// <summary>
+        /// Marks the current <see cref="locationEndpoint"/> as unavailable for reads
+        /// in the <see cref="IGlobalEndpointManager"/>. This acts as a hint to the
+        /// <see cref="LocationCache"/> so that all subsequent calls to
+        /// <see cref="IGlobalEndpointManager.ResolveServiceEndpoint"/> will prefer
+        /// other regions.
+        /// </summary>
+        private void MarkEndpointUnavailable()
+        {
+            if (this.locationEndpoint != null)
+            {
+                DefaultTrace.TraceWarning(
+                    "MetadataRequestThrottleRetryPolicy: Marking endpoint {0} unavailable for reads.",
+                    this.locationEndpoint);
+
+                this.globalEndpointManager.MarkEndpointUnavailableForRead(this.locationEndpoint);
+            }
+        }
+
         /// <summary>
-        /// Increments the location index when a unavailable endpoint exception ocurrs, for any future read requests.
+        /// Increments the location index when an unavailable endpoint exception occurs, for any future read requests.
         /// </summary>
-        /// <returns>A boolean flag indicating if the operation was successful.</returns>
+        /// <returns>A boolean flag indicating if there are still regions left to try.</returns>
         private bool IncrementRetryIndexOnUnavailableEndpointForMetadataRead()
         {
             if (this.unavailableEndpointRetryCount++ >= this.maxUnavailableEndpointRetryCount)
@@ -242,8 +326,6 @@ private bool IncrementRetryIndexOnUnavailableEndpointForMetadataRead()
                 return false;
             }
 
-            // Retrying on second PreferredLocations.
-            // RetryCount is used as zero-based index.
             DefaultTrace.TraceWarning("MetadataRequestThrottleRetryPolicy: Incrementing the metadata retry location index to: {0}.", this.unavailableEndpointRetryCount);
             this.retryContext = new MetadataRetryContext()
             {
```
🟡 Recommendation — Overbroad exception filter: catches all `CosmosException` types, including non-transient errors

The `when` guard only checks `retryCount`, but the XML doc and comment explicitly say this is for "regional error (e.g., `CosmosException` with 503 or `HttpRequestException`)". The code doesn't enforce this — a 400 BadRequest, 404 NotFound, or 409 Conflict will also be caught and retried 3 times with 15-second delays (up to 45 seconds wasted on a deterministic failure).

This is inconsistent with `MetadataRequestThrottleRetryPolicy.ShouldRetryInternalAsync()` in this same PR, which carefully discriminates by status code (503, 500, 410/LeaseNotFound, 403/DatabaseAccountNotFound).

Concrete failure scenario: if the lease container is deleted mid-initialization, `CreateMissingLeasesAsync()` throws a 404 `CosmosException`. This catch retries 3× against the same endpoint with the same result, adding ~45 seconds of latency before finally propagating.

Suggestion: Add a status code filter to align with the policy:
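The suggested snippet itself was not captured on this page; a minimal sketch of what such a filter could look like, with `IsTransientRegionalFailure` as a hypothetical helper mirroring the status codes that `ShouldRetryInternalAsync` treats as regional (assumes `using System.Net;` in BootstrapperCore and visibility of the SDK-internal `SubStatusCodes` enum, which the diff above already uses):

```csharp
// Hypothetical helper (not in the PR): narrows the retry to the regional /
// transient status codes, matching MetadataRequestThrottleRetryPolicy.
private static bool IsTransientRegionalFailure(CosmosException ex)
{
    return ex.StatusCode == HttpStatusCode.ServiceUnavailable          // 503
        || ex.StatusCode == HttpStatusCode.InternalServerError         // 500
        || (ex.StatusCode == HttpStatusCode.Gone                       // 410
            && ex.SubStatusCode == (int)SubStatusCodes.LeaseNotFound)
        || (ex.StatusCode == HttpStatusCode.Forbidden                  // 403
            && ex.SubStatusCode == (int)SubStatusCodes.DatabaseAccountNotFound);
}

// The catch in InitializeAsync would then read:
//   catch (CosmosException ex)
//       when (retryCount < MaxInitializationRetries && IsTransientRegionalFailure(ex))
```

This keeps deterministic failures (400/404/409) propagating immediately while preserving the cross-region retry for genuinely transient regional errors.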