From 141246f0195a3e174b63dc3d1d4ddf9867875638 Mon Sep 17 00:00:00 2001 From: Mikael Weaver Date: Tue, 14 Jan 2025 14:59:06 -0800 Subject: [PATCH 01/15] Moved cosmos job hosting to use unified retry logic --- .../Reindex/ReindexJobWorkerTests.cs | 3 ++ .../Operations/Reindex/ReindexJobWorker.cs | 34 +++++++++++++------ .../Storage/Queues/CosmosQueueClient.cs | 22 ++++++++---- .../Storage/RetryExceptionPolicyFactory.cs | 2 ++ .../Operations/Reindex/ReindexJobTests.cs | 1 + .../CosmosDbFhirStorageTestsFixture.cs | 4 ++- 6 files changed, 48 insertions(+), 18 deletions(-) diff --git a/src/Microsoft.Health.Fhir.Core.UnitTests/Features/Operations/Reindex/ReindexJobWorkerTests.cs b/src/Microsoft.Health.Fhir.Core.UnitTests/Features/Operations/Reindex/ReindexJobWorkerTests.cs index 8a79d4f5bf..ec6d4649c1 100644 --- a/src/Microsoft.Health.Fhir.Core.UnitTests/Features/Operations/Reindex/ReindexJobWorkerTests.cs +++ b/src/Microsoft.Health.Fhir.Core.UnitTests/Features/Operations/Reindex/ReindexJobWorkerTests.cs @@ -9,8 +9,10 @@ using System.Threading.Tasks; using Microsoft.Extensions.Logging.Abstractions; using Microsoft.Extensions.Options; +using Microsoft.Health.Core.Features.Context; using Microsoft.Health.Extensions.DependencyInjection; using Microsoft.Health.Fhir.Core.Configs; +using Microsoft.Health.Fhir.Core.Features.Context; using Microsoft.Health.Fhir.Core.Features.Operations; using Microsoft.Health.Fhir.Core.Features.Operations.Reindex; using Microsoft.Health.Fhir.Core.Features.Operations.Reindex.Models; @@ -59,6 +61,7 @@ public ReindexJobWorkerTests() Options.Create(_reindexJobConfiguration), _reindexJobTask.CreateMockScopeProvider(), searchParameterOperations, + Substitute.For>(), NullLogger.Instance); _reindexJobWorker.Handle(new Messages.Search.SearchParametersInitializedNotification(), CancellationToken.None); diff --git a/src/Microsoft.Health.Fhir.Core/Features/Operations/Reindex/ReindexJobWorker.cs b/src/Microsoft.Health.Fhir.Core/Features/Operations/Reindex/ReindexJobWorker.cs index 9f46990f61..c288d3e3a5 100644 --- a/src/Microsoft.Health.Fhir.Core/Features/Operations/Reindex/ReindexJobWorker.cs +++ b/src/Microsoft.Health.Fhir.Core/Features/Operations/Reindex/ReindexJobWorker.cs @@ -12,8 +12,11 @@ using MediatR; using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; +using Microsoft.Extensions.Primitives; +using Microsoft.Health.Core.Features.Context; using Microsoft.Health.Extensions.DependencyInjection; using Microsoft.Health.Fhir.Core.Configs; +using Microsoft.Health.Fhir.Core.Features.Context; using Microsoft.Health.Fhir.Core.Features.Operations.Reindex.Models; using Microsoft.Health.Fhir.Core.Features.Search.Parameters; using Microsoft.Health.Fhir.Core.Messages.Search; @@ -29,6 +32,7 @@ public class ReindexJobWorker : INotificationHandler _reindexJobTaskFactory; private readonly ISearchParameterOperations _searchParameterOperations; + private readonly RequestContextAccessor _contextAccessor; private readonly ILogger _logger; private bool _searchParametersInitialized = false; @@ -37,19 +41,15 @@ public ReindexJobWorker( IOptions reindexJobConfiguration, IScopeProvider reindexJobTaskFactory, ISearchParameterOperations searchParameterOperations, + RequestContextAccessor contextAccessor, ILogger logger) { - EnsureArg.IsNotNull(fhirOperationDataStoreFactory, nameof(fhirOperationDataStoreFactory)); - EnsureArg.IsNotNull(reindexJobConfiguration?.Value, nameof(reindexJobConfiguration)); - EnsureArg.IsNotNull(reindexJobTaskFactory, nameof(reindexJobTaskFactory)); - EnsureArg.IsNotNull(searchParameterOperations, nameof(searchParameterOperations)); - EnsureArg.IsNotNull(logger, nameof(logger)); - - _fhirOperationDataStoreFactory = fhirOperationDataStoreFactory; - _reindexJobConfiguration = reindexJobConfiguration.Value; - _reindexJobTaskFactory = reindexJobTaskFactory; - _searchParameterOperations = searchParameterOperations; - _logger = logger; + _fhirOperationDataStoreFactory = EnsureArg.IsNotNull(fhirOperationDataStoreFactory, nameof(fhirOperationDataStoreFactory)); + _reindexJobConfiguration = EnsureArg.IsNotNull(reindexJobConfiguration?.Value, nameof(reindexJobConfiguration)); + _reindexJobTaskFactory = EnsureArg.IsNotNull(reindexJobTaskFactory, nameof(reindexJobTaskFactory)); + _searchParameterOperations = EnsureArg.IsNotNull(searchParameterOperations, nameof(searchParameterOperations)); + _contextAccessor = EnsureArg.IsNotNull(contextAccessor, nameof(contextAccessor)); + _logger = EnsureArg.IsNotNull(logger, nameof(logger)); } public async Task ExecuteAsync(CancellationToken cancellationToken) @@ -60,6 +60,18 @@ public async Task ExecuteAsync(CancellationToken cancellationToken) { if (_searchParametersInitialized) { + // Create a background task context to trigger the correct retry policy. + var fhirRequestContext = new FhirRequestContext( + method: nameof(ReindexJobWorker), + uriString: string.Empty, + baseUriString: string.Empty, + correlationId: string.Empty, + requestHeaders: new Dictionary(), + responseHeaders: new Dictionary()) + { + IsBackgroundTask = true, + }; + // Check for any changes to Search Parameters try { diff --git a/src/Microsoft.Health.Fhir.CosmosDb/Features/Storage/Queues/CosmosQueueClient.cs b/src/Microsoft.Health.Fhir.CosmosDb/Features/Storage/Queues/CosmosQueueClient.cs index 379b8168a0..6bd358c490 100644 --- a/src/Microsoft.Health.Fhir.CosmosDb/Features/Storage/Queues/CosmosQueueClient.cs +++ b/src/Microsoft.Health.Fhir.CosmosDb/Features/Storage/Queues/CosmosQueueClient.cs @@ -12,15 +12,21 @@ using System.Threading.Tasks; using EnsureThat; using Microsoft.Azure.Cosmos; +using Microsoft.Build.Framework; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Primitives; using Microsoft.Health.Abstractions.Exceptions; using Microsoft.Health.Core; using Microsoft.Health.Core.Extensions; +using Microsoft.Health.Core.Features.Context; using Microsoft.Health.Extensions.DependencyInjection; using Microsoft.Health.Fhir.Core.Extensions; +using Microsoft.Health.Fhir.Core.Features.Context; using Microsoft.Health.Fhir.CosmosDb.Core.Features.Storage; using Microsoft.Health.Fhir.CosmosDb.Features.Queries; using Microsoft.Health.JobManagement; using Polly; +using Polly.Retry; namespace Microsoft.Health.Fhir.CosmosDb.Features.Storage.Queues; @@ -29,20 +35,24 @@ public class CosmosQueueClient : IQueueClient private readonly Func> _containerFactory; private readonly ICosmosQueryFactory _queryFactory; private readonly ICosmosDbDistributedLockFactory _distributedLockFactory; - private static readonly AsyncPolicy _retryPolicy = Policy - .Handle(ex => ex.StatusCode == HttpStatusCode.PreconditionFailed) - .Or(ex => ex.StatusCode == HttpStatusCode.TooManyRequests) - .Or() - .WaitAndRetryAsync(5, _ => TimeSpan.FromMilliseconds(RandomNumberGenerator.GetInt32(100, 1000))); + private readonly RetryExceptionPolicyFactory _retryExceptionPolicyFactory; + private readonly ILogger _logger; + private readonly AsyncPolicy _retryPolicy; public CosmosQueueClient( Func> containerFactory, ICosmosQueryFactory queryFactory, - ICosmosDbDistributedLockFactory distributedLockFactory) + ICosmosDbDistributedLockFactory distributedLockFactory, + RetryExceptionPolicyFactory retryExceptionPolicyFactor, + ILogger logger) { _containerFactory = EnsureArg.IsNotNull(containerFactory, nameof(containerFactory)); _queryFactory = EnsureArg.IsNotNull(queryFactory, nameof(queryFactory)); _distributedLockFactory = EnsureArg.IsNotNull(distributedLockFactory, nameof(distributedLockFactory)); + _retryExceptionPolicyFactory = EnsureArg.IsNotNull(retryExceptionPolicyFactor, nameof(retryExceptionPolicyFactor)); + _logger = EnsureArg.IsNotNull(logger, nameof(logger)); + + _retryPolicy = _retryExceptionPolicyFactory.BackgroundWorkerRetryPolicy; } public bool IsInitialized() => true; diff --git a/src/Microsoft.Health.Fhir.CosmosDb/Features/Storage/RetryExceptionPolicyFactory.cs b/src/Microsoft.Health.Fhir.CosmosDb/Features/Storage/RetryExceptionPolicyFactory.cs index 2850384550..f9f93e3738 100644 --- a/src/Microsoft.Health.Fhir.CosmosDb/Features/Storage/RetryExceptionPolicyFactory.cs +++ b/src/Microsoft.Health.Fhir.CosmosDb/Features/Storage/RetryExceptionPolicyFactory.cs @@ -54,6 +54,8 @@ public AsyncPolicy RetryPolicy } } + public AsyncPolicy BackgroundWorkerRetryPolicy => _backgroundJobRetryPolicy; + private static AsyncRetryPolicy CreateExtendedRetryPolicy(int maxRetries, int maxWaitTimeInSeconds) { return Policy.Handle() diff --git a/test/Microsoft.Health.Fhir.Shared.Tests.Integration/Features/Operations/Reindex/ReindexJobTests.cs b/test/Microsoft.Health.Fhir.Shared.Tests.Integration/Features/Operations/Reindex/ReindexJobTests.cs index 7e710831cf..e7f150caec 100644 --- a/test/Microsoft.Health.Fhir.Shared.Tests.Integration/Features/Operations/Reindex/ReindexJobTests.cs +++ b/test/Microsoft.Health.Fhir.Shared.Tests.Integration/Features/Operations/Reindex/ReindexJobTests.cs @@ -899,6 +899,7 @@ private async Task SetUpForReindexing(CreateReindexReques Options.Create(_jobConfiguration), InitializeReindexJobTask().CreateMockScopeProvider(), _searchParameterOperations, + Substitute.For>(), NullLogger.Instance); await _reindexJobWorker.Handle(new SearchParametersInitializedNotification(), CancellationToken.None); diff --git a/test/Microsoft.Health.Fhir.Shared.Tests.Integration/Persistence/CosmosDbFhirStorageTestsFixture.cs b/test/Microsoft.Health.Fhir.Shared.Tests.Integration/Persistence/CosmosDbFhirStorageTestsFixture.cs index 8cd2204e52..d264ab0ee9 100644 --- a/test/Microsoft.Health.Fhir.Shared.Tests.Integration/Persistence/CosmosDbFhirStorageTestsFixture.cs +++ b/test/Microsoft.Health.Fhir.Shared.Tests.Integration/Persistence/CosmosDbFhirStorageTestsFixture.cs @@ -232,7 +232,9 @@ public virtual async Task InitializeAsync() _queueClient = new CosmosQueueClient( () => _container.CreateMockScope(), new CosmosQueryFactory(Substitute.For(), Substitute.For()), - new CosmosDbDistributedLockFactory(() => _container.CreateMockScope(), NullLogger.Instance)); + new CosmosDbDistributedLockFactory(() => _container.CreateMockScope(), NullLogger.Instance), + retryExceptionPolicyFactory, + NullLogger.Instance); _cosmosFhirOperationDataStore = new CosmosFhirOperationDataStore( _queueClient, From 5afb7258aac00df53570069b37f66c87df4b091e Mon Sep 17 00:00:00 2001 From: Mikael Weaver Date: Tue, 14 Jan 2025 15:52:12 -0800 Subject: [PATCH 02/15] Improve exception policy per cosmos recomendations --- .../Operations/Reindex/ReindexJobWorker.cs | 2 + .../Storage/CosmosFhirDataStoreTests.cs | 2 +- .../FhirCosmosClientInitializerTests.cs | 2 +- .../Storage/RetryExceptionPolicyFactory.cs | 61 ++++++++++++++++--- .../CosmosDbFhirStorageTestsFixture.cs | 2 +- 5 files changed, 59 insertions(+), 10 deletions(-) diff --git a/src/Microsoft.Health.Fhir.Core/Features/Operations/Reindex/ReindexJobWorker.cs b/src/Microsoft.Health.Fhir.Core/Features/Operations/Reindex/ReindexJobWorker.cs index c288d3e3a5..66ebb568a8 100644 --- a/src/Microsoft.Health.Fhir.Core/Features/Operations/Reindex/ReindexJobWorker.cs +++ b/src/Microsoft.Health.Fhir.Core/Features/Operations/Reindex/ReindexJobWorker.cs @@ -72,6 +72,8 @@ public async Task ExecuteAsync(CancellationToken cancellationToken) IsBackgroundTask = true, }; + _contextAccessor.RequestContext = fhirRequestContext; + // Check for any changes to Search Parameters try { diff --git a/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Storage/CosmosFhirDataStoreTests.cs b/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Storage/CosmosFhirDataStoreTests.cs index 5958d6ee22..d8de268713 100644 --- a/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Storage/CosmosFhirDataStoreTests.cs +++ b/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Storage/CosmosFhirDataStoreTests.cs @@ -76,7 +76,7 @@ public CosmosFhirDataStoreTests() _cosmosDataStoreConfiguration, Substitute.For>(), _cosmosQueryFactory, - new RetryExceptionPolicyFactory(_cosmosDataStoreConfiguration, requestContextAccessor), + new RetryExceptionPolicyFactory(_cosmosDataStoreConfiguration, requestContextAccessor, NullLogger.Instance), NullLogger.Instance, Options.Create(new CoreFeatureConfiguration()), _bundleOrchestrator, diff --git a/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Storage/FhirCosmosClientInitializerTests.cs b/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Storage/FhirCosmosClientInitializerTests.cs index 503b8af40b..0364c5b1c4 100644 --- a/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Storage/FhirCosmosClientInitializerTests.cs +++ b/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Storage/FhirCosmosClientInitializerTests.cs @@ -40,7 +40,7 @@ public FhirCosmosClientInitializerTests() _initializer = new FhirCosmosClientInitializer( clientTestProvider, () => new[] { new TestRequestHandler() }, - new RetryExceptionPolicyFactory(_cosmosDataStoreConfiguration, Substitute.For>()), + new RetryExceptionPolicyFactory(_cosmosDataStoreConfiguration, Substitute.For>(), NullLogger.Instance), Substitute.For(), NullLogger.Instance); diff --git a/src/Microsoft.Health.Fhir.CosmosDb/Features/Storage/RetryExceptionPolicyFactory.cs b/src/Microsoft.Health.Fhir.CosmosDb/Features/Storage/RetryExceptionPolicyFactory.cs index f9f93e3738..e180247b3f 100644 --- a/src/Microsoft.Health.Fhir.CosmosDb/Features/Storage/RetryExceptionPolicyFactory.cs +++ b/src/Microsoft.Health.Fhir.CosmosDb/Features/Storage/RetryExceptionPolicyFactory.cs @@ -4,10 +4,14 @@ // ------------------------------------------------------------------------------------------------- using System; +using System.Net; using System.Runtime.ExceptionServices; +using System.Security.Cryptography; using System.Threading.Tasks; using EnsureThat; using Microsoft.Azure.Cosmos; +using Microsoft.Build.Framework; +using Microsoft.Extensions.Logging; using Microsoft.Health.Abstractions.Exceptions; using Microsoft.Health.Core.Features.Context; using Microsoft.Health.Fhir.Core.Extensions; @@ -22,15 +26,16 @@ public class RetryExceptionPolicyFactory { private const string RetryEndTimeContextKey = "RetryEndTime"; private readonly RequestContextAccessor _requestContextAccessor; + private readonly ILogger _logger; private readonly AsyncPolicy _sdkOnlyRetryPolicy; private readonly AsyncPolicy _bundleActionRetryPolicy; private readonly AsyncPolicy _backgroundJobRetryPolicy; - public RetryExceptionPolicyFactory(CosmosDataStoreConfiguration configuration, RequestContextAccessor requestContextAccessor) + public RetryExceptionPolicyFactory(CosmosDataStoreConfiguration configuration, RequestContextAccessor requestContextAccessor, ILogger logger) { - _requestContextAccessor = requestContextAccessor; + _requestContextAccessor = EnsureArg.IsNotNull(requestContextAccessor, nameof(requestContextAccessor)); EnsureArg.IsNotNull(configuration, nameof(configuration)); - EnsureArg.IsNotNull(requestContextAccessor, nameof(requestContextAccessor)); + _logger = EnsureArg.IsNotNull(logger, nameof(logger)); _sdkOnlyRetryPolicy = Policy.NoOpAsync(); @@ -38,7 +43,7 @@ public RetryExceptionPolicyFactory(CosmosDataStoreConfiguration configuration, R ? CreateExtendedRetryPolicy(configuration.IndividualBatchActionRetryOptions.MaxNumberOfRetries / configuration.RetryOptions.MaxNumberOfRetries, configuration.IndividualBatchActionRetryOptions.MaxWaitTimeInSeconds) : Policy.NoOpAsync(); - _backgroundJobRetryPolicy = CreateExtendedRetryPolicy(100, -1); + _backgroundJobRetryPolicy = CreateExtendedRetryPolicy(100, -1, true); } public AsyncPolicy RetryPolicy @@ -56,16 +61,58 @@ public AsyncPolicy RetryPolicy public AsyncPolicy BackgroundWorkerRetryPolicy => _backgroundJobRetryPolicy; - private static AsyncRetryPolicy CreateExtendedRetryPolicy(int maxRetries, int maxWaitTimeInSeconds) + private AsyncRetryPolicy CreateExtendedRetryPolicy(int maxRetries, int maxWaitTimeInSeconds, bool useExponentialRetry = false) { + // Define a sleep duration provider based on the retry strategy + TimeSpan SleepDurationProvider(int retryAttempt, Exception exception) + { + // Respect x-ms-retry-after-ms from RequestRateExceededException + if (exception.AsRequestRateExceeded()?.RetryAfter is TimeSpan retryAfter) + { + return retryAfter; + } + + // Respect x-ms-retry-after-ms from CosmosException + if (exception is CosmosException cosmosException && cosmosException.StatusCode == HttpStatusCode.TooManyRequests && cosmosException.RetryAfter.HasValue) + { + return cosmosException.RetryAfter.Value; + } + + if (useExponentialRetry) + { + // Exponential backoff with jitter + var backoff = Math.Pow(2, retryAttempt) * 100; // Exponential backoff in milliseconds + var jitter = RandomNumberGenerator.GetInt32(0, 300); // Add jitter in milliseconds + return TimeSpan.FromMilliseconds(backoff + jitter); + } + + // Default fixed wait time + return TimeSpan.FromSeconds(2); + } + + // Retry recommendations for Cosmos DB: https://learn.microsoft.com/azure/cosmos-db/nosql/conceptual-resilient-sdk-applications#should-my-application-retry-on-errors return Policy.Handle() .Or(e => e.IsRequestRateExceeded()) - .Or(e => (e.StatusCode == System.Net.HttpStatusCode.ServiceUnavailable || e.StatusCode == System.Net.HttpStatusCode.RequestTimeout)) + .Or(e => + e.StatusCode == System.Net.HttpStatusCode.ServiceUnavailable || + e.StatusCode == System.Net.HttpStatusCode.TooManyRequests || + e.StatusCode == System.Net.HttpStatusCode.Gone || + e.StatusCode == (HttpStatusCode)449 || // "Retry with" status code + e.StatusCode == System.Net.HttpStatusCode.RequestTimeout) .WaitAndRetryAsync( retryCount: maxRetries, - sleepDurationProvider: (_, e, _) => e.AsRequestRateExceeded()?.RetryAfter ?? TimeSpan.FromSeconds(2), + sleepDurationProvider: (retryAttempt, exception, context) => SleepDurationProvider(retryAttempt, exception), onRetryAsync: (e, _, _, ctx) => { + if (e is CosmosException cosmosException) + { + if (cosmosException.StatusCode == HttpStatusCode.ServiceUnavailable) + { + var diagnostics = cosmosException.Diagnostics.ToString(); + _logger.LogWarning(cosmosException, "Received a ServiceUnavailable response from Cosmos DB. Retrying. Diagnostics: {CosmosDiagnostics}", diagnostics); + } + } + if (maxWaitTimeInSeconds == -1) { // no timeout diff --git a/test/Microsoft.Health.Fhir.Shared.Tests.Integration/Persistence/CosmosDbFhirStorageTestsFixture.cs b/test/Microsoft.Health.Fhir.Shared.Tests.Integration/Persistence/CosmosDbFhirStorageTestsFixture.cs index d264ab0ee9..fb10d6e9d9 100644 --- a/test/Microsoft.Health.Fhir.Shared.Tests.Integration/Persistence/CosmosDbFhirStorageTestsFixture.cs +++ b/test/Microsoft.Health.Fhir.Shared.Tests.Integration/Persistence/CosmosDbFhirStorageTestsFixture.cs @@ -147,7 +147,7 @@ public virtual async Task InitializeAsync() var responseProcessor = new CosmosResponseProcessor(_fhirRequestContextAccessor, mediator, Substitute.For(), NullLogger.Instance); var handler = new FhirCosmosResponseHandler(() => new NonDisposingScope(_container), _cosmosDataStoreConfiguration, _fhirRequestContextAccessor, responseProcessor); - var retryExceptionPolicyFactory = new RetryExceptionPolicyFactory(_cosmosDataStoreConfiguration, _fhirRequestContextAccessor); + var retryExceptionPolicyFactory = new RetryExceptionPolicyFactory(_cosmosDataStoreConfiguration, _fhirRequestContextAccessor, NullLogger.Instance); var documentClientInitializer = new FhirCosmosClientInitializer( testProvider, () => new[] { handler }, From 65782edf4784ec6f323fb1bb93b750ecab914840 Mon Sep 17 00:00:00 2001 From: Mikael Weaver Date: Tue, 14 Jan 2025 15:59:29 -0800 Subject: [PATCH 03/15] remove unneeded using --- .../Features/Storage/RetryExceptionPolicyFactory.cs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Microsoft.Health.Fhir.CosmosDb/Features/Storage/RetryExceptionPolicyFactory.cs b/src/Microsoft.Health.Fhir.CosmosDb/Features/Storage/RetryExceptionPolicyFactory.cs index e180247b3f..c19931f637 100644 --- a/src/Microsoft.Health.Fhir.CosmosDb/Features/Storage/RetryExceptionPolicyFactory.cs +++ b/src/Microsoft.Health.Fhir.CosmosDb/Features/Storage/RetryExceptionPolicyFactory.cs @@ -10,7 +10,6 @@ using System.Threading.Tasks; using EnsureThat; using Microsoft.Azure.Cosmos; -using Microsoft.Build.Framework; using Microsoft.Extensions.Logging; using Microsoft.Health.Abstractions.Exceptions; using Microsoft.Health.Core.Features.Context; From 619a7cadd2355c9a26f388b4297df13c232e5fc6 Mon Sep 17 00:00:00 2001 From: Mikael Weaver Date: Tue, 14 Jan 2025 16:19:30 -0800 Subject: [PATCH 04/15] fix reindex worker context --- .../Features/Operations/Reindex/ReindexJobWorker.cs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Microsoft.Health.Fhir.Core/Features/Operations/Reindex/ReindexJobWorker.cs b/src/Microsoft.Health.Fhir.Core/Features/Operations/Reindex/ReindexJobWorker.cs index 66ebb568a8..2fe0ec0158 100644 --- a/src/Microsoft.Health.Fhir.Core/Features/Operations/Reindex/ReindexJobWorker.cs +++ b/src/Microsoft.Health.Fhir.Core/Features/Operations/Reindex/ReindexJobWorker.cs @@ -63,9 +63,9 @@ public async Task ExecuteAsync(CancellationToken cancellationToken) // Create a background task context to trigger the correct retry policy. var fhirRequestContext = new FhirRequestContext( method: nameof(ReindexJobWorker), - uriString: string.Empty, - baseUriString: string.Empty, - correlationId: string.Empty, + uriString: nameof(ReindexJobWorker), + baseUriString: nameof(ReindexJobWorker), + correlationId: Guid.NewGuid().ToString(), requestHeaders: new Dictionary(), responseHeaders: new Dictionary()) { From d2eced1b33c3c256487d98f98bde6145eb96f87a Mon Sep 17 00:00:00 2001 From: Mikael Weaver Date: Tue, 14 Jan 2025 16:51:47 -0800 Subject: [PATCH 05/15] fix code scanning issue --- .../Features/Storage/RetryExceptionPolicyFactory.cs | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/Microsoft.Health.Fhir.CosmosDb/Features/Storage/RetryExceptionPolicyFactory.cs b/src/Microsoft.Health.Fhir.CosmosDb/Features/Storage/RetryExceptionPolicyFactory.cs index c19931f637..738b222cae 100644 --- a/src/Microsoft.Health.Fhir.CosmosDb/Features/Storage/RetryExceptionPolicyFactory.cs +++ b/src/Microsoft.Health.Fhir.CosmosDb/Features/Storage/RetryExceptionPolicyFactory.cs @@ -103,13 +103,10 @@ TimeSpan SleepDurationProvider(int retryAttempt, Exception exception) sleepDurationProvider: (retryAttempt, exception, context) => SleepDurationProvider(retryAttempt, exception), onRetryAsync: (e, _, _, ctx) => { - if (e is CosmosException cosmosException) + if (e is CosmosException cosmosException && cosmosException.StatusCode == HttpStatusCode.ServiceUnavailable) { - if (cosmosException.StatusCode == HttpStatusCode.ServiceUnavailable) - { - var diagnostics = cosmosException.Diagnostics.ToString(); - _logger.LogWarning(cosmosException, "Received a ServiceUnavailable response from Cosmos DB. Retrying. Diagnostics: {CosmosDiagnostics}", diagnostics); - } + var diagnostics = cosmosException.Diagnostics.ToString(); + _logger.LogWarning(cosmosException, "Received a ServiceUnavailable response from Cosmos DB. Retrying. Diagnostics: {CosmosDiagnostics}", diagnostics); } if (maxWaitTimeInSeconds == -1) From 5f1160cca931500c6cea413b161d31aa17b7ff77 Mon Sep 17 00:00:00 2001 From: Mikael Weaver Date: Tue, 14 Jan 2025 18:08:14 -0800 Subject: [PATCH 06/15] handle case of null diagnostics --- .../Features/Storage/RetryExceptionPolicyFactory.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Microsoft.Health.Fhir.CosmosDb/Features/Storage/RetryExceptionPolicyFactory.cs b/src/Microsoft.Health.Fhir.CosmosDb/Features/Storage/RetryExceptionPolicyFactory.cs index 738b222cae..d0edfd7ded 100644 --- a/src/Microsoft.Health.Fhir.CosmosDb/Features/Storage/RetryExceptionPolicyFactory.cs +++ b/src/Microsoft.Health.Fhir.CosmosDb/Features/Storage/RetryExceptionPolicyFactory.cs @@ -105,7 +105,7 @@ TimeSpan SleepDurationProvider(int retryAttempt, Exception exception) { if (e is CosmosException cosmosException && cosmosException.StatusCode == HttpStatusCode.ServiceUnavailable) { - var diagnostics = cosmosException.Diagnostics.ToString(); + var diagnostics = cosmosException.Diagnostics?.ToString() ?? "empty"; _logger.LogWarning(cosmosException, "Received a ServiceUnavailable response from Cosmos DB. Retrying. Diagnostics: {CosmosDiagnostics}", diagnostics); } From 92594b767f250d3e41f0300c508ece1d4c50c160 Mon Sep 17 00:00:00 2001 From: Mikael Weaver Date: Tue, 14 Jan 2025 21:20:13 -0800 Subject: [PATCH 07/15] Add retry unit tests --- .../Storage/Queues/CosmosQueueClientTests.cs | 146 ++++++++++++++++++ 1 file changed, 146 insertions(+) create mode 100644 src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Storage/Queues/CosmosQueueClientTests.cs diff --git a/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Storage/Queues/CosmosQueueClientTests.cs b/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Storage/Queues/CosmosQueueClientTests.cs new file mode 100644 index 0000000000..849a0ec560 --- /dev/null +++ b/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Storage/Queues/CosmosQueueClientTests.cs @@ -0,0 +1,146 @@ +// ------------------------------------------------------------------------------------------------- +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License (MIT). See LICENSE in the repo root for license information. +// ------------------------------------------------------------------------------------------------- + +using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.Linq; +using System.Net; +using System.Reflection; +using System.Runtime.CompilerServices; +using System.Text; +using System.Threading; +using System.Threading.Tasks; +using Azure; +using Hl7.Fhir.ElementModel.Types; +using Microsoft.AspNetCore.Http; +using Microsoft.Azure.Cosmos; +using Microsoft.Extensions.Logging.Abstractions; +using Microsoft.Health.Abstractions.Exceptions; +using Microsoft.Health.Core.Features.Context; +using Microsoft.Health.Extensions.DependencyInjection; +using Microsoft.Health.Fhir.Core.Features.Context; +using Microsoft.Health.Fhir.Core.UnitTests.Extensions; +using Microsoft.Health.Fhir.CosmosDb.Core.Configs; +using Microsoft.Health.Fhir.CosmosDb.Core.Features.Storage; +using Microsoft.Health.Fhir.CosmosDb.Features.Queries; +using Microsoft.Health.Fhir.CosmosDb.Features.Storage; +using Microsoft.Health.Fhir.CosmosDb.Features.Storage.Queues; +using Microsoft.Health.Fhir.Tests.Common; +using Microsoft.Health.Test.Utilities; +using NSubstitute; +using Xunit; + +namespace Microsoft.Health.Fhir.CosmosDb.UnitTests.Features.Storage.Queues +{ + [Trait(Traits.OwningTeam, OwningTeam.Fhir)] + [Trait(Traits.Category, Categories.DataSourceValidation)] + public class CosmosQueueClientTests + { + private readonly ICosmosQueryFactory _cosmosQueryFactory; + private readonly ICosmosDbDistributedLockFactory _distributedLockFactory; + private readonly CosmosDataStoreConfiguration _cosmosDataStoreConfiguration = new CosmosDataStoreConfiguration(); + private readonly RequestContextAccessor _requestContextAccessor; + private readonly RetryExceptionPolicyFactory _retryPolicyFactory; + private readonly CosmosQueueClient _cosmosQueueClient; + + public CosmosQueueClientTests() + { + _cosmosQueryFactory = Substitute.For(); + _distributedLockFactory = Substitute.For(); + _requestContextAccessor = Substitute.For>(); + _retryPolicyFactory = new RetryExceptionPolicyFactory(_cosmosDataStoreConfiguration, _requestContextAccessor, NullLogger.Instance); + + _cosmosQueueClient = new CosmosQueueClient( + Substitute.For>>(), + _cosmosQueryFactory, + _distributedLockFactory, + _retryPolicyFactory, + NullLogger.Instance); + } + + [Theory] + [InlineData(HttpStatusCode.ServiceUnavailable)] + [InlineData(HttpStatusCode.TooManyRequests)] + [InlineData(HttpStatusCode.Gone)] + [InlineData((HttpStatusCode)449)] + [InlineData(HttpStatusCode.RequestTimeout)] + public async Task GivenADequeueJobOperation_WhenExceptionOccurs_RetryWillHappen(HttpStatusCode statusCode) + { + // Arrange + ICosmosQuery cosmosQuery = Substitute.For>(); + _cosmosQueryFactory.Create(Arg.Any(), Arg.Any()) + .ReturnsForAnyArgs(cosmosQuery); + + int callCount = 0; + cosmosQuery.ExecuteNextAsync(Arg.Any()).ReturnsForAnyArgs(_ => + { + if (callCount++ == 0) + { + throw new TestCosmosException(statusCode); + } + + return Task.FromResult(Substitute.For>()); + }); + + // Act + await _cosmosQueueClient.DequeueAsync(0, "testworker", 10, CancellationToken.None); + + // Assert + Assert.Equal(2, callCount); + await cosmosQuery.ReceivedWithAnyArgs(2).ExecuteNextAsync(Arg.Any()); + } + + [Theory] + [InlineData(typeof(CosmosException))] + [InlineData(typeof(RequestRateExceededException))] + public async Task GivenADequeueJobOperation_WhenExceptionWithRetryAfterIsProvided_PolicyRespectsRetryAfter(Type exceptionType) + { + // Arrange + ICosmosQuery cosmosQuery = Substitute.For>(); + _cosmosQueryFactory.Create(Arg.Any(), Arg.Any()) + .ReturnsForAnyArgs(cosmosQuery); + var retryAfter = TimeSpan.FromSeconds(1); + int callCount = 0; + + cosmosQuery.ExecuteNextAsync(Arg.Any()).ReturnsForAnyArgs(_ => + { + if (callCount++ == 0) + { + throw exceptionType == typeof(CosmosException) + ? new TestCosmosException(HttpStatusCode.TooManyRequests, retryAfter) + : new RequestRateExceededException(retryAfter); + } + + return Task.FromResult(Substitute.For>()); + }); + + var stopwatch = Stopwatch.StartNew(); + + // Act + await _cosmosQueueClient.DequeueAsync(0, "testworker", 10, CancellationToken.None); + + stopwatch.Stop(); + + // Assert + Assert.Equal(2, callCount); + await cosmosQuery.ReceivedWithAnyArgs(2).ExecuteNextAsync(Arg.Any()); + Assert.True(stopwatch.Elapsed >= retryAfter, "Policy should respect the RetryAfter value."); + } + + public class TestCosmosException : CosmosException + { + private readonly TimeSpan? _retryAfter; + + public TestCosmosException(HttpStatusCode statusCode, TimeSpan? retryAfter = null) + : base("Test exception message", statusCode, 0, "test-activity-id", 0.0) + { + _retryAfter = retryAfter; + } + + public override TimeSpan? RetryAfter => _retryAfter; + } + } +} From 875e3e5cbdb733c7fc76bfca233218cfe7917177 Mon Sep 17 00:00:00 2001 From: Mikael Weaver Date: Tue, 14 Jan 2025 21:52:39 -0800 Subject: [PATCH 08/15] formatting fixes --- .../Storage/Queues/CosmosQueueClientTests.cs | 194 ++++++++---------- 1 file changed, 89 insertions(+), 105 deletions(-) diff --git a/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Storage/Queues/CosmosQueueClientTests.cs b/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Storage/Queues/CosmosQueueClientTests.cs index 849a0ec560..c68b8d5c7e 100644 --- a/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Storage/Queues/CosmosQueueClientTests.cs +++ b/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Storage/Queues/CosmosQueueClientTests.cs @@ -4,25 +4,16 @@ // ------------------------------------------------------------------------------------------------- using System; -using System.Collections.Generic; using System.Diagnostics; -using System.Linq; using System.Net; -using System.Reflection; -using System.Runtime.CompilerServices; -using System.Text; using System.Threading; using System.Threading.Tasks; -using Azure; -using Hl7.Fhir.ElementModel.Types; -using Microsoft.AspNetCore.Http; using Microsoft.Azure.Cosmos; using Microsoft.Extensions.Logging.Abstractions; using Microsoft.Health.Abstractions.Exceptions; using Microsoft.Health.Core.Features.Context; using Microsoft.Health.Extensions.DependencyInjection; using Microsoft.Health.Fhir.Core.Features.Context; -using Microsoft.Health.Fhir.Core.UnitTests.Extensions; using Microsoft.Health.Fhir.CosmosDb.Core.Configs; using Microsoft.Health.Fhir.CosmosDb.Core.Features.Storage; using Microsoft.Health.Fhir.CosmosDb.Features.Queries; @@ -33,114 +24,107 @@ using NSubstitute; using Xunit; -namespace Microsoft.Health.Fhir.CosmosDb.UnitTests.Features.Storage.Queues +namespace Microsoft.Health.Fhir.CosmosDb.UnitTests.Features.Storage.Queues; + +[Trait(Traits.OwningTeam, OwningTeam.Fhir)] +[Trait(Traits.Category, Categories.DataSourceValidation)] +public class CosmosQueueClientTests { - [Trait(Traits.OwningTeam, OwningTeam.Fhir)] - [Trait(Traits.Category, Categories.DataSourceValidation)] - public class CosmosQueueClientTests + private readonly ICosmosQueryFactory _cosmosQueryFactory; + private readonly ICosmosDbDistributedLockFactory _distributedLockFactory; + private readonly CosmosDataStoreConfiguration _cosmosDataStoreConfiguration = new CosmosDataStoreConfiguration(); + private readonly RequestContextAccessor _requestContextAccessor; + private readonly RetryExceptionPolicyFactory _retryPolicyFactory; + private readonly CosmosQueueClient _cosmosQueueClient; + + public CosmosQueueClientTests() { - private readonly ICosmosQueryFactory _cosmosQueryFactory; - private readonly ICosmosDbDistributedLockFactory _distributedLockFactory; - private readonly CosmosDataStoreConfiguration _cosmosDataStoreConfiguration = new CosmosDataStoreConfiguration(); - private readonly RequestContextAccessor _requestContextAccessor; - private readonly RetryExceptionPolicyFactory _retryPolicyFactory; - private readonly CosmosQueueClient _cosmosQueueClient; - - public CosmosQueueClientTests() - { - _cosmosQueryFactory = Substitute.For(); - _distributedLockFactory = Substitute.For(); - _requestContextAccessor = Substitute.For>(); - _retryPolicyFactory = new RetryExceptionPolicyFactory(_cosmosDataStoreConfiguration, _requestContextAccessor, NullLogger.Instance); - - _cosmosQueueClient = new CosmosQueueClient( - Substitute.For>>(), - _cosmosQueryFactory, - _distributedLockFactory, - _retryPolicyFactory, - NullLogger.Instance); - } - - [Theory] - [InlineData(HttpStatusCode.ServiceUnavailable)] - [InlineData(HttpStatusCode.TooManyRequests)] - [InlineData(HttpStatusCode.Gone)] - [InlineData((HttpStatusCode)449)] - [InlineData(HttpStatusCode.RequestTimeout)] - public async Task GivenADequeueJobOperation_WhenExceptionOccurs_RetryWillHappen(HttpStatusCode statusCode) - { - // Arrange - ICosmosQuery cosmosQuery = Substitute.For>(); - _cosmosQueryFactory.Create(Arg.Any(), Arg.Any()) - .ReturnsForAnyArgs(cosmosQuery); + _cosmosQueryFactory = Substitute.For(); + _distributedLockFactory = Substitute.For(); + _requestContextAccessor = Substitute.For>(); + _retryPolicyFactory = new RetryExceptionPolicyFactory(_cosmosDataStoreConfiguration, _requestContextAccessor, NullLogger.Instance); + + _cosmosQueueClient = new CosmosQueueClient( + Substitute.For>>(), + _cosmosQueryFactory, + _distributedLockFactory, + _retryPolicyFactory, + NullLogger.Instance); + } - int callCount = 0; - cosmosQuery.ExecuteNextAsync(Arg.Any()).ReturnsForAnyArgs(_ => - { - if (callCount++ == 0) - { - throw new TestCosmosException(statusCode); - } - - return Task.FromResult(Substitute.For>()); - }); - - // Act - await _cosmosQueueClient.DequeueAsync(0, "testworker", 10, CancellationToken.None); - - // Assert - Assert.Equal(2, callCount); - await cosmosQuery.ReceivedWithAnyArgs(2).ExecuteNextAsync(Arg.Any()); - } - - [Theory] - [InlineData(typeof(CosmosException))] - [InlineData(typeof(RequestRateExceededException))] - public async Task GivenADequeueJobOperation_WhenExceptionWithRetryAfterIsProvided_PolicyRespectsRetryAfter(Type exceptionType) + [Theory] + [InlineData(HttpStatusCode.ServiceUnavailable)] + [InlineData(HttpStatusCode.TooManyRequests)] + [InlineData(HttpStatusCode.Gone)] + [InlineData((HttpStatusCode)449)] + [InlineData(HttpStatusCode.RequestTimeout)] + public async Task GivenADequeueJobOperation_WhenExceptionOccurs_RetryWillHappen(HttpStatusCode statusCode) + { + // Arrange + ICosmosQuery cosmosQuery = Substitute.For>(); + _cosmosQueryFactory.Create(Arg.Any(), Arg.Any()) + .ReturnsForAnyArgs(cosmosQuery); + + int callCount = 0; + cosmosQuery.ExecuteNextAsync(Arg.Any()).ReturnsForAnyArgs(_ => { - // Arrange - ICosmosQuery cosmosQuery = Substitute.For>(); - _cosmosQueryFactory.Create(Arg.Any(), Arg.Any()) - .ReturnsForAnyArgs(cosmosQuery); - var retryAfter = TimeSpan.FromSeconds(1); - int callCount = 0; - - cosmosQuery.ExecuteNextAsync(Arg.Any()).ReturnsForAnyArgs(_ => + if (callCount++ == 0) { - if (callCount++ == 0) - { - throw exceptionType == typeof(CosmosException) - ? new TestCosmosException(HttpStatusCode.TooManyRequests, retryAfter) - : new RequestRateExceededException(retryAfter); - } - - return Task.FromResult(Substitute.For>()); - }); - - var stopwatch = Stopwatch.StartNew(); + throw new TestCosmosException(statusCode); + } - // Act - await _cosmosQueueClient.DequeueAsync(0, "testworker", 10, CancellationToken.None); + return Task.FromResult(Substitute.For>()); + }); - stopwatch.Stop(); + // Act + await _cosmosQueueClient.DequeueAsync(0, "testworker", 10, CancellationToken.None); - // Assert - Assert.Equal(2, callCount); - await cosmosQuery.ReceivedWithAnyArgs(2).ExecuteNextAsync(Arg.Any()); - Assert.True(stopwatch.Elapsed >= retryAfter, "Policy should respect the RetryAfter value."); - } + // Assert + Assert.Equal(2, callCount); + await cosmosQuery.ReceivedWithAnyArgs(2).ExecuteNextAsync(Arg.Any()); + } - public class TestCosmosException : CosmosException + [Theory] + [InlineData(typeof(CosmosException))] + [InlineData(typeof(RequestRateExceededException))] + public async Task GivenADequeueJobOperation_WhenExceptionWithRetryAfterIsProvided_PolicyRespectsRetryAfter(Type exceptionType) + { + // Arrange + ICosmosQuery cosmosQuery = Substitute.For>(); + _cosmosQueryFactory.Create(Arg.Any(), Arg.Any()) + .ReturnsForAnyArgs(cosmosQuery); + var retryAfter = TimeSpan.FromSeconds(1); + int callCount = 0; + + cosmosQuery.ExecuteNextAsync(Arg.Any()).ReturnsForAnyArgs(_ => { - private readonly TimeSpan? _retryAfter; - - public TestCosmosException(HttpStatusCode statusCode, TimeSpan? retryAfter = null) - : base("Test exception message", statusCode, 0, "test-activity-id", 0.0) + if (callCount++ == 0) { - _retryAfter = retryAfter; + throw exceptionType == typeof(CosmosException) + ? new TestCosmosException(HttpStatusCode.TooManyRequests, retryAfter) + : new RequestRateExceededException(retryAfter); } - public override TimeSpan? RetryAfter => _retryAfter; - } + return Task.FromResult(Substitute.For>()); + }); + + var stopwatch = Stopwatch.StartNew(); + + // Act + await _cosmosQueueClient.DequeueAsync(0, "testworker", 10, CancellationToken.None); + + stopwatch.Stop(); + + // Assert + Assert.Equal(2, callCount); + await cosmosQuery.ReceivedWithAnyArgs(2).ExecuteNextAsync(Arg.Any()); + Assert.True(stopwatch.Elapsed >= retryAfter, "Policy should respect the RetryAfter value."); + } + + public class TestCosmosException(HttpStatusCode statusCode, TimeSpan? retryAfter = null) : CosmosException("Test exception message", statusCode, 0, "test-activity-id", 0.0) + { + private readonly TimeSpan? _retryAfter = retryAfter; + + public override TimeSpan? RetryAfter => _retryAfter; } } From c75ef3c00a09dffa41f5b95dcd2c99f6fec28691 Mon Sep 17 00:00:00 2001 From: Mikael Weaver Date: Tue, 14 Jan 2025 21:58:52 -0800 Subject: [PATCH 09/15] remove unneeded usings --- .../Features/Storage/Queues/CosmosQueueClient.cs | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/Microsoft.Health.Fhir.CosmosDb/Features/Storage/Queues/CosmosQueueClient.cs b/src/Microsoft.Health.Fhir.CosmosDb/Features/Storage/Queues/CosmosQueueClient.cs index 6bd358c490..a3f7b60a30 100644 --- a/src/Microsoft.Health.Fhir.CosmosDb/Features/Storage/Queues/CosmosQueueClient.cs +++ b/src/Microsoft.Health.Fhir.CosmosDb/Features/Storage/Queues/CosmosQueueClient.cs @@ -12,21 +12,16 @@ using System.Threading.Tasks; using EnsureThat; using Microsoft.Azure.Cosmos; -using Microsoft.Build.Framework; using Microsoft.Extensions.Logging; -using Microsoft.Extensions.Primitives; using Microsoft.Health.Abstractions.Exceptions; using Microsoft.Health.Core; using Microsoft.Health.Core.Extensions; -using Microsoft.Health.Core.Features.Context; using Microsoft.Health.Extensions.DependencyInjection; using Microsoft.Health.Fhir.Core.Extensions; -using Microsoft.Health.Fhir.Core.Features.Context; using Microsoft.Health.Fhir.CosmosDb.Core.Features.Storage; using Microsoft.Health.Fhir.CosmosDb.Features.Queries; using Microsoft.Health.JobManagement; using Polly; -using Polly.Retry; namespace Microsoft.Health.Fhir.CosmosDb.Features.Storage.Queues; From c0e80d5c7d3f2bbd09d0b794b8f010e7712e8712 Mon Sep 17 00:00:00 2001 From: Mikael Weaver Date: Wed, 15 Jan 2025 08:55:44 -0800 Subject: [PATCH 10/15] updated retry policy --- .../Storage/RetryExceptionPolicyFactory.cs | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/Microsoft.Health.Fhir.CosmosDb/Features/Storage/RetryExceptionPolicyFactory.cs b/src/Microsoft.Health.Fhir.CosmosDb/Features/Storage/RetryExceptionPolicyFactory.cs index d0edfd7ded..a70e61e018 100644 --- a/src/Microsoft.Health.Fhir.CosmosDb/Features/Storage/RetryExceptionPolicyFactory.cs +++ b/src/Microsoft.Health.Fhir.CosmosDb/Features/Storage/RetryExceptionPolicyFactory.cs @@ -30,6 +30,10 @@ public class RetryExceptionPolicyFactory private readonly AsyncPolicy _bundleActionRetryPolicy; private readonly AsyncPolicy _backgroundJobRetryPolicy; + private const int _exponentialBackoffBaseDelayMs = 100; + private const int _exponentialMaxJitterMs = 300; + private const int _exponentialMaxDelayMs = 60 * 1000; + public RetryExceptionPolicyFactory(CosmosDataStoreConfiguration configuration, RequestContextAccessor requestContextAccessor, ILogger logger) { _requestContextAccessor = EnsureArg.IsNotNull(requestContextAccessor, nameof(requestContextAccessor)); @@ -42,7 +46,7 @@ public RetryExceptionPolicyFactory(CosmosDataStoreConfiguration configuration, R ? CreateExtendedRetryPolicy(configuration.IndividualBatchActionRetryOptions.MaxNumberOfRetries / configuration.RetryOptions.MaxNumberOfRetries, configuration.IndividualBatchActionRetryOptions.MaxWaitTimeInSeconds) : Policy.NoOpAsync(); - _backgroundJobRetryPolicy = CreateExtendedRetryPolicy(100, -1, true); + _backgroundJobRetryPolicy = CreateExtendedRetryPolicy(30, -1, true); } public AsyncPolicy RetryPolicy @@ -77,11 +81,14 @@ TimeSpan SleepDurationProvider(int retryAttempt, Exception exception) return cosmosException.RetryAfter.Value; } + // Exponential backoff is used for background jobs. Given current values, exponential backoff is used for the first 10 retries. After that a fixed wait time of_exponentialMaxDelayMs (60 seconds) is used. + // Jitter is multiplied by the retry attempt to increase the randomness of the retry interval for longer retry delays (especially retry > 10). if (useExponentialRetry) { - // Exponential backoff with jitter - var backoff = Math.Pow(2, retryAttempt) * 100; // Exponential backoff in milliseconds - var jitter = RandomNumberGenerator.GetInt32(0, 300); // Add jitter in milliseconds + // Calculate exponential backoff with a cap of 60 seconds + var backoff = Math.Min(Math.Pow(2, retryAttempt) * _exponentialBackoffBaseDelayMs, _exponentialMaxDelayMs); + + var jitter = RandomNumberGenerator.GetInt32(0, _exponentialMaxJitterMs) * retryAttempt; // Add jitter in milliseconds return TimeSpan.FromMilliseconds(backoff + jitter); } From a06f388a57c85673f4fb71ff2ae389835751c021 Mon Sep 17 00:00:00 2001 From: Mikael Weaver Date: Wed, 15 Jan 2025 09:09:03 -0800 Subject: [PATCH 11/15] cleanup --- .../Features/Operations/Reindex/ReindexJobWorker.cs | 4 ++++ .../Features/Storage/Queues/CosmosQueueClientTests.cs | 3 +-- .../Features/Storage/Queues/CosmosQueueClient.cs | 7 ++----- .../Persistence/CosmosDbFhirStorageTestsFixture.cs | 3 +-- 4 files changed, 8 insertions(+), 9 deletions(-) diff --git a/src/Microsoft.Health.Fhir.Core/Features/Operations/Reindex/ReindexJobWorker.cs b/src/Microsoft.Health.Fhir.Core/Features/Operations/Reindex/ReindexJobWorker.cs index 2fe0ec0158..fc053624b9 100644 --- a/src/Microsoft.Health.Fhir.Core/Features/Operations/Reindex/ReindexJobWorker.cs +++ b/src/Microsoft.Health.Fhir.Core/Features/Operations/Reindex/ReindexJobWorker.cs @@ -60,6 +60,8 @@ public async Task ExecuteAsync(CancellationToken cancellationToken) { if (_searchParametersInitialized) { + var originalRequestContext = _contextAccessor.RequestContext; + // Create a background task context to trigger the correct retry policy. var fhirRequestContext = new FhirRequestContext( method: nameof(ReindexJobWorker), @@ -138,6 +140,8 @@ public async Task ExecuteAsync(CancellationToken cancellationToken) // The job failed. _logger.LogError(ex, "Error polling Reindex jobs."); } + + _contextAccessor.RequestContext = originalRequestContext; } try diff --git a/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Storage/Queues/CosmosQueueClientTests.cs b/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Storage/Queues/CosmosQueueClientTests.cs index c68b8d5c7e..6f56d5e557 100644 --- a/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Storage/Queues/CosmosQueueClientTests.cs +++ b/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Storage/Queues/CosmosQueueClientTests.cs @@ -48,8 +48,7 @@ public CosmosQueueClientTests() Substitute.For>>(), _cosmosQueryFactory, _distributedLockFactory, - _retryPolicyFactory, - NullLogger.Instance); + _retryPolicyFactory); } [Theory] diff --git a/src/Microsoft.Health.Fhir.CosmosDb/Features/Storage/Queues/CosmosQueueClient.cs b/src/Microsoft.Health.Fhir.CosmosDb/Features/Storage/Queues/CosmosQueueClient.cs index a3f7b60a30..725c7ed0f6 100644 --- a/src/Microsoft.Health.Fhir.CosmosDb/Features/Storage/Queues/CosmosQueueClient.cs +++ b/src/Microsoft.Health.Fhir.CosmosDb/Features/Storage/Queues/CosmosQueueClient.cs @@ -31,21 +31,18 @@ public class CosmosQueueClient : IQueueClient private readonly ICosmosQueryFactory _queryFactory; private readonly ICosmosDbDistributedLockFactory _distributedLockFactory; private readonly RetryExceptionPolicyFactory _retryExceptionPolicyFactory; - private readonly ILogger _logger; private readonly AsyncPolicy _retryPolicy; public CosmosQueueClient( Func> containerFactory, ICosmosQueryFactory queryFactory, ICosmosDbDistributedLockFactory distributedLockFactory, - RetryExceptionPolicyFactory retryExceptionPolicyFactor, - ILogger logger) + RetryExceptionPolicyFactory retryExceptionPolicyFactory) { _containerFactory = EnsureArg.IsNotNull(containerFactory, nameof(containerFactory)); _queryFactory = EnsureArg.IsNotNull(queryFactory, nameof(queryFactory)); _distributedLockFactory = EnsureArg.IsNotNull(distributedLockFactory, nameof(distributedLockFactory)); - _retryExceptionPolicyFactory = EnsureArg.IsNotNull(retryExceptionPolicyFactor, nameof(retryExceptionPolicyFactor)); - _logger = EnsureArg.IsNotNull(logger, nameof(logger)); + _retryExceptionPolicyFactory = EnsureArg.IsNotNull(retryExceptionPolicyFactory, nameof(retryExceptionPolicyFactory)); _retryPolicy = _retryExceptionPolicyFactory.BackgroundWorkerRetryPolicy; } diff --git a/test/Microsoft.Health.Fhir.Shared.Tests.Integration/Persistence/CosmosDbFhirStorageTestsFixture.cs b/test/Microsoft.Health.Fhir.Shared.Tests.Integration/Persistence/CosmosDbFhirStorageTestsFixture.cs index fb10d6e9d9..aa7256ace8 100644 --- a/test/Microsoft.Health.Fhir.Shared.Tests.Integration/Persistence/CosmosDbFhirStorageTestsFixture.cs +++ b/test/Microsoft.Health.Fhir.Shared.Tests.Integration/Persistence/CosmosDbFhirStorageTestsFixture.cs @@ -233,8 +233,7 @@ public virtual async Task InitializeAsync() () => _container.CreateMockScope(), new CosmosQueryFactory(Substitute.For(), Substitute.For()), new CosmosDbDistributedLockFactory(() => _container.CreateMockScope(), NullLogger.Instance), - retryExceptionPolicyFactory, - NullLogger.Instance); + retryExceptionPolicyFactory); _cosmosFhirOperationDataStore = new CosmosFhirOperationDataStore( _queueClient, From f8614e702e4840585abd182626d29d7405016065 Mon Sep 17 00:00:00 2001 From: Mikael Weaver Date: Wed, 15 Jan 2025 09:57:02 -0800 Subject: [PATCH 12/15] add logging to new tests --- .../Features/Storage/Queues/CosmosQueueClientTests.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Storage/Queues/CosmosQueueClientTests.cs b/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Storage/Queues/CosmosQueueClientTests.cs index 6f56d5e557..904b177666 100644 --- a/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Storage/Queues/CosmosQueueClientTests.cs +++ b/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Storage/Queues/CosmosQueueClientTests.cs @@ -117,7 +117,7 @@ public async Task GivenADequeueJobOperation_WhenExceptionWithRetryAfterIsProvide // Assert Assert.Equal(2, callCount); await cosmosQuery.ReceivedWithAnyArgs(2).ExecuteNextAsync(Arg.Any()); - Assert.True(stopwatch.Elapsed >= retryAfter, "Policy should respect the RetryAfter value."); + Assert.True(stopwatch.Elapsed >= retryAfter, $"Policy should respect the RetryAfter value. Stopwatch: {stopwatch.Elapsed}. Retry after: {retryAfter}."); } public class TestCosmosException(HttpStatusCode statusCode, TimeSpan? retryAfter = null) : CosmosException("Test exception message", statusCode, 0, "test-activity-id", 0.0) From 2c97c037ee6f9bfee0f46ad843a0e363c0bb9923 Mon Sep 17 00:00:00 2001 From: Mikael Weaver Date: Wed, 15 Jan 2025 10:19:44 -0800 Subject: [PATCH 13/15] relaxed retry test timing due to stopwatch impercision --- .../Features/Storage/Queues/CosmosQueueClientTests.cs | 9 +++++++-- .../Features/Storage/RetryExceptionPolicyFactory.cs | 8 ++++---- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Storage/Queues/CosmosQueueClientTests.cs b/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Storage/Queues/CosmosQueueClientTests.cs index 904b177666..537c8853e9 100644 --- a/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Storage/Queues/CosmosQueueClientTests.cs +++ b/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Storage/Queues/CosmosQueueClientTests.cs @@ -92,7 +92,7 @@ public async Task GivenADequeueJobOperation_WhenExceptionWithRetryAfterIsProvide ICosmosQuery cosmosQuery = Substitute.For>(); _cosmosQueryFactory.Create(Arg.Any(), Arg.Any()) .ReturnsForAnyArgs(cosmosQuery); - var retryAfter = TimeSpan.FromSeconds(1); + var retryAfter = TimeSpan.FromSeconds(2); int callCount = 0; cosmosQuery.ExecuteNextAsync(Arg.Any()).ReturnsForAnyArgs(_ => @@ -117,7 +117,12 @@ public async Task GivenADequeueJobOperation_WhenExceptionWithRetryAfterIsProvide // Assert Assert.Equal(2, callCount); await cosmosQuery.ReceivedWithAnyArgs(2).ExecuteNextAsync(Arg.Any()); - Assert.True(stopwatch.Elapsed >= retryAfter, $"Policy should respect the RetryAfter value. Stopwatch: {stopwatch.Elapsed}. Retry after: {retryAfter}."); + + // Allowing small imprecision due to timer resolution + var actualElapsedSeconds = stopwatch.Elapsed.TotalSeconds; + Assert.True( + Math.Abs(actualElapsedSeconds - retryAfter.TotalSeconds) <= 0.5, + $"Expected retry after {retryAfter.TotalSeconds} seconds, but actual elapsed time was {actualElapsedSeconds} seconds."); } public class TestCosmosException(HttpStatusCode statusCode, TimeSpan? retryAfter = null) : CosmosException("Test exception message", statusCode, 0, "test-activity-id", 0.0) diff --git a/src/Microsoft.Health.Fhir.CosmosDb/Features/Storage/RetryExceptionPolicyFactory.cs b/src/Microsoft.Health.Fhir.CosmosDb/Features/Storage/RetryExceptionPolicyFactory.cs index a70e61e018..900a0318a0 100644 --- a/src/Microsoft.Health.Fhir.CosmosDb/Features/Storage/RetryExceptionPolicyFactory.cs +++ b/src/Microsoft.Health.Fhir.CosmosDb/Features/Storage/RetryExceptionPolicyFactory.cs @@ -100,11 +100,11 @@ TimeSpan SleepDurationProvider(int retryAttempt, Exception exception) return Policy.Handle() .Or(e => e.IsRequestRateExceeded()) .Or(e => - e.StatusCode == System.Net.HttpStatusCode.ServiceUnavailable || - e.StatusCode == System.Net.HttpStatusCode.TooManyRequests || - e.StatusCode == System.Net.HttpStatusCode.Gone || + e.StatusCode == HttpStatusCode.ServiceUnavailable || + e.StatusCode == HttpStatusCode.TooManyRequests || + e.StatusCode == HttpStatusCode.Gone || e.StatusCode == (HttpStatusCode)449 || // "Retry with" status code - e.StatusCode == System.Net.HttpStatusCode.RequestTimeout) + e.StatusCode == HttpStatusCode.RequestTimeout) .WaitAndRetryAsync( retryCount: maxRetries, sleepDurationProvider: (retryAttempt, exception, context) => SleepDurationProvider(retryAttempt, exception), From 03f898b49f6cb5b8142844efec058ff1266bcf9e Mon Sep 17 00:00:00 2001 From: Mikael Weaver Date: Wed, 15 Jan 2025 12:27:50 -0800 Subject: [PATCH 14/15] removed primary constructor in CosmosQueueClientTests child test class --- .../Features/Storage/Queues/CosmosQueueClientTests.cs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Storage/Queues/CosmosQueueClientTests.cs b/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Storage/Queues/CosmosQueueClientTests.cs index 537c8853e9..51a0822d39 100644 --- a/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Storage/Queues/CosmosQueueClientTests.cs +++ b/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Storage/Queues/CosmosQueueClientTests.cs @@ -125,9 +125,15 @@ public async Task GivenADequeueJobOperation_WhenExceptionWithRetryAfterIsProvide $"Expected retry after {retryAfter.TotalSeconds} seconds, but actual elapsed time was {actualElapsedSeconds} seconds."); } - public class TestCosmosException(HttpStatusCode statusCode, TimeSpan? retryAfter = null) : CosmosException("Test exception message", statusCode, 0, "test-activity-id", 0.0) + public class TestCosmosException : CosmosException { - private readonly TimeSpan? _retryAfter = retryAfter; + private readonly TimeSpan? _retryAfter; + + public TestCosmosException(HttpStatusCode statusCode, TimeSpan? retryAfter = null) + : base("Test exception message", statusCode, 0, "test-activity-id", 0.0) + { + _retryAfter = retryAfter; + } public override TimeSpan? RetryAfter => _retryAfter; } From fb3616c4ccbf0dc61fb5d01c5924b5a15579782d Mon Sep 17 00:00:00 2001 From: Mikael Weaver Date: Wed, 15 Jan 2025 13:37:16 -0800 Subject: [PATCH 15/15] advanced base, non default delay --- .../Features/Storage/RetryExceptionPolicyFactory.cs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/Microsoft.Health.Fhir.CosmosDb/Features/Storage/RetryExceptionPolicyFactory.cs b/src/Microsoft.Health.Fhir.CosmosDb/Features/Storage/RetryExceptionPolicyFactory.cs index 900a0318a0..bde69df1d9 100644 --- a/src/Microsoft.Health.Fhir.CosmosDb/Features/Storage/RetryExceptionPolicyFactory.cs +++ b/src/Microsoft.Health.Fhir.CosmosDb/Features/Storage/RetryExceptionPolicyFactory.cs @@ -34,6 +34,9 @@ public class RetryExceptionPolicyFactory private const int _exponentialMaxJitterMs = 300; private const int _exponentialMaxDelayMs = 60 * 1000; + private const int _nonExponentialBaseDelayMs = 500; + private const int _nonExponentialMaxJitterMs = 300; + public RetryExceptionPolicyFactory(CosmosDataStoreConfiguration configuration, RequestContextAccessor requestContextAccessor, ILogger logger) { _requestContextAccessor = EnsureArg.IsNotNull(requestContextAccessor, nameof(requestContextAccessor)); @@ -92,8 +95,9 @@ TimeSpan SleepDurationProvider(int retryAttempt, Exception exception) return TimeSpan.FromMilliseconds(backoff + jitter); } - // Default fixed wait time - return TimeSpan.FromSeconds(2); + // Default logic: 500ms + retryAttempt * jitter + var defaultJitter = RandomNumberGenerator.GetInt32(0, _nonExponentialMaxJitterMs) * retryAttempt; // Jitter scaled by retry attempt + return TimeSpan.FromMilliseconds(_nonExponentialBaseDelayMs + defaultJitter); } // Retry recommendations for Cosmos DB: https://learn.microsoft.com/azure/cosmos-db/nosql/conceptual-resilient-sdk-applications#should-my-application-retry-on-errors