Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add additional error handling to CosmosHealthCheck #4781

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -61,48 +61,94 @@
Assert.Equal(HealthStatus.Healthy, result.Status);
}

[Fact]
public async Task GivenCosmosDb_WhenCosmosOperationCanceledExceptionIsAlwaysThrown_ThenUnhealthyStateShouldBeReturned()
[Theory]
[InlineData(typeof(CosmosOperationCanceledException))]
[InlineData(typeof(CosmosException))]
public async Task GivenCosmosDb_WhenRetryableExceptionIsAlwaysThrown_ThenUnhealthyStateShouldBeReturned(Type exceptionType)
{
// This test simulates that all Health Check calls result in OperationCanceledExceptions.
// And all retries should fail.
// Arrange
Exception exception;

if (exceptionType == typeof(CosmosOperationCanceledException))
{
exception = new CosmosOperationCanceledException(
originalException: new OperationCanceledException(),
diagnostics: Substitute.For<CosmosDiagnostics>());
}
else if (exceptionType == typeof(CosmosException))
{
exception = new CosmosException(
message: "Service Unavailable",
statusCode: System.Net.HttpStatusCode.ServiceUnavailable,
subStatusCode: 0,
activityId: Guid.NewGuid().ToString(),
requestCharge: 0);
}
else
{
throw new ArgumentException("Unsupported exception type.");
}

var diagnostics = Substitute.For<CosmosDiagnostics>();
var coce = new CosmosOperationCanceledException(originalException: new OperationCanceledException(), diagnostics);
_testProvider.PerformTestAsync(default, CancellationToken.None).ThrowsForAnyArgs(exception);

_testProvider.PerformTestAsync(default, CancellationToken.None).ThrowsForAnyArgs(coce);
// Act
HealthCheckResult result = await _healthCheck.CheckHealthAsync(new HealthCheckContext());

// Assert
Assert.Equal(HealthStatus.Unhealthy, result.Status);
_testProvider.ReceivedWithAnyArgs(3);
_testProvider.ReceivedWithAnyArgs(3); // Ensure the maximum retries were attempted
}

[Fact]
public async Task GivenCosmosDb_WhenCosmosOperationCanceledExceptionIsOnceThrown_ThenHealthyStateShouldBeReturned()
[Theory]
[InlineData(typeof(CosmosOperationCanceledException))]
[InlineData(typeof(CosmosException))]
public async Task GivenCosmosDb_WhenRetryableExceptionIsOnceThrown_ThenHealthyStateShouldBeReturned(Type exceptionType)
{
// This test simulates that the first call to Health Check results in an OperationCanceledException.
// The first attempt should fail, but the next ones should pass.
// Arrange
Exception exception;

var diagnostics = Substitute.For<CosmosDiagnostics>();
var coce = new CosmosOperationCanceledException(originalException: new OperationCanceledException(), diagnostics);
if (exceptionType == typeof(CosmosOperationCanceledException))
{
exception = new CosmosOperationCanceledException(
originalException: new OperationCanceledException(),
diagnostics: Substitute.For<CosmosDiagnostics>());
}
else if (exceptionType == typeof(CosmosException))
{
exception = new CosmosException(
message: "Service Unavailable",
statusCode: System.Net.HttpStatusCode.ServiceUnavailable,
subStatusCode: 0,
activityId: Guid.NewGuid().ToString(),
requestCharge: 0);
}
else
{
throw new ArgumentException("Unsupported exception type.");
}

int runs = 0;
Func<Task> fakeRetry = () =>
{
runs++;
if (runs == 1)

// Simulate failure on the first attempt and success on subsequent attempts
_testProvider.PerformTestAsync(default, CancellationToken.None)
.ReturnsForAnyArgs(_ =>
{
throw coce;
}
runs++;
if (runs == 1)
{
throw exception;
}

return Task.CompletedTask;
};
return Task.CompletedTask;
});

_testProvider.PerformTestAsync(default, CancellationToken.None).ReturnsForAnyArgs(x => fakeRetry());
// Act
HealthCheckResult result = await _healthCheck.CheckHealthAsync(new HealthCheckContext());

Assert.Equal(HealthStatus.Healthy, result.Status);
_testProvider.ReceivedWithAnyArgs(2);
// Assert
Assert.Equal(HealthStatus.Healthy, result.Status); // Final state should be Healthy
Assert.Equal(2, runs); // Ensure 2 attempts were made
_testProvider.ReceivedWithAnyArgs(2); // Verify PerformTestAsync was called twice
}

[Fact]
Expand Down Expand Up @@ -174,5 +220,26 @@
Assert.True(result.Data.ContainsKey("Error"));
Assert.Equal(FhirHealthErrorCode.Error429.ToString(), result.Data["Error"]);
}

[Fact]
public async Task GivenCosmosDbWithTimeout_WhenHealthIsChecked_ThenHealthyStateShouldBeReturned()
{
var exception = new CosmosException(
message: "RequestTimeout",
statusCode: HttpStatusCode.RequestTimeout,
subStatusCode: 0,
activityId: Guid.NewGuid().ToString(),
requestCharge: 0);

_testProvider.PerformTestAsync(default, CancellationToken.None)
.ThrowsForAnyArgs(exception);

HealthCheckResult result = await _healthCheck.CheckHealthAsync(new HealthCheckContext());

Assert.Equal(HealthStatus.Degraded, result.Status);

Assert.True(result.Data.ContainsKey("Error"));
github-advanced-security[bot] marked this conversation as resolved.
Fixed
Show resolved Hide resolved
Assert.Equal(FhirHealthErrorCode.Error408.ToString(), result.Data["Error"]);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

using System;
using System.Collections.Generic;
using System.Net;
using System.Threading;
using System.Threading.Tasks;
using EnsureThat;
Expand Down Expand Up @@ -79,51 +80,32 @@ public async Task<HealthCheckResult> CheckHealthAsync(HealthCheckContext context
catch (CosmosOperationCanceledException coce)
{
// CosmosOperationCanceledException are "safe to retry on and can be treated as timeouts from the retrying perspective.".
// Reference: https://learn.microsoft.com/en-us/azure/cosmos-db/nosql/troubleshoot-dotnet-sdk-request-timeout?tabs=cpu-new
// Reference: https://learn.microsoft.com/azure/cosmos-db/nosql/troubleshoot-dotnet-sdk-request-timeout?tabs=cpu-new
attempt++;

if (cancellationToken.IsCancellationRequested)
var result = HandleRetry(coce, attempt, maxNumberAttempts, nameof(CosmosOperationCanceledException), cancellationToken);
if (result.HasValue)
{
// Handling an extenal cancellation.
// No reasons to retry as the cancellation was external to the health check.

_logger.LogWarning(coce, "Failed to connect to the data store. External cancellation requested.");

return HealthCheckResult.Unhealthy(
description: UnhealthyDescription,
data: new Dictionary<string, object>
{
{ "Reason", HealthStatusReason.ServiceUnavailable },
{ "Error", FhirHealthErrorCode.Error408.ToString() },
});
return result.Value;
}
else if (attempt >= maxNumberAttempts)
}
catch (CosmosException ex) when (ex.StatusCode == HttpStatusCode.ServiceUnavailable ||
ex.StatusCode == (HttpStatusCode)449)
{
// Cosmos 503 and 449 are transient errors that can be retried.
// Reference: https://learn.microsoft.com/azure/cosmos-db/nosql/conceptual-resilient-sdk-applications#should-my-application-retry-on-errors
attempt++;
var result = HandleRetry(ex, attempt, maxNumberAttempts, nameof(CosmosException), cancellationToken);

// Log additional diagnostics for 503 errors.
if (ex.StatusCode == HttpStatusCode.ServiceUnavailable)
{
// This is a very rare situation. This condition indicates that multiple attempts to connect to the data store happened, but they were not successful.

_logger.LogWarning(
coce,
"Failed to connect to the data store. There were {NumberOfAttempts} attempts to connect to the data store, but they suffered a '{ExceptionType}'.",
attempt,
nameof(CosmosOperationCanceledException));

return HealthCheckResult.Unhealthy(
description: UnhealthyDescription,
data: new Dictionary<string, object>
{
{ "Reason", HealthStatusReason.ServiceUnavailable },
{ "Error", FhirHealthErrorCode.Error501.ToString() },
});
var diagnostics = ex.Diagnostics?.ToString() ?? "empty";
_logger.LogWarning(ex, "Received a ServiceUnavailable response from Cosmos DB. Retrying. Diagnostics: {CosmosDiagnostics}", diagnostics);
}
else
{
// Number of attempts not reached. Allow retry.

_logger.LogWarning(
coce,
"Failed to connect to the data store. Attempt {NumberOfAttempts}. '{ExceptionType}'.",
attempt,
nameof(CosmosOperationCanceledException));
if (result.HasValue)
{
return result.Value;
}
}
catch (CosmosException ex) when (ex.IsCmkClientError())
Expand All @@ -142,6 +124,22 @@ public async Task<HealthCheckResult> CheckHealthAsync(HealthCheckContext context
{ "Error", FhirHealthErrorCode.Error412.ToString() },
});
}
catch (CosmosException ex) when (ex.StatusCode == HttpStatusCode.RequestTimeout)
{
// Handling timeout exceptions

_logger.LogWarning(
ex,
"Failed to connect to the data store. Request has timed out.");

return HealthCheckResult.Degraded(
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should timeout requests to CosmosDB result in Degraded or ServiceUnavailable? 408 status code can mean the database is overloaded from client requests.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Discussed with @fhibf - degraded is the proper behavior here.

description: DegradedDescription,
data: new Dictionary<string, object>
{
{ "Reason", HealthStatusReason.ServiceDegraded },
{ "Error", FhirHealthErrorCode.Error408.ToString() },
});
}
catch (Exception ex) when (ex.IsRequestRateExceeded())
{
// Handling request rate exceptions.
Expand Down Expand Up @@ -176,5 +174,52 @@ public async Task<HealthCheckResult> CheckHealthAsync(HealthCheckContext context
}
while (true);
}

private HealthCheckResult? HandleRetry(Exception ex, int attempt, int maxNumberAttempts, string exceptionType, CancellationToken cancellationToken)
{
if (cancellationToken.IsCancellationRequested)
{
// Handling an extenal cancellation.
// No reasons to retry as the cancellation was external to the health check.

_logger.LogWarning(ex, "Failed to connect to the data store. External cancellation requested.");

return HealthCheckResult.Unhealthy(
description: UnhealthyDescription,
data: new Dictionary<string, object>
{
{ "Reason", HealthStatusReason.ServiceUnavailable },
{ "Error", FhirHealthErrorCode.Error408.ToString() },
});
}

if (attempt >= maxNumberAttempts)
{
// This is a very rare situation. This condition indicates that multiple attempts to connect to the data store happened, but they were not successful.

_logger.LogWarning(
ex,
"Failed to connect to the data store. There were {NumberOfAttempts} attempts to connect to the data store, but they suffered a '{ExceptionType}'.",
attempt,
exceptionType);

return HealthCheckResult.Unhealthy(
description: UnhealthyDescription,
data: new Dictionary<string, object>
{
{ "Reason", HealthStatusReason.ServiceUnavailable },
{ "Error", FhirHealthErrorCode.Error501.ToString() },
});
}

// Number of attempts not reached. Allow retry.
_logger.LogWarning(
ex,
"Failed to connect to the data store. Attempt {NumberOfAttempts}. '{ExceptionType}'.",
attempt,
exceptionType);

return null; // Indicates that the retry loop should continue.
}
}
}
Loading