Add additional error handling to CosmosHealthCheck #4781
base: main
Changes from 3 commits
```diff
@@ -5,6 +5,7 @@
 using System;
 using System.Collections.Generic;
 using System.Net;
 using System.Threading;
 using System.Threading.Tasks;
 using EnsureThat;
```
```diff
@@ -79,51 +80,32 @@ public async Task<HealthCheckResult> CheckHealthAsync(HealthCheckContext context
            catch (CosmosOperationCanceledException coce)
            {
                // CosmosOperationCanceledException are "safe to retry on and can be treated as timeouts from the retrying perspective.".
-               // Reference: https://learn.microsoft.com/en-us/azure/cosmos-db/nosql/troubleshoot-dotnet-sdk-request-timeout?tabs=cpu-new
+               // Reference: https://learn.microsoft.com/azure/cosmos-db/nosql/troubleshoot-dotnet-sdk-request-timeout?tabs=cpu-new
                attempt++;

-               if (cancellationToken.IsCancellationRequested)
+               var result = HandleRetry(coce, attempt, maxNumberAttempts, nameof(CosmosOperationCanceledException), cancellationToken);
+               if (result.HasValue)
                {
-                   // Handling an extenal cancellation.
-                   // No reasons to retry as the cancellation was external to the health check.
-
-                   _logger.LogWarning(coce, "Failed to connect to the data store. External cancellation requested.");
-
-                   return HealthCheckResult.Unhealthy(
-                       description: UnhealthyDescription,
-                       data: new Dictionary<string, object>
-                       {
-                           { "Reason", HealthStatusReason.ServiceUnavailable },
-                           { "Error", FhirHealthErrorCode.Error408.ToString() },
-                       });
+                   return result.Value;
                }
-               else if (attempt >= maxNumberAttempts)
-               {
-                   // This is a very rare situation. This condition indicates that multiple attempts to connect to the data store happened, but they were not successful.
-
-                   _logger.LogWarning(
-                       coce,
-                       "Failed to connect to the data store. There were {NumberOfAttempts} attempts to connect to the data store, but they suffered a '{ExceptionType}'.",
-                       attempt,
-                       nameof(CosmosOperationCanceledException));
-
-                   return HealthCheckResult.Unhealthy(
-                       description: UnhealthyDescription,
-                       data: new Dictionary<string, object>
-                       {
-                           { "Reason", HealthStatusReason.ServiceUnavailable },
-                           { "Error", FhirHealthErrorCode.Error501.ToString() },
-                       });
-               }
-               else
-               {
-                   // Number of attempts not reached. Allow retry.
-
-                   _logger.LogWarning(
-                       coce,
-                       "Failed to connect to the data store. Attempt {NumberOfAttempts}. '{ExceptionType}'.",
-                       attempt,
-                       nameof(CosmosOperationCanceledException));
-               }
+           }
+           catch (CosmosException ex) when (ex.StatusCode == HttpStatusCode.ServiceUnavailable ||
+                                            ex.StatusCode == (HttpStatusCode)449)
+           {
+               // Cosmos 503 and 449 are transient errors that can be retried.
+               // Reference: https://learn.microsoft.com/azure/cosmos-db/nosql/conceptual-resilient-sdk-applications#should-my-application-retry-on-errors
+               attempt++;
+               var result = HandleRetry(ex, attempt, maxNumberAttempts, nameof(CosmosException), cancellationToken);
+
+               // Log additional diagnostics for 503 errors.
+               if (ex.StatusCode == HttpStatusCode.ServiceUnavailable)
+               {
+                   var diagnostics = ex.Diagnostics?.ToString() ?? "empty";
+                   _logger.LogWarning(ex, "Received a ServiceUnavailable response from Cosmos DB. Retrying. Diagnostics: {CosmosDiagnostics}", diagnostics);
+               }
+
+               if (result.HasValue)
+               {
+                   return result.Value;
+               }
            }
            catch (CosmosException ex) when (ex.IsCmkClientError())
```
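The comments above lean on the Cosmos SDK guidance that operation-canceled exceptions and 503/449 responses are transient and safe to retry. As a generic illustration of that guidance only (not the health check's own code; the helper name, backoff values, and `maxAttempts` handling are assumptions), a retry wrapper might look like this:

```csharp
using System;
using System.Net;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Azure.Cosmos;

public static class TransientCosmosRetry
{
    // Hypothetical helper: retries a Cosmos operation on errors the SDK guidance
    // treats as transient (SDK-side operation cancellation, 503, 449), and lets the
    // exception propagate once attempts are exhausted or the caller cancelled.
    public static async Task<T> ExecuteAsync<T>(
        Func<CancellationToken, Task<T>> operation,
        int maxAttempts,
        CancellationToken cancellationToken)
    {
        for (int attempt = 1; ; attempt++)
        {
            try
            {
                return await operation(cancellationToken);
            }
            catch (CosmosOperationCanceledException) when (!cancellationToken.IsCancellationRequested && attempt < maxAttempts)
            {
                // SDK-side timeout; safe to retry. External cancellation is not retried.
            }
            catch (CosmosException ce) when (
                (ce.StatusCode == HttpStatusCode.ServiceUnavailable || ce.StatusCode == (HttpStatusCode)449)
                && attempt < maxAttempts)
            {
                // 503/449 are transient; retry after a short backoff.
            }

            // Simple exponential backoff between attempts (assumed values).
            await Task.Delay(TimeSpan.FromMilliseconds(100 * Math.Pow(2, attempt)), cancellationToken);
        }
    }
}
```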
```diff
@@ -142,6 +124,22 @@ public async Task<HealthCheckResult> CheckHealthAsync(HealthCheckContext context
                            { "Error", FhirHealthErrorCode.Error412.ToString() },
                        });
            }
+           catch (CosmosException ex) when (ex.StatusCode == HttpStatusCode.RequestTimeout)
+           {
+               // Handling timeout exceptions
+
+               _logger.LogWarning(
+                   ex,
+                   "Failed to connect to the data store. Request has timed out.");
+
+               return HealthCheckResult.Degraded(
+                   description: DegradedDescription,
+                   data: new Dictionary<string, object>
+                   {
+                       { "Reason", HealthStatusReason.ServiceDegraded },
+                       { "Error", FhirHealthErrorCode.Error408.ToString() },
+                   });
+           }
            catch (Exception ex) when (ex.IsRequestRateExceeded())
            {
                // Handling request rate exceptions.
```

Review comment on the `HealthCheckResult.Degraded` return: Should timeout requests to CosmosDB result in Degraded or ServiceUnavailable? A 408 status code can mean the database is overloaded from client requests.

Reply: Discussed with @fhibf - degraded is the proper behavior here.
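On the Degraded-versus-Unhealthy question in the thread above: in ASP.NET Core's health check middleware, the two statuses can be mapped to different HTTP status codes at the endpoint, which is what makes Degraded the softer signal (probes keep seeing success while monitoring can still alert). Below is a minimal sketch of that mapping, assuming a `/health` endpoint and default-style status codes; it is a general illustration, not this repository's actual configuration.

```csharp
using Microsoft.AspNetCore.Builder;
using Microsoft.AspNetCore.Diagnostics.HealthChecks;
using Microsoft.AspNetCore.Http;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Diagnostics.HealthChecks;

var builder = WebApplication.CreateBuilder(args);
builder.Services.AddHealthChecks(); // health checks (e.g., a Cosmos check) would be registered here

var app = builder.Build();

// Map health statuses to HTTP status codes. With this mapping, Degraded still
// returns 200 so load balancers keep routing traffic, while Unhealthy returns
// 503 and takes the instance out of rotation.
app.MapHealthChecks("/health", new HealthCheckOptions
{
    ResultStatusCodes =
    {
        [HealthStatus.Healthy] = StatusCodes.Status200OK,
        [HealthStatus.Degraded] = StatusCodes.Status200OK,
        [HealthStatus.Unhealthy] = StatusCodes.Status503ServiceUnavailable,
    },
});

app.Run();
```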
```diff
@@ -176,5 +174,52 @@ public async Task<HealthCheckResult> CheckHealthAsync(HealthCheckContext context
            }
            while (true);
        }
+
+       private HealthCheckResult? HandleRetry(Exception ex, int attempt, int maxNumberAttempts, string exceptionType, CancellationToken cancellationToken)
+       {
+           if (cancellationToken.IsCancellationRequested)
+           {
+               // Handling an extenal cancellation.
+               // No reasons to retry as the cancellation was external to the health check.
+
+               _logger.LogWarning(ex, "Failed to connect to the data store. External cancellation requested.");
+
+               return HealthCheckResult.Unhealthy(
+                   description: UnhealthyDescription,
+                   data: new Dictionary<string, object>
+                   {
+                       { "Reason", HealthStatusReason.ServiceUnavailable },
+                       { "Error", FhirHealthErrorCode.Error408.ToString() },
+                   });
+           }
+
+           if (attempt >= maxNumberAttempts)
+           {
+               // This is a very rare situation. This condition indicates that multiple attempts to connect to the data store happened, but they were not successful.
+
+               _logger.LogWarning(
+                   ex,
+                   "Failed to connect to the data store. There were {NumberOfAttempts} attempts to connect to the data store, but they suffered a '{ExceptionType}'.",
+                   attempt,
+                   exceptionType);
+
+               return HealthCheckResult.Unhealthy(
+                   description: UnhealthyDescription,
+                   data: new Dictionary<string, object>
+                   {
+                       { "Reason", HealthStatusReason.ServiceUnavailable },
+                       { "Error", FhirHealthErrorCode.Error501.ToString() },
+                   });
+           }
+
+           // Number of attempts not reached. Allow retry.
+           _logger.LogWarning(
+               ex,
+               "Failed to connect to the data store. Attempt {NumberOfAttempts}. '{ExceptionType}'.",
+               attempt,
+               exceptionType);
+
+           return null; // Indicates that the retry loop should continue.
+       }
    }
 }
```
Review comment on `HandleRetry`: Added this helper due to code scanning errors.
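For readers skimming the diff, the helper's contract is: return a terminal `HealthCheckResult` to stop, or `null` to let the loop try again. A stripped-down sketch of that nullable-result retry shape, with illustrative names and none of the Cosmos-specific exception filters:

```csharp
using System;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Extensions.Diagnostics.HealthChecks;

public sealed class RetryingCheck
{
    // Illustrative only: mirrors the shape of the PR's loop, where a helper
    // returns a terminal HealthCheckResult or null to mean "retry".
    public async Task<HealthCheckResult> CheckAsync(
        Func<CancellationToken, Task> probe,
        int maxAttempts,
        CancellationToken cancellationToken)
    {
        int attempt = 0;
        do
        {
            try
            {
                await probe(cancellationToken);
                return HealthCheckResult.Healthy();
            }
            catch (Exception ex)
            {
                attempt++;
                HealthCheckResult? result = HandleRetry(ex, attempt, maxAttempts, cancellationToken);
                if (result.HasValue)
                {
                    return result.Value; // terminal outcome
                }

                // null => transient failure, loop and try again
            }
        }
        while (true);
    }

    private static HealthCheckResult? HandleRetry(Exception ex, int attempt, int maxAttempts, CancellationToken cancellationToken)
    {
        if (cancellationToken.IsCancellationRequested || attempt >= maxAttempts)
        {
            return HealthCheckResult.Unhealthy(description: ex.Message);
        }

        return null; // keep retrying
    }
}
```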