Skip to content

Commit

Permalink
chore: Agent Control health checks (#2968)
Browse files Browse the repository at this point in the history
  • Loading branch information
tippmar-nr authored Jan 30, 2025
1 parent 66ab25a commit 6cd66df
Show file tree
Hide file tree
Showing 40 changed files with 1,629 additions and 119 deletions.
139 changes: 135 additions & 4 deletions src/Agent/NewRelic/Agent/Core/AgentHealth/AgentHealthReporter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
using System.Linq;
using System.Net;
using System.Threading;
using System.IO;
using System.Text;

namespace NewRelic.Agent.Core.AgentHealth
{
Expand All @@ -26,6 +28,8 @@ public class AgentHealthReporter : ConfigurationBasedService, IAgentHealthReport

private readonly IMetricBuilder _metricBuilder;
private readonly IScheduler _scheduler;
private readonly IFileWrapper _fileWrapper;
private readonly IDirectoryWrapper _directoryWrapper;
private readonly IList<string> _recurringLogData = new ConcurrentList<string>();
private readonly IDictionary<AgentHealthEvent, InterlockedCounter> _agentHealthEventCounters = new Dictionary<AgentHealthEvent, InterlockedCounter>();
private readonly ConcurrentDictionary<string, InterlockedCounter> _logLinesCountByLevel = new ConcurrentDictionary<string, InterlockedCounter>();
Expand All @@ -38,10 +42,30 @@ public class AgentHealthReporter : ConfigurationBasedService, IAgentHealthReport
private InterlockedCounter _traceContextCreateSuccessCounter;
private InterlockedCounter _traceContextAcceptSuccessCounter;

public AgentHealthReporter(IMetricBuilder metricBuilder, IScheduler scheduler)
private HealthCheck _healthCheck;
private bool _healthChecksInitialized;
private bool _healthChecksFailed;
private string _healthCheckPath;

public AgentHealthReporter(IMetricBuilder metricBuilder, IScheduler scheduler, IFileWrapper fileWrapper, IDirectoryWrapper directoryWrapper)
{
_metricBuilder = metricBuilder;
_scheduler = scheduler;
_fileWrapper = fileWrapper;
_directoryWrapper = directoryWrapper;

if (!_configuration.AgentControlEnabled)
Log.Debug("Agent Control is disabled. Health checks will not be reported.");
else
{
Log.Debug("Agent Control health checks will be published every {HealthCheckInterval} seconds", _configuration.HealthFrequency);

_healthCheck = new() { IsHealthy = true, Status = "Agent starting", LastError = string.Empty };

// schedule the health check and issue the first one immediately
_scheduler.ExecuteEvery(PublishAgentControlHealthCheck, TimeSpan.FromSeconds(_configuration.HealthFrequency), TimeSpan.Zero);
}

_scheduler.ExecuteEvery(LogPeriodicReport, _timeBetweenExecutions);
var agentHealthEvents = Enum.GetValues(typeof(AgentHealthEvent)) as AgentHealthEvent[];
foreach (var agentHealthEvent in agentHealthEvents)
Expand Down Expand Up @@ -258,9 +282,9 @@ public void ReportIfHostIsLinuxOs()
{
#if NETSTANDARD2_0

bool isLinux = System.Runtime.InteropServices.RuntimeInformation.IsOSPlatform(System.Runtime.InteropServices.OSPlatform.Linux);
var metric =_metricBuilder.TryBuildLinuxOsMetric(isLinux);
TrySend(metric);
bool isLinux = System.Runtime.InteropServices.RuntimeInformation.IsOSPlatform(System.Runtime.InteropServices.OSPlatform.Linux);
var metric = _metricBuilder.TryBuildLinuxOsMetric(isLinux);
TrySend(metric);
#endif
}

Expand Down Expand Up @@ -667,6 +691,107 @@ public void ReportLogForwardingConfiguredValues()

#endregion

#region Agent Control

/// <summary>
/// Reports the supportability count metric indicating that Agent Control health checks are enabled.
/// Nothing is reported when Agent Control is disabled in the configuration.
/// </summary>
private void ReportIfAgentControlHealthEnabled()
{
    if (!_configuration.AgentControlEnabled)
        return;

    ReportSupportabilityCountMetric(MetricNames.SupportabilityAgentControlHealthEnabled);
}

/// <summary>
/// Records a new Agent Control health status on the current health check.
/// </summary>
/// <param name="healthStatus">The health code tuple (healthy flag, code, status message) to record.</param>
/// <param name="statusParams">Optional values forwarded to the health check for formatting the status message.</param>
public void SetAgentControlStatus((bool IsHealthy, string Code, string Status) healthStatus, params string[] statusParams)
{
    // Nothing to record when agent control is not enabled
    if (!_configuration.AgentControlEnabled)
        return;

    var isHealthyShutdown = healthStatus.Equals(HealthCodes.AgentShutdownHealthy);
    if (!isHealthyShutdown)
    {
        _healthCheck.TrySetHealth(healthStatus, statusParams);
        return;
    }

    // A "healthy shutdown" is only recorded while the agent is currently healthy,
    // so that an existing error status is left in place.
    if (_healthCheck.IsHealthy)
        _healthCheck.TrySetHealth(healthStatus);
}

/// <summary>
/// Writes the current Agent Control health check, serialized as YAML, to its file in the
/// configured delivery location. Runs <see cref="InitializeHealthChecks"/> on the first
/// invocation, and stops its own scheduled execution when Agent Control is disabled or a
/// previous initialization or write step has failed.
/// </summary>
public void PublishAgentControlHealthCheck()
{
    if (!_healthChecksInitialized) // initialize on first invocation
    {
        InitializeHealthChecks();
        _healthChecksInitialized = true;
    }

    // stop the scheduled task if agent control isn't enabled or health checks fail for any reason
    if (!_configuration.AgentControlEnabled || _healthChecksFailed)
    {
        _scheduler.StopExecuting(PublishAgentControlHealthCheck);
        return;
    }

    var healthCheckYaml = _healthCheck.ToYaml();

    Log.Finest("Publishing Agent Control health check report: {HealthCheckYaml}", healthCheckYaml);

    try
    {
        using var fs = _fileWrapper.OpenWrite(Path.Combine(_healthCheckPath, _healthCheck.FileName));
        var payloadBytes = Encoding.UTF8.GetBytes(healthCheckYaml);
        fs.Write(payloadBytes, 0, payloadBytes.Length);

        // The health file is rewritten in place on each publish. NOTE(review): assuming
        // IFileWrapper.OpenWrite has File.OpenWrite semantics (existing content is NOT truncated);
        // without trimming, a report shorter than the previous one would leave stale trailing
        // bytes behind and produce invalid YAML.
        fs.SetLength(payloadBytes.Length);
        fs.Flush();
    }
    catch (Exception ex)
    {
        Log.Warn(ex, "Failed to write Agent Control health check report. Health checks will be disabled.");
        // picked up by the check above on the next scheduled invocation, which stops the task
        _healthChecksFailed = true;
    }
}

/// <summary>
/// Validates the Agent Control health delivery location and records the directory that health
/// check files are written to. Any validation failure is logged as a warning and sets
/// <c>_healthChecksFailed</c>; this method does not throw for an invalid delivery location.
/// </summary>
private void InitializeHealthChecks()
{
    if (!_configuration.AgentControlEnabled)
    {
        Log.Debug("Agent Control is disabled. Health checks will not be reported.");
        return;
    }

    Log.Debug("Initializing Agent Control health checks");

    // make sure the delivery location is a file URI; TryCreate is used instead of the Uri
    // constructor so that a missing or malformed value disables health checks rather than
    // throwing out of the scheduled callback
    if (!Uri.TryCreate(_configuration.HealthDeliveryLocation, UriKind.Absolute, out var fileUri)
        || fileUri.Scheme != Uri.UriSchemeFile)
    {
        Log.Warn(
            "Agent Control is enabled but the provided agent_control.health.delivery_location is not a file URL. Health checks will be disabled.");
        _healthChecksFailed = true;
        return;
    }

    _healthCheckPath = fileUri.LocalPath;

    // verify the directory exists
    if (!_directoryWrapper.Exists(_healthCheckPath))
    {
        Log.Warn("Agent Control is enabled but the path specified in agent_control.health.delivery_location does not exist. Health checks will be disabled.");
        _healthChecksFailed = true;
        // no point probing for write access in a directory that isn't there
        return;
    }

    // verify we can write a file to the directory
    var testFile = Path.Combine(_healthCheckPath, Path.GetRandomFileName());
    if (!_fileWrapper.TryCreateFile(testFile))
    {
        Log.Warn("Agent Control is enabled but the agent is unable to create files in the directory specified in agent_control.health.delivery_location. Health checks will be disabled.");
        _healthChecksFailed = true;
    }
}
#endregion

public void ReportSupportabilityPayloadsDroppeDueToMaxPayloadSizeLimit(string endpoint)
{
TrySend(_metricBuilder.TryBuildSupportabilityPayloadsDroppedDueToMaxPayloadLimit(endpoint));
Expand All @@ -686,6 +811,7 @@ private void CollectOneTimeMetrics()
ReportIfInstrumentationIsDisabled();
ReportIfGCSamplerV2IsEnabled();
ReportIfAwsAccountIdProvided();
ReportIfAgentControlHealthEnabled();
}

public void CollectMetrics()
Expand Down Expand Up @@ -857,5 +983,10 @@ private void ReportIfAwsAccountIdProvided()
ReportSupportabilityCountMetric(MetricNames.SupportabilityAwsAccountIdProvided);
}
}

/// <summary>
/// FOR UNIT TESTING ONLY.
/// Exposes the current value of the internal health-checks-failed flag.
/// </summary>
public bool HealthCheckFailed
{
    get { return _healthChecksFailed; }
}
}
}
61 changes: 61 additions & 0 deletions src/Agent/NewRelic/Agent/Core/AgentHealth/HealthCheck.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
// Copyright 2020 New Relic, Inc. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

using System;
using NewRelic.Agent.Core.Utilities;

namespace NewRelic.Agent.Core.AgentHealth
{
/// <summary>
/// Holds the agent's current health state and renders it as the YAML payload of a health check file.
/// Updates via <see cref="TrySetHealth"/> and reads via <see cref="ToYaml"/> are serialized
/// through a private lock.
/// </summary>
public class HealthCheck
{
    private const int NanoSecondsPerMillisecond = 1000000;

    // Dedicated lock target: locking on `this` would let any code holding a reference to this
    // public type contend for (or deadlock on) the same monitor.
    private readonly object _syncRoot = new object();

    /// <summary>Whether the most recently recorded status is a healthy one.</summary>
    public bool IsHealthy { get; internal set; }

    /// <summary>Human readable status message (already formatted with any status parameters).</summary>
    public string Status { get; internal set; }

    /// <summary>The code of the most recently recorded health status.</summary>
    public string LastError { get; internal set; }

    /// <summary>UTC time at which this instance was created.</summary>
    public DateTime StartTime { get; } = DateTime.UtcNow;

    /// <summary>
    /// UTC time of the most recent status update. Starts at the creation time so that a report
    /// rendered before the first <see cref="TrySetHealth"/> call does not carry
    /// <see cref="DateTime.MinValue"/>.
    /// </summary>
    public DateTime StatusTime { get; internal set; } = DateTime.UtcNow;

    /// <summary>Name of the health file for this instance; generated once per instance.</summary>
    public string FileName { get; } = "health-" + System.Guid.NewGuid().ToString("N") + ".yml";

    /// <summary>
    /// Set the health status of the agent and refresh the status timestamp.
    /// </summary>
    /// <param name="healthStatus">The health code tuple (healthy flag, code, status message template) to record.</param>
    /// <param name="statusParams">
    /// Optional values substituted into the status message template with <see cref="string.Format(string, object[])"/>.
    /// When empty or null the template is used verbatim.
    /// </param>
    /// <exception cref="FormatException">
    /// The template references more arguments than <paramref name="statusParams"/> supplies.
    /// No state is modified in that case.
    /// </exception>
    public void TrySetHealth((bool IsHealthy, string Code, string Status) healthStatus, params string[] statusParams)
    {
        // Format before touching any state so a bad template/argument combination cannot leave
        // the instance half-updated.
        var status = statusParams is { Length: > 0 }
            ? string.Format(healthStatus.Status, statusParams)
            : healthStatus.Status;

        lock (_syncRoot)
        {
            IsHealthy = healthStatus.IsHealthy;

            // Always refreshed: the same code can be reported with different status parameters.
            // (This previously compared the status *message* against the status *code*, a check
            // that in practice never matched, so the status was already being replaced each time.)
            Status = status;
            LastError = healthStatus.Code;

            StatusTime = DateTime.UtcNow;
        }
    }

    /// <summary>
    /// Renders the current state as the YAML content of a health check file.
    /// </summary>
    /// <returns>A YAML document with healthy, status, last_error, start_time_unix_nano and status_time_unix_nano entries.</returns>
    public string ToYaml()
    {
        lock (_syncRoot)
        {
            return
                $"healthy: {IsHealthy}\nstatus: {Status}\nlast_error: {LastError}\nstart_time_unix_nano: {StartTime.ToUnixTimeMilliseconds() * NanoSecondsPerMillisecond}\nstatus_time_unix_nano: {StatusTime.ToUnixTimeMilliseconds() * NanoSecondsPerMillisecond}";
        }
    }
}
}
84 changes: 84 additions & 0 deletions src/Agent/NewRelic/Agent/Core/AgentHealth/HealthCodes.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
// Copyright 2020 New Relic, Inc. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

namespace NewRelic.Agent.Core.AgentHealth
{
/// <summary>
/// The set of health statuses the agent can report. Each entry is a tuple of
/// (healthy flag, status code, status message); messages containing <c>{0}</c>/<c>{1}</c>
/// placeholders are composite format strings.
/// </summary>
public static class HealthCodes
{
    /// <summary>The agent is healthy.</summary>
    public static readonly (bool IsHealthy, string Code, string Status) Healthy = HealthyState("NR-APM-000", "Healthy");

    /// <summary>The license key was rejected (HTTP status code 401).</summary>
    public static readonly (bool IsHealthy, string Code, string Status) LicenseKeyInvalid = Unhealthy("NR-APM-001", "Invalid license key (HTTP status code 401)");

    /// <summary>No license key is present in the configuration.</summary>
    public static readonly (bool IsHealthy, string Code, string Status) LicenseKeyMissing = Unhealthy("NR-APM-002", "License key missing in configuration");

    /// <summary>New Relic sent a forced disconnect (HTTP status code 410).</summary>
    public static readonly (bool IsHealthy, string Code, string Status) ForceDisconnect = Unhealthy("NR-APM-003", "Forced disconnect received from New Relic (HTTP status code 410)");

    /// <summary>
    /// An HTTP error response was received from New Relic.
    /// Placeholders: <c>{0}</c> = response code, <c>{1}</c> = data type being sent.
    /// </summary>
    public static readonly (bool IsHealthy, string Code, string Status) HttpError = Unhealthy("NR-APM-004", "HTTP error response code {0} received from New Relic while sending data type {1}");

    /// <summary>The agent configuration has no application name.</summary>
    public static readonly (bool IsHealthy, string Code, string Status) ApplicationNameMissing = Unhealthy("NR-APM-005", "Missing application name in agent configuration");

    /// <summary>More than the maximum number (3) of application names are configured.</summary>
    public static readonly (bool IsHealthy, string Code, string Status) MaxApplicationNamesExceeded = Unhealthy("NR-APM-006", "The maximum number of configured app names (3) exceeded");

    /// <summary>
    /// HTTP proxy configuration error.
    /// Placeholders: <c>{0}</c> = response code.
    /// </summary>
    public static readonly (bool IsHealthy, string Code, string Status) HttpProxyError = Unhealthy("NR-APM-007", "HTTP Proxy configuration error; response code {0}");

    /// <summary>The agent is disabled via configuration.</summary>
    public static readonly (bool IsHealthy, string Code, string Status) AgentDisabledByConfiguration = Unhealthy("NR-APM-008", "Agent is disabled via configuration");

    /// <summary>The agent failed to connect to the New Relic data collector.</summary>
    public static readonly (bool IsHealthy, string Code, string Status) FailedToConnect = Unhealthy("NR-APM-009", "Failed to connect to New Relic data collector");

    /// <summary>
    /// The agent has shut down.
    /// This must only be reported if the agent is "healthy" at shutdown.
    /// If the agent status is not Healthy at shutdown, the existing error MUST NOT be overwritten.
    /// </summary>
    public static readonly (bool IsHealthy, string Code, string Status) AgentShutdownHealthy = HealthyState("NR-APM-099", "Agent has shutdown");

    // Agent health codes for the .NET agent are 200-299

    /// <summary>
    /// The agent has shut down because of an exception.
    /// Placeholders: <c>{0}</c> = the exception.
    /// </summary>
    public static readonly (bool IsHealthy, string Code, string Status) AgentShutdownError = Unhealthy("NR-APM-200", "Agent has shutdown with exception {0}");

    // Builds a status entry whose healthy flag is true.
    private static (bool IsHealthy, string Code, string Status) HealthyState(string code, string status)
    {
        return (true, code, status);
    }

    // Builds a status entry whose healthy flag is false.
    private static (bool IsHealthy, string Code, string Status) Unhealthy(string code, string status)
    {
        return (false, code, status);
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -151,5 +151,7 @@ public interface IAgentHealthReporter : IOutOfBandMetricSource
void ReportLogForwardingEnabledWithFramework(string logFramework);
void ReportByteMetric(string metricName, long totalBytes, long? exclusiveBytes = null);
void ReportLoggingEventsEmpty(int count = 1);
void SetAgentControlStatus((bool IsHealthy, string Code, string Status) healthStatus, params string[] statusParams);
void PublishAgentControlHealthCheck();
}
}
Loading

0 comments on commit 6cd66df

Please sign in to comment.