Skip to content

Commit 6cd66df

Browse files
authored
chore: Agent Control health checks (#2968)
1 parent 66ab25a commit 6cd66df

40 files changed

+1629
-119
lines changed

src/Agent/NewRelic/Agent/Core/AgentHealth/AgentHealthReporter.cs

Lines changed: 135 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,8 @@
1717
using System.Linq;
1818
using System.Net;
1919
using System.Threading;
20+
using System.IO;
21+
using System.Text;
2022

2123
namespace NewRelic.Agent.Core.AgentHealth
2224
{
@@ -26,6 +28,8 @@ public class AgentHealthReporter : ConfigurationBasedService, IAgentHealthReport
2628

2729
private readonly IMetricBuilder _metricBuilder;
2830
private readonly IScheduler _scheduler;
31+
private readonly IFileWrapper _fileWrapper;
32+
private readonly IDirectoryWrapper _directoryWrapper;
2933
private readonly IList<string> _recurringLogData = new ConcurrentList<string>();
3034
private readonly IDictionary<AgentHealthEvent, InterlockedCounter> _agentHealthEventCounters = new Dictionary<AgentHealthEvent, InterlockedCounter>();
3135
private readonly ConcurrentDictionary<string, InterlockedCounter> _logLinesCountByLevel = new ConcurrentDictionary<string, InterlockedCounter>();
@@ -38,10 +42,30 @@ public class AgentHealthReporter : ConfigurationBasedService, IAgentHealthReport
3842
private InterlockedCounter _traceContextCreateSuccessCounter;
3943
private InterlockedCounter _traceContextAcceptSuccessCounter;
4044

41-
public AgentHealthReporter(IMetricBuilder metricBuilder, IScheduler scheduler)
45+
private HealthCheck _healthCheck;
46+
private bool _healthChecksInitialized;
47+
private bool _healthChecksFailed;
48+
private string _healthCheckPath;
49+
50+
public AgentHealthReporter(IMetricBuilder metricBuilder, IScheduler scheduler, IFileWrapper fileWrapper, IDirectoryWrapper directoryWrapper)
4251
{
4352
_metricBuilder = metricBuilder;
4453
_scheduler = scheduler;
54+
_fileWrapper = fileWrapper;
55+
_directoryWrapper = directoryWrapper;
56+
57+
if (!_configuration.AgentControlEnabled)
58+
Log.Debug("Agent Control is disabled. Health checks will not be reported.");
59+
else
60+
{
61+
Log.Debug("Agent Control health checks will be published every {HealthCheckInterval} seconds", _configuration.HealthFrequency);
62+
63+
_healthCheck = new() { IsHealthy = true, Status = "Agent starting", LastError = string.Empty };
64+
65+
// schedule the health check and issue the first one immediately
66+
_scheduler.ExecuteEvery(PublishAgentControlHealthCheck, TimeSpan.FromSeconds(_configuration.HealthFrequency), TimeSpan.Zero);
67+
}
68+
4569
_scheduler.ExecuteEvery(LogPeriodicReport, _timeBetweenExecutions);
4670
var agentHealthEvents = Enum.GetValues(typeof(AgentHealthEvent)) as AgentHealthEvent[];
4771
foreach (var agentHealthEvent in agentHealthEvents)
@@ -258,9 +282,9 @@ public void ReportIfHostIsLinuxOs()
258282
{
259283
#if NETSTANDARD2_0
260284

261-
bool isLinux = System.Runtime.InteropServices.RuntimeInformation.IsOSPlatform(System.Runtime.InteropServices.OSPlatform.Linux);
262-
var metric =_metricBuilder.TryBuildLinuxOsMetric(isLinux);
263-
TrySend(metric);
285+
bool isLinux = System.Runtime.InteropServices.RuntimeInformation.IsOSPlatform(System.Runtime.InteropServices.OSPlatform.Linux);
286+
var metric = _metricBuilder.TryBuildLinuxOsMetric(isLinux);
287+
TrySend(metric);
264288
#endif
265289
}
266290

@@ -667,6 +691,107 @@ public void ReportLogForwardingConfiguredValues()
667691

668692
#endregion
669693

694+
#region Agent Control
695+
696+
/// <summary>
/// Reports the Agent Control health supportability count metric, but only when
/// Agent Control is enabled in the current configuration.
/// </summary>
private void ReportIfAgentControlHealthEnabled()
{
    // Nothing to report when the feature is switched off.
    if (!_configuration.AgentControlEnabled)
    {
        return;
    }

    ReportSupportabilityCountMetric(MetricNames.SupportabilityAgentControlHealthEnabled);
}
703+
704+
/// <summary>
/// Records a new Agent Control health status on the current health check.
/// </summary>
/// <param name="healthStatus">
/// The status tuple (healthy flag, code, status message) to record; typically one of the
/// <see cref="HealthCodes"/> values.
/// </param>
/// <param name="statusParams">
/// Optional values forwarded to the health check along with the status. They are not
/// forwarded for <see cref="HealthCodes.AgentShutdownHealthy"/>.
/// </param>
public void SetAgentControlStatus((bool IsHealthy, string Code, string Status) healthStatus, params string[] statusParams)
{
    // Do nothing if agent control is not enabled
    if (!_configuration.AgentControlEnabled)
        return;

    // The health check instance is only created by the constructor when Agent Control was
    // enabled at construction time. Guard against the flag being observed as enabled here
    // while no instance exists, which would otherwise throw a NullReferenceException.
    var healthCheck = _healthCheck;
    if (healthCheck == null)
        return;

    if (healthStatus.Equals(HealthCodes.AgentShutdownHealthy))
    {
        // A healthy shutdown is only recorded when the agent is currently healthy,
        // so that an existing error status is not overwritten.
        if (healthCheck.IsHealthy)
        {
            healthCheck.TrySetHealth(healthStatus);
        }

        return;
    }

    healthCheck.TrySetHealth(healthStatus, statusParams);
}
722+
723+
/// <summary>
/// Writes the current Agent Control health check, serialized as YAML, to the health check
/// file in the configured delivery location.
/// </summary>
/// <remarks>
/// The first invocation runs <c>InitializeHealthChecks</c>. If Agent Control is disabled, or
/// initialization or a previous write failed, the scheduled publication is stopped and nothing
/// is written. Any exception raised while writing is logged and permanently disables health checks.
/// </remarks>
public void PublishAgentControlHealthCheck()
{
    if (!_healthChecksInitialized) // initialize on first invocation
    {
        InitializeHealthChecks();
        _healthChecksInitialized = true;
    }

    // stop the scheduled task if agent control isn't enabled or health checks fail for any reason
    if (!_configuration.AgentControlEnabled || _healthChecksFailed)
    {
        _scheduler.StopExecuting(PublishAgentControlHealthCheck);
        return;
    }

    var healthCheckYaml = _healthCheck.ToYaml();

    Log.Finest("Publishing Agent Control health check report: {HealthCheckYaml}", healthCheckYaml);

    try
    {
        using var fs = _fileWrapper.OpenWrite(Path.Combine(_healthCheckPath, _healthCheck.FileName));
        var payloadBytes = Encoding.UTF8.GetBytes(healthCheckYaml);
        fs.Write(payloadBytes, 0, payloadBytes.Length);

        // The same file is rewritten on every publication. Opening for write does not
        // truncate existing content (assuming the wrapper follows File.OpenWrite semantics),
        // so cut the file to the payload length; otherwise a shorter report written after a
        // longer one would leave stale trailing bytes and produce invalid YAML.
        fs.SetLength(payloadBytes.Length);
        fs.Flush();
    }
    catch (Exception ex)
    {
        Log.Warn(ex, "Failed to write Agent Control health check report. Health checks will be disabled.");
        _healthChecksFailed = true;
    }
}
755+
756+
/// <summary>
/// Validates the Agent Control health delivery location and records the directory that
/// health check files will be written to.
/// </summary>
/// <remarks>
/// Sets <c>_healthChecksFailed</c> and logs a warning when the delivery location is not a
/// valid file URI, when the directory does not exist, or when a test file cannot be created
/// in it. Validation stops at the first failure.
/// </remarks>
private void InitializeHealthChecks()
{
    if (!_configuration.AgentControlEnabled)
    {
        Log.Debug("Agent Control is disabled. Health checks will not be reported.");
        return;
    }

    Log.Debug("Initializing Agent Control health checks");

    // make sure the delivery location is a file URI; TryCreate is used so that a missing or
    // malformed value disables health checks instead of throwing out of the scheduled callback
    if (!Uri.TryCreate(_configuration.HealthDeliveryLocation, UriKind.Absolute, out var fileUri)
        || fileUri.Scheme != Uri.UriSchemeFile)
    {
        Log.Warn(
            "Agent Control is enabled but the provided agent_control.health.delivery_location is not a file URL. Health checks will be disabled.");
        _healthChecksFailed = true;
        return;
    }

    _healthCheckPath = fileUri.LocalPath;

    // verify the directory exists
    if (!_directoryWrapper.Exists(_healthCheckPath))
    {
        Log.Warn("Agent Control is enabled but the path specified in agent_control.health.delivery_location does not exist. Health checks will be disabled.");
        _healthChecksFailed = true;
        return; // no point probing for write access in a directory that does not exist
    }

    // verify we can write a file to the directory
    var testFile = Path.Combine(_healthCheckPath, Path.GetRandomFileName());
    if (!_fileWrapper.TryCreateFile(testFile))
    {
        Log.Warn("Agent Control is enabled but the agent is unable to create files in the directory specified in agent_control.health.delivery_location. Health checks will be disabled.");
        _healthChecksFailed = true;
    }
}
793+
#endregion
794+
670795
public void ReportSupportabilityPayloadsDroppeDueToMaxPayloadSizeLimit(string endpoint)
671796
{
672797
TrySend(_metricBuilder.TryBuildSupportabilityPayloadsDroppedDueToMaxPayloadLimit(endpoint));
@@ -686,6 +811,7 @@ private void CollectOneTimeMetrics()
686811
ReportIfInstrumentationIsDisabled();
687812
ReportIfGCSamplerV2IsEnabled();
688813
ReportIfAwsAccountIdProvided();
814+
ReportIfAgentControlHealthEnabled();
689815
}
690816

691817
public void CollectMetrics()
@@ -857,5 +983,10 @@ private void ReportIfAwsAccountIdProvided()
857983
ReportSupportabilityCountMetric(MetricNames.SupportabilityAwsAccountIdProvided);
858984
}
859985
}
986+
987+
/// <summary>
/// FOR UNIT TESTING ONLY.
/// Exposes the current value of the internal health-checks-failed flag.
/// </summary>
public bool HealthCheckFailed
{
    get { return _healthChecksFailed; }
}
860991
}
861992
}
Lines changed: 61 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,61 @@
1+
// Copyright 2020 New Relic, Inc. All rights reserved.
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
using System;
5+
using NewRelic.Agent.Core.Utilities;
6+
7+
namespace NewRelic.Agent.Core.AgentHealth
8+
{
9+
/// <summary>
/// Holds the current Agent Control health state of the agent and serializes it to the
/// YAML text that is written to the health check file.
/// </summary>
public class HealthCheck
{
    private const int NanoSecondsPerMillisecond = 1000000;

    // Private gate for synchronizing state changes with serialization. A dedicated object is
    // used rather than locking on the instance so outside code cannot contend on the same monitor.
    private readonly object _syncRoot = new object();

    /// <summary>
    /// Creates a health check whose start time and status time are both the current UTC time.
    /// </summary>
    public HealthCheck()
    {
        // Initialize StatusTime as well as StartTime: a report may be serialized before
        // TrySetHealth is ever called, and an unset DateTime would produce a meaningless
        // (and potentially overflowing) status_time_unix_nano value.
        var now = DateTime.UtcNow;
        StartTime = now;
        StatusTime = now;
    }

    /// <summary>Whether the agent is currently considered healthy.</summary>
    public bool IsHealthy { get; internal set; }

    /// <summary>Human-readable description of the current status.</summary>
    public string Status { get; internal set; }

    /// <summary>Code of the most recently recorded health status.</summary>
    public string LastError { get; internal set; }

    /// <summary>UTC time at which this instance was created.</summary>
    public DateTime StartTime { get; }

    /// <summary>UTC time at which the status was last set.</summary>
    public DateTime StatusTime { get; internal set; }

    /// <summary>Name of the health check file; unique per instance.</summary>
    public string FileName { get; } = "health-" + System.Guid.NewGuid().ToString("N") + ".yml";

    /// <summary>
    /// Set the health status of the agent and refresh the status time.
    /// </summary>
    /// <param name="healthStatus">The healthy flag, status code and status message to record.</param>
    /// <param name="statusParams">
    /// Optional format arguments. When any are supplied, the status message is treated as a
    /// composite format string and these values are substituted into it.
    /// </param>
    public void TrySetHealth((bool IsHealthy, string Code, string Status) healthStatus, params string[] statusParams)
    {
        lock (_syncRoot)
        {
            IsHealthy = healthStatus.IsHealthy;

            // Always recompute the status text: the same code can carry different
            // parameters (e.g. a different HTTP response code) from one call to the next.
            Status = statusParams is { Length: > 0 } ?
                string.Format(healthStatus.Status, statusParams)
                :
                healthStatus.Status;

            LastError = healthStatus.Code;

            StatusTime = DateTime.UtcNow;
        }
    }

    /// <summary>
    /// Serializes the current state as newline-separated "key: value" pairs:
    /// healthy, status, last_error, start_time_unix_nano and status_time_unix_nano.
    /// </summary>
    /// <returns>The health check content to be written to the health check file.</returns>
    public string ToYaml()
    {
        lock (_syncRoot)
        {
            // Timestamps are obtained in milliseconds and scaled up to nanoseconds.
            return
                $"healthy: {IsHealthy}\nstatus: {Status}\nlast_error: {LastError}\nstart_time_unix_nano: {StartTime.ToUnixTimeMilliseconds() * NanoSecondsPerMillisecond}\nstatus_time_unix_nano: {StatusTime.ToUnixTimeMilliseconds() * NanoSecondsPerMillisecond}";
        }
    }
}
61+
}
Lines changed: 84 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,84 @@
1+
// Copyright 2020 New Relic, Inc. All rights reserved.
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
namespace NewRelic.Agent.Core.AgentHealth
5+
{
6+
/// <summary>
/// Predefined Agent Control health statuses. Each entry is a tuple of a healthy flag,
/// a status code and a status message; some messages are composite format strings.
/// </summary>
public static class HealthCodes
{
    /// <summary>NR-APM-000: the agent is healthy.</summary>
    public static readonly (bool IsHealthy, string Code, string Status) Healthy =
        (true, "NR-APM-000", "Healthy");

    /// <summary>NR-APM-001: invalid license key (HTTP status code 401).</summary>
    public static readonly (bool IsHealthy, string Code, string Status) LicenseKeyInvalid =
        (false, "NR-APM-001", "Invalid license key (HTTP status code 401)");

    /// <summary>NR-APM-002: license key missing in configuration.</summary>
    public static readonly (bool IsHealthy, string Code, string Status) LicenseKeyMissing =
        (false, "NR-APM-002", "License key missing in configuration");

    /// <summary>NR-APM-003: forced disconnect received from New Relic (HTTP status code 410).</summary>
    public static readonly (bool IsHealthy, string Code, string Status) ForceDisconnect =
        (false, "NR-APM-003", "Forced disconnect received from New Relic (HTTP status code 410)");

    /// <summary>
    /// NR-APM-004: HTTP error response received from New Relic while sending data.
    /// Format arguments: {0} = response code, {1} = data type.
    /// </summary>
    public static readonly (bool IsHealthy, string Code, string Status) HttpError =
        (false, "NR-APM-004", "HTTP error response code {0} received from New Relic while sending data type {1}");

    /// <summary>NR-APM-005: missing application name in agent configuration.</summary>
    public static readonly (bool IsHealthy, string Code, string Status) ApplicationNameMissing =
        (false, "NR-APM-005", "Missing application name in agent configuration");

    /// <summary>NR-APM-006: the maximum number of configured app names (3) was exceeded.</summary>
    public static readonly (bool IsHealthy, string Code, string Status) MaxApplicationNamesExceeded =
        (false, "NR-APM-006", "The maximum number of configured app names (3) exceeded");

    /// <summary>
    /// NR-APM-007: HTTP proxy configuration error.
    /// Format arguments: {0} = response code.
    /// </summary>
    public static readonly (bool IsHealthy, string Code, string Status) HttpProxyError =
        (false, "NR-APM-007", "HTTP Proxy configuration error; response code {0}");

    /// <summary>NR-APM-008: the agent is disabled via configuration.</summary>
    public static readonly (bool IsHealthy, string Code, string Status) AgentDisabledByConfiguration =
        (false, "NR-APM-008", "Agent is disabled via configuration");

    /// <summary>NR-APM-009: failed to connect to the New Relic data collector.</summary>
    public static readonly (bool IsHealthy, string Code, string Status) FailedToConnect =
        (false, "NR-APM-009", "Failed to connect to New Relic data collector");

    /// <summary>
    /// NR-APM-099: the agent has shut down.
    /// This must only be reported if the agent is "healthy" on shutdown.
    /// If the agent status is not healthy on shutdown, the existing error MUST NOT be overwritten.
    /// </summary>
    public static readonly (bool IsHealthy, string Code, string Status) AgentShutdownHealthy =
        (true, "NR-APM-099", "Agent has shutdown");

    // Agent health codes for the .NET agent are 200-299

    /// <summary>
    /// NR-APM-200: the agent has shut down with an exception.
    /// Format arguments: {0} = exception.
    /// </summary>
    public static readonly (bool IsHealthy, string Code, string Status) AgentShutdownError =
        (false, "NR-APM-200", "Agent has shutdown with exception {0}");
}
84+
}

src/Agent/NewRelic/Agent/Core/AgentHealth/IAgentHealthReporter.cs

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -151,5 +151,7 @@ public interface IAgentHealthReporter : IOutOfBandMetricSource
151151
void ReportLogForwardingEnabledWithFramework(string logFramework);
152152
void ReportByteMetric(string metricName, long totalBytes, long? exclusiveBytes = null);
153153
void ReportLoggingEventsEmpty(int count = 1);
154+
void SetAgentControlStatus((bool IsHealthy, string Code, string Status) healthStatus, params string[] statusParams);
155+
void PublishAgentControlHealthCheck();
154156
}
155157
}

0 commit comments

Comments
 (0)