Skip to content

Commit 26eaad2

Browse files
committed
feat(k8s): add installation health check
1 parent 12cbb36 commit 26eaad2

File tree

5 files changed

+284
-30
lines changed

5 files changed

+284
-30
lines changed

agent-control/src/cli/errors.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@ pub enum CliError {
1212

1313
#[error("failed to apply resource: {0}")]
1414
ApplyResource(String),
15+
16+
#[error("installation check failure: {0}")]
17+
InstallationCheck(String),
1518
}
1619

1720
impl CliError {
@@ -26,7 +29,7 @@ impl CliError {
2629
match self {
2730
CliError::K8sClient(_) => ExitCode::from(69),
2831
CliError::Tracing(_) => ExitCode::from(70),
29-
CliError::ApplyResource(_) => ExitCode::from(1),
32+
CliError::ApplyResource(_) | CliError::InstallationCheck(_) => ExitCode::from(1),
3033
}
3134
}
3235
}

agent-control/src/cli/install_agent_control.rs

Lines changed: 61 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,21 @@ use crate::agent_control::config::{helmrelease_v2_type_meta, helmrepository_type
22
use crate::cli::errors::CliError;
33
use crate::cli::utils::parse_key_value_pairs;
44
use crate::k8s::annotations::Annotations;
5+
#[cfg_attr(test, mockall_double::double)]
56
use crate::k8s::client::SyncK8sClient;
67
use crate::k8s::labels::Labels;
8+
use crate::sub_agent::health::health_checker::HealthChecker;
9+
use crate::sub_agent::health::k8s::health_checker::SubAgentHealthChecker;
10+
use crate::sub_agent::health::with_start_time::StartTime;
711
use crate::sub_agent::identity::AgentIdentity;
812
use clap::Parser;
913
use kube::{
1014
Resource,
1115
api::{DynamicObject, ObjectMeta},
12-
core::Duration,
1316
};
1417
use std::sync::Arc;
18+
use std::thread::sleep;
19+
use std::time::Duration;
1520
use std::{collections::BTreeMap, str::FromStr};
1621
use tracing::{debug, info};
1722

@@ -20,6 +25,10 @@ const REPOSITORY_URL: &str = "https://helm-charts.newrelic.com";
2025
const FIVE_MINUTES: &str = "5m";
2126
const AC_DEPLOYMENT_CHART_NAME: &str = "agent-control-deployment";
2227

28+
const INSTALLATION_CHECK_DEFAULT_INITIAL_DELAY: Duration = Duration::from_secs(10);
29+
const INSTALLATION_CHECK_DEFAULT_MAX_RETRIES: i32 = 10;
30+
const INSTALLATION_CHECK_DEFAULT_RETRY_INTERVAL: Duration = Duration::from_secs(3);
31+
2332
#[derive(Debug, Parser)]
2433
pub struct AgentControlInstallData {
2534
/// Release name
@@ -50,6 +59,10 @@ pub struct AgentControlInstallData {
5059
/// [k8s labels]: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#syntax-and-character-set
5160
#[arg(long)]
5261
pub extra_labels: Option<String>,
62+
63+
/// Skip the installation check if set
64+
#[arg(long)]
65+
pub skip_installation_check: bool,
5366
}
5467

5568
pub fn install_agent_control(
@@ -58,6 +71,7 @@ pub fn install_agent_control(
5871
) -> Result<(), CliError> {
5972
info!("Installing agent control");
6073

74+
let skip_check = data.skip_installation_check;
6175
let dynamic_objects = Vec::<DynamicObject>::from(data);
6276

6377
let k8s_client = k8s_client(namespace.clone())?;
@@ -66,12 +80,16 @@ pub fn install_agent_control(
6680
// For example, what happens if the user applies a remote configuration with a lower version
6781
// that includes a breaking change?
6882
info!("Applying agent control resources");
69-
for object in dynamic_objects {
70-
apply_resource(&k8s_client, &object)?;
83+
for object in dynamic_objects.iter() {
84+
apply_resource(&k8s_client, object)?;
7185
}
7286
info!("Agent control resources applied successfully");
7387

74-
info!("Agent control installed successfully");
88+
if !skip_check {
89+
info!("Checking Agent control installation");
90+
check_installation(k8s_client, dynamic_objects)?;
91+
info!("Agent control installed successfully");
92+
}
7593

7694
Ok(())
7795
}
@@ -143,7 +161,7 @@ fn helm_repository(
143161
data: serde_json::json!({
144162
"spec": {
145163
"url": REPOSITORY_URL,
146-
"interval": Duration::from_str(FIVE_MINUTES).expect("Hardcoded value should be correct"),
164+
"interval": kube::core::Duration::from_str(FIVE_MINUTES).expect("Hardcoded value should be correct"),
147165
}
148166
}),
149167
}
@@ -155,8 +173,10 @@ fn helm_release(
155173
labels: BTreeMap<String, String>,
156174
annotations: BTreeMap<String, String>,
157175
) -> DynamicObject {
158-
let interval = Duration::from_str(FIVE_MINUTES).expect("Hardcoded value should be correct");
159-
let timeout = Duration::from_str(FIVE_MINUTES).expect("Hardcoded value should be correct");
176+
let interval =
177+
kube::core::Duration::from_str(FIVE_MINUTES).expect("Hardcoded value should be correct");
178+
let timeout =
179+
kube::core::Duration::from_str(FIVE_MINUTES).expect("Hardcoded value should be correct");
160180
let mut data = serde_json::json!({
161181
"spec": {
162182
"interval": interval,
@@ -206,6 +226,39 @@ fn secrets_to_json(secrets: BTreeMap<String, String>) -> serde_json::Value {
206226
serde_json::json!(items)
207227
}
208228

229+
fn check_installation(
230+
k8s_client: SyncK8sClient,
231+
objects: Vec<DynamicObject>,
232+
) -> Result<(), CliError> {
233+
let health_checker =
234+
SubAgentHealthChecker::try_new(Arc::new(k8s_client), Arc::new(objects), StartTime::now())
235+
.map_err(|err| {
236+
CliError::InstallationCheck(format!("could not build health-checker: {err}"))
237+
})?
238+
.ok_or_else(|| {
239+
CliError::InstallationCheck("no resources to check health were found".to_string())
240+
})?;
241+
242+
// An initial delay is needed because the api-server can take a while to actually apply the changes and we could
243+
// perform the health check to previous objects which could lead to false positives.
244+
sleep(INSTALLATION_CHECK_DEFAULT_INITIAL_DELAY);
245+
let format_err = |err| {
246+
format!(
247+
"installation check failed after {INSTALLATION_CHECK_DEFAULT_MAX_RETRIES} attempts: {err}"
248+
)
249+
};
250+
let health = health_checker
251+
.check_health_with_retry(
252+
INSTALLATION_CHECK_DEFAULT_MAX_RETRIES,
253+
INSTALLATION_CHECK_DEFAULT_RETRY_INTERVAL,
254+
)
255+
.map_err(|err| CliError::InstallationCheck(format_err(err.to_string())))?;
256+
if let Some(err) = health.last_error() {
257+
return Err(CliError::InstallationCheck(format_err(err)));
258+
}
259+
Ok(())
260+
}
261+
209262
#[cfg(test)]
210263
mod tests {
211264
use super::*;
@@ -219,6 +272,7 @@ mod tests {
219272
chart_version: VERSION.to_string(),
220273
secrets: None,
221274
extra_labels: None,
275+
skip_installation_check: false,
222276
}
223277
}
224278

agent-control/src/sub_agent/health/health_checker.rs

Lines changed: 122 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@ use crate::sub_agent::health::with_start_time::HealthWithStartTime;
1010
use crate::sub_agent::identity::ID_ATTRIBUTE_NAME;
1111
use crate::sub_agent::supervisor::starter::SupervisorStarterError;
1212
use crate::utils::thread_context::{NotStartedThreadContext, StartedThreadContext};
13-
use std::time::{SystemTime, SystemTimeError};
13+
use std::thread::sleep;
14+
use std::time::{Duration, SystemTime, SystemTimeError};
1415
use tracing::{debug, error, info_span};
1516

1617
const HEALTH_CHECKER_THREAD_NAME: &str = "health_checker";
@@ -211,6 +212,36 @@ pub trait HealthChecker {
211212
/// See OpAMP's [spec](https://github.com/open-telemetry/opamp-spec/blob/main/specification.md#componenthealthstatus)
212213
/// for more details.
213214
fn check_health(&self) -> Result<HealthWithStartTime, HealthCheckerError>;
215+
216+
/// Checks health and perform retries if the result is unhealthy or there was an error executing health check.
217+
/// The retries are performed as specified by provided limit and retry_interval.
218+
fn check_health_with_retry(
219+
&self,
220+
limit: i32,
221+
retry_interval: Duration,
222+
) -> Result<HealthWithStartTime, HealthCheckerError> {
223+
let mut last_health = Err(HealthCheckerError::Generic("initial value".into()));
224+
for attempt in 1..=limit {
225+
debug!("Checking health with retries {attempt}/{limit}");
226+
last_health = self.check_health();
227+
match last_health.as_ref() {
228+
Ok(health) => {
229+
if health.is_healthy() {
230+
debug!("Health check result was healthy");
231+
return last_health;
232+
}
233+
if let Some(err) = health.last_error() {
234+
debug!("Health check result was unhealthy: {err}");
235+
}
236+
}
237+
Err(err) => {
238+
debug!("Failure to check health: {err}");
239+
}
240+
}
241+
sleep(retry_interval);
242+
}
243+
last_health
244+
}
214245
}
215246

216247
pub(crate) fn spawn_health_checker<H>(
@@ -292,6 +323,7 @@ pub mod tests {
292323
use crate::event::channel::pub_sub;
293324

294325
use super::*;
326+
use assert_matches::assert_matches;
295327
use mockall::{Sequence, mock};
296328

297329
impl Default for Healthy {
@@ -358,6 +390,95 @@ pub mod tests {
358390
}
359391
}
360392

393+
#[test]
394+
fn test_health_check_with_retry_success_on_first_attempt() {
395+
let health_checker = MockHealthCheck::new_healthy();
396+
397+
let result = health_checker.check_health_with_retry(3, Duration::from_millis(10));
398+
399+
assert_matches!(result, Ok(health) => {
400+
assert!(health.is_healthy());
401+
});
402+
}
403+
404+
#[test]
405+
fn test_health_check_with_retry_success_after_retries() {
406+
let mut health_checker = MockHealthCheck::new();
407+
let mut seq = Sequence::new();
408+
409+
health_checker
410+
.expect_check_health()
411+
.once()
412+
.in_sequence(&mut seq)
413+
.returning(|| {
414+
Err(HealthCheckerError::Generic(
415+
"error on first attempt".to_string(),
416+
))
417+
});
418+
health_checker
419+
.expect_check_health()
420+
.once()
421+
.in_sequence(&mut seq)
422+
.returning(|| {
423+
Ok(HealthWithStartTime::from_unhealthy(
424+
Unhealthy::new(
425+
"Unhealthy".to_string(),
426+
"Unhealthy on second attempt".to_string(),
427+
),
428+
UNIX_EPOCH,
429+
))
430+
});
431+
health_checker
432+
.expect_check_health()
433+
.once()
434+
.in_sequence(&mut seq)
435+
.returning(|| {
436+
Ok(HealthWithStartTime::from_healthy(
437+
Healthy::default(),
438+
UNIX_EPOCH,
439+
))
440+
});
441+
442+
let result = health_checker.check_health_with_retry(3, Duration::from_millis(10));
443+
444+
assert_matches!(result, Ok(health) => {
445+
assert!(health.is_healthy());
446+
});
447+
}
448+
449+
#[test]
450+
fn test_health_check_with_retry_failure_after_all_attempts() {
451+
let mut health_checker = MockHealthCheck::new();
452+
health_checker
453+
.expect_check_health()
454+
.times(3)
455+
.returning(|| Err(HealthCheckerError::Generic("persistent error".to_string())));
456+
457+
let result = health_checker.check_health_with_retry(3, Duration::from_millis(10));
458+
459+
assert_matches!(result, Err(HealthCheckerError::Generic(s)) => {
460+
assert_eq!(s, "persistent error".to_string());
461+
});
462+
}
463+
464+
#[test]
465+
fn test_health_check_with_retry_unhealthy_result() {
466+
let mut health_checker = MockHealthCheck::new();
467+
health_checker.expect_check_health().times(3).returning(|| {
468+
Ok(HealthWithStartTime::from_unhealthy(
469+
Unhealthy::new("Unhealthy".to_string(), "persistent unhealthy".to_string()),
470+
UNIX_EPOCH,
471+
))
472+
});
473+
474+
let result = health_checker.check_health_with_retry(3, Duration::from_millis(10));
475+
476+
assert_matches!(result, Ok(health) => {
477+
assert!(!health.is_healthy());
478+
assert_eq!(health.last_error().unwrap(), "persistent unhealthy".to_string());
479+
});
480+
}
481+
361482
#[test]
362483
fn test_spawn_health_checker() {
363484
let (health_publisher, health_consumer) = pub_sub();

agent-control/tests/k8s/agent_control_cli/dynamic_objects.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ fn k8s_cli_install_agent_control_creates_resources() {
2727
.arg("chart=podinfo, env=testing, app=ac");
2828
cmd.arg("--secrets")
2929
.arg("secret1=default.yaml,secret2=values.yaml,secret3=fixed.yaml");
30+
cmd.arg("--skip-installation-check"); // Skipping checks because we are merely checking that the resources are created.
3031
cmd.assert().success();
3132

3233
let k8s_client = SyncK8sClient::try_new(runtime.clone(), namespace.clone()).unwrap();

0 commit comments

Comments
 (0)