Skip to content

Commit ecae74e

Browse files
authored
feat(logs): improve health related logs [NR-362022] (#1028)
1 parent 5ce853d commit ecae74e

File tree

2 files changed

+26
-12
lines changed

2 files changed

+26
-12
lines changed

agent-control/src/sub_agent/event_handler/on_health.rs

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@ use crate::sub_agent::error::SubAgentError;
55
use crate::sub_agent::health::with_start_time::HealthWithStartTime;
66
use opamp_client::operation::callbacks::Callbacks;
77
use opamp_client::StartedClient;
8-
use tracing::{debug, warn};
98

109
pub fn on_health<C, CB>(
1110
health: HealthWithStartTime,
@@ -18,13 +17,6 @@ where
1817
C: StartedClient<CB>,
1918
CB: Callbacks,
2019
{
21-
if health.is_healthy() {
22-
debug!(select_arm = "sub_agent_internal_consumer", "HealthyAgent");
23-
} else {
24-
debug!(select_arm = "sub_agent_internal_consumer", "UnhealthyAgent");
25-
warn!(%agent_id, "sub agent became unhealthy!");
26-
}
27-
2820
if let Some(client) = maybe_opamp_client.as_ref() {
2921
client.set_health(health.clone().into())?;
3022
}

agent-control/src/sub_agent/sub_agent.rs

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,10 @@ use std::marker::PhantomData;
2424
use std::thread;
2525
use std::thread::JoinHandle;
2626
use std::time::SystemTime;
27-
use tracing::{debug, error};
27+
use tracing::{debug, error, info, warn};
2828

2929
use super::error::SubAgentStopError;
30+
use super::health::health_checker::Health;
3031

3132
pub(crate) type SubAgentCallbacks<C> = AgentCallbacks<C>;
3233

@@ -141,6 +142,9 @@ where
141142
thread::spawn(move || {
142143
let mut supervisor = self.assemble_and_start_supervisor();
143144

145+
// Stores the current healthy state for logging purposes.
146+
let mut is_healthy = false;
147+
144148
debug!(
145149
agent_id = %self.agent_id,
146150
"runtime started"
@@ -208,18 +212,21 @@ where
208212
break;
209213
},
210214
Ok(SubAgentInternalEvent::AgentHealthInfo(health))=>{
215+
debug!(select_arm = "sub_agent_internal_consumer", ?health, "AgentHealthInfo");
216+
Self::log_health_info(&self.agent_id, is_healthy, health.clone().into());
211217
let _ = on_health(
212-
health,
218+
health.clone(),
213219
self.maybe_opamp_client.as_ref(),
214220
self.sub_agent_publisher.clone(),
215221
self.agent_id.clone(),
216222
self.agent_cfg.agent_type.clone(),
217223
)
218224
.inspect_err(|e| error!(error = %e, select_arm = "sub_agent_internal_consumer", "processing health message"));
225+
is_healthy = health.is_healthy()
219226
}
220-
Ok(SubAgentInternalEvent::AgentVersionInfo(agenta_data)) => {
227+
Ok(SubAgentInternalEvent::AgentVersionInfo(agent_data)) => {
221228
let _ = on_version(
222-
agenta_data,
229+
agent_data,
223230
self.maybe_opamp_client.as_ref(),
224231
)
225232
.inspect_err(|e| error!(error = %e, select_arm = "sub_agent_internal_consumer", "processing version message"));
@@ -233,6 +240,21 @@ where
233240
})
234241
}
235242

243+
fn log_health_info(agent_id: &AgentID, was_healthy: bool, health: Health) {
244+
match health {
245+
// From unhealthy (or initial) to healthy
246+
Health::Healthy(_) => {
247+
if !was_healthy {
248+
info!(%agent_id, "agent is healthy");
249+
}
250+
}
251+
// Every time health is unhealthy
252+
Health::Unhealthy(unhealthy) => {
253+
warn!(%agent_id, status=unhealthy.status(), last_error=unhealthy.last_error(), "agent is unhealthy");
254+
}
255+
}
256+
}
257+
236258
pub(crate) fn start_supervisor(
237259
&self,
238260
not_started_supervisor: B::SupervisorStarter,

0 commit comments

Comments
 (0)