Skip to content
Merged
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
-- removes hardware_health_report column from machines, replaced by health_override
ALTER TABLE machines
DROP COLUMN hardware_health_report;
16 changes: 16 additions & 0 deletions crates/api-model/src/machine/health_override.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ pub struct HealthReportOverrides {
pub merges: BTreeMap<String, HealthReport>,
}

pub const HARDWARE_HEALTH_OVERRIDE_PREFIX: &str = "hardware-health.";

pub struct MaintenanceOverride {
pub maintenance_reference: String,
pub maintenance_start_time: Option<rpc::Timestamp>,
Expand Down Expand Up @@ -58,4 +60,18 @@ impl HealthReportOverrides {
maintenance_start_time: alert.in_alert_since.map(rpc::Timestamp::from),
})
}

pub fn is_hardware_health_override_source(source: &str) -> bool {
source.starts_with(HARDWARE_HEALTH_OVERRIDE_PREFIX)
}

pub fn hardware_health_reports(&self) -> impl Iterator<Item = &HealthReport> {
self.merges.iter().filter_map(|(source, report)| {
if Self::is_hardware_health_override_source(source) {
Some(report)
} else {
None
}
})
}
}
2 changes: 0 additions & 2 deletions crates/api-model/src/machine/json.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,6 @@ pub struct MachineSnapshotPgJson {
pub machine_validation_health_report: HealthReport,
pub site_explorer_health_report: Option<HealthReport>,
pub firmware_autoupdate: Option<bool>,
pub hardware_health_report: Option<HealthReport>,
pub health_report_overrides: Option<HealthReportOverrides>,
pub on_demand_machine_validation_id: Option<uuid::Uuid>,
pub on_demand_machine_validation_request: Option<bool>,
Expand Down Expand Up @@ -185,7 +184,6 @@ impl TryFrom<MachineSnapshotPgJson> for Machine {
manual_firmware_upgrade_completed: value.manual_firmware_upgrade_completed,
dpu_agent_upgrade_requested: value.dpu_agent_upgrade_requested,
dpu_agent_health_report: value.dpu_agent_health_report,
hardware_health_report: value.hardware_health_report,
machine_validation_health_report: value.machine_validation_health_report,
site_explorer_health_report: value.site_explorer_health_report,
health_report_overrides: value.health_report_overrides.unwrap_or_default(),
Expand Down
80 changes: 50 additions & 30 deletions crates/api-model/src/machine/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,30 @@ impl From<ManagedHostStateSnapshotError> for sqlx::Error {
}

impl ManagedHostStateSnapshot {
/// Returns `true` if override report is hw_health, `false` otherwise
fn merge_override_report_with_hw_health(
output: &mut HealthReport,
source: &str,
report: &HealthReport,
hardware_health_config: HardwareHealthReportsConfig,
) -> bool {
if HealthReportOverrides::is_hardware_health_override_source(source) {
match hardware_health_config {
HardwareHealthReportsConfig::Disabled => {}
HardwareHealthReportsConfig::MonitorOnly => {
output.merge_with_alert_transform(report, |alert| {
alert.to_mut().classifications.clear();
});
}
HardwareHealthReportsConfig::Enabled => output.merge(report),
}
true
} else {
output.merge(report);
false
}
}

/// Returns `Ok` if the Host can be used as an instance
///
/// This requires
Expand Down Expand Up @@ -306,29 +330,7 @@ impl ManagedHostStateSnapshot {
}
};

// Merge hardware health if configured.
use HardwareHealthReportsConfig as HWConf;
match host_health_config.hardware_health_reports {
HWConf::Disabled => {}
HWConf::MonitorOnly => {
// If MonitorOnly, clear all alert classifications.
if let Some(h) = &mut self.host_snapshot.hardware_health_report {
for alert in &mut h.alerts {
alert.classifications.clear();
}
output.merge(h)
}
}
HWConf::Enabled => {
// If hw_health_reports are enabled, then add a heartbeat timeout
// if the report is missing.
merge_or_timeout(
&mut output,
&self.host_snapshot.hardware_health_report,
"hardware-health".to_string(),
);
}
}
let mut has_hardware_health = false;

// Merge DPU's alerts. If DPU alerts should be suppressed, than remove the classification from the
// alert so that metrics won't show a critical issue.
Expand Down Expand Up @@ -363,13 +365,31 @@ impl ManagedHostStateSnapshot {
output.merge(report);
}

for over in snapshot.health_report_overrides.merges.values() {
output.merge(over);
for (source, over) in snapshot.health_report_overrides.merges.iter() {
let merged_hardware = Self::merge_override_report_with_hw_health(
&mut output,
source,
over,
host_health_config.hardware_health_reports,
);
has_hardware_health |= merged_hardware;
}
}

for over in self.host_snapshot.health_report_overrides.merges.values() {
output.merge(over);
for (source, over) in self.host_snapshot.health_report_overrides.merges.iter() {
let merged_hardware = Self::merge_override_report_with_hw_health(
&mut output,
source,
over,
host_health_config.hardware_health_reports,
);
has_hardware_health |= merged_hardware;
}

if host_health_config.hardware_health_reports == HardwareHealthReportsConfig::Enabled
&& !has_hardware_health
{
merge_or_timeout(&mut output, &None, "hardware-health".to_string());
}

if let Some(rack_overrides) = &self.rack_health_overrides {
Expand Down Expand Up @@ -681,9 +701,6 @@ pub struct Machine {
/// Latest health report received by forge-dpu-agent
pub dpu_agent_health_report: Option<HealthReport>,

/// Latest health report received by hardware-health
pub hardware_health_report: Option<HealthReport>,

/// Latest health report generated by validation tests
pub machine_validation_health_report: HealthReport,

Expand Down Expand Up @@ -1117,6 +1134,9 @@ impl From<Machine> for rpc::forge::Machine {
health_overrides: machine
.health_report_overrides
.into_iter()
.filter(|(hr, _)| {
!HealthReportOverrides::is_hardware_health_override_source(&hr.source)
})
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why remove the hardware reports?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good question, at fist i assummed overrides here expected to be only one set by operator. But there is no other source of reports after removal of hardware_health_report, so this filter most likely uncessery.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1. I don't understand why it wouldn't be listed.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@yoks Can you clarify?

Copy link
Contributor Author

@yoks yoks Mar 11, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It was a mistake, i added it to keep test behavior the same, but from functional point of view we now using override for hardware_health, so it would not need to be filtered.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok. Just asking becasue the change is still there :)

.map(|(hr, m)| HealthOverrideOrigin {
mode: m as i32,
source: hr.source,
Expand Down
14 changes: 0 additions & 14 deletions crates/api/src/api.rs
Original file line number Diff line number Diff line change
Expand Up @@ -448,20 +448,6 @@ impl Forge for Api {
crate::handlers::dpu::record_dpu_network_status(self, request).await
}

async fn record_hardware_health_report(
&self,
request: Request<rpc::HardwareHealthReport>,
) -> Result<Response<()>, Status> {
crate::handlers::health::record_hardware_health_report(self, request).await
}

async fn get_hardware_health_report(
&self,
request: Request<MachineId>,
) -> Result<Response<rpc::OptionalHealthReport>, Status> {
crate::handlers::health::get_hardware_health_report(self, request).await
}

async fn list_health_report_overrides(
&self,
request: Request<MachineId>,
Expand Down
2 changes: 0 additions & 2 deletions crates/api/src/auth/internal_rbac_rules.rs
Original file line number Diff line number Diff line change
Expand Up @@ -130,8 +130,6 @@ impl InternalRBACRules {
vec![ForgeAdminCLI, Agent, Machineatron, SiteAgent],
);
x.perm("RecordDpuNetworkStatus", vec![Agent, Machineatron]);
x.perm("RecordHardwareHealthReport", vec![Health]);
x.perm("GetHardwareHealthReport", vec![]);
x.perm(
"ListHealthReportOverrides",
vec![ForgeAdminCLI, Health, Ssh, SshRs],
Expand Down
57 changes: 0 additions & 57 deletions crates/api/src/handlers/health.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,62 +27,6 @@ use crate::api::Api;
use crate::auth::AuthContext;
use crate::handlers::utils::convert_and_log_machine_id;

pub async fn record_hardware_health_report(
api: &Api,
request: Request<rpc::HardwareHealthReport>,
) -> Result<Response<()>, Status> {
let mut txn = api.txn_begin().await?;
let rpc::HardwareHealthReport { machine_id, report } = request.into_inner();
let machine_id = convert_and_log_machine_id(machine_id.as_ref())?;

let Some(report) = report else {
return Err(CarbideError::MissingArgument("report").into());
};

let host_machine = db::machine::find_one(&mut txn, &machine_id, MachineSearchConfig::default())
.await?
.ok_or_else(|| CarbideError::NotFoundError {
kind: "machine",
id: machine_id.to_string(),
})?;

let mut report = health_report::HealthReport::try_from(report.clone())
.map_err(|e| CarbideError::internal(e.to_string()))?;
report.observed_at = Some(chrono::Utc::now());

// Fix the in_alert times based on the previously stored report
report.update_in_alert_since(host_machine.hardware_health_report.as_ref());
db::machine::update_hardware_health_report(&mut txn, &machine_id, &report).await?;

txn.commit().await?;

Ok(Response::new(()))
}

pub async fn get_hardware_health_report(
api: &Api,
request: Request<MachineId>,
) -> Result<Response<rpc::OptionalHealthReport>, Status> {
let mut txn = api.txn_begin().await?;

let machine_id = request.into_inner();
let machine_id = convert_and_log_machine_id(Some(&machine_id))?;

let host_machine = db::machine::find_one(&mut txn, &machine_id, MachineSearchConfig::default())
.await?
.ok_or_else(|| CarbideError::NotFoundError {
kind: "machine",
id: machine_id.to_string(),
})?;

txn.commit().await?;

let report = host_machine.hardware_health_report.clone();
Ok(Response::new(::rpc::forge::OptionalHealthReport {
report: report.map(|hr| hr.into()),
}))
}

pub async fn list_health_report_overrides(
api: &Api,
machine_id: Request<MachineId>,
Expand Down Expand Up @@ -224,7 +168,6 @@ pub async fn remove_health_report_override(

let rpc::RemoveHealthReportOverrideRequest { machine_id, source } = request.into_inner();
let machine_id = convert_and_log_machine_id(machine_id.as_ref())?;

remove_by_source(&mut txn, machine_id, source).await?;
txn.commit().await?;

Expand Down
14 changes: 11 additions & 3 deletions crates/api/src/tests/common/api_fixtures/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2171,14 +2171,22 @@ pub async fn simulate_hardware_health_report(
host_machine_id: &MachineId,
health_report: health_report::HealthReport,
) {
use rpc::forge::HardwareHealthReport;
use model::machine::health_override::HARDWARE_HEALTH_OVERRIDE_PREFIX;
use rpc::forge::forge_server::Forge;
use rpc::forge::{HealthReportOverride, InsertHealthReportOverrideRequest};
use tonic::Request;

let source = format!("{HARDWARE_HEALTH_OVERRIDE_PREFIX}.{}", health_report.source);
let mut hw_report = health_report;
hw_report.source = source;
let _ = env
.api
.record_hardware_health_report(Request::new(HardwareHealthReport {
.insert_health_report_override(Request::new(InsertHealthReportOverrideRequest {
machine_id: Some(*host_machine_id),
report: Some(health_report.into()),
r#override: Some(HealthReportOverride {
report: Some(hw_report.into()),
..Default::default()
}),
}))
.await
.unwrap();
Expand Down
16 changes: 11 additions & 5 deletions crates/api/src/tests/common/api_fixtures/site_explorer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ use model::site_explorer::EndpointExplorationReport;
use model::switch::switch_id::from_hardware_info as switch_from_hardware_info;
use model::switch::{NewSwitch, SwitchConfig};
use rpc::forge::forge_server::Forge;
use rpc::forge::{self, HardwareHealthReport};
use rpc::forge::{self, HealthReportOverride, InsertHealthReportOverrideRequest};
use rpc::forge_agent_control_response::Action;
use rpc::machine_discovery::AttestKeyInfo;
use rpc::{DiscoveryData, DiscoveryInfo};
Expand Down Expand Up @@ -705,9 +705,12 @@ impl<'a> MockExploredHost<'a> {

self.test_env
.api
.record_hardware_health_report(Request::new(HardwareHealthReport {
.insert_health_report_override(Request::new(InsertHealthReportOverrideRequest {
r#override: Some(HealthReportOverride {
report: Some(HealthReport::empty("hardware-health".to_string()).into()),
..Default::default()
}),
machine_id: Some(host_machine_id),
report: Some(HealthReport::empty("hardware-health".to_string()).into()),
}))
.await
.expect("Failed to add hardware health report to newly created machine");
Expand Down Expand Up @@ -922,9 +925,12 @@ impl<'a> MockExploredHost<'a> {

self.test_env
.api
.record_hardware_health_report(Request::new(HardwareHealthReport {
.insert_health_report_override(Request::new(InsertHealthReportOverrideRequest {
r#override: Some(HealthReportOverride {
report: Some(HealthReport::empty("hardware-health".to_string()).into()),
..Default::default()
}),
machine_id: Some(host_machine_id),
report: Some(HealthReport::empty("hardware-health".to_string()).into()),
}))
.await
.expect("Failed to add hardware health report to newly created machine");
Expand Down
24 changes: 15 additions & 9 deletions crates/api/src/tests/machine_health.rs
Original file line number Diff line number Diff line change
Expand Up @@ -117,8 +117,11 @@ async fn test_machine_health_reporting(
load_snapshot(&env, &host_machine_id)
.await?
.host_snapshot
.hardware_health_report
.unwrap(),
.health_report_overrides
.hardware_health_reports()
.next()
.unwrap()
.clone(),
health_report::HealthReport::empty("".to_string()),
);

Expand Down Expand Up @@ -177,14 +180,14 @@ async fn test_hardware_health_reporting(
let (host_machine_id, _) = create_managed_host(&env).await.into();

// Hardware health should start empty.
check_reports_equal(
"hardware-health",
assert!(
load_snapshot(&env, &host_machine_id)
.await?
.host_snapshot
.hardware_health_report
.unwrap(),
health_report::HealthReport::empty("".to_string()),
.health_report_overrides
.hardware_health_reports()
.next()
.is_none(),
);

let report = hr(
Expand All @@ -197,8 +200,11 @@ async fn test_hardware_health_reporting(
let stored_report = load_snapshot(&env, &host_machine_id)
.await?
.host_snapshot
.hardware_health_report
.unwrap();
.health_report_overrides
.hardware_health_reports()
.next()
.unwrap()
.clone();
check_time(&stored_report);
check_reports_equal("hardware-health", report, stored_report);

Expand Down
Loading
Loading