Skip to content
Merged
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
-- removes hardware_health_report column from machines, replaced by health_override
ALTER TABLE machines
DROP COLUMN hardware_health_report;
6 changes: 6 additions & 0 deletions crates/api-model/src/machine/health_override.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ pub struct HealthReportOverrides {
pub merges: BTreeMap<String, HealthReport>,
}

pub const HARDWARE_HEALTH_OVERRIDE_PREFIX: &str = "hardware-health.";

pub struct MaintenanceOverride {
pub maintenance_reference: String,
pub maintenance_start_time: Option<rpc::Timestamp>,
Expand Down Expand Up @@ -58,4 +60,8 @@ impl HealthReportOverrides {
maintenance_start_time: alert.in_alert_since.map(rpc::Timestamp::from),
})
}

pub fn is_hardware_health_override_source(source: &str) -> bool {
source.starts_with(HARDWARE_HEALTH_OVERRIDE_PREFIX)
}
}
2 changes: 0 additions & 2 deletions crates/api-model/src/machine/json.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,6 @@ pub struct MachineSnapshotPgJson {
pub machine_validation_health_report: HealthReport,
pub site_explorer_health_report: Option<HealthReport>,
pub firmware_autoupdate: Option<bool>,
pub hardware_health_report: Option<HealthReport>,
pub health_report_overrides: Option<HealthReportOverrides>,
pub on_demand_machine_validation_id: Option<uuid::Uuid>,
pub on_demand_machine_validation_request: Option<bool>,
Expand Down Expand Up @@ -185,7 +184,6 @@ impl TryFrom<MachineSnapshotPgJson> for Machine {
manual_firmware_upgrade_completed: value.manual_firmware_upgrade_completed,
dpu_agent_upgrade_requested: value.dpu_agent_upgrade_requested,
dpu_agent_health_report: value.dpu_agent_health_report,
hardware_health_report: value.hardware_health_report,
machine_validation_health_report: value.machine_validation_health_report,
site_explorer_health_report: value.site_explorer_health_report,
health_report_overrides: value.health_report_overrides.unwrap_or_default(),
Expand Down
78 changes: 48 additions & 30 deletions crates/api-model/src/machine/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,31 @@ impl From<ManagedHostStateSnapshotError> for sqlx::Error {
}

impl ManagedHostStateSnapshot {
/// Returns `true` if override report is hw_health, `false` otherwise
fn merge_override_report_with_hw_health(
output: &mut HealthReport,
source: &str,
report: &mut HealthReport,
hardware_health_config: HardwareHealthReportsConfig,
) -> bool {
if HealthReportOverrides::is_hardware_health_override_source(source) {
match hardware_health_config {
HardwareHealthReportsConfig::Disabled => {}
HardwareHealthReportsConfig::MonitorOnly => {
for alert in &mut report.alerts {
alert.classifications.clear();
}
output.merge(report)
}
HardwareHealthReportsConfig::Enabled => output.merge(report),
}
true
} else {
output.merge(report);
false
}
}

/// Returns `Ok` if the Host can be used as an instance
///
/// This requires
Expand Down Expand Up @@ -306,29 +331,7 @@ impl ManagedHostStateSnapshot {
}
};

// Merge hardware health if configured.
use HardwareHealthReportsConfig as HWConf;
match host_health_config.hardware_health_reports {
HWConf::Disabled => {}
HWConf::MonitorOnly => {
// If MonitorOnly, clear all alert classifications.
if let Some(h) = &mut self.host_snapshot.hardware_health_report {
for alert in &mut h.alerts {
alert.classifications.clear();
}
output.merge(h)
}
}
HWConf::Enabled => {
// If hw_health_reports are enabled, then add a heartbeat timeout
// if the report is missing.
merge_or_timeout(
&mut output,
&self.host_snapshot.hardware_health_report,
"hardware-health".to_string(),
);
}
}
let mut has_hardware_health = false;

// Merge DPU's alerts. If DPU alerts should be suppressed, than remove the classification from the
// alert so that metrics won't show a critical issue.
Expand Down Expand Up @@ -363,13 +366,31 @@ impl ManagedHostStateSnapshot {
output.merge(report);
}

for over in snapshot.health_report_overrides.merges.values() {
output.merge(over);
for (source, over) in snapshot.health_report_overrides.merges.iter_mut() {
let merged_hardware = Self::merge_override_report_with_hw_health(
&mut output,
source,
over,
host_health_config.hardware_health_reports,
);
has_hardware_health |= merged_hardware;
}
}

for over in self.host_snapshot.health_report_overrides.merges.values() {
output.merge(over);
for (source, over) in self.host_snapshot.health_report_overrides.merges.iter_mut() {
let merged_hardware = Self::merge_override_report_with_hw_health(
&mut output,
source,
over,
host_health_config.hardware_health_reports,
);
has_hardware_health |= merged_hardware;
}

if host_health_config.hardware_health_reports == HardwareHealthReportsConfig::Enabled
&& !has_hardware_health
{
merge_or_timeout(&mut output, &None, "hardware-health".to_string());
}

if let Some(rack_overrides) = &self.rack_health_overrides {
Expand Down Expand Up @@ -681,9 +702,6 @@ pub struct Machine {
/// Latest health report received by forge-dpu-agent
pub dpu_agent_health_report: Option<HealthReport>,

/// Latest health report received by hardware-health
pub hardware_health_report: Option<HealthReport>,

/// Latest health report generated by validation tests
pub machine_validation_health_report: HealthReport,

Expand Down
14 changes: 0 additions & 14 deletions crates/api/src/api.rs
Original file line number Diff line number Diff line change
Expand Up @@ -448,20 +448,6 @@ impl Forge for Api {
crate::handlers::dpu::record_dpu_network_status(self, request).await
}

async fn record_hardware_health_report(
&self,
request: Request<rpc::HardwareHealthReport>,
) -> Result<Response<()>, Status> {
crate::handlers::health::record_hardware_health_report(self, request).await
}

async fn get_hardware_health_report(
&self,
request: Request<MachineId>,
) -> Result<Response<rpc::OptionalHealthReport>, Status> {
crate::handlers::health::get_hardware_health_report(self, request).await
}

async fn list_health_report_overrides(
&self,
request: Request<MachineId>,
Expand Down
2 changes: 0 additions & 2 deletions crates/api/src/auth/internal_rbac_rules.rs
Original file line number Diff line number Diff line change
Expand Up @@ -130,8 +130,6 @@ impl InternalRBACRules {
vec![ForgeAdminCLI, Agent, Machineatron, SiteAgent],
);
x.perm("RecordDpuNetworkStatus", vec![Agent, Machineatron]);
x.perm("RecordHardwareHealthReport", vec![Health]);
x.perm("GetHardwareHealthReport", vec![]);
x.perm(
"ListHealthReportOverrides",
vec![ForgeAdminCLI, Health, Ssh, SshRs],
Expand Down
57 changes: 0 additions & 57 deletions crates/api/src/handlers/health.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,62 +27,6 @@ use crate::api::Api;
use crate::auth::AuthContext;
use crate::handlers::utils::convert_and_log_machine_id;

pub async fn record_hardware_health_report(
api: &Api,
request: Request<rpc::HardwareHealthReport>,
) -> Result<Response<()>, Status> {
let mut txn = api.txn_begin().await?;
let rpc::HardwareHealthReport { machine_id, report } = request.into_inner();
let machine_id = convert_and_log_machine_id(machine_id.as_ref())?;

let Some(report) = report else {
return Err(CarbideError::MissingArgument("report").into());
};

let host_machine = db::machine::find_one(&mut txn, &machine_id, MachineSearchConfig::default())
.await?
.ok_or_else(|| CarbideError::NotFoundError {
kind: "machine",
id: machine_id.to_string(),
})?;

let mut report = health_report::HealthReport::try_from(report.clone())
.map_err(|e| CarbideError::internal(e.to_string()))?;
report.observed_at = Some(chrono::Utc::now());

// Fix the in_alert times based on the previously stored report
report.update_in_alert_since(host_machine.hardware_health_report.as_ref());
db::machine::update_hardware_health_report(&mut txn, &machine_id, &report).await?;

txn.commit().await?;

Ok(Response::new(()))
}

pub async fn get_hardware_health_report(
api: &Api,
request: Request<MachineId>,
) -> Result<Response<rpc::OptionalHealthReport>, Status> {
let mut txn = api.txn_begin().await?;

let machine_id = request.into_inner();
let machine_id = convert_and_log_machine_id(Some(&machine_id))?;

let host_machine = db::machine::find_one(&mut txn, &machine_id, MachineSearchConfig::default())
.await?
.ok_or_else(|| CarbideError::NotFoundError {
kind: "machine",
id: machine_id.to_string(),
})?;

txn.commit().await?;

let report = host_machine.hardware_health_report.clone();
Ok(Response::new(::rpc::forge::OptionalHealthReport {
report: report.map(|hr| hr.into()),
}))
}

pub async fn list_health_report_overrides(
api: &Api,
machine_id: Request<MachineId>,
Expand Down Expand Up @@ -224,7 +168,6 @@ pub async fn remove_health_report_override(

let rpc::RemoveHealthReportOverrideRequest { machine_id, source } = request.into_inner();
let machine_id = convert_and_log_machine_id(machine_id.as_ref())?;

remove_by_source(&mut txn, machine_id, source).await?;
txn.commit().await?;

Expand Down
3 changes: 2 additions & 1 deletion crates/api/src/tests/common/api_fixtures/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ use model::instance::config::network::DeviceLocator;
use model::instance::config::nvlink::InstanceNvLinkConfig;
use model::instance::snapshot::InstanceSnapshot;
use model::instance::status::network::InstanceNetworkStatusObservation;
use model::machine::health_override::HARDWARE_HEALTH_OVERRIDE_PREFIX;
use model::machine::{
CleanupState, Machine, MachineState, MachineValidatingState, ManagedHostState, ValidationState,
};
Expand Down Expand Up @@ -371,7 +372,7 @@ pub async fn advance_created_instance_into_state(
super::simulate_hardware_health_report(
env,
&mh.host().id,
health_report::HealthReport::empty("hardware-health".to_string()),
health_report::HealthReport::empty(format!("{HARDWARE_HEALTH_OVERRIDE_PREFIX}health")),
)
.await;

Expand Down
10 changes: 7 additions & 3 deletions crates/api/src/tests/common/api_fixtures/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2171,14 +2171,18 @@ pub async fn simulate_hardware_health_report(
host_machine_id: &MachineId,
health_report: health_report::HealthReport,
) {
use rpc::forge::HardwareHealthReport;
use rpc::forge::forge_server::Forge;
use rpc::forge::{HealthReportOverride, InsertHealthReportOverrideRequest};
use tonic::Request;

let _ = env
.api
.record_hardware_health_report(Request::new(HardwareHealthReport {
.insert_health_report_override(Request::new(InsertHealthReportOverrideRequest {
machine_id: Some(*host_machine_id),
report: Some(health_report.into()),
r#override: Some(HealthReportOverride {
report: Some(health_report.into()),
..Default::default()
}),
}))
.await
.unwrap();
Expand Down
23 changes: 18 additions & 5 deletions crates/api/src/tests/common/api_fixtures/site_explorer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ use futures_util::FutureExt;
use health_report::HealthReport;
use mac_address::MacAddress;
use model::hardware_info::HardwareInfo;
use model::machine::health_override::HARDWARE_HEALTH_OVERRIDE_PREFIX;
use model::machine::{
BomValidating, BomValidatingContext, DpfState, DpuInitState, FailureCause, FailureDetails,
FailureSource, LockdownInfo, LockdownMode, LockdownState, MachineState, MachineValidatingState,
Expand All @@ -42,7 +43,7 @@ use model::site_explorer::EndpointExplorationReport;
use model::switch::switch_id::from_hardware_info as switch_from_hardware_info;
use model::switch::{NewSwitch, SwitchConfig};
use rpc::forge::forge_server::Forge;
use rpc::forge::{self, HardwareHealthReport};
use rpc::forge::{self, HealthReportOverride, InsertHealthReportOverrideRequest};
use rpc::forge_agent_control_response::Action;
use rpc::machine_discovery::AttestKeyInfo;
use rpc::{DiscoveryData, DiscoveryInfo};
Expand Down Expand Up @@ -705,9 +706,15 @@ impl<'a> MockExploredHost<'a> {

self.test_env
.api
.record_hardware_health_report(Request::new(HardwareHealthReport {
.insert_health_report_override(Request::new(InsertHealthReportOverrideRequest {
r#override: Some(HealthReportOverride {
report: Some(
HealthReport::empty(format!("{HARDWARE_HEALTH_OVERRIDE_PREFIX}health"))
.into(),
),
..Default::default()
}),
machine_id: Some(host_machine_id),
report: Some(HealthReport::empty("hardware-health".to_string()).into()),
}))
.await
.expect("Failed to add hardware health report to newly created machine");
Expand Down Expand Up @@ -922,9 +929,15 @@ impl<'a> MockExploredHost<'a> {

self.test_env
.api
.record_hardware_health_report(Request::new(HardwareHealthReport {
.insert_health_report_override(Request::new(InsertHealthReportOverrideRequest {
r#override: Some(HealthReportOverride {
report: Some(
HealthReport::empty(format!("{HARDWARE_HEALTH_OVERRIDE_PREFIX}health"))
.into(),
),
..Default::default()
}),
machine_id: Some(host_machine_id),
report: Some(HealthReport::empty("hardware-health".to_string()).into()),
}))
.await
.expect("Failed to add hardware health report to newly created machine");
Expand Down
8 changes: 4 additions & 4 deletions crates/api/src/tests/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4421,7 +4421,7 @@ async fn test_instance_release_backward_compatibility(_: PgPoolOptions, options:
// When using old API format (no issue, no is_repair_tenant), NO health overrides should be applied
assert_eq!(
host_machine.health_report_overrides.merges.len(),
0,
1, // Single HealthOverride for HardwareHealth
"Backward compatibility test: NO health overrides should be applied when using old API format"
);

Expand Down Expand Up @@ -4725,10 +4725,10 @@ async fn test_instance_release_auto_repair_enabled(_: PgPoolOptions, options: Pg
);

// CRITICAL VERIFICATIONS for auto-repair enabled scenario:
// 1. Should have TWO health overrides (TenantReportedIssue + RequestRepair)
// 1. Should have THREE health overrides (TenantReportedIssue + RequestRepair + Default HardwareHealth)
assert_eq!(
host_machine.health_report_overrides.merges.len(),
2,
3,
"Auto-repair enabled should apply both TenantReportedIssue and RequestRepair overrides"
);

Expand Down Expand Up @@ -4827,7 +4827,7 @@ async fn test_instance_release_repair_tenant_successful_completion(

assert_eq!(
host_machine.health_report_overrides.merges.len(),
2,
3,
"Should have both TenantReportedIssue and RequestRepair after regular tenant release"
);

Expand Down
Loading
Loading