diff --git a/crates/api-db/migrations/20260311022301_delete_hardware_health_health_column.sql b/crates/api-db/migrations/20260311022301_delete_hardware_health_health_column.sql new file mode 100644 index 0000000000..3817bd699d --- /dev/null +++ b/crates/api-db/migrations/20260311022301_delete_hardware_health_health_column.sql @@ -0,0 +1,3 @@ +-- removes hardware_health_report column from machines, replaced by health_override +ALTER TABLE machines +DROP COLUMN hardware_health_report; diff --git a/crates/api-model/src/machine/health_override.rs b/crates/api-model/src/machine/health_override.rs index 9129a2a805..831938a962 100644 --- a/crates/api-model/src/machine/health_override.rs +++ b/crates/api-model/src/machine/health_override.rs @@ -29,6 +29,8 @@ pub struct HealthReportOverrides { pub merges: BTreeMap, } +pub const HARDWARE_HEALTH_OVERRIDE_PREFIX: &str = "hardware-health."; + pub struct MaintenanceOverride { pub maintenance_reference: String, pub maintenance_start_time: Option, @@ -58,4 +60,8 @@ impl HealthReportOverrides { maintenance_start_time: alert.in_alert_since.map(rpc::Timestamp::from), }) } + + pub fn is_hardware_health_override_source(source: &str) -> bool { + source.starts_with(HARDWARE_HEALTH_OVERRIDE_PREFIX) + } } diff --git a/crates/api-model/src/machine/json.rs b/crates/api-model/src/machine/json.rs index a56de3af8d..ac739f0b13 100644 --- a/crates/api-model/src/machine/json.rs +++ b/crates/api-model/src/machine/json.rs @@ -77,7 +77,6 @@ pub struct MachineSnapshotPgJson { pub machine_validation_health_report: HealthReport, pub site_explorer_health_report: Option, pub firmware_autoupdate: Option, - pub hardware_health_report: Option, pub health_report_overrides: Option, pub on_demand_machine_validation_id: Option, pub on_demand_machine_validation_request: Option, @@ -185,7 +184,6 @@ impl TryFrom for Machine { manual_firmware_upgrade_completed: value.manual_firmware_upgrade_completed, dpu_agent_upgrade_requested: value.dpu_agent_upgrade_requested, dpu_agent_health_report: value.dpu_agent_health_report, - hardware_health_report: value.hardware_health_report, machine_validation_health_report: value.machine_validation_health_report, site_explorer_health_report: value.site_explorer_health_report, health_report_overrides: value.health_report_overrides.unwrap_or_default(), diff --git a/crates/api-model/src/machine/mod.rs b/crates/api-model/src/machine/mod.rs index 022fa32533..c108eaec83 100644 --- a/crates/api-model/src/machine/mod.rs +++ b/crates/api-model/src/machine/mod.rs @@ -223,6 +223,31 @@ impl From for sqlx::Error { } impl ManagedHostStateSnapshot { + /// Returns `true` if override report is hw_health, `false` otherwise + fn merge_override_report_with_hw_health( + output: &mut HealthReport, + source: &str, + report: &mut HealthReport, + hardware_health_config: HardwareHealthReportsConfig, + ) -> bool { + if HealthReportOverrides::is_hardware_health_override_source(source) { + match hardware_health_config { + HardwareHealthReportsConfig::Disabled => {} + HardwareHealthReportsConfig::MonitorOnly => { + for alert in &mut report.alerts { + alert.classifications.clear(); + } + output.merge(report) + } + HardwareHealthReportsConfig::Enabled => output.merge(report), + } + true + } else { + output.merge(report); + false + } + } + /// Returns `Ok` if the Host can be used as an instance /// /// This requires @@ -306,29 +331,7 @@ impl ManagedHostStateSnapshot { } }; - // Merge hardware health if configured. - use HardwareHealthReportsConfig as HWConf; - match host_health_config.hardware_health_reports { - HWConf::Disabled => {} - HWConf::MonitorOnly => { - // If MonitorOnly, clear all alert classifications. - if let Some(h) = &mut self.host_snapshot.hardware_health_report { - for alert in &mut h.alerts { - alert.classifications.clear(); - } - output.merge(h) - } - } - HWConf::Enabled => { - // If hw_health_reports are enabled, then add a heartbeat timeout - // if the report is missing. - merge_or_timeout( - &mut output, - &self.host_snapshot.hardware_health_report, - "hardware-health".to_string(), - ); - } - } + let mut has_hardware_health = false; // Merge DPU's alerts. If DPU alerts should be suppressed, than remove the classification from the // alert so that metrics won't show a critical issue. @@ -363,13 +366,31 @@ impl ManagedHostStateSnapshot { output.merge(report); } - for over in snapshot.health_report_overrides.merges.values() { - output.merge(over); + for (source, over) in snapshot.health_report_overrides.merges.iter_mut() { + let merged_hardware = Self::merge_override_report_with_hw_health( + &mut output, + source, + over, + host_health_config.hardware_health_reports, + ); + has_hardware_health |= merged_hardware; } } - for over in self.host_snapshot.health_report_overrides.merges.values() { - output.merge(over); + for (source, over) in self.host_snapshot.health_report_overrides.merges.iter_mut() { + let merged_hardware = Self::merge_override_report_with_hw_health( + &mut output, + source, + over, + host_health_config.hardware_health_reports, + ); + has_hardware_health |= merged_hardware; + } + + if host_health_config.hardware_health_reports == HardwareHealthReportsConfig::Enabled + && !has_hardware_health + { + merge_or_timeout(&mut output, &None, "hardware-health".to_string()); } if let Some(rack_overrides) = &self.rack_health_overrides { @@ -681,9 +702,6 @@ pub struct Machine { /// Latest health report received by forge-dpu-agent pub dpu_agent_health_report: Option, - /// Latest health report received by hardware-health - pub hardware_health_report: Option, - /// Latest health report generated by validation tests pub machine_validation_health_report: HealthReport, diff --git a/crates/api/src/api.rs b/crates/api/src/api.rs index 93ad742409..35818faf22 100644 --- a/crates/api/src/api.rs +++ b/crates/api/src/api.rs @@ -448,20 +448,6 @@ impl Forge for Api { crate::handlers::dpu::record_dpu_network_status(self, request).await } - async fn record_hardware_health_report( - &self, - request: Request, - ) -> Result, Status> { - crate::handlers::health::record_hardware_health_report(self, request).await - } - - async fn get_hardware_health_report( - &self, - request: Request, - ) -> Result, Status> { - crate::handlers::health::get_hardware_health_report(self, request).await - } - async fn list_health_report_overrides( &self, request: Request, diff --git a/crates/api/src/auth/internal_rbac_rules.rs b/crates/api/src/auth/internal_rbac_rules.rs index 59c0f20aef..9179658ce1 100644 --- a/crates/api/src/auth/internal_rbac_rules.rs +++ b/crates/api/src/auth/internal_rbac_rules.rs @@ -130,8 +130,6 @@ impl InternalRBACRules { vec![ForgeAdminCLI, Agent, Machineatron, SiteAgent], ); x.perm("RecordDpuNetworkStatus", vec![Agent, Machineatron]); - x.perm("RecordHardwareHealthReport", vec![Health]); - x.perm("GetHardwareHealthReport", vec![]); x.perm( "ListHealthReportOverrides", vec![ForgeAdminCLI, Health, Ssh, SshRs], diff --git a/crates/api/src/handlers/health.rs b/crates/api/src/handlers/health.rs index 6369a76ebc..c45a572ed2 100644 --- a/crates/api/src/handlers/health.rs +++ b/crates/api/src/handlers/health.rs @@ -27,62 +27,6 @@ use crate::api::Api; use crate::auth::AuthContext; use crate::handlers::utils::convert_and_log_machine_id; -pub async fn record_hardware_health_report( - api: &Api, - request: Request, -) -> Result, Status> { - let mut txn = api.txn_begin().await?; - let rpc::HardwareHealthReport { machine_id, report } = request.into_inner(); - let machine_id = convert_and_log_machine_id(machine_id.as_ref())?; - - let Some(report) = report else { - return Err(CarbideError::MissingArgument("report").into()); - }; - - let host_machine = db::machine::find_one(&mut txn, &machine_id, MachineSearchConfig::default()) - .await? - .ok_or_else(|| CarbideError::NotFoundError { - kind: "machine", - id: machine_id.to_string(), - })?; - - let mut report = health_report::HealthReport::try_from(report.clone()) - .map_err(|e| CarbideError::internal(e.to_string()))?; - report.observed_at = Some(chrono::Utc::now()); - - // Fix the in_alert times based on the previously stored report - report.update_in_alert_since(host_machine.hardware_health_report.as_ref()); - db::machine::update_hardware_health_report(&mut txn, &machine_id, &report).await?; - - txn.commit().await?; - - Ok(Response::new(())) -} - -pub async fn get_hardware_health_report( - api: &Api, - request: Request, -) -> Result, Status> { - let mut txn = api.txn_begin().await?; - - let machine_id = request.into_inner(); - let machine_id = convert_and_log_machine_id(Some(&machine_id))?; - - let host_machine = db::machine::find_one(&mut txn, &machine_id, MachineSearchConfig::default()) - .await? - .ok_or_else(|| CarbideError::NotFoundError { - kind: "machine", - id: machine_id.to_string(), - })?; - - txn.commit().await?; - - let report = host_machine.hardware_health_report.clone(); - Ok(Response::new(::rpc::forge::OptionalHealthReport { - report: report.map(|hr| hr.into()), - })) -} - pub async fn list_health_report_overrides( api: &Api, machine_id: Request, @@ -224,7 +168,6 @@ pub async fn remove_health_report_override( let rpc::RemoveHealthReportOverrideRequest { machine_id, source } = request.into_inner(); let machine_id = convert_and_log_machine_id(machine_id.as_ref())?; - remove_by_source(&mut txn, machine_id, source).await?; txn.commit().await?; diff --git a/crates/api/src/tests/common/api_fixtures/instance.rs b/crates/api/src/tests/common/api_fixtures/instance.rs index 3f8eecbb18..0acb24a0d0 100644 --- a/crates/api/src/tests/common/api_fixtures/instance.rs +++ b/crates/api/src/tests/common/api_fixtures/instance.rs @@ -26,6 +26,7 @@ use model::instance::config::network::DeviceLocator; use model::instance::config::nvlink::InstanceNvLinkConfig; use model::instance::snapshot::InstanceSnapshot; use model::instance::status::network::InstanceNetworkStatusObservation; +use model::machine::health_override::HARDWARE_HEALTH_OVERRIDE_PREFIX; use model::machine::{ CleanupState, Machine, MachineState, MachineValidatingState, ManagedHostState, ValidationState, }; @@ -371,7 +372,7 @@ pub async fn advance_created_instance_into_state( super::simulate_hardware_health_report( env, &mh.host().id, - health_report::HealthReport::empty("hardware-health".to_string()), + health_report::HealthReport::empty(format!("{HARDWARE_HEALTH_OVERRIDE_PREFIX}health")), ) .await; diff --git a/crates/api/src/tests/common/api_fixtures/mod.rs b/crates/api/src/tests/common/api_fixtures/mod.rs index 87d2520d6e..83b872ab1f 100644 --- a/crates/api/src/tests/common/api_fixtures/mod.rs +++ b/crates/api/src/tests/common/api_fixtures/mod.rs @@ -2171,14 +2171,18 @@ pub async fn simulate_hardware_health_report( host_machine_id: &MachineId, health_report: health_report::HealthReport, ) { - use rpc::forge::HardwareHealthReport; use rpc::forge::forge_server::Forge; + use rpc::forge::{HealthReportOverride, InsertHealthReportOverrideRequest}; use tonic::Request; + let _ = env .api - .record_hardware_health_report(Request::new(HardwareHealthReport { + .insert_health_report_override(Request::new(InsertHealthReportOverrideRequest { machine_id: Some(*host_machine_id), - report: Some(health_report.into()), + r#override: Some(HealthReportOverride { + report: Some(health_report.into()), + ..Default::default() + }), })) .await .unwrap(); diff --git a/crates/api/src/tests/common/api_fixtures/site_explorer.rs b/crates/api/src/tests/common/api_fixtures/site_explorer.rs index 84da057459..c9d1ba2c92 100644 --- a/crates/api/src/tests/common/api_fixtures/site_explorer.rs +++ b/crates/api/src/tests/common/api_fixtures/site_explorer.rs @@ -30,6 +30,7 @@ use futures_util::FutureExt; use health_report::HealthReport; use mac_address::MacAddress; use model::hardware_info::HardwareInfo; +use model::machine::health_override::HARDWARE_HEALTH_OVERRIDE_PREFIX; use model::machine::{ BomValidating, BomValidatingContext, DpfState, DpuInitState, FailureCause, FailureDetails, FailureSource, LockdownInfo, LockdownMode, LockdownState, MachineState, MachineValidatingState, @@ -42,7 +43,7 @@ use model::site_explorer::EndpointExplorationReport; use model::switch::switch_id::from_hardware_info as switch_from_hardware_info; use model::switch::{NewSwitch, SwitchConfig}; use rpc::forge::forge_server::Forge; -use rpc::forge::{self, HardwareHealthReport}; +use rpc::forge::{self, HealthReportOverride, InsertHealthReportOverrideRequest}; use rpc::forge_agent_control_response::Action; use rpc::machine_discovery::AttestKeyInfo; use rpc::{DiscoveryData, DiscoveryInfo}; @@ -705,9 +706,15 @@ impl<'a> MockExploredHost<'a> { self.test_env .api - .record_hardware_health_report(Request::new(HardwareHealthReport { + .insert_health_report_override(Request::new(InsertHealthReportOverrideRequest { + r#override: Some(HealthReportOverride { + report: Some( + HealthReport::empty(format!("{HARDWARE_HEALTH_OVERRIDE_PREFIX}health")) + .into(), + ), + ..Default::default() + }), machine_id: Some(host_machine_id), - report: Some(HealthReport::empty("hardware-health".to_string()).into()), })) .await .expect("Failed to add hardware health report to newly created machine"); @@ -922,9 +929,15 @@ impl<'a> MockExploredHost<'a> { self.test_env .api - .record_hardware_health_report(Request::new(HardwareHealthReport { + .insert_health_report_override(Request::new(InsertHealthReportOverrideRequest { + r#override: Some(HealthReportOverride { + report: Some( + HealthReport::empty(format!("{HARDWARE_HEALTH_OVERRIDE_PREFIX}health")) + .into(), + ), + ..Default::default() + }), machine_id: Some(host_machine_id), - report: Some(HealthReport::empty("hardware-health".to_string()).into()), })) .await .expect("Failed to add hardware health report to newly created machine"); diff --git a/crates/api/src/tests/instance.rs b/crates/api/src/tests/instance.rs index 4d524c740b..51eb0f7df6 100644 --- a/crates/api/src/tests/instance.rs +++ b/crates/api/src/tests/instance.rs @@ -4421,7 +4421,7 @@ async fn test_instance_release_backward_compatibility(_: PgPoolOptions, options: // When using old API format (no issue, no is_repair_tenant), NO health overrides should be applied assert_eq!( host_machine.health_report_overrides.merges.len(), - 0, + 1, // Single HealthOverride for HardwareHealth "Backward compatibility test: NO health overrides should be applied when using old API format" ); @@ -4725,10 +4725,10 @@ async fn test_instance_release_auto_repair_enabled(_: PgPoolOptions, options: Pg ); // CRITICAL VERIFICATIONS for auto-repair enabled scenario: - // 1. Should have TWO health overrides (TenantReportedIssue + RequestRepair) + // 1. Should have THREE health overrides (TenantReportedIssue + RequestRepair + Default HardwareHealth) assert_eq!( host_machine.health_report_overrides.merges.len(), - 2, + 3, "Auto-repair enabled should apply both TenantReportedIssue and RequestRepair overrides" ); @@ -4827,7 +4827,7 @@ async fn test_instance_release_repair_tenant_successful_completion( assert_eq!( host_machine.health_report_overrides.merges.len(), - 2, + 3, "Should have both TenantReportedIssue and RequestRepair after regular tenant release" ); diff --git a/crates/api/src/tests/machine_health.rs b/crates/api/src/tests/machine_health.rs index 8669b07560..6a84cacb3e 100644 --- a/crates/api/src/tests/machine_health.rs +++ b/crates/api/src/tests/machine_health.rs @@ -20,6 +20,7 @@ use std::str::FromStr; use db::machine::update_dpu_agent_health_report; use db::{self}; use health_report::OverrideMode; +use model::machine::health_override::HARDWARE_HEALTH_OVERRIDE_PREFIX; use model::machine::{HardwareHealthReportsConfig, HostHealthConfig, LoadSnapshotOptions}; use rpc::forge::HealthOverrideOrigin; use rpc::forge::forge_server::Forge; @@ -113,17 +114,27 @@ async fn test_machine_health_reporting( health_report::HealthReport::empty("".to_string()), ); check_reports_equal( - "hardware-health", + &format!("{HARDWARE_HEALTH_OVERRIDE_PREFIX}health"), load_snapshot(&env, &host_machine_id) .await? .host_snapshot - .hardware_health_report - .unwrap(), + .health_report_overrides + .merges + .values() + .next() + .unwrap() + .clone(), health_report::HealthReport::empty("".to_string()), ); let m = find_machine(&env, &host_machine_id).await; - assert_eq!(m.health_overrides, vec![]); + assert_eq!( + m.health_overrides, + vec![HealthOverrideOrigin { + mode: OverrideMode::Merge as i32, + source: format!("{HARDWARE_HEALTH_OVERRIDE_PREFIX}health") + }] + ); let aggregate_health = aggregate(m).unwrap(); assert_eq!(aggregate_health.source, "aggregate-host-health"); check_time(&aggregate_health); @@ -178,17 +189,22 @@ async fn test_hardware_health_reporting( // Hardware health should start empty. check_reports_equal( - "hardware-health", + &format!("{HARDWARE_HEALTH_OVERRIDE_PREFIX}health"), load_snapshot(&env, &host_machine_id) .await? .host_snapshot - .hardware_health_report - .unwrap(), + .health_report_overrides + .merges + .values() + .next() + .unwrap() + .clone(), health_report::HealthReport::empty("".to_string()), ); + let report_name: String = format!("{HARDWARE_HEALTH_OVERRIDE_PREFIX}health"); let report = hr( - "hardware-health", + &report_name, vec![("Fan", Some("TestFan"))], vec![("Failure", Some("Sensor"), "Failure")], ); @@ -197,10 +213,14 @@ async fn test_hardware_health_reporting( let stored_report = load_snapshot(&env, &host_machine_id) .await? .host_snapshot - .hardware_health_report - .unwrap(); + .health_report_overrides + .merges + .values() + .next() + .unwrap() + .clone(); check_time(&stored_report); - check_reports_equal("hardware-health", report, stored_report); + check_reports_equal(&report_name, report, stored_report); Ok(()) } @@ -229,7 +249,7 @@ async fn test_machine_health_aggregation( assert_eq!( override_metrics, vec![ - "{fresh=\"true\",in_use=\"false\",override_type=\"merge\"} 0".to_string(), + "{fresh=\"true\",in_use=\"false\",override_type=\"merge\"} 1".to_string(), "{fresh=\"true\",in_use=\"false\",override_type=\"replace\"} 0".to_string(), "{fresh=\"true\",in_use=\"true\",override_type=\"merge\"} 0".to_string(), "{fresh=\"true\",in_use=\"true\",override_type=\"replace\"} 0".to_string() @@ -257,7 +277,7 @@ async fn test_machine_health_aggregation( // Simulate the same alert as DPU but with a different message and from hardware health. let hardware_health = hr( - "hardware-health", + &format!("{HARDWARE_HEALTH_OVERRIDE_PREFIX}health"), vec![("Fan", Some("TestFan"))], vec![("Failure1", None, "HardwareReason")], ); @@ -274,7 +294,7 @@ async fn test_machine_health_aggregation( hr( "", vec![("Fan", Some("TestFan")), ("Success1", None)], - vec![("Failure1", None, "HardwareReason\nReason1")], + vec![("Failure1", None, "Reason1\nHardwareReason")], ), ); @@ -295,7 +315,7 @@ async fn test_machine_health_aggregation( assert_eq!( override_metrics, vec![ - "{fresh=\"true\",in_use=\"false\",override_type=\"merge\"} 1".to_string(), + "{fresh=\"true\",in_use=\"false\",override_type=\"merge\"} 2".to_string(), "{fresh=\"true\",in_use=\"false\",override_type=\"replace\"} 0".to_string(), "{fresh=\"true\",in_use=\"true\",override_type=\"merge\"} 0".to_string(), "{fresh=\"true\",in_use=\"true\",override_type=\"replace\"} 0".to_string() @@ -305,17 +325,23 @@ async fn test_machine_health_aggregation( let m = find_machine(&env, &host_machine_id).await; assert_eq!( m.health_overrides, - vec![HealthOverrideOrigin { - mode: OverrideMode::Merge as i32, - source: "add-host-failure".to_string() - }] + vec![ + HealthOverrideOrigin { + mode: OverrideMode::Merge as i32, + source: "add-host-failure".to_string() + }, + HealthOverrideOrigin { + mode: OverrideMode::Merge as i32, + source: "hardware-health.health".to_string() + } + ] ); let aggregate_health = aggregate(m).unwrap(); let merged_hr = hr( "", vec![("Success1", None)], vec![ - ("Failure1", None, "HardwareReason\nReason1"), + ("Failure1", None, "Reason1\nHardwareReason"), ("Fan", Some("TestFan"), "Reason"), ], ); @@ -345,7 +371,7 @@ async fn test_machine_health_aggregation( assert_eq!( override_metrics, vec![ - "{fresh=\"true\",in_use=\"false\",override_type=\"merge\"} 1".to_string(), + "{fresh=\"true\",in_use=\"false\",override_type=\"merge\"} 2".to_string(), "{fresh=\"true\",in_use=\"false\",override_type=\"replace\"} 1".to_string(), "{fresh=\"true\",in_use=\"true\",override_type=\"merge\"} 0".to_string(), "{fresh=\"true\",in_use=\"true\",override_type=\"replace\"} 0".to_string() @@ -360,6 +386,10 @@ async fn test_machine_health_aggregation( mode: OverrideMode::Merge as i32, source: "add-host-failure".to_string() }, + HealthOverrideOrigin { + mode: OverrideMode::Merge as i32, + source: "hardware-health.health".to_string() + }, HealthOverrideOrigin { mode: OverrideMode::Replace as i32, source: "replace-host-report".to_string() @@ -515,7 +545,11 @@ async fn test_double_insert(pool: sqlx::PgPool) -> Result<(), Box Result<(), Box TestEnv { /// Creates a health report. fn hr( - source: &'static str, + source: &str, successes: Vec<(&'static str, Option<&'static str>)>, alerts: Vec<(&'static str, Option<&'static str>, &'static str)>, ) -> health_report::HealthReport { @@ -792,7 +832,7 @@ fn check_time(report: &health_report::HealthReport) { /// to have this source and checks that the reports are equal (not considering /// timestamps). fn check_reports_equal( - source: &'static str, + source: &str, reported: health_report::HealthReport, mut expected: health_report::HealthReport, ) { @@ -895,9 +935,9 @@ async fn test_tenant_reported_issue_health_override_template( let machine = find_machine(&env, &host_machine_id).await; // Check that the override was stored - assert_eq!(machine.health_overrides.len(), 1); - assert_eq!(machine.health_overrides[0].mode, OverrideMode::Merge as i32); - assert_eq!(machine.health_overrides[0].source, "tenant-reported-issue"); + assert_eq!(machine.health_overrides.len(), 2); + assert_eq!(machine.health_overrides[1].mode, OverrideMode::Merge as i32); + assert_eq!(machine.health_overrides[1].source, "tenant-reported-issue"); // Verify aggregate health includes the override let aggregate_health = aggregate(machine).unwrap(); @@ -970,9 +1010,9 @@ async fn test_request_repair_health_override_template( let machine = find_machine(&env, &host_machine_id).await; // Check that the override was stored - assert_eq!(machine.health_overrides.len(), 1); - assert_eq!(machine.health_overrides[0].mode, OverrideMode::Merge as i32); - assert_eq!(machine.health_overrides[0].source, "repair-request"); + assert_eq!(machine.health_overrides.len(), 2); + assert_eq!(machine.health_overrides[1].mode, OverrideMode::Merge as i32); + assert_eq!(machine.health_overrides[1].source, "repair-request"); // Verify aggregate health includes the override let aggregate_health = aggregate(machine).unwrap(); @@ -1069,7 +1109,7 @@ async fn test_tenant_reported_issue_and_request_repair_combined( let aggregate_health = aggregate(machine.clone()).unwrap(); // Check that both overrides were stored - assert_eq!(machine.health_overrides.len(), 2); + assert_eq!(machine.health_overrides.len(), 3); let sources: Vec = machine .health_overrides .iter() diff --git a/crates/api/src/tests/machine_states.rs b/crates/api/src/tests/machine_states.rs index 4e823cd031..05509c6514 100644 --- a/crates/api/src/tests/machine_states.rs +++ b/crates/api/src/tests/machine_states.rs @@ -33,12 +33,13 @@ use measured_boot::records::MeasurementBundleState; use measured_boot::report::MeasurementReport; use model::controller_outcome::PersistentStateHandlerOutcome; use model::hardware_info::TpmEkCertificate; +use model::machine::health_override::HARDWARE_HEALTH_OVERRIDE_PREFIX; use model::machine::{ DpuInitState, FailureCause, FailureDetails, FailureSource, LockdownMode, MachineState, MachineValidatingState, ManagedHostState, MeasuringState, ValidationState, }; use rpc::forge::forge_server::Forge; -use rpc::forge::{HardwareHealthReport, TpmCaCert, TpmCaCertId}; +use rpc::forge::{HealthReportOverride, InsertHealthReportOverrideRequest, TpmCaCert, TpmCaCertId}; use rpc::forge_agent_control_response::Action; use tonic::Request; @@ -1233,9 +1234,14 @@ async fn test_measurement_host_init_failed_to_waiting_for_measurements_to_pendin .await; env.api - .record_hardware_health_report(Request::new(HardwareHealthReport { - machine_id: host_machine_id.into(), - report: Some(HealthReport::empty("hardware-health".to_string()).into()), + .insert_health_report_override(Request::new(InsertHealthReportOverrideRequest { + r#override: Some(HealthReportOverride { + report: Some( + HealthReport::empty(format!("{HARDWARE_HEALTH_OVERRIDE_PREFIX}health")).into(), + ), + ..Default::default() + }), + machine_id: Some(host_machine_id), })) .await .expect("Failed to add hardware health report to newly created machine"); diff --git a/crates/api/src/tests/web/managed_host.rs b/crates/api/src/tests/web/managed_host.rs index d2de3f6801..da795dc8da 100644 --- a/crates/api/src/tests/web/managed_host.rs +++ b/crates/api/src/tests/web/managed_host.rs @@ -189,7 +189,7 @@ async fn test_managed_host_row_display(pool: sqlx::PgPool) -> eyre::Result<()> { hardware_info.dmi_data.as_ref().unwrap().product_name ); assert_eq!(row.machine_id, machine_id.to_string()); - assert!(row.health_overrides.is_empty()); + assert!(!row.health_overrides.is_empty()); assert!(row.health_probe_alerts.is_empty()); assert!(!row.host_admin_ip.is_empty()); assert_eq!( diff --git a/crates/api/src/web/health.rs b/crates/api/src/web/health.rs index c7e2f51f49..219ce5773d 100644 --- a/crates/api/src/web/health.rs +++ b/crates/api/src/web/health.rs @@ -24,6 +24,7 @@ use axum::response::{Html, IntoResponse, Response}; use carbide_uuid::machine::{MachineId, MachineType}; use health_report::HealthReport; use hyper::http::StatusCode; +use model::machine::health_override::HealthReportOverrides; use rpc::forge::forge_server::Forge; use rpc::forge::{ InsertHealthReportOverrideRequest, MachinesByIdsRequest, OverrideMode, @@ -129,7 +130,7 @@ pub async fn health( }; let request = tonic::Request::new(machine_id); - let mut overrides = match state + let mut listed_overrides = match state .list_health_report_overrides(request) .await .map(|response| response.into_inner().overrides) @@ -141,6 +142,28 @@ pub async fn health( return (StatusCode::INTERNAL_SERVER_ERROR, Html(err.to_string())).into_response(); } }; + let mut hardware_health: Option = None; + let mut overrides = Vec::new(); + for override_entry in listed_overrides.drain(..) { + let source = override_entry + .report + .as_ref() + .map(|report| report.source.as_str()) + .unwrap_or_default(); + if HealthReportOverrides::is_hardware_health_override_source(source) { + if let Some(report) = override_entry.report { + let report = health_report_from_rpc_convert_invalid(report); + if let Some(aggregated) = hardware_health.as_mut() { + aggregated.merge(&report); + } else { + hardware_health = Some(report); + } + } + continue; + } + overrides.push(override_entry); + } + // Sort by type first and source name second. overrides.sort_by(|a, b| { if a.mode() == OverrideMode::Replace { @@ -160,23 +183,9 @@ pub async fn health( .collect(); let mut component_health = Vec::new(); - - let request = tonic::Request::new(machine_id); - let hw_report = match state - .get_hardware_health_report(request) - .await - .map(|response| response.into_inner().report) - { - Ok(m) => m, - Err(err) if err.code() == tonic::Code::NotFound => None, - Err(err) => { - tracing::error!(%err, %machine_id, "get_hardware_health_report"); - return (StatusCode::INTERNAL_SERVER_ERROR, Html(err.to_string())).into_response(); - } - }; component_health.push(LabeledHealthReport { label: "Hardware Health".to_string(), - report: hw_report.map(health_report_from_rpc_convert_invalid), + report: hardware_health, }); if !associated_dpu_machine_ids.is_empty() { diff --git a/crates/health/benches/sink_pipeline.rs b/crates/health/benches/sink_pipeline.rs index 207affbed9..0a9e767230 100644 --- a/crates/health/benches/sink_pipeline.rs +++ b/crates/health/benches/sink_pipeline.rs @@ -159,7 +159,7 @@ fn bench_composite_sink(c: &mut Criterion) { fn health_report_with_alerts(alert_count: usize) -> HealthReport { let mut report = HealthReport { - source: carbide_health::sink::ReportSource::Health, + source: carbide_health::sink::ReportSource::BmcSensors, observed_at: Some(chrono::Utc::now()), successes: Vec::new(), alerts: Vec::new(), diff --git a/crates/health/src/api_client.rs b/crates/health/src/api_client.rs index b6b66c0b0a..a8ca7ba22c 100644 --- a/crates/health/src/api_client.rs +++ b/crates/health/src/api_client.rs @@ -202,13 +202,18 @@ impl ApiClientWrapper { machine_id: &carbide_uuid::machine::MachineId, report: health_report::HealthReport, ) -> Result<(), HealthError> { - let request = rpc::forge::HardwareHealthReport { - machine_id: Some(*machine_id), + let ovrd = rpc::forge::HealthReportOverride { report: Some(report.into()), + mode: rpc::forge::OverrideMode::Merge.into(), + }; + + let request = rpc::forge::InsertHealthReportOverrideRequest { + machine_id: Some(*machine_id), + r#override: Some(ovrd), }; self.client - .record_hardware_health_report(request) + .insert_health_report_override(request) .await .map_err(HealthError::ApiInvocationError)?; diff --git a/crates/health/src/processor/health_report.rs b/crates/health/src/processor/health_report.rs index 6395f87ac2..fda702d005 100644 --- a/crates/health/src/processor/health_report.rs +++ b/crates/health/src/processor/health_report.rs @@ -203,7 +203,7 @@ impl EventProcessor for HealthReportProcessor { return Vec::new(); }; let report = HealthReport { - source: ReportSource::Health, + source: ReportSource::BmcSensors, observed_at: Some(chrono::Utc::now()), successes: window.successes, alerts: window.alerts, @@ -293,7 +293,7 @@ mod tests { panic!("expected health report event"); }; - assert_eq!(report.source, ReportSource::Health); + assert_eq!(report.source, ReportSource::BmcSensors); assert!(report.successes.is_empty()); assert_eq!(report.alerts.len(), 1); } diff --git a/crates/health/src/processor/leak_events.rs b/crates/health/src/processor/leak_events.rs index da2ffb5f76..39045b62e1 100644 --- a/crates/health/src/processor/leak_events.rs +++ b/crates/health/src/processor/leak_events.rs @@ -151,7 +151,7 @@ mod tests { fn does_not_emit_alert_when_threshold_not_met() { let processor = LeakEventProcessor::new(2); let report = HealthReport { - source: ReportSource::Health, + source: ReportSource::BmcSensors, observed_at: Some(chrono::Utc::now()), successes: Vec::new(), alerts: vec![leak_alert("LeakDetector_Probe")], @@ -173,7 +173,7 @@ mod tests { fn emits_derived_leak_report_when_threshold_met() { let processor = LeakEventProcessor::new(1); let report = HealthReport { - source: ReportSource::Health, + source: ReportSource::BmcSensors, observed_at: Some(chrono::Utc::now()), successes: Vec::new(), alerts: vec![leak_alert("LeakDetector_Probe")], diff --git a/crates/health/src/sink/events.rs b/crates/health/src/sink/events.rs index 005eeebac4..c2bd29c497 100644 --- a/crates/health/src/sink/events.rs +++ b/crates/health/src/sink/events.rs @@ -134,15 +134,15 @@ pub enum CollectorEvent { #[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)] pub enum ReportSource { - Health, + BmcSensors, TrayLeakDetection, } impl ReportSource { pub const fn as_str(self) -> &'static str { match self { - Self::Health => "hardware-health", - Self::TrayLeakDetection => "hardware-tray-leak-detection", + Self::BmcSensors => "bmc-sensors", + Self::TrayLeakDetection => "tray-leak-detection", } } } @@ -239,8 +239,10 @@ impl TryFrom for CarbideHealthReport { type Error = HealthReportConversionError; fn try_from(value: HealthReport) -> Result { + let source = format!("hardware-health.{}", value.source.as_str()); + Ok(Self { - source: value.source.as_str().to_string(), + source, triggered_by: None, observed_at: value.observed_at, successes: value diff --git a/crates/rpc/proto/forge.proto b/crates/rpc/proto/forge.proto index 2e46c01ab4..b067564e27 100644 --- a/crates/rpc/proto/forge.proto +++ b/crates/rpc/proto/forge.proto @@ -124,8 +124,6 @@ service Forge { // forge-dpu-agent -> carbide-api rpc GetManagedHostNetworkConfig(ManagedHostNetworkConfigRequest) returns (ManagedHostNetworkConfigResponse); rpc RecordDpuNetworkStatus(DpuNetworkStatus) returns (google.protobuf.Empty); - rpc RecordHardwareHealthReport(HardwareHealthReport) returns (google.protobuf.Empty); - rpc GetHardwareHealthReport(common.MachineId) returns (OptionalHealthReport); // Lists all overrides that have been placed on a Machine health status rpc ListHealthReportOverrides(common.MachineId) returns (ListHealthReportOverrideResponse); // Adds a new override for a Machines health status