-
Notifications
You must be signed in to change notification settings - Fork 64
feat: use health-override instead of hardware health #518
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 5 commits
d322fe0
f3649a8
e040561
1dbefe2
a808984
9ac42ce
8788ea7
acb3b1b
f7f28d0
4fa786d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,3 @@ | ||
| -- removes hardware_health_report column from machines, replaced by health_override | ||
| ALTER TABLE machines | ||
| DROP COLUMN hardware_health_report; | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -223,6 +223,30 @@ impl From<ManagedHostStateSnapshotError> for sqlx::Error { | |
| } | ||
|
|
||
| impl ManagedHostStateSnapshot { | ||
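| /// Merges `report` into `output`; a hardware-health `source` is merged according to `hardware_health_config`. Returns `true` if `source` is a hardware-health override source (even when the config is `Disabled` and nothing is merged), `false` otherwise | ||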
| fn merge_override_report_with_hw_health( | ||
| output: &mut HealthReport, | ||
| source: &str, | ||
| report: &HealthReport, | ||
| hardware_health_config: HardwareHealthReportsConfig, | ||
| ) -> bool { | ||
| if HealthReportOverrides::is_hardware_health_override_source(source) { | ||
| match hardware_health_config { | ||
| HardwareHealthReportsConfig::Disabled => {} | ||
| HardwareHealthReportsConfig::MonitorOnly => { | ||
| output.merge_with_alert_transform(report, |alert| { | ||
| alert.to_mut().classifications.clear(); | ||
| }); | ||
| } | ||
| HardwareHealthReportsConfig::Enabled => output.merge(report), | ||
| } | ||
| true | ||
| } else { | ||
| output.merge(report); | ||
| false | ||
| } | ||
| } | ||
|
|
||
| /// Returns `Ok` if the Host can be used as an instance | ||
| /// | ||
| /// This requires | ||
|
|
@@ -306,29 +330,7 @@ impl ManagedHostStateSnapshot { | |
| } | ||
| }; | ||
|
|
||
| // Merge hardware health if configured. | ||
| use HardwareHealthReportsConfig as HWConf; | ||
| match host_health_config.hardware_health_reports { | ||
| HWConf::Disabled => {} | ||
| HWConf::MonitorOnly => { | ||
| // If MonitorOnly, clear all alert classifications. | ||
| if let Some(h) = &mut self.host_snapshot.hardware_health_report { | ||
| for alert in &mut h.alerts { | ||
| alert.classifications.clear(); | ||
| } | ||
| output.merge(h) | ||
| } | ||
| } | ||
| HWConf::Enabled => { | ||
| // If hw_health_reports are enabled, then add a heartbeat timeout | ||
| // if the report is missing. | ||
| merge_or_timeout( | ||
| &mut output, | ||
| &self.host_snapshot.hardware_health_report, | ||
| "hardware-health".to_string(), | ||
| ); | ||
| } | ||
| } | ||
| let mut has_hardware_health = false; | ||
|
|
||
| // Merge DPU's alerts. If DPU alerts should be suppressed, then remove the classification from the | ||
| // alert so that metrics won't show a critical issue. | ||
|
|
@@ -363,13 +365,31 @@ impl ManagedHostStateSnapshot { | |
| output.merge(report); | ||
| } | ||
|
|
||
| for over in snapshot.health_report_overrides.merges.values() { | ||
| output.merge(over); | ||
| for (source, over) in snapshot.health_report_overrides.merges.iter() { | ||
| let merged_hardware = Self::merge_override_report_with_hw_health( | ||
| &mut output, | ||
| source, | ||
| over, | ||
| host_health_config.hardware_health_reports, | ||
| ); | ||
| has_hardware_health |= merged_hardware; | ||
| } | ||
| } | ||
|
|
||
| for over in self.host_snapshot.health_report_overrides.merges.values() { | ||
| output.merge(over); | ||
| for (source, over) in self.host_snapshot.health_report_overrides.merges.iter() { | ||
| let merged_hardware = Self::merge_override_report_with_hw_health( | ||
| &mut output, | ||
| source, | ||
| over, | ||
| host_health_config.hardware_health_reports, | ||
| ); | ||
| has_hardware_health |= merged_hardware; | ||
| } | ||
|
|
||
| if host_health_config.hardware_health_reports == HardwareHealthReportsConfig::Enabled | ||
| && !has_hardware_health | ||
| { | ||
| merge_or_timeout(&mut output, &None, "hardware-health".to_string()); | ||
| } | ||
|
|
||
| if let Some(rack_overrides) = &self.rack_health_overrides { | ||
|
|
@@ -681,9 +701,6 @@ pub struct Machine { | |
| /// Latest health report received by forge-dpu-agent | ||
| pub dpu_agent_health_report: Option<HealthReport>, | ||
|
|
||
| /// Latest health report received by hardware-health | ||
| pub hardware_health_report: Option<HealthReport>, | ||
|
|
||
| /// Latest health report generated by validation tests | ||
| pub machine_validation_health_report: HealthReport, | ||
|
|
||
|
|
@@ -1117,6 +1134,9 @@ impl From<Machine> for rpc::forge::Machine { | |
| health_overrides: machine | ||
| .health_report_overrides | ||
| .into_iter() | ||
| .filter(|(hr, _)| { | ||
| !HealthReportOverrides::is_hardware_health_override_source(&hr.source) | ||
| }) | ||
|
||
| .map(|(hr, m)| HealthOverrideOrigin { | ||
| mode: m as i32, | ||
| source: hr.source, | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.