Skip to content

Commit bfb4f76

Browse files
authored
[fm] Toggle to disable fm_analysis background task (#10360)
Alternative to #10358 Adds an opt-in `fm.analysis_enabled` flag which defaults to false. Yet another possible fix for #10348 To re-enable: toggle `default_fm_analysis_enabled` to return `true` instead of `false`.
1 parent 391d3e7 commit bfb4f76

8 files changed

Lines changed: 55 additions & 2 deletions

File tree

dev-tools/omdb/src/bin/omdb/nexus.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3514,6 +3514,13 @@ fn print_task_fm_analysis(details: &serde_json::Value) {
35143514
println!(" FAULT MANAGEMENT ANALYSIS SUMMARY");
35153515
println!(" =================================");
35163516
let (prep_status, analysis_status) = match outcome {
3517+
Outcome::Disabled => {
3518+
println!(
3519+
" fault management analysis explicitly disabled \
3520+
by config!"
3521+
);
3522+
return;
3523+
}
35173524
Outcome::WaitingForInventory => {
35183525
println!(
35193526
" analysis was not performed, as the inventory has\n \

nexus-config/src/nexus_config.rs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -968,9 +968,18 @@ impl Default for MulticastGroupReconcilerConfig {
968968
}
969969
}
970970

971+
/// Default for [`FmTasksConfig::analysis_enabled`].
972+
fn default_fm_analysis_enabled() -> bool {
973+
// TODO(#10349): Flip to true
974+
false
975+
}
976+
971977
#[serde_as]
972978
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
973979
pub struct FmTasksConfig {
980+
/// whether the fault management analysis background task runs.
981+
#[serde(default = "default_fm_analysis_enabled")]
982+
pub analysis_enabled: bool,
974983
/// period (in seconds) for periodic activations of the background task that
975984
/// drives fault management analysis.
976985
#[serde_as(as = "DurationSeconds<u64>")]
@@ -993,6 +1002,7 @@ pub struct FmTasksConfig {
9931002
impl Default for FmTasksConfig {
9941003
fn default() -> Self {
9951004
Self {
1005+
analysis_enabled: default_fm_analysis_enabled(),
9961006
// Analysis is generally triggered by changes in the current sitrep,
9971007
// inventory, or by the ereport ingester(s), so it need not be
9981008
// periodically activated all that frequently.
@@ -1575,6 +1585,7 @@ mod test {
15751585
disable: false,
15761586
},
15771587
fm: FmTasksConfig {
1588+
analysis_enabled: default_fm_analysis_enabled(),
15781589
analysis_period_secs: Duration::from_secs(52),
15791590
sitrep_load_period_secs: Duration::from_secs(48),
15801591
sitrep_gc_period_secs: Duration::from_secs(49),

nexus/examples/config-second.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,7 @@ sp_ereport_ingester.period_secs = 30
174174
# Nexus).
175175
# This is cheap, so we should check frequently.
176176
fm.sitrep_load_period_secs = 15
177+
fm.analysis_enabled = true
177178
# How frequently to run analysis from the current sitrep.
178179
fm.analysis_period_secs = 120
179180
# Sitrep GC, on the other hand, does not need to be activated very frequently,

nexus/examples/config.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,7 @@ sp_ereport_ingester.period_secs = 30
158158
# Nexus).
159159
# This is cheap, so we should check frequently.
160160
fm.sitrep_load_period_secs = 15
161+
fm.analysis_enabled = true
161162
# How frequently to run analysis from the current sitrep.
162163
fm.analysis_period_secs = 120
163164
# Sitrep GC, on the other hand, does not need to be activated very frequently,

nexus/src/app/background/init.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1150,6 +1150,7 @@ impl BackgroundTasksInitializer {
11501150
sitrep_gc: task_fm_sitrep_gc.clone(),
11511151
},
11521152
nexus_id,
1153+
config.fm.analysis_enabled,
11531154
);
11541155
driver.register(TaskDefinition {
11551156
name: "fm_analysis",

nexus/src/app/background/tasks/fm_analysis.rs

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ pub struct FmAnalysis {
3131
inv_rx: watch::Receiver<Option<Arc<inventory::Collection>>>,
3232
activators: Activators,
3333
nexus_id: OmicronZoneUuid,
34+
analysis_enabled: bool,
3435
}
3536

3637
/// This is just because I don't like it when a constructor takes multiple
@@ -48,7 +49,19 @@ impl BackgroundTask for FmAnalysis {
4849
opctx: &'a OpContext,
4950
) -> BoxFuture<'a, serde_json::Value> {
5051
Box::pin(async {
51-
let status = self.actually_activate(opctx).await;
52+
let status = if self.analysis_enabled {
53+
self.actually_activate(opctx).await
54+
} else {
55+
slog::info!(
56+
opctx.log,
57+
"fault management analysis explicitly disabled by config",
58+
);
59+
FmAnalysisStatus {
60+
parent_sitrep_id: None,
61+
inv_collection_id: None,
62+
outcome: status::Outcome::Disabled,
63+
}
64+
};
5265
match serde_json::to_value(status) {
5366
Ok(val) => val,
5467
Err(err) => {
@@ -70,8 +83,16 @@ impl FmAnalysis {
7083
inv_rx: watch::Receiver<Option<Arc<inventory::Collection>>>,
7184
activators: Activators,
7285
nexus_id: OmicronZoneUuid,
86+
analysis_enabled: bool,
7387
) -> Self {
74-
Self { datastore, sitrep_rx, inv_rx, activators, nexus_id }
88+
Self {
89+
datastore,
90+
sitrep_rx,
91+
inv_rx,
92+
activators,
93+
nexus_id,
94+
analysis_enabled,
95+
}
7596
}
7697

7798
async fn actually_activate(
@@ -349,6 +370,8 @@ mod tests {
349370
use omicron_test_utils::dev;
350371
use omicron_uuid_kinds::SitrepUuid;
351372

373+
const ANALYSIS_ENABLED: bool = true;
374+
352375
fn activators() -> Activators {
353376
let a = Activators {
354377
inventory_loader: Activator::new(),
@@ -432,6 +455,7 @@ mod tests {
432455
inv_rx,
433456
activators(),
434457
OmicronZoneUuid::new_v4(),
458+
ANALYSIS_ENABLED,
435459
);
436460

437461
let result = task.actually_activate(opctx).await;
@@ -464,6 +488,7 @@ mod tests {
464488
inv_rx,
465489
activators(),
466490
OmicronZoneUuid::new_v4(),
491+
ANALYSIS_ENABLED,
467492
);
468493

469494
let result = task.actually_activate(opctx).await;
@@ -489,6 +514,7 @@ mod tests {
489514
inv_rx,
490515
activators(),
491516
OmicronZoneUuid::new_v4(),
517+
ANALYSIS_ENABLED,
492518
);
493519

494520
let result = task.actually_activate(opctx).await;
@@ -518,6 +544,7 @@ mod tests {
518544
inv_rx,
519545
activators(),
520546
OmicronZoneUuid::new_v4(),
547+
ANALYSIS_ENABLED,
521548
);
522549

523550
let result = task.actually_activate(opctx).await;
@@ -547,6 +574,7 @@ mod tests {
547574
inv_rx,
548575
activators(),
549576
OmicronZoneUuid::new_v4(),
577+
ANALYSIS_ENABLED,
550578
);
551579

552580
let result = task.actually_activate(opctx).await;

nexus/tests/config.test.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,7 @@ sp_ereport_ingester.disable = true
198198
# How frequently to check for a new fault management sitrep (made by any Nexus).
199199
# This is cheap, so we should check frequently.
200200
fm.sitrep_load_period_secs = 15
201+
fm.analysis_enabled = true
201202
# How frequently to run analysis from the current sitrep.
202203
fm.analysis_period_secs = 120
203204
# Sitrep GC, on the other hand, does not need to be activated very frequently,

nexus/types/src/internal_api/background.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -932,6 +932,9 @@ pub mod fm_analysis {
932932
#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)]
933933
#[allow(clippy::large_enum_variant)]
934934
pub enum Outcome {
935+
/// The task is disabled by config.
936+
Disabled,
937+
935938
/// Fault management analysis was not performed, as no inventory
936939
/// collection has been loaded.
937940
WaitingForInventory,

0 commit comments

Comments
 (0)