Skip to content

Commit d4c0966

Browse files
authored
fix(machine-validation): skip reboot when disabled (#2488)
## Description NICO currently enters the machine-validation reboot path even when machine validation is disabled, issuing an unnecessary host restart and increasing startup latency. This change: - skips disabled machine validation directly from `RebootHost`, before issuing a host reboot - reuses one completion path for disabled validation from both `RebootHost` and `MachineValidating` - preserves the existing wait when validation is disabled while a reboot is already in flight - preserves the upstream guard for validation runs that are no longer active - makes site-explorer fixtures branch on the observed machine state instead of the validation configuration - adds regression coverage for both disabled-validation paths ## Type of Change <!-- Check one that best describes this PR --> - [ ] **Add** - New feature or capability - [x] **Change** - Changes in existing functionality - [x] **Fix** - Bug fixes - [ ] **Remove** - Removed features or deprecated functionality - [ ] **Internal** - Internal changes (refactoring, tests, docs, etc.) ## Related Issues (Optional) Related to #2486 feat: skip machine validation reboot when validation is disabled ## Breaking Changes - N/A ## Testing <!-- How was this tested? Check all that apply --> - [ ] Unit tests added/updated - [x] Integration tests added/updated - [ ] Manual testing performed - [ ] No testing required (docs, internal refactor, etc.) Validation performed against current `NVIDIA/infra-controller:main`: - `git diff --check` - `cargo fmt --all -- --check` - `cargo test -p carbide-machine-controller` (19 passed) - `cargo test -p carbide-api-core --no-default-features --no-run` ## Additional Notes - N/A Signed-off-by: Clement Liaw <clliaw@nvidia.com>
1 parent 5b698e8 commit d4c0966

3 files changed

Lines changed: 310 additions & 112 deletions

File tree

crates/api-core/src/tests/common/api_fixtures/site_explorer.rs

Lines changed: 85 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -68,8 +68,7 @@ use crate::tests::common::api_fixtures::network_segment::{
6868
};
6969
use crate::tests::common::api_fixtures::{
7070
TestEnv, TestManagedHost, forge_agent_control, get_machine_validation_runs,
71-
machine_validation_completed, persist_machine_validation_result, reboot_completed,
72-
update_machine_validation_run,
71+
machine_validation_completed, persist_machine_validation_result, update_machine_validation_run,
7372
};
7473
use crate::tests::common::rpc_builder::DhcpDiscovery;
7574

@@ -913,6 +912,10 @@ impl<'a> MockExploredHost<'a> {
913912
machine_validation:
914913
MachineValidatingState::MachineValidating { .. },
915914
},
915+
} | ManagedHostState::HostInit {
916+
machine_state: MachineState::Discovered {
917+
skip_reboot_wait: true,
918+
},
916919
}
917920
)
918921
},
@@ -923,30 +926,43 @@ impl<'a> MockExploredHost<'a> {
923926
return self;
924927
}
925928

926-
if self.test_env.config.machine_validation_config.enabled {
929+
let stop_state = if matches!(
930+
stop_state,
931+
ManagedHostState::Validation {
932+
validation_state: ValidationState::MachineValidation {
933+
machine_validation: MachineValidatingState::MachineValidating { .. },
934+
},
935+
}
936+
) {
927937
machine_validation_completed(self.test_env, &host_machine_id, None).await;
928-
} else {
929-
// need to mark as reboot completed to move it to the next state
930-
// even if machine validation is disabled
931-
reboot_completed(self.test_env, host_machine_id).await;
932-
}
933938

934-
let stop_state = self
935-
.test_env
936-
.run_machine_state_controller_iteration_until_state_condition(
937-
&host_machine_id,
938-
10,
939-
|machine| {
940-
machine.current_state() == &expected_state
941-
|| matches!(
942-
*machine.current_state(),
943-
ManagedHostState::HostInit {
944-
machine_state: MachineState::Discovered { .. },
945-
}
946-
)
939+
self.test_env
940+
.run_machine_state_controller_iteration_until_state_condition(
941+
&host_machine_id,
942+
10,
943+
|machine| {
944+
machine.current_state() == &expected_state
945+
|| matches!(
946+
*machine.current_state(),
947+
ManagedHostState::HostInit {
948+
machine_state: MachineState::Discovered { .. },
949+
}
950+
)
951+
},
952+
)
953+
.await
954+
} else if matches!(
955+
stop_state,
956+
ManagedHostState::HostInit {
957+
machine_state: MachineState::Discovered {
958+
skip_reboot_wait: true,
947959
},
948-
)
949-
.await;
960+
}
961+
) {
962+
stop_state
963+
} else {
964+
panic!("Unexpected state while handling machine validation: {stop_state}");
965+
};
950966

951967
if stop_state == expected_state {
952968
return self;
@@ -1046,26 +1062,48 @@ impl<'a> MockExploredHost<'a> {
10461062
)
10471063
.await;
10481064

1049-
self.test_env
1050-
.run_machine_state_controller_iteration_until_state_matches(
1065+
let expected_machine_validating_state = ManagedHostState::Validation {
1066+
validation_state: ValidationState::MachineValidation {
1067+
machine_validation: MachineValidatingState::MachineValidating {
1068+
context: "Discovery".to_string(),
1069+
id: MachineValidationId::new(),
1070+
completed: 1,
1071+
total: 1,
1072+
is_enabled: true,
1073+
},
1074+
},
1075+
};
1076+
let stop_state = self
1077+
.test_env
1078+
.run_machine_state_controller_iteration_until_state_condition(
10511079
&host_machine_id,
10521080
10,
1053-
ManagedHostState::Validation {
1054-
validation_state: ValidationState::MachineValidation {
1055-
machine_validation: MachineValidatingState::MachineValidating {
1056-
context: "Discovery".to_string(),
1057-
id: MachineValidationId::new(),
1058-
completed: 1,
1059-
total: 1,
1060-
is_enabled: self.test_env.config.machine_validation_config.enabled,
1061-
},
1062-
},
1081+
|machine| {
1082+
let expected_machine_validating_state = self
1083+
.test_env
1084+
.fill_machine_information(&expected_machine_validating_state, machine);
1085+
machine.current_state() == &expected_machine_validating_state
1086+
|| matches!(
1087+
*machine.current_state(),
1088+
ManagedHostState::HostInit {
1089+
machine_state: MachineState::Discovered {
1090+
skip_reboot_wait: true,
1091+
},
1092+
}
1093+
)
10631094
},
10641095
)
10651096
.await;
10661097

1067-
let response = forge_agent_control(self.test_env, host_machine_id).await;
1068-
if self.test_env.config.machine_validation_config.enabled {
1098+
if matches!(
1099+
stop_state,
1100+
ManagedHostState::Validation {
1101+
validation_state: ValidationState::MachineValidation {
1102+
machine_validation: MachineValidatingState::MachineValidating { .. },
1103+
},
1104+
}
1105+
) {
1106+
let response = forge_agent_control(self.test_env, host_machine_id).await;
10691107
let uuid = &response.data.unwrap().pair[1].value;
10701108
let validation_id: MachineValidationId = uuid.parse().unwrap();
10711109
let success = update_machine_validation_run(
@@ -1178,29 +1216,23 @@ impl<'a> MockExploredHost<'a> {
11781216
)
11791217
.await;
11801218
}
1181-
} else {
1182-
self.test_env
1183-
.run_machine_state_controller_iteration_until_state_matches(
1184-
&host_machine_id,
1185-
10,
1186-
ManagedHostState::HostInit {
1187-
machine_state: MachineState::Discovered {
1188-
skip_reboot_wait: true,
1189-
},
1190-
},
1191-
)
1192-
.await;
1193-
1194-
// Note: no forge_agent_control/reboot_completed call happens here, since we're skipping
1195-
// machine validation and thus not doing an extra reboot.
1196-
1219+
} else if matches!(
1220+
stop_state,
1221+
ManagedHostState::HostInit {
1222+
machine_state: MachineState::Discovered {
1223+
skip_reboot_wait: true,
1224+
},
1225+
}
1226+
) {
11971227
self.test_env
11981228
.run_machine_state_controller_iteration_until_state_matches(
11991229
&host_machine_id,
12001230
1,
12011231
ManagedHostState::Ready,
12021232
)
12031233
.await;
1234+
} else {
1235+
panic!("Unexpected state while handling machine validation: {stop_state}");
12041236
}
12051237

12061238
self

0 commit comments

Comments
 (0)