From b38d80a59be43cf056befd25dcc374038fa2860e Mon Sep 17 00:00:00 2001 From: brian Date: Mon, 15 Jun 2026 13:16:20 -0400 Subject: [PATCH] fix(api,state-controller): Add cloud-init and state-machine fix for IB-to-ETH link-type flip --- .../api-core/src/tests/dpu_reprovisioning.rs | 572 ++++++++++--- crates/api-model/src/machine/mod.rs | 69 +- crates/api/files/bf.cfg | 58 ++ crates/machine-controller/src/handler.rs | 802 +++++++++++++++--- .../machine-controller/src/handler/helpers.rs | 14 +- crates/redfish/src/libredfish/test_support.rs | 43 +- pxe/templates/user-data | 58 ++ 7 files changed, 1402 insertions(+), 214 deletions(-) diff --git a/crates/api-core/src/tests/dpu_reprovisioning.rs b/crates/api-core/src/tests/dpu_reprovisioning.rs index 653d3a7efb..20c6a1f82a 100644 --- a/crates/api-core/src/tests/dpu_reprovisioning.rs +++ b/crates/api-core/src/tests/dpu_reprovisioning.rs @@ -20,13 +20,18 @@ use std::collections::HashMap; use carbide_machine_controller::handler::MachineStateHandlerBuilder; use carbide_redfish::libredfish::test_support::RedfishSimAction; use chrono::Utc; -use common::api_fixtures::{create_managed_host_multi_dpu, create_test_env, reboot_completed}; -use libredfish::SystemPowerControl; +use common::api_fixtures::{ + create_managed_host_multi_dpu, create_managed_host_with_hardware_info_template, + create_test_env, reboot_completed, +}; +use libredfish::{EnabledDisabled, SystemPowerControl}; use model::instance::status::tenant::TenantState; use model::machine::{ - DpuInitState, FailureDetails, InstallDpuOsState, InstanceState, MachineLastRebootRequestedMode, - MachineState, ManagedHostState, ReprovisionState, + DpuInitState, FailureCause, FailureDetails, FailureSource, InstallDpuOsState, InstanceState, + Machine, MachineLastRebootRequestedMode, MachineState, ManagedHostState, ReprovisionState, + SetBootOrderInfo, SetBootOrderState, StateMachineArea, UnlockHostState, }; +use model::test_support::HardwareInfoTemplate; use 
rpc::forge::MachineArchitecture; use rpc::forge::dpu_reprovisioning_request::Mode; use rpc::forge::forge_server::Forge; @@ -39,9 +44,215 @@ use crate::tests::common::api_fixtures::instance::TestInstance; use crate::tests::common::api_fixtures::rpc_instance::RpcInstance; use crate::tests::common::api_fixtures::test_machine::TestMachineInterface; use crate::tests::common::api_fixtures::{ - TestEnv, TestManagedHost, create_managed_host, forge_agent_control, update_time_params, + TestEnv, TestMachine, TestManagedHost, create_managed_host, forge_agent_control, + update_time_params, }; +const DGX_H100_INFO_JSON: &[u8] = br#"{ + "machine_type": "x86_64", + "dmi_data": { + "product_name": "DGXH100", + "sys_vendor": "NVIDIA" + } +}"#; + +fn reprovision_set_host_boot_order_state( + set_boot_order_state: SetBootOrderState, +) -> ReprovisionState { + ReprovisionState::SetHostBootOrder { + set_boot_order_info: SetBootOrderInfo { + set_boot_order_jid: None, + set_boot_order_state, + retry_count: 0, + }, + } +} + +#[derive(Clone, Copy)] +enum ReprovisionHostBootRepairShape { + SingleDpu, + AssignedSingleDpu, + FirstDpuOnly, + AllDpus, +} + +fn reprovision_host_boot_repair_states( + mh: &TestManagedHost, + shape: ReprovisionHostBootRepairShape, +) -> Vec<ManagedHostState> { + let states = [ + ReprovisionState::PrepareHostBootRepair, + ReprovisionState::UnlockHostForBootRepair { + unlock_host_state: UnlockHostState::DisableLockdown, + }, + ReprovisionState::CheckHostBootConfig, + ReprovisionState::ConfigureHostBoot { retry_count: 0 }, + ReprovisionState::PollingHostBiosSetup { retry_count: 0 }, + reprovision_set_host_boot_order_state(SetBootOrderState::SetBootOrder), + reprovision_set_host_boot_order_state(SetBootOrderState::WaitForSetBootOrderJobScheduled), + reprovision_set_host_boot_order_state(SetBootOrderState::RebootHost), + reprovision_set_host_boot_order_state(SetBootOrderState::WaitForSetBootOrderJobCompletion), + reprovision_set_host_boot_order_state(SetBootOrderState::CheckBootOrder), + 
ReprovisionState::LockHostAfterBootRepair, + ReprovisionState::RebootHostBmc, + ]; + + states + .into_iter() + .map(|state| match shape { + ReprovisionHostBootRepairShape::SingleDpu => mh.new_dpu_reprovision_state(state), + ReprovisionHostBootRepairShape::AssignedSingleDpu => { + mh.new_dpu_assigned_reprovision_state(state) + } + ReprovisionHostBootRepairShape::FirstDpuOnly => { + let not_under_reprovision = ReprovisionState::NotUnderReprovision; + let mut states = vec![&state]; + states.extend((1..mh.dpu_ids.len()).map(|_| &not_under_reprovision)); + mh.new_dpus_reprovision_state(&states) + } + ReprovisionHostBootRepairShape::AllDpus => { + mh.new_dpus_reprovision_state(&vec![&state; mh.dpu_ids.len()]) + } + }) + .collect() +} + +/// Return true when any DPU in a reprovisioning managed-host state matches. +fn has_dpu_reprovision_state( + state: &ManagedHostState, + matches_state: impl FnMut(&ReprovisionState) -> bool, +) -> bool { + match state { + ManagedHostState::DPUReprovision { dpu_states } + | ManagedHostState::Assigned { + instance_state: InstanceState::DPUReprovision { dpu_states }, + } => dpu_states.states.values().any(matches_state), + _ => false, + } +} + +async fn assert_dpu_reprovision_host_boot_repair( + env: &TestEnv, + machine: &TestMachine, + expected_states: Vec<ManagedHostState>, +) -> Machine { + env.redfish_sim.set_lockdown(EnabledDisabled::Enabled); + env.redfish_sim.set_is_boot_order_setup(false); + + let redfish_timepoint = env.redfish_sim.timepoint(); + + // Drive the shared repair path; each expected state should be externally restartable. + for expected_state in expected_states { + let current_machine = machine.next_iteration_machine(env).await; + assert_eq!(current_machine.current_state(), &expected_state); + + // Keep restart available so wedged BIOS/job/boot-order repair can be operator-restarted. 
+ assert!( + current_machine.reprovision_requested.is_some(), + "expected DPU reprovision request to remain present during host boot repair" + ); + + // Disable lockdown so Redfish reflects writable host BIOS/boot state. + if has_dpu_reprovision_state(&expected_state, |state| { + matches!(state, ReprovisionState::CheckHostBootConfig) + }) { + assert!( + env.redfish_sim + .lockdown_states() + .contains(&EnabledDisabled::Disabled), + "expected DPU reprovision host boot repair to disable lockdown before boot config checks" + ); + } + + // Re-enable lockdown so DPU reprovision preserves the host profile's security posture. + if has_dpu_reprovision_state(&expected_state, |state| { + matches!(state, ReprovisionState::RebootHostBmc) + }) { + let lockdown_states = env.redfish_sim.lockdown_states(); + assert!( + !lockdown_states.is_empty() + && lockdown_states + .iter() + .all(|state| *state == EnabledDisabled::Enabled), + "expected DPU reprovision host boot repair to re-enable lockdown before rebooting the host BMC" + ); + } + } + + // machine_setup enables the bootable DPU interface before boot-order promotion. + let actions = env + .redfish_sim + .actions_since(&redfish_timepoint) + .all_hosts(); + let machine_setup_pos = actions + .iter() + .position(|action| matches!(action, RedfishSimAction::MachineSetup { .. })) + .expect("expected DPU reprovision boot repair to call machine_setup"); + let set_boot_order_pos = actions + .iter() + .position(|action| matches!(action, RedfishSimAction::SetBootOrderDpuFirst { .. })) + .expect("expected DPU reprovision boot repair to set DPU-first boot order"); + let check_boot_order_pos = actions + .iter() + .rposition(|action| matches!(action, RedfishSimAction::IsBootOrderSetup { .. 
})) + .expect("expected DPU reprovision boot repair to verify boot order"); + + assert!( + machine_setup_pos < set_boot_order_pos && set_boot_order_pos < check_boot_order_pos, + "expected machine_setup before set_boot_order_dpu_first before is_boot_order_setup; got: {actions:?}" + ); + + let rebooting_machine = machine.next_iteration_machine(env).await; + assert!( + has_dpu_reprovision_state(rebooting_machine.current_state(), |state| { + matches!(state, ReprovisionState::RebootHost) + }), + "expected DPU reprovision host boot repair to transition to RebootHost; got: {:?}", + rebooting_machine.current_state() + ); + assert!( + rebooting_machine.reprovision_requested.is_some(), + "expected DPU reprovision request to remain present until the final host reboot is handled" + ); + + // Clearing the request before RebootHost would make wedged repair work non-restartable. + let final_reboot_timepoint = env.redfish_sim.timepoint(); + let terminal_machine = machine.next_iteration_machine(env).await; + assert!( + terminal_machine.reprovision_requested.is_none(), + "expected DPU reprovision request to be cleared after final host reboot" + ); + assert_eq!( + env.redfish_sim + .actions_since(&final_reboot_timepoint) + .all_hosts(), + vec![RedfishSimAction::Power(SystemPowerControl::ForceRestart)] + ); + + terminal_machine +} + +async fn prepare_dpu_reprovision_host_boot_check( + env: &TestEnv, + mh: &TestManagedHost, +) -> TestMachine { + let dpu_machine = mh.dpu(); + let mut txn = env.pool.begin().await.unwrap(); + db::machine::update_state( + &mut txn, + &mh.id, + &mh.new_dpu_reprovision_state(ReprovisionState::CheckHostBootConfig), + ) + .await + .unwrap(); + db::machine::trigger_dpu_reprovisioning_request(&dpu_machine.id, &mut txn, "AdminCli", true) + .await + .unwrap(); + txn.commit().await.unwrap(); + + dpu_machine +} + #[crate::sqlx_test] async fn test_dpu_for_set_clear_reprovisioning(pool: sqlx::PgPool) { let env = create_test_env(pool).await; @@ -188,8 +399,6 @@ async 
fn test_dpu_for_reprovisioning_with_firmware_upgrade(pool: sqlx::PgPool) { .all_hosts(), vec![RedfishSimAction::Power(SystemPowerControl::On)] ); - let redfish_timepoint = env.redfish_sim.timepoint(); - let pxe = dpu_interface.get_pxe_instructions(dpu_arch).await; assert!( pxe.pxe_script @@ -207,15 +416,15 @@ async fn test_dpu_for_reprovisioning_with_firmware_upgrade(pool: sqlx::PgPool) { ); mh.network_configured(&env).await; - for state in [ - ReprovisionState::RebootHostBmc, - ReprovisionState::RebootHost, - ] { - let dpu = mh.dpu().next_iteration_machine(&env).await; - assert_eq!(dpu.current_state(), &mh.new_dpu_reprovision_state(state)); - } - let dpu = mh.dpu().next_iteration_machine(&env).await; + // Repair host boot setup before the final BMC and host reboot sequence. + let dpu_machine = mh.dpu(); + let dpu = assert_dpu_reprovision_host_boot_repair( + &env, + &dpu_machine, + reprovision_host_boot_repair_states(&mh, ReprovisionHostBootRepairShape::SingleDpu), + ) + .await; assert!(matches!( dpu.current_state(), &ManagedHostState::HostInit { @@ -224,15 +433,134 @@ async fn test_dpu_for_reprovisioning_with_firmware_upgrade(pool: sqlx::PgPool) { )); let _response = mh.host().forge_agent_control().await; - let dpu = mh.dpu().next_iteration_machine(&env).await; + let dpu = dpu_machine.next_iteration_machine(&env).await; assert!(matches!(dpu.current_state(), &ManagedHostState::Ready)); +} - // HostInit::Discovered -> Ready goes through restart +#[crate::sqlx_test] +async fn test_dpu_reprovision_host_boot_repair_runs_machine_setup_when_bios_not_setup( + pool: sqlx::PgPool, +) { + let env = create_test_env(pool).await; + let mh = common::api_fixtures::create_managed_host(&env).await; + let dpu_machine = prepare_dpu_reprovision_host_boot_check(&env, &mh).await; + + env.redfish_sim.set_is_boot_order_setup(true); + env.redfish_sim.set_is_bios_setup(false); + + let redfish_timepoint = env.redfish_sim.timepoint(); + + let dpu = 
dpu_machine.next_iteration_machine(&env).await; assert_eq!( - env.redfish_sim - .actions_since(&redfish_timepoint) - .all_hosts(), - vec![RedfishSimAction::Power(SystemPowerControl::ForceRestart)] + dpu.current_state(), + &mh.new_dpu_reprovision_state(ReprovisionState::ConfigureHostBoot { retry_count: 0 }) + ); + + let dpu = dpu_machine.next_iteration_machine(&env).await; + assert_eq!( + dpu.current_state(), + &mh.new_dpu_reprovision_state(ReprovisionState::PollingHostBiosSetup { retry_count: 0 }) + ); + + let actions = env + .redfish_sim + .actions_since(&redfish_timepoint) + .all_hosts(); + assert!( + actions + .iter() + .any(|action| matches!(action, RedfishSimAction::MachineSetup { .. })), + "expected DPU reprovision host boot repair to run machine_setup when BIOS setup is false; got: {actions:?}" + ); +} + +#[crate::sqlx_test] +async fn test_dpu_reprovision_viking_repairs_bios_before_boot_order_skip(pool: sqlx::PgPool) { + let env = create_test_env(pool).await; + let mh = create_managed_host_with_hardware_info_template( + &env, + HardwareInfoTemplate::Custom(DGX_H100_INFO_JSON), + ) + .await; + let dpu_machine = prepare_dpu_reprovision_host_boot_check(&env, &mh).await; + + env.redfish_sim.set_is_boot_order_setup(false); + env.redfish_sim.set_is_bios_setup(false); + + let redfish_timepoint = env.redfish_sim.timepoint(); + + let dpu = dpu_machine.next_iteration_machine(&env).await; + assert_eq!( + dpu.current_state(), + &mh.new_dpu_reprovision_state(ReprovisionState::ConfigureHostBoot { retry_count: 0 }) + ); + + let dpu = dpu_machine.next_iteration_machine(&env).await; + assert_eq!( + dpu.current_state(), + &mh.new_dpu_reprovision_state(ReprovisionState::PollingHostBiosSetup { retry_count: 0 }) + ); + + env.redfish_sim.set_is_bios_setup(true); + let dpu = dpu_machine.next_iteration_machine(&env).await; + assert_eq!( + dpu.current_state(), + &mh.new_dpu_reprovision_state(ReprovisionState::LockHostAfterBootRepair) + ); + + let actions = env + .redfish_sim + 
.actions_since(&redfish_timepoint) + .all_hosts(); + assert!( + actions + .iter() + .any(|action| matches!(action, RedfishSimAction::MachineSetup { .. })), + "expected DPU reprovision host boot repair to run machine_setup when Viking BIOS setup is false; got: {actions:?}" + ); + assert!( + actions.iter().all(|action| !matches!( + action, + RedfishSimAction::SetBootOrderDpuFirst { .. } + | RedfishSimAction::IsBootOrderSetup { .. } + )), + "expected Viking DPU reprovision host boot repair to skip boot-order remediation after BIOS repair; got: {actions:?}" + ); +} + +#[crate::sqlx_test] +async fn test_dpu_reprovision_viking_skips_boot_order_when_bios_setup(pool: sqlx::PgPool) { + let env = create_test_env(pool).await; + let mh = create_managed_host_with_hardware_info_template( + &env, + HardwareInfoTemplate::Custom(DGX_H100_INFO_JSON), + ) + .await; + let dpu_machine = prepare_dpu_reprovision_host_boot_check(&env, &mh).await; + + env.redfish_sim.set_is_boot_order_setup(false); + env.redfish_sim.set_is_bios_setup(true); + + let redfish_timepoint = env.redfish_sim.timepoint(); + + let dpu = dpu_machine.next_iteration_machine(&env).await; + assert_eq!( + dpu.current_state(), + &mh.new_dpu_reprovision_state(ReprovisionState::LockHostAfterBootRepair) + ); + + let actions = env + .redfish_sim + .actions_since(&redfish_timepoint) + .all_hosts(); + assert!( + actions.iter().all(|action| !matches!( + action, + RedfishSimAction::MachineSetup { .. } + | RedfishSimAction::SetBootOrderDpuFirst { .. } + | RedfishSimAction::IsBootOrderSetup { .. 
} + )), + "expected Viking DPU reprovision host boot repair to skip BIOS and boot-order remediation when BIOS setup is true; got: {actions:?}" ); } @@ -354,15 +682,15 @@ async fn test_dpu_for_reprovisioning_with_no_firmware_upgrade(pool: sqlx::PgPool let _response = mh.dpu().forge_agent_control().await; mh.network_configured(&env).await; - for state in [ - ReprovisionState::RebootHostBmc, - ReprovisionState::RebootHost, - ] { - let dpu = mh.dpu().next_iteration_machine(&env).await; - assert_eq!(dpu.current_state(), &mh.new_dpu_reprovision_state(state)); - } - let dpu = mh.dpu().next_iteration_machine(&env).await; + // Repair host boot setup before the final BMC and host reboot sequence. + let dpu_machine = mh.dpu(); + let dpu = assert_dpu_reprovision_host_boot_repair( + &env, + &dpu_machine, + reprovision_host_boot_repair_states(&mh, ReprovisionHostBootRepairShape::SingleDpu), + ) + .await; assert!(matches!( dpu.current_state(), &ManagedHostState::HostInit { @@ -371,7 +699,7 @@ async fn test_dpu_for_reprovisioning_with_no_firmware_upgrade(pool: sqlx::PgPool )); let _response = mh.host().forge_agent_control().await; - let dpu = mh.dpu().next_iteration_machine(&env).await; + let dpu = dpu_machine.next_iteration_machine(&env).await; assert!(matches!(dpu.current_state(), &ManagedHostState::Ready)); } @@ -563,32 +891,14 @@ async fn instance_reprov_complete( ); mh.network_configured(env).await; - env.run_machine_state_controller_iteration().await; - let mut txn = env.pool.begin().await.unwrap(); - let dpu = mh.dpu().db_machine(&mut txn).await; - txn.commit().await.unwrap(); - assert_eq!( - dpu.current_state(), - &mh.new_dpu_assigned_reprovision_state(ReprovisionState::RebootHostBmc) - ); - - assert_reprov_tenant_state(env, mh, tinstance, TenantState::Updating).await; - - env.run_machine_state_controller_iteration().await; - let mut txn = env.pool.begin().await.unwrap(); - let dpu = mh.dpu().db_machine(&mut txn).await; - txn.commit().await.unwrap(); - assert_eq!( - 
dpu.current_state(), - &mh.new_dpu_assigned_reprovision_state(ReprovisionState::RebootHost), - ); - - assert_reprov_tenant_state(env, mh, tinstance, TenantState::Updating).await; - - env.run_machine_state_controller_iteration().await; - let mut txn = env.pool.begin().await.unwrap(); - let dpu = mh.dpu().db_machine(&mut txn).await; - txn.commit().await.unwrap(); + // Repair host boot setup before returning the assigned host to service. + let dpu_machine = mh.dpu(); + let dpu = assert_dpu_reprovision_host_boot_repair( + env, + &dpu_machine, + reprovision_host_boot_repair_states(mh, ReprovisionHostBootRepairShape::AssignedSingleDpu), + ) + .await; assert!(matches!( dpu.current_state(), &ManagedHostState::Assigned { @@ -782,18 +1092,14 @@ async fn test_instance_reprov_without_firmware_upgrade(pool: sqlx::PgPool) { ); mh.network_configured(&env).await; - for state in [ - ReprovisionState::RebootHostBmc, - ReprovisionState::RebootHost, - ] { - let dpu = mh.dpu().next_iteration_machine(&env).await; - assert_eq!( - dpu.current_state(), - &mh.new_dpu_assigned_reprovision_state(state) - ); - } - - let dpu = mh.dpu().next_iteration_machine(&env).await; + // Repair host boot setup before returning the assigned host to service. 
+ let dpu_machine = mh.dpu(); + let dpu = assert_dpu_reprovision_host_boot_repair( + &env, + &dpu_machine, + reprovision_host_boot_repair_states(&mh, ReprovisionHostBootRepairShape::AssignedSingleDpu), + ) + .await; assert!(matches!( dpu.current_state(), &ManagedHostState::Assigned { @@ -1242,6 +1548,68 @@ async fn test_restart_dpu_reprov(pool: sqlx::PgPool) { ); } +#[crate::sqlx_test] +async fn test_restart_dpu_reprov_unassigned_host_boot_failure(pool: sqlx::PgPool) { + let env = create_test_env(pool).await; + let mh = common::api_fixtures::create_managed_host(&env).await; + let dpu_machine = mh.dpu(); + mh.mark_machine_for_updates().await; + + let failed_at = Utc::now(); + let mut txn = env.pool.begin().await.unwrap(); + db::machine::trigger_dpu_reprovisioning_request(&dpu_machine.id, &mut txn, "AdminCli", true) + .await + .unwrap(); + db::machine::update_dpu_reprovision_explicit_start_time(&dpu_machine.id, failed_at, &mut txn) + .await + .unwrap(); + db::machine::update_state( + &mut txn, + &mh.id, + &ManagedHostState::Failed { + machine_id: mh.id, + retry_count: 0, + details: FailureDetails { + cause: FailureCause::BiosSetupFailed { + err: "host boot repair exhausted retries".to_string(), + }, + failed_at, + source: FailureSource::StateMachineArea(StateMachineArea::MainFlow), + }, + }, + ) + .await + .unwrap(); + txn.commit().await.unwrap(); + + // Restart detection is intentionally gated on a request newer than the failed state. + tokio::time::sleep(std::time::Duration::from_millis(1)).await; + mh.host() + .trigger_dpu_reprovisioning(Mode::Restart, true) + .await; + + // The repair failure preserves the DPU request so operators can restart from top-level Failed. 
+ let redfish_timepoint = env.redfish_sim.timepoint(); + let dpu = dpu_machine.next_iteration_machine(&env).await; + assert_eq!( + dpu.current_state(), + &mh.new_dpu_reprovision_state(ReprovisionState::InstallDpuOs { + substate: InstallDpuOsState::InstallingBFB + }), + ); + assert!( + dpu.reprovision_requested + .as_ref() + .is_some_and(|request| request.started_at.is_some()) + ); + assert_eq!( + env.redfish_sim + .actions_since(&redfish_timepoint) + .all_hosts(), + vec![RedfishSimAction::Power(SystemPowerControl::ForceRestart)] + ); +} + #[crate::sqlx_test] async fn test_dpu_for_reprovisioning_with_firmware_upgrade_multidpu_onedpu_reprov( pool: sqlx::PgPool, @@ -1357,18 +1725,14 @@ async fn test_dpu_for_reprovisioning_with_firmware_upgrade_multidpu_onedpu_repro ); mh.network_configured(&env).await; - for state in [ - ReprovisionState::RebootHostBmc, - ReprovisionState::RebootHost, - ] { - let dpu = mh.dpu_n(0).next_iteration_machine(&env).await; - assert_eq!( - dpu.current_state(), - &mh.new_dpus_reprovision_state(&[&state, &state]) - ); - } - - let dpu = mh.dpu_n(0).next_iteration_machine(&env).await; + // Host boot repair is host-scoped, but reprovision ownership stays limited to the requested DPU. 
+ let dpu_machine = mh.dpu_n(0); + let dpu = assert_dpu_reprovision_host_boot_repair( + &env, + &dpu_machine, + reprovision_host_boot_repair_states(&mh, ReprovisionHostBootRepairShape::FirstDpuOnly), + ) + .await; assert!(matches!( dpu.current_state(), &ManagedHostState::HostInit { @@ -1378,7 +1742,7 @@ async fn test_dpu_for_reprovisioning_with_firmware_upgrade_multidpu_onedpu_repro let _response = mh.host().forge_agent_control().await; - let dpu = mh.dpu_n(0).next_iteration_machine(&env).await; + let dpu = dpu_machine.next_iteration_machine(&env).await; assert!(matches!(dpu.current_state(), &ManagedHostState::Ready)); } @@ -1512,18 +1876,14 @@ async fn test_dpu_for_reprovisioning_with_firmware_upgrade_multidpu_bothdpu(pool ); mh.network_configured(&env).await; - for state in [ - ReprovisionState::RebootHostBmc, - ReprovisionState::RebootHost, - ] { - let dpu = mh.dpu_n(0).next_iteration_machine(&env).await; - assert_eq!( - dpu.current_state(), - &mh.new_dpus_reprovision_state(&[&state, &state]) - ); - } - - let dpu = mh.dpu_n(0).next_iteration_machine(&env).await; + // Repair host boot setup across all reprovisioned DPUs. 
+ let dpu_machine = mh.dpu_n(0); + let dpu = assert_dpu_reprovision_host_boot_repair( + &env, + &dpu_machine, + reprovision_host_boot_repair_states(&mh, ReprovisionHostBootRepairShape::AllDpus), + ) + .await; assert!(matches!( dpu.current_state(), &ManagedHostState::HostInit { @@ -1532,12 +1892,16 @@ async fn test_dpu_for_reprovisioning_with_firmware_upgrade_multidpu_bothdpu(pool )); mh.host().forge_agent_control().await; - let dpu = mh.dpu_n(0).next_iteration_machine(&env).await; + let dpu = dpu_machine.next_iteration_machine(&env).await; assert!(matches!(dpu.current_state(), &ManagedHostState::Ready)); } #[crate::sqlx_test] async fn test_instance_reprov_restart_failed(pool: sqlx::PgPool) { + Box::pin(test_instance_reprov_restart_failed_impl(pool)).await; +} + +async fn test_instance_reprov_restart_failed_impl(pool: sqlx::PgPool) { let env = create_test_env(pool).await; let segment_id = env.create_vpc_and_tenant_segment().await; let mh = create_managed_host(&env).await; @@ -1808,18 +2172,14 @@ async fn test_instance_reprov_restart_failed(pool: sqlx::PgPool) { ); mh.network_configured(&env).await; - for state in [ - ReprovisionState::RebootHostBmc, - ReprovisionState::RebootHost, - ] { - let dpu = mh.dpu().next_iteration_machine(&env).await; - assert_eq!( - dpu.current_state(), - &mh.new_dpu_assigned_reprovision_state(state) - ); - } - - let dpu = mh.dpu().next_iteration_machine(&env).await; + // Repair host boot setup before returning the assigned host to service. 
+ let dpu_machine = mh.dpu(); + let dpu = assert_dpu_reprovision_host_boot_repair( + &env, + &dpu_machine, + reprovision_host_boot_repair_states(&mh, ReprovisionHostBootRepairShape::AssignedSingleDpu), + ) + .await; assert!(matches!( dpu.current_state(), &ManagedHostState::Assigned { diff --git a/crates/api-model/src/machine/mod.rs b/crates/api-model/src/machine/mod.rs index 8576e23fb7..7ee708d220 100644 --- a/crates/api-model/src/machine/mod.rs +++ b/crates/api-model/src/machine/mod.rs @@ -1321,6 +1321,28 @@ pub enum ReprovisionState { BufferTime, VerifyFirmareVersions, WaitingForNetworkConfig, + PrepareHostBootRepair, + UnlockHostForBootRepair { + #[serde(default)] + unlock_host_state: UnlockHostState, + }, + CheckHostBootConfig, + CheckHostBootConfigAfterHostReboot, + ConfigureHostBoot { + #[serde(default)] + retry_count: u32, + }, + WaitingForHostBiosJob { + bios_config_info: BiosConfigInfo, + }, + PollingHostBiosSetup { + #[serde(default)] + retry_count: u32, + }, + SetHostBootOrder { + set_boot_order_info: SetBootOrderInfo, + }, + LockHostAfterBootRepair, RebootHostBmc, RebootHost, NotUnderReprovision, @@ -1797,7 +1819,9 @@ pub enum UefiSetupState { /// `bios_job_id` is `Some` while polling a vendor BIOS job (e.g. Dell). `None` only during /// `HandleBiosJobFailure` recovery from stuck PollingBiosSetup; non-Dell hosts reboot in /// `configure_host_bios` and never enter job-polling substates. -#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)] +/// +/// Derived ordering is used by enclosing reprovision states to report the least advanced DPU. 
+#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, PartialOrd, Ord)] #[serde(rename_all = "lowercase")] pub struct BiosConfigInfo { #[serde(default, skip_serializing_if = "Option::is_none")] @@ -1808,7 +1832,8 @@ pub struct BiosConfigInfo { pub retry_count: u32, } -#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)] +/// Variant order follows BIOS job progression for derived reprovision-state comparisons. +#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, PartialOrd, Ord)] #[serde(tag = "state", rename_all = "lowercase")] pub enum BiosConfigState { WaitForBiosJobScheduled, @@ -1821,7 +1846,8 @@ pub enum BiosConfigState { }, } -#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)] +/// Derived ordering is used by enclosing reprovision states to report the least advanced DPU. +#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, PartialOrd, Ord)] #[serde(rename_all = "lowercase")] pub struct SetBootOrderInfo { #[serde(default, skip_serializing_if = "Option::is_none")] @@ -1832,7 +1858,8 @@ pub struct SetBootOrderInfo { pub retry_count: u32, } -#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)] +/// Variant order follows boot-order job progression for derived reprovision-state comparisons. +#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, PartialOrd, Ord)] #[serde(tag = "state", rename_all = "lowercase")] pub enum SetBootOrderState { SetBootOrder, @@ -2020,7 +2047,8 @@ pub enum HostPlatformConfigurationState { LockHost, } -#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Default)] +/// Variant order follows unlock progression for derived reprovision-state comparisons. +#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, PartialOrd, Ord, Default)] #[serde(tag = "state", rename_all = "lowercase")] pub enum UnlockHostState { #[default] @@ -2705,7 +2733,8 @@ impl<'r> FromRow<'r, PgRow> for MachineInterfaceSnapshot { // TODO: reconcile with site_explorer::PowerState. 
They are almost // identical but here we have Reset enum item. -#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Eq)] +/// Variant order is a deterministic tie-breaker inside derived recovery-state comparisons. +#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] pub enum PowerState { Off, On, @@ -2947,6 +2976,10 @@ mod tests { // variant; the parsed value (PartialEq) is the whole assertion. #[test] fn test_json_deserialize_managed_host_states() { + let machine_id = + MachineId::from_str("fm100ds7blqjsadm2uuh3qqbf1h7k8pmf47um6v9uckrg7l03po8mhqgvng") + .unwrap(); + scenarios!( run = |s| serde_json::from_str::<ManagedHostState>(s).map_err(drop); "assigned booting with discovery image, default retry" { @@ -2965,6 +2998,30 @@ mod tests { }), } + "dpu reprovision host boot configure state" { + r#"{"state":"dpureprovision","dpu_states":{"states":{"fm100ds7blqjsadm2uuh3qqbf1h7k8pmf47um6v9uckrg7l03po8mhqgvng":{"configurehostboot":{"retry_count":2}}}}}"# => Yields(ManagedHostState::DPUReprovision { + dpu_states: DpuReprovisionStates { + states: HashMap::from([( + machine_id, + ReprovisionState::ConfigureHostBoot { retry_count: 2 }, + )]), + }, + }), + } + + "dpu reprovision host boot unlock default state" { + r#"{"state":"dpureprovision","dpu_states":{"states":{"fm100ds7blqjsadm2uuh3qqbf1h7k8pmf47um6v9uckrg7l03po8mhqgvng":{"unlockhostforbootrepair":{}}}}}"# => Yields(ManagedHostState::DPUReprovision { + dpu_states: DpuReprovisionStates { + states: HashMap::from([( + machine_id, + ReprovisionState::UnlockHostForBootRepair { + unlock_host_state: UnlockHostState::DisableLockdown, + }, + )]), + }, + }), + } + "host init polling bios setup, default retry" { r#"{"state":"hostinit","machine_state":{"state":"pollingbiossetup"}}"# => Yields(ManagedHostState::HostInit { machine_state: MachineState::PollingBiosSetup { retry_count: 0 }, diff --git a/crates/api/files/bf.cfg b/crates/api/files/bf.cfg index ac8375b88c..284f9b70d8 100644 --- 
a/crates/api/files/bf.cfg +++ b/crates/api/files/bf.cfg @@ -165,11 +165,69 @@ write_files: /bin/rm /run/cloud-init.done fi } + function ensure_ethernet_link_type() + { + local log_file="/var/log/forge/link-type.log" + mkdir -p /var/log/forge + + # Find the DPU config device before any management VRF setup depends on Ethernet. + local mst_device + mst_device="$(compgen -G '/dev/mst/mt*_pciconf0' | head -n 1)" + if [[ -z "${mst_device}" ]] + then + echo "No mst pciconf device found for LINK_TYPE normalization" | tee -a "${log_file}" + return + fi + + # Non-VPI hardware may not expose LINK_TYPE, so treat query failures as non-fatal. + local link_config + if ! link_config="$(/usr/bin/mlxconfig -d "${mst_device}" q 2>&1)" + then + echo "LINK_TYPE query unsupported; continuing" | tee -a "${log_file}" + echo "${link_config}" | tee -a "${log_file}" + return + fi + echo "${link_config}" >> "${log_file}" + + # Apply Ethernet mode only to exposed ports not already reporting Ethernet. + local link_type_keys=() + local link_type_key + local link_type_value + for link_type_key in LINK_TYPE_P1 LINK_TYPE_P2 + do + link_type_value="$(printf '%s\n' "${link_config}" | awk -v key="${link_type_key}" '$1 == key { print $2; exit }')" + if [[ -n "${link_type_value}" && "${link_type_value}" != *"(2)"* ]] + then + link_type_keys+=("${link_type_key}") + fi + done + + local link_type_changed=0 + for link_type_key in "${link_type_keys[@]}" + do + echo "Normalizing ${link_type_key} to Ethernet" | tee -a "${log_file}" + if /usr/bin/mlxconfig -y -d "${mst_device}" set "${link_type_key}=2" + then + link_type_changed=1 + else + echo "Failed to normalize ${link_type_key}; continuing" | tee -a "${log_file}" + fi + done + + if [[ "${link_type_changed}" == "1" ]] + then + trap - EXIT + reboot + # Keep cloud-init incomplete; the controller restart path will re-enter PXE with the persisted LINK_TYPE setting. 
+ exit 1 + fi + } trap cleanup EXIT set -e date -s '@{{ seconds_since_epoch }}' hwclock --systohc /usr/bin/bfrec --capsule /lib/firmware/mellanox/boot/capsule/efi_sbkeysync.cap || true + ensure_ethernet_link_type /usr/bin/bfcfg ip vrf exec mgmt curl --retry 5 --retry-all-errors -v -o /opt/forge/forge_root.pem {{ pxe_url }}/api/v0/tls/root_ca echo "downloaded forge root cert" diff --git a/crates/machine-controller/src/handler.rs b/crates/machine-controller/src/handler.rs index f8b179c432..4bb558f5e8 100644 --- a/crates/machine-controller/src/handler.rs +++ b/crates/machine-controller/src/handler.rs @@ -1827,6 +1827,33 @@ impl MachineStateHandler { )?, ); } + ManagedHostState::Failed { .. } + if is_unassigned_dpu_reprovision_host_boot_failure( + managed_state, + host_machine_id, + ) => + { + set_managed_host_topology_update_needed( + ctx.pending_db_writes, + &state.host_snapshot, + &dpus_for_reprov, + ); + + // Host boot repair failures leave the host in top-level Failed; restart must + // reconstruct the reprovision map instead of trying to advance from Failed. + next_state = Some( + ReprovisionState::next_substate_based_on_bfb_support( + self.enable_secure_boot, + state, + ctx.services.site_config.dpf_enabled, + ) + .next_state_with_all_dpus_updated( + &ManagedHostState::Ready, + &state.dpu_snapshots, + dpus_for_reprov.iter().map(|x| &x.id).collect_vec(), + )?, + ); + } _ => { next_state = None; } @@ -1849,6 +1876,25 @@ impl MachineStateHandler { } } +fn is_unassigned_dpu_reprovision_host_boot_failure( + managed_state: &ManagedHostState, + host_machine_id: &MachineId, +) -> bool { + matches!( + managed_state, + ManagedHostState::Failed { + machine_id, + details: + FailureDetails { + cause: FailureCause::BiosSetupFailed { .. }, + source: FailureSource::StateMachineArea(StateMachineArea::MainFlow), + .. + }, + .. 
+ } if machine_id == host_machine_id + ) +} + #[derive(Clone)] struct FullFirmwareInfo<'a> { model: &'a str, @@ -2179,6 +2225,16 @@ fn is_dpu_up(state: &ManagedHostStateSnapshot, dpu_snapshot: &Machine) -> bool { true } +fn is_dpu_observed_since(dpu_snapshot: &Machine, minimum_observed_at: DateTime) -> bool { + let observation_time = dpu_snapshot + .network_status_observation + .as_ref() + .map(|o| o.observed_at) + .unwrap_or(DateTime::::MIN_UTC); + + observation_time >= minimum_observed_at +} + /// are_dpus_up_trigger_reboot_if_needed returns true if the dpu_agent indicates that the DPU has rebooted and is healthy. /// otherwise returns false. triggers a reboot in case the DPU is down/bricked. async fn are_dpus_up_trigger_reboot_if_needed( @@ -2658,6 +2714,24 @@ pub fn identify_dpu(dpu_snapshot: &Machine) -> DpuModel { model.into() } +fn update_reprovision_targets_to_reprovision_state( + state: &ManagedHostStateSnapshot, + reprovision_state: ReprovisionState, +) -> Result { + // Host repair steps are shared, but only DPUs with active requests own reprovision state. 
+ let reprovision_target_dpu_ids = state + .dpu_snapshots + .iter() + .filter_map(|dpu| dpu.reprovision_requested.as_ref().map(|_| &dpu.id)) + .collect_vec(); + + reprovision_state.next_state_with_all_dpus_updated( + &state.managed_state, + &state.dpu_snapshots, + reprovision_target_dpu_ids, + ) +} + /// Handle workflow of DPU reprovision #[allow(clippy::too_many_arguments)] async fn handle_dpu_reprovision( @@ -2727,7 +2801,7 @@ async fn handle_dpu_reprovision( .iter() .filter_map(|x| { if x.reprovision_requested.is_some() { - state.managed_state.as_reprovision_state(dpu_machine_id) + state.managed_state.as_reprovision_state(&x.id) } else { None } @@ -2816,12 +2890,15 @@ async fn handle_dpu_reprovision( )) } ReprovisionState::WaitingForNetworkConfig => { + // Host boot repair is host-scoped, so wait until every reprovisioning + // DPU has reached the same post-network-config point before touching + // host BIOS. let dpus_states_for_reprov = &state .dpu_snapshots .iter() .filter_map(|x| { if x.reprovision_requested.is_some() { - state.managed_state.as_reprovision_state(dpu_machine_id) + state.managed_state.as_reprovision_state(&x.id) } else { None } @@ -2833,13 +2910,18 @@ async fn handle_dpu_reprovision( "Waiting for DPUs to come in WaitingForNetworkConfig state.".to_string(), )); } + + // Validate all DPUs before host boot repair; subsequent states may + // reboot the host or BMC and should not run while any DPU is still + // unhealthy or unsynced. for dsnapshot in &state.dpu_snapshots { if !is_dpu_up(state, dsnapshot) { let msg = format!("Waiting for DPU {} to come up", dsnapshot.id); tracing::warn!("{msg}"); let mut reboot_status = None; - // Reboot only dpu for which handler is called. + // Only the DPU handled by this invocation should trigger its + // own recovery reboot; other DPUs are observed for gating. 
if dpu_snapshot.id == dsnapshot.id { reboot_status = Some( trigger_reboot_if_needed( @@ -2864,11 +2946,12 @@ async fn handle_dpu_reprovision( ) { tracing::warn!("Waiting for network to be ready for DPU {}", dsnapshot.id); - // we requested a DPU reboot in ReprovisionState::WaitingForNetworkInstall - // let the trigger_reboot_if_needed determine if we are stuck here - // (based on how long it has been since the last requested reboot) + // The install path already requested a DPU reboot. If this + // specific DPU remains unsynced, let trigger_reboot_if_needed + // decide whether enough time has elapsed for another reboot. let mut reboot_status = None; - // Reboot only dpu for which handler is called. + // Only the DPU handled by this invocation should trigger its + // own recovery reboot; other DPUs are observed for gating. if dpu_snapshot.id == dsnapshot.id { reboot_status = Some( trigger_reboot_if_needed( @@ -2889,18 +2972,395 @@ async fn handle_dpu_reprovision( } } - let mut txn = ctx.services.db_pool.begin().await?; + Ok(StateHandlerOutcome::transition( + next_state_resolver.next_state_with_all_dpus_updated(state, reprovision_state)?, + )) + } + ReprovisionState::PrepareHostBootRepair => { + // Ensure host boot repair does not write through a locked BMC. + let redfish_client = ctx + .services + .create_redfish_client_from_machine(&state.host_snapshot) + .await?; - // Clear reprovisioning state. 
- for dpu_snapshot in &state.dpu_snapshots { - db::machine::clear_dpu_reprovisioning_request(&mut txn, &dpu_snapshot.id, false) - .await?; - } + let next_state = match redfish_client.lockdown_status().await { + Err(RedfishError::NotSupported(_)) => { + tracing::info!( + machine_id = %state.host_snapshot.id, + "BMC vendor does not support checking lockdown status during DPU reprovision host boot repair" + ); + ReprovisionState::CheckHostBootConfig + } + Err(e) => { + tracing::warn!( + machine_id = %state.host_snapshot.id, + error = %e, + "Failed to fetch lockdown status during DPU reprovision host boot repair" + ); + return Ok(StateHandlerOutcome::wait(format!( + "Failed to fetch lockdown status: {e}" + ))); + } + Ok(lockdown_status) if !lockdown_status.is_fully_disabled() => { + tracing::info!( + machine_id = %state.host_snapshot.id, + "Lockdown is enabled during DPU reprovision host boot repair; disabling before boot config checks" + ); + ReprovisionState::UnlockHostForBootRepair { + unlock_host_state: UnlockHostState::DisableLockdown, + } + } + Ok(_) => ReprovisionState::CheckHostBootConfig, + }; Ok(StateHandlerOutcome::transition( - next_state_resolver.next_state_with_all_dpus_updated(state, reprovision_state)?, + update_reprovision_targets_to_reprovision_state(state, next_state)?, + )) + } + ReprovisionState::UnlockHostForBootRepair { unlock_host_state } => { + // Mirror assigned platform config's unlock choreography before checking boot state. 
+ let redfish_client = ctx + .services + .create_redfish_client_from_machine(&state.host_snapshot) + .await?; + + let next_state = match unlock_host_state { + UnlockHostState::DisableLockdown => { + redfish_client + .lockdown_bmc(EnabledDisabled::Disabled) + .await + .map_err(|e| redfish_error("lockdown_bmc", e))?; + + let vendor = state.host_snapshot.bmc_vendor(); + + if vendor.is_supermicro() { + tracing::info!( + machine_id = %state.host_snapshot.id, + %vendor, + "BMC lockdown disabled; rebooting host so Redfish reflects actual boot order" + ); + ReprovisionState::UnlockHostForBootRepair { + unlock_host_state: UnlockHostState::RebootHost, + } + } else { + tracing::info!( + machine_id = %state.host_snapshot.id, + %vendor, + "BMC lockdown disabled; skipping post-unlock reboot (not required for this vendor)" + ); + ReprovisionState::CheckHostBootConfig + } + } + UnlockHostState::RebootHost => { + host_power_control( + redfish_client.as_ref(), + &state.host_snapshot, + SystemPowerControl::ForceRestart, + ctx, + ) + .await + .map_err(|e| { + StateHandlerError::GenericError(eyre!( + "failed to ForceRestart host after disabling BMC lockdown: {}", + e + )) + })?; + + ReprovisionState::UnlockHostForBootRepair { + unlock_host_state: UnlockHostState::WaitForUefiBoot, + } + } + UnlockHostState::WaitForUefiBoot => { + let entered_at = state.host_snapshot.state.version.timestamp(); + if wait(&entered_at, reachability_params.uefi_boot_wait) { + return Ok(StateHandlerOutcome::wait(format!( + "Waiting for UEFI boot to complete on {} after post-unlock reboot; \ + wait duration: {}, will proceed after {}", + state.host_snapshot.id, + reachability_params.uefi_boot_wait, + entered_at + reachability_params.uefi_boot_wait, + ))); + } + + ReprovisionState::CheckHostBootConfigAfterHostReboot + } + }; + + Ok(StateHandlerOutcome::transition( + update_reprovision_targets_to_reprovision_state(state, next_state)?, + )) + } + ReprovisionState::CheckHostBootConfig => { + // 
WaitingForNetworkConfig already accepted the DPU observation. Do + // not require a newer observation just because the host state + // version advanced while entering host boot repair. + let redfish_client = ctx + .services + .create_redfish_client_from_machine(&state.host_snapshot) + .await?; + + let next_state = match check_host_boot_config( + redfish_client.as_ref(), + state, + reachability_params, + HostBootConfigDpuFreshness::AlreadyValidated, + ctx, ) - .with_txn(txn)) + .await? + { + HostBootConfigDecision::Wait(reason) => { + return Ok(StateHandlerOutcome::wait(reason)); + } + HostBootConfigDecision::ConfigureBoot => { + ReprovisionState::ConfigureHostBoot { retry_count: 0 } + } + HostBootConfigDecision::LockHost => ReprovisionState::LockHostAfterBootRepair, + }; + + Ok(StateHandlerOutcome::transition( + update_reprovision_targets_to_reprovision_state(state, next_state)?, + )) + } + ReprovisionState::CheckHostBootConfigAfterHostReboot => { + // This path rebooted the host after unlocking, so require a DPU + // observation newer than that reboot before trusting boot checks. + let redfish_client = ctx + .services + .create_redfish_client_from_machine(&state.host_snapshot) + .await?; + + let next_state = match check_host_boot_config( + redfish_client.as_ref(), + state, + reachability_params, + HostBootConfigDpuFreshness::SinceLastHostRebootRequest, + ctx, + ) + .await? 
+ { + HostBootConfigDecision::Wait(reason) => { + return Ok(StateHandlerOutcome::wait(reason)); + } + HostBootConfigDecision::ConfigureBoot => { + ReprovisionState::ConfigureHostBoot { retry_count: 0 } + } + HostBootConfigDecision::LockHost => ReprovisionState::LockHostAfterBootRepair, + }; + + Ok(StateHandlerOutcome::transition( + update_reprovision_targets_to_reprovision_state(state, next_state)?, + )) + } + ReprovisionState::ConfigureHostBoot { retry_count } => { + // Run machine_setup only after the reprovisioned DPU is healthy; it + // may patch BIOS settings and trigger host-impacting recovery. + let redfish_client = ctx + .services + .create_redfish_client_from_machine(&state.host_snapshot) + .await?; + + match configure_host_bios( + ctx, + reachability_params, + redfish_client.as_ref(), + state, + *retry_count, + ) + .await? + { + BiosConfigOutcome::Done => Ok(StateHandlerOutcome::transition( + update_reprovision_targets_to_reprovision_state( + state, + ReprovisionState::PollingHostBiosSetup { + retry_count: *retry_count, + }, + )?, + )), + BiosConfigOutcome::WaitingForBiosJob(bios_config_info) => { + Ok(StateHandlerOutcome::transition( + update_reprovision_targets_to_reprovision_state( + state, + ReprovisionState::WaitingForHostBiosJob { bios_config_info }, + )?, + )) + } + BiosConfigOutcome::WaitingForReboot(reason) => { + Ok(StateHandlerOutcome::wait(reason)) + } + } + } + ReprovisionState::WaitingForHostBiosJob { bios_config_info } => { + // Poll vendor BIOS jobs before verifying the setup and boot order. + let redfish_client = ctx + .services + .create_redfish_client_from_machine(&state.host_snapshot) + .await?; + + match advance_bios_config_job( + ctx, + redfish_client.as_ref(), + state, + bios_config_info.clone(), + ) + .await? 
+ { + BiosConfigJobAdvanceOutcome::Continue(updated) => { + Ok(StateHandlerOutcome::transition( + update_reprovision_targets_to_reprovision_state( + state, + ReprovisionState::WaitingForHostBiosJob { + bios_config_info: updated, + }, + )?, + )) + } + BiosConfigJobAdvanceOutcome::Done => Ok(StateHandlerOutcome::transition( + update_reprovision_targets_to_reprovision_state( + state, + ReprovisionState::PollingHostBiosSetup { + retry_count: bios_config_info.retry_count, + }, + )?, + )), + BiosConfigJobAdvanceOutcome::Failed { failure } => Ok( + StateHandlerOutcome::transition(dpu_reprovision_host_boot_failed_state( + &state.managed_state, + state.host_snapshot.id, + failure, + )), + ), + BiosConfigJobAdvanceOutcome::RetryPlatformConfiguration { retry_count } => { + Ok(StateHandlerOutcome::transition( + update_reprovision_targets_to_reprovision_state( + state, + ReprovisionState::ConfigureHostBoot { retry_count }, + )?, + )) + } + BiosConfigJobAdvanceOutcome::Wait(reason) => Ok(StateHandlerOutcome::wait(reason)), + } + } + ReprovisionState::PollingHostBiosSetup { retry_count } => { + // Verify machine_setup effects before promoting the DPU boot option. + let redfish_client = ctx + .services + .create_redfish_client_from_machine(&state.host_snapshot) + .await?; + + match advance_polling_bios_setup( + redfish_client.as_ref(), + state, + *retry_count, + &ctx.services.site_config.machine_state_controller, + ) + .await? 
+ { + PollingBiosSetupOutcome::Verified => { + let next_state = if should_skip_boot_order_remediation(state) { + ReprovisionState::LockHostAfterBootRepair + } else { + ReprovisionState::SetHostBootOrder { + set_boot_order_info: SetBootOrderInfo { + set_boot_order_jid: None, + set_boot_order_state: SetBootOrderState::SetBootOrder, + retry_count: 0, + }, + } + }; + + Ok(StateHandlerOutcome::transition( + update_reprovision_targets_to_reprovision_state(state, next_state)?, + )) + } + PollingBiosSetupOutcome::EnterRecovery(bios_config_info) => { + Ok(StateHandlerOutcome::transition( + update_reprovision_targets_to_reprovision_state( + state, + ReprovisionState::WaitingForHostBiosJob { bios_config_info }, + )?, + )) + } + PollingBiosSetupOutcome::Failed { failure } => Ok(StateHandlerOutcome::transition( + dpu_reprovision_host_boot_failed_state( + &state.managed_state, + state.host_snapshot.id, + failure, + ), + )), + PollingBiosSetupOutcome::Wait(reason) => Ok(StateHandlerOutcome::wait(reason)), + } + } + ReprovisionState::SetHostBootOrder { + set_boot_order_info, + } => { + // Promote the selected DPU boot option after machine_setup has enabled it. + let redfish_client = ctx + .services + .create_redfish_client_from_machine(&state.host_snapshot) + .await?; + + match set_host_boot_order( + ctx, + reachability_params, + redfish_client.as_ref(), + state, + set_boot_order_info.clone(), + ) + .await? 
+ { + SetBootOrderOutcome::Continue(boot_order_info) => { + Ok(StateHandlerOutcome::transition( + update_reprovision_targets_to_reprovision_state( + state, + ReprovisionState::SetHostBootOrder { + set_boot_order_info: boot_order_info, + }, + )?, + )) + } + SetBootOrderOutcome::Done => Ok(StateHandlerOutcome::transition( + update_reprovision_targets_to_reprovision_state( + state, + ReprovisionState::LockHostAfterBootRepair, + )?, + )), + SetBootOrderOutcome::WaitingForReboot(reason) => { + Ok(StateHandlerOutcome::wait(reason)) + } + } + } + ReprovisionState::LockHostAfterBootRepair => { + // Preserve expected-machine lockdown policy after temporarily + // opening the BMC for host boot repair. + let redfish_client = ctx + .services + .create_redfish_client_from_machine(&state.host_snapshot) + .await?; + + if state.host_snapshot.host_profile.disable_lockdown { + tracing::info!( + machine_id = %state.host_snapshot.id, + "Skipping lockdown re-enable in DPU reprovision per expected-machine config" + ); + } else { + match redfish_client.lockdown_bmc(EnabledDisabled::Enabled).await { + Ok(()) => {} + Err(RedfishError::NotSupported(_)) => { + tracing::info!( + machine_id = %state.host_snapshot.id, + "BMC vendor does not support re-enabling lockdown after DPU reprovision host boot repair" + ); + } + Err(e) => return Err(redfish_error("lockdown_bmc", e)), + } + } + + Ok(StateHandlerOutcome::transition( + update_reprovision_targets_to_reprovision_state( + state, + ReprovisionState::RebootHostBmc, + )?, + )) } ReprovisionState::RebootHostBmc => { // Work around for FORGE-3864 @@ -2970,20 +3430,207 @@ async fn handle_dpu_reprovision( // We can expect transient issues here in case we just rebooted the host's BMC and it has not come up yet handler_host_power_control(state, ctx, SystemPowerControl::ForceRestart).await?; + let mut txn = ctx.services.db_pool.begin().await?; + + // Clear reprovisioning requests only after the terminal host reboot is accepted. 
+ for dpu_snapshot in &state.dpu_snapshots { + db::machine::clear_dpu_reprovisioning_request(&mut txn, &dpu_snapshot.id, false) + .await?; + } + // We need to wait for the host to reboot and submit its new Hardware information in // case of Ready. - Ok(StateHandlerOutcome::transition( - next_state_resolver.next_state( + Ok( + StateHandlerOutcome::transition(next_state_resolver.next_state( &state.managed_state, dpu_machine_id, &state.host_snapshot, - )?, - )) + )?) + .with_txn(txn), + ) } ReprovisionState::NotUnderReprovision => Ok(StateHandlerOutcome::do_nothing()), } } +/// Build the correct failed state for host boot repair during DPU reprovision. +fn dpu_reprovision_host_boot_failed_state( + current_state: &ManagedHostState, + host_id: MachineId, + failure: String, +) -> ManagedHostState { + // Attribute the failure to the flow that owns the current reprovision. + let source = FailureSource::StateMachineArea( + if matches!(current_state, ManagedHostState::Assigned { .. }) { + StateMachineArea::AssignedInstance + } else { + StateMachineArea::MainFlow + }, + ); + + // Reuse the existing BIOS setup failure category for machine_setup repair. + let details = FailureDetails { + cause: FailureCause::BiosSetupFailed { err: failure }, + failed_at: Utc::now(), + source, + }; + + // Preserve the top-level assigned-state shape for tenant-owned hosts. + if matches!(current_state, ManagedHostState::Assigned { .. }) { + ManagedHostState::Assigned { + instance_state: InstanceState::Failed { + details, + machine_id: host_id, + }, + } + } else { + ManagedHostState::Failed { + details, + machine_id: host_id, + retry_count: 0, + } + } +} + +/// Check whether host BIOS and DPU-first boot order remediation is required. 
+///
+/// Decision rules, in order:
+/// - `HostBootConfigDecision::Wait` while `should_wait_for_dpus_before_host_boot_config`
+///   reports that DPU observations are not fresh enough for `dpu_freshness`.
+/// - On DGX H100 ("Viking") hosts the boot order is never queried: `LockHost` when the
+///   BIOS is already set up, `ConfigureBoot` otherwise.
+/// - On every other host, `LockHost` only when both the BIOS setup check and the boot
+///   order check pass; `ConfigureBoot` otherwise.
+///
+/// # Errors
+///
+/// Fails when no boot interface can be resolved for the host, or when the Redfish
+/// `is_bios_setup` / `is_boot_order_setup` queries fail.
+///
+/// NOTE(review): the generic arguments of the return type were missing from the
+/// collapsed patch text and are restored here from the values this function returns
+/// and the error it constructs; confirm against the repository.
+async fn check_host_boot_config(
+    redfish_client: &dyn Redfish,
+    mh_snapshot: &ManagedHostStateSnapshot,
+    reachability_params: &ReachabilityParams,
+    dpu_freshness: HostBootConfigDpuFreshness,
+    ctx: &mut StateHandlerContext<'_, MachineStateHandlerContextObjects>,
+) -> Result<HostBootConfigDecision, StateHandlerError> {
+    // Wait for DPUs only when this caller needs a fresh observation. DPU
+    // reprovision already validated DPU health before entering host boot repair.
+    if should_wait_for_dpus_before_host_boot_config(
+        mh_snapshot,
+        reachability_params,
+        dpu_freshness,
+        ctx,
+    )
+    .await
+    {
+        return Ok(HostBootConfigDecision::Wait(
+            "Waiting for DPUs to come up.".to_string(),
+        ));
+    }
+
+    // Resolve the interface whose boot option should be first in host UEFI.
+    let boot_interface = boot_interface_target(mh_snapshot).ok_or_else(|| {
+        StateHandlerError::GenericError(eyre::eyre!(
+            "Missing boot interface for host: {}",
+            mh_snapshot.host_snapshot.id
+        ))
+    })?;
+
+    let vendor = mh_snapshot.host_snapshot.bmc_vendor();
+
+    log_host_config(redfish_client, mh_snapshot).await;
+
+    // BIOS setup is checked for every host, including Viking, because BIOS repair
+    // still applies there even though boot-order remediation is skipped.
+    let is_bios_setup = boot_interface
+        .run(|bi| redfish_client.is_bios_setup(Some(bi)))
+        .await
+        .map_err(|e| redfish_error("is_bios_setup", e))?;
+
+    if should_skip_boot_order_remediation(mh_snapshot) {
+        if is_bios_setup {
+            tracing::info!(
+                machine_id = %mh_snapshot.host_snapshot.id,
+                bmc_vendor = %vendor,
+                "Skipping boot order remediation on Viking (known FW/BMC issue)"
+            );
+            return Ok(HostBootConfigDecision::LockHost);
+        }
+
+        tracing::warn!(
+            machine_id = %mh_snapshot.host_snapshot.id,
+            bmc_vendor = %vendor,
+            "Host BIOS setup is not configured properly on Viking; running BIOS repair before skipping boot order remediation"
+        );
+        return Ok(HostBootConfigDecision::ConfigureBoot);
+    }
+
+    // Queried even when the BIOS check already failed so the warning below can
+    // report both results.
+    let is_boot_order_setup = boot_interface
+        .run(|bi| redfish_client.is_boot_order_setup(bi))
+        .await
+        .map_err(|e| redfish_error("is_boot_order_setup", e))?;
+
+    if is_bios_setup && is_boot_order_setup {
+        tracing::info!(
+            machine_id = %mh_snapshot.host_snapshot.id,
+            bmc_vendor = %vendor,
+            "Host BIOS setup and boot order are configured properly"
+        );
+        Ok(HostBootConfigDecision::LockHost)
+    } else {
+        tracing::warn!(
+            machine_id = %mh_snapshot.host_snapshot.id,
+            bmc_vendor = %vendor,
+            is_bios_setup,
+            is_boot_order_setup,
+            "Host BIOS setup or boot order is not configured properly"
+        );
+        Ok(HostBootConfigDecision::ConfigureBoot)
+    }
+}
+
+/// Viking BMC firmware cannot safely run boot-order remediation; BIOS repair still applies.
+///
+/// Returns `true` only when hardware info is present and reports a DGX H100; a host
+/// without hardware info is treated as a regular host.
+fn should_skip_boot_order_remediation(mh_snapshot: &ManagedHostStateSnapshot) -> bool {
+    mh_snapshot
+        .host_snapshot
+        .hardware_info
+        .as_ref()
+        .is_some_and(|hw| hw.is_dgx_h100())
+}
+
+/// Decide whether the host boot config check must be postponed until the DPUs report in.
+///
+/// Returns `true` when the caller should wait. Hosts without managed DPUs never wait.
+/// Otherwise the answer depends on `dpu_freshness`:
+/// - `AlreadyValidated`: never wait.
+/// - `CurrentHostState`: wait while `are_dpus_up_trigger_reboot_if_needed` reports the
+///   DPUs are not up.
+/// - `SinceLastHostRebootRequest`: wait while any DPU fails `is_dpu_observed_since`
+///   for the time of the host's last reboot request. If the host has no recorded
+///   reboot request, a warning is logged and the check proceeds without waiting.
+async fn should_wait_for_dpus_before_host_boot_config(
+    mh_snapshot: &ManagedHostStateSnapshot,
+    reachability_params: &ReachabilityParams,
+    dpu_freshness: HostBootConfigDpuFreshness,
+    ctx: &mut StateHandlerContext<'_, MachineStateHandlerContextObjects>,
+) -> bool {
+    if !mh_snapshot.has_managed_dpus() {
+        return false;
+    }
+
+    match dpu_freshness {
+        HostBootConfigDpuFreshness::AlreadyValidated => false,
+        HostBootConfigDpuFreshness::CurrentHostState => {
+            !are_dpus_up_trigger_reboot_if_needed(mh_snapshot, reachability_params, ctx).await
+        }
+        HostBootConfigDpuFreshness::SinceLastHostRebootRequest => {
+            let Some(last_reboot_requested) = mh_snapshot.host_snapshot.last_reboot_requested
+            else {
+                tracing::warn!(
+                    machine_id = %mh_snapshot.host_snapshot.id,
+                    "No host reboot request timestamp found before post-reboot host boot config check"
+                );
+                return false;
+            };
+
+            for dpu_snapshot in &mh_snapshot.dpu_snapshots {
+                if !is_dpu_observed_since(dpu_snapshot, last_reboot_requested.time) {
+                    // Best effort: a failed reboot attempt is only logged, and the
+                    // caller still waits. Only the first stale DPU found is handled
+                    // per invocation.
+                    if let Err(e) = trigger_reboot_if_needed(
+                        dpu_snapshot,
+                        mh_snapshot,
+                        None,
+                        reachability_params,
+                        ctx,
+                    )
+                    .await
+                    {
+                        tracing::warn!("could not reboot dpu {}: {e}", dpu_snapshot.id);
+                    }
+                    return true;
+                }
+            }
+
+            false
+        }
+    }
+}
+
 // Returns
true if update_manager flagged this managed host as needing its firmware examined fn host_reprovisioning_requested(state: &ManagedHostStateSnapshot) -> bool { state.host_snapshot.host_reprovision_requested.is_some() @@ -4178,6 +4825,20 @@ enum SetBootOrderOutcome { WaitingForReboot(String), } +/// Decision from checking whether host boot repair is still required. +enum HostBootConfigDecision { + ConfigureBoot, + LockHost, + Wait(String), +} + +/// DPU observation freshness required before checking host boot config. +enum HostBootConfigDpuFreshness { + AlreadyValidated, + CurrentHostState, + SinceLastHostRebootRequest, +} + /// In case machine does not come up until a specified duration, this function tries to reboot /// it again. The reboot continues till 6 hours only. After that this function gives up. /// WARNING: @@ -10000,84 +10661,27 @@ async fn handle_instance_host_platform_config( } } HostPlatformConfigurationState::CheckHostConfig => { - // For hosts with DPU(s), wait for DPU(s) to come up before reading - // BIOS state -- the host can report stale Redfish info from before the - // power cycle until the DPUs have finished initializing. Zero-DPU - // hosts skip this wait, because there are no DPUs to come up. - if mh_snapshot.has_managed_dpus() - && !are_dpus_up_trigger_reboot_if_needed(mh_snapshot, reachability_params, ctx) - .await - { - return Ok(StateHandlerOutcome::wait( - "Waiting for DPUs to come up.".to_string(), - )); - } - - // Resolve the MAC whose interface should be first in the boot - // order. For hosts with DPUs, this is the DPU-facing PF (set as - // the primary_interface by site-explorer during DPU attach). - // - // For zero-DPU hosts, it's the operator-declared primary host - // NIC (which comes from `ExpectedHostNic.primary`) *or* the - // "lowest" deterministic-fallback host NIC. 
- let boot_interface = boot_interface_target(mh_snapshot).ok_or_else(|| { - StateHandlerError::GenericError(eyre::eyre!( - "Missing boot interface for host: {}", - mh_snapshot.host_snapshot.id - )) - })?; - - let vendor = mh_snapshot.host_snapshot.bmc_vendor(); - - log_host_config(redfish_client.as_ref(), mh_snapshot).await; - - let is_viking = mh_snapshot - .host_snapshot - .hardware_info - .as_ref() - .is_some_and(|hw| hw.is_dgx_h100()); - - let configure_host_boot_order = if is_viking { - // Viking BMC FW has known issues with the boot-order remediation path. - // Skip the unreliable Redfish read/PATCH sequence and apply the host's - // lockdown policy before continuing. - tracing::info!( - machine_id = %mh_snapshot.host_snapshot.id, - bmc_vendor = %vendor, - "Skipping boot order remediation on Viking (known FW/BMC issue)" - ); - false - } else if boot_interface - .run(|bi| redfish_client.is_boot_order_setup(bi)) - .await - .map_err(|e| redfish_error("is_boot_order_setup", e))? + match check_host_boot_config( + redfish_client.as_ref(), + mh_snapshot, + reachability_params, + HostBootConfigDpuFreshness::CurrentHostState, + ctx, + ) + .await? 
{ - tracing::info!( - machine_id = %mh_snapshot.host_snapshot.id, - bmc_vendor = %vendor, - "Host boot order is configured properly" - ); - false - } else { - tracing::warn!( - machine_id = %mh_snapshot.host_snapshot.id, - bmc_vendor = %vendor, - "Host boot order is not configured properly" - ); - true - }; - - if configure_host_boot_order { - InstanceState::HostPlatformConfiguration { + HostBootConfigDecision::Wait(reason) => { + return Ok(StateHandlerOutcome::wait(reason)); + } + HostBootConfigDecision::ConfigureBoot => InstanceState::HostPlatformConfiguration { platform_config_state: HostPlatformConfigurationState::ConfigureBios { bios_config_info: None, retry_count: 0, }, - } - } else { - InstanceState::HostPlatformConfiguration { + }, + HostBootConfigDecision::LockHost => InstanceState::HostPlatformConfiguration { platform_config_state: HostPlatformConfigurationState::LockHost, - } + }, } } HostPlatformConfigurationState::ConfigureBios { @@ -10180,12 +10784,16 @@ async fn handle_instance_host_platform_config( } HostPlatformConfigurationState::PollingBiosSetup { retry_count } => { let next_instance_state = InstanceState::HostPlatformConfiguration { - platform_config_state: HostPlatformConfigurationState::SetBootOrder { - set_boot_order_info: SetBootOrderInfo { - set_boot_order_jid: None, - set_boot_order_state: SetBootOrderState::SetBootOrder, - retry_count: 0, - }, + platform_config_state: if should_skip_boot_order_remediation(mh_snapshot) { + HostPlatformConfigurationState::LockHost + } else { + HostPlatformConfigurationState::SetBootOrder { + set_boot_order_info: SetBootOrderInfo { + set_boot_order_jid: None, + set_boot_order_state: SetBootOrderState::SetBootOrder, + retry_count: 0, + }, + } }, }; diff --git a/crates/machine-controller/src/handler/helpers.rs b/crates/machine-controller/src/handler/helpers.rs index 1c5bce4888..fe5ea882b4 100644 --- a/crates/machine-controller/src/handler/helpers.rs +++ b/crates/machine-controller/src/handler/helpers.rs @@ 
-101,7 +101,19 @@ pub trait NextState { &state.dpu_snapshots, dpu_ids_for_reprov, ), - ReprovisionState::WaitingForNetworkConfig => ReprovisionState::RebootHostBmc + ReprovisionState::WaitingForNetworkConfig => ReprovisionState::PrepareHostBootRepair + .next_state_with_all_dpus_updated( + &state.managed_state, + &state.dpu_snapshots, + dpu_ids_for_reprov, + ), + ReprovisionState::SetHostBootOrder { .. } => ReprovisionState::LockHostAfterBootRepair + .next_state_with_all_dpus_updated( + &state.managed_state, + &state.dpu_snapshots, + all_machine_ids, + ), + ReprovisionState::LockHostAfterBootRepair => ReprovisionState::RebootHostBmc .next_state_with_all_dpus_updated( &state.managed_state, &state.dpu_snapshots, diff --git a/crates/redfish/src/libredfish/test_support.rs b/crates/redfish/src/libredfish/test_support.rs index 4dc2a03ec8..f25b2b4714 100644 --- a/crates/redfish/src/libredfish/test_support.rs +++ b/crates/redfish/src/libredfish/test_support.rs @@ -56,6 +56,8 @@ struct RedfishSimState { get_task_trigger_evidence_returns_interrupted: bool, machine_setup_bios_job_id: Option, is_bios_setup: Option, + is_boot_order_setup: Option, + default_lockdown: Option, job_state_sequence: VecDeque, /// Offset (in seconds) applied to the BMC `DateTime` returned by /// `get_manager`, relative to the controller's `Utc::now()`. Defaults to 0 @@ -131,6 +133,17 @@ impl RedfishSim { } } + /// Return the simulated lockdown state for each Redfish client target. + pub fn lockdown_states(&self) -> Vec { + self.state + .lock() + .unwrap() + .hosts + .values() + .map(|host| host.lockdown) + .collect() + } + /// Build a simulator with optional SPDM / firmware-integration test flags. pub fn with_test_overrides(overrides: RedfishSimTestOverrides) -> Self { Self { @@ -157,6 +170,20 @@ impl RedfishSim { self.state.lock().unwrap().is_bios_setup = Some(ready); } + /// Configure whether simulated Redfish reports the host boot order as ready. 
+ pub fn set_is_boot_order_setup(&self, ready: bool) { + self.state.lock().unwrap().is_boot_order_setup = Some(ready); + } + + /// Configure simulated BMC lockdown state for existing and future clients. + pub fn set_lockdown(&self, lockdown: EnabledDisabled) { + let mut state = self.state.lock().unwrap(); + state.default_lockdown = Some(lockdown); + for host_state in state.hosts.values_mut() { + host_state.lockdown = lockdown; + } + } + /// Set the offset (in seconds) applied to the BMC `DateTime` returned by /// `get_manager`, relative to the controller clock. Use a value larger than /// the time-sync threshold to simulate an out-of-sync BMC clock. @@ -1197,6 +1224,7 @@ impl Redfish for RedfishSimClient { ) -> libredfish::RedfishFuture<'a, Result, RedfishError>> { Box::pin(async move { let mut state = self.state.lock().unwrap(); + state.is_boot_order_setup = Some(true); let host_state = state.hosts.get_mut(&self._host).unwrap(); host_state .actions @@ -1255,9 +1283,14 @@ impl Redfish for RedfishSimClient { fn lockdown_bmc<'a>( &'a self, - _target: EnabledDisabled, + target: EnabledDisabled, ) -> libredfish::RedfishFuture<'a, Result<(), RedfishError>> { - Box::pin(async move { Ok(()) }) + Box::pin(async move { + let mut state = self.state.lock().unwrap(); + let host_state = state.hosts.get_mut(&self._host).unwrap(); + host_state.lockdown = target; + Ok(()) + }) } fn get_gpu_sensors<'a>( @@ -1373,11 +1406,12 @@ impl Redfish for RedfishSimClient { ) -> libredfish::RedfishFuture<'a, Result> { Box::pin(async move { let mut state = self.state.lock().unwrap(); + let is_boot_order_setup = state.is_boot_order_setup.unwrap_or(true); let host_state = state.hosts.get_mut(&self._host).unwrap(); host_state.actions.push(RedfishSimAction::IsBootOrderSetup { boot_interface_mac: boot_interface_ref_to_string(boot_interface), }); - Ok(true) + Ok(is_boot_order_setup) }) } @@ -1802,12 +1836,13 @@ impl RedfishClientPool for RedfishSim { host: host.to_string(), vendor, }); + let 
default_lockdown = state.default_lockdown.unwrap_or(EnabledDisabled::Disabled); state .hosts .entry(host.to_string()) .or_insert(RedfishSimHostState { power: PowerState::On, - lockdown: EnabledDisabled::Disabled, + lockdown: default_lockdown, actions: Default::default(), }); if state.fw_version.is_empty() { diff --git a/pxe/templates/user-data b/pxe/templates/user-data index 0998c41524..fefd264a88 100644 --- a/pxe/templates/user-data +++ b/pxe/templates/user-data @@ -196,11 +196,69 @@ write_files: /bin/rm /run/cloud-init.done fi } + function ensure_ethernet_link_type() + { + local log_file="/var/log/forge/link-type.log" + mkdir -p /var/log/forge + + # Find the DPU config device before any management VRF setup depends on Ethernet. + local mst_device + mst_device="$(compgen -G '/dev/mst/mt*_pciconf0' | head -n 1)" + if [[ -z "${mst_device}" ]] + then + echo "No mst pciconf device found for LINK_TYPE normalization" | tee -a "${log_file}" + return + fi + + # Non-VPI hardware may not expose LINK_TYPE, so treat query failures as non-fatal. + local link_config + if ! link_config="$(/usr/bin/mlxconfig -d "${mst_device}" q 2>&1)" + then + echo "LINK_TYPE query unsupported; continuing" | tee -a "${log_file}" + echo "${link_config}" | tee -a "${log_file}" + return + fi + echo "${link_config}" >> "${log_file}" + + # Apply Ethernet mode only to exposed ports not already reporting Ethernet. 
+ local link_type_keys=() + local link_type_key + local link_type_value + for link_type_key in LINK_TYPE_P1 LINK_TYPE_P2 + do + link_type_value="$(printf '%s\n' "${link_config}" | awk -v key="${link_type_key}" '$1 == key { print $2; exit }')" + if [[ -n "${link_type_value}" && "${link_type_value}" != *"(2)"* ]] + then + link_type_keys+=("${link_type_key}") + fi + done + + local link_type_changed=0 + for link_type_key in "${link_type_keys[@]}" + do + echo "Normalizing ${link_type_key} to Ethernet" | tee -a "${log_file}" + if /usr/bin/mlxconfig -y -d "${mst_device}" set "${link_type_key}=2" + then + link_type_changed=1 + else + echo "Failed to normalize ${link_type_key}; continuing" | tee -a "${log_file}" + fi + done + + if [[ "${link_type_changed}" == "1" ]] + then + trap - EXIT + reboot + # Keep cloud-init incomplete; the controller restart path will re-enter PXE with the persisted LINK_TYPE setting. + exit 1 + fi + } trap cleanup EXIT set -e date -s '@{{ seconds_since_epoch }}' hwclock --systohc /usr/bin/bfrec --capsule /lib/firmware/mellanox/boot/capsule/efi_sbkeysync.cap || true + ensure_ethernet_link_type /usr/bin/bfcfg ip vrf exec mgmt curl --retry 5 --retry-all-errors -v -o /opt/forge/forge_root.pem {{ pxe_url }}/api/v0/tls/root_ca echo "downloaded forge root cert"