Skip to content

Commit c8933ed

Browse files
Merge branch 'main' into fix/GH-2060
2 parents a401ca1 + 7eb1969 commit c8933ed

11 files changed

Lines changed: 1422 additions & 217 deletions

File tree

crates/api-core/src/tests/dpu_reprovisioning.rs

Lines changed: 466 additions & 106 deletions
Large diffs are not rendered by default.

crates/api-model/src/machine/mod.rs

Lines changed: 63 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1321,6 +1321,28 @@ pub enum ReprovisionState {
13211321
BufferTime,
13221322
VerifyFirmareVersions,
13231323
WaitingForNetworkConfig,
1324+
PrepareHostBootRepair,
1325+
UnlockHostForBootRepair {
1326+
#[serde(default)]
1327+
unlock_host_state: UnlockHostState,
1328+
},
1329+
CheckHostBootConfig,
1330+
CheckHostBootConfigAfterHostReboot,
1331+
ConfigureHostBoot {
1332+
#[serde(default)]
1333+
retry_count: u32,
1334+
},
1335+
WaitingForHostBiosJob {
1336+
bios_config_info: BiosConfigInfo,
1337+
},
1338+
PollingHostBiosSetup {
1339+
#[serde(default)]
1340+
retry_count: u32,
1341+
},
1342+
SetHostBootOrder {
1343+
set_boot_order_info: SetBootOrderInfo,
1344+
},
1345+
LockHostAfterBootRepair,
13241346
RebootHostBmc,
13251347
RebootHost,
13261348
NotUnderReprovision,
@@ -1797,7 +1819,9 @@ pub enum UefiSetupState {
17971819
/// `bios_job_id` is `Some` while polling a vendor BIOS job (e.g. Dell). `None` only during
17981820
/// `HandleBiosJobFailure` recovery from stuck PollingBiosSetup; non-Dell hosts reboot in
17991821
/// `configure_host_bios` and never enter job-polling substates.
1800-
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
1822+
///
1823+
/// Derived ordering is used by enclosing reprovision states to report the least advanced DPU.
1824+
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, PartialOrd, Ord)]
18011825
#[serde(rename_all = "lowercase")]
18021826
pub struct BiosConfigInfo {
18031827
#[serde(default, skip_serializing_if = "Option::is_none")]
@@ -1808,7 +1832,8 @@ pub struct BiosConfigInfo {
18081832
pub retry_count: u32,
18091833
}
18101834

1811-
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
1835+
/// Variant order follows BIOS job progression for derived reprovision-state comparisons.
1836+
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, PartialOrd, Ord)]
18121837
#[serde(tag = "state", rename_all = "lowercase")]
18131838
pub enum BiosConfigState {
18141839
WaitForBiosJobScheduled,
@@ -1821,7 +1846,8 @@ pub enum BiosConfigState {
18211846
},
18221847
}
18231848

1824-
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
1849+
/// Derived ordering is used by enclosing reprovision states to report the least advanced DPU.
1850+
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, PartialOrd, Ord)]
18251851
#[serde(rename_all = "lowercase")]
18261852
pub struct SetBootOrderInfo {
18271853
#[serde(default, skip_serializing_if = "Option::is_none")]
@@ -1832,7 +1858,8 @@ pub struct SetBootOrderInfo {
18321858
pub retry_count: u32,
18331859
}
18341860

1835-
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
1861+
/// Variant order follows boot-order job progression for derived reprovision-state comparisons.
1862+
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, PartialOrd, Ord)]
18361863
#[serde(tag = "state", rename_all = "lowercase")]
18371864
pub enum SetBootOrderState {
18381865
SetBootOrder,
@@ -2020,7 +2047,8 @@ pub enum HostPlatformConfigurationState {
20202047
LockHost,
20212048
}
20222049

2023-
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Default)]
2050+
/// Variant order follows unlock progression for derived reprovision-state comparisons.
2051+
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, PartialOrd, Ord, Default)]
20242052
#[serde(tag = "state", rename_all = "lowercase")]
20252053
pub enum UnlockHostState {
20262054
#[default]
@@ -2705,7 +2733,8 @@ impl<'r> FromRow<'r, PgRow> for MachineInterfaceSnapshot {
27052733

27062734
// TODO: reconcile with site_explorer::PowerState. They are almost
27072735
// identical, but here we also have a Reset enum item.
2708-
#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Eq)]
2736+
/// Variant order is a deterministic tie-breaker inside derived recovery-state comparisons.
2737+
#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
27092738
pub enum PowerState {
27102739
Off,
27112740
On,
@@ -2972,6 +3001,10 @@ mod tests {
29723001
// variant; the parsed value (PartialEq) is the whole assertion.
29733002
#[test]
29743003
fn test_json_deserialize_managed_host_states() {
3004+
let machine_id =
3005+
MachineId::from_str("fm100ds7blqjsadm2uuh3qqbf1h7k8pmf47um6v9uckrg7l03po8mhqgvng")
3006+
.unwrap();
3007+
29753008
scenarios!(
29763009
run = |s| serde_json::from_str::<ManagedHostState>(s).map_err(drop);
29773010
"assigned booting with discovery image, default retry" {
@@ -2990,6 +3023,30 @@ mod tests {
29903023
}),
29913024
}
29923025

3026+
"dpu reprovision host boot configure state" {
3027+
r#"{"state":"dpureprovision","dpu_states":{"states":{"fm100ds7blqjsadm2uuh3qqbf1h7k8pmf47um6v9uckrg7l03po8mhqgvng":{"configurehostboot":{"retry_count":2}}}}}"# => Yields(ManagedHostState::DPUReprovision {
3028+
dpu_states: DpuReprovisionStates {
3029+
states: HashMap::from([(
3030+
machine_id,
3031+
ReprovisionState::ConfigureHostBoot { retry_count: 2 },
3032+
)]),
3033+
},
3034+
}),
3035+
}
3036+
3037+
"dpu reprovision host boot unlock default state" {
3038+
r#"{"state":"dpureprovision","dpu_states":{"states":{"fm100ds7blqjsadm2uuh3qqbf1h7k8pmf47um6v9uckrg7l03po8mhqgvng":{"unlockhostforbootrepair":{}}}}}"# => Yields(ManagedHostState::DPUReprovision {
3039+
dpu_states: DpuReprovisionStates {
3040+
states: HashMap::from([(
3041+
machine_id,
3042+
ReprovisionState::UnlockHostForBootRepair {
3043+
unlock_host_state: UnlockHostState::DisableLockdown,
3044+
},
3045+
)]),
3046+
},
3047+
}),
3048+
}
3049+
29933050
"host init polling bios setup, default retry" {
29943051
r#"{"state":"hostinit","machine_state":{"state":"pollingbiossetup"}}"# => Yields(ManagedHostState::HostInit {
29953052
machine_state: MachineState::PollingBiosSetup { retry_count: 0 },

crates/api/files/bf.cfg

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,11 +165,69 @@ write_files:
165165
/bin/rm /run/cloud-init.done
166166
fi
167167
}
168+
function ensure_ethernet_link_type()
169+
{
170+
local log_file="/var/log/forge/link-type.log"
171+
mkdir -p /var/log/forge
172+
173+
# Find the DPU config device before any management VRF setup depends on Ethernet.
174+
local mst_device
175+
mst_device="$(compgen -G '/dev/mst/mt*_pciconf0' | head -n 1)"
176+
if [[ -z "${mst_device}" ]]
177+
then
178+
echo "No mst pciconf device found for LINK_TYPE normalization" | tee -a "${log_file}"
179+
return
180+
fi
181+
182+
# Non-VPI hardware may not expose LINK_TYPE, so treat query failures as non-fatal.
183+
local link_config
184+
if ! link_config="$(/usr/bin/mlxconfig -d "${mst_device}" q 2>&1)"
185+
then
186+
echo "LINK_TYPE query unsupported; continuing" | tee -a "${log_file}"
187+
echo "${link_config}" | tee -a "${log_file}"
188+
return
189+
fi
190+
echo "${link_config}" >> "${log_file}"
191+
192+
# Apply Ethernet mode only to exposed ports not already reporting Ethernet.
193+
local link_type_keys=()
194+
local link_type_key
195+
local link_type_value
196+
for link_type_key in LINK_TYPE_P1 LINK_TYPE_P2
197+
do
198+
link_type_value="$(printf '%s\n' "${link_config}" | awk -v key="${link_type_key}" '$1 == key { print $2; exit }')"
199+
if [[ -n "${link_type_value}" && "${link_type_value}" != *"(2)"* ]]
200+
then
201+
link_type_keys+=("${link_type_key}")
202+
fi
203+
done
204+
205+
local link_type_changed=0
206+
for link_type_key in "${link_type_keys[@]}"
207+
do
208+
echo "Normalizing ${link_type_key} to Ethernet" | tee -a "${log_file}"
209+
if /usr/bin/mlxconfig -y -d "${mst_device}" set "${link_type_key}=2"
210+
then
211+
link_type_changed=1
212+
else
213+
echo "Failed to normalize ${link_type_key}; continuing" | tee -a "${log_file}"
214+
fi
215+
done
216+
217+
if [[ "${link_type_changed}" == "1" ]]
218+
then
219+
trap - EXIT
220+
reboot
221+
# Keep cloud-init incomplete; the controller restart path will re-enter PXE with the persisted LINK_TYPE setting.
222+
exit 1
223+
fi
224+
}
168225
trap cleanup EXIT
169226
set -e
170227
date -s '@{{ seconds_since_epoch }}'
171228
hwclock --systohc
172229
/usr/bin/bfrec --capsule /lib/firmware/mellanox/boot/capsule/efi_sbkeysync.cap || true
230+
ensure_ethernet_link_type
173231
/usr/bin/bfcfg
174232
ip vrf exec mgmt curl --retry 5 --retry-all-errors -v -o /opt/forge/forge_root.pem {{ pxe_url }}/api/v0/tls/root_ca
175233
echo "downloaded forge root cert"

0 commit comments

Comments
 (0)