Skip to content
Draft
100 changes: 100 additions & 0 deletions e2e/scenario_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -451,6 +451,106 @@ func runScenarioACLGRID(t *testing.T, vmSize string) {
})
}

// Test_ACL_ABUpdate runs the A/B update scenario on the config.VHDACLGen2TL VHD.
// The test is skipped when loadCOSIURL returns an empty URL for the
// "cosi-publishing-info-acl-tl-gen2" artifact.
func Test_ACL_ABUpdate(t *testing.T) {
	const cosiArtifact = "cosi-publishing-info-acl-tl-gen2"

	cosiURL := loadCOSIURL(t, cosiArtifact)
	if len(cosiURL) == 0 {
		t.Skip("COSI artifact not available for acl-tl-gen2, skipping A/B update test")
	}

	// Rewrites the VMSS properties through addTrustedLaunchToVMSS.
	mutateVMSS := func(vmss *armcompute.VirtualMachineScaleSet) {
		vmss.Properties = addTrustedLaunchToVMSS(vmss.Properties)
	}
	// Hands the resolved COSI URL to the shared A/B update validator.
	validate := func(ctx context.Context, s *Scenario) {
		ValidateACLABUpdate(ctx, s, cosiURL)
	}

	sc := &Scenario{
		Description: "Tests full A/B update lifecycle: stage COSI, finalize (reboot), verify volume switch",
		Config: Config{
			Cluster:         ClusterKubenet,
			VHD:             config.VHDACLGen2TL,
			VMConfigMutator: mutateVMSS,
			Validator:       validate,
		},
	}
	RunScenario(t, sc)
}

// Test_ACL_ABUpdate_ARM64 runs the A/B update scenario on the
// config.VHDACLArm64Gen2TL VHD with NVMe enabled and the Standard_D2pds_v6 size.
// The test is skipped when loadCOSIURL returns an empty URL for the
// "cosi-publishing-info-acl-arm64-tl-gen2" artifact.
func Test_ACL_ABUpdate_ARM64(t *testing.T) {
	const (
		cosiArtifact = "cosi-publishing-info-acl-arm64-tl-gen2"
		// armVMSize is applied to both the bootstrap config and the VMSS SKU.
		armVMSize = "Standard_D2pds_v6"
	)

	cosiURL := loadCOSIURL(t, cosiArtifact)
	if len(cosiURL) == 0 {
		t.Skip("COSI artifact not available for acl-arm64-tl-gen2, skipping A/B update test")
	}

	// Marks the node bootstrap configuration as ARM64 with the matching VM size.
	mutateBootstrap := func(_ *Cluster, nbc *datamodel.NodeBootstrappingConfiguration) {
		nbc.AgentPoolProfile.VMSize = armVMSize
		nbc.IsARM64 = true
	}
	// Rewrites the VMSS properties through addTrustedLaunchToVMSS and pins the SKU.
	mutateVMSS := func(vmss *armcompute.VirtualMachineScaleSet) {
		vmss.Properties = addTrustedLaunchToVMSS(vmss.Properties)
		vmss.SKU.Name = to.Ptr(armVMSize)
	}
	// Hands the resolved COSI URL to the shared A/B update validator.
	validate := func(ctx context.Context, s *Scenario) {
		ValidateACLABUpdate(ctx, s, cosiURL)
	}

	sc := &Scenario{
		Description: "Tests full A/B update lifecycle on ARM64: stage COSI, finalize (reboot), verify volume switch",
		Config: Config{
			Cluster:                ClusterKubenet,
			VHD:                    config.VHDACLArm64Gen2TL,
			UseNVMe:                true,
			BootstrapConfigMutator: mutateBootstrap,
			VMConfigMutator:        mutateVMSS,
			Validator:              validate,
		},
	}
	RunScenario(t, sc)
}

// Test_ACL_ABUpdate_FIPS runs the A/B update scenario on the
// config.VHDACLGen2FIPSTL VHD with the LocalDNS profile cleared.
// The test is skipped when loadCOSIURL returns an empty URL for the
// "cosi-publishing-info-acl-fips-tl-gen2" artifact.
func Test_ACL_ABUpdate_FIPS(t *testing.T) {
	const cosiArtifact = "cosi-publishing-info-acl-fips-tl-gen2"

	cosiURL := loadCOSIURL(t, cosiArtifact)
	if len(cosiURL) == 0 {
		t.Skip("COSI artifact not available for acl-fips-tl-gen2, skipping A/B update test")
	}

	// Removes the LocalDNS profile from the agent pool for this scenario.
	mutateBootstrap := func(_ *Cluster, nbc *datamodel.NodeBootstrappingConfiguration) {
		nbc.AgentPoolProfile.LocalDNSProfile = nil
	}
	// Rewrites the VMSS properties through addTrustedLaunchToVMSS.
	mutateVMSS := func(vmss *armcompute.VirtualMachineScaleSet) {
		vmss.Properties = addTrustedLaunchToVMSS(vmss.Properties)
	}
	// Hands the resolved COSI URL to the shared A/B update validator.
	validate := func(ctx context.Context, s *Scenario) {
		ValidateACLABUpdate(ctx, s, cosiURL)
	}

	sc := &Scenario{
		Description: "Tests full A/B update lifecycle on FIPS: stage COSI, finalize (reboot), verify volume switch",
		Config: Config{
			Cluster:                ClusterKubenet,
			VHD:                    config.VHDACLGen2FIPSTL,
			BootstrapConfigMutator: mutateBootstrap,
			VMConfigMutator:        mutateVMSS,
			Validator:              validate,
		},
	}
	RunScenario(t, sc)
}

// Test_ACL_ABUpdate_ARM64_FIPS runs the A/B update scenario on the
// config.VHDACLArm64Gen2FIPSTL VHD with NVMe enabled, the Standard_D2pds_v6
// size, and the LocalDNS profile cleared.
// The test is skipped when loadCOSIURL returns an empty URL for the
// "cosi-publishing-info-acl-arm64-fips-tl-gen2" artifact.
func Test_ACL_ABUpdate_ARM64_FIPS(t *testing.T) {
	const (
		cosiArtifact = "cosi-publishing-info-acl-arm64-fips-tl-gen2"
		// armVMSize is applied to both the bootstrap config and the VMSS SKU.
		armVMSize = "Standard_D2pds_v6"
	)

	cosiURL := loadCOSIURL(t, cosiArtifact)
	if len(cosiURL) == 0 {
		t.Skip("COSI artifact not available for acl-arm64-fips-tl-gen2, skipping A/B update test")
	}

	// Marks the bootstrap configuration as ARM64 and removes the LocalDNS profile.
	mutateBootstrap := func(_ *Cluster, nbc *datamodel.NodeBootstrappingConfiguration) {
		nbc.AgentPoolProfile.VMSize = armVMSize
		nbc.IsARM64 = true
		nbc.AgentPoolProfile.LocalDNSProfile = nil
	}
	// Rewrites the VMSS properties through addTrustedLaunchToVMSS and pins the SKU.
	mutateVMSS := func(vmss *armcompute.VirtualMachineScaleSet) {
		vmss.Properties = addTrustedLaunchToVMSS(vmss.Properties)
		vmss.SKU.Name = to.Ptr(armVMSize)
	}
	// Hands the resolved COSI URL to the shared A/B update validator.
	validate := func(ctx context.Context, s *Scenario) {
		ValidateACLABUpdate(ctx, s, cosiURL)
	}

	sc := &Scenario{
		Description: "Tests full A/B update lifecycle on ARM64 FIPS: stage COSI, finalize (reboot), verify volume switch",
		Config: Config{
			Cluster:                ClusterKubenet,
			VHD:                    config.VHDACLArm64Gen2FIPSTL,
			UseNVMe:                true,
			BootstrapConfigMutator: mutateBootstrap,
			VMConfigMutator:        mutateVMSS,
			Validator:              validate,
		},
	}
	RunScenario(t, sc)
}

func Test_AzureLinuxV3_SecureTLSBootstrapping_BootstrapToken_Fallback(t *testing.T) {
RunScenario(t, &Scenario{
Description: "Tests that a node using a AzureLinuxV3 Gen2 VHD can be properly bootstrapped even if secure TLS bootstrapping fails",
Expand Down
168 changes: 168 additions & 0 deletions e2e/validators.go
Original file line number Diff line number Diff line change
Expand Up @@ -498,6 +498,174 @@ func ValidateACLFIPSEnabled(ctx context.Context, s *Scenario) {
)
}

// ValidateACLABUpdate drives an A/B update on the scenario VM and checks the
// outcome. The sequence is:
//
//   - check the image (os-release variant, dm-verity, two usr partitions,
//     trident-update.service) and record the current boot_id and trident status;
//   - write /etc/trident/e2e-update-config.yaml referencing cosiURL;
//   - run "trident update ... --allowed-operations stage";
//   - run "trident update ... --allowed-operations finalize", tolerating an SSH
//     disconnect because finalize is expected to reboot the node;
//   - drop the old SSH client, wait for the node to be Ready, and reconnect;
//   - validate the post-reboot trident status.
//
// A non-zero finalize exit code, or a finalize error that does not look like a
// disconnect, aborts the test via s.T.Fatalf.
func ValidateACLABUpdate(ctx context.Context, s *Scenario, cosiURL string) {
	s.T.Helper()

	const (
		bootIDCmd        = "cat /proc/sys/kernel/random/boot_id"
		tridentStatusCmd = "sudo trident get status"
		stageCmd         = "sudo trident update /etc/trident/e2e-update-config.yaml --allowed-operations stage"
		finalizeCmd      = "sudo trident update /etc/trident/e2e-update-config.yaml --allowed-operations finalize"
	)

	// --- Step 1: image identity and A/B partition layout ---
	ValidateFileHasContent(ctx, s, "/etc/os-release", "VARIANT_ID=azurecontainerlinux")
	// A verity target must show up in the device-mapper status.
	execScriptOnVMForScenarioValidateExitCode(
		ctx, s,
		`sudo dmsetup status | grep -q verity`,
		0,
		"expected dm-verity to be active on root filesystem",
	)
	// Count partition labels mentioning "usr"; the count must start with a digit 2-9.
	execScriptOnVMForScenarioValidateExitCode(
		ctx, s,
		`lsblk -ln -o NAME,PARTLABEL | grep -c usr | grep -qE '^[2-9]'`,
		0,
		"expected at least 2 usr partitions for A/B layout",
	)
	// The Trident update unit file has to be on disk.
	ValidateFileExists(ctx, s, "/usr/lib/systemd/system/trident-update.service")

	// Remember the boot_id so the reboot can be detected later.
	preRebootBootID := strings.TrimSpace(
		execScriptOnVMForScenarioValidateExitCode(ctx, s, bootIDCmd, 0, "failed to read boot_id").stdout,
	)
	s.T.Logf("Initial boot_id: %s", preRebootBootID)

	// Log the trident status before touching anything.
	statusBefore := execScriptOnVMForScenarioValidateExitCode(ctx, s,
		tridentStatusCmd, 0, "failed to get initial trident status")
	s.T.Logf("Initial trident status:\n%s", statusBefore.stdout)

	// --- Step 2: host config pointing at the COSI URL ---
	// The quoted heredoc delimiter keeps the shell from expanding the body.
	// NOTE(review): url and sha384 are not indented under "image:" as written here;
	// confirm the intended YAML nesting.
	const hostConfigTemplate = `sudo mkdir -p /etc/trident && sudo tee /etc/trident/e2e-update-config.yaml >/dev/null <<'TRIDENT_EOF'
image:
url: "%s"
sha384: "ignored"
TRIDENT_EOF`
	writeConfigCmd := fmt.Sprintf(hostConfigTemplate, cosiURL)
	execScriptOnVMForScenarioValidateExitCode(ctx, s, writeConfigCmd, 0, "failed to write host config")

	// --- Step 3: stage ---
	s.T.Log("=== Staging A/B update ===")
	execScriptOnVMForScenarioValidateExitCode(ctx, s,
		stageCmd, 0, "trident update --allowed-operations stage failed")
	s.T.Log("Stage completed successfully")

	// Log the status again so the staged state is visible in the test output.
	statusStaged := execScriptOnVMForScenarioValidateExitCode(ctx, s,
		tridentStatusCmd, 0, "failed to get trident status after stage")
	s.T.Logf("Post-stage trident status:\n%s", statusStaged.stdout)

	// --- Step 4: finalize (expected to reboot the node) ---
	s.T.Log("=== Finalizing A/B update (expecting reboot) ===")
	finalizeOut, finalizeErr := execScriptOnVm(ctx, s, s.Runtime.VM, finalizeCmd)

	switch {
	case finalizeErr != nil:
		// An error is acceptable only if it looks like the SSH session being
		// torn down by the reboot.
		msg := finalizeErr.Error()
		rebootDisconnect := false
		for _, marker := range []string{
			"connection reset",
			"connection refused",
			"EOF",
			"closed",
			"broken pipe",
			"System is going down",
		} {
			if strings.Contains(msg, marker) {
				rebootDisconnect = true
				break
			}
		}
		if !rebootDisconnect {
			s.T.Fatalf("trident finalize failed with unexpected error: %v", finalizeErr)
		}
		s.T.Logf("Finalize caused SSH disconnect (expected due to reboot): %v", finalizeErr)
	case finalizeOut != nil:
		// The command returned before the connection dropped; it must have succeeded.
		if finalizeOut.exitCode != "0" {
			s.T.Fatalf("trident finalize returned exit code %s (expected 0 or SSH disconnect)\nstdout: %s\nstderr: %s",
				finalizeOut.exitCode, finalizeOut.stdout, finalizeOut.stderr)
		}
		s.T.Log("Finalize returned successfully before reboot")
	}

	// --- Step 5: reconnect ---
	s.T.Log("=== Waiting for node to come back after reboot ===")
	// Discard the pre-reboot client before a new one is established.
	cleanupBastionTunnel(s.Runtime.VM.SSHClient)
	s.Runtime.VM.SSHClient = nil

	// Block until the node reports Ready in the cluster.
	s.Runtime.Cluster.Kube.WaitUntilNodeReady(ctx, s.T, s.Runtime.VMSSName)

	// Dial SSH again, retrying until the boot_id differs from the recorded one.
	reconnectSSHAfterReboot(ctx, s, preRebootBootID)

	// --- Step 6: post-reboot validation ---
	s.T.Log("=== Validating post-reboot A/B update status ===")
	validateABUpdatePostReboot(ctx, s, preRebootBootID)
}

// reconnectSSHAfterReboot re-establishes the SSH connection to the scenario VM
// after a reboot. It dials through the bastion every 15 seconds for up to 10
// minutes, and accepts a connection only once the VM reports a non-empty
// boot_id that differs from initialBootID. The accepted client is stored in
// s.Runtime.VM.SSHClient; connections that fail the check are cleaned up.
//
// The test fails via require.NoError if no acceptable connection is obtained
// before the timeout or before ctx is done.
func reconnectSSHAfterReboot(ctx context.Context, s *Scenario, initialBootID string) {
	s.T.Helper()

	const (
		pollInterval = 15 * time.Second
		pollTimeout  = 10 * time.Minute
	)

	// PollUntilContextTimeout applies pollTimeout itself, so no extra
	// context.WithTimeout wrapper is needed around ctx.
	err := wait.PollUntilContextTimeout(ctx, pollInterval, pollTimeout, true, func(pollCtx context.Context) (bool, error) {
		// NOTE(review): the client kept on success is dialed with the poll-scoped
		// context, which is cancelled once polling returns; confirm that
		// DialSSHOverBastion does not tie the connection lifetime to it.
		sshClient, dialErr := DialSSHOverBastion(pollCtx, s.Runtime.Cluster.Bastion, s.Runtime.VM.PrivateIP, config.VMSSHPrivateKey)
		if dialErr != nil {
			s.T.Logf("SSH not yet available: %v", dialErr)
			return false, nil
		}

		// Read boot_id over the new connection to confirm an actual reboot.
		result, cmdErr := runSSHCommand(pollCtx, sshClient, "cat /proc/sys/kernel/random/boot_id", false)
		if cmdErr != nil {
			cleanupBastionTunnel(sshClient)
			s.T.Logf("SSH connected but command failed: %v", cmdErr)
			return false, nil
		}

		newBootID := strings.TrimSpace(result.stdout)
		switch newBootID {
		case "":
			// An empty read is not evidence of a reboot; without this guard it
			// would compare unequal to initialBootID and be accepted.
			cleanupBastionTunnel(sshClient)
			s.T.Log("SSH connected but boot_id was empty, retrying...")
			return false, nil
		case initialBootID:
			cleanupBastionTunnel(sshClient)
			s.T.Log("SSH connected but boot_id unchanged — node hasn't rebooted yet, retrying...")
			return false, nil
		}

		s.T.Logf("Node rebooted successfully. New boot_id: %s", newBootID)
		s.Runtime.VM.SSHClient = sshClient
		return true, nil
	})
	require.NoError(s.T, err, "failed to re-establish SSH after reboot within timeout")
}

// validateABUpdatePostReboot verifies the VM state after the A/B update reboot:
// the boot_id must differ from initialBootID, and the output of
// "sudo trident get status" must contain both "Provisioned" and "VolumeB".
// The first failed check stops the test through require.
func validateABUpdatePostReboot(ctx context.Context, s *Scenario, initialBootID string) {
	s.T.Helper()

	// A different boot_id proves the node went through a reboot.
	currentBootID := strings.TrimSpace(
		execScriptOnVMForScenarioValidateExitCode(ctx, s,
			"cat /proc/sys/kernel/random/boot_id", 0, "failed to read boot_id after reboot").stdout,
	)
	require.NotEqual(s.T, initialBootID, currentBootID, "boot_id should have changed after reboot")

	// Fetch the full trident status and log it for diagnostics.
	status := execScriptOnVMForScenarioValidateExitCode(ctx, s,
		"sudo trident get status", 0, "failed to get trident status after reboot").stdout
	s.T.Logf("Post-reboot trident status:\n%s", status)

	// Checked in order: servicing state first, then the active volume.
	// The volume check assumes the node started on VolumeA.
	// NOTE(review): these are substring matches against the whole status output;
	// confirm neither token can appear in an unrelated field.
	expectations := []struct{ token, failure string }{
		{"Provisioned", "expected servicingState to be Provisioned after A/B update"},
		{"VolumeB", "expected abActiveVolume to show VolumeB after A/B update"},
	}
	for _, want := range expectations {
		require.Contains(s.T, status, want.token, want.failure)
	}
}

func ValidateFileDoesNotExist(ctx context.Context, s *Scenario, fileName string) {
s.T.Helper()
if fileExist(ctx, s, fileName) {
Expand Down
Loading