Skip to content

Commit 20f849c

Browse files
r2k1Copilot
andauthored
fix(e2e): unblock Windows sysprep when VMAgentDisabler.dll load stalls (#8544)
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 0dfdff8 commit 20f849c

3 files changed

Lines changed: 205 additions & 50 deletions

File tree

e2e/scenario_win_test.go

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -330,6 +330,34 @@ func Test_Windows2022_VHDCaching(t *testing.T) {
330330
})
331331
}
332332

333+
func Test_Windows2025Gen2_VHDCaching(t *testing.T) {
334+
RunScenario(t, &Scenario{
335+
Description: "VHD Caching - Windows Server 2025 Gen2",
336+
Config: Config{
337+
Cluster: ClusterAzureNetwork,
338+
VHD: config.VHDWindows2025Gen2,
339+
VHDCaching: true,
340+
VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) {
341+
vmss.SKU.Capacity = to.Ptr[int64](2)
342+
},
343+
BootstrapConfigMutator: func(_ *Cluster, configuration *datamodel.NodeBootstrappingConfiguration) {
344+
Windows2025BootstrapConfigMutator(t, configuration)
345+
},
346+
Validator: func(ctx context.Context, s *Scenario) {
347+
ValidateWindowsVersionFromWindowsSettings(ctx, s, "2025-gen2")
348+
ValidateWindowsProductName(ctx, s, "Windows Server 2025 Datacenter")
349+
ValidateWindowsDisplayVersion(ctx, s, "24H2")
350+
ValidateFileHasContent(ctx, s, "/k/kubeletstart.ps1", "--container-runtime=remote")
351+
ValidateWindowsProcessHasCliArguments(ctx, s, "kubelet.exe", []string{"--rotate-certificates=true", "--client-ca-file=c:\\k\\ca.crt"})
352+
ValidateCiliumIsNotRunningWindows(ctx, s)
353+
ValidateDotnetNotInstalledWindows(ctx, s)
354+
ValidateWindowsSystemServicesRestartConfiguration(ctx, s)
355+
ValidateCollectWindowsLogsScript(ctx, s)
356+
},
357+
},
358+
})
359+
}
360+
333361
// Test_Windows2022_VHDCaching_LegacyTLSBootstrap exercises Windows PIS /
334362
// VHD-cached provisioning with secure TLS bootstrap disabled, forcing kubelet
335363
// to use the legacy bootstrap-token path. Catches regressions in the two-stage

e2e/test_helpers.go

Lines changed: 167 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -599,43 +599,188 @@ func createVMExtensionLinuxAKSNode(ctx context.Context, location *string) (*armc
599599
}, nil
600600
}
601601

602-
// RunCommand executes a command on the VMSS VM with instance ID "0" and returns the raw JSON response from Azure
603-
// Unlike default approach, it doesn't use SSH and uses Azure tooling
604-
// This approach is generally slower, but it works even if SSH is not available
605-
func RunCommand(ctx context.Context, s *Scenario, command string) (armcompute.RunCommandResult, error) {
602+
// RunCommand executes a script on the VMSS VM with the configured instance ID via the
603+
// Azure VMSS RunCommand v2 API (VirtualMachineRunCommand resource). This is the API
604+
// already used by production aks-rp PIS code; using it here keeps test and production
605+
// on the same surface and avoids the v1 RunCommand extension's failure modes
606+
// (e.g. the Microsoft.CPlat.Core/RunCommandWindows "Keyset does not exist" error
607+
// fixed by ADO PR https://msazure.visualstudio.com/CloudNativeCompute/_git/aks-rp/pullrequest/15721814).
608+
//
609+
// Unlike SSH-based exec, this works even when WinRM/SSH are unavailable (e.g. mid-sysprep).
610+
// It is generally slower than SSH because each call creates a VirtualMachineRunCommand
611+
// resource on the VM and waits for it to provision.
612+
func RunCommand(ctx context.Context, s *Scenario, command string) (armcompute.VirtualMachineRunCommandInstanceView, error) {
606613
s.T.Helper()
614+
rg := *s.Runtime.Cluster.Model.Properties.NodeResourceGroup
615+
instanceID := *s.Runtime.VM.VM.InstanceID
616+
// VirtualMachineRunCommand resources persist on the VM until explicitly deleted;
617+
// use a unique name per call so concurrent / repeated calls don't collide, and
618+
// best-effort delete it on the way out below.
619+
runCommandName := fmt.Sprintf("e2e-runcmd-%d", time.Now().UnixNano())
607620
start := time.Now()
608621
defer func() {
609-
elapsed := time.Since(start)
610-
toolkit.Logf(ctx, "Command %q took %s", command, elapsed)
622+
toolkit.Logf(ctx, "RunCommand %s took %s", runCommandName, time.Since(start))
611623
}()
612624

613-
runPoller, err := config.Azure.VMSSVM.BeginRunCommand(ctx, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, *s.Runtime.VM.VM.InstanceID, armcompute.RunCommandInput{
614-
CommandID: func() *string {
615-
if s.IsWindows() {
616-
return to.Ptr("RunPowerShellScript")
617-
}
618-
return to.Ptr("RunShellScript")
619-
}(),
620-
Script: []*string{to.Ptr(command)},
621-
}, nil)
625+
runCmd := armcompute.VirtualMachineRunCommand{
626+
Location: to.Ptr(s.Location),
627+
Properties: &armcompute.VirtualMachineRunCommandProperties{
628+
Source: &armcompute.VirtualMachineRunCommandScriptSource{
629+
Script: to.Ptr(command),
630+
},
631+
AsyncExecution: to.Ptr(false),
632+
},
633+
}
634+
635+
poller, err := config.Azure.VMSSVMRunCommands.BeginCreateOrUpdate(ctx, rg, s.Runtime.VMSSName, instanceID, runCommandName, runCmd, nil)
622636
if err != nil {
623-
return armcompute.RunCommandResult{}, fmt.Errorf("failed to run command on Windows VM for image creation: %w", err)
637+
return armcompute.VirtualMachineRunCommandInstanceView{}, fmt.Errorf("failed to start RunCommand on VMSS VM: %w", err)
638+
}
639+
defer func() {
640+
// Best-effort cleanup: VirtualMachineRunCommand resources persist on the VM
641+
// otherwise and can accumulate across many RunCommand calls in a single run.
642+
// Use a fresh context so we still clean up if the caller's ctx is cancelled.
643+
cleanupCtx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
644+
defer cancel()
645+
if _, derr := config.Azure.VMSSVMRunCommands.BeginDelete(cleanupCtx, rg, s.Runtime.VMSSName, instanceID, runCommandName, nil); derr != nil {
646+
toolkit.Logf(ctx, "best-effort RunCommand %s delete failed: %v", runCommandName, derr)
647+
}
648+
}()
649+
if _, err := poller.PollUntilDone(ctx, nil); err != nil {
650+
return armcompute.VirtualMachineRunCommandInstanceView{}, fmt.Errorf("failed to wait for RunCommand on VMSS VM: %w", err)
624651
}
625652

626-
runResp, err := runPoller.PollUntilDone(ctx, nil)
653+
// CreateOrUpdate (PUT) never includes InstanceView in the response — it's only
654+
// returned by Get when $expand=instanceView is set. Fetch it explicitly so we
655+
// get stdout/stderr/exit code.
656+
getResp, err := config.Azure.VMSSVMRunCommands.Get(ctx, rg, s.Runtime.VMSSName, instanceID, runCommandName, &armcompute.VirtualMachineScaleSetVMRunCommandsClientGetOptions{
657+
Expand: to.Ptr("instanceView"),
658+
})
627659
if err != nil {
628-
return runResp.RunCommandResult, fmt.Errorf("failed to run command on Windows VM for image creation: %w", err)
660+
return armcompute.VirtualMachineRunCommandInstanceView{}, fmt.Errorf("failed to get RunCommand instance view: %w", err)
661+
}
662+
if getResp.Properties == nil || getResp.Properties.InstanceView == nil {
663+
return armcompute.VirtualMachineRunCommandInstanceView{}, errors.New("RunCommand result missing instance view")
629664
}
630-
return runResp.RunCommandResult, err
665+
view := *getResp.Properties.InstanceView
666+
return view, runCommandScriptError(view)
631667
}
632668

669+
// runCommandScriptError converts a RunCommand instance view into an error if the
670+
// script itself failed. The ARM CreateOrUpdate operation reports success as long as
671+
// the extension was able to run the script — a non-zero exit, throw, or timeout
672+
// inside the script lives in ExecutionState / ExitCode and is otherwise invisible
673+
// to callers using require.NoError. See:
674+
// https://learn.microsoft.com/en-us/azure/virtual-machines/windows/run-command-managed
675+
// ("InstanceView.ExecutionState: Status of user's Run Command script. ...
676+
//
677+
// ProvisioningState: Status of general extension provisioning end to end").
678+
func runCommandScriptError(view armcompute.VirtualMachineRunCommandInstanceView) error {
679+
state := armcompute.ExecutionStateUnknown
680+
if view.ExecutionState != nil {
681+
state = *view.ExecutionState
682+
}
683+
exitCode := int32(0)
684+
if view.ExitCode != nil {
685+
exitCode = *view.ExitCode
686+
}
687+
if state == armcompute.ExecutionStateSucceeded && exitCode == 0 {
688+
return nil
689+
}
690+
output := ""
691+
if view.Output != nil {
692+
output = strings.TrimSpace(*view.Output)
693+
}
694+
stderr := ""
695+
if view.Error != nil {
696+
stderr = strings.TrimSpace(*view.Error)
697+
}
698+
msg := ""
699+
if view.ExecutionMessage != nil {
700+
msg = strings.TrimSpace(*view.ExecutionMessage)
701+
}
702+
return fmt.Errorf("RunCommand script failed: executionState=%s exitCode=%d message=%q stdout=%q stderr=%q",
703+
state, exitCode, msg, output, stderr)
704+
}
705+
706+
// windowsSysprepScript runs Sysprep /generalize on the test VM. It pre-emptively drops
707+
// any SysPrepExternal\Generalize provider entry pointing at VMAgentDisabler.dll: when
708+
// the DLL can't be loaded, Sysprep stalls past our vmssCtx deadline. The same workaround
709+
// has lived in vhdbuilder/packer/windows/sysprep.ps1 since 2020 (PR #429).
710+
//
711+
// Pre-cleanup of C:\Windows\Panther and unattend.xml follows
712+
// https://learn.microsoft.com/en-us/azure/virtual-machines/generalize, which notes
713+
// that stale Panther logs can cause Sysprep to fail and that custom answer files
714+
// aren't supported in this step. The ImageState poll handles Server 2022 where
715+
// Sysprep /quit can return before background SetupHost.exe finishes generalizing.
716+
const windowsSysprepScript = `
717+
$ErrorActionPreference = 'Stop'
718+
719+
# Best-effort: drop broken SysPrepExternal\Generalize providers that point at
720+
# VMAgentDisabler.dll. When the DLL can't be loaded Sysprep /generalize hangs
721+
# ~14m. Registry cleanup failures are logged but must not abort sysprep itself.
722+
try {
723+
$path = 'Registry::HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Windows\CurrentVersion\Setup\SysPrepExternal\Generalize'
724+
if (Test-Path $path) {
725+
foreach ($name in (Get-Item -Path $path).Property) {
726+
$value = Get-ItemPropertyValue -Path $path -Name $name
727+
if ($value -like '*VMAgentDisabler.dll*') {
728+
Write-Host "Removing broken generalize provider $name -> $value"
729+
Remove-ItemProperty -Path $path -Name $name
730+
}
731+
}
732+
}
733+
} catch {
734+
Write-Warning "Failed to clean SysPrepExternal\Generalize entries: $_"
735+
}
736+
737+
# Per https://learn.microsoft.com/en-us/azure/virtual-machines/generalize:
738+
# stale Panther logs can cause Sysprep to fail; custom unattend files unsupported.
739+
Remove-Item "$env:SystemRoot\Panther" -Recurse -Force -ErrorAction SilentlyContinue
740+
Remove-Item "$env:SystemRoot\System32\Sysprep\unattend.xml" -Force -ErrorAction SilentlyContinue
741+
742+
# /quit (not /shutdown) so RunCommand can return; deallocate happens separately.
743+
# $LASTEXITCODE isn't reliable after Sysprep.exe /quit — sysprep launches a
744+
# background SetupHost.exe and returns before generalization completes. The
745+
# ImageState poll below is the authoritative success signal (same as
746+
# vhdbuilder/packer/windows/sysprep.ps1).
747+
& "$env:SystemRoot\System32\Sysprep\Sysprep.exe" /oobe /generalize /mode:vm /quiet /quit
748+
749+
# On Server 2022, sysprep /quit can return before background SetupHost.exe
750+
# finishes generalizing. Wait for the registry state to confirm before letting
751+
# the caller deallocate and capture the disk. Same pattern as
752+
# vhdbuilder/packer/windows/sysprep.ps1 (in-tree since 2020).
753+
$deadline = (Get-Date).AddMinutes(10)
754+
$last = $null
755+
while ($true) {
756+
$state = (Get-ItemProperty 'HKLM:\SOFTWARE\Microsoft\Windows\CurrentVersion\Setup\State').ImageState
757+
if ($state -ne $last) {
758+
Write-Host "ImageState=$state"
759+
$last = $state
760+
}
761+
if ($state -eq 'IMAGE_STATE_GENERALIZE_RESEAL_TO_OOBE') { break }
762+
if ((Get-Date) -gt $deadline) {
763+
throw "Sysprep did not reach IMAGE_STATE_GENERALIZE_RESEAL_TO_OOBE within 10 minutes (last state: $state)"
764+
}
765+
Start-Sleep -Seconds 10
766+
}
767+
`
768+
633769
func CreateImage(ctx context.Context, s *Scenario) *config.Image {
634770
if s.IsWindows() {
635771
s.T.Log("Running sysprep on Windows VM...")
636-
res, err := RunCommand(ctx, s, `C:\Windows\System32\Sysprep\Sysprep.exe /oobe /generalize /mode:vm /quiet /quit;`)
637-
resJson, _ := json.MarshalIndent(res, "", " ")
638-
s.T.Logf("Sysprep result: %s", string(resJson))
772+
res, err := RunCommand(ctx, s, windowsSysprepScript)
773+
var stdout, stderr string
774+
if res.Output != nil {
775+
stdout = strings.TrimSpace(*res.Output)
776+
}
777+
if res.Error != nil {
778+
stderr = strings.TrimSpace(*res.Error)
779+
}
780+
s.T.Logf("Sysprep stdout: %s", stdout)
781+
if stderr != "" {
782+
s.T.Logf("Sysprep stderr: %s", stderr)
783+
}
639784
require.NoErrorf(s.T, err, "failed to run sysprep on Windows VM for image creation")
640785
}
641786

e2e/validators.go

Lines changed: 10 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,6 @@ import (
1616
"testing"
1717
"time"
1818

19-
"github.com/Azure/azure-sdk-for-go/sdk/azcore/to"
20-
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7"
2119
"github.com/Masterminds/semver/v3"
2220
"github.com/samber/lo"
2321
"github.com/tidwall/gjson"
@@ -2383,16 +2381,12 @@ else
23832381
exit 1
23842382
fi`)
23852383
require.NoError(s.T, err, "Failed to run command to check sshd_config")
2386-
respJson, err := resp.MarshalJSON()
2387-
require.NoError(s.T, err, "Failed to marshal response")
2388-
s.T.Logf("Run command output: %s", string(respJson))
2389-
2390-
// Parse the JSON response to extract the output and exit code
2391-
respString := string(respJson)
2384+
stdout := lo.FromPtr(resp.Output)
2385+
s.T.Logf("Run command stdout: %s\nstderr: %s", stdout, lo.FromPtr(resp.Error))
23922386

23932387
// Check if the command execution was successful by looking for our success message in the output
2394-
if !strings.Contains(respString, "SUCCESS: PubkeyAuthentication is disabled") {
2395-
s.T.Fatalf("PubkeyAuthentication is not properly disabled. Full response: %s", respString)
2388+
if !strings.Contains(stdout, "SUCCESS: PubkeyAuthentication is disabled") {
2389+
s.T.Fatalf("PubkeyAuthentication is not properly disabled. stdout: %s", stdout)
23962390
}
23972391

23982392
// Part 2. Check cannot SSH with private key (expect failure)
@@ -2411,9 +2405,7 @@ func ValidateSSHServiceDisabled(ctx context.Context, s *Scenario) {
24112405

24122406
// Use VMSS RunCommand to check SSH service status directly on the node
24132407
// Ubuntu uses 'ssh' as service name, while AzureLinux and Mariner use 'sshd'
2414-
runPoller, err := config.Azure.VMSSVM.BeginRunCommand(ctx, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, *s.Runtime.VM.VM.InstanceID, armcompute.RunCommandInput{
2415-
CommandID: to.Ptr("RunShellScript"),
2416-
Script: []*string{to.Ptr(`#!/bin/bash
2408+
resp, err := RunCommand(ctx, s, `#!/bin/bash
24172409
# Determine the correct SSH service name based on the distro
24182410
# Ubuntu uses 'ssh', AzureLinux and Mariner use 'sshd'
24192411
if [ -f /etc/os-release ]; then
@@ -2447,24 +2439,14 @@ if echo "$status_output" | grep -q "Active: inactive (dead)"; then
24472439
else
24482440
echo "FAILED: SSH service is not inactive"
24492441
exit 1
2450-
fi`)},
2451-
}, nil)
2442+
fi`)
24522443
require.NoError(s.T, err, "Failed to run command to check SSH service status")
2453-
2454-
runResp, err := runPoller.PollUntilDone(ctx, nil)
2455-
require.NoError(s.T, err, "Failed to complete command to check SSH service status")
2456-
2457-
// Parse the response to check the result
2458-
respJson, err := runResp.MarshalJSON()
2459-
require.NoError(s.T, err, "Failed to marshal run command response")
2460-
s.T.Logf("Run command output: %s", string(respJson))
2461-
2462-
// Parse the JSON response to extract the output
2463-
respString := string(respJson)
2444+
stdout := lo.FromPtr(resp.Output)
2445+
s.T.Logf("Run command stdout: %s\nstderr: %s", stdout, lo.FromPtr(resp.Error))
24642446

24652447
// Check if the command execution was successful by looking for our success message in the output
2466-
if !strings.Contains(respString, "SUCCESS: SSH service is disabled and stopped") {
2467-
s.T.Fatalf("SSH service is not properly disabled and stopped. Full response: %s", respString)
2448+
if !strings.Contains(stdout, "SUCCESS: SSH service is disabled and stopped") {
2449+
s.T.Fatalf("SSH service is not properly disabled and stopped. stdout: %s", stdout)
24682450
}
24692451

24702452
s.T.Logf("SSH service is properly disabled and stopped as expected")

0 commit comments

Comments
 (0)