@@ -599,43 +599,188 @@ func createVMExtensionLinuxAKSNode(ctx context.Context, location *string) (*armc
599599 }, nil
600600}
601601
602- // RunCommand executes a command on the VMSS VM with instance ID "0" and returns the raw JSON response from Azure
603- // Unlike default approach, it doesn't use SSH and uses Azure tooling
604- // This approach is generally slower, but it works even if SSH is not available
605- func RunCommand (ctx context.Context , s * Scenario , command string ) (armcompute.RunCommandResult , error ) {
602+ // RunCommand executes a script on the VMSS VM with the configured instance ID via the
603+ // Azure VMSS RunCommand v2 API (VirtualMachineRunCommand resource). This is the API
604+ // already used by production aks-rp PIS code; using it here keeps test and production
605+ // on the same surface and avoids the v1 RunCommand extension's failure modes
606+ // (e.g. the Microsoft.CPlat.Core/RunCommandWindows "Keyset does not exist" error
607+ // fixed by ADO PR https://msazure.visualstudio.com/CloudNativeCompute/_git/aks-rp/pullrequest/15721814).
608+ //
609+ // Unlike SSH-based exec, this works even when WinRM/SSH are unavailable (e.g. mid-sysprep).
610+ // It is generally slower than SSH because each call creates a VirtualMachineRunCommand
611+ // resource on the VM and waits for it to provision.
612+ func RunCommand (ctx context.Context , s * Scenario , command string ) (armcompute.VirtualMachineRunCommandInstanceView , error ) {
606613 s .T .Helper ()
614+ rg := * s .Runtime .Cluster .Model .Properties .NodeResourceGroup
615+ instanceID := * s .Runtime .VM .VM .InstanceID
616+ // VirtualMachineRunCommand resources persist on the VM until explicitly deleted;
617+ // use a unique name per call so concurrent / repeated calls don't collide, and
618+ // best-effort delete it on the way out below.
619+ runCommandName := fmt .Sprintf ("e2e-runcmd-%d" , time .Now ().UnixNano ())
607620 start := time .Now ()
608621 defer func () {
609- elapsed := time .Since (start )
610- toolkit .Logf (ctx , "Command %q took %s" , command , elapsed )
622+ toolkit .Logf (ctx , "RunCommand %s took %s" , runCommandName , time .Since (start ))
611623 }()
612624
613- runPoller , err := config .Azure .VMSSVM .BeginRunCommand (ctx , * s .Runtime .Cluster .Model .Properties .NodeResourceGroup , s .Runtime .VMSSName , * s .Runtime .VM .VM .InstanceID , armcompute.RunCommandInput {
614- CommandID : func () * string {
615- if s .IsWindows () {
616- return to .Ptr ("RunPowerShellScript" )
617- }
618- return to .Ptr ("RunShellScript" )
619- }(),
620- Script : []* string {to .Ptr (command )},
621- }, nil )
625+ runCmd := armcompute.VirtualMachineRunCommand {
626+ Location : to .Ptr (s .Location ),
627+ Properties : & armcompute.VirtualMachineRunCommandProperties {
628+ Source : & armcompute.VirtualMachineRunCommandScriptSource {
629+ Script : to .Ptr (command ),
630+ },
631+ AsyncExecution : to .Ptr (false ),
632+ },
633+ }
634+
635+ poller , err := config .Azure .VMSSVMRunCommands .BeginCreateOrUpdate (ctx , rg , s .Runtime .VMSSName , instanceID , runCommandName , runCmd , nil )
622636 if err != nil {
623- return armcompute.RunCommandResult {}, fmt .Errorf ("failed to run command on Windows VM for image creation: %w" , err )
637+ return armcompute.VirtualMachineRunCommandInstanceView {}, fmt .Errorf ("failed to start RunCommand on VMSS VM: %w" , err )
638+ }
639+ defer func () {
640+ // Best-effort cleanup: VirtualMachineRunCommand resources persist on the VM
641+ // otherwise and can accumulate across many RunCommand calls in a single run.
642+ // Use a fresh context so we still clean up if the caller's ctx is cancelled.
643+ cleanupCtx , cancel := context .WithTimeout (context .Background (), 2 * time .Minute )
644+ defer cancel ()
645+ if _ , derr := config .Azure .VMSSVMRunCommands .BeginDelete (cleanupCtx , rg , s .Runtime .VMSSName , instanceID , runCommandName , nil ); derr != nil {
646+ toolkit .Logf (ctx , "best-effort RunCommand %s delete failed: %v" , runCommandName , derr )
647+ }
648+ }()
649+ if _ , err := poller .PollUntilDone (ctx , nil ); err != nil {
650+ return armcompute.VirtualMachineRunCommandInstanceView {}, fmt .Errorf ("failed to wait for RunCommand on VMSS VM: %w" , err )
624651 }
625652
626- runResp , err := runPoller .PollUntilDone (ctx , nil )
653+ // CreateOrUpdate (PUT) never includes InstanceView in the response — it's only
654+ // returned by Get when $expand=instanceView is set. Fetch it explicitly so we
655+ // get stdout/stderr/exit code.
656+ getResp , err := config .Azure .VMSSVMRunCommands .Get (ctx , rg , s .Runtime .VMSSName , instanceID , runCommandName , & armcompute.VirtualMachineScaleSetVMRunCommandsClientGetOptions {
657+ Expand : to .Ptr ("instanceView" ),
658+ })
627659 if err != nil {
628- return runResp .RunCommandResult , fmt .Errorf ("failed to run command on Windows VM for image creation: %w" , err )
660+ return armcompute.VirtualMachineRunCommandInstanceView {}, fmt .Errorf ("failed to get RunCommand instance view: %w" , err )
661+ }
662+ if getResp .Properties == nil || getResp .Properties .InstanceView == nil {
663+ return armcompute.VirtualMachineRunCommandInstanceView {}, errors .New ("RunCommand result missing instance view" )
629664 }
630- return runResp .RunCommandResult , err
665+ view := * getResp .Properties .InstanceView
666+ return view , runCommandScriptError (view )
631667}
632668
669+ // runCommandScriptError converts a RunCommand instance view into an error if the
670+ // script itself failed. The ARM CreateOrUpdate operation reports success as long as
671+ // the extension was able to run the script — a non-zero exit, throw, or timeout
672+ // inside the script lives in ExecutionState / ExitCode and is otherwise invisible
673+ // to callers using require.NoError. See:
674+ // https://learn.microsoft.com/en-us/azure/virtual-machines/windows/run-command-managed
675+ // ("InstanceView.ExecutionState: Status of user's Run Command script. ...
676+ //
677+ // ProvisioningState: Status of general extension provisioning end to end").
678+ func runCommandScriptError (view armcompute.VirtualMachineRunCommandInstanceView ) error {
679+ state := armcompute .ExecutionStateUnknown
680+ if view .ExecutionState != nil {
681+ state = * view .ExecutionState
682+ }
683+ exitCode := int32 (0 )
684+ if view .ExitCode != nil {
685+ exitCode = * view .ExitCode
686+ }
687+ if state == armcompute .ExecutionStateSucceeded && exitCode == 0 {
688+ return nil
689+ }
690+ output := ""
691+ if view .Output != nil {
692+ output = strings .TrimSpace (* view .Output )
693+ }
694+ stderr := ""
695+ if view .Error != nil {
696+ stderr = strings .TrimSpace (* view .Error )
697+ }
698+ msg := ""
699+ if view .ExecutionMessage != nil {
700+ msg = strings .TrimSpace (* view .ExecutionMessage )
701+ }
702+ return fmt .Errorf ("RunCommand script failed: executionState=%s exitCode=%d message=%q stdout=%q stderr=%q" ,
703+ state , exitCode , msg , output , stderr )
704+ }
705+
706+ // windowsSysprepScript runs Sysprep /generalize on the test VM. It pre-emptively drops
707+ // any SysPrepExternal\Generalize provider entry pointing at VMAgentDisabler.dll: when
708+ // the DLL can't be loaded, Sysprep stalls past our vmssCtx deadline. The same workaround
709+ // has lived in vhdbuilder/packer/windows/sysprep.ps1 since 2020 (PR #429).
710+ //
711+ // Pre-cleanup of C:\Windows\Panther and unattend.xml follows
712+ // https://learn.microsoft.com/en-us/azure/virtual-machines/generalize, which notes
713+ // that stale Panther logs can cause Sysprep to fail and that custom answer files
714+ // aren't supported in this step. The ImageState poll handles Server 2022 where
715+ // Sysprep /quit can return before background SetupHost.exe finishes generalizing.
716+ const windowsSysprepScript = `
717+ $ErrorActionPreference = 'Stop'
718+
719+ # Best-effort: drop broken SysPrepExternal\Generalize providers that point at
720+ # VMAgentDisabler.dll. When the DLL can't be loaded Sysprep /generalize hangs
721+ # ~14m. Registry cleanup failures are logged but must not abort sysprep itself.
722+ try {
723+ $path = 'Registry::HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Windows\CurrentVersion\Setup\SysPrepExternal\Generalize'
724+ if (Test-Path $path) {
725+ foreach ($name in (Get-Item -Path $path).Property) {
726+ $value = Get-ItemPropertyValue -Path $path -Name $name
727+ if ($value -like '*VMAgentDisabler.dll*') {
728+ Write-Host "Removing broken generalize provider $name -> $value"
729+ Remove-ItemProperty -Path $path -Name $name
730+ }
731+ }
732+ }
733+ } catch {
734+ Write-Warning "Failed to clean SysPrepExternal\Generalize entries: $_"
735+ }
736+
737+ # Per https://learn.microsoft.com/en-us/azure/virtual-machines/generalize:
738+ # stale Panther logs can cause Sysprep to fail; custom unattend files unsupported.
739+ Remove-Item "$env:SystemRoot\Panther" -Recurse -Force -ErrorAction SilentlyContinue
740+ Remove-Item "$env:SystemRoot\System32\Sysprep\unattend.xml" -Force -ErrorAction SilentlyContinue
741+
742+ # /quit (not /shutdown) so RunCommand can return; deallocate happens separately.
743+ # $LASTEXITCODE isn't reliable after Sysprep.exe /quit — sysprep launches a
744+ # background SetupHost.exe and returns before generalization completes. The
745+ # ImageState poll below is the authoritative success signal (same as
746+ # vhdbuilder/packer/windows/sysprep.ps1).
747+ & "$env:SystemRoot\System32\Sysprep\Sysprep.exe" /oobe /generalize /mode:vm /quiet /quit
748+
749+ # On Server 2022, sysprep /quit can return before background SetupHost.exe
750+ # finishes generalizing. Wait for the registry state to confirm before letting
751+ # the caller deallocate and capture the disk. Same pattern as
752+ # vhdbuilder/packer/windows/sysprep.ps1 (in-tree since 2020).
753+ $deadline = (Get-Date).AddMinutes(10)
754+ $last = $null
755+ while ($true) {
756+ $state = (Get-ItemProperty 'HKLM:\SOFTWARE\Microsoft\Windows\CurrentVersion\Setup\State').ImageState
757+ if ($state -ne $last) {
758+ Write-Host "ImageState=$state"
759+ $last = $state
760+ }
761+ if ($state -eq 'IMAGE_STATE_GENERALIZE_RESEAL_TO_OOBE') { break }
762+ if ((Get-Date) -gt $deadline) {
763+ throw "Sysprep did not reach IMAGE_STATE_GENERALIZE_RESEAL_TO_OOBE within 10 minutes (last state: $state)"
764+ }
765+ Start-Sleep -Seconds 10
766+ }
767+ `
768+
633769func CreateImage (ctx context.Context , s * Scenario ) * config.Image {
634770 if s .IsWindows () {
635771 s .T .Log ("Running sysprep on Windows VM..." )
636- res , err := RunCommand (ctx , s , `C:\Windows\System32\Sysprep\Sysprep.exe /oobe /generalize /mode:vm /quiet /quit;` )
637- resJson , _ := json .MarshalIndent (res , "" , " " )
638- s .T .Logf ("Sysprep result: %s" , string (resJson ))
772+ res , err := RunCommand (ctx , s , windowsSysprepScript )
773+ var stdout , stderr string
774+ if res .Output != nil {
775+ stdout = strings .TrimSpace (* res .Output )
776+ }
777+ if res .Error != nil {
778+ stderr = strings .TrimSpace (* res .Error )
779+ }
780+ s .T .Logf ("Sysprep stdout: %s" , stdout )
781+ if stderr != "" {
782+ s .T .Logf ("Sysprep stderr: %s" , stderr )
783+ }
639784 require .NoErrorf (s .T , err , "failed to run sysprep on Windows VM for image creation" )
640785 }
641786
0 commit comments