Skip to content

Commit 36a2197

Browse files
authored
Fix managed rollback test (#11218)
* Add logging to stop/start operation for restartAgentNTimes in Test*UpgradeRollbackOnRestarts * Do not reset lastPid at every iteration * Switch to restarting based on version in TestFleetManagedUpgradeRollbackOnRestarts Move from a fixed number of restarts to restarting only if the agent matches the target version in the managed upgrade rollback integration test. This is done to avoid +-1 errors restarting the agent to simulate only the upgrade agent crashing, while we can keep restarting for a given time (controlled by a context with timeout) if the agent matches a particular version. This should make the test more reliable and avoid weird interaction between the restart timing and the watcher polling.
1 parent 1059de9 commit 36a2197

File tree

4 files changed

+110
-35
lines changed

4 files changed

+110
-35
lines changed

internal/pkg/agent/application/upgrade/watcher.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ func (ch *AgentWatcher) Run(ctx context.Context) {
8686

8787
ch.connectCounter = 0
8888
ch.lostCounter = 0
89+
ch.lastPid = -1
8990

9091
// tracking of an error runs in a separate goroutine, because
9192
// the call to `watch.Recv` blocks and a timer is needed
@@ -143,7 +144,6 @@ func (ch *AgentWatcher) Run(ctx context.Context) {
143144

144145
LOOP:
145146
for {
146-
ch.lastPid = -1
147147
connectTimer := time.NewTimer(ch.checkInterval)
148148
select {
149149
case <-ctx.Done():

pkg/testing/fixture.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ type Fixture struct {
5555
srcPackage string
5656
workDir string
5757
extractDir string
58+
socketPath string
5859

5960
installed bool
6061
installOpts *InstallOpts
@@ -186,12 +187,21 @@ func NewFixture(t *testing.T, version string, opts ...FixtureOpt) (*Fixture, err
186187
}
187188

188189
// Client returns the Elastic Agent communication client.
190+
// This client is shared across multiple calls to Client()
189191
func (f *Fixture) Client() client.Client {
190192
f.cMx.RLock()
191193
defer f.cMx.RUnlock()
192194
return f.c
193195
}
194196

197+
// NewClient returns a new Elastic Agent communication client.
198+
// The client is NOT shared across multiple calls but a new instance is allocated every time
199+
func (f *Fixture) NewClient() client.Client {
200+
f.cMx.Lock()
201+
defer f.cMx.Unlock()
202+
return client.New(client.WithAddress(f.socketPath))
203+
}
204+
195205
// Version returns the Elastic Agent version.
196206
func (f *Fixture) Version() string {
197207
return f.version
@@ -1136,6 +1146,12 @@ func (f *Fixture) prepareComponents(workDir string, components ...UsableComponen
11361146
return nil
11371147
}
11381148

1149+
func (f *Fixture) setSocketPath(socketPath string) {
1150+
f.cMx.Lock()
1151+
defer f.cMx.Unlock()
1152+
f.socketPath = socketPath
1153+
}
1154+
11391155
func (f *Fixture) setClient(c client.Client) {
11401156
f.cMx.Lock()
11411157
defer f.cMx.Unlock()
@@ -1605,6 +1621,7 @@ type AgentBinaryVersion struct {
16051621
Commit string `yaml:"commit"`
16061622
BuildTime string `yaml:"build_time"`
16071623
Snapshot bool `yaml:"snapshot"`
1624+
Fips bool `yaml:"fips"`
16081625
}
16091626

16101627
// String returns the version string.

pkg/testing/fixture_install.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,7 @@ func (f *Fixture) installNoPkgManager(ctx context.Context, installOpts *InstallO
278278
socketPath = paths.ControlSocketFromPath(runtime.GOOS, f.workDir)
279279
}
280280
c := client.New(client.WithAddress(socketPath))
281+
f.setSocketPath(socketPath)
281282
f.setClient(c)
282283

283284
f.t.Cleanup(func() {

testing/integration/ess/upgrade_rollback_test.go

Lines changed: 91 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -529,9 +529,13 @@ func managedRollbackRestartTest(ctx context.Context, t *testing.T, info *define.
529529
// we expect ErrSkipGrace at this point, meaning that we finished installing but didn't wait for agent to become healthy
530530
require.ErrorIs(t, err, ErrSkipGrace, "managed upgrade failed with unexpected error")
531531

532-
// A few seconds after the upgrade, deliberately restart upgraded Agent a
533-
// couple of times to simulate Agent crashing.
534-
restartAgentNTimes(t, 3, 10*time.Second)
532+
installedAgentClient := from.NewClient()
533+
targetVersion, err := to.ExecVersion(ctx)
534+
require.NoError(t, err, "failed to get target version")
535+
restartContext, cancel := context.WithTimeout(t.Context(), 1*time.Minute)
536+
defer cancel()
537+
// restart the agent only if it matches the (upgraded) target version
538+
restartAgentVersion(restartContext, t, installedAgentClient, targetVersion.Binary, 10*time.Second)
535539

536540
// wait for the agent to be healthy and correct version
537541
err = upgradetest.WaitHealthyAndVersion(ctx, from, startVersionInfo.Binary, 2*time.Minute, 10*time.Second, t)
@@ -657,42 +661,95 @@ func restartAgentNTimes(t *testing.T, noOfRestarts int, sleepBetweenIterations t
657661

658662
for restartIdx := 0; restartIdx < noOfRestarts; restartIdx++ {
659663
time.Sleep(sleepBetweenIterations)
664+
restartAgent(t, topPath, 5*time.Minute)
665+
}
666+
}
660667

661-
t.Logf("Stopping agent via service to simulate crashing")
662-
err := install.StopService(topPath, install.DefaultStopTimeout, install.DefaultStopInterval)
663-
if err != nil && runtime.GOOS == define.Windows && strings.Contains(err.Error(), "The service has not been started.") {
664-
// Due to the quick restarts every 10 seconds its possible that this is faster than Windows
665-
// can handle. Decrementing restartIdx means that the loop will occur again.
666-
t.Logf("Got an allowed error on Windows: %s", err)
667-
err = nil
668+
func restartAgent(t *testing.T, topPath string, operationTimeout time.Duration) {
669+
t.Logf("Stopping agent via service to simulate crashing")
670+
stopRequested := time.Now()
671+
err := install.StopService(topPath, install.DefaultStopTimeout, install.DefaultStopInterval)
672+
if err != nil && runtime.GOOS == define.Windows && strings.Contains(err.Error(), "The service has not been started.") {
673+
// Due to the quick restarts every sleepBetweenIterations its possible that this is faster than Windows
674+
// can handle. Decrementing restartIdx means that the loop will occur again.
675+
t.Logf("Got an allowed error on Windows: %s", err)
676+
err = nil
677+
}
678+
require.NoError(t, err)
679+
680+
// ensure that it's stopped before starting it again
681+
var status service.Status
682+
var statusErr error
683+
require.Eventuallyf(t, func() bool {
684+
status, statusErr = install.StatusService(topPath)
685+
if statusErr != nil {
686+
return false
668687
}
669-
require.NoError(t, err)
688+
return status != service.StatusRunning
689+
}, operationTimeout, 500*time.Millisecond, "service never fully stopped (status: %v): %s", status, statusErr)
690+
t.Logf("Stopped agent via service. Took roughly %s", time.Since(stopRequested))
691+
692+
// start it again
693+
t.Logf("Starting agent via service to simulate crashing")
694+
startRequested := time.Now()
695+
err = install.StartService(topPath)
696+
require.NoError(t, err)
670697

671-
// ensure that it's stopped before starting it again
672-
var status service.Status
673-
var statusErr error
674-
require.Eventuallyf(t, func() bool {
675-
status, statusErr = install.StatusService(topPath)
676-
if statusErr != nil {
677-
return false
678-
}
679-
return status != service.StatusRunning
680-
}, 5*time.Minute, 1*time.Second, "service never fully stopped (status: %v): %s", status, statusErr)
681-
t.Logf("Stopped agent via service to simulate crashing")
698+
// ensure that it's started before next loop
699+
require.Eventuallyf(t, func() bool {
700+
status, statusErr = install.StatusService(topPath)
701+
if statusErr != nil {
702+
return false
703+
}
704+
return status == service.StatusRunning
705+
}, operationTimeout, 500*time.Millisecond, "service never fully started (status: %v): %s", status, statusErr)
706+
t.Logf("Started agent after stopping to simulate crashing. Took roughly %s", time.Since(startRequested))
707+
}
682708

683-
// start it again
684-
t.Logf("Starting agent via service to simulate crashing")
685-
err = install.StartService(topPath)
686-
require.NoError(t, err)
709+
func restartAgentVersion(ctx context.Context, t *testing.T, client client.Client, targetVersion atesting.AgentBinaryVersion, restartInterval time.Duration) {
710+
topPath := paths.Top()
711+
712+
ticker := time.NewTicker(restartInterval)
713+
defer ticker.Stop()
687714

688-
// ensure that it's started before next loop
689-
require.Eventuallyf(t, func() bool {
690-
status, statusErr = install.StatusService(topPath)
691-
if statusErr != nil {
692-
return false
715+
for {
716+
select {
717+
case <-ctx.Done():
718+
t.Log("restart context is done, returning")
719+
return
720+
721+
case <-ticker.C:
722+
if !versionMatch(ctx, t, client, targetVersion) {
723+
// version of running agent does not match the target, continue to the next iteration
724+
continue
693725
}
694-
return status == service.StatusRunning
695-
}, 5*time.Minute, 1*time.Second, "service never fully started (status: %v): %s", status, statusErr)
696-
t.Logf("Started agent via service to simulate crashing")
726+
727+
restartAgent(t, topPath, restartInterval)
728+
}
729+
730+
}
731+
}
732+
733+
func versionMatch(ctx context.Context, t *testing.T, c client.Client, targetVersion atesting.AgentBinaryVersion) bool {
734+
err := c.Connect(ctx)
735+
if err != nil {
736+
t.Logf("failed to connect to agent: %v", err)
737+
return false
738+
}
739+
defer c.Disconnect()
740+
741+
actualVersion, err := c.Version(ctx)
742+
if err != nil {
743+
t.Logf("failed to detect agent version: %v", err)
744+
return false
745+
}
746+
747+
if actualVersion.Version != targetVersion.Version ||
748+
actualVersion.Snapshot != targetVersion.Snapshot ||
749+
actualVersion.Commit != targetVersion.Commit ||
750+
actualVersion.Fips != targetVersion.Fips {
751+
t.Logf("actual agent version %+v does not match target agent version %+v, skipping restart", actualVersion, targetVersion)
752+
return false
697753
}
754+
return true
698755
}

0 commit comments

Comments
 (0)