Skip to content

Commit a176495

Browse files
mergify[bot]pchila
andauthored
Fix managed rollback test (#11218) (#11255)
* Add logging to stop/start operation for restartAgentNTimes in Test*UpgradeRollbackOnRestarts * Do not reset lastPid at every iteration * Switch to restarting based on version in TestFleetManagedUpgradeRollbackOnRestarts Move from a fixed number of restarts to restarting only if the agent matches the target version in the managed upgrade rollback integration test. This is done to avoid +-1 errors restarting the agent to simulate only the upgrade agent crashing, while we can keep restarting for a given time (controlled by a context with timeout) if the agent matches a particular version. This should make the test more reliable and avoid weird interaction between the restart timing and the watcher polling. (cherry picked from commit 36a2197) Co-authored-by: Paolo Chilà <[email protected]>
1 parent ada237e commit a176495

File tree

4 files changed

+110
-35
lines changed

4 files changed

+110
-35
lines changed

internal/pkg/agent/application/upgrade/watcher.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ func (ch *AgentWatcher) Run(ctx context.Context) {
8686

8787
ch.connectCounter = 0
8888
ch.lostCounter = 0
89+
ch.lastPid = -1
8990

9091
// tracking of an error runs in a separate goroutine, because
9192
// the call to `watch.Recv` blocks and a timer is needed
@@ -143,7 +144,6 @@ func (ch *AgentWatcher) Run(ctx context.Context) {
143144

144145
LOOP:
145146
for {
146-
ch.lastPid = -1
147147
connectTimer := time.NewTimer(ch.checkInterval)
148148
select {
149149
case <-ctx.Done():

pkg/testing/fixture.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ type Fixture struct {
5555
srcPackage string
5656
workDir string
5757
extractDir string
58+
socketPath string
5859

5960
installed bool
6061
installOpts *InstallOpts
@@ -186,12 +187,21 @@ func NewFixture(t *testing.T, version string, opts ...FixtureOpt) (*Fixture, err
186187
}
187188

188189
// Client returns the Elastic Agent communication client.
190+
// This client is shared across multiple calls to Client()
189191
func (f *Fixture) Client() client.Client {
190192
f.cMx.RLock()
191193
defer f.cMx.RUnlock()
192194
return f.c
193195
}
194196

197+
// NewClient returns a new Elastic Agent communication client.
198+
// The client is NOT shared across multiple calls but a new instance is allocated every time
199+
func (f *Fixture) NewClient() client.Client {
200+
f.cMx.Lock()
201+
defer f.cMx.Unlock()
202+
return client.New(client.WithAddress(f.socketPath))
203+
}
204+
195205
// Version returns the Elastic Agent version.
196206
func (f *Fixture) Version() string {
197207
return f.version
@@ -1136,6 +1146,12 @@ func (f *Fixture) prepareComponents(workDir string, components ...UsableComponen
11361146
return nil
11371147
}
11381148

1149+
func (f *Fixture) setSocketPath(socketPath string) {
1150+
f.cMx.Lock()
1151+
defer f.cMx.Unlock()
1152+
f.socketPath = socketPath
1153+
}
1154+
11391155
func (f *Fixture) setClient(c client.Client) {
11401156
f.cMx.Lock()
11411157
defer f.cMx.Unlock()
@@ -1605,6 +1621,7 @@ type AgentBinaryVersion struct {
16051621
Commit string `yaml:"commit"`
16061622
BuildTime string `yaml:"build_time"`
16071623
Snapshot bool `yaml:"snapshot"`
1624+
Fips bool `yaml:"fips"`
16081625
}
16091626

16101627
// String returns the version string.

pkg/testing/fixture_install.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,7 @@ func (f *Fixture) installNoPkgManager(ctx context.Context, installOpts *InstallO
278278
socketPath = paths.ControlSocketFromPath(runtime.GOOS, f.workDir)
279279
}
280280
c := client.New(client.WithAddress(socketPath))
281+
f.setSocketPath(socketPath)
281282
f.setClient(c)
282283

283284
f.t.Cleanup(func() {

testing/integration/ess/upgrade_rollback_test.go

Lines changed: 91 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -452,9 +452,13 @@ func managedRollbackRestartTest(ctx context.Context, t *testing.T, info *define.
452452
// we expect ErrSkipGrace at this point, meaning that we finished installing but didn't wait for agent to become healthy
453453
require.ErrorIs(t, err, ErrSkipGrace, "managed upgrade failed with unexpected error")
454454

455-
// A few seconds after the upgrade, deliberately restart upgraded Agent a
456-
// couple of times to simulate Agent crashing.
457-
restartAgentNTimes(t, 3, 10*time.Second)
455+
installedAgentClient := from.NewClient()
456+
targetVersion, err := to.ExecVersion(ctx)
457+
require.NoError(t, err, "failed to get target version")
458+
restartContext, cancel := context.WithTimeout(t.Context(), 1*time.Minute)
459+
defer cancel()
460+
// restart the agent only if it matches the (upgraded) target version
461+
restartAgentVersion(restartContext, t, installedAgentClient, targetVersion.Binary, 10*time.Second)
458462

459463
// wait for the agent to be healthy and correct version
460464
err = upgradetest.WaitHealthyAndVersion(ctx, from, startVersionInfo.Binary, 2*time.Minute, 10*time.Second, t)
@@ -591,42 +595,95 @@ func restartAgentNTimes(t *testing.T, noOfRestarts int, sleepBetweenIterations t
591595

592596
for restartIdx := 0; restartIdx < noOfRestarts; restartIdx++ {
593597
time.Sleep(sleepBetweenIterations)
598+
restartAgent(t, topPath, 5*time.Minute)
599+
}
600+
}
594601

595-
t.Logf("Stopping agent via service to simulate crashing")
596-
err := install.StopService(topPath, install.DefaultStopTimeout, install.DefaultStopInterval)
597-
if err != nil && runtime.GOOS == define.Windows && strings.Contains(err.Error(), "The service has not been started.") {
598-
// Due to the quick restarts every 10 seconds its possible that this is faster than Windows
599-
// can handle. Decrementing restartIdx means that the loop will occur again.
600-
t.Logf("Got an allowed error on Windows: %s", err)
601-
err = nil
602+
func restartAgent(t *testing.T, topPath string, operationTimeout time.Duration) {
603+
t.Logf("Stopping agent via service to simulate crashing")
604+
stopRequested := time.Now()
605+
err := install.StopService(topPath, install.DefaultStopTimeout, install.DefaultStopInterval)
606+
if err != nil && runtime.GOOS == define.Windows && strings.Contains(err.Error(), "The service has not been started.") {
607+
// Due to the quick restarts every sleepBetweenIterations its possible that this is faster than Windows
608+
// can handle. Decrementing restartIdx means that the loop will occur again.
609+
t.Logf("Got an allowed error on Windows: %s", err)
610+
err = nil
611+
}
612+
require.NoError(t, err)
613+
614+
// ensure that it's stopped before starting it again
615+
var status service.Status
616+
var statusErr error
617+
require.Eventuallyf(t, func() bool {
618+
status, statusErr = install.StatusService(topPath)
619+
if statusErr != nil {
620+
return false
602621
}
603-
require.NoError(t, err)
622+
return status != service.StatusRunning
623+
}, operationTimeout, 500*time.Millisecond, "service never fully stopped (status: %v): %s", status, statusErr)
624+
t.Logf("Stopped agent via service. Took roughly %s", time.Since(stopRequested))
625+
626+
// start it again
627+
t.Logf("Starting agent via service to simulate crashing")
628+
startRequested := time.Now()
629+
err = install.StartService(topPath)
630+
require.NoError(t, err)
604631

605-
// ensure that it's stopped before starting it again
606-
var status service.Status
607-
var statusErr error
608-
require.Eventuallyf(t, func() bool {
609-
status, statusErr = install.StatusService(topPath)
610-
if statusErr != nil {
611-
return false
612-
}
613-
return status != service.StatusRunning
614-
}, 5*time.Minute, 1*time.Second, "service never fully stopped (status: %v): %s", status, statusErr)
615-
t.Logf("Stopped agent via service to simulate crashing")
632+
// ensure that it's started before next loop
633+
require.Eventuallyf(t, func() bool {
634+
status, statusErr = install.StatusService(topPath)
635+
if statusErr != nil {
636+
return false
637+
}
638+
return status == service.StatusRunning
639+
}, operationTimeout, 500*time.Millisecond, "service never fully started (status: %v): %s", status, statusErr)
640+
t.Logf("Started agent after stopping to simulate crashing. Took roughly %s", time.Since(startRequested))
641+
}
616642

617-
// start it again
618-
t.Logf("Starting agent via service to simulate crashing")
619-
err = install.StartService(topPath)
620-
require.NoError(t, err)
643+
func restartAgentVersion(ctx context.Context, t *testing.T, client client.Client, targetVersion atesting.AgentBinaryVersion, restartInterval time.Duration) {
644+
topPath := paths.Top()
645+
646+
ticker := time.NewTicker(restartInterval)
647+
defer ticker.Stop()
621648

622-
// ensure that it's started before next loop
623-
require.Eventuallyf(t, func() bool {
624-
status, statusErr = install.StatusService(topPath)
625-
if statusErr != nil {
626-
return false
649+
for {
650+
select {
651+
case <-ctx.Done():
652+
t.Log("restart context is done, returning")
653+
return
654+
655+
case <-ticker.C:
656+
if !versionMatch(ctx, t, client, targetVersion) {
657+
// version of running agent does not match the target, continue to the next iteration
658+
continue
627659
}
628-
return status == service.StatusRunning
629-
}, 5*time.Minute, 1*time.Second, "service never fully started (status: %v): %s", status, statusErr)
630-
t.Logf("Started agent via service to simulate crashing")
660+
661+
restartAgent(t, topPath, restartInterval)
662+
}
663+
664+
}
665+
}
666+
667+
func versionMatch(ctx context.Context, t *testing.T, c client.Client, targetVersion atesting.AgentBinaryVersion) bool {
668+
err := c.Connect(ctx)
669+
if err != nil {
670+
t.Logf("failed to connect to agent: %v", err)
671+
return false
672+
}
673+
defer c.Disconnect()
674+
675+
actualVersion, err := c.Version(ctx)
676+
if err != nil {
677+
t.Logf("failed to detect agent version: %v", err)
678+
return false
679+
}
680+
681+
if actualVersion.Version != targetVersion.Version ||
682+
actualVersion.Snapshot != targetVersion.Snapshot ||
683+
actualVersion.Commit != targetVersion.Commit ||
684+
actualVersion.Fips != targetVersion.Fips {
685+
t.Logf("actual agent version %+v does not match target agent version %+v, skipping restart", actualVersion, targetVersion)
686+
return false
631687
}
688+
return true
632689
}

0 commit comments

Comments
 (0)