From 5f73f4e518b61338de0c5999736d7d328b3dd2e6 Mon Sep 17 00:00:00 2001 From: Paolo Chila Date: Thu, 25 Sep 2025 19:17:28 +0200 Subject: [PATCH 01/14] Allow for multiple directories to be specified during cleanup --- .../pkg/agent/application/upgrade/rollback.go | 29 +++++----- .../application/upgrade/rollback_test.go | 6 ++- internal/pkg/agent/cmd/mocks.go | 53 +++++++++++-------- internal/pkg/agent/cmd/watch.go | 31 +++++++++-- internal/pkg/agent/cmd/watch_impl.go | 4 +- internal/pkg/agent/cmd/watch_test.go | 9 ++-- 6 files changed, 85 insertions(+), 47 deletions(-) diff --git a/internal/pkg/agent/application/upgrade/rollback.go b/internal/pkg/agent/application/upgrade/rollback.go index 6d5e81598fd..a9542ed109e 100644 --- a/internal/pkg/agent/application/upgrade/rollback.go +++ b/internal/pkg/agent/application/upgrade/rollback.go @@ -11,6 +11,7 @@ import ( "os" "os/exec" "path/filepath" + "slices" "strings" "time" @@ -126,17 +127,21 @@ func RollbackWithOpts(ctx context.Context, log *logger.Logger, c client.Client, return nil } + if prevVersionedHome == "" { + prevVersionedHome = filepath.Join("data", fmt.Sprintf("%s-%s", agentName, prevHash)) + } + // cleanup everything except version we're rolling back into - return Cleanup(log, topDirPath, prevVersionedHome, prevHash, settings.RemoveMarker, true) + return Cleanup(log, topDirPath, settings.RemoveMarker, true, prevVersionedHome) } // Cleanup removes all artifacts and files related to a specified version. -func Cleanup(log *logger.Logger, topDirPath, currentVersionedHome, currentHash string, removeMarker, keepLogs bool) error { - return cleanup(log, topDirPath, currentVersionedHome, currentHash, removeMarker, keepLogs, afterRestartDelay) +func Cleanup(log *logger.Logger, topDirPath string, removeMarker, keepLogs bool, versionedHomesToKeep ...string) error { + return cleanup(log, topDirPath, removeMarker, keepLogs, afterRestartDelay, versionedHomesToKeep...) } -func cleanup(log *logger.Logger, topDirPath, currentVersionedHome, currentHash string, removeMarker, keepLogs bool, delay time.Duration) error { - log.Infow("Cleaning up upgrade", "hash", currentHash, "remove_marker", removeMarker) +func cleanup(log *logger.Logger, topDirPath string, removeMarker, keepLogs bool, delay time.Duration, versionedHomesToKeep ...string) error { + log.Infow("Cleaning up upgrade", "remove_marker", removeMarker) <-time.After(delay) // data directory path @@ -172,19 +177,19 @@ func cleanup(log *logger.Logger, topDirPath, currentVersionedHome, currentHash s _ = os.Remove(prevSymlink) dirPrefix := fmt.Sprintf("%s-", agentName) - var currentDir string - if currentVersionedHome != "" { - currentDir, err = filepath.Rel("data", currentVersionedHome) + + relativeHomePaths := make([]string, len(versionedHomesToKeep)) + for i, h := range versionedHomesToKeep { + relHomePath, err := filepath.Rel("data", h) if err != nil { - return fmt.Errorf("extracting elastic-agent path relative to data directory from %s: %w", currentVersionedHome, err) + return fmt.Errorf("extracting elastic-agent path relative to data directory from %s: %w", h, err) } - } else { - currentDir = fmt.Sprintf("%s-%s", agentName, currentHash) + relativeHomePaths[i] = relHomePath } var errs []error for _, dir := range subdirs { - if dir == currentDir { + if slices.Contains(relativeHomePaths, dir) { continue } diff --git a/internal/pkg/agent/application/upgrade/rollback_test.go b/internal/pkg/agent/application/upgrade/rollback_test.go index eab955d663a..df87bc5b454 100644 --- a/internal/pkg/agent/application/upgrade/rollback_test.go +++ b/internal/pkg/agent/application/upgrade/rollback_test.go @@ -219,7 +219,11 @@ func TestCleanup(t *testing.T) { require.NoError(t, err, "error loading update marker") require.NotNil(t, marker, "loaded marker must not be nil") t.Logf("Loaded update marker %+v", marker) - tt.wantErr(t, cleanup(testLogger, testTop, marker.VersionedHome, marker.Hash, tt.args.removeMarker, tt.args.keepLogs, 0), fmt.Sprintf("Cleanup(%v, %v, %v, %v)", marker.VersionedHome, marker.Hash, tt.args.removeMarker, tt.args.keepLogs)) + versionedHome := marker.VersionedHome + if versionedHome == "" { + versionedHome = filepath.Join("data", fmt.Sprintf("elastic-agent-%s", marker.Hash[:6])) + } + tt.wantErr(t, cleanup(testLogger, testTop, tt.args.removeMarker, tt.args.keepLogs, 0, versionedHome), fmt.Sprintf("Cleanup(%v, %v, %v, %v)", marker.Hash, tt.args.removeMarker, tt.args.keepLogs, versionedHome)) tt.checkAfterCleanup(t, testTop) }) } diff --git a/internal/pkg/agent/cmd/mocks.go b/internal/pkg/agent/cmd/mocks.go index 360210cbc90..8191f6c6293 100644 --- a/internal/pkg/agent/cmd/mocks.go +++ b/internal/pkg/agent/cmd/mocks.go @@ -144,16 +144,24 @@ func (_m *mockInstallationModifier) EXPECT() *mockInstallationModifier_Expecter } // Cleanup provides a mock function for the type mockInstallationModifier -func (_mock *mockInstallationModifier) Cleanup(log *logger.Logger, topDirPath string, currentVersionedHome string, currentHash string, removeMarker bool, keepLogs bool) error { - ret := _mock.Called(log, topDirPath, currentVersionedHome, currentHash, removeMarker, keepLogs) +func (_mock *mockInstallationModifier) Cleanup(log *logger.Logger, topDirPath string, removeMarker bool, keepLogs bool, versionedHomesToKeep ...string) error { + // string + _va := make([]interface{}, len(versionedHomesToKeep)) + for _i := range versionedHomesToKeep { + _va[_i] = versionedHomesToKeep[_i] + } + var _ca []interface{} + _ca = append(_ca, log, topDirPath, removeMarker, keepLogs) + _ca = append(_ca, _va...) + ret := _mock.Called(_ca...) if len(ret) == 0 { panic("no return value specified for Cleanup") } var r0 error - if returnFunc, ok := ret.Get(0).(func(*logger.Logger, string, string, string, bool, bool) error); ok { - r0 = returnFunc(log, topDirPath, currentVersionedHome, currentHash, removeMarker, keepLogs) + if returnFunc, ok := ret.Get(0).(func(*logger.Logger, string, bool, bool, ...string) error); ok { + r0 = returnFunc(log, topDirPath, removeMarker, keepLogs, versionedHomesToKeep...) } else { r0 = ret.Error(0) } @@ -168,15 +176,15 @@ type mockInstallationModifier_Cleanup_Call struct { // Cleanup is a helper method to define mock.On call // - log *logger.Logger // - topDirPath string -// - currentVersionedHome string -// - currentHash string // - removeMarker bool // - keepLogs bool -func (_e *mockInstallationModifier_Expecter) Cleanup(log interface{}, topDirPath interface{}, currentVersionedHome interface{}, currentHash interface{}, removeMarker interface{}, keepLogs interface{}) *mockInstallationModifier_Cleanup_Call { - return &mockInstallationModifier_Cleanup_Call{Call: _e.mock.On("Cleanup", log, topDirPath, currentVersionedHome, currentHash, removeMarker, keepLogs)} +// - versionedHomesToKeep ...string +func (_e *mockInstallationModifier_Expecter) Cleanup(log interface{}, topDirPath interface{}, removeMarker interface{}, keepLogs interface{}, versionedHomesToKeep ...interface{}) *mockInstallationModifier_Cleanup_Call { + return &mockInstallationModifier_Cleanup_Call{Call: _e.mock.On("Cleanup", + append([]interface{}{log, topDirPath, removeMarker, keepLogs}, versionedHomesToKeep...)...)} } -func (_c *mockInstallationModifier_Cleanup_Call) Run(run func(log *logger.Logger, topDirPath string, currentVersionedHome string, currentHash string, removeMarker bool, keepLogs bool)) *mockInstallationModifier_Cleanup_Call { +func (_c *mockInstallationModifier_Cleanup_Call) Run(run func(log *logger.Logger, topDirPath string, removeMarker bool, keepLogs bool, versionedHomesToKeep ...string)) *mockInstallationModifier_Cleanup_Call { _c.Call.Run(func(args mock.Arguments) { var arg0 *logger.Logger if args[0] != nil { @@ -186,29 +194,28 @@ func (_c *mockInstallationModifier_Cleanup_Call) Run(run func(log *logger.Logger if args[1] != nil { arg1 = args[1].(string) } - var arg2 string + var arg2 bool if args[2] != nil { - arg2 = args[2].(string) + arg2 = args[2].(bool) } - var arg3 string + var arg3 bool if args[3] != nil { - arg3 = args[3].(string) + arg3 = args[3].(bool) } - var arg4 bool - if args[4] != nil { - arg4 = args[4].(bool) - } - var arg5 bool - if args[5] != nil { - arg5 = args[5].(bool) + var arg4 []string + variadicArgs := make([]string, len(args)-4) + for i, a := range args[4:] { + if a != nil { + variadicArgs[i] = a.(string) + } } + arg4 = variadicArgs run( arg0, arg1, arg2, arg3, - arg4, - arg5, + arg4..., ) }) return _c @@ -219,7 +226,7 @@ func (_c *mockInstallationModifier_Cleanup_Call) Return(err error) *mockInstalla return _c } -func (_c *mockInstallationModifier_Cleanup_Call) RunAndReturn(run func(log *logger.Logger, topDirPath string, currentVersionedHome string, currentHash string, removeMarker bool, keepLogs bool) error) *mockInstallationModifier_Cleanup_Call { +func (_c *mockInstallationModifier_Cleanup_Call) RunAndReturn(run func(log *logger.Logger, topDirPath string, removeMarker bool, keepLogs bool, versionedHomesToKeep ...string) error) *mockInstallationModifier_Cleanup_Call { _c.Call.Return(run) return _c } diff --git a/internal/pkg/agent/cmd/watch.go b/internal/pkg/agent/cmd/watch.go index 11ac556ea0a..04f6e15533a 100644 --- a/internal/pkg/agent/cmd/watch.go +++ b/internal/pkg/agent/cmd/watch.go @@ -8,6 +8,7 @@ import ( "context" "fmt" "os" + "path/filepath" "runtime" "time" @@ -28,7 +29,6 @@ import ( "github.com/elastic/elastic-agent/internal/pkg/agent/errors" "github.com/elastic/elastic-agent/internal/pkg/cli" "github.com/elastic/elastic-agent/internal/pkg/config" - "github.com/elastic/elastic-agent/internal/pkg/release" "github.com/elastic/elastic-agent/pkg/core/logger" "github.com/elastic/elastic-agent/version" ) @@ -149,7 +149,7 @@ func WithRemoveMarker(removeMarker bool) upgrade.RollbackOption { } type installationModifier interface { - Cleanup(log *logger.Logger, topDirPath, currentVersionedHome, currentHash string, removeMarker, keepLogs bool) error + Cleanup(log *logger.Logger, topDirPath string, removeMarker, keepLogs bool, versionedHomesToKeep ...string) error Rollback(ctx context.Context, log *logger.Logger, c client.Client, topDirPath, prevVersionedHome, prevHash string, opts ...upgrade.RollbackOption) error } @@ -198,7 +198,12 @@ func watchCmd(log *logp.Logger, topDir string, cfg *configuration.UpgradeWatcher // if we're not within grace and marker is still there it might mean // that cleanup was not performed ok, cleanup everything except current version // hash is the same as hash of agent which initiated watcher. - if err := installModifier.Cleanup(log, paths.Top(), paths.VersionedHome(topDir), release.ShortCommit(), true, false); err != nil { + versionedHomesToKeep := make([]string, 0, len(marker.RollbacksAvailable)+1) + // current version needs to be kept + versionedHomesToKeep = append(versionedHomesToKeep, paths.VersionedHome(topDir)) + versionedHomesToKeep = appendAvailableRollbacks(log, marker, versionedHomesToKeep) + log.Infof("About to clean up upgrade. Keeping versioned homes: %v", versionedHomesToKeep) + if err := installModifier.Cleanup(log, paths.Top(), true, false, paths.VersionedHome(topDir)); err != nil { log.Error("clean up of prior watcher run failed", err) } // exit nicely @@ -253,13 +258,31 @@ func watchCmd(log *logp.Logger, topDir string, cfg *configuration.UpgradeWatcher // Why is this being skipped on Windows? The comment above is not clear. // issue: https://github.com/elastic/elastic-agent/issues/3027 removeMarker := !isWindows() - err = installModifier.Cleanup(log, topDir, marker.VersionedHome, marker.Hash, removeMarker, false) + newVersionedHome := marker.VersionedHome + if newVersionedHome == "" { + // the upgrade marker may have been created by an older version of agent where the versionedHome is always `data/elastic-agent-` + newVersionedHome = filepath.Join("data", fmt.Sprintf("elastic-agent-%s", marker.Hash[:6])) + } + versionedHomesToKeep := make([]string, 0, len(marker.RollbacksAvailable)+1) + versionedHomesToKeep = append(versionedHomesToKeep, newVersionedHome) + versionedHomesToKeep = appendAvailableRollbacks(log, marker, versionedHomesToKeep) + + err = installModifier.Cleanup(log, topDir, removeMarker, false, versionedHomesToKeep...) if err != nil { log.Error("cleanup after successful watch failed", err) } return err } +func appendAvailableRollbacks(log *logp.Logger, marker *upgrade.UpdateMarker, versionedHomesToKeep []string) []string { + // add any available rollbacks + for versionedHome, ra := range marker.RollbacksAvailable { + log.Debugf("Adding available rollback %s:%+v to the directories to keep during cleanup", versionedHome, ra) + versionedHomesToKeep = append(versionedHomesToKeep, versionedHome) + } + return versionedHomesToKeep +} + func rollback(log *logp.Logger, topDir string, client client.Client, installModifier installationModifier, versionedHome string) error { // TODO: there should be some sanity check in rollback functions like the installation we are going back to should exist and work log.Infof("rolling back to %s", versionedHome) diff --git a/internal/pkg/agent/cmd/watch_impl.go b/internal/pkg/agent/cmd/watch_impl.go index ebb9c04fcfb..3dfee99ed17 100644 --- a/internal/pkg/agent/cmd/watch_impl.go +++ b/internal/pkg/agent/cmd/watch_impl.go @@ -25,8 +25,8 @@ func (a upgradeAgentWatcher) Watch(ctx context.Context, tilGrace, errorCheckInte type upgradeInstallationModifier struct{} -func (a upgradeInstallationModifier) Cleanup(log *logger.Logger, topDirPath, currentVersionedHome, currentHash string, removeMarker, keepLogs bool) error { - return upgrade.Cleanup(log, topDirPath, currentVersionedHome, currentHash, removeMarker, keepLogs) +func (a upgradeInstallationModifier) Cleanup(log *logger.Logger, topDirPath string, removeMarker, keepLogs bool, versionedHomesToKeep ...string) error { + return upgrade.Cleanup(log, topDirPath, removeMarker, keepLogs, versionedHomesToKeep...) } func (a upgradeInstallationModifier) Rollback(ctx context.Context, log *logger.Logger, c client.Client, topDirPath, prevVersionedHome, prevHash string, opts ...upgrade.RollbackOption) error { diff --git a/internal/pkg/agent/cmd/watch_test.go b/internal/pkg/agent/cmd/watch_test.go index b04ce04b00d..0ebce30753f 100644 --- a/internal/pkg/agent/cmd/watch_test.go +++ b/internal/pkg/agent/cmd/watch_test.go @@ -26,7 +26,6 @@ import ( "github.com/elastic/elastic-agent/internal/pkg/agent/configuration" "github.com/elastic/elastic-agent/internal/pkg/agent/errors" "github.com/elastic/elastic-agent/internal/pkg/fleetapi" - "github.com/elastic/elastic-agent/internal/pkg/release" "github.com/elastic/elastic-agent/pkg/control/v2/client" "github.com/elastic/elastic-agent/pkg/core/logger" "github.com/elastic/elastic-agent/pkg/core/logger/loggertest" @@ -144,7 +143,7 @@ func Test_watchCmd(t *testing.T) { expectedRemoveMarkerFlag := runtime.GOOS != "windows" installModifier.EXPECT(). - Cleanup(mock.Anything, topDir, "elastic-agent-4.5.6-newver", "newver", expectedRemoveMarkerFlag, false). + Cleanup(mock.Anything, topDir, expectedRemoveMarkerFlag, false, "elastic-agent-4.5.6-newver"). Return(nil) }, args: args{ @@ -272,7 +271,7 @@ func Test_watchCmd(t *testing.T) { // topdir, prevVersionedHome and prevHash are not taken from the upgrade marker, otherwise they would be // installModifier.EXPECT(). - Cleanup(mock.Anything, paths.Top(), paths.VersionedHome(topDir), release.ShortCommit(), true, false). + Cleanup(mock.Anything, paths.Top(), true, false, paths.VersionedHome(topDir)). Return(nil) }, args: args{ @@ -308,7 +307,7 @@ func Test_watchCmd(t *testing.T) { // topdir, prevVersionedHome and prevHash are not taken from the upgrade marker, otherwise they would be // installModifier.EXPECT(). - Cleanup(mock.Anything, paths.Top(), paths.VersionedHome(topDir), release.ShortCommit(), true, false). + Cleanup(mock.Anything, paths.Top(), true, false, paths.VersionedHome(topDir)). Return(nil) }, args: args{ @@ -352,7 +351,7 @@ func Test_watchCmd(t *testing.T) { require.NoError(t, err) installModifier.EXPECT(). - Cleanup(mock.Anything, paths.Top(), paths.VersionedHome(tmpDir), release.ShortCommit(), true, false). + Cleanup(mock.Anything, paths.Top(), true, false, paths.VersionedHome(tmpDir)). Return(nil) }, args: args{ From 28e9abdd2bdbd89f9d0898626e0bf917e0738c51 Mon Sep 17 00:00:00 2001 From: Paolo Chila Date: Fri, 26 Sep 2025 16:14:39 +0200 Subject: [PATCH 02/14] refactor manual rollback function and tests on a separate file --- .../application/upgrade/manual_rollback.go | 159 ++++++++++ .../upgrade/manual_rollback_test.go | 295 ++++++++++++++++++ .../pkg/agent/application/upgrade/upgrade.go | 139 --------- .../agent/application/upgrade/upgrade_test.go | 272 ---------------- 4 files changed, 454 insertions(+), 411 deletions(-) create mode 100644 internal/pkg/agent/application/upgrade/manual_rollback.go create mode 100644 internal/pkg/agent/application/upgrade/manual_rollback_test.go diff --git a/internal/pkg/agent/application/upgrade/manual_rollback.go b/internal/pkg/agent/application/upgrade/manual_rollback.go new file mode 100644 index 00000000000..48ae8e1dc11 --- /dev/null +++ b/internal/pkg/agent/application/upgrade/manual_rollback.go @@ -0,0 +1,159 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License 2.0; +// you may not use this file except in compliance with the Elastic License 2.0. + +package upgrade + +import ( + "context" + "errors" + "fmt" + "os" + "time" + + "github.com/elastic/elastic-agent/internal/pkg/agent/application/filelock" + "github.com/elastic/elastic-agent/internal/pkg/agent/application/paths" + "github.com/elastic/elastic-agent/internal/pkg/agent/application/reexec" + "github.com/elastic/elastic-agent/internal/pkg/fleetapi" + "github.com/elastic/elastic-agent/pkg/core/logger" + "github.com/elastic/elastic-agent/pkg/version" +) + +func (u *Upgrader) rollbackToPreviousVersion(ctx context.Context, topDir string, now time.Time, version string, action *fleetapi.ActionUpgrade) (reexec.ShutdownCallbackFn, error) { + if version == "" { + return nil, ErrEmptyRollbackVersion + } + + // check that the upgrade marker exists and is accessible + updateMarkerPath := markerFilePath(paths.DataFrom(topDir)) + _, err := os.Stat(updateMarkerPath) + if err != nil { + return nil, fmt.Errorf("stat() on upgrade marker %q failed: %w", updateMarkerPath, err) + } + + // read the upgrade marker + updateMarker, err := LoadMarker(paths.DataFrom(topDir)) + if err != nil { + return nil, fmt.Errorf("loading marker: %w", err) + } + + if updateMarker == nil { + return nil, ErrNilUpdateMarker + } + + // extract the agent installs involved in the upgrade and select the most appropriate watcher executable + previous, current, err := extractAgentInstallsFromMarker(updateMarker) + if err != nil { + return nil, fmt.Errorf("extracting current and previous install details: %w", err) + } + watcherExecutable := u.watcherHelper.SelectWatcherExecutable(topDir, previous, current) + + err = withTakeOverWatcher(ctx, u.log, topDir, u.watcherHelper, func() error { + // read the upgrade marker + updateMarker, err = LoadMarker(paths.DataFrom(topDir)) + if err != nil { + return fmt.Errorf("loading marker: %w", err) + } + + if updateMarker == nil { + return ErrNilUpdateMarker + } + + if len(updateMarker.RollbacksAvailable) == 0 { + return ErrNoRollbacksAvailable + } + var selectedRollback *TTLMarker + for _, rollback := range updateMarker.RollbacksAvailable { + if rollback.Version == version && now.Before(rollback.ValidUntil) { + selectedRollback = &rollback + break + } + } + if selectedRollback == nil { + return fmt.Errorf("version %q not listed among the available rollbacks: %w", version, ErrNoRollbacksAvailable) + } + + // rollback + _, err = u.watcherHelper.InvokeWatcher(u.log, watcherExecutable, "watch", "--rollback", updateMarker.PrevVersionedHome) + if err != nil { + return fmt.Errorf("starting rollback command: %w", err) + } + u.log.Debug("rollback command started successfully, PID") + return nil + }) + + if err != nil { + // Invoke watcher again (now that we released the watcher applocks) + _, invokeWatcherErr := u.watcherHelper.InvokeWatcher(u.log, watcherExecutable) + if invokeWatcherErr != nil { + return nil, errors.Join(err, fmt.Errorf("invoking watcher: %w", invokeWatcherErr)) + } + return nil, err + } + + return nil, nil + +} + +func withTakeOverWatcher(ctx context.Context, log *logger.Logger, topDir string, watcherHelper WatcherHelper, f func() error) error { + watcherApplock, err := watcherHelper.TakeOverWatcher(ctx, log, topDir) + if err != nil { + return fmt.Errorf("taking over watcher processes: %w", err) + } + defer func(watcherApplock *filelock.AppLocker) { + releaseWatcherAppLockerErr := watcherApplock.Unlock() + if releaseWatcherAppLockerErr != nil { + log.Warnw("error releasing watcher applock", "error", releaseWatcherAppLockerErr) + } + }(watcherApplock) + + return f() +} + +func extractAgentInstallsFromMarker(updateMarker *UpdateMarker) (previous agentInstall, current agentInstall, err error) { + previousParsedVersion, err := version.ParseVersion(updateMarker.PrevVersion) + if err != nil { + return previous, current, fmt.Errorf("parsing previous version %q: %w", updateMarker.PrevVersion, err) + } + previous = agentInstall{ + parsedVersion: previousParsedVersion, + version: updateMarker.PrevVersion, + hash: updateMarker.PrevHash, + versionedHome: updateMarker.PrevVersionedHome, + } + + currentParsedVersion, err := version.ParseVersion(updateMarker.Version) + if err != nil { + return previous, current, fmt.Errorf("parsing current version %q: %w", updateMarker.Version, err) + } + current = agentInstall{ + parsedVersion: currentParsedVersion, + version: updateMarker.Version, + hash: updateMarker.Hash, + versionedHome: updateMarker.VersionedHome, + } + + return previous, current, nil +} + +func getAvailableRollbacks(rollbackWindow time.Duration, now time.Time, currentVersion string, parsedCurrentVersion *version.ParsedSemVer, currentVersionedHome string) map[string]TTLMarker { + if rollbackWindow == 0 { + // if there's no rollback window it means that no rollback should survive the watcher cleanup at the end of the grace period. + return nil + } + + if parsedCurrentVersion == nil || parsedCurrentVersion.Less(*Version_9_3_0_SNAPSHOT) { + // the version we are upgrading to does not support manual rollbacks + return nil + } + + // when multiple rollbacks will be supported, read the existing descriptor + // at this stage we can get by with a single rollback + res := make(map[string]TTLMarker, 1) + res[currentVersionedHome] = TTLMarker{ + Version: currentVersion, + ValidUntil: now.Add(rollbackWindow), + } + + return res +} diff --git a/internal/pkg/agent/application/upgrade/manual_rollback_test.go b/internal/pkg/agent/application/upgrade/manual_rollback_test.go new file mode 100644 index 00000000000..6ac0f870c3a --- /dev/null +++ b/internal/pkg/agent/application/upgrade/manual_rollback_test.go @@ -0,0 +1,295 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License 2.0; +// you may not use this file except in compliance with the Elastic License 2.0. + +package upgrade + +import ( + "io/fs" + "os" + "os/exec" + "path/filepath" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/mock" + "github.com/stretchr/testify/require" + + "github.com/elastic/elastic-agent/internal/pkg/agent/application/filelock" + "github.com/elastic/elastic-agent/internal/pkg/agent/application/info" + "github.com/elastic/elastic-agent/internal/pkg/agent/application/paths" + "github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade/artifact" + "github.com/elastic/elastic-agent/internal/pkg/agent/configuration" + "github.com/elastic/elastic-agent/pkg/core/logger/loggertest" + agtversion "github.com/elastic/elastic-agent/pkg/version" +) + +func TestManualRollback(t *testing.T) { + const updatemarkerwatching456NoRollbackAvailable = ` + version: 4.5.6 + hash: newver + versioned_home: data/elastic-agent-4.5.6-newver + updated_on: 2025-07-11T10:11:12.131415Z + prev_version: 1.2.3 + prev_hash: oldver + prev_versioned_home: data/elastic-agent-1.2.3-oldver + acked: false + action: null + details: + target_version: 4.5.6 + state: UPG_WATCHING + metadata: + retry_until: null + ` + const updatemarkerwatching456 = ` + version: 4.5.6 + hash: newver + versioned_home: data/elastic-agent-4.5.6-newver + updated_on: 2025-07-11T10:11:12.131415Z + prev_version: 1.2.3 + prev_hash: oldver + prev_versioned_home: data/elastic-agent-1.2.3-oldver + acked: false + action: null + details: + target_version: 4.5.6 + state: UPG_WATCHING + metadata: + retry_until: null + desired_outcome: UPGRADE + rollbacks_available: + "data/elastic-agent-1.2.3-oldver": + version: 1.2.3 + valid_until: 2025-07-18T10:11:12.131415Z + ` + + parsed123Version, err := agtversion.ParseVersion("1.2.3") + require.NoError(t, err) + parsed456Version, err := agtversion.ParseVersion("4.5.6") + require.NoError(t, err) + + agentInstall123 := agentInstall{ + parsedVersion: parsed123Version, + version: "1.2.3", + hash: "oldver", + versionedHome: "data/elastic-agent-1.2.3-oldver", + } + + agentInstall456 := agentInstall{ + parsedVersion: parsed456Version, + version: "4.5.6", + hash: "newver", + versionedHome: "data/elastic-agent-4.5.6-newver", + } + + // this is the updated_on timestamp in the example + nowBeforeTTL, err := time.Parse(time.RFC3339, `2025-07-11T10:11:12Z`) + require.NoError(t, err, "error parsing nowBeforeTTL") + + // the update marker yaml assume 7d TLL for rollbacks, let's make an extra day pass + nowAfterTTL := nowBeforeTTL.Add(8 * 24 * time.Hour) + + type setupF func(t *testing.T, topDir string, agent *info.MockAgent, watcherHelper *MockWatcherHelper) + type postRollbackAssertionsF func(t *testing.T, topDir string) + type testcase struct { + name string + setup setupF + artifactSettings *artifact.Config + upgradeSettings *configuration.UpgradeConfig + now time.Time + version string + wantErr assert.ErrorAssertionFunc + additionalAsserts postRollbackAssertionsF + } + + testcases := []testcase{ + { + name: "no rollback version - rollback fails", + setup: func(t *testing.T, topDir string, agent *info.MockAgent, watcherHelper *MockWatcherHelper) { + //do not setup anything here, let the rollback fail + }, + artifactSettings: artifact.DefaultConfig(), + upgradeSettings: configuration.DefaultUpgradeConfig(), + version: "", + wantErr: func(t assert.TestingT, err error, i ...interface{}) bool { + return assert.ErrorIs(t, err, ErrEmptyRollbackVersion) + }, + additionalAsserts: nil, + }, + { + name: "no update marker - rollback fails", + setup: func(t *testing.T, topDir string, agent *info.MockAgent, watcherHelper *MockWatcherHelper) { + //do not setup anything here, let the rollback fail + }, + artifactSettings: artifact.DefaultConfig(), + upgradeSettings: configuration.DefaultUpgradeConfig(), + version: "1.2.3", + wantErr: func(t assert.TestingT, err error, i ...interface{}) bool { + return assert.ErrorIs(t, err, fs.ErrNotExist) + }, + additionalAsserts: nil, + }, + { + name: "update marker is malformed - rollback fails", + setup: func(t *testing.T, topDir string, agent *info.MockAgent, watcherHelper *MockWatcherHelper) { + err := os.WriteFile(markerFilePath(paths.DataFrom(topDir)), []byte("this is not a proper YAML file"), 0600) + require.NoError(t, err, "error setting up update marker") + locker := filelock.NewAppLocker(topDir, "watcher.lock") + err = locker.TryLock() + require.NoError(t, err, "error locking initial watcher AppLocker") + // there's no takeover watcher so no expectation on that or InvokeWatcher + t.Cleanup(func() { + unlockErr := locker.Unlock() + assert.NoError(t, unlockErr, "error unlocking initial watcher AppLocker") + }) + }, + artifactSettings: artifact.DefaultConfig(), + upgradeSettings: configuration.DefaultUpgradeConfig(), + version: "1.2.3", + wantErr: assert.Error, + additionalAsserts: nil, + }, + { + name: "update marker ok but rollback available is empty - error", + setup: func(t *testing.T, topDir string, agent *info.MockAgent, watcherHelper *MockWatcherHelper) { + err := os.WriteFile(markerFilePath(paths.DataFrom(topDir)), []byte(updatemarkerwatching456NoRollbackAvailable), 0600) + require.NoError(t, err, "error setting up update marker") + locker := filelock.NewAppLocker(topDir, "watcher.lock") + err = locker.TryLock() + require.NoError(t, err, "error locking initial watcher AppLocker") + watcherHelper.EXPECT().TakeOverWatcher(t.Context(), mock.Anything, topDir).Return(locker, nil) + newerWatcherExecutable := filepath.Join(topDir, "data", "elastic-agent-4.5.6-newver", "elastic-agent") + watcherHelper.EXPECT().SelectWatcherExecutable(topDir, agentInstall123, agentInstall456).Return(newerWatcherExecutable) + watcherHelper.EXPECT().InvokeWatcher(mock.Anything, newerWatcherExecutable).Return(&exec.Cmd{Path: newerWatcherExecutable, Args: []string{"watch", "for realsies"}}, nil) + }, + artifactSettings: artifact.DefaultConfig(), + upgradeSettings: configuration.DefaultUpgradeConfig(), + version: "2.3.4-unknown", + wantErr: func(t assert.TestingT, err error, i ...interface{}) bool { + return assert.ErrorIs(t, err, ErrNoRollbacksAvailable) + }, + additionalAsserts: func(t *testing.T, topDir string) { + // marker should be untouched + filePath := markerFilePath(paths.DataFrom(topDir)) + require.FileExists(t, filePath) + markerFileBytes, readMarkerErr := os.ReadFile(filePath) + require.NoError(t, readMarkerErr) + + assert.YAMLEq(t, updatemarkerwatching456NoRollbackAvailable, string(markerFileBytes), "update marker should be untouched") + }, + }, + { + name: "update marker ok but version is not available for rollback - error", + setup: func(t *testing.T, topDir string, agent *info.MockAgent, watcherHelper *MockWatcherHelper) { + err := os.WriteFile(markerFilePath(paths.DataFrom(topDir)), []byte(updatemarkerwatching456), 0600) + require.NoError(t, err, "error setting up update marker") + locker := filelock.NewAppLocker(topDir, "watcher.lock") + err = locker.TryLock() + require.NoError(t, err, "error locking initial watcher AppLocker") + watcherHelper.EXPECT().TakeOverWatcher(t.Context(), mock.Anything, topDir).Return(locker, nil) + newerWatcherExecutable := filepath.Join(topDir, "data", "elastic-agent-4.5.6-newver", "elastic-agent") + watcherHelper.EXPECT().SelectWatcherExecutable(topDir, agentInstall123, agentInstall456).Return(newerWatcherExecutable) + watcherHelper.EXPECT().InvokeWatcher(mock.Anything, newerWatcherExecutable).Return(&exec.Cmd{Path: newerWatcherExecutable, Args: []string{"watch", "for realsies"}}, nil) + }, + artifactSettings: artifact.DefaultConfig(), + upgradeSettings: configuration.DefaultUpgradeConfig(), + version: "2.3.4-unknown", + wantErr: func(t assert.TestingT, err error, i ...interface{}) bool { + return assert.ErrorIs(t, err, ErrNoRollbacksAvailable) + }, + additionalAsserts: func(t *testing.T, topDir string) { + // marker should be untouched + filePath := markerFilePath(paths.DataFrom(topDir)) + require.FileExists(t, filePath) + markerFileBytes, readMarkerErr := os.ReadFile(filePath) + require.NoError(t, readMarkerErr) + + assert.YAMLEq(t, updatemarkerwatching456, string(markerFileBytes), "update marker should be untouched") + }, + }, + { + name: "update marker ok but rollback is expired - error", + setup: func(t *testing.T, topDir string, agent *info.MockAgent, watcherHelper *MockWatcherHelper) { + err := os.WriteFile(markerFilePath(paths.DataFrom(topDir)), []byte(updatemarkerwatching456), 0600) + require.NoError(t, err, "error setting up update marker") + locker := filelock.NewAppLocker(topDir, "watcher.lock") + err = locker.TryLock() + require.NoError(t, err, "error locking initial watcher AppLocker") + watcherHelper.EXPECT().TakeOverWatcher(t.Context(), mock.Anything, topDir).Return(locker, nil) + newerWatcherExecutable := filepath.Join(topDir, "data", "elastic-agent-4.5.6-newver", "elastic-agent") + watcherHelper.EXPECT().SelectWatcherExecutable(topDir, agentInstall123, agentInstall456).Return(newerWatcherExecutable) + watcherHelper.EXPECT().InvokeWatcher(mock.Anything, newerWatcherExecutable).Return(&exec.Cmd{Path: newerWatcherExecutable, Args: []string{"watch", "for realsies"}}, nil) + }, + artifactSettings: artifact.DefaultConfig(), + upgradeSettings: configuration.DefaultUpgradeConfig(), + now: nowAfterTTL, + version: "1.2.3", + wantErr: func(t assert.TestingT, err error, i ...interface{}) bool { + return assert.ErrorIs(t, err, ErrNoRollbacksAvailable) + }, + additionalAsserts: func(t *testing.T, topDir string) { + // marker should be untouched + filePath := markerFilePath(paths.DataFrom(topDir)) + require.FileExists(t, filePath) + markerFileBytes, readMarkerErr := os.ReadFile(filePath) + require.NoError(t, readMarkerErr) + + assert.YAMLEq(t, updatemarkerwatching456, string(markerFileBytes), "update marker should be untouched") + }, + }, + { + name: "update marker ok - takeover watcher, persist rollback and restart most recent watcher", + setup: func(t *testing.T, topDir string, agent *info.MockAgent, watcherHelper *MockWatcherHelper) { + err := os.WriteFile(markerFilePath(paths.DataFrom(topDir)), []byte(updatemarkerwatching456), 0600) + require.NoError(t, err, "error setting up update marker") + locker := filelock.NewAppLocker(topDir, "watcher.lock") + err = locker.TryLock() + require.NoError(t, err, "error locking initial watcher AppLocker") + watcherHelper.EXPECT().TakeOverWatcher(t.Context(), mock.Anything, topDir).Return(locker, nil) + newerWatcherExecutable := filepath.Join(topDir, "data", "elastic-agent-4.5.6-newver", "elastic-agent") + watcherHelper.EXPECT().SelectWatcherExecutable(topDir, agentInstall123, agentInstall456).Return(newerWatcherExecutable) + watcherHelper.EXPECT(). + InvokeWatcher(mock.Anything, newerWatcherExecutable, watcherSubcommand, "--rollback", "data/elastic-agent-1.2.3-oldver"). + Return(&exec.Cmd{Path: newerWatcherExecutable, Args: []string{"watch", "for rollbacksies"}}, nil) + }, + artifactSettings: artifact.DefaultConfig(), + upgradeSettings: configuration.DefaultUpgradeConfig(), + now: nowBeforeTTL, + version: "1.2.3", + wantErr: assert.NoError, + additionalAsserts: func(t *testing.T, topDir string) { + marker, loadMarkerErr := LoadMarker(paths.DataFrom(topDir)) + require.NoError(t, loadMarkerErr, "error loading marker") + require.NotNil(t, marker, "marker is nil") + + require.NotNil(t, marker.Details) + assert.NotEmpty(t, marker.RollbacksAvailable) + }, + }, + } + + for _, tc := range testcases { + t.Run(tc.name, func(t *testing.T) { + log, _ := loggertest.New(t.Name()) + mockAgentInfo := info.NewMockAgent(t) + mockWatcherHelper := NewMockWatcherHelper(t) + mockRollbacksSource := newMockAvailableRollbacksSource(t) + topDir := t.TempDir() + err := os.MkdirAll(paths.DataFrom(topDir), 0777) + require.NoError(t, err, "error creating data directory in topDir %q", topDir) + + if tc.setup != nil { + tc.setup(t, topDir, mockAgentInfo, mockWatcherHelper) + } + + upgrader, err := NewUpgrader(log, tc.artifactSettings, tc.upgradeSettings, mockAgentInfo, mockWatcherHelper, mockRollbacksSource) + require.NoError(t, err, "error instantiating upgrader") + _, err = upgrader.rollbackToPreviousVersion(t.Context(), topDir, tc.now, tc.version, nil) + tc.wantErr(t, err, "unexpected error returned by rollbackToPreviousVersion()") + if tc.additionalAsserts != nil { + tc.additionalAsserts(t, topDir) + } + }) + } +} diff --git a/internal/pkg/agent/application/upgrade/upgrade.go b/internal/pkg/agent/application/upgrade/upgrade.go index 359fdf25fa5..eb00f1a1638 100644 --- a/internal/pkg/agent/application/upgrade/upgrade.go +++ b/internal/pkg/agent/application/upgrade/upgrade.go @@ -515,145 +515,6 @@ func (u *Upgrader) Upgrade(ctx context.Context, version string, rollback bool, s return cb, nil } -func getAvailableRollbacks(rollbackWindow time.Duration, now time.Time, currentVersion string, parsedCurrentVersion *agtversion.ParsedSemVer, currentVersionedHome string) map[string]TTLMarker { - if rollbackWindow == 0 { - // if there's no rollback window it means that no rollback should survive the watcher cleanup at the end of the grace period. - return nil - } - - if parsedCurrentVersion == nil || parsedCurrentVersion.Less(*Version_9_3_0_SNAPSHOT) { - // the version we are upgrading to does not support manual rollbacks - return nil - } - - // when multiple rollbacks will be supported, read the existing descriptor - // at this stage we can get by with a single rollback - res := make(map[string]TTLMarker, 1) - res[currentVersionedHome] = TTLMarker{ - Version: currentVersion, - ValidUntil: now.Add(rollbackWindow), - } - - return res -} - -func (u *Upgrader) rollbackToPreviousVersion(ctx context.Context, topDir string, now time.Time, version string, action *fleetapi.ActionUpgrade) (reexec.ShutdownCallbackFn, error) { - if version == "" { - return nil, ErrEmptyRollbackVersion - } - - // check that the upgrade marker exists and is accessible - updateMarkerPath := markerFilePath(paths.DataFrom(topDir)) - _, err := os.Stat(updateMarkerPath) - if err != nil { - return nil, fmt.Errorf("stat() on upgrade marker %q failed: %w", updateMarkerPath, err) - } - - // read the upgrade marker - updateMarker, err := LoadMarker(paths.DataFrom(topDir)) - if err != nil { - return nil, fmt.Errorf("loading marker: %w", err) - } - - if updateMarker == nil { - return nil, ErrNilUpdateMarker - } - - // extract the agent installs involved in the upgrade and select the most appropriate watcher executable - previous, current, err := extractAgentInstallsFromMarker(updateMarker) - if err != nil { - return nil, fmt.Errorf("extracting current and previous install details: %w", err) - } - watcherExecutable := u.watcherHelper.SelectWatcherExecutable(topDir, previous, current) - - err = withTakeOverWatcher(ctx, u.log, topDir, u.watcherHelper, func() error { - // read the upgrade marker - updateMarker, err = LoadMarker(paths.DataFrom(topDir)) - if err != nil { - return fmt.Errorf("loading marker: %w", err) - } - - if updateMarker == nil { - return ErrNilUpdateMarker - } - - if len(updateMarker.RollbacksAvailable) == 0 { - return ErrNoRollbacksAvailable - } - var selectedRollback *TTLMarker - for _, rollback := range updateMarker.RollbacksAvailable { - if rollback.Version == version && now.Before(rollback.ValidUntil) { - selectedRollback = &rollback - break - } - } - if selectedRollback == nil { - return fmt.Errorf("version %q not listed among the available rollbacks: %w", version, ErrNoRollbacksAvailable) - } - - // rollback - _, err = u.watcherHelper.InvokeWatcher(u.log, watcherExecutable, "watch", "--rollback", updateMarker.PrevVersionedHome) - if err != nil { - return fmt.Errorf("starting rollback command: %w", err) - } - u.log.Debug("rollback command started successfully, PID") - return nil - }) - - if err != nil { - // Invoke watcher again (now that we released the watcher applocks) - _, invokeWatcherErr := u.watcherHelper.InvokeWatcher(u.log, watcherExecutable) - if invokeWatcherErr != nil { - return nil, goerrors.Join(err, fmt.Errorf("invoking watcher: %w", invokeWatcherErr)) - } - return nil, err - } - - return nil, nil - -} - -func withTakeOverWatcher(ctx context.Context, log *logger.Logger, topDir string, watcherHelper WatcherHelper, f func() error) error { - watcherApplock, err := watcherHelper.TakeOverWatcher(ctx, log, topDir) - if err != nil { - return fmt.Errorf("taking over watcher processes: %w", err) - } - defer func(watcherApplock *filelock.AppLocker) { - releaseWatcherAppLockerErr := watcherApplock.Unlock() - if releaseWatcherAppLockerErr != nil { - log.Warnw("error releasing watcher applock", "error", releaseWatcherAppLockerErr) - } - }(watcherApplock) - - return f() -} - -func extractAgentInstallsFromMarker(updateMarker *UpdateMarker) (previous agentInstall, current agentInstall, err error) { - previousParsedVersion, err := agtversion.ParseVersion(updateMarker.PrevVersion) - if err != nil { - return previous, current, fmt.Errorf("parsing previous version %q: %w", updateMarker.PrevVersion, err) - } - previous = agentInstall{ - parsedVersion: previousParsedVersion, - version: updateMarker.PrevVersion, - hash: updateMarker.PrevHash, - versionedHome: updateMarker.PrevVersionedHome, - } - - currentParsedVersion, err := agtversion.ParseVersion(updateMarker.Version) - if err != nil { - return previous, current, fmt.Errorf("parsing current version %q: %w", updateMarker.Version, err) - } - current = agentInstall{ - parsedVersion: currentParsedVersion, - version: updateMarker.Version, - hash: updateMarker.Hash, - versionedHome: updateMarker.VersionedHome, - } - - return previous, current, nil -} - // Ack acks last upgrade action func (u *Upgrader) Ack(ctx context.Context, acker acker.Acker) error { // get upgrade action diff --git a/internal/pkg/agent/application/upgrade/upgrade_test.go b/internal/pkg/agent/application/upgrade/upgrade_test.go index a019fa0e2b5..cb2dc29326e 100644 --- a/internal/pkg/agent/application/upgrade/upgrade_test.go +++ b/internal/pkg/agent/application/upgrade/upgrade_test.go @@ -13,7 +13,6 @@ import ( "net/http" "net/url" "os" - "os/exec" "path/filepath" "runtime" "testing" @@ -27,13 +26,11 @@ import ( "github.com/elastic/elastic-agent-libs/transport/httpcommon" "github.com/elastic/elastic-agent-libs/transport/tlscommon" - "github.com/elastic/elastic-agent/internal/pkg/agent/application/filelock" "github.com/elastic/elastic-agent/internal/pkg/agent/application/info" "github.com/elastic/elastic-agent/internal/pkg/agent/application/paths" "github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade/artifact" upgradeErrors "github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade/artifact/download/errors" "github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade/details" - "github.com/elastic/elastic-agent/internal/pkg/agent/configuration" "github.com/elastic/elastic-agent/internal/pkg/agent/errors" "github.com/elastic/elastic-agent/internal/pkg/config" "github.com/elastic/elastic-agent/internal/pkg/fleetapi" @@ -1042,275 +1039,6 @@ func TestIsSameReleaseVersion(t *testing.T) { } } -func TestManualRollback(t *testing.T) { - const updatemarkerwatching456NoRollbackAvailable = ` - version: 4.5.6 - hash: newver - versioned_home: data/elastic-agent-4.5.6-newver - updated_on: 2025-07-11T10:11:12.131415Z - prev_version: 1.2.3 - prev_hash: oldver - prev_versioned_home: data/elastic-agent-1.2.3-oldver - acked: false - action: null - details: - target_version: 4.5.6 - state: UPG_WATCHING - metadata: - retry_until: null - ` - const updatemarkerwatching456 = ` - version: 4.5.6 - hash: newver - versioned_home: data/elastic-agent-4.5.6-newver - updated_on: 2025-07-11T10:11:12.131415Z - prev_version: 1.2.3 - prev_hash: oldver - prev_versioned_home: data/elastic-agent-1.2.3-oldver - acked: false - action: null - details: - target_version: 4.5.6 - state: UPG_WATCHING - metadata: - retry_until: null - desired_outcome: UPGRADE - rollbacks_available: - "data/elastic-agent-1.2.3-oldver": - version: 1.2.3 - valid_until: 2025-07-18T10:11:12.131415Z - ` - - parsed123Version, err := agtversion.ParseVersion("1.2.3") - require.NoError(t, err) - parsed456Version, err := agtversion.ParseVersion("4.5.6") - require.NoError(t, err) - - agentInstall123 := agentInstall{ - parsedVersion: parsed123Version, - version: "1.2.3", - hash: "oldver", - versionedHome: "data/elastic-agent-1.2.3-oldver", - } - - agentInstall456 := agentInstall{ - parsedVersion: parsed456Version, - version: "4.5.6", - hash: "newver", - versionedHome: "data/elastic-agent-4.5.6-newver", - } - - // this is the updated_on timestamp in the example - nowBeforeTTL, err := time.Parse(time.RFC3339, `2025-07-11T10:11:12Z`) - require.NoError(t, err, "error parsing nowBeforeTTL") - - // the update marker yaml assume 7d TLL for rollbacks, let's make an extra day pass - nowAfterTTL := nowBeforeTTL.Add(8 * 24 * time.Hour) - - type setupF func(t *testing.T, topDir string, agent *info.MockAgent, watcherHelper *MockWatcherHelper) - type postRollbackAssertionsF func(t *testing.T, topDir string) - type testcase struct { - name string - setup setupF - artifactSettings *artifact.Config - upgradeSettings *configuration.UpgradeConfig - now time.Time - version string - wantErr assert.ErrorAssertionFunc - additionalAsserts postRollbackAssertionsF - } - - testcases := []testcase{ - { - name: "no rollback version - rollback fails", - setup: func(t *testing.T, topDir string, agent *info.MockAgent, watcherHelper *MockWatcherHelper) { - //do not setup anything here, let the rollback fail - }, - artifactSettings: artifact.DefaultConfig(), - upgradeSettings: configuration.DefaultUpgradeConfig(), - version: "", - wantErr: func(t assert.TestingT, err error, i ...interface{}) bool { - return assert.ErrorIs(t, err, ErrEmptyRollbackVersion) - }, - additionalAsserts: nil, - }, - { - name: "no update marker - rollback fails", - setup: func(t *testing.T, topDir string, agent *info.MockAgent, watcherHelper *MockWatcherHelper) { - //do not setup anything here, let the rollback fail - }, - artifactSettings: artifact.DefaultConfig(), - upgradeSettings: configuration.DefaultUpgradeConfig(), - version: "1.2.3", - wantErr: func(t assert.TestingT, err error, i ...interface{}) bool { - return assert.ErrorIs(t, err, fs.ErrNotExist) - }, - additionalAsserts: nil, - }, - { - name: "update marker is malformed - rollback fails", - setup: func(t *testing.T, topDir string, agent *info.MockAgent, watcherHelper *MockWatcherHelper) { - err := os.WriteFile(markerFilePath(paths.DataFrom(topDir)), []byte("this is not a proper YAML file"), 0600) - require.NoError(t, err, "error setting up update marker") - locker := filelock.NewAppLocker(topDir, "watcher.lock") - err = locker.TryLock() - require.NoError(t, err, "error locking initial watcher AppLocker") - // there's no takeover watcher so no expectation on that or InvokeWatcher - t.Cleanup(func() { - unlockErr := locker.Unlock() - assert.NoError(t, unlockErr, "error unlocking initial watcher AppLocker") - }) - }, - artifactSettings: artifact.DefaultConfig(), - upgradeSettings: configuration.DefaultUpgradeConfig(), - version: "1.2.3", - wantErr: assert.Error, - additionalAsserts: nil, - }, - { - name: "update marker ok but rollback available is empty - error", - setup: func(t *testing.T, topDir string, agent *info.MockAgent, watcherHelper *MockWatcherHelper) { - err := os.WriteFile(markerFilePath(paths.DataFrom(topDir)), []byte(updatemarkerwatching456NoRollbackAvailable), 0600) - require.NoError(t, err, "error setting up update marker") - locker := filelock.NewAppLocker(topDir, "watcher.lock") - err = locker.TryLock() - require.NoError(t, err, "error locking initial watcher AppLocker") - watcherHelper.EXPECT().TakeOverWatcher(t.Context(), mock.Anything, topDir).Return(locker, nil) - newerWatcherExecutable := filepath.Join(topDir, "data", "elastic-agent-4.5.6-newver", "elastic-agent") - watcherHelper.EXPECT().SelectWatcherExecutable(topDir, agentInstall123, agentInstall456).Return(newerWatcherExecutable) - watcherHelper.EXPECT().InvokeWatcher(mock.Anything, newerWatcherExecutable).Return(&exec.Cmd{Path: newerWatcherExecutable, Args: []string{"watch", "for realsies"}}, nil) - }, - artifactSettings: artifact.DefaultConfig(), - upgradeSettings: configuration.DefaultUpgradeConfig(), - version: "2.3.4-unknown", - wantErr: func(t assert.TestingT, err error, i ...interface{}) bool { - return assert.ErrorIs(t, err, ErrNoRollbacksAvailable) - }, - additionalAsserts: func(t *testing.T, topDir string) { - // marker should be untouched - filePath := markerFilePath(paths.DataFrom(topDir)) - require.FileExists(t, filePath) - markerFileBytes, readMarkerErr := os.ReadFile(filePath) - require.NoError(t, readMarkerErr) - - assert.YAMLEq(t, updatemarkerwatching456NoRollbackAvailable, string(markerFileBytes), "update marker should be untouched") - }, - }, - { - name: "update marker ok but version is not available for rollback - error", - setup: func(t *testing.T, topDir string, agent *info.MockAgent, watcherHelper *MockWatcherHelper) { - err := os.WriteFile(markerFilePath(paths.DataFrom(topDir)), []byte(updatemarkerwatching456), 0600) - require.NoError(t, err, "error setting up update marker") - locker := filelock.NewAppLocker(topDir, "watcher.lock") - err = locker.TryLock() - require.NoError(t, err, "error locking initial watcher AppLocker") - watcherHelper.EXPECT().TakeOverWatcher(t.Context(), mock.Anything, topDir).Return(locker, nil) - newerWatcherExecutable := filepath.Join(topDir, "data", "elastic-agent-4.5.6-newver", "elastic-agent") - watcherHelper.EXPECT().SelectWatcherExecutable(topDir, agentInstall123, agentInstall456).Return(newerWatcherExecutable) - watcherHelper.EXPECT().InvokeWatcher(mock.Anything, newerWatcherExecutable).Return(&exec.Cmd{Path: newerWatcherExecutable, Args: []string{"watch", "for realsies"}}, nil) - }, - artifactSettings: artifact.DefaultConfig(), - upgradeSettings: configuration.DefaultUpgradeConfig(), - version: "2.3.4-unknown", - wantErr: func(t assert.TestingT, err error, i ...interface{}) bool { - return assert.ErrorIs(t, err, ErrNoRollbacksAvailable) - }, - additionalAsserts: func(t *testing.T, topDir string) { - // marker should be untouched - filePath := markerFilePath(paths.DataFrom(topDir)) - require.FileExists(t, filePath) - markerFileBytes, readMarkerErr := os.ReadFile(filePath) - require.NoError(t, readMarkerErr) - - assert.YAMLEq(t, updatemarkerwatching456, string(markerFileBytes), "update marker should be untouched") - }, - }, - { - name: "update marker ok but rollback is expired - error", - setup: func(t *testing.T, topDir string, agent *info.MockAgent, watcherHelper *MockWatcherHelper) { - err := os.WriteFile(markerFilePath(paths.DataFrom(topDir)), []byte(updatemarkerwatching456), 0600) - require.NoError(t, err, "error setting up update marker") - locker := filelock.NewAppLocker(topDir, "watcher.lock") - err = locker.TryLock() - require.NoError(t, err, "error locking initial watcher AppLocker") - watcherHelper.EXPECT().TakeOverWatcher(t.Context(), mock.Anything, topDir).Return(locker, nil) - newerWatcherExecutable := filepath.Join(topDir, "data", "elastic-agent-4.5.6-newver", "elastic-agent") - watcherHelper.EXPECT().SelectWatcherExecutable(topDir, agentInstall123, agentInstall456).Return(newerWatcherExecutable) - watcherHelper.EXPECT().InvokeWatcher(mock.Anything, newerWatcherExecutable).Return(&exec.Cmd{Path: newerWatcherExecutable, Args: []string{"watch", "for realsies"}}, nil) - }, - artifactSettings: artifact.DefaultConfig(), - upgradeSettings: configuration.DefaultUpgradeConfig(), - now: nowAfterTTL, - version: "1.2.3", - wantErr: func(t assert.TestingT, err error, i ...interface{}) bool { - return assert.ErrorIs(t, err, ErrNoRollbacksAvailable) - }, - additionalAsserts: func(t *testing.T, topDir string) { - // marker should be untouched - filePath := markerFilePath(paths.DataFrom(topDir)) - require.FileExists(t, filePath) - markerFileBytes, readMarkerErr := os.ReadFile(filePath) - require.NoError(t, readMarkerErr) - - assert.YAMLEq(t, updatemarkerwatching456, string(markerFileBytes), "update marker should be untouched") - }, - }, - { - name: "update marker ok - takeover watcher, persist rollback and restart most recent watcher", - setup: func(t *testing.T, topDir string, agent *info.MockAgent, watcherHelper *MockWatcherHelper) { - err := os.WriteFile(markerFilePath(paths.DataFrom(topDir)), []byte(updatemarkerwatching456), 0600) - require.NoError(t, err, "error setting up update marker") - locker := filelock.NewAppLocker(topDir, "watcher.lock") - err = locker.TryLock() - require.NoError(t, err, "error locking initial watcher AppLocker") - watcherHelper.EXPECT().TakeOverWatcher(t.Context(), mock.Anything, topDir).Return(locker, nil) - newerWatcherExecutable := filepath.Join(topDir, "data", "elastic-agent-4.5.6-newver", "elastic-agent") - watcherHelper.EXPECT().SelectWatcherExecutable(topDir, agentInstall123, agentInstall456).Return(newerWatcherExecutable) - watcherHelper.EXPECT(). - InvokeWatcher(mock.Anything, newerWatcherExecutable, watcherSubcommand, "--rollback", "data/elastic-agent-1.2.3-oldver"). - Return(&exec.Cmd{Path: newerWatcherExecutable, Args: []string{"watch", "for rollbacksies"}}, nil) - }, - artifactSettings: artifact.DefaultConfig(), - upgradeSettings: configuration.DefaultUpgradeConfig(), - now: nowBeforeTTL, - version: "1.2.3", - wantErr: assert.NoError, - additionalAsserts: func(t *testing.T, topDir string) { - marker, loadMarkerErr := LoadMarker(paths.DataFrom(topDir)) - require.NoError(t, loadMarkerErr, "error loading marker") - require.NotNil(t, marker, "marker is nil") - - require.NotNil(t, marker.Details) - assert.NotEmpty(t, marker.RollbacksAvailable) - }, - }, - } - - for _, tc := range testcases { - t.Run(tc.name, func(t *testing.T) { - log, _ := loggertest.New(t.Name()) - mockAgentInfo := info.NewMockAgent(t) - mockWatcherHelper := NewMockWatcherHelper(t) - mockRollbacksSource := newMockAvailableRollbacksSource(t) - topDir := t.TempDir() - err := os.MkdirAll(paths.DataFrom(topDir), 0777) - require.NoError(t, err, "error creating data directory in topDir %q", topDir) - - if tc.setup != nil { - tc.setup(t, topDir, mockAgentInfo, mockWatcherHelper) - } - - upgrader, err := NewUpgrader(log, tc.artifactSettings, tc.upgradeSettings, mockAgentInfo, mockWatcherHelper, mockRollbacksSource) - require.NoError(t, err, "error instantiating upgrader") - _, err = upgrader.rollbackToPreviousVersion(t.Context(), topDir, tc.now, tc.version, nil) - tc.wantErr(t, err, "unexpected error returned by rollbackToPreviousVersion()") - if tc.additionalAsserts != nil { - tc.additionalAsserts(t, topDir) - } - }) - } -} - type mockArtifactDownloader struct { returnError error returnArchivePath string From fb31e7dab93d85fb381d401dd9e905519218d340 Mon Sep 17 00:00:00 2001 From: Paolo Chila Date: Fri, 26 Sep 2025 16:51:53 +0200 Subject: [PATCH 03/14] Split manual rollback between watching and non-watching cases --- .../application/upgrade/manual_rollback.go | 69 +++++++++++++------ .../upgrade/manual_rollback_test.go | 2 +- 2 files changed, 48 insertions(+), 23 deletions(-) diff --git a/internal/pkg/agent/application/upgrade/manual_rollback.go b/internal/pkg/agent/application/upgrade/manual_rollback.go index 48ae8e1dc11..4c08e06582c 100644 --- a/internal/pkg/agent/application/upgrade/manual_rollback.go +++ b/internal/pkg/agent/application/upgrade/manual_rollback.go @@ -27,28 +27,61 @@ func (u *Upgrader) rollbackToPreviousVersion(ctx context.Context, topDir string, // check that the upgrade marker exists and is accessible updateMarkerPath := markerFilePath(paths.DataFrom(topDir)) _, err := os.Stat(updateMarkerPath) - if err != nil { + if err != nil && !errors.Is(err, os.ErrNotExist) { return nil, fmt.Errorf("stat() on upgrade marker %q failed: %w", updateMarkerPath, err) } + var watcherExecutable string + var versionedHomeToRollbackTo string + + if errors.Is(err, os.ErrNotExist) { + // there is no upgrade marker, we need to extract available rollbacks from agent installs + watcherExecutable, versionedHomeToRollbackTo, err = rollbackUsingAgentInstalls() + } else { + watcherExecutable, versionedHomeToRollbackTo, err = rollbackUsingUpgradeMarker(ctx, u.log, u.watcherHelper, topDir, now, version) + } + + if err != nil { + return nil, err + } + + // rollback + watcherCmd, err := u.watcherHelper.InvokeWatcher(u.log, watcherExecutable, "watch", "--rollback", versionedHomeToRollbackTo) + if err != nil { + return nil, fmt.Errorf("starting rollback command: %w", err) + } + + u.log.Infof("rollback command started successfully, PID: %d", watcherCmd.Process.Pid) + return nil, nil +} + +func rollbackUsingAgentInstalls() (string, string, error) { + //FIXME implement + //panic("Not implemented") + return "", "", errors.Join(errors.New("not implemented"), os.ErrNotExist) +} + +func rollbackUsingUpgradeMarker(ctx context.Context, log *logger.Logger, watcherHelper WatcherHelper, topDir string, now time.Time, version string) (string, string, error) { // read the upgrade marker updateMarker, err := LoadMarker(paths.DataFrom(topDir)) if err != nil { - return nil, fmt.Errorf("loading marker: %w", err) + return "", "", fmt.Errorf("loading marker: %w", err) } if updateMarker == nil { - return nil, ErrNilUpdateMarker + return "", "", ErrNilUpdateMarker } // extract the agent installs involved in the upgrade and select the most appropriate watcher executable previous, current, err := extractAgentInstallsFromMarker(updateMarker) if err != nil { - return nil, fmt.Errorf("extracting current and previous install details: %w", err) + return "", "", fmt.Errorf("extracting current and previous install details: %w", err) } - watcherExecutable := u.watcherHelper.SelectWatcherExecutable(topDir, previous, current) + watcherExecutable := watcherHelper.SelectWatcherExecutable(topDir, previous, current) + + var selectedRollbackVersionedHome string - err = withTakeOverWatcher(ctx, u.log, topDir, u.watcherHelper, func() error { + err = withTakeOverWatcher(ctx, log, topDir, watcherHelper, func() error { // read the upgrade marker updateMarker, err = LoadMarker(paths.DataFrom(topDir)) if err != nil { @@ -62,37 +95,29 @@ func (u *Upgrader) rollbackToPreviousVersion(ctx context.Context, topDir string, if len(updateMarker.RollbacksAvailable) == 0 { return ErrNoRollbacksAvailable } - var selectedRollback *TTLMarker - for _, rollback := range updateMarker.RollbacksAvailable { + + for versionedHome, rollback := range updateMarker.RollbacksAvailable { if rollback.Version == version && now.Before(rollback.ValidUntil) { - selectedRollback = &rollback + selectedRollbackVersionedHome = versionedHome break } } - if selectedRollback == nil { + if selectedRollbackVersionedHome == "" { return fmt.Errorf("version %q not listed among the available rollbacks: %w", version, ErrNoRollbacksAvailable) } - - // rollback - _, err = u.watcherHelper.InvokeWatcher(u.log, watcherExecutable, "watch", "--rollback", updateMarker.PrevVersionedHome) - if err != nil { - return fmt.Errorf("starting rollback command: %w", err) - } - u.log.Debug("rollback command started successfully, PID") return nil }) if err != nil { // Invoke watcher again (now that we released the watcher applocks) - _, invokeWatcherErr := u.watcherHelper.InvokeWatcher(u.log, watcherExecutable) + _, invokeWatcherErr := watcherHelper.InvokeWatcher(log, watcherExecutable) if invokeWatcherErr != nil { - return nil, errors.Join(err, fmt.Errorf("invoking watcher: %w", invokeWatcherErr)) + return "", "", errors.Join(err, fmt.Errorf("invoking watcher: %w", invokeWatcherErr)) } - return nil, err + return "", "", err } - return nil, nil - + return watcherExecutable, selectedRollbackVersionedHome, nil } func withTakeOverWatcher(ctx context.Context, log *logger.Logger, topDir string, watcherHelper WatcherHelper, f func() error) error { diff --git a/internal/pkg/agent/application/upgrade/manual_rollback_test.go b/internal/pkg/agent/application/upgrade/manual_rollback_test.go index 6ac0f870c3a..191c0e3be62 100644 --- a/internal/pkg/agent/application/upgrade/manual_rollback_test.go +++ b/internal/pkg/agent/application/upgrade/manual_rollback_test.go @@ -251,7 +251,7 @@ func TestManualRollback(t *testing.T) { watcherHelper.EXPECT().SelectWatcherExecutable(topDir, agentInstall123, agentInstall456).Return(newerWatcherExecutable) watcherHelper.EXPECT(). InvokeWatcher(mock.Anything, newerWatcherExecutable, watcherSubcommand, "--rollback", "data/elastic-agent-1.2.3-oldver"). - Return(&exec.Cmd{Path: newerWatcherExecutable, Args: []string{"watch", "for rollbacksies"}}, nil) + Return(&exec.Cmd{Path: newerWatcherExecutable, Args: []string{"watch", "for rollbacksies"}, Process: &os.Process{Pid: 123}}, nil) }, artifactSettings: artifact.DefaultConfig(), upgradeSettings: configuration.DefaultUpgradeConfig(), From d995f1136bbda62d6f492e0d709e6c41b49c85df Mon Sep 17 00:00:00 2001 From: Paolo Chila Date: Mon, 29 Sep 2025 17:11:54 +0200 Subject: [PATCH 04/14] Implement manual rollback from list of agent installs --- .../application/upgrade/manual_rollback.go | 84 ++++- .../upgrade/manual_rollback_test.go | 353 +++++++++++++++++- 2 files changed, 418 insertions(+), 19 deletions(-) diff --git a/internal/pkg/agent/application/upgrade/manual_rollback.go b/internal/pkg/agent/application/upgrade/manual_rollback.go index 4c08e06582c..03f66bdf98b 100644 --- a/internal/pkg/agent/application/upgrade/manual_rollback.go +++ b/internal/pkg/agent/application/upgrade/manual_rollback.go @@ -9,14 +9,18 @@ import ( "errors" "fmt" "os" + "path/filepath" "time" "github.com/elastic/elastic-agent/internal/pkg/agent/application/filelock" "github.com/elastic/elastic-agent/internal/pkg/agent/application/paths" "github.com/elastic/elastic-agent/internal/pkg/agent/application/reexec" + "github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade/details" "github.com/elastic/elastic-agent/internal/pkg/fleetapi" + "github.com/elastic/elastic-agent/internal/pkg/release" "github.com/elastic/elastic-agent/pkg/core/logger" "github.com/elastic/elastic-agent/pkg/version" + agtversion "github.com/elastic/elastic-agent/version" ) func (u *Upgrader) rollbackToPreviousVersion(ctx context.Context, topDir string, now time.Time, version string, action *fleetapi.ActionUpgrade) (reexec.ShutdownCallbackFn, error) { @@ -33,11 +37,13 @@ func (u *Upgrader) rollbackToPreviousVersion(ctx context.Context, topDir string, var watcherExecutable string var versionedHomeToRollbackTo string + var updateMarkerExistsBeforeRollback bool if errors.Is(err, os.ErrNotExist) { // there is no upgrade marker, we need to extract available rollbacks from agent installs - watcherExecutable, versionedHomeToRollbackTo, err = rollbackUsingAgentInstalls() + watcherExecutable, versionedHomeToRollbackTo, err = rollbackUsingAgentInstalls(u.log, u.watcherHelper, u.availableRollbacksSource, topDir, now, version, u.markUpgrade) } else { + updateMarkerExistsBeforeRollback = true watcherExecutable, versionedHomeToRollbackTo, err = rollbackUsingUpgradeMarker(ctx, u.log, u.watcherHelper, topDir, now, version) } @@ -46,8 +52,22 @@ func (u *Upgrader) rollbackToPreviousVersion(ctx context.Context, topDir string, } // rollback - watcherCmd, err := u.watcherHelper.InvokeWatcher(u.log, watcherExecutable, "watch", "--rollback", versionedHomeToRollbackTo) + watcherCmd, err := u.watcherHelper.InvokeWatcher(u.log, watcherExecutable, "--rollback", versionedHomeToRollbackTo) if err != nil { + if !updateMarkerExistsBeforeRollback { + // best effort: cleanup the fake update marker + cleanupErr := os.Remove(updateMarkerPath) + if cleanupErr != nil { + u.log.Errorf("Error cleaning up fake upgrade marker: %v", cleanupErr) + } + } else { + // attempt to resume the "normal" watcher to continue watching + _, restoreWatcherErr := u.watcherHelper.InvokeWatcher(u.log, watcherExecutable) + if restoreWatcherErr != nil { + u.log.Errorf("Error resuming watch after rollback error : %v", restoreWatcherErr) + } + } + return nil, fmt.Errorf("starting rollback command: %w", err) } @@ -55,10 +75,62 @@ func (u *Upgrader) rollbackToPreviousVersion(ctx context.Context, topDir string, return nil, nil } -func rollbackUsingAgentInstalls() (string, string, error) { - //FIXME implement - //panic("Not implemented") - return "", "", errors.Join(errors.New("not implemented"), os.ErrNotExist) +func rollbackUsingAgentInstalls(log *logger.Logger, watcherHelper WatcherHelper, source availableRollbacksSource, topDir string, now time.Time, rollbackVersion string, markUpgrade markUpgradeFunc) (string, string, error) { + // read the available installs + availableRollbacks, err := source.Get() + if err != nil { + return "", "", fmt.Errorf("retrieving available rollbacks: %w", err) + } + // check for the version we want to rollback to + var targetInstall string + var targetTTLMarker TTLMarker + for versionedHome, ttlMarker := range availableRollbacks { + if ttlMarker.Version == rollbackVersion && now.Before(ttlMarker.ValidUntil) { + // found a valid target + targetInstall = versionedHome + targetTTLMarker = ttlMarker + break + } + } + + if targetInstall == "" { + return "", "", fmt.Errorf("version %q not listed among the available rollbacks: %w", rollbackVersion, ErrNoRollbacksAvailable) + } + + prevAgentParsedVersion, err := version.ParseVersion(targetTTLMarker.Version) + if err != nil { + return "", "", fmt.Errorf("parsing version of target install %+v: %w", targetInstall, err) + } + + // write out a fake upgrade marker to make the upgrade details state happy + currentHome := paths.VersionedHome(topDir) + relCurVersionedHome, err := filepath.Rel(topDir, currentHome) + if err != nil { + return "", "", fmt.Errorf("getting current install home path %q relative to top %q: %w", currentHome, topDir, err) + } + curAgentInstall := agentInstall{ + parsedVersion: agtversion.GetParsedAgentPackageVersion(), + version: release.VersionWithSnapshot(), + hash: release.Commit(), + versionedHome: relCurVersionedHome, + } + + prevAgentInstall := agentInstall{ + parsedVersion: prevAgentParsedVersion, + version: targetTTLMarker.Version, + hash: "", + versionedHome: targetInstall, + } + + upgradeDetails := details.NewDetails(release.VersionWithSnapshot(), details.StateRequested, "" /*action.ID*/) + err = markUpgrade(log, paths.DataFrom(topDir), now, curAgentInstall, prevAgentInstall, nil /*action*/, upgradeDetails, nil) + if err != nil { + return "", "", fmt.Errorf("creating upgrade marker: %w", err) + } + + // return watcher executable and versionedHome to rollback to + watcherExecutable := watcherHelper.SelectWatcherExecutable(topDir, prevAgentInstall, curAgentInstall) + return watcherExecutable, targetInstall, nil } func rollbackUsingUpgradeMarker(ctx context.Context, log *logger.Logger, watcherHelper WatcherHelper, topDir string, now time.Time, version string) (string, string, error) { diff --git a/internal/pkg/agent/application/upgrade/manual_rollback_test.go b/internal/pkg/agent/application/upgrade/manual_rollback_test.go index 191c0e3be62..de6b9ab8c1b 100644 --- a/internal/pkg/agent/application/upgrade/manual_rollback_test.go +++ b/internal/pkg/agent/application/upgrade/manual_rollback_test.go @@ -5,6 +5,8 @@ package upgrade import ( + "errors" + "fmt" "io/fs" "os" "os/exec" @@ -15,14 +17,18 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/mock" "github.com/stretchr/testify/require" + "gopkg.in/yaml.v3" "github.com/elastic/elastic-agent/internal/pkg/agent/application/filelock" "github.com/elastic/elastic-agent/internal/pkg/agent/application/info" "github.com/elastic/elastic-agent/internal/pkg/agent/application/paths" "github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade/artifact" + "github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade/details" "github.com/elastic/elastic-agent/internal/pkg/agent/configuration" + "github.com/elastic/elastic-agent/internal/pkg/release" "github.com/elastic/elastic-agent/pkg/core/logger/loggertest" - agtversion "github.com/elastic/elastic-agent/pkg/version" + "github.com/elastic/elastic-agent/pkg/version" + agtversion "github.com/elastic/elastic-agent/version" ) func TestManualRollback(t *testing.T) { @@ -64,9 +70,9 @@ func TestManualRollback(t *testing.T) { valid_until: 2025-07-18T10:11:12.131415Z ` - parsed123Version, err := agtversion.ParseVersion("1.2.3") + parsed123Version, err := version.ParseVersion("1.2.3") require.NoError(t, err) - parsed456Version, err := agtversion.ParseVersion("4.5.6") + parsed456Version, err := version.ParseVersion("4.5.6") require.NoError(t, err) agentInstall123 := agentInstall{ @@ -83,6 +89,15 @@ func TestManualRollback(t *testing.T) { versionedHome: "data/elastic-agent-4.5.6-newver", } + agentInstallCurrent := agentInstall{ + parsedVersion: agtversion.GetParsedAgentPackageVersion(), + version: release.VersionWithSnapshot(), + hash: release.Commit(), + // Versioned home should contain the version but since the path does not really exist we fallback to the legacy format with just the hash + // versionedHome: filepath.Join("data", fmt.Sprintf("elastic-agent-%s-%s", release.VersionWithSnapshot(), release.ShortCommit())), + versionedHome: filepath.Join("data", fmt.Sprintf("elastic-agent-%s", release.ShortCommit())), + } + // this is the updated_on timestamp in the example nowBeforeTTL, err := time.Parse(time.RFC3339, `2025-07-11T10:11:12Z`) require.NoError(t, err, "error parsing nowBeforeTTL") @@ -90,7 +105,12 @@ func TestManualRollback(t *testing.T) { // the update marker yaml assume 7d TLL for rollbacks, let's make an extra day pass nowAfterTTL := nowBeforeTTL.Add(8 * 24 * time.Hour) - type setupF func(t *testing.T, topDir string, agent *info.MockAgent, watcherHelper *MockWatcherHelper) + // save the current timestamp, useful for TTL-based testing + aMomentInTime := time.Now() + //aMomentTomorrow := aMomentInTime.Add(24 * time.Hour) + //aMomentAgo := aMomentInTime.Add(-1 * time.Second) + + type setupF func(t *testing.T, topDir string, agent *info.MockAgent, watcherHelper *MockWatcherHelper, rollbacksSource *mockAvailableRollbacksSource) type postRollbackAssertionsF func(t *testing.T, topDir string) type testcase struct { name string @@ -106,7 +126,7 @@ func TestManualRollback(t *testing.T) { testcases := []testcase{ { name: "no rollback version - rollback fails", - setup: func(t *testing.T, topDir string, agent *info.MockAgent, watcherHelper *MockWatcherHelper) { + setup: func(t *testing.T, topDir string, agent *info.MockAgent, watcherHelper *MockWatcherHelper, rollbacksSource *mockAvailableRollbacksSource) { //do not setup anything here, let the rollback fail }, artifactSettings: artifact.DefaultConfig(), @@ -119,7 +139,7 @@ func TestManualRollback(t *testing.T) { }, { name: "no update marker - rollback fails", - setup: func(t *testing.T, topDir string, agent *info.MockAgent, watcherHelper *MockWatcherHelper) { + setup: func(t *testing.T, topDir string, agent *info.MockAgent, watcherHelper *MockWatcherHelper, rollbacksSource *mockAvailableRollbacksSource) { //do not setup anything here, let the rollback fail }, artifactSettings: artifact.DefaultConfig(), @@ -132,7 +152,7 @@ func TestManualRollback(t *testing.T) { }, { name: "update marker is malformed - rollback fails", - setup: func(t *testing.T, topDir string, agent *info.MockAgent, watcherHelper *MockWatcherHelper) { + setup: func(t *testing.T, topDir string, agent *info.MockAgent, watcherHelper *MockWatcherHelper, rollbacksSource *mockAvailableRollbacksSource) { err := os.WriteFile(markerFilePath(paths.DataFrom(topDir)), []byte("this is not a proper YAML file"), 0600) require.NoError(t, err, "error setting up update marker") locker := filelock.NewAppLocker(topDir, "watcher.lock") @@ -152,7 +172,7 @@ func TestManualRollback(t *testing.T) { }, { name: "update marker ok but rollback available is empty - error", - setup: func(t *testing.T, topDir string, agent *info.MockAgent, watcherHelper *MockWatcherHelper) { + setup: func(t *testing.T, topDir string, agent *info.MockAgent, watcherHelper *MockWatcherHelper, rollbacksSource *mockAvailableRollbacksSource) { err := os.WriteFile(markerFilePath(paths.DataFrom(topDir)), []byte(updatemarkerwatching456NoRollbackAvailable), 0600) require.NoError(t, err, "error setting up update marker") locker := filelock.NewAppLocker(topDir, "watcher.lock") @@ -181,7 +201,7 @@ func TestManualRollback(t *testing.T) { }, { name: "update marker ok but version is not available for rollback - error", - setup: func(t *testing.T, topDir string, agent *info.MockAgent, watcherHelper *MockWatcherHelper) { + setup: func(t *testing.T, topDir string, agent *info.MockAgent, watcherHelper *MockWatcherHelper, rollbacksSource *mockAvailableRollbacksSource) { err := os.WriteFile(markerFilePath(paths.DataFrom(topDir)), []byte(updatemarkerwatching456), 0600) require.NoError(t, err, "error setting up update marker") locker := filelock.NewAppLocker(topDir, "watcher.lock") @@ -210,7 +230,7 @@ func TestManualRollback(t *testing.T) { }, { name: "update marker ok but rollback is expired - error", - setup: func(t *testing.T, topDir string, agent *info.MockAgent, watcherHelper *MockWatcherHelper) { + setup: func(t *testing.T, topDir string, agent *info.MockAgent, watcherHelper *MockWatcherHelper, rollbacksSource *mockAvailableRollbacksSource) { err := os.WriteFile(markerFilePath(paths.DataFrom(topDir)), []byte(updatemarkerwatching456), 0600) require.NoError(t, err, "error setting up update marker") locker := filelock.NewAppLocker(topDir, "watcher.lock") @@ -238,9 +258,40 @@ func TestManualRollback(t *testing.T) { assert.YAMLEq(t, updatemarkerwatching456, string(markerFileBytes), "update marker should be untouched") }, }, + { + name: "update marker ok, rollback valid, invoking watcher fails - error", + setup: func(t *testing.T, topDir string, agent *info.MockAgent, watcherHelper *MockWatcherHelper, rollbacksSource *mockAvailableRollbacksSource) { + err := os.WriteFile(markerFilePath(paths.DataFrom(topDir)), []byte(updatemarkerwatching456), 0600) + require.NoError(t, err, "error setting up update marker") + locker := filelock.NewAppLocker(topDir, "watcher.lock") + err = locker.TryLock() + require.NoError(t, err, "error locking initial watcher AppLocker") + watcherHelper.EXPECT().TakeOverWatcher(t.Context(), mock.Anything, topDir).Return(locker, nil) + newerWatcherExecutable := filepath.Join(topDir, "data", "elastic-agent-4.5.6-newver", "elastic-agent") + watcherHelper.EXPECT().SelectWatcherExecutable(topDir, agentInstall123, agentInstall456).Return(newerWatcherExecutable) + // invoking watcher rollback fails + watcherHelper.EXPECT().InvokeWatcher(mock.Anything, newerWatcherExecutable, "--rollback", agentInstall123.versionedHome).Return(nil, errors.New("error invoking watcher")) + // Expect watch to be resumed + watcherHelper.EXPECT().InvokeWatcher(mock.Anything, newerWatcherExecutable).Return(&exec.Cmd{Path: newerWatcherExecutable, Args: []string{"watch", "for realsies"}}, nil) + }, + artifactSettings: artifact.DefaultConfig(), + upgradeSettings: configuration.DefaultUpgradeConfig(), + now: nowBeforeTTL, + version: "1.2.3", + wantErr: assert.Error, + additionalAsserts: func(t *testing.T, topDir string) { + // marker should be untouched + filePath := markerFilePath(paths.DataFrom(topDir)) + require.FileExists(t, filePath) + markerFileBytes, readMarkerErr := os.ReadFile(filePath) + require.NoError(t, readMarkerErr) + + assert.YAMLEq(t, updatemarkerwatching456, string(markerFileBytes), "update marker should be untouched") + }, + }, { name: "update marker ok - takeover watcher, persist rollback and restart most recent watcher", - setup: func(t *testing.T, topDir string, agent *info.MockAgent, watcherHelper *MockWatcherHelper) { + setup: func(t *testing.T, topDir string, agent *info.MockAgent, watcherHelper *MockWatcherHelper, rollbacksSource *mockAvailableRollbacksSource) { err := os.WriteFile(markerFilePath(paths.DataFrom(topDir)), []byte(updatemarkerwatching456), 0600) require.NoError(t, err, "error setting up update marker") locker := filelock.NewAppLocker(topDir, "watcher.lock") @@ -250,7 +301,7 @@ func TestManualRollback(t *testing.T) { newerWatcherExecutable := filepath.Join(topDir, "data", "elastic-agent-4.5.6-newver", "elastic-agent") watcherHelper.EXPECT().SelectWatcherExecutable(topDir, agentInstall123, agentInstall456).Return(newerWatcherExecutable) watcherHelper.EXPECT(). - InvokeWatcher(mock.Anything, newerWatcherExecutable, watcherSubcommand, "--rollback", "data/elastic-agent-1.2.3-oldver"). + InvokeWatcher(mock.Anything, newerWatcherExecutable, "--rollback", "data/elastic-agent-1.2.3-oldver"). Return(&exec.Cmd{Path: newerWatcherExecutable, Args: []string{"watch", "for rollbacksies"}, Process: &os.Process{Pid: 123}}, nil) }, artifactSettings: artifact.DefaultConfig(), @@ -267,6 +318,282 @@ func TestManualRollback(t *testing.T) { assert.NotEmpty(t, marker.RollbacksAvailable) }, }, + { + name: "no update marker, available install for rollback with valid TTL - rollback", + setup: func(t *testing.T, topDir string, agent *info.MockAgent, watcherHelper *MockWatcherHelper, rollbacksSource *mockAvailableRollbacksSource) { + //FIXME translate the installSource calls to rollbacksSource + //installSource.EXPECT().GetInstallDesc().Return( + // &v1.InstallDescriptor{ + // AgentInstalls: []v1.AgentInstallDesc{ + // { + // Version: release.VersionWithSnapshot(), + // Hash: release.Commit(), + // // Versioned home should contain the version but since the path does not really exist we fallback to the legacy format with just the hash + // //VersionedHome: filepath.Join("data", fmt.Sprintf("elastic-agent-%s-%s", release.VersionWithSnapshot(), release.ShortCommit())), + // VersionedHome: filepath.Join("data", fmt.Sprintf("elastic-agent-%s", release.ShortCommit())), + // Flavor: "basic", + // Active: true, + // }, + // { + // // old install is still valid for the next 24 hours + // OptionalTTLItem: v1.OptionalTTLItem{TTL: &aMomentTomorrow}, + // Version: "1.2.3", + // Hash: "oldver", + // VersionedHome: "data/elastic-agent-1.2.3-oldver", + // Flavor: "basic", + // Active: false, + // }, + // }, + // }, + // nil, + //) + newerWatcherExecutable := filepath.Join(topDir, "data", fmt.Sprintf("elastic-agent-%s-%s", release.VersionWithSnapshot(), release.ShortCommit()), "elastic-agent") + watcherHelper.EXPECT().SelectWatcherExecutable(topDir, agentInstall123, agentInstallCurrent).Return(newerWatcherExecutable) + watcherHelper.EXPECT().InvokeWatcher(mock.Anything, newerWatcherExecutable, "--rollback", "data/elastic-agent-1.2.3-oldver"). + Return(&exec.Cmd{Path: newerWatcherExecutable, Args: []string{"watch", "for rollbacksies"}, Process: &os.Process{Pid: 123}}, nil) + }, + artifactSettings: artifact.DefaultConfig(), + upgradeSettings: &configuration.UpgradeConfig{ + Rollback: &configuration.UpgradeRollbackConfig{ + Window: 24 * time.Hour, + }, + }, + now: aMomentInTime, + version: "1.2.3", + wantErr: assert.NoError, + additionalAsserts: func(t *testing.T, topDir string) { + actualMarkerFilePath := filepath.Join(topDir, "data", markerFilename) + require.FileExists(t, actualMarkerFilePath, "marker file must have been created") + actualMarkerFileBytes, errReadMarkerFile := os.ReadFile(actualMarkerFilePath) + require.NoError(t, errReadMarkerFile, "marker file should be readable") + + expectedUpdateMarker := &UpdateMarker{ + Version: release.VersionWithSnapshot(), + Hash: release.Commit(), + VersionedHome: filepath.Join("data", fmt.Sprintf("elastic-agent-%s", release.ShortCommit())), + UpdatedOn: aMomentInTime, + PrevVersion: "1.2.3", + PrevHash: "oldver", + PrevVersionedHome: "data/elastic-agent-1.2.3-oldver", + Details: &details.Details{ + TargetVersion: release.VersionWithSnapshot(), + State: details.StateRequested, + }, + RollbacksAvailable: nil, + } + + expectedMarkerBytes, err := yaml.Marshal(newMarkerSerializer(expectedUpdateMarker)) + require.NoError(t, err, "error marshalling expected update marker") + require.YAMLEq(t, string(expectedMarkerBytes), string(actualMarkerFileBytes)) + }, + }, + { + name: "no update marker, available install for rollback with expired TTL - error", + setup: func(t *testing.T, topDir string, agent *info.MockAgent, watcherHelper *MockWatcherHelper, rollbacksSource *mockAvailableRollbacksSource) { + //FIXME translate the installSource calls to rollbacksSource + //installSource.EXPECT().GetInstallDesc().Return( + // &v1.InstallDescriptor{ + // AgentInstalls: []v1.AgentInstallDesc{ + // { + // Version: release.VersionWithSnapshot(), + // Hash: release.Commit(), + // // Versioned home should contain the version but since the path does not really exist we fallback to the legacy format with just the hash + // // VersionedHome: filepath.Join("data", fmt.Sprintf("elastic-agent-%s-%s", release.VersionWithSnapshot(), release.ShortCommit())), + // VersionedHome: filepath.Join("data", fmt.Sprintf("elastic-agent-%s", release.ShortCommit())), + // Flavor: "basic", + // Active: true, + // }, + // { + // // old install expired a second ago + // OptionalTTLItem: v1.OptionalTTLItem{TTL: &aMomentAgo}, + // Version: "1.2.3", + // Hash: "oldver", + // VersionedHome: "data/elastic-agent-1.2.3-oldver", + // Flavor: "basic", + // Active: false, + // }, + // }, + // }, + // nil, + //) + }, + artifactSettings: artifact.DefaultConfig(), + upgradeSettings: &configuration.UpgradeConfig{ + Rollback: &configuration.UpgradeRollbackConfig{ + Window: 24 * time.Hour, + }, + }, + now: aMomentInTime, + version: "1.2.3", + wantErr: func(t assert.TestingT, err error, i ...interface{}) bool { + return assert.ErrorIs(t, err, ErrNoRollbacksAvailable, i...) + }, + additionalAsserts: func(t *testing.T, topDir string) { + actualMarkerFilePath := filepath.Join(topDir, "data", markerFilename) + require.NoFileExists(t, actualMarkerFilePath, "marker file must not be created") + + }, + }, + { + name: "no update marker, no available install for the version - error", + setup: func(t *testing.T, topDir string, agent *info.MockAgent, watcherHelper *MockWatcherHelper, rollbacksSource *mockAvailableRollbacksSource) { + //FIXME translate the installSource calls to rollbacksSource + //installSource.EXPECT().GetInstallDesc().Return( + // &v1.InstallDescriptor{ + // AgentInstalls: []v1.AgentInstallDesc{ + // { + // Version: release.VersionWithSnapshot(), + // Hash: release.Commit(), + // // Versioned home should contain the version but since the path does not really exist we fallback to the legacy format with just the hash + // // VersionedHome: filepath.Join("data", fmt.Sprintf("elastic-agent-%s-%s", release.VersionWithSnapshot(), release.ShortCommit())), + // VersionedHome: filepath.Join("data", fmt.Sprintf("elastic-agent-%s", release.ShortCommit())), + // Flavor: "basic", + // Active: true, + // }, + // { + // // old install - still valid + // OptionalTTLItem: v1.OptionalTTLItem{TTL: &aMomentTomorrow}, + // Version: "1.2.3", + // Hash: "oldver", + // VersionedHome: "data/elastic-agent-1.2.3-oldver", + // Flavor: "basic", + // Active: false, + // }, + // }, + // }, + // nil, + //) + }, + artifactSettings: artifact.DefaultConfig(), + upgradeSettings: &configuration.UpgradeConfig{ + Rollback: &configuration.UpgradeRollbackConfig{ + Window: 24 * time.Hour, + }, + }, + now: aMomentInTime, + version: "6.6.6", + wantErr: func(t assert.TestingT, err error, i ...interface{}) bool { + return assert.ErrorIs(t, err, ErrNoRollbacksAvailable, i...) + }, + additionalAsserts: func(t *testing.T, topDir string) { + actualMarkerFilePath := filepath.Join(topDir, "data", markerFilename) + require.NoFileExists(t, actualMarkerFilePath, "marker file must not be created") + + }, + }, + { + name: "no update marker, available install for rollback without TTL - error", + setup: func(t *testing.T, topDir string, agent *info.MockAgent, watcherHelper *MockWatcherHelper, rollbacksSource *mockAvailableRollbacksSource) { + //FIXME translate the installSource calls to rollbacksSource + //installSource.EXPECT().GetInstallDesc().Return( + // &v1.InstallDescriptor{ + // AgentInstalls: []v1.AgentInstallDesc{ + // { + // Version: release.VersionWithSnapshot(), + // Hash: release.Commit(), + // // Versioned home should contain the version but since the path does not really exist we fallback to the legacy format with just the hash + // // VersionedHome: filepath.Join("data", fmt.Sprintf("elastic-agent-%s-%s", release.VersionWithSnapshot(), release.ShortCommit())), + // VersionedHome: filepath.Join("data", fmt.Sprintf("elastic-agent-%s", release.ShortCommit())), + // Flavor: "basic", + // Active: true, + // }, + // { + // // old install without TTL + // Version: "1.2.3", + // Hash: "oldver", + // VersionedHome: "data/elastic-agent-1.2.3-oldver", + // Flavor: "basic", + // Active: false, + // }, + // }, + // }, + // nil, + //) + }, + artifactSettings: artifact.DefaultConfig(), + upgradeSettings: &configuration.UpgradeConfig{ + Rollback: &configuration.UpgradeRollbackConfig{ + Window: 24 * time.Hour, + }, + }, + now: aMomentInTime, + version: "1.2.3", + wantErr: func(t assert.TestingT, err error, i ...interface{}) bool { + return assert.ErrorIs(t, err, ErrNoRollbacksAvailable, i...) + }, + additionalAsserts: func(t *testing.T, topDir string) { + actualMarkerFilePath := filepath.Join(topDir, "data", markerFilename) + require.NoFileExists(t, actualMarkerFilePath, "marker file must not be created") + + }, + }, + { + name: "no update marker, error retrieving agent installs", + setup: func(t *testing.T, topDir string, agent *info.MockAgent, watcherHelper *MockWatcherHelper, rollbacksSource *mockAvailableRollbacksSource) { + //FIXME translate the installSource calls to rollbacksSource + //installSource.EXPECT().GetInstallDesc().Return( + // nil, + // errors.New("error retrieving agent installs"), + //) + }, + artifactSettings: artifact.DefaultConfig(), + upgradeSettings: &configuration.UpgradeConfig{ + Rollback: &configuration.UpgradeRollbackConfig{ + Window: 24 * time.Hour, + }, + }, + now: aMomentInTime, + version: "1.2.3", + wantErr: assert.Error, + }, + { + name: "no update marker, invoking watcher fails - error", + setup: func(t *testing.T, topDir string, agent *info.MockAgent, watcherHelper *MockWatcherHelper, rollbacksSource *mockAvailableRollbacksSource) { + //FIXME translate the installSource calls to rollbacksSource + //installSource.EXPECT().GetInstallDesc().Return( + // &v1.InstallDescriptor{ + // AgentInstalls: []v1.AgentInstallDesc{ + // { + // Version: release.VersionWithSnapshot(), + // Hash: release.Commit(), + // // Versioned home should contain the version but since the path does not really exist we fallback to the legacy format with just the hash + // // VersionedHome: filepath.Join("data", fmt.Sprintf("elastic-agent-%s-%s", release.VersionWithSnapshot(), release.ShortCommit())), + // VersionedHome: filepath.Join("data", fmt.Sprintf("elastic-agent-%s", release.ShortCommit())), + // Flavor: "basic", + // Active: true, + // }, + // { + // // old install is still valid for the next 24 hours + // OptionalTTLItem: v1.OptionalTTLItem{TTL: &aMomentTomorrow}, + // Version: "1.2.3", + // Hash: "oldver", + // VersionedHome: "data/elastic-agent-1.2.3-oldver", + // Flavor: "basic", + // Active: false, + // }, + // }, + // }, + // nil, + //) + newerWatcherExecutable := filepath.Join(topDir, "data", fmt.Sprintf("elastic-agent-%s-%s", release.VersionWithSnapshot(), release.ShortCommit()), "elastic-agent") + watcherHelper.EXPECT().SelectWatcherExecutable(topDir, agentInstall123, agentInstallCurrent).Return(newerWatcherExecutable) + watcherHelper.EXPECT().InvokeWatcher(mock.Anything, newerWatcherExecutable, "--rollback", "data/elastic-agent-1.2.3-oldver"). + Return(nil, errors.New("error invoking watcher")) + }, + artifactSettings: artifact.DefaultConfig(), + upgradeSettings: &configuration.UpgradeConfig{ + Rollback: &configuration.UpgradeRollbackConfig{ + Window: 24 * time.Hour, + }, + }, + now: aMomentInTime, + version: "1.2.3", + wantErr: assert.Error, + additionalAsserts: func(t *testing.T, topDir string) { + actualMarkerFilePath := filepath.Join(topDir, "data", markerFilename) + require.NoFileExists(t, actualMarkerFilePath, "marker file must have been created") + }, + }, } for _, tc := range testcases { @@ -280,7 +607,7 @@ func TestManualRollback(t *testing.T) { require.NoError(t, err, "error creating data directory in topDir %q", topDir) if tc.setup != nil { - tc.setup(t, topDir, mockAgentInfo, mockWatcherHelper) + tc.setup(t, topDir, mockAgentInfo, mockWatcherHelper, mockRollbacksSource) } upgrader, err := NewUpgrader(log, tc.artifactSettings, tc.upgradeSettings, mockAgentInfo, mockWatcherHelper, mockRollbacksSource) From c796ed6958c49197d77cbf420369f0cfcc5934f6 Mon Sep 17 00:00:00 2001 From: Paolo Chila Date: Tue, 30 Sep 2025 14:45:20 +0200 Subject: [PATCH 05/14] fix lint errors --- .../application/upgrade/manual_rollback.go | 8 +- .../upgrade/manual_rollback_test.go | 199 ++++-------------- .../agent/application/upgrade/step_mark.go | 2 +- .../upgrade/ttl_marker_source_test.go | 7 +- .../pkg/agent/application/upgrade/upgrade.go | 2 +- 5 files changed, 52 insertions(+), 166 deletions(-) diff --git a/internal/pkg/agent/application/upgrade/manual_rollback.go b/internal/pkg/agent/application/upgrade/manual_rollback.go index 03f66bdf98b..9f715e6e6ac 100644 --- a/internal/pkg/agent/application/upgrade/manual_rollback.go +++ b/internal/pkg/agent/application/upgrade/manual_rollback.go @@ -23,6 +23,8 @@ import ( agtversion "github.com/elastic/elastic-agent/version" ) +const disableRollbackWindow = time.Duration(0) + func (u *Upgrader) rollbackToPreviousVersion(ctx context.Context, topDir string, now time.Time, version string, action *fleetapi.ActionUpgrade) (reexec.ShutdownCallbackFn, error) { if version == "" { return nil, ErrEmptyRollbackVersion @@ -118,7 +120,7 @@ func rollbackUsingAgentInstalls(log *logger.Logger, watcherHelper WatcherHelper, prevAgentInstall := agentInstall{ parsedVersion: prevAgentParsedVersion, version: targetTTLMarker.Version, - hash: "", + hash: targetTTLMarker.Hash, versionedHome: targetInstall, } @@ -233,8 +235,8 @@ func extractAgentInstallsFromMarker(updateMarker *UpdateMarker) (previous agentI return previous, current, nil } -func getAvailableRollbacks(rollbackWindow time.Duration, now time.Time, currentVersion string, parsedCurrentVersion *version.ParsedSemVer, currentVersionedHome string) map[string]TTLMarker { - if rollbackWindow == 0 { +func getAvailableRollbacks(rollbackWindow time.Duration, now time.Time, currentVersion string, parsedCurrentVersion *version.ParsedSemVer, currentVersionedHome string, currentHash string) map[string]TTLMarker { + if rollbackWindow == disableRollbackWindow { // if there's no rollback window it means that no rollback should survive the watcher cleanup at the end of the grace period. return nil } diff --git a/internal/pkg/agent/application/upgrade/manual_rollback_test.go b/internal/pkg/agent/application/upgrade/manual_rollback_test.go index de6b9ab8c1b..ee2e840a7f6 100644 --- a/internal/pkg/agent/application/upgrade/manual_rollback_test.go +++ b/internal/pkg/agent/application/upgrade/manual_rollback_test.go @@ -7,7 +7,6 @@ package upgrade import ( "errors" "fmt" - "io/fs" "os" "os/exec" "path/filepath" @@ -107,8 +106,8 @@ func TestManualRollback(t *testing.T) { // save the current timestamp, useful for TTL-based testing aMomentInTime := time.Now() - //aMomentTomorrow := aMomentInTime.Add(24 * time.Hour) - //aMomentAgo := aMomentInTime.Add(-1 * time.Second) + aMomentTomorrow := aMomentInTime.Add(24 * time.Hour) + aMomentAgo := aMomentInTime.Add(-1 * time.Second) type setupF func(t *testing.T, topDir string, agent *info.MockAgent, watcherHelper *MockWatcherHelper, rollbacksSource *mockAvailableRollbacksSource) type postRollbackAssertionsF func(t *testing.T, topDir string) @@ -141,12 +140,13 @@ func TestManualRollback(t *testing.T) { name: "no update marker - rollback fails", setup: func(t *testing.T, topDir string, agent *info.MockAgent, watcherHelper *MockWatcherHelper, rollbacksSource *mockAvailableRollbacksSource) { //do not setup anything here, let the rollback fail + rollbacksSource.EXPECT().Get().Return(nil, nil) }, artifactSettings: artifact.DefaultConfig(), upgradeSettings: configuration.DefaultUpgradeConfig(), version: "1.2.3", wantErr: func(t assert.TestingT, err error, i ...interface{}) bool { - return assert.ErrorIs(t, err, fs.ErrNotExist) + return assert.ErrorIs(t, err, ErrNoRollbacksAvailable) }, additionalAsserts: nil, }, @@ -321,32 +321,13 @@ func TestManualRollback(t *testing.T) { { name: "no update marker, available install for rollback with valid TTL - rollback", setup: func(t *testing.T, topDir string, agent *info.MockAgent, watcherHelper *MockWatcherHelper, rollbacksSource *mockAvailableRollbacksSource) { - //FIXME translate the installSource calls to rollbacksSource - //installSource.EXPECT().GetInstallDesc().Return( - // &v1.InstallDescriptor{ - // AgentInstalls: []v1.AgentInstallDesc{ - // { - // Version: release.VersionWithSnapshot(), - // Hash: release.Commit(), - // // Versioned home should contain the version but since the path does not really exist we fallback to the legacy format with just the hash - // //VersionedHome: filepath.Join("data", fmt.Sprintf("elastic-agent-%s-%s", release.VersionWithSnapshot(), release.ShortCommit())), - // VersionedHome: filepath.Join("data", fmt.Sprintf("elastic-agent-%s", release.ShortCommit())), - // Flavor: "basic", - // Active: true, - // }, - // { - // // old install is still valid for the next 24 hours - // OptionalTTLItem: v1.OptionalTTLItem{TTL: &aMomentTomorrow}, - // Version: "1.2.3", - // Hash: "oldver", - // VersionedHome: "data/elastic-agent-1.2.3-oldver", - // Flavor: "basic", - // Active: false, - // }, - // }, - // }, - // nil, - //) + rollbacksSource.EXPECT().Get().Return(map[string]TTLMarker{ + "data/elastic-agent-1.2.3-oldver": { + Version: "1.2.3", + Hash: "oldver", + ValidUntil: aMomentTomorrow, + }, + }, nil) newerWatcherExecutable := filepath.Join(topDir, "data", fmt.Sprintf("elastic-agent-%s-%s", release.VersionWithSnapshot(), release.ShortCommit()), "elastic-agent") watcherHelper.EXPECT().SelectWatcherExecutable(topDir, agentInstall123, agentInstallCurrent).Return(newerWatcherExecutable) watcherHelper.EXPECT().InvokeWatcher(mock.Anything, newerWatcherExecutable, "--rollback", "data/elastic-agent-1.2.3-oldver"). @@ -390,32 +371,15 @@ func TestManualRollback(t *testing.T) { { name: "no update marker, available install for rollback with expired TTL - error", setup: func(t *testing.T, topDir string, agent *info.MockAgent, watcherHelper *MockWatcherHelper, rollbacksSource *mockAvailableRollbacksSource) { - //FIXME translate the installSource calls to rollbacksSource - //installSource.EXPECT().GetInstallDesc().Return( - // &v1.InstallDescriptor{ - // AgentInstalls: []v1.AgentInstallDesc{ - // { - // Version: release.VersionWithSnapshot(), - // Hash: release.Commit(), - // // Versioned home should contain the version but since the path does not really exist we fallback to the legacy format with just the hash - // // VersionedHome: filepath.Join("data", fmt.Sprintf("elastic-agent-%s-%s", release.VersionWithSnapshot(), release.ShortCommit())), - // VersionedHome: filepath.Join("data", fmt.Sprintf("elastic-agent-%s", release.ShortCommit())), - // Flavor: "basic", - // Active: true, - // }, - // { - // // old install expired a second ago - // OptionalTTLItem: v1.OptionalTTLItem{TTL: &aMomentAgo}, - // Version: "1.2.3", - // Hash: "oldver", - // VersionedHome: "data/elastic-agent-1.2.3-oldver", - // Flavor: "basic", - // Active: false, - // }, - // }, - // }, - // nil, - //) + rollbacksSource.EXPECT().Get().Return( + map[string]TTLMarker{ + "data/elastic-agent-1.2.3-oldver": { + Version: "1.2.3", + Hash: "oldver", + ValidUntil: aMomentAgo, + }, + }, + nil) }, artifactSettings: artifact.DefaultConfig(), upgradeSettings: &configuration.UpgradeConfig{ @@ -437,32 +401,15 @@ func TestManualRollback(t *testing.T) { { name: "no update marker, no available install for the version - error", setup: func(t *testing.T, topDir string, agent *info.MockAgent, watcherHelper *MockWatcherHelper, rollbacksSource *mockAvailableRollbacksSource) { - //FIXME translate the installSource calls to rollbacksSource - //installSource.EXPECT().GetInstallDesc().Return( - // &v1.InstallDescriptor{ - // AgentInstalls: []v1.AgentInstallDesc{ - // { - // Version: release.VersionWithSnapshot(), - // Hash: release.Commit(), - // // Versioned home should contain the version but since the path does not really exist we fallback to the legacy format with just the hash - // // VersionedHome: filepath.Join("data", fmt.Sprintf("elastic-agent-%s-%s", release.VersionWithSnapshot(), release.ShortCommit())), - // VersionedHome: filepath.Join("data", fmt.Sprintf("elastic-agent-%s", release.ShortCommit())), - // Flavor: "basic", - // Active: true, - // }, - // { - // // old install - still valid - // OptionalTTLItem: v1.OptionalTTLItem{TTL: &aMomentTomorrow}, - // Version: "1.2.3", - // Hash: "oldver", - // VersionedHome: "data/elastic-agent-1.2.3-oldver", - // Flavor: "basic", - // Active: false, - // }, - // }, - // }, - // nil, - //) + rollbacksSource.EXPECT().Get().Return( + map[string]TTLMarker{ + "data/elastic-agent-1.2.3-oldver": { + Version: "1.2.3", + Hash: "oldver", + ValidUntil: aMomentTomorrow, + }, + }, + nil) }, artifactSettings: artifact.DefaultConfig(), upgradeSettings: &configuration.UpgradeConfig{ @@ -481,60 +428,10 @@ func TestManualRollback(t *testing.T) { }, }, - { - name: "no update marker, available install for rollback without TTL - error", - setup: func(t *testing.T, topDir string, agent *info.MockAgent, watcherHelper *MockWatcherHelper, rollbacksSource *mockAvailableRollbacksSource) { - //FIXME translate the installSource calls to rollbacksSource - //installSource.EXPECT().GetInstallDesc().Return( - // &v1.InstallDescriptor{ - // AgentInstalls: []v1.AgentInstallDesc{ - // { - // Version: release.VersionWithSnapshot(), - // Hash: release.Commit(), - // // Versioned home should contain the version but since the path does not really exist we fallback to the legacy format with just the hash - // // VersionedHome: filepath.Join("data", fmt.Sprintf("elastic-agent-%s-%s", release.VersionWithSnapshot(), release.ShortCommit())), - // VersionedHome: filepath.Join("data", fmt.Sprintf("elastic-agent-%s", release.ShortCommit())), - // Flavor: "basic", - // Active: true, - // }, - // { - // // old install without TTL - // Version: "1.2.3", - // Hash: "oldver", - // VersionedHome: "data/elastic-agent-1.2.3-oldver", - // Flavor: "basic", - // Active: false, - // }, - // }, - // }, - // nil, - //) - }, - artifactSettings: artifact.DefaultConfig(), - upgradeSettings: &configuration.UpgradeConfig{ - Rollback: &configuration.UpgradeRollbackConfig{ - Window: 24 * time.Hour, - }, - }, - now: aMomentInTime, - version: "1.2.3", - wantErr: func(t assert.TestingT, err error, i ...interface{}) bool { - return assert.ErrorIs(t, err, ErrNoRollbacksAvailable, i...) - }, - additionalAsserts: func(t *testing.T, topDir string) { - actualMarkerFilePath := filepath.Join(topDir, "data", markerFilename) - require.NoFileExists(t, actualMarkerFilePath, "marker file must not be created") - - }, - }, { name: "no update marker, error retrieving agent installs", setup: func(t *testing.T, topDir string, agent *info.MockAgent, watcherHelper *MockWatcherHelper, rollbacksSource *mockAvailableRollbacksSource) { - //FIXME translate the installSource calls to rollbacksSource - //installSource.EXPECT().GetInstallDesc().Return( - // nil, - // errors.New("error retrieving agent installs"), - //) + rollbacksSource.EXPECT().Get().Return(nil, errors.New("error retrieving agent rollbacks")) }, artifactSettings: artifact.DefaultConfig(), upgradeSettings: &configuration.UpgradeConfig{ @@ -549,32 +446,16 @@ func TestManualRollback(t *testing.T) { { name: "no update marker, invoking watcher fails - error", setup: func(t *testing.T, topDir string, agent *info.MockAgent, watcherHelper *MockWatcherHelper, rollbacksSource *mockAvailableRollbacksSource) { - //FIXME translate the installSource calls to rollbacksSource - //installSource.EXPECT().GetInstallDesc().Return( - // &v1.InstallDescriptor{ - // AgentInstalls: []v1.AgentInstallDesc{ - // { - // Version: release.VersionWithSnapshot(), - // Hash: release.Commit(), - // // Versioned home should contain the version but since the path does not really exist we fallback to the legacy format with just the hash - // // VersionedHome: filepath.Join("data", fmt.Sprintf("elastic-agent-%s-%s", release.VersionWithSnapshot(), release.ShortCommit())), - // VersionedHome: filepath.Join("data", fmt.Sprintf("elastic-agent-%s", release.ShortCommit())), - // Flavor: "basic", - // Active: true, - // }, - // { - // // old install is still valid for the next 24 hours - // OptionalTTLItem: v1.OptionalTTLItem{TTL: &aMomentTomorrow}, - // Version: "1.2.3", - // Hash: "oldver", - // VersionedHome: "data/elastic-agent-1.2.3-oldver", - // Flavor: "basic", - // Active: false, - // }, - // }, - // }, - // nil, - //) + rollbacksSource.EXPECT().Get().Return( + map[string]TTLMarker{ + "data/elastic-agent-1.2.3-oldver": { + Version: "1.2.3", + Hash: "oldver", + ValidUntil: aMomentTomorrow, + }, + }, + nil, + ) newerWatcherExecutable := filepath.Join(topDir, "data", fmt.Sprintf("elastic-agent-%s-%s", release.VersionWithSnapshot(), release.ShortCommit()), "elastic-agent") watcherHelper.EXPECT().SelectWatcherExecutable(topDir, agentInstall123, agentInstallCurrent).Return(newerWatcherExecutable) watcherHelper.EXPECT().InvokeWatcher(mock.Anything, newerWatcherExecutable, "--rollback", "data/elastic-agent-1.2.3-oldver"). diff --git a/internal/pkg/agent/application/upgrade/step_mark.go b/internal/pkg/agent/application/upgrade/step_mark.go index 7f93ff43a27..b5a9e962c30 100644 --- a/internal/pkg/agent/application/upgrade/step_mark.go +++ b/internal/pkg/agent/application/upgrade/step_mark.go @@ -21,11 +21,11 @@ import ( ) const markerFilename = ".update-marker" -const disableRollbackWindow = time.Duration(0) // TTLMarker marks an elastic-agent install available for rollback type TTLMarker struct { Version string `json:"version" yaml:"version"` + Hash string `json:"hash" yaml:"hash"` ValidUntil time.Time `json:"valid_until" yaml:"valid_until"` } diff --git a/internal/pkg/agent/application/upgrade/ttl_marker_source_test.go b/internal/pkg/agent/application/upgrade/ttl_marker_source_test.go index 3ac95e1769d..dd79961102c 100644 --- a/internal/pkg/agent/application/upgrade/ttl_marker_source_test.go +++ b/internal/pkg/agent/application/upgrade/ttl_marker_source_test.go @@ -140,6 +140,7 @@ func TestTTLMarkerRegistry_Get(t *testing.T) { func TestTTLMarkerRegistry_Set(t *testing.T) { const TTLMarkerYAMLTemplate = ` version: {{ .Version }} + hash: {{ .Hash }} valid_until: {{ .ValidUntil }}` expectedMarkerContentTemplate, err := template.New("expected marker").Parse(TTLMarkerYAMLTemplate) @@ -156,6 +157,7 @@ func TestTTLMarkerRegistry_Set(t *testing.T) { versions := []string{"1.2.3", "4.5.6"} versionedHomes := []string{"elastic-agent-1.2.3-past", "elastic-agent-4.5.6-present"} + hashes := []string{"past", "present"} ttls := []string{tomorrowString, ""} type args struct { @@ -180,6 +182,7 @@ func TestTTLMarkerRegistry_Set(t *testing.T) { map[string]TTLMarker{ filepath.Join("data", versionedHomes[0]): { Version: versions[0], + Hash: hashes[0], ValidUntil: tomorrow, }, }, @@ -192,7 +195,7 @@ func TestTTLMarkerRegistry_Set(t *testing.T) { if assert.FileExists(t, expectedTTLMarkerFilePath, "new TTL marker should have been created") { b := new(strings.Builder) - err = expectedMarkerContentTemplate.Execute(b, map[string]string{"Version": versions[0], "ValidUntil": ttls[0]}) + err = expectedMarkerContentTemplate.Execute(b, map[string]string{"Version": versions[0], "ValidUntil": ttls[0], "Hash": hashes[0]}) require.NoError(t, err) actualMarkerContent, err := os.ReadFile(expectedTTLMarkerFilePath) require.NoError(t, err) @@ -207,7 +210,7 @@ func TestTTLMarkerRegistry_Set(t *testing.T) { err = os.MkdirAll(filepath.Join(tmpDir, "data", versionedHome), 0755) require.NoError(t, err, "error setting up fake agent install directory") b := new(strings.Builder) - err = expectedMarkerContentTemplate.Execute(b, map[string]string{"Version": versions[i], "ValidUntil": ttls[i]}) + err = expectedMarkerContentTemplate.Execute(b, map[string]string{"Version": versions[i], "ValidUntil": ttls[i], "Hash": hashes[i]}) require.NoError(t, err, "error setting up ttl marker") err = os.WriteFile(filepath.Join(tmpDir, "data", versionedHomes[i], ttlMarkerName), []byte(b.String()), 0644) } diff --git a/internal/pkg/agent/application/upgrade/upgrade.go b/internal/pkg/agent/application/upgrade/upgrade.go index eb00f1a1638..98a246d748c 100644 --- a/internal/pkg/agent/application/upgrade/upgrade.go +++ b/internal/pkg/agent/application/upgrade/upgrade.go @@ -468,7 +468,7 @@ func (u *Upgrader) Upgrade(ctx context.Context, version string, rollback bool, s versionedHome: currentVersionedHome, } - availableRollbacks := getAvailableRollbacks(rollbackWindow, time.Now(), release.VersionWithSnapshot(), previousParsedVersion, currentVersionedHome) + availableRollbacks := getAvailableRollbacks(rollbackWindow, time.Now(), release.VersionWithSnapshot(), previousParsedVersion, currentVersionedHome, release.Commit()) if err = u.availableRollbacksSource.Set(availableRollbacks); err != nil { u.log.Errorw("Rolling back: setting ttl markers failed", "error.message", err) From 754258cadd4dbf6de0a336156c7d5311afca03be Mon Sep 17 00:00:00 2001 From: Paolo Chila Date: Wed, 1 Oct 2025 11:40:49 +0200 Subject: [PATCH 06/14] Normalize install descriptors at startup --- .mockery.yaml | 3 + internal/pkg/agent/application/application.go | 94 +++++++++ .../pkg/agent/application/application_test.go | 191 ++++++++++++++++++ internal/pkg/agent/application/mocks.go | 148 ++++++++++++++ .../pkg/agent/application/upgrade/rollback.go | 12 +- .../application/upgrade/rollback_test.go | 18 +- .../agent/application/upgrade/step_mark.go | 4 +- .../agent/application/upgrade/step_relink.go | 4 +- .../agent/application/upgrade/step_unpack.go | 10 +- .../application/upgrade/step_unpack_test.go | 12 +- .../pkg/agent/application/upgrade/upgrade.go | 16 +- .../agent/application/upgrade/upgrade_test.go | 4 +- .../pkg/agent/application/upgrade/watcher.go | 4 +- .../agent/application/upgrade/watcher_test.go | 2 +- 14 files changed, 479 insertions(+), 43 deletions(-) create mode 100644 internal/pkg/agent/application/mocks.go diff --git a/.mockery.yaml b/.mockery.yaml index 8d8a896feb3..0b2e6c8c63b 100644 --- a/.mockery.yaml +++ b/.mockery.yaml @@ -5,6 +5,9 @@ filename: mocks.go template-data: unroll-variadic: true packages: + github.com/elastic/elastic-agent/internal/pkg/agent/application: + interfaces: + rollbacksSource: {} github.com/elastic/elastic-agent/internal/pkg/agent/application/actions/handlers: interfaces: Uploader: {} diff --git a/internal/pkg/agent/application/application.go b/internal/pkg/agent/application/application.go index 825b078cceb..2d69196084e 100644 --- a/internal/pkg/agent/application/application.go +++ b/internal/pkg/agent/application/application.go @@ -7,11 +7,14 @@ package application import ( "context" "fmt" + "os" + "path/filepath" "time" "go.elastic.co/apm/v2" componentmonitoring "github.com/elastic/elastic-agent/internal/pkg/agent/application/monitoring/component" + "github.com/elastic/elastic-agent/internal/pkg/agent/install" "github.com/elastic/go-ucfg" @@ -48,6 +51,11 @@ import ( "github.com/elastic/elastic-agent/version" ) +type rollbacksSource interface { + Set(map[string]upgrade.TTLMarker) error + Get() (map[string]upgrade.TTLMarker, error) +} + // CfgOverrider allows for application driven overrides of configuration read from disk. type CfgOverrider func(cfg *configuration.Configuration) @@ -131,6 +139,10 @@ func New( isMonitoringSupported := !disableMonitoring && cfg.Settings.V1MonitoringEnabled availableRollbacksSource := upgrade.NewTTLMarkerRegistry(log, paths.Top()) + if platform.OS != component.Container { + // If we are not running in a container, check and normalize the install descriptor before we start the agent + normalizeInstallDescriptorAtStartup(log, paths.Top(), time.Now(), initialUpdateMarker, availableRollbacksSource) + } upgrader, err := upgrade.NewUpgrader(log, cfg.Settings.DownloadConfig, cfg.Settings.Upgrade, agentInfo, new(upgrade.AgentWatcherHelper), availableRollbacksSource) if err != nil { return nil, nil, nil, fmt.Errorf("failed to create upgrader: %w", err) @@ -296,6 +308,88 @@ func New( return coord, configMgr, varsManager, nil } +// normalizeInstallDescriptorAtStartup will check the install descriptor checking: +// - if we just rolled back: the update marker is checked and in case of rollback we clean up the entry about the failed upgraded install +// - check all the entries: +// - verify that the home directory for that install still exists (remove what does not exist anymore) +// - TODO check TTLs of installs to schedule delayed cleanup while the agent is running +// +// This function will NOT error out, it will log any errors it encounters as warnings but any error must be treated as non-fatal +func normalizeInstallDescriptorAtStartup(log *logger.Logger, topDir string, now time.Time, initialUpdateMarker *upgrade.UpdateMarker, rollbackSource rollbacksSource) { + // Check if we rolled back and update the install descriptor + if initialUpdateMarker != nil && initialUpdateMarker.Details != nil && initialUpdateMarker.Details.State == details.StateRollback { + // Reset the TTL for the current version if we are coming off a rollback + rollbacks, err := rollbackSource.Get() + if err != nil { + log.Warnf("Error getting available rollbacks from rollbackSource during startup check: %s", err) + return + } + + // remove the current versioned home TTL marker + delete(rollbacks, initialUpdateMarker.PrevVersionedHome) + err = rollbackSource.Set(rollbacks) + if err != nil { + log.Warnf("Error removing install descriptor from installDescriptorSource during startup check: %s", err) + return + } + } + + // check if we need to cleanup old agent installs + rollbacks, err := rollbackSource.Get() + if err != nil { + log.Warnf("Error getting available rollbacks during startup check: %s", err) + return + } + + var versionedHomesToCleanup []string + for versionedHome, ttlMarker := range rollbacks { + + versionedHomeAbsPath := filepath.Join(topDir, versionedHome) + + if versionedHomeAbsPath == paths.HomeFrom(topDir) { + // skip the current install + log.Warnf("Found a TTL marker for the currently running agent at %s. Skipping cleanup...", versionedHome) + continue + } + + _, err = os.Stat(versionedHomeAbsPath) + if errors.Is(err, os.ErrNotExist) { + log.Warnf("Versioned home %s corresponding to agent install descriptor %+v is not found on disk", versionedHomeAbsPath, ttlMarker) + versionedHomesToCleanup = append(versionedHomesToCleanup, versionedHome) + continue + } + + if err != nil { + log.Warnf("error checking versioned home %s for agent install: %s", versionedHomeAbsPath, err.Error()) + continue + } + + if now.After(ttlMarker.ValidUntil) { + // the install directory exists but it's expired. Remove the files. + log.Infof("agent install descriptor %+v is expired, removing directory %q", ttlMarker, versionedHomeAbsPath) + if cleanupErr := install.RemoveBut(versionedHomeAbsPath, true); cleanupErr != nil { + log.Warnf("Error removing directory %q: %s", versionedHomeAbsPath, cleanupErr) + } else { + log.Infof("Directory %q was removed", versionedHomeAbsPath) + versionedHomesToCleanup = append(versionedHomesToCleanup, versionedHome) + } + } + } + + if len(versionedHomesToCleanup) > 0 { + log.Infof("removing install descriptor(s) for %v", versionedHomesToCleanup) + for _, versionedHomeToCleanup := range versionedHomesToCleanup { + delete(rollbacks, versionedHomeToCleanup) + } + err = rollbackSource.Set(rollbacks) + if err != nil { + log.Warnf("Error removing install descriptor(s): %s", err) + } + } + + return +} + func mergeFleetConfig(ctx context.Context, rawConfig *config.Config) (storage.Store, *configuration.Configuration, error) { path := paths.AgentConfigFile() store, err := storage.NewEncryptedDiskStore(ctx, path) diff --git a/internal/pkg/agent/application/application_test.go b/internal/pkg/agent/application/application_test.go index 3c4ea820ea0..3c3c2a63810 100644 --- a/internal/pkg/agent/application/application_test.go +++ b/internal/pkg/agent/application/application_test.go @@ -7,6 +7,9 @@ package application import ( "context" "fmt" + "os" + "path/filepath" + "runtime" "testing" "time" @@ -15,6 +18,9 @@ import ( "github.com/elastic/elastic-agent-libs/logp" "github.com/elastic/elastic-agent/internal/pkg/agent/application/info" + "github.com/elastic/elastic-agent/internal/pkg/agent/application/paths" + "github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade" + "github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade/details" "github.com/elastic/elastic-agent/internal/pkg/config" "github.com/elastic/elastic-agent/internal/pkg/testutils" "github.com/elastic/elastic-agent/pkg/core/logger/loggertest" @@ -302,3 +308,188 @@ func TestInjectOutputOverrides(t *testing.T) { }) } } + +func Test_normalizeInstallDescriptorAtStartup(t *testing.T) { + + now := time.Now() + tomorrow := now.Add(24 * time.Hour) + yesterday := now.Add(-24 * time.Hour) + + tests := []struct { + name string + setup func(t *testing.T, topDir string) (*upgrade.UpdateMarker, rollbacksSource) + postNormalizeAssertions func(t *testing.T, topDir string, initialUpdateMarker *upgrade.UpdateMarker) + }{ + { + name: "happy path: single install, no rollbacks, no modifications needed", + setup: func(t *testing.T, topDir string) (*upgrade.UpdateMarker, rollbacksSource) { + mockRollbackSource := newMockRollbacksSource(t) + mockRollbackSource.EXPECT().Get().Return(nil, nil) + return nil, mockRollbackSource + }, + + postNormalizeAssertions: nil, + }, + { + name: "Agent was manually rolled back: rolled back install is removed from the list", + setup: func(t *testing.T, topDir string) (*upgrade.UpdateMarker, rollbacksSource) { + newAgentInstallPath := createFakeAgentInstall(t, topDir, "4.5.6", "newversionhash", true) + oldAgentInstallPath := createFakeAgentInstall(t, topDir, "1.2.3", "oldversionhash", true) + + mockRollbackSource := newMockRollbacksSource(t) + mockRollbackSource.EXPECT().Get().Return(map[string]upgrade.TTLMarker{ + oldAgentInstallPath: { + Version: "1.2.3", + Hash: "oldversionhash", + ValidUntil: tomorrow, + }, + }, nil) + + updateMarker := &upgrade.UpdateMarker{ + Version: "4.5.6", + Hash: "newversionhash", + VersionedHome: newAgentInstallPath, + UpdatedOn: now, + PrevVersion: "1.2.3", + PrevHash: "oldversionhash", + PrevVersionedHome: oldAgentInstallPath, + Acked: false, + Action: nil, + Details: &details.Details{ + TargetVersion: "4.5.6", + State: details.StateRollback, + ActionID: "", + Metadata: details.Metadata{ + Reason: details.ReasonManualRollbackPattern, + }, + }, + } + + // expect code to clear the rollback + mockRollbackSource.EXPECT().Set(map[string]upgrade.TTLMarker{}).Return(nil) + return updateMarker, mockRollbackSource + }, + postNormalizeAssertions: nil, + }, + { + name: "Entries not having a matching install directory will be removed from the list", + setup: func(t *testing.T, topDir string) (*upgrade.UpdateMarker, rollbacksSource) { + _ = createFakeAgentInstall(t, topDir, "4.5.6", "newversionhash", true) + oldAgentInstallPath := createFakeAgentInstall(t, topDir, "1.2.3", "oldversionhash", true) + + mockRollbackSource := newMockRollbacksSource(t) + nonExistingVersionedHome := filepath.Join("data", "thisdirectorydoesnotexist") + mockRollbackSource.EXPECT().Get().Return(map[string]upgrade.TTLMarker{ + oldAgentInstallPath: { + Version: "1.2.3", + Hash: "oldversionhash", + ValidUntil: tomorrow, + }, + nonExistingVersionedHome: { + Version: "0.0.0", + Hash: "nonExistingHash", + ValidUntil: tomorrow, + }, + }, nil) + + mockRollbackSource.EXPECT().Set(map[string]upgrade.TTLMarker{ + oldAgentInstallPath: { + Version: "1.2.3", + Hash: "oldversionhash", + ValidUntil: tomorrow, + }, + }).Return(nil) + return nil, mockRollbackSource + }, + postNormalizeAssertions: nil, + }, + { + name: "Expired installs still existing on disk will be removed from the install list and removed from disk", + setup: func(t *testing.T, topDir string) (*upgrade.UpdateMarker, rollbacksSource) { + _ = createFakeAgentInstall(t, topDir, "4.5.6", "newversionhash", true) + oldAgentInstallPath := createFakeAgentInstall(t, topDir, "1.2.3", "oldversionhash", true) + + // assert that the versionedHome of the old install is the same we check in postNormalizeAssertions + assert.Equal(t, oldAgentInstallPath, filepath.Join("data", "elastic-agent-1.2.3-oldver"), + "Unexpected old install versioned home. Post normalize assertions may not be working") + + mockRollbackSource := newMockRollbacksSource(t) + mockRollbackSource.EXPECT().Get().Return( + map[string]upgrade.TTLMarker{ + oldAgentInstallPath: { + Version: "1.2.3", + Hash: "oldver", + ValidUntil: yesterday, + }, + }, + nil, + ) + // expect removal of the existing ttlmarker + mockRollbackSource.EXPECT().Set(map[string]upgrade.TTLMarker{}).Return(nil) + return nil, mockRollbackSource + }, + postNormalizeAssertions: func(t *testing.T, topDir string, _ *upgrade.UpdateMarker) { + assert.NoDirExists(t, filepath.Join(topDir, "data", "elastic-agent-1.2.3-oldver")) + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + logger, _ := loggertest.New(t.Name()) + tmpDir := t.TempDir() + updateMarker, installSource := tt.setup(t, tmpDir) + normalizeInstallDescriptorAtStartup(logger, tmpDir, now, updateMarker, installSource) + if tt.postNormalizeAssertions != nil { + tt.postNormalizeAssertions(t, tmpDir, updateMarker) + } + }) + } +} + +// createFakeAgentInstall (copied from the upgrade package tests) will create a mock agent install within topDir, possibly +// using the version in the directory name, depending on useVersionInPath it MUST return the path to the created versionedHome +// relative to topDir, to mirror what step_unpack returns +func createFakeAgentInstall(t *testing.T, topDir, version, hash string, useVersionInPath bool) string { + + // create versioned home + versionedHome := fmt.Sprintf("elastic-agent-%s", hash[:upgrade.HashLen]) + if useVersionInPath { + // use the version passed as parameter + versionedHome = fmt.Sprintf("elastic-agent-%s-%s", version, hash[:upgrade.HashLen]) + } + relVersionedHomePath := filepath.Join("data", versionedHome) + absVersionedHomePath := filepath.Join(topDir, relVersionedHomePath) + + // recalculate the binary path and launch a mkDirAll to account for MacOS weirdness + // (the extra nesting of elastic agent binary within versionedHome) + absVersioneHomeBinaryPath := paths.BinaryPath(absVersionedHomePath, "") + err := os.MkdirAll(absVersioneHomeBinaryPath, 0o750) + require.NoError(t, err, "error creating fake install versioned home directory (including binary path) %q", absVersioneHomeBinaryPath) + + // place a few directories in the fake install + absComponentsDirPath := filepath.Join(absVersionedHomePath, "components") + err = os.MkdirAll(absComponentsDirPath, 0o750) + require.NoError(t, err, "error creating fake install components directory %q", absVersionedHomePath) + + absLogsDirPath := filepath.Join(absVersionedHomePath, "logs") + err = os.MkdirAll(absLogsDirPath, 0o750) + require.NoError(t, err, "error creating fake install logs directory %q", absLogsDirPath) + + absRunDirPath := filepath.Join(absVersionedHomePath, "run") + err = os.MkdirAll(absRunDirPath, 0o750) + require.NoError(t, err, "error creating fake install run directory %q", absRunDirPath) + + // put some placeholder for files + agentExecutableName := upgrade.AgentName + if runtime.GOOS == "windows" { + agentExecutableName += ".exe" + } + err = os.WriteFile(paths.BinaryPath(absVersionedHomePath, agentExecutableName), []byte(fmt.Sprintf("Placeholder for agent %s", version)), 0o750) + require.NoErrorf(t, err, "error writing elastic agent binary placeholder %q", agentExecutableName) + fakeLogPath := filepath.Join(absLogsDirPath, "fakelog.ndjson") + err = os.WriteFile(fakeLogPath, []byte(fmt.Sprintf("Sample logs for agent %s", version)), 0o750) + require.NoErrorf(t, err, "error writing fake log placeholder %q", fakeLogPath) + + // return the path relative to top exactly like the step_unpack does + return relVersionedHomePath +} diff --git a/internal/pkg/agent/application/mocks.go b/internal/pkg/agent/application/mocks.go new file mode 100644 index 00000000000..e2dd99b9e9c --- /dev/null +++ b/internal/pkg/agent/application/mocks.go @@ -0,0 +1,148 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License 2.0; +// you may not use this file except in compliance with the Elastic License 2.0. + +// Code generated by mockery; DO NOT EDIT. +// github.com/vektra/mockery +// template: testify + +package application + +import ( + mock "github.com/stretchr/testify/mock" + + "github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade" +) + +// newMockRollbacksSource creates a new instance of mockRollbacksSource. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations. +// The first argument is typically a *testing.T value. +func newMockRollbacksSource(t interface { + mock.TestingT + Cleanup(func()) +}) *mockRollbacksSource { + mock := &mockRollbacksSource{} + mock.Mock.Test(t) + + t.Cleanup(func() { mock.AssertExpectations(t) }) + + return mock +} + +// mockRollbacksSource is an autogenerated mock type for the rollbacksSource type +type mockRollbacksSource struct { + mock.Mock +} + +type mockRollbacksSource_Expecter struct { + mock *mock.Mock +} + +func (_m *mockRollbacksSource) EXPECT() *mockRollbacksSource_Expecter { + return &mockRollbacksSource_Expecter{mock: &_m.Mock} +} + +// Get provides a mock function for the type mockRollbacksSource +func (_mock *mockRollbacksSource) Get() (map[string]upgrade.TTLMarker, error) { + ret := _mock.Called() + + if len(ret) == 0 { + panic("no return value specified for Get") + } + + var r0 map[string]upgrade.TTLMarker + var r1 error + if returnFunc, ok := ret.Get(0).(func() (map[string]upgrade.TTLMarker, error)); ok { + return returnFunc() + } + if returnFunc, ok := ret.Get(0).(func() map[string]upgrade.TTLMarker); ok { + r0 = returnFunc() + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(map[string]upgrade.TTLMarker) + } + } + if returnFunc, ok := ret.Get(1).(func() error); ok { + r1 = returnFunc() + } else { + r1 = ret.Error(1) + } + return r0, r1 +} + +// mockRollbacksSource_Get_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'Get' +type mockRollbacksSource_Get_Call struct { + *mock.Call +} + +// Get is a helper method to define mock.On call +func (_e *mockRollbacksSource_Expecter) Get() *mockRollbacksSource_Get_Call { + return &mockRollbacksSource_Get_Call{Call: _e.mock.On("Get")} +} + +func (_c *mockRollbacksSource_Get_Call) Run(run func()) *mockRollbacksSource_Get_Call { + _c.Call.Run(func(args mock.Arguments) { + run() + }) + return _c +} + +func (_c *mockRollbacksSource_Get_Call) Return(stringToTTLMarker map[string]upgrade.TTLMarker, err error) *mockRollbacksSource_Get_Call { + _c.Call.Return(stringToTTLMarker, err) + return _c +} + +func (_c *mockRollbacksSource_Get_Call) RunAndReturn(run func() (map[string]upgrade.TTLMarker, error)) *mockRollbacksSource_Get_Call { + _c.Call.Return(run) + return _c +} + +// Set provides a mock function for the type mockRollbacksSource +func (_mock *mockRollbacksSource) Set(stringToTTLMarker map[string]upgrade.TTLMarker) error { + ret := _mock.Called(stringToTTLMarker) + + if len(ret) == 0 { + panic("no return value specified for Set") + } + + var r0 error + if returnFunc, ok := ret.Get(0).(func(map[string]upgrade.TTLMarker) error); ok { + r0 = returnFunc(stringToTTLMarker) + } else { + r0 = ret.Error(0) + } + return r0 +} + +// mockRollbacksSource_Set_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'Set' +type mockRollbacksSource_Set_Call struct { + *mock.Call +} + +// Set is a helper method to define mock.On call +// - stringToTTLMarker map[string]upgrade.TTLMarker +func (_e *mockRollbacksSource_Expecter) Set(stringToTTLMarker interface{}) *mockRollbacksSource_Set_Call { + return &mockRollbacksSource_Set_Call{Call: _e.mock.On("Set", stringToTTLMarker)} +} + +func (_c *mockRollbacksSource_Set_Call) Run(run func(stringToTTLMarker map[string]upgrade.TTLMarker)) *mockRollbacksSource_Set_Call { + _c.Call.Run(func(args mock.Arguments) { + var arg0 map[string]upgrade.TTLMarker + if args[0] != nil { + arg0 = args[0].(map[string]upgrade.TTLMarker) + } + run( + arg0, + ) + }) + return _c +} + +func (_c *mockRollbacksSource_Set_Call) Return(err error) *mockRollbacksSource_Set_Call { + _c.Call.Return(err) + return _c +} + +func (_c *mockRollbacksSource_Set_Call) RunAndReturn(run func(stringToTTLMarker map[string]upgrade.TTLMarker) error) *mockRollbacksSource_Set_Call { + _c.Call.Return(run) + return _c +} diff --git a/internal/pkg/agent/application/upgrade/rollback.go b/internal/pkg/agent/application/upgrade/rollback.go index a9542ed109e..6aa8bfda5e9 100644 --- a/internal/pkg/agent/application/upgrade/rollback.go +++ b/internal/pkg/agent/application/upgrade/rollback.go @@ -78,16 +78,16 @@ func RollbackWithOpts(ctx context.Context, log *logger.Logger, c client.Client, settings := NewRollbackSettings(opts...) - symlinkPath := filepath.Join(topDirPath, agentName) + symlinkPath := filepath.Join(topDirPath, AgentName) var symlinkTarget string if prevVersionedHome != "" { - symlinkTarget = paths.BinaryPath(filepath.Join(topDirPath, prevVersionedHome), agentName) + symlinkTarget = paths.BinaryPath(filepath.Join(topDirPath, prevVersionedHome), AgentName) } else { // fallback for upgrades that didn't use the manifest and path remapping - hashedDir := fmt.Sprintf("%s-%s", agentName, prevHash) + hashedDir := fmt.Sprintf("%s-%s", AgentName, prevHash) // paths.BinaryPath properly derives the binary directory depending on the platform. The path to the binary for macOS is inside of the app bundle. - symlinkTarget = paths.BinaryPath(filepath.Join(paths.DataFrom(topDirPath), hashedDir), agentName) + symlinkTarget = paths.BinaryPath(filepath.Join(paths.DataFrom(topDirPath), hashedDir), AgentName) } // change symlink if err := changeSymlink(log, topDirPath, symlinkPath, symlinkTarget); err != nil { @@ -128,7 +128,7 @@ func RollbackWithOpts(ctx context.Context, log *logger.Logger, c client.Client, } if prevVersionedHome == "" { - prevVersionedHome = filepath.Join("data", fmt.Sprintf("%s-%s", agentName, prevHash)) + prevVersionedHome = filepath.Join("data", fmt.Sprintf("%s-%s", AgentName, prevHash)) } // cleanup everything except version we're rolling back into @@ -176,7 +176,7 @@ func cleanup(log *logger.Logger, topDirPath string, removeMarker, keepLogs bool, log.Infow("Removing previous symlink path", "file.path", prevSymlinkPath(topDirPath)) _ = os.Remove(prevSymlink) - dirPrefix := fmt.Sprintf("%s-", agentName) + dirPrefix := fmt.Sprintf("%s-", AgentName) relativeHomePaths := make([]string, len(versionedHomesToKeep)) for i, h := range versionedHomesToKeep { diff --git a/internal/pkg/agent/application/upgrade/rollback_test.go b/internal/pkg/agent/application/upgrade/rollback_test.go index df87bc5b454..4a977905354 100644 --- a/internal/pkg/agent/application/upgrade/rollback_test.go +++ b/internal/pkg/agent/application/upgrade/rollback_test.go @@ -339,7 +339,7 @@ func TestRollback(t *testing.T) { func TestRollbackWithOpts(t *testing.T) { type hookFuncWithLogs func(t *testing.T, logs *observer.ObservedLogs, topDir string) - agentExecutableName := agentName + agentExecutableName := AgentName if runtime.GOOS == "windows" { agentExecutableName += ".exe" } @@ -595,7 +595,7 @@ func checkFilesAfterCleanup(t *testing.T, topDir, newAgentHome string, oldAgentH // check the new agent home assert.DirExists(t, filepath.Join(topDir, newAgentHome), "new agent directory should exist after cleanup") - agentExecutable := agentName + agentExecutable := AgentName if runtime.GOOS == "windows" { agentExecutable += ".exe" } @@ -624,11 +624,11 @@ func checkFilesAfterRollback(t *testing.T, topDir, oldAgentHome, newAgentHome st // some things should have been removed from the new agent directory assert.NoDirExists(t, filepath.Join(topDir, newAgentHome, "components"), "new agent components directory should have been cleaned up in the rollback") assert.NoDirExists(t, filepath.Join(topDir, newAgentHome, "run"), "new agent run directory should have been cleaned up in the rollback") - assert.NoFileExists(t, filepath.Join(topDir, newAgentHome, agentName), "new agent binary should have been cleaned up in the rollback") + assert.NoFileExists(t, filepath.Join(topDir, newAgentHome, AgentName), "new agent binary should have been cleaned up in the rollback") // check the old agent home assert.DirExists(t, filepath.Join(topDir, oldAgentHome), "old agent directory should exist after rollback") - agentExecutable := agentName + agentExecutable := AgentName if runtime.GOOS == "windows" { agentExecutable += ".exe" } @@ -693,10 +693,10 @@ func setupAgents(t *testing.T, log *logger.Logger, topDir string, installations func createFakeAgentInstall(t *testing.T, topDir, version, hash string, useVersionInPath bool) string { // create versioned home - versionedHome := fmt.Sprintf("elastic-agent-%s", hash[:hashLen]) + versionedHome := fmt.Sprintf("elastic-agent-%s", hash[:HashLen]) if useVersionInPath { // use the version passed as parameter - versionedHome = fmt.Sprintf("elastic-agent-%s-%s", version, hash[:hashLen]) + versionedHome = fmt.Sprintf("elastic-agent-%s-%s", version, hash[:HashLen]) } relVersionedHomePath := filepath.Join("data", versionedHome) absVersionedHomePath := filepath.Join(topDir, relVersionedHomePath) @@ -721,7 +721,7 @@ func createFakeAgentInstall(t *testing.T, topDir, version, hash string, useVersi require.NoError(t, err, "error creating fake install run directory %q", absRunDirPath) // put some placeholder for files - agentExecutableName := agentName + agentExecutableName := AgentName if runtime.GOOS == "windows" { agentExecutableName += ".exe" } @@ -736,8 +736,8 @@ func createFakeAgentInstall(t *testing.T, topDir, version, hash string, useVersi } func createLink(t *testing.T, topDir string, currentAgentVersionedHome string) { - linkTarget := paths.BinaryPath(currentAgentVersionedHome, agentName) - linkName := agentName + linkTarget := paths.BinaryPath(currentAgentVersionedHome, AgentName) + linkName := AgentName if runtime.GOOS == "windows" { linkTarget += ".exe" linkName += ".exe" diff --git a/internal/pkg/agent/application/upgrade/step_mark.go b/internal/pkg/agent/application/upgrade/step_mark.go index b5a9e962c30..11e7b208430 100644 --- a/internal/pkg/agent/application/upgrade/step_mark.go +++ b/internal/pkg/agent/application/upgrade/step_mark.go @@ -143,8 +143,8 @@ type updateActiveCommitFunc func(log *logger.Logger, topDirPath, hash string, wr func markUpgradeProvider(updateActiveCommit updateActiveCommitFunc, writeFile writeFileFunc) markUpgradeFunc { return func(log *logger.Logger, dataDirPath string, updatedOn time.Time, agent, previousAgent agentInstall, action *fleetapi.ActionUpgrade, upgradeDetails *details.Details, availableRollbacks map[string]TTLMarker) error { - if len(previousAgent.hash) > hashLen { - previousAgent.hash = previousAgent.hash[:hashLen] + if len(previousAgent.hash) > HashLen { + previousAgent.hash = previousAgent.hash[:HashLen] } marker := &UpdateMarker{ diff --git a/internal/pkg/agent/application/upgrade/step_relink.go b/internal/pkg/agent/application/upgrade/step_relink.go index d6fc9a6b9c1..82b1a597889 100644 --- a/internal/pkg/agent/application/upgrade/step_relink.go +++ b/internal/pkg/agent/application/upgrade/step_relink.go @@ -44,11 +44,11 @@ func changeSymlink(log *logger.Logger, topDirPath, symlinkPath, newTarget string } func prevSymlinkPath(topDirPath string) string { - agentPrevName := agentName + ".prev" + agentPrevName := AgentName + ".prev" // handle windows suffixes if runtime.GOOS == windowsOSName { - agentPrevName = agentName + ".exe.prev" + agentPrevName = AgentName + ".exe.prev" } return filepath.Join(topDirPath, agentPrevName) diff --git a/internal/pkg/agent/application/upgrade/step_unpack.go b/internal/pkg/agent/application/upgrade/step_unpack.go index ae0f964ee65..e39ea77ebe5 100644 --- a/internal/pkg/agent/application/upgrade/step_unpack.go +++ b/internal/pkg/agent/application/upgrade/step_unpack.go @@ -123,7 +123,7 @@ func unzip(log *logger.Logger, archivePath, dataDir string, flavor string, copy return UnpackResult{}, fmt.Errorf("retrieving package metadata from %q: %w", archivePath, err) } - hash = metadata.hash[:hashLen] + hash = metadata.hash[:HashLen] var registry map[string][]string if metadata.manifest != nil { pm.mappings = metadata.manifest.Package.PathMappings @@ -368,7 +368,7 @@ func untar(log *logger.Logger, archivePath, dataDir string, flavor string, copy return UnpackResult{}, fmt.Errorf("retrieving package metadata from %q: %w", archivePath, err) } - hash = metadata.hash[:hashLen] + hash = metadata.hash[:HashLen] var registry map[string][]string if metadata.manifest != nil { @@ -641,8 +641,8 @@ func readCommitHash(reader io.Reader) (string, error) { return "", fmt.Errorf("reading agent commit hash file: %w", err) } hash := strings.TrimSpace(string(commitBytes)) - if len(hash) < hashLen { - return "", fmt.Errorf("hash %q is shorter than minimum length %d", string(commitBytes), hashLen) + if len(hash) < HashLen { + return "", fmt.Errorf("hash %q is shorter than minimum length %d", string(commitBytes), HashLen) } return hash, nil } @@ -767,5 +767,5 @@ func getFilesContentFromTar(archivePath string, files ...string) (map[string]io. // createVersionedHomeFromHash returns a versioned home path relative to topPath in the legacy format `elastic-agent-` // formatted using OS-dependent path separators func createVersionedHomeFromHash(hash string) string { - return filepath.Join("data", fmt.Sprintf("elastic-agent-%s", hash[:hashLen])) + return filepath.Join("data", fmt.Sprintf("elastic-agent-%s", hash[:HashLen])) } diff --git a/internal/pkg/agent/application/upgrade/step_unpack_test.go b/internal/pkg/agent/application/upgrade/step_unpack_test.go index 49a0bc1accb..6b2df7d5f4a 100644 --- a/internal/pkg/agent/application/upgrade/step_unpack_test.go +++ b/internal/pkg/agent/application/upgrade/step_unpack_test.go @@ -121,7 +121,7 @@ var archiveFilesWithMoreComponents = []files{ {fType: REGULAR, path: "elastic-agent-1.2.3-SNAPSHOT-someos-x86_64/" + agentCommitFile, content: "abcdefghijklmnopqrstuvwxyz", mode: fs.ModePerm & 0o640}, {fType: DIRECTORY, path: "elastic-agent-1.2.3-SNAPSHOT-someos-x86_64/data", mode: fs.ModeDir | (fs.ModePerm & 0o750)}, {fType: DIRECTORY, path: "elastic-agent-1.2.3-SNAPSHOT-someos-x86_64/data/elastic-agent-abcdef", mode: fs.ModeDir | (fs.ModePerm & 0o750)}, - {fType: REGULAR, path: "elastic-agent-1.2.3-SNAPSHOT-someos-x86_64/data/elastic-agent-abcdef/" + agentName, content: agentBinaryPlaceholderContent, mode: fs.ModePerm & 0o750}, + {fType: REGULAR, path: "elastic-agent-1.2.3-SNAPSHOT-someos-x86_64/data/elastic-agent-abcdef/" + AgentName, content: agentBinaryPlaceholderContent, mode: fs.ModePerm & 0o750}, {fType: REGULAR, path: "elastic-agent-1.2.3-SNAPSHOT-someos-x86_64/data/elastic-agent-abcdef/package.version", content: "1.2.3", mode: fs.ModePerm & 0o640}, {fType: DIRECTORY, path: "elastic-agent-1.2.3-SNAPSHOT-someos-x86_64/data/elastic-agent-abcdef/components", mode: fs.ModeDir | (fs.ModePerm & 0o750)}, {fType: REGULAR, path: "elastic-agent-1.2.3-SNAPSHOT-someos-x86_64/data/elastic-agent-abcdef/components/comp1", binary: true, content: "Placeholder for component", mode: fs.ModePerm & 0o750}, @@ -142,7 +142,7 @@ var archiveFilesWithManifestNoSymlink = []files{ {fType: REGULAR, path: "elastic-agent-1.2.3-SNAPSHOT-someos-x86_64/" + agentCommitFile, content: "abcdefghijklmnopqrstuvwxyz", mode: fs.ModePerm & 0o640}, {fType: DIRECTORY, path: "elastic-agent-1.2.3-SNAPSHOT-someos-x86_64/data", mode: fs.ModeDir | (fs.ModePerm & 0o750)}, {fType: DIRECTORY, path: "elastic-agent-1.2.3-SNAPSHOT-someos-x86_64/data/elastic-agent-abcdef", mode: fs.ModeDir | (fs.ModePerm & 0o750)}, - {fType: REGULAR, path: "elastic-agent-1.2.3-SNAPSHOT-someos-x86_64/data/elastic-agent-abcdef/" + agentName, content: agentBinaryPlaceholderContent, mode: fs.ModePerm & 0o750}, + {fType: REGULAR, path: "elastic-agent-1.2.3-SNAPSHOT-someos-x86_64/data/elastic-agent-abcdef/" + AgentName, content: agentBinaryPlaceholderContent, mode: fs.ModePerm & 0o750}, {fType: REGULAR, path: "elastic-agent-1.2.3-SNAPSHOT-someos-x86_64/data/elastic-agent-abcdef/package.version", content: "1.2.3", mode: fs.ModePerm & 0o640}, {fType: DIRECTORY, path: "elastic-agent-1.2.3-SNAPSHOT-someos-x86_64/data/elastic-agent-abcdef/components", mode: fs.ModeDir | (fs.ModePerm & 0o750)}, {fType: REGULAR, path: "elastic-agent-1.2.3-SNAPSHOT-someos-x86_64/data/elastic-agent-abcdef/components/comp1", content: "Placeholder for component", mode: fs.ModePerm & 0o750}, @@ -153,7 +153,7 @@ var outOfOrderArchiveFilesNoManifestNoSymlink = []files{ {fType: DIRECTORY, path: "elastic-agent-1.2.3-SNAPSHOT-someos-x86_64", mode: fs.ModeDir | (fs.ModePerm & 0o750)}, {fType: REGULAR, path: "elastic-agent-1.2.3-SNAPSHOT-someos-x86_64/" + agentCommitFile, content: "abcdefghijklmnopqrstuvwxyz", mode: fs.ModePerm & 0o640}, {fType: REGULAR, path: "elastic-agent-1.2.3-SNAPSHOT-someos-x86_64/data/elastic-agent-abcdef/package.version", content: "1.2.3", mode: fs.ModePerm & 0o640}, - {fType: REGULAR, path: "elastic-agent-1.2.3-SNAPSHOT-someos-x86_64/data/elastic-agent-abcdef/" + agentName, content: agentBinaryPlaceholderContent, mode: fs.ModePerm & 0o750}, + {fType: REGULAR, path: "elastic-agent-1.2.3-SNAPSHOT-someos-x86_64/data/elastic-agent-abcdef/" + AgentName, content: agentBinaryPlaceholderContent, mode: fs.ModePerm & 0o750}, {fType: DIRECTORY, path: "elastic-agent-1.2.3-SNAPSHOT-someos-x86_64/data", mode: fs.ModeDir | (fs.ModePerm & 0o750)}, {fType: DIRECTORY, path: "elastic-agent-1.2.3-SNAPSHOT-someos-x86_64/data/elastic-agent-abcdef", mode: fs.ModeDir | (fs.ModePerm & 0o700)}, {fType: DIRECTORY, path: "elastic-agent-1.2.3-SNAPSHOT-someos-x86_64/data/elastic-agent-abcdef/components", mode: fs.ModeDir | (fs.ModePerm & 0o750)}, @@ -161,7 +161,7 @@ var outOfOrderArchiveFilesNoManifestNoSymlink = []files{ {fType: REGULAR, path: "elastic-agent-1.2.3-SNAPSHOT-someos-x86_64/data/elastic-agent-abcdef/components/comp1.spec.yml", content: foo_component_spec, mode: fs.ModePerm & 0o640}, } -var agentArchiveSymLink = files{fType: SYMLINK, path: "elastic-agent-1.2.3-SNAPSHOT-someos-x86_64/" + agentName, content: "data/elastic-agent-abcdef/" + agentName, mode: fs.ModeSymlink | (fs.ModePerm & 0o750)} +var agentArchiveSymLink = files{fType: SYMLINK, path: "elastic-agent-1.2.3-SNAPSHOT-someos-x86_64/" + AgentName, content: "data/elastic-agent-abcdef/" + AgentName, mode: fs.ModeSymlink | (fs.ModePerm & 0o750)} type fileType uint @@ -603,7 +603,7 @@ func checkExtractedFilesOutOfOrder(t *testing.T, versionedHome string) { } actualPermissions := fs.ModePerm & stat.Mode() assert.Equalf(t, expectedPermissions, actualPermissions, "Wrong permissions set on versioned home %q: expected %O, got %O", versionedHome, expectedPermissions, actualPermissions) - agentExecutable := filepath.Join(versionedHome, agentName) + agentExecutable := filepath.Join(versionedHome, AgentName) if assert.FileExistsf(t, agentExecutable, "agent executable %q is not found in versioned home directory %q", agentExecutable, versionedHome) { fileBytes, err := os.ReadFile(agentExecutable) if assert.NoErrorf(t, err, "error reading elastic-agent executable %q", agentExecutable) { @@ -615,7 +615,7 @@ func checkExtractedFilesOutOfOrder(t *testing.T, versionedHome string) { func checkExtractedFilesWithManifest(t *testing.T, testDataDir string) { versionedHome := filepath.Join(testDataDir, "elastic-agent-1.2.3-SNAPSHOT-abcdef") require.DirExists(t, versionedHome, "mapped versioned home directory does not exists") - mappedAgentExecutable := filepath.Join(versionedHome, agentName) + mappedAgentExecutable := filepath.Join(versionedHome, AgentName) if assert.FileExistsf(t, mappedAgentExecutable, "agent executable %q is not found in mapped versioned home directory %q", mappedAgentExecutable, versionedHome) { fileBytes, err := os.ReadFile(mappedAgentExecutable) if assert.NoErrorf(t, err, "error reading elastic-agent executable %q", mappedAgentExecutable) { diff --git a/internal/pkg/agent/application/upgrade/upgrade.go b/internal/pkg/agent/application/upgrade/upgrade.go index 98a246d748c..d2157eb8301 100644 --- a/internal/pkg/agent/application/upgrade/upgrade.go +++ b/internal/pkg/agent/application/upgrade/upgrade.go @@ -43,8 +43,8 @@ import ( ) const ( - agentName = "elastic-agent" - hashLen = 6 + AgentName = "elastic-agent" + HashLen = 6 agentCommitFile = ".elastic-agent.active.commit" runDirMod = 0770 snapshotSuffix = "-SNAPSHOT" @@ -54,8 +54,8 @@ const ( var agentArtifact = artifact.Artifact{ Name: "Elastic Agent", - Cmd: agentName, - Artifact: "beats/" + agentName, + Cmd: AgentName, + Artifact: "beats/" + AgentName, } var ( @@ -428,10 +428,10 @@ func (u *Upgrader) Upgrade(ctx context.Context, version string, rollback bool, s // create symlink to the /elastic-agent hashedDir := unpackRes.VersionedHome - symlinkPath := filepath.Join(paths.Top(), agentName) + symlinkPath := filepath.Join(paths.Top(), AgentName) // paths.BinaryPath properly derives the binary directory depending on the platform. The path to the binary for macOS is inside of the app bundle. - newPath := paths.BinaryPath(filepath.Join(paths.Top(), hashedDir), agentName) + newPath := paths.BinaryPath(filepath.Join(paths.Top(), hashedDir), AgentName) currentVersionedHome, err := filepath.Rel(paths.Top(), paths.Home()) if err != nil { @@ -593,8 +593,8 @@ func isSameVersion(log *logger.Logger, current agentVersion, newVersion agentVer } func rollbackInstall(ctx context.Context, log *logger.Logger, topDirPath, versionedHome, oldVersionedHome string, rollbackSource availableRollbacksSource) error { - oldAgentPath := paths.BinaryPath(filepath.Join(topDirPath, oldVersionedHome), agentName) - err := changeSymlink(log, topDirPath, filepath.Join(topDirPath, agentName), oldAgentPath) + oldAgentPath := paths.BinaryPath(filepath.Join(topDirPath, oldVersionedHome), AgentName) + err := changeSymlink(log, topDirPath, filepath.Join(topDirPath, AgentName), oldAgentPath) if err != nil && !errors.Is(err, fs.ErrNotExist) { return fmt.Errorf("rolling back install: restoring symlink to %q failed: %w", oldAgentPath, err) } diff --git a/internal/pkg/agent/application/upgrade/upgrade_test.go b/internal/pkg/agent/application/upgrade/upgrade_test.go index cb2dc29326e..dc91be46bc2 100644 --- a/internal/pkg/agent/application/upgrade/upgrade_test.go +++ b/internal/pkg/agent/application/upgrade/upgrade_test.go @@ -136,8 +136,8 @@ func TestShutdownCallback(t *testing.T) { testcases := []testcase{ { name: "legacy run directories", - agentHomeDirectory: fmt.Sprintf("%s-%s", agentName, release.ShortCommit()), - newAgentHomeDirectory: fmt.Sprintf("%s-%s", agentName, "abc123"), + agentHomeDirectory: fmt.Sprintf("%s-%s", AgentName, release.ShortCommit()), + newAgentHomeDirectory: fmt.Sprintf("%s-%s", AgentName, "abc123"), agentVersion: "7.14.0", newAgentVersion: "7.15.0", oldRunFile: filepath.Join("run", "default", "process-7.14.0", "file.test"), diff --git a/internal/pkg/agent/application/upgrade/watcher.go b/internal/pkg/agent/application/upgrade/watcher.go index e4163483b56..dac69262d62 100644 --- a/internal/pkg/agent/application/upgrade/watcher.go +++ b/internal/pkg/agent/application/upgrade/watcher.go @@ -358,10 +358,10 @@ func selectWatcherExecutable(topDir string, previous agentInstall, current agent // check if the upgraded version is less than the previous (currently installed) version if current.parsedVersion.Less(*previous.parsedVersion) { // use the current agent executable for watch, if downgrading the old agent doesn't understand the current agent's path structure. - return paths.BinaryPath(filepath.Join(topDir, previous.versionedHome), agentName) + return paths.BinaryPath(filepath.Join(topDir, previous.versionedHome), AgentName) } else { // use the new agent executable as it should be able to parse the new update marker - return paths.BinaryPath(filepath.Join(topDir, current.versionedHome), agentName) + return paths.BinaryPath(filepath.Join(topDir, current.versionedHome), AgentName) } } diff --git a/internal/pkg/agent/application/upgrade/watcher_test.go b/internal/pkg/agent/application/upgrade/watcher_test.go index 12a7c744c61..0b03957bf28 100644 --- a/internal/pkg/agent/application/upgrade/watcher_test.go +++ b/internal/pkg/agent/application/upgrade/watcher_test.go @@ -706,7 +706,7 @@ func Test_selectWatcherExecutable(t *testing.T) { fakeTopDir := filepath.Join(t.TempDir(), "Elastic", "Agent") for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - assert.Equalf(t, paths.BinaryPath(filepath.Join(fakeTopDir, tt.want), agentName), selectWatcherExecutable(fakeTopDir, tt.args.previous, tt.args.current), "selectWatcherExecutable(%v, %v)", tt.args.previous, tt.args.current) + assert.Equalf(t, paths.BinaryPath(filepath.Join(fakeTopDir, tt.want), AgentName), selectWatcherExecutable(fakeTopDir, tt.args.previous, tt.args.current), "selectWatcherExecutable(%v, %v)", tt.args.previous, tt.args.current) }) } } From c53451f99b8bcbf51d1b9e2051c2e999f1612409 Mon Sep 17 00:00:00 2001 From: Paolo Chila Date: Thu, 2 Oct 2025 18:21:09 +0200 Subject: [PATCH 07/14] Add integration test for manual rollback after grace period --- .../integration/ess/upgrade_rollback_test.go | 120 ++++++++++++++---- 1 file changed, 94 insertions(+), 26 deletions(-) diff --git a/testing/integration/ess/upgrade_rollback_test.go b/testing/integration/ess/upgrade_rollback_test.go index 71e94ff2cc2..9d75123cb69 100644 --- a/testing/integration/ess/upgrade_rollback_test.go +++ b/testing/integration/ess/upgrade_rollback_test.go @@ -44,7 +44,7 @@ agent.upgrade.watcher: const fastWatcherCfgWithRollbackWindow = ` agent.upgrade: watcher: - grace_period: 2m + grace_period: 1m error_check.interval: 5s rollback: window: 10m @@ -339,6 +339,8 @@ func TestFleetManagedUpgradeRollbackOnRestarts(t *testing.T) { } } +type rollbackTriggerFunc func(ctx context.Context, t *testing.T, client client.Client, startFixture, endFixture *atesting.Fixture) + // TestStandaloneUpgradeManualRollback tests the scenario where, after upgrading to a new version // of Agent, a manual rollback is triggered. It checks that the Agent is rolled back to the previous version. func TestStandaloneUpgradeManualRollback(t *testing.T) { @@ -349,12 +351,16 @@ func TestStandaloneUpgradeManualRollback(t *testing.T) { }) type fixturesSetupFunc func(t *testing.T) (from *atesting.Fixture, to *atesting.Fixture) + testcases := []struct { - name string - fixturesSetup fixturesSetupFunc + name string + fixturesSetup fixturesSetupFunc + agentConfig string + rollbackTrigger rollbackTriggerFunc }{ { - name: "upgrade to a repackaged agent built from the same commit", + name: "upgrade to a repackaged agent built from the same commit, rollback during grace period", + agentConfig: fastWatcherCfgWithRollbackWindow, fixturesSetup: func(t *testing.T) (from *atesting.Fixture, to *atesting.Fixture) { // Upgrade from the current build to the same build as Independent Agent Release. @@ -390,13 +396,84 @@ func TestStandaloneUpgradeManualRollback(t *testing.T) { return fromFixture, toFixture }, + rollbackTrigger: func(ctx context.Context, t *testing.T, client client.Client, startFixture, endFixture *atesting.Fixture) { + t.Logf("sending version=%s rollback=%v upgrade to agent", startFixture.Version(), true) + retVal, err := client.Upgrade(ctx, startFixture.Version(), true, "", false, false) + require.NoError(t, err, "error triggering manual rollback to version %s", startFixture.Version()) + t.Logf("received output %s from upgrade command", retVal) + }, }, - } + { + name: "upgrade to a repackaged agent built from the same commit, rollback after grace period", + agentConfig: fastWatcherCfgWithRollbackWindow, + fixturesSetup: func(t *testing.T) (from *atesting.Fixture, to *atesting.Fixture) { + // Upgrade from the current build to the same build as Independent Agent Release. - // set up start ficture with a rollback window of 1h - rollbackWindowConfig := ` -agent.upgrade.rollback.window: 1h -` + // Start from the build under test. + fromFixture, err := define.NewFixtureFromLocalBuild(t, define.Version()) + require.NoError(t, err) + + // Create a new package with a different version (IAR-style) + // modify the version with the "+buildYYYYMMDDHHMMSS" + currentVersion, err := version.ParseVersion(define.Version()) + require.NoErrorf(t, err, "define.Version() %q is not parsable.", define.Version()) + + newVersionBuildMetadata := "build" + time.Now().Format("20060102150405") + parsedNewVersion := version.NewParsedSemVer(currentVersion.Major(), currentVersion.Minor(), currentVersion.Patch(), "", newVersionBuildMetadata) + + err = fromFixture.EnsurePrepared(t.Context()) + require.NoErrorf(t, err, "fixture should be prepared") + + // retrieve the compressed package file location + srcPackage, err := fromFixture.SrcPackage(t.Context()) + require.NoErrorf(t, err, "error retrieving start fixture source package") + + versionForFixture, repackagedArchiveFile, err := repackageArchive(t, srcPackage, newVersionBuildMetadata, currentVersion, parsedNewVersion) + require.NoError(t, err, "error repackaging the archive built from the same commit") + + repackagedLocalFetcher := atesting.LocalFetcher(filepath.Dir(repackagedArchiveFile)) + toFixture, err := atesting.NewFixture( + t, + versionForFixture.String(), + atesting.WithFetcher(repackagedLocalFetcher), + ) + require.NoError(t, err) + + return fromFixture, toFixture + }, + rollbackTrigger: func(ctx context.Context, t *testing.T, client client.Client, startFixture, endFixture *atesting.Fixture) { + // trim -SNAPSHOT at the end of the fixture version as that is reported as a separate flag + expectedVersion := endFixture.Version() + expectedSnapshot := false + if strings.HasSuffix(expectedVersion, "-SNAPSHOT") { + expectedVersion = strings.TrimSuffix(endFixture.Version(), "-SNAPSHOT") + expectedSnapshot = true + } + + // It will take at least 2 minutes before the agent exits the grace period (see fastWatcherCfgWithRollbackWindow) + // let's shoot for up to 4 minutes to exit grace period, checking every 10 seconds + require.EventuallyWithT(t, func(collect *assert.CollectT) { + state, err := client.State(ctx) + require.NoError(collect, err) + t.Logf("checking agent state: %+v", state) + require.NotNil(collect, state) + assert.Nil(collect, state.UpgradeDetails) + assert.Equal(t, cproto.State_HEALTHY, state.State) + assert.Equal(collect, expectedVersion, state.Info.Version) + assert.Equal(collect, expectedSnapshot, state.Info.Snapshot) + if runtime.GOOS != "windows" { + // on windows the update marker is not removed when cleaning up + assert.NoFileExists(collect, filepath.Join(startFixture.WorkDir(), "data", ".update-marker")) + } + }, 4*time.Minute, 10*time.Second) + t.Log("elastic agent is out of grace period.") + t.Logf("sending version=%s rollback=%v upgrade to agent", startFixture.Version(), true) + retVal, err := client.Upgrade(ctx, startFixture.Version(), true, "", false, false) + require.NoError(t, err, "error triggering manual rollback to version %s", startFixture.Version()) + t.Logf("received output %s from upgrade command", retVal) + }, + }, + } for _, tc := range testcases { t.Run(tc.name, func(t *testing.T) { @@ -404,9 +481,11 @@ agent.upgrade.rollback.window: 1h defer cancel() from, to := tc.fixturesSetup(t) - err := from.Configure(ctx, []byte(rollbackWindowConfig)) - require.NoError(t, err, "error setting up rollback window") - standaloneManualRollbackTest(ctx, t, from, to) + err := from.Configure(ctx, []byte(tc.agentConfig)) + require.NoError(t, err, "error configuring starting fixture") + standaloneRollbackTest( + ctx, t, from, to, fastWatcherCfgWithRollbackWindow, fmt.Sprintf(details.ReasonManualRollbackPattern, from.Version()), + tc.rollbackTrigger) }) } @@ -491,23 +570,12 @@ func managedRollbackRestartTest(ctx context.Context, t *testing.T, info *define. func standaloneRollbackRestartTest(ctx context.Context, t *testing.T, startFixture *atesting.Fixture, endFixture *atesting.Fixture) { standaloneRollbackTest(ctx, t, startFixture, endFixture, reallyFastWatcherCfg, details.ReasonWatchFailed, - func(t *testing.T, _ client.Client) { + func(ctx context.Context, t *testing.T, _ client.Client, _ *atesting.Fixture, _ *atesting.Fixture) { restartAgentNTimes(t, 3, 10*time.Second) }) } -func standaloneManualRollbackTest(ctx context.Context, t *testing.T, startFixture *atesting.Fixture, endFixture *atesting.Fixture) { - standaloneRollbackTest(ctx, t, startFixture, endFixture, fastWatcherCfgWithRollbackWindow, fmt.Sprintf(details.ReasonManualRollbackPattern, startFixture.Version()), - func(t *testing.T, client client.Client) { - t.Logf("sending version=%s rollback=%v upgrade to agent", startFixture.Version(), true) - retVal, err := client.Upgrade(ctx, startFixture.Version(), true, "", false, false) - require.NoError(t, err, "error triggering manual rollback to version %s", startFixture.Version()) - t.Logf("received output %s from upgrade command", retVal) - }, - ) -} - -func standaloneRollbackTest(ctx context.Context, t *testing.T, startFixture *atesting.Fixture, endFixture *atesting.Fixture, customConfig string, rollbackReason string, rollbackTrigger func(t *testing.T, client client.Client)) { +func standaloneRollbackTest(ctx context.Context, t *testing.T, startFixture *atesting.Fixture, endFixture *atesting.Fixture, customConfig string, rollbackReason string, rollbackTrigger rollbackTriggerFunc) { startVersionInfo, err := startFixture.ExecVersion(ctx) require.NoError(t, err, "failed to get start agent build version info") @@ -540,7 +608,7 @@ func standaloneRollbackTest(ctx context.Context, t *testing.T, startFixture *ate defer elasticAgentClient.Disconnect() // A few seconds after the upgrade, trigger a rollback using the passed trigger - rollbackTrigger(t, elasticAgentClient) + rollbackTrigger(ctx, t, elasticAgentClient, startFixture, endFixture) // wait for the agent to be healthy and back at the start version err = upgradetest.WaitHealthyAndVersion(ctx, startFixture, startVersionInfo.Binary, 2*time.Minute, 10*time.Second, t) From db96ca9393a89bf7a968bc4b24e0fd5e67ee3303 Mon Sep 17 00:00:00 2001 From: Paolo Chila Date: Fri, 3 Oct 2025 15:28:46 +0200 Subject: [PATCH 08/14] fix linter errors --- internal/pkg/agent/application/application.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/internal/pkg/agent/application/application.go b/internal/pkg/agent/application/application.go index 2d69196084e..931bbd8b5e9 100644 --- a/internal/pkg/agent/application/application.go +++ b/internal/pkg/agent/application/application.go @@ -386,8 +386,6 @@ func normalizeInstallDescriptorAtStartup(log *logger.Logger, topDir string, now log.Warnf("Error removing install descriptor(s): %s", err) } } - - return } func mergeFleetConfig(ctx context.Context, rawConfig *config.Config) (storage.Store, *configuration.Configuration, error) { From 1cd2407e1bdf7868c57c0af5408596111c4fe9a8 Mon Sep 17 00:00:00 2001 From: Paolo Chila Date: Fri, 24 Oct 2025 10:23:40 +0200 Subject: [PATCH 09/14] Set commit hash in TTLMarker when preparing available rollbacks --- internal/pkg/agent/application/upgrade/manual_rollback.go | 1 + 1 file changed, 1 insertion(+) diff --git a/internal/pkg/agent/application/upgrade/manual_rollback.go b/internal/pkg/agent/application/upgrade/manual_rollback.go index 9f715e6e6ac..646392958b9 100644 --- a/internal/pkg/agent/application/upgrade/manual_rollback.go +++ b/internal/pkg/agent/application/upgrade/manual_rollback.go @@ -251,6 +251,7 @@ func getAvailableRollbacks(rollbackWindow time.Duration, now time.Time, currentV res := make(map[string]TTLMarker, 1) res[currentVersionedHome] = TTLMarker{ Version: currentVersion, + Hash: currentHash, ValidUntil: now.Add(rollbackWindow), } From 4ccc86728006b46b8801271481a018c1a0c10ba2 Mon Sep 17 00:00:00 2001 From: Paolo Chila Date: Tue, 28 Oct 2025 11:08:38 +0100 Subject: [PATCH 10/14] Pass versionedHomesToKeep to installModifier.Cleanup() --- internal/pkg/agent/cmd/watch.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/pkg/agent/cmd/watch.go b/internal/pkg/agent/cmd/watch.go index 04f6e15533a..2308a7630d5 100644 --- a/internal/pkg/agent/cmd/watch.go +++ b/internal/pkg/agent/cmd/watch.go @@ -203,7 +203,7 @@ func watchCmd(log *logp.Logger, topDir string, cfg *configuration.UpgradeWatcher versionedHomesToKeep = append(versionedHomesToKeep, paths.VersionedHome(topDir)) versionedHomesToKeep = appendAvailableRollbacks(log, marker, versionedHomesToKeep) log.Infof("About to clean up upgrade. Keeping versioned homes: %v", versionedHomesToKeep) - if err := installModifier.Cleanup(log, paths.Top(), true, false, paths.VersionedHome(topDir)); err != nil { + if err := installModifier.Cleanup(log, paths.Top(), true, false, versionedHomesToKeep...); err != nil { log.Error("clean up of prior watcher run failed", err) } // exit nicely From 25743948ec8253f9f14b24991f1bbbac0fb2a92d Mon Sep 17 00:00:00 2001 From: Paolo Chila Date: Tue, 28 Oct 2025 11:35:27 +0100 Subject: [PATCH 11/14] change check for running TTL marker normalization at startup --- internal/pkg/agent/application/application.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/pkg/agent/application/application.go b/internal/pkg/agent/application/application.go index 931bbd8b5e9..4db9fd1aaea 100644 --- a/internal/pkg/agent/application/application.go +++ b/internal/pkg/agent/application/application.go @@ -139,7 +139,7 @@ func New( isMonitoringSupported := !disableMonitoring && cfg.Settings.V1MonitoringEnabled availableRollbacksSource := upgrade.NewTTLMarkerRegistry(log, paths.Top()) - if platform.OS != component.Container { + if upgrade.IsUpgradeable() { // If we are not running in a container, check and normalize the install descriptor before we start the agent normalizeInstallDescriptorAtStartup(log, paths.Top(), time.Now(), initialUpdateMarker, availableRollbacksSource) } From dcf0ede92e3ba49bd079e551f66a24f91201fc79 Mon Sep 17 00:00:00 2001 From: Paolo Chila Date: Tue, 28 Oct 2025 11:41:17 +0100 Subject: [PATCH 12/14] remove references to install registry --- internal/pkg/agent/application/application.go | 20 +++++++++---------- .../pkg/agent/application/application_test.go | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/internal/pkg/agent/application/application.go b/internal/pkg/agent/application/application.go index 4db9fd1aaea..948db85f88f 100644 --- a/internal/pkg/agent/application/application.go +++ b/internal/pkg/agent/application/application.go @@ -141,7 +141,7 @@ func New( availableRollbacksSource := upgrade.NewTTLMarkerRegistry(log, paths.Top()) if upgrade.IsUpgradeable() { // If we are not running in a container, check and normalize the install descriptor before we start the agent - normalizeInstallDescriptorAtStartup(log, paths.Top(), time.Now(), initialUpdateMarker, availableRollbacksSource) + normalizeAgentInstalls(log, paths.Top(), time.Now(), initialUpdateMarker, availableRollbacksSource) } upgrader, err := upgrade.NewUpgrader(log, cfg.Settings.DownloadConfig, cfg.Settings.Upgrade, agentInfo, new(upgrade.AgentWatcherHelper), availableRollbacksSource) if err != nil { @@ -308,15 +308,15 @@ func New( return coord, configMgr, varsManager, nil } -// normalizeInstallDescriptorAtStartup will check the install descriptor checking: -// - if we just rolled back: the update marker is checked and in case of rollback we clean up the entry about the failed upgraded install +// normalizeAgentInstalls will attempt to normalize the agent installs and related TTL markers: +// - if we just rolled back: the update marker is checked and in case of rollback we clean up the TTL marker of the rolled back version // - check all the entries: -// - verify that the home directory for that install still exists (remove what does not exist anymore) -// - TODO check TTLs of installs to schedule delayed cleanup while the agent is running +// - verify that the home directory for that install still exists (remove TTL markers for what does not exist anymore) +// - check if the agent install: if it is no longer valid collect the versioned home and the TTL marker for deletion // // This function will NOT error out, it will log any errors it encounters as warnings but any error must be treated as non-fatal -func normalizeInstallDescriptorAtStartup(log *logger.Logger, topDir string, now time.Time, initialUpdateMarker *upgrade.UpdateMarker, rollbackSource rollbacksSource) { - // Check if we rolled back and update the install descriptor +func normalizeAgentInstalls(log *logger.Logger, topDir string, now time.Time, initialUpdateMarker *upgrade.UpdateMarker, rollbackSource rollbacksSource) { + // Check if we rolled back and update the TTL markers if initialUpdateMarker != nil && initialUpdateMarker.Details != nil && initialUpdateMarker.Details.State == details.StateRollback { // Reset the TTL for the current version if we are coming off a rollback rollbacks, err := rollbackSource.Get() @@ -329,7 +329,7 @@ func normalizeInstallDescriptorAtStartup(log *logger.Logger, topDir string, now delete(rollbacks, initialUpdateMarker.PrevVersionedHome) err = rollbackSource.Set(rollbacks) if err != nil { - log.Warnf("Error removing install descriptor from installDescriptorSource during startup check: %s", err) + log.Warnf("Error setting available rollbacks during normalization: %s", err) return } } @@ -354,7 +354,7 @@ func normalizeInstallDescriptorAtStartup(log *logger.Logger, topDir string, now _, err = os.Stat(versionedHomeAbsPath) if errors.Is(err, os.ErrNotExist) { - log.Warnf("Versioned home %s corresponding to agent install descriptor %+v is not found on disk", versionedHomeAbsPath, ttlMarker) + log.Warnf("Versioned home %s corresponding to agent TTL marker %+v is not found on disk", versionedHomeAbsPath, ttlMarker) versionedHomesToCleanup = append(versionedHomesToCleanup, versionedHome) continue } @@ -366,7 +366,7 @@ func normalizeInstallDescriptorAtStartup(log *logger.Logger, topDir string, now if now.After(ttlMarker.ValidUntil) { // the install directory exists but it's expired. Remove the files. - log.Infof("agent install descriptor %+v is expired, removing directory %q", ttlMarker, versionedHomeAbsPath) + log.Infof("agent TTL marker %+v marks %q as expired, removing directory", ttlMarker, versionedHomeAbsPath) if cleanupErr := install.RemoveBut(versionedHomeAbsPath, true); cleanupErr != nil { log.Warnf("Error removing directory %q: %s", versionedHomeAbsPath, cleanupErr) } else { diff --git a/internal/pkg/agent/application/application_test.go b/internal/pkg/agent/application/application_test.go index 3c3c2a63810..0c7fb737f7f 100644 --- a/internal/pkg/agent/application/application_test.go +++ b/internal/pkg/agent/application/application_test.go @@ -438,7 +438,7 @@ func Test_normalizeInstallDescriptorAtStartup(t *testing.T) { logger, _ := loggertest.New(t.Name()) tmpDir := t.TempDir() updateMarker, installSource := tt.setup(t, tmpDir) - normalizeInstallDescriptorAtStartup(logger, tmpDir, now, updateMarker, installSource) + normalizeAgentInstalls(logger, tmpDir, now, updateMarker, installSource) if tt.postNormalizeAssertions != nil { tt.postNormalizeAssertions(t, tmpDir, updateMarker) } From 52ab10395383f9a813aa0780a49060d52d08a46b Mon Sep 17 00:00:00 2001 From: Paolo Chila Date: Thu, 30 Oct 2025 11:26:59 +0100 Subject: [PATCH 13/14] implement code review feedback --- internal/pkg/agent/application/upgrade/manual_rollback.go | 5 ++++- internal/pkg/agent/application/upgrade/rollback.go | 4 ---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/internal/pkg/agent/application/upgrade/manual_rollback.go b/internal/pkg/agent/application/upgrade/manual_rollback.go index 646392958b9..664ba586da1 100644 --- a/internal/pkg/agent/application/upgrade/manual_rollback.go +++ b/internal/pkg/agent/application/upgrade/manual_rollback.go @@ -42,9 +42,12 @@ func (u *Upgrader) rollbackToPreviousVersion(ctx context.Context, topDir string, var updateMarkerExistsBeforeRollback bool if errors.Is(err, os.ErrNotExist) { - // there is no upgrade marker, we need to extract available rollbacks from agent installs + // there is no upgrade marker (the rollback was requested after the watcher grace period had elapsed), we need + // to extract available rollbacks from agent installs watcherExecutable, versionedHomeToRollbackTo, err = rollbackUsingAgentInstalls(u.log, u.watcherHelper, u.availableRollbacksSource, topDir, now, version, u.markUpgrade) } else { + // If upgrade marker is available, we need to gracefully stop any watcher process, read the available rollbacks from + // the upgrade marker and then proceed with rollback updateMarkerExistsBeforeRollback = true watcherExecutable, versionedHomeToRollbackTo, err = rollbackUsingUpgradeMarker(ctx, u.log, u.watcherHelper, topDir, now, version) } diff --git a/internal/pkg/agent/application/upgrade/rollback.go b/internal/pkg/agent/application/upgrade/rollback.go index 6aa8bfda5e9..419b681862c 100644 --- a/internal/pkg/agent/application/upgrade/rollback.go +++ b/internal/pkg/agent/application/upgrade/rollback.go @@ -127,10 +127,6 @@ func RollbackWithOpts(ctx context.Context, log *logger.Logger, c client.Client, return nil } - if prevVersionedHome == "" { - prevVersionedHome = filepath.Join("data", fmt.Sprintf("%s-%s", AgentName, prevHash)) - } - // cleanup everything except version we're rolling back into return Cleanup(log, topDirPath, settings.RemoveMarker, true, prevVersionedHome) } From f1568f6a3f26e3e8270f06869f9e2fdbb53053ee Mon Sep 17 00:00:00 2001 From: Paolo Chila Date: Thu, 30 Oct 2025 13:56:57 +0100 Subject: [PATCH 14/14] fixup! implement code review feedback --- internal/pkg/agent/application/upgrade/rollback.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/internal/pkg/agent/application/upgrade/rollback.go b/internal/pkg/agent/application/upgrade/rollback.go index 419b681862c..96f97fed1cc 100644 --- a/internal/pkg/agent/application/upgrade/rollback.go +++ b/internal/pkg/agent/application/upgrade/rollback.go @@ -80,15 +80,15 @@ func RollbackWithOpts(ctx context.Context, log *logger.Logger, c client.Client, symlinkPath := filepath.Join(topDirPath, AgentName) - var symlinkTarget string - if prevVersionedHome != "" { - symlinkTarget = paths.BinaryPath(filepath.Join(topDirPath, prevVersionedHome), AgentName) - } else { + if prevVersionedHome == "" { // fallback for upgrades that didn't use the manifest and path remapping hashedDir := fmt.Sprintf("%s-%s", AgentName, prevHash) - // paths.BinaryPath properly derives the binary directory depending on the platform. The path to the binary for macOS is inside of the app bundle. - symlinkTarget = paths.BinaryPath(filepath.Join(paths.DataFrom(topDirPath), hashedDir), AgentName) + prevVersionedHome = filepath.Join("data", hashedDir) } + + // paths.BinaryPath properly derives the binary directory depending on the platform. The path to the binary for macOS is inside of the app bundle. + symlinkTarget := paths.BinaryPath(filepath.Join(topDirPath, prevVersionedHome), AgentName) + // change symlink if err := changeSymlink(log, topDirPath, symlinkPath, symlinkTarget); err != nil { return err