Skip to content

Commit a2ee90d

Browse files
pchilahayotbisonai
authored and committed
Manual rollback after grace period (elastic#9643)
* Allow for multiple directories to be specified during cleanup
* refactor manual rollback function and tests on a separate file
* Split manual rollback between watching and non-watching cases
* Implement manual rollback from list of agent installs
* fix lint errors
* Normalize install descriptors at startup
* Add integration test for manual rollback after grace period
* fix linter errors
* Set commit hash in TTLMarker when preparing available rollbacks
* Pass versionedHomesToKeep to installModifier.Cleanup()
* change check for running TTL marker normalization at startup
* remove references to install registry
* implement code review feedback
* fixup! implement code review feedback
1 parent 2ac9d96 commit a2ee90d

22 files changed

+1427
-534
lines changed

.mockery.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,9 @@ filename: mocks.go
55
template-data:
66
unroll-variadic: true
77
packages:
8+
github.com/elastic/elastic-agent/internal/pkg/agent/application:
9+
interfaces:
10+
rollbacksSource: {}
811
github.com/elastic/elastic-agent/internal/pkg/agent/application/actions/handlers:
912
interfaces:
1013
Uploader: {}

internal/pkg/agent/application/application.go

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,14 @@ package application
77
import (
88
"context"
99
"fmt"
10+
"os"
11+
"path/filepath"
1012
"time"
1113

1214
"go.elastic.co/apm/v2"
1315

1416
componentmonitoring "github.com/elastic/elastic-agent/internal/pkg/agent/application/monitoring/component"
17+
"github.com/elastic/elastic-agent/internal/pkg/agent/install"
1518

1619
"github.com/elastic/go-ucfg"
1720

@@ -47,6 +50,11 @@ import (
4750
"github.com/elastic/elastic-agent/version"
4851
)
4952

53+
type rollbacksSource interface {
54+
Set(map[string]upgrade.TTLMarker) error
55+
Get() (map[string]upgrade.TTLMarker, error)
56+
}
57+
5058
// CfgOverrider allows for application driven overrides of configuration read from disk.
5159
type CfgOverrider func(cfg *configuration.Configuration)
5260

@@ -127,6 +135,10 @@ func New(
127135
isMonitoringSupported := !disableMonitoring && cfg.Settings.V1MonitoringEnabled
128136

129137
availableRollbacksSource := upgrade.NewTTLMarkerRegistry(log, paths.Top())
138+
if upgrade.IsUpgradeable() {
139+
// If we are not running in a container, check and normalize the install descriptor before we start the agent
140+
normalizeAgentInstalls(log, paths.Top(), time.Now(), initialUpdateMarker, availableRollbacksSource)
141+
}
130142
upgrader, err := upgrade.NewUpgrader(log, cfg.Settings.DownloadConfig, cfg.Settings.Upgrade, agentInfo, new(upgrade.AgentWatcherHelper), availableRollbacksSource)
131143
if err != nil {
132144
return nil, nil, nil, fmt.Errorf("failed to create upgrader: %w", err)
@@ -291,6 +303,86 @@ func New(
291303
return coord, configMgr, varsManager, nil
292304
}
293305

306+
// normalizeAgentInstalls will attempt to normalize the agent installs and related TTL markers:
307+
// - if we just rolled back: the update marker is checked and in case of rollback we clean up the TTL marker of the rolled back version
308+
// - check all the entries:
309+
// - verify that the home directory for that install still exists (remove TTL markers for what does not exist anymore)
310+
// - check if the agent install: if it is no longer valid collect the versioned home and the TTL marker for deletion
311+
//
312+
// This function will NOT error out, it will log any errors it encounters as warnings but any error must be treated as non-fatal
313+
func normalizeAgentInstalls(log *logger.Logger, topDir string, now time.Time, initialUpdateMarker *upgrade.UpdateMarker, rollbackSource rollbacksSource) {
314+
// Check if we rolled back and update the TTL markers
315+
if initialUpdateMarker != nil && initialUpdateMarker.Details != nil && initialUpdateMarker.Details.State == details.StateRollback {
316+
// Reset the TTL for the current version if we are coming off a rollback
317+
rollbacks, err := rollbackSource.Get()
318+
if err != nil {
319+
log.Warnf("Error getting available rollbacks from rollbackSource during startup check: %s", err)
320+
return
321+
}
322+
323+
// remove the current versioned home TTL marker
324+
delete(rollbacks, initialUpdateMarker.PrevVersionedHome)
325+
err = rollbackSource.Set(rollbacks)
326+
if err != nil {
327+
log.Warnf("Error setting available rollbacks during normalization: %s", err)
328+
return
329+
}
330+
}
331+
332+
// check if we need to cleanup old agent installs
333+
rollbacks, err := rollbackSource.Get()
334+
if err != nil {
335+
log.Warnf("Error getting available rollbacks during startup check: %s", err)
336+
return
337+
}
338+
339+
var versionedHomesToCleanup []string
340+
for versionedHome, ttlMarker := range rollbacks {
341+
342+
versionedHomeAbsPath := filepath.Join(topDir, versionedHome)
343+
344+
if versionedHomeAbsPath == paths.HomeFrom(topDir) {
345+
// skip the current install
346+
log.Warnf("Found a TTL marker for the currently running agent at %s. Skipping cleanup...", versionedHome)
347+
continue
348+
}
349+
350+
_, err = os.Stat(versionedHomeAbsPath)
351+
if errors.Is(err, os.ErrNotExist) {
352+
log.Warnf("Versioned home %s corresponding to agent TTL marker %+v is not found on disk", versionedHomeAbsPath, ttlMarker)
353+
versionedHomesToCleanup = append(versionedHomesToCleanup, versionedHome)
354+
continue
355+
}
356+
357+
if err != nil {
358+
log.Warnf("error checking versioned home %s for agent install: %s", versionedHomeAbsPath, err.Error())
359+
continue
360+
}
361+
362+
if now.After(ttlMarker.ValidUntil) {
363+
// the install directory exists but it's expired. Remove the files.
364+
log.Infof("agent TTL marker %+v marks %q as expired, removing directory", ttlMarker, versionedHomeAbsPath)
365+
if cleanupErr := install.RemoveBut(versionedHomeAbsPath, true); cleanupErr != nil {
366+
log.Warnf("Error removing directory %q: %s", versionedHomeAbsPath, cleanupErr)
367+
} else {
368+
log.Infof("Directory %q was removed", versionedHomeAbsPath)
369+
versionedHomesToCleanup = append(versionedHomesToCleanup, versionedHome)
370+
}
371+
}
372+
}
373+
374+
if len(versionedHomesToCleanup) > 0 {
375+
log.Infof("removing install descriptor(s) for %v", versionedHomesToCleanup)
376+
for _, versionedHomeToCleanup := range versionedHomesToCleanup {
377+
delete(rollbacks, versionedHomeToCleanup)
378+
}
379+
err = rollbackSource.Set(rollbacks)
380+
if err != nil {
381+
log.Warnf("Error removing install descriptor(s): %s", err)
382+
}
383+
}
384+
}
385+
294386
func mergeFleetConfig(ctx context.Context, rawConfig *config.Config) (storage.Store, *configuration.Configuration, error) {
295387
path := paths.AgentConfigFile()
296388
store, err := storage.NewEncryptedDiskStore(ctx, path)

internal/pkg/agent/application/application_test.go

Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@ package application
77
import (
88
"context"
99
"fmt"
10+
"os"
11+
"path/filepath"
12+
"runtime"
1013
"testing"
1114
"time"
1215

@@ -15,6 +18,9 @@ import (
1518

1619
"github.com/elastic/elastic-agent-libs/logp"
1720
"github.com/elastic/elastic-agent/internal/pkg/agent/application/info"
21+
"github.com/elastic/elastic-agent/internal/pkg/agent/application/paths"
22+
"github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade"
23+
"github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade/details"
1824
"github.com/elastic/elastic-agent/internal/pkg/config"
1925
"github.com/elastic/elastic-agent/internal/pkg/testutils"
2026
"github.com/elastic/elastic-agent/pkg/core/logger/loggertest"
@@ -302,3 +308,188 @@ func TestInjectOutputOverrides(t *testing.T) {
302308
})
303309
}
304310
}
311+
312+
func Test_normalizeInstallDescriptorAtStartup(t *testing.T) {
313+
314+
now := time.Now()
315+
tomorrow := now.Add(24 * time.Hour)
316+
yesterday := now.Add(-24 * time.Hour)
317+
318+
tests := []struct {
319+
name string
320+
setup func(t *testing.T, topDir string) (*upgrade.UpdateMarker, rollbacksSource)
321+
postNormalizeAssertions func(t *testing.T, topDir string, initialUpdateMarker *upgrade.UpdateMarker)
322+
}{
323+
{
324+
name: "happy path: single install, no rollbacks, no modifications needed",
325+
setup: func(t *testing.T, topDir string) (*upgrade.UpdateMarker, rollbacksSource) {
326+
mockRollbackSource := newMockRollbacksSource(t)
327+
mockRollbackSource.EXPECT().Get().Return(nil, nil)
328+
return nil, mockRollbackSource
329+
},
330+
331+
postNormalizeAssertions: nil,
332+
},
333+
{
334+
name: "Agent was manually rolled back: rolled back install is removed from the list",
335+
setup: func(t *testing.T, topDir string) (*upgrade.UpdateMarker, rollbacksSource) {
336+
newAgentInstallPath := createFakeAgentInstall(t, topDir, "4.5.6", "newversionhash", true)
337+
oldAgentInstallPath := createFakeAgentInstall(t, topDir, "1.2.3", "oldversionhash", true)
338+
339+
mockRollbackSource := newMockRollbacksSource(t)
340+
mockRollbackSource.EXPECT().Get().Return(map[string]upgrade.TTLMarker{
341+
oldAgentInstallPath: {
342+
Version: "1.2.3",
343+
Hash: "oldversionhash",
344+
ValidUntil: tomorrow,
345+
},
346+
}, nil)
347+
348+
updateMarker := &upgrade.UpdateMarker{
349+
Version: "4.5.6",
350+
Hash: "newversionhash",
351+
VersionedHome: newAgentInstallPath,
352+
UpdatedOn: now,
353+
PrevVersion: "1.2.3",
354+
PrevHash: "oldversionhash",
355+
PrevVersionedHome: oldAgentInstallPath,
356+
Acked: false,
357+
Action: nil,
358+
Details: &details.Details{
359+
TargetVersion: "4.5.6",
360+
State: details.StateRollback,
361+
ActionID: "",
362+
Metadata: details.Metadata{
363+
Reason: details.ReasonManualRollbackPattern,
364+
},
365+
},
366+
}
367+
368+
// expect code to clear the rollback
369+
mockRollbackSource.EXPECT().Set(map[string]upgrade.TTLMarker{}).Return(nil)
370+
return updateMarker, mockRollbackSource
371+
},
372+
postNormalizeAssertions: nil,
373+
},
374+
{
375+
name: "Entries not having a matching install directory will be removed from the list",
376+
setup: func(t *testing.T, topDir string) (*upgrade.UpdateMarker, rollbacksSource) {
377+
_ = createFakeAgentInstall(t, topDir, "4.5.6", "newversionhash", true)
378+
oldAgentInstallPath := createFakeAgentInstall(t, topDir, "1.2.3", "oldversionhash", true)
379+
380+
mockRollbackSource := newMockRollbacksSource(t)
381+
nonExistingVersionedHome := filepath.Join("data", "thisdirectorydoesnotexist")
382+
mockRollbackSource.EXPECT().Get().Return(map[string]upgrade.TTLMarker{
383+
oldAgentInstallPath: {
384+
Version: "1.2.3",
385+
Hash: "oldversionhash",
386+
ValidUntil: tomorrow,
387+
},
388+
nonExistingVersionedHome: {
389+
Version: "0.0.0",
390+
Hash: "nonExistingHash",
391+
ValidUntil: tomorrow,
392+
},
393+
}, nil)
394+
395+
mockRollbackSource.EXPECT().Set(map[string]upgrade.TTLMarker{
396+
oldAgentInstallPath: {
397+
Version: "1.2.3",
398+
Hash: "oldversionhash",
399+
ValidUntil: tomorrow,
400+
},
401+
}).Return(nil)
402+
return nil, mockRollbackSource
403+
},
404+
postNormalizeAssertions: nil,
405+
},
406+
{
407+
name: "Expired installs still existing on disk will be removed from the install list and removed from disk",
408+
setup: func(t *testing.T, topDir string) (*upgrade.UpdateMarker, rollbacksSource) {
409+
_ = createFakeAgentInstall(t, topDir, "4.5.6", "newversionhash", true)
410+
oldAgentInstallPath := createFakeAgentInstall(t, topDir, "1.2.3", "oldversionhash", true)
411+
412+
// assert that the versionedHome of the old install is the same we check in postNormalizeAssertions
413+
assert.Equal(t, oldAgentInstallPath, filepath.Join("data", "elastic-agent-1.2.3-oldver"),
414+
"Unexpected old install versioned home. Post normalize assertions may not be working")
415+
416+
mockRollbackSource := newMockRollbacksSource(t)
417+
mockRollbackSource.EXPECT().Get().Return(
418+
map[string]upgrade.TTLMarker{
419+
oldAgentInstallPath: {
420+
Version: "1.2.3",
421+
Hash: "oldver",
422+
ValidUntil: yesterday,
423+
},
424+
},
425+
nil,
426+
)
427+
// expect removal of the existing ttlmarker
428+
mockRollbackSource.EXPECT().Set(map[string]upgrade.TTLMarker{}).Return(nil)
429+
return nil, mockRollbackSource
430+
},
431+
postNormalizeAssertions: func(t *testing.T, topDir string, _ *upgrade.UpdateMarker) {
432+
assert.NoDirExists(t, filepath.Join(topDir, "data", "elastic-agent-1.2.3-oldver"))
433+
},
434+
},
435+
}
436+
for _, tt := range tests {
437+
t.Run(tt.name, func(t *testing.T) {
438+
logger, _ := loggertest.New(t.Name())
439+
tmpDir := t.TempDir()
440+
updateMarker, installSource := tt.setup(t, tmpDir)
441+
normalizeAgentInstalls(logger, tmpDir, now, updateMarker, installSource)
442+
if tt.postNormalizeAssertions != nil {
443+
tt.postNormalizeAssertions(t, tmpDir, updateMarker)
444+
}
445+
})
446+
}
447+
}
448+
449+
// createFakeAgentInstall (copied from the upgrade package tests) will create a mock agent install within topDir, possibly
450+
// using the version in the directory name, depending on useVersionInPath it MUST return the path to the created versionedHome
451+
// relative to topDir, to mirror what step_unpack returns
452+
func createFakeAgentInstall(t *testing.T, topDir, version, hash string, useVersionInPath bool) string {
453+
454+
// create versioned home
455+
versionedHome := fmt.Sprintf("elastic-agent-%s", hash[:upgrade.HashLen])
456+
if useVersionInPath {
457+
// use the version passed as parameter
458+
versionedHome = fmt.Sprintf("elastic-agent-%s-%s", version, hash[:upgrade.HashLen])
459+
}
460+
relVersionedHomePath := filepath.Join("data", versionedHome)
461+
absVersionedHomePath := filepath.Join(topDir, relVersionedHomePath)
462+
463+
// recalculate the binary path and launch a mkDirAll to account for MacOS weirdness
464+
// (the extra nesting of elastic agent binary within versionedHome)
465+
absVersioneHomeBinaryPath := paths.BinaryPath(absVersionedHomePath, "")
466+
err := os.MkdirAll(absVersioneHomeBinaryPath, 0o750)
467+
require.NoError(t, err, "error creating fake install versioned home directory (including binary path) %q", absVersioneHomeBinaryPath)
468+
469+
// place a few directories in the fake install
470+
absComponentsDirPath := filepath.Join(absVersionedHomePath, "components")
471+
err = os.MkdirAll(absComponentsDirPath, 0o750)
472+
require.NoError(t, err, "error creating fake install components directory %q", absVersionedHomePath)
473+
474+
absLogsDirPath := filepath.Join(absVersionedHomePath, "logs")
475+
err = os.MkdirAll(absLogsDirPath, 0o750)
476+
require.NoError(t, err, "error creating fake install logs directory %q", absLogsDirPath)
477+
478+
absRunDirPath := filepath.Join(absVersionedHomePath, "run")
479+
err = os.MkdirAll(absRunDirPath, 0o750)
480+
require.NoError(t, err, "error creating fake install run directory %q", absRunDirPath)
481+
482+
// put some placeholder for files
483+
agentExecutableName := upgrade.AgentName
484+
if runtime.GOOS == "windows" {
485+
agentExecutableName += ".exe"
486+
}
487+
err = os.WriteFile(paths.BinaryPath(absVersionedHomePath, agentExecutableName), []byte(fmt.Sprintf("Placeholder for agent %s", version)), 0o750)
488+
require.NoErrorf(t, err, "error writing elastic agent binary placeholder %q", agentExecutableName)
489+
fakeLogPath := filepath.Join(absLogsDirPath, "fakelog.ndjson")
490+
err = os.WriteFile(fakeLogPath, []byte(fmt.Sprintf("Sample logs for agent %s", version)), 0o750)
491+
require.NoErrorf(t, err, "error writing fake log placeholder %q", fakeLogPath)
492+
493+
// return the path relative to top exactly like the step_unpack does
494+
return relVersionedHomePath
495+
}

0 commit comments

Comments
 (0)