Commit ebf3189

Retry starting/installing service upon failure (#9313)
* Retry starting/installing service upon failure
* Add service restart delay
* Adding CHANGELOG entry
* [WIP] Debugging
* Add mutex to prevent data race
* [WIP] Debugging
* Remove max service restart attempts
* Attempt to (re)start service when timer fires as well
* Move serviceRestartDelay to struct
1 parent 7964dec commit ebf3189

5 files changed: +144 -6 lines
Lines changed: 32 additions & 0 deletions

@@ -0,0 +1,32 @@
+# Kind can be one of:
+# - breaking-change: a change to previously-documented behavior
+# - deprecation: functionality that is being removed in a later release
+# - bug-fix: fixes a problem in a previous version
+# - enhancement: extends functionality but does not break or fix existing behavior
+# - feature: new functionality
+# - known-issue: problems that we are aware of in a given version
+# - security: impacts on the security of a product or a user’s deployment.
+# - upgrade: important information for someone upgrading from a prior version
+# - other: does not fit into any of the other categories
+kind: bug-fix
+
+# Change summary; a 80ish characters long description of the change.
+summary: Retry service start command upon failure with 30-second delay.
+
+# Long description; in case the summary is not enough to describe the change
+# this field accommodate a description without length limits.
+# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment.
+#description:
+
+# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc.
+component: elastic-agent
+
+# PR URL; optional; the PR number that added the changeset.
+# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added.
+# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
+# Please provide it if you are adding a fragment for a different PR.
+pr: https://github.com/elastic/elastic-agent/pull/9313
+
+# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of).
+# If not present is automatically filled by the tooling with the issue linked to the PR number.
+#issue: https://github.com/owner/repo/1234

pkg/component/runtime/conn_info_server.go

Lines changed: 1 addition & 0 deletions

@@ -43,6 +43,7 @@ func newConnInfoServer(log *logger.Logger, comm Communicator, address string) (*
     if err != nil {
         return nil, fmt.Errorf("failed to start connection credentials listener: %w", err)
     }
+    log.Debugf("started connection info server listener on %s", address)

     s := &connInfoServer{log: log, listener: listener, stopTimeout: defaultStopTimeout}

pkg/component/runtime/conn_info_server_test.go

Lines changed: 1 addition & 1 deletion

@@ -73,7 +73,7 @@ func (c *mockCommunicator) CheckinObserved() <-chan *proto.CheckinObserved {
 const testPort = 6788

 // Test Elastic Agent Connection Info sock
-const testSock = ".teaci.sock"
+const testSock = ".cteaci.sock"

 func getAddress(dir string, isLocal bool) string {
     if isLocal {

pkg/component/runtime/service.go

Lines changed: 33 additions & 4 deletions

@@ -58,6 +58,16 @@ type serviceRuntime struct {
     executeServiceCommandImpl executeServiceCommandFunc

     isLocal bool // true if rpc is domain socket, or named pipe
+
+    // The most recent mode received on actionCh. The mode will be either
+    // actionStart (indicating the process should be running, and should be
+    // created if it is not), or actionStop or actionTeardown (indicating that
+    // it should terminate).
+    actionState actionModeSigned
+
+    // serviceRestartDelay is the time duration the service runtime will wait before
+    // attempting to restart a service that failed to start.
+    serviceRestartDelay time.Duration
 }

 // newServiceRuntime creates a new command runtime for the provided component.

@@ -81,6 +91,7 @@ func newServiceRuntime(comp component.Component, logger *logger.Logger, isLocal
         state: state,
         executeServiceCommandImpl: executeServiceCommand,
         isLocal: isLocal,
+        serviceRestartDelay: 30 * time.Second,
     }

     // Set initial state as STOPPED

@@ -208,6 +219,7 @@ func (s *serviceRuntime) Run(ctx context.Context, comm Communicator) (err error)
         return ctx.Err()
     case as := <-s.actionCh:
         s.log.Debugf("got action %v for %s service", as.actionMode, s.name())
+        s.actionState = as
         switch as.actionMode {
         case actionStart:
             // Initial state on start

@@ -253,8 +265,14 @@ func (s *serviceRuntime) Run(ctx context.Context, comm Communicator) (err error)
                 }
             }

-            cisStop()
-            break
+            // Error is due to a fatal exit code. Schedule a restart of the service after a delay by resending
+            // the start action on the action channel
+            delay := s.serviceRestartDelay
+            s.log.Errorf("failed to start %s service, err: %v, restarting after waiting for %v", s.name(), err, delay)
+            time.AfterFunc(delay, func() {
+                s.actionCh <- as
+            })
+            continue
         }

         // Start check-in timer

@@ -290,8 +308,19 @@ func (s *serviceRuntime) Run(ctx context.Context, comm Communicator) (err error)
             s.processCheckin(checkin, comm, &lastCheckin)
         }
     case <-checkinTimer.C:
-        s.checkStatus(s.checkinPeriod(), &lastCheckin, &missedCheckins)
-        checkinTimer.Reset(s.checkinPeriod())
+        if s.actionState.actionMode == actionStart {
+            if !s.isRunning() {
+                // not running, but should be running
+                if err := s.start(ctx); err != nil {
+                    s.forceCompState(client.UnitStateFailed, fmt.Sprintf("service %s failed to start: %v", s.name(), err))
+                    continue
+                }
+            } else {
+                // running and should be running
+                s.checkStatus(s.checkinPeriod(), &lastCheckin, &missedCheckins)
+                checkinTimer.Reset(s.checkinPeriod())
+            }
+        }
    case <-teardownCheckinTimer.C:
        s.log.Debugf("got tearing down timeout for %s service", s.name())
        // Teardown timed out
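
The two hunks above are the core of the change: a failed start is retried by re-sending the same action on the runtime's action channel after serviceRestartDelay, and the check-in timer now also restarts a service that should be running but is not. Below is a minimal, self-contained sketch of that pattern, not the elastic-agent code itself; the runner, action, and startFn names are hypothetical and the real runtime tracks far more state.

package main

import (
    "context"
    "errors"
    "fmt"
    "time"
)

type action int

const (
    actionStart action = iota
    actionStop
)

// runner is a stripped-down stand-in for serviceRuntime: one goroutine owns
// all of its state, so no mutex is needed in this sketch.
type runner struct {
    actionCh     chan action
    restartDelay time.Duration
    lastAction   action       // plays the role of actionState above
    running      bool
    startFn      func() error // injected so failures can be simulated
}

func (r *runner) run(ctx context.Context) error {
    check := time.NewTimer(time.Second)
    defer check.Stop()

    for {
        select {
        case <-ctx.Done():
            return ctx.Err()

        case a := <-r.actionCh:
            r.lastAction = a
            if a != actionStart {
                r.running = false
                continue
            }
            if err := r.startFn(); err != nil {
                // Start failed: schedule a retry by re-sending the same action
                // after restartDelay, then keep serving the loop.
                fmt.Printf("start failed: %v, retrying in %v\n", err, r.restartDelay)
                time.AfterFunc(r.restartDelay, func() { r.actionCh <- a })
                continue
            }
            r.running = true

        case <-check.C:
            // Timer fired: if the service should be running but is not, try again.
            if r.lastAction == actionStart && !r.running && r.startFn() == nil {
                r.running = true
            }
            check.Reset(time.Second)
        }
    }
}

func main() {
    attempts := 0
    r := &runner{
        actionCh:     make(chan action, 1),
        restartDelay: 500 * time.Millisecond,
        startFn: func() error {
            attempts++
            if attempts < 3 {
                return errors.New("simulated install failure")
            }
            return nil
        },
    }

    ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
    defer cancel()

    r.actionCh <- actionStart
    _ = r.run(ctx) // returns once the context expires
    fmt.Println("start attempts:", attempts)
}

In the sketch all state is touched only from the run goroutine; the AfterFunc callback runs on its own goroutine but merely sends on the channel, which is why the "mutex to prevent data race" concern from the commit message does not arise in this simplified form.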

pkg/component/runtime/service_test.go

Lines changed: 77 additions & 1 deletion

@@ -280,7 +280,7 @@ func TestCISKeepsRunningOnNonFatalExitCodeFromStart(t *testing.T) {
     log, logObs := loggertest.New("test")
     const nonFatalExitCode = 99
     const cisPort = 9999
-    const cisSocket = ".teaci.sock"
+    const cisSocket = ".steaci.sock"

     // Make an Endpoint component for testing
     endpoint := makeEndpointComponent(t, map[string]interface{}{})

@@ -371,6 +371,82 @@ func TestCISKeepsRunningOnNonFatalExitCodeFromStart(t *testing.T) {
     }, 2*time.Second, 200*time.Millisecond)
 }

+// TestServiceStartRetry tests that the service runtime will
+// retry the service start command if it fails
+func TestServiceStartRetry(t *testing.T) {
+    log, logObs := loggertest.New("test")
+    const cisPort = 9999
+    const cisSocket = ".rteaci.sock"
+
+    // Make an Endpoint component for testing
+    endpoint := makeEndpointComponent(t, map[string]interface{}{})
+    endpoint.InputSpec.Spec.Service = &component.ServiceSpec{
+        CPort: cisPort,
+        CSocket: cisSocket,
+        Log: nil,
+        Operations: component.ServiceOperationsSpec{
+            Check: &component.ServiceOperationsCommandSpec{},
+            Install: &component.ServiceOperationsCommandSpec{},
+        },
+        Timeouts: component.ServiceTimeoutSpec{},
+    }
+
+    // Create binary mocking Endpoint such that executing it will return
+    // the non-fatal exit code from the spec above.
+    endpoint.InputSpec.BinaryPath = mockEndpointBinary(t, 99)
+    endpoint.InputSpec.BinaryName = "endpoint"
+
+    t.Logf("mock binary path: %s\n", endpoint.InputSpec.BinaryPath)
+
+    // Create new service runtime with component
+    service, err := newServiceRuntime(endpoint, log, true)
+    require.NoError(t, err)
+
+    // Shorten service restart delay for testing
+    service.serviceRestartDelay = 2 * time.Second
+
+    ctx, cancel := context.WithCancel(context.Background())
+    defer cancel()
+    comm := newMockCommunicator("")
+
+    // Observe component state
+    go func() {
+        for {
+            select {
+            case <-ctx.Done():
+                return
+            case <-service.ch:
+            }
+        }
+    }()
+
+    // Run the service runtime
+    go func() {
+        err := service.Run(ctx, comm)
+        require.EqualError(t, err, context.Canceled.Error())
+    }()
+
+    // Start the service
+    service.actionCh <- actionModeSigned{
+        actionMode: actionStart,
+    }
+
+    expectedRestartLogMsg := fmt.Sprintf(
+        "failed to start endpoint service, err: %s, restarting after waiting for %v",
+        "failed install endpoint service: exit status 99", service.serviceRestartDelay,
+    )
+    require.Eventually(t, func() bool {
+        logs := logObs.TakeAll()
+        for _, l := range logs {
+            t.Logf("[%s] %s", l.Level, l.Message)
+            if l.Level == zapcore.ErrorLevel && l.Message == expectedRestartLogMsg {
+                return true
+            }
+        }
+        return false
+    }, service.serviceRestartDelay+1*time.Second, 500*time.Millisecond)
+}
+
 func mockEndpointBinary(t *testing.T, exitCode int) string {
     // Build a mock Endpoint binary that can return a specific exit code.
     outPath := filepath.Join(t.TempDir(), "mock_endpoint")
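
The new test asserts the retry by polling captured log entries until the expected error message appears. Below is a minimal sketch of that pattern using zap's stock observer package and testify; it assumes loggertest.New wraps an observed zap logger, and the test name and log text here are illustrative rather than elastic-agent helpers.

package example

import (
    "testing"
    "time"

    "github.com/stretchr/testify/require"
    "go.uber.org/zap"
    "go.uber.org/zap/zapcore"
    "go.uber.org/zap/zaptest/observer"
)

func TestLogEventuallyAppears(t *testing.T) {
    // Capture everything at debug level and above into an in-memory buffer.
    core, observed := observer.New(zapcore.DebugLevel)
    log := zap.New(core).Sugar()

    // Simulate an asynchronous component that logs the error a little later,
    // the way the service runtime logs its restart message after a failed start.
    go func() {
        time.Sleep(100 * time.Millisecond)
        log.Errorf("failed to start %s service, retrying", "endpoint")
    }()

    expected := "failed to start endpoint service, retrying"
    require.Eventually(t, func() bool {
        // TakeAll drains the buffer; entries logged between polls show up
        // on the next iteration.
        for _, entry := range observed.TakeAll() {
            if entry.Level == zapcore.ErrorLevel && entry.Message == expected {
                return true
            }
        }
        return false
    }, time.Second, 50*time.Millisecond)
}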
