@@ -6,6 +6,7 @@ package cmd
66
77import (
88 "context"
9+ goerrors "errors"
910 "fmt"
1011 "net/url"
1112 "os"
@@ -88,7 +89,6 @@ func newRunCommandWithArgs(_ []string, streams *cli.IOStreams) *cobra.Command {
8889 testingMode , _ := cmd .Flags ().GetBool ("testing-mode" )
8990 if err := run (nil , testingMode , fleetInitTimeout ); err != nil && ! errors .Is (err , context .Canceled ) {
9091 fmt .Fprintf (streams .Err , "Error: %v\n %s\n " , err , troubleshootMessage ())
91- logExternal (fmt .Sprintf ("%s run failed: %s" , paths .BinaryName , err ))
9292 return err
9393 }
9494 return nil
@@ -141,56 +141,82 @@ func run(override application.CfgOverrider, testingMode bool, fleetInitTimeout t
141141 defer cancel ()
142142 go service .ProcessWindowsControlEvents (stopBeat )
143143
144- upgradeDetailsFromMarker , err := handleUpgrade ()
145- if err != nil {
146- return fmt .Errorf ("error checking for and handling upgrade: %w" , err )
147- }
148-
149- locker := filelock .NewAppLocker (paths .Data (), paths .AgentLockFileName )
150- if err := locker .TryLock (); err != nil {
151- return err
152- }
153- defer func () {
154- _ = locker .Unlock ()
155- }()
156-
157- return runElasticAgent (ctx , cancel , override , stop , testingMode , fleetInitTimeout , upgradeDetailsFromMarker , modifiers ... )
144+ return runElasticAgentCritical (ctx , cancel , override , stop , testingMode , fleetInitTimeout , modifiers ... )
158145}
159146
160147func logReturn (l * logger.Logger , err error ) error {
161148 if err != nil && ! errors .Is (err , context .Canceled ) {
162149 l .Errorf ("%s" , err )
150+ logExternal (fmt .Sprintf ("%s run failed: %s" , paths .BinaryName , err ))
163151 }
164152 return err
165153}
166154
167- func runElasticAgent (
155+ // runElasticAgentCritical provides a critical path to running runElasticAgent, it exhausts all efforts to log any
156+ // errors to ensure that any issues are captured in the logs.
157+ func runElasticAgentCritical (
168158 ctx context.Context ,
169159 cancel context.CancelFunc ,
170160 override application.CfgOverrider ,
171161 stop chan bool ,
172162 testingMode bool ,
173163 fleetInitTimeout time.Duration ,
174- upgradeDetailsFromMarker * details.Details ,
175164 modifiers ... component.PlatformModifier ,
176165) error {
177- err := coordinator .RestoreConfig ()
166+ var errs []error
167+
168+ // early handleUpgrade, but don't error yet
169+ upgradeDetailsFromMarker , err := handleUpgrade ()
178170 if err != nil {
179- return err
171+ errs = append ( errs , fmt . Errorf ( "failed to handle upgrade: %w" , err ))
180172 }
181173
174+ // single run, but don't error yet
175+ locker := filelock .NewAppLocker (paths .Data (), paths .AgentLockFileName )
176+ lockErr := locker .TryLock ()
177+ if lockErr != nil {
178+ errs = append (errs , fmt .Errorf ("failed to get app lock: %w" , err ))
179+ }
180+ defer func () {
181+ _ = locker .Unlock ()
182+ }()
183+
184+ // try restore (if app locker didn't fail), but don't error yet
185+ if lockErr == nil {
186+ err = coordinator .RestoreConfig ()
187+ if err != nil {
188+ errs = append (errs , fmt .Errorf ("failed to restore configuration: %w" , err ))
189+ }
190+ }
191+
192+ // try load config, but don't error yet
182193 cfg , err := loadConfig (ctx , override )
183194 if err != nil {
184- return err
195+ // failed to load configuration, just load the default to create the logger
196+ errs = append (errs , fmt .Errorf ("failed to load configuration: %w" , err ))
197+ cfg = configuration .DefaultConfiguration ()
185198 }
186199
187- logLvl := logger .DefaultLogLevel
188- if cfg .Settings .LoggingConfig != nil {
189- logLvl = cfg .Settings .LoggingConfig .Level
190- }
191200 baseLogger , err := logger .NewFromConfig ("" , cfg .Settings .LoggingConfig , cfg .Settings .EventLoggingConfig , true )
192201 if err != nil {
193- return err
202+ errs = append (errs , fmt .Errorf ("failed to create logger: %w" , err ))
203+
204+ // failed to create the baseLogger, this comes from the configuration being possibly invalid
205+ // switch to a default config and try again
206+ cfg = configuration .DefaultConfiguration ()
207+ baseLogger , err = logger .NewFromConfig ("" , cfg .Settings .LoggingConfig , cfg .Settings .EventLoggingConfig , true )
208+ if err != nil {
209+ errs = append (errs , fmt .Errorf ("failed to create logger with default configuration: %w" , err ))
210+
211+ // this really should not happen, but this whole critical function is very defensive
212+ baseLogger , err = logger .New ("" , true )
213+ if err != nil {
214+ errs = append (errs , fmt .Errorf ("failed to create logger with no configuration: %w" , err ))
215+
216+ // again? no way, but you never know
217+ baseLogger = logger .NewWithoutConfig ("" )
218+ }
219+ }
194220 }
195221
196222 // Make sure to flush any buffered logs before we're done.
@@ -200,10 +226,39 @@ func runElasticAgent(
200226 "source" : agentName ,
201227 })
202228
229+ // at this point the logger is working, so any errors that we hit can now be logged and returned
230+ if len (errs ) > 0 {
231+ return logReturn (l , goerrors .Join (errs ... ))
232+ }
233+
234+ // actually run the agent now
235+ err = runElasticAgent (ctx , cancel , baseLogger , l , cfg , override , stop , testingMode , fleetInitTimeout , upgradeDetailsFromMarker , modifiers ... )
236+ return logReturn (l , err )
237+ }
238+
239+ // runElasticAgent runs the actual Elastic Agent.
240+ func runElasticAgent (
241+ ctx context.Context ,
242+ cancel context.CancelFunc ,
243+ baseLogger * logger.Logger ,
244+ l * logger.Logger ,
245+ cfg * configuration.Configuration ,
246+ override application.CfgOverrider ,
247+ stop chan bool ,
248+ testingMode bool ,
249+ fleetInitTimeout time.Duration ,
250+ upgradeDetailsFromMarker * details.Details ,
251+ modifiers ... component.PlatformModifier ,
252+ ) error {
253+ logLvl := logger .DefaultLogLevel
254+ if cfg .Settings .LoggingConfig != nil {
255+ logLvl = cfg .Settings .LoggingConfig .Level
256+ }
257+
203258 // try early to check if running as root
204259 isRoot , err := utils .HasRoot ()
205260 if err != nil {
206- return logReturn ( l , fmt .Errorf ("failed to check for root/Administrator privileges: %w" , err ) )
261+ return fmt .Errorf ("failed to check for root/Administrator privileges: %w" , err )
207262 }
208263
209264 l .Infow ("Elastic Agent started" ,
@@ -213,7 +268,7 @@ func runElasticAgent(
213268
214269 cfg , err = tryDelayEnroll (ctx , l , cfg , override )
215270 if err != nil {
216- return logReturn ( l , errors .New (err , "failed to perform delayed enrollment" ) )
271+ return errors .New (err , "failed to perform delayed enrollment" )
217272 }
218273
219274 // agent ID needs to stay empty in bootstrap mode
@@ -225,31 +280,31 @@ func runElasticAgent(
225280 // that writes the agentID into fleet.enc (encrypted fleet.yml) before even loading the configuration.
226281 err = secret .CreateAgentSecret (ctx , vault .WithUnprivileged (! isRoot ))
227282 if err != nil {
228- return logReturn ( l , fmt .Errorf ("failed to read/write secrets: %w" , err ) )
283+ return fmt .Errorf ("failed to read/write secrets: %w" , err )
229284 }
230285
231286 // Migrate .yml files if the corresponding .enc does not exist
232287
233288 // the encrypted config does not exist but the unencrypted file does
234289 err = migration .MigrateToEncryptedConfig (ctx , l , paths .AgentConfigYmlFile (), paths .AgentConfigFile ())
235290 if err != nil {
236- return logReturn ( l , errors .New (err , "error migrating fleet config" ) )
291+ return errors .New (err , "error migrating fleet config" )
237292 }
238293
239294 // the encrypted state does not exist but the unencrypted file does
240295 err = migration .MigrateToEncryptedConfig (ctx , l ,
241296 paths .AgentStateStoreYmlFile (),
242297 paths .AgentStateStoreFile ())
243298 if err != nil {
244- return logReturn ( l , errors .New (err , "error migrating agent state" ) )
299+ return errors .New (err , "error migrating agent state" )
245300 }
246301
247302 agentInfo , err := info .NewAgentInfoWithLog (ctx , defaultLogLevel (cfg , logLvl .String ()), createAgentID )
248303 if err != nil {
249- return logReturn ( l , errors .New (err ,
304+ return errors .New (err ,
250305 "could not load agent info" ,
251306 errors .TypeFilesystem ,
252- errors .M (errors .MetaKeyPath , paths .AgentConfigFile ())))
307+ errors .M (errors .MetaKeyPath , paths .AgentConfigFile ()))
253308 }
254309
255310 // Ensure that the log level now matches what is configured in the agentInfo.
@@ -275,14 +330,14 @@ func runElasticAgent(
275330
276331 execPath , err := reexecPath ()
277332 if err != nil {
278- return logReturn ( l , fmt .Errorf ("failed to get reexec path: %w" , err ) )
333+ return fmt .Errorf ("failed to get reexec path: %w" , err )
279334 }
280335 rexLogger := l .Named ("reexec" )
281336 rex := reexec .NewManager (rexLogger , execPath )
282337
283338 tracer , err := initTracer (agentName , release .Version (), cfg .Settings .MonitoringConfig )
284339 if err != nil {
285- return logReturn ( l , fmt .Errorf ("could not initiate APM tracer: %w" , err ) )
340+ return fmt .Errorf ("could not initiate APM tracer: %w" , err )
286341 }
287342 if tracer != nil {
288343 l .Info ("APM instrumentation enabled" )
@@ -298,12 +353,12 @@ func runElasticAgent(
298353 coord , configMgr , _ , err := application .New (ctx , l , baseLogger , logLvl , agentInfo , rex , tracer , testingMode ,
299354 fleetInitTimeout , isBootstrap , override , upgradeDetailsFromMarker , modifiers ... )
300355 if err != nil {
301- return logReturn ( l , err )
356+ return err
302357 }
303358
304359 monitoringServer , err := setupMetrics (l , cfg .Settings .DownloadConfig .OS (), cfg .Settings .MonitoringConfig , tracer , coord )
305360 if err != nil {
306- return logReturn ( l , err )
361+ return err
307362 }
308363 coord .RegisterMonitoringServer (monitoringServer )
309364 defer func () {
@@ -327,7 +382,7 @@ func runElasticAgent(
327382
328383 // start the control listener
329384 if err := control .Start (); err != nil {
330- return logReturn ( l , err )
385+ return err
331386 }
332387 defer control .Stop ()
333388
@@ -410,7 +465,7 @@ LOOP:
410465 if isRex {
411466 rex .ShutdownComplete ()
412467 }
413- return logReturn ( l , err )
468+ return err
414469}
415470
416471func loadConfig (ctx context.Context , override application.CfgOverrider ) (* configuration.Configuration , error ) {
0 commit comments