@@ -2,14 +2,19 @@ package main
22
33import (
44 "context"
5+ "encoding/json"
56 "fmt"
7+ "os"
8+ "path/filepath"
9+ "time"
610
711 "github.com/sirupsen/logrus"
812 "github.com/spf13/cobra"
913
1014 "go.goms.io/aks/AKSFlexNode/pkg/bootstrapper"
1115 "go.goms.io/aks/AKSFlexNode/pkg/config"
1216 "go.goms.io/aks/AKSFlexNode/pkg/logger"
17+ "go.goms.io/aks/AKSFlexNode/pkg/status"
1318)
1419
1520// Version information variables (set at build time)
@@ -19,14 +24,14 @@ var (
1924 BuildTime = "unknown"
2025)
2126
22- // NewBootstrapCommand creates a new bootstrap command
23- func NewBootstrapCommand () * cobra.Command {
27+ // NewAgentCommand creates a new agent command
28+ func NewAgentCommand () * cobra.Command {
2429 cmd := & cobra.Command {
25- Use : "bootstrap " ,
26- Short : "Bootstrap AKS node with Arc connection" ,
27- Long : "Initialize and configure this machine as an AKS node connected through Azure Arc " ,
30+ Use : "agent " ,
31+ Short : "Start AKS node agent with Arc connection" ,
32+ Long : "Initialize and run the AKS node agent daemon with automatic status tracking and self-recovery " ,
2833 RunE : func (cmd * cobra.Command , args []string ) error {
29- return runBootstrap (cmd .Context ())
34+ return runAgent (cmd .Context ())
3035 },
3136 }
3237
@@ -61,8 +66,8 @@ func NewVersionCommand() *cobra.Command {
6166 return cmd
6267}
6368
64- // runBootstrap executes the bootstrap process
65- func runBootstrap (ctx context.Context ) error {
69+ // runAgent executes the bootstrap process and then runs as daemon
70+ func runAgent (ctx context.Context ) error {
6671 logger := logger .GetLoggerFromContext (ctx )
6772
6873 cfg , err := config .LoadConfig (configPath )
@@ -76,8 +81,14 @@ func runBootstrap(ctx context.Context) error {
7681 return err
7782 }
7883
79- // Handle and log the result
80- return handleExecutionResult (result , "bootstrap" , logger )
84+ // Handle and log the bootstrap result
85+ if err := handleExecutionResult (result , "bootstrap" , logger ); err != nil {
86+ return err
87+ }
88+
89+ // After successful bootstrap, transition to daemon mode
90+ logger .Info ("Bootstrap completed successfully, transitioning to daemon mode..." )
91+ return runDaemonLoop (ctx , cfg )
8192}
8293
8394// runUnbootstrap executes the unbootstrap process
@@ -107,6 +118,142 @@ func runVersion() {
107118 fmt .Printf ("Build Time: %s\n " , BuildTime )
108119}
109120
121+ // runDaemonLoop runs the periodic status collection and bootstrap monitoring daemon
122+ func runDaemonLoop (ctx context.Context , cfg * config.Config ) error {
123+ logger := logger .GetLoggerFromContext (ctx )
124+ // Create status file directory - using runtime directory for service or temp for development
125+ statusFilePath := status .GetStatusFilePath ()
126+ statusDir := filepath .Dir (statusFilePath )
127+ if err := os .MkdirAll (statusDir , 0750 ); err != nil {
128+ return fmt .Errorf ("failed to create status directory %s: %w" , statusDir , err )
129+ }
130+
131+ // Clean up any stale status file on daemon startup
132+ if _ , err := os .Stat (statusFilePath ); err == nil {
133+ logger .Info ("Removing stale status file from previous daemon session..." )
134+ if err := os .Remove (statusFilePath ); err != nil {
135+ logger .Warnf ("Failed to remove stale status file: %v" , err )
136+ } else {
137+ logger .Info ("Stale status file removed successfully" )
138+ }
139+ }
140+
141+ logger .Info ("Starting periodic status collection daemon (status: 1 minutes, bootstrap check: 2 minute)" )
142+
143+ // Create tickers for different intervals
144+ statusTicker := time .NewTicker (1 * time .Minute )
145+ bootstrapTicker := time .NewTicker (2 * time .Minute )
146+ defer statusTicker .Stop ()
147+ defer bootstrapTicker .Stop ()
148+
149+ // Collect status immediately on start
150+ if err := collectAndWriteStatus (ctx , cfg , statusFilePath ); err != nil {
151+ logger .Errorf ("Failed to collect initial status: %v" , err )
152+ }
153+
154+ // Run the periodic collection and monitoring loop
155+ for {
156+ select {
157+ case <- ctx .Done ():
158+ logger .Info ("Daemon shutting down due to context cancellation" )
159+ return ctx .Err ()
160+ case <- statusTicker .C :
161+ logger .Infof ("Starting periodic status collection at %s..." , time .Now ().Format ("2006-01-02 15:04:05" ))
162+ if err := collectAndWriteStatus (ctx , cfg , statusFilePath ); err != nil {
163+ logger .Errorf ("Failed to collect status at %s: %v" , time .Now ().Format ("2006-01-02 15:04:05" ), err )
164+ // Continue running even if status collection fails
165+ } else {
166+ logger .Infof ("Status collection completed successfully at %s" , time .Now ().Format ("2006-01-02 15:04:05" ))
167+ }
168+ case <- bootstrapTicker .C :
169+ logger .Infof ("Starting bootstrap health check at %s..." , time .Now ().Format ("2006-01-02 15:04:05" ))
170+ if err := checkAndBootstrap (ctx , cfg ); err != nil {
171+ logger .Errorf ("Auto-bootstrap check failed at %s: %v" , time .Now ().Format ("2006-01-02 15:04:05" ), err )
172+ // Continue running even if bootstrap check fails
173+ } else {
174+ logger .Infof ("Bootstrap health check completed at %s" , time .Now ().Format ("2006-01-02 15:04:05" ))
175+ }
176+ }
177+ }
178+ }
179+
180+ // checkAndBootstrap checks if the node needs re-bootstrapping and performs it if necessary
181+ func checkAndBootstrap (ctx context.Context , cfg * config.Config ) error {
182+ logger := logger .GetLoggerFromContext (ctx )
183+ // Create status collector to check bootstrap requirements
184+ collector := status .NewCollector (cfg , logger , Version )
185+
186+ // Check if bootstrap is needed
187+ needsBootstrap := collector .NeedsBootstrap (ctx )
188+ if ! needsBootstrap {
189+ return nil // All good, no action needed
190+ }
191+
192+ logger .Info ("Node requires re-bootstrapping, initiating auto-bootstrap..." )
193+
194+ // Perform bootstrap
195+ bootstrapExecutor := bootstrapper .New (cfg , logger )
196+ result , err := bootstrapExecutor .Bootstrap (ctx )
197+ if err != nil {
198+ // Bootstrap failed - remove status file so next check will detect the problem
199+ removeStatusFile (ctx )
200+ return fmt .Errorf ("auto-bootstrap failed: %s" , err )
201+ }
202+
203+ // Handle and log the bootstrap result
204+ if err := handleExecutionResult (result , "auto-bootstrap" , logger ); err != nil {
205+ // Bootstrap execution failed - remove status file so next check will detect the problem
206+ removeStatusFile (ctx )
207+ return fmt .Errorf ("auto-bootstrap execution failed: %s" , err )
208+ }
209+
210+ logger .Info ("Auto-bootstrap completed successfully" )
211+ return nil
212+ }
213+
214+ func removeStatusFile (ctx context.Context ) {
215+ logger := logger .GetLoggerFromContext (ctx )
216+ statusFilePath := status .GetStatusFilePath ()
217+ if removeErr := os .Remove (statusFilePath ); removeErr != nil {
218+ logger .Debugf ("Failed to remove status file: %s" , removeErr )
219+ } else {
220+ logger .Debug ("Removed status file successfully" )
221+ }
222+ }
223+
224+ // collectAndWriteStatus collects current node status and writes it to the status file
225+ func collectAndWriteStatus (ctx context.Context , cfg * config.Config , statusFilePath string ) error {
226+ logger := logger .GetLoggerFromContext (ctx )
227+
228+ // Create status collector
229+ collector := status .NewCollector (cfg , logger , Version )
230+
231+ // Collect comprehensive status
232+ nodeStatus , err := collector .CollectStatus (ctx )
233+ if err != nil {
234+ return fmt .Errorf ("failed to collect node status: %w" , err )
235+ }
236+
237+ // Write status to JSON file
238+ statusData , err := json .MarshalIndent (nodeStatus , "" , " " )
239+ if err != nil {
240+ return fmt .Errorf ("failed to marshal status to JSON: %w" , err )
241+ }
242+
243+ // Write to temporary file first, then rename (atomic operation)
244+ tempFile := statusFilePath + ".tmp"
245+ if err := os .WriteFile (tempFile , statusData , 0600 ); err != nil {
246+ return fmt .Errorf ("failed to write status to temp file: %w" , err )
247+ }
248+
249+ if err := os .Rename (tempFile , statusFilePath ); err != nil {
250+ return fmt .Errorf ("failed to rename temp status file: %w" , err )
251+ }
252+
253+ logger .Debugf ("Status written to %s" , statusFilePath )
254+ return nil
255+ }
256+
110257// handleExecutionResult processes and logs execution results
111258func handleExecutionResult (result * bootstrapper.ExecutionResult , operation string , logger * logrus.Logger ) error {
112259 if result == nil {
0 commit comments