Skip to content

Commit d9a96c4

Browse files
wenxuan0923 and claude authored
Change bootstrap to daemon mode (#26)
Co-authored-by: Claude Sonnet 4.5 <noreply@anthropic.com>
1 parent 54b669b commit d9a96c4

File tree

12 files changed

+568
-83
lines changed

12 files changed

+568
-83
lines changed

.github/workflows/pr-checks.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ on:
1414
permissions:
1515
contents: read
1616
pull-requests: read
17+
security-events: write
1718

1819
jobs:
1920
build:

README.md

Lines changed: 11 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -145,32 +145,27 @@ EOF
145145

146146
| Command | Description | Usage |
147147
|---------|-------------|-------|
148-
| `bootstrap` | Transform VM into AKS node | `sudo aks-flex-node bootstrap` |
149-
| `unbootstrap` | Clean removal of all components | `sudo aks-flex-node unbootstrap` |
150-
| `version` | Show version information | `sudo aks-flex-node version` |
148+
| `agent` | Start agent daemon (bootstrap + monitoring) | `aks-flex-node agent --config /etc/aks-flex-node/config.json` |
149+
| `unbootstrap` | Clean removal of all components | `aks-flex-node unbootstrap --config /etc/aks-flex-node/config.json` |
150+
| `version` | Show version information | `aks-flex-node version` |
151151

152-
#### Bootstrap
152+
#### Agent Command (Bootstrap + Daemon)
153153
```bash
154154
# Option 1: Direct command execution
155-
aks-flex-node bootstrap
155+
aks-flex-node agent --config /etc/aks-flex-node/config.json
156156
cat /var/log/aks-flex-node/aks-flex-node.log
157157

158158
# Option 2: Using systemd service
159-
sudo systemctl enable aks-flex-node@bootstrap.service; sudo systemctl start aks-flex-node@bootstrap
160-
journalctl -u aks-flex-node@bootstrap --since "1 minutes ago" -f
159+
sudo systemctl enable aks-flex-node-agent.service; sudo systemctl start aks-flex-node-agent
160+
journalctl -u aks-flex-node-agent --since "1 minutes ago" -f
161161

162162
```
163163

164164
#### Unbootstrap
165165
```bash
166-
# Option 1: Direct command execution
167-
aks-flex-node unbootstrap
166+
# Direct command execution
167+
aks-flex-node unbootstrap --config /etc/aks-flex-node/config.json
168168
cat /var/log/aks-flex-node/aks-flex-node.log
169-
170-
# Option 2: Using systemd service
171-
sudo systemctl enable aks-flex-node@unbootstrap.service; sudo systemctl start aks-flex-node@unbootstrap
172-
journalctl -u aks-flex-node@unbootstrap --since "1 minutes ago" -f
173-
174169
```
175170

176171
## Authentication Flow:
@@ -206,14 +201,14 @@ The service principal must have the same permissions listed in the Prerequisites
206201
### Complete Removal
207202
```bash
208203
# First run unbootstrap to cleanly disconnect from Arc and AKS cluster
209-
aks-flex-node unbootstrap
204+
aks-flex-node unbootstrap --config /etc/aks-flex-node/config.json
210205

211206
# Then run automated uninstall to remove all components
212207
curl -fsSL https://raw.githubusercontent.com/Azure/AKSFlexNode/main/scripts/uninstall.sh | sudo bash
213208
```
214209

215210
The uninstall script will:
216-
- Stop and disable aks-flex-node systemd services (bootstrap/unbootstrap)
211+
- Stop and disable the aks-flex-node agent service
217212
- Remove the service user and permissions
218213
- Clean up all directories and configuration files
219214
- Remove the binary and systemd service files
Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,26 @@
11
[Unit]
2-
Description=AKS Flex Node Agent - %i
2+
Description=AKS Flex Node Agent
33
After=network-online.target
44
Wants=network-online.target
5+
# Rate-limit automatic restarts: at most 5 start attempts within 300 seconds
6+
StartLimitIntervalSec=300
7+
StartLimitBurst=5
58

69
[Service]
710
Type=simple
811
RemainAfterExit=no
9-
ExecStart=/usr/local/bin/aks-flex-node %i --config /etc/aks-flex-node/config.json
10-
TimeoutStartSec=30
11-
TimeoutStopSec=3600
12+
ExecStart=/usr/local/bin/aks-flex-node agent --config /etc/aks-flex-node/config.json
13+
TimeoutStartSec=300
14+
TimeoutStopSec=60
15+
# Restart configuration for daemon resilience
16+
Restart=on-failure
17+
RestartSec=30
1218
User=aks-flex-node
1319
Group=aks-flex-node
1420
SupplementaryGroups=himds ubuntu
1521
Environment=AZURE_CONFIG_DIR=/home/ubuntu/.azure
22+
RuntimeDirectory=aks-flex-node
23+
RuntimeDirectoryMode=0755
1624
StandardOutput=journal
1725
StandardError=journal
1826

@@ -38,7 +46,7 @@ RestrictSUIDSGID=false
3846
RemoveIPC=false
3947

4048
# Allow access to specific paths that need modification (- prefix makes paths optional)
41-
ReadWritePaths=-/etc/kubernetes -/var/lib/kubelet -/var/lib/containerd -/etc/containerd -/opt/cni -/etc/cni -/etc/systemd/system -/etc/sysctl.d -/etc/modules-load.d -/var/log/aks-flex-node -/tmp
49+
ReadWritePaths=-/etc/kubernetes -/var/lib/kubelet -/var/lib/containerd -/etc/containerd -/opt/cni -/etc/cni -/etc/systemd/system -/etc/sysctl.d -/etc/modules-load.d -/var/log/aks-flex-node -/tmp -/etc/aks-flex-node -/run/aks-flex-node
4250

4351
[Install]
4452
WantedBy=multi-user.target

aks-flex-node-sudoers

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,30 @@ aks-flex-node ALL=(root) NOPASSWD:SETENV: /bin/systemctl restart kubelet
2525
aks-flex-node ALL=(root) NOPASSWD:SETENV: /bin/systemctl restart containerd
2626
aks-flex-node ALL=(root) NOPASSWD:SETENV: /bin/systemctl status kubelet
2727
aks-flex-node ALL=(root) NOPASSWD:SETENV: /bin/systemctl status containerd
28+
aks-flex-node ALL=(root) NOPASSWD:SETENV: /bin/systemctl check kubelet
29+
aks-flex-node ALL=(root) NOPASSWD:SETENV: /bin/systemctl check containerd
2830
aks-flex-node ALL=(root) NOPASSWD:SETENV: /bin/systemctl is-active *
2931
aks-flex-node ALL=(root) NOPASSWD:SETENV: /bin/systemctl is-enabled *
3032
aks-flex-node ALL=(root) NOPASSWD:SETENV: /bin/systemctl list-unit-files *
33+
aks-flex-node ALL=(root) NOPASSWD:SETENV: /usr/bin/systemctl daemon-reload
34+
aks-flex-node ALL=(root) NOPASSWD:SETENV: /usr/bin/systemctl enable kubelet
35+
aks-flex-node ALL=(root) NOPASSWD:SETENV: /usr/bin/systemctl enable containerd
36+
aks-flex-node ALL=(root) NOPASSWD:SETENV: /usr/bin/systemctl enable --now kubelet
37+
aks-flex-node ALL=(root) NOPASSWD:SETENV: /usr/bin/systemctl enable --now containerd
38+
aks-flex-node ALL=(root) NOPASSWD:SETENV: /usr/bin/systemctl disable kubelet
39+
aks-flex-node ALL=(root) NOPASSWD:SETENV: /usr/bin/systemctl disable containerd
40+
aks-flex-node ALL=(root) NOPASSWD:SETENV: /usr/bin/systemctl start kubelet
41+
aks-flex-node ALL=(root) NOPASSWD:SETENV: /usr/bin/systemctl start containerd
42+
aks-flex-node ALL=(root) NOPASSWD:SETENV: /usr/bin/systemctl stop kubelet
43+
aks-flex-node ALL=(root) NOPASSWD:SETENV: /usr/bin/systemctl stop containerd
44+
aks-flex-node ALL=(root) NOPASSWD:SETENV: /usr/bin/systemctl restart kubelet
45+
aks-flex-node ALL=(root) NOPASSWD:SETENV: /usr/bin/systemctl restart containerd
46+
aks-flex-node ALL=(root) NOPASSWD:SETENV: /usr/bin/systemctl status kubelet
47+
aks-flex-node ALL=(root) NOPASSWD:SETENV: /usr/bin/systemctl status containerd
48+
aks-flex-node ALL=(root) NOPASSWD:SETENV: /usr/bin/systemctl check kubelet
49+
aks-flex-node ALL=(root) NOPASSWD:SETENV: /usr/bin/systemctl check containerd
50+
aks-flex-node ALL=(root) NOPASSWD:SETENV: /usr/bin/systemctl is-active *
51+
aks-flex-node ALL=(root) NOPASSWD:SETENV: /usr/bin/systemctl is-enabled *
3152
aks-flex-node ALL=(root) NOPASSWD:SETENV: /usr/bin/systemctl list-unit-files *
3253

3354
# Package management (for installing Kubernetes components)
@@ -47,16 +68,24 @@ aks-flex-node ALL=(root) NOPASSWD:SETENV: /bin/ln *, /usr/bin/ln *
4768
aks-flex-node ALL=(root) NOPASSWD:SETENV: /usr/bin/install *, /bin/install *
4869
aks-flex-node ALL=(root) NOPASSWD:SETENV: /usr/bin/curl *, /usr/bin/wget *
4970
aks-flex-node ALL=(root) NOPASSWD:SETENV: /bin/tar *, /usr/bin/unzip *
71+
aks-flex-node ALL=(root) NOPASSWD:SETENV: /bin/ls *, /usr/bin/ls *
72+
aks-flex-node ALL=(root) NOPASSWD:SETENV: /usr/bin/test *, /bin/test *
5073

5174
# System configuration for Kubernetes
5275
aks-flex-node ALL=(root) NOPASSWD:SETENV: /sbin/sysctl --system
5376
aks-flex-node ALL=(root) NOPASSWD:SETENV: /sbin/modprobe overlay
5477
aks-flex-node ALL=(root) NOPASSWD:SETENV: /sbin/modprobe br_netfilter
5578

56-
# Configuration file management
79+
# Configuration file management and reading
5780
aks-flex-node ALL=(root) NOPASSWD:SETENV: /bin/tee /etc/sysctl.d/k8s.conf
5881
aks-flex-node ALL=(root) NOPASSWD:SETENV: /bin/tee /etc/modules-load.d/k8s.conf
5982
aks-flex-node ALL=(root) NOPASSWD:SETENV: /bin/tee /etc/containerd/config.toml
83+
aks-flex-node ALL=(root) NOPASSWD:SETENV: /usr/bin/cat /etc/default/kubelet
84+
aks-flex-node ALL=(root) NOPASSWD:SETENV: /usr/bin/cat /etc/systemd/system/kubelet.service.d/*
85+
aks-flex-node ALL=(root) NOPASSWD:SETENV: /usr/bin/cat /etc/kubernetes/*
86+
aks-flex-node ALL=(root) NOPASSWD:SETENV: /bin/cat /etc/default/kubelet
87+
aks-flex-node ALL=(root) NOPASSWD:SETENV: /bin/cat /etc/systemd/system/kubelet.service.d/*
88+
aks-flex-node ALL=(root) NOPASSWD:SETENV: /bin/cat /etc/kubernetes/*
6089

6190
# Network operations for troubleshooting
6291
aks-flex-node ALL=(root) NOPASSWD:SETENV: /sbin/ip route

commands.go

Lines changed: 157 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,19 @@ package main
22

33
import (
44
"context"
5+
"encoding/json"
56
"fmt"
7+
"os"
8+
"path/filepath"
9+
"time"
610

711
"github.com/sirupsen/logrus"
812
"github.com/spf13/cobra"
913

1014
"go.goms.io/aks/AKSFlexNode/pkg/bootstrapper"
1115
"go.goms.io/aks/AKSFlexNode/pkg/config"
1216
"go.goms.io/aks/AKSFlexNode/pkg/logger"
17+
"go.goms.io/aks/AKSFlexNode/pkg/status"
1318
)
1419

1520
// Version information variables (set at build time)
@@ -19,14 +24,14 @@ var (
1924
BuildTime = "unknown"
2025
)
2126

22-
// NewBootstrapCommand creates a new bootstrap command
23-
func NewBootstrapCommand() *cobra.Command {
27+
// NewAgentCommand creates a new agent command
28+
func NewAgentCommand() *cobra.Command {
2429
cmd := &cobra.Command{
25-
Use: "bootstrap",
26-
Short: "Bootstrap AKS node with Arc connection",
27-
Long: "Initialize and configure this machine as an AKS node connected through Azure Arc",
30+
Use: "agent",
31+
Short: "Start AKS node agent with Arc connection",
32+
Long: "Initialize and run the AKS node agent daemon with automatic status tracking and self-recovery",
2833
RunE: func(cmd *cobra.Command, args []string) error {
29-
return runBootstrap(cmd.Context())
34+
return runAgent(cmd.Context())
3035
},
3136
}
3237

@@ -61,8 +66,8 @@ func NewVersionCommand() *cobra.Command {
6166
return cmd
6267
}
6368

64-
// runBootstrap executes the bootstrap process
65-
func runBootstrap(ctx context.Context) error {
69+
// runAgent executes the bootstrap process and then runs as daemon
70+
func runAgent(ctx context.Context) error {
6671
logger := logger.GetLoggerFromContext(ctx)
6772

6873
cfg, err := config.LoadConfig(configPath)
@@ -76,8 +81,14 @@ func runBootstrap(ctx context.Context) error {
7681
return err
7782
}
7883

79-
// Handle and log the result
80-
return handleExecutionResult(result, "bootstrap", logger)
84+
// Handle and log the bootstrap result
85+
if err := handleExecutionResult(result, "bootstrap", logger); err != nil {
86+
return err
87+
}
88+
89+
// After successful bootstrap, transition to daemon mode
90+
logger.Info("Bootstrap completed successfully, transitioning to daemon mode...")
91+
return runDaemonLoop(ctx, cfg)
8192
}
8293

8394
// runUnbootstrap executes the unbootstrap process
@@ -107,6 +118,142 @@ func runVersion() {
107118
fmt.Printf("Build Time: %s\n", BuildTime)
108119
}
109120

121+
// runDaemonLoop runs the periodic status collection and bootstrap monitoring daemon
122+
func runDaemonLoop(ctx context.Context, cfg *config.Config) error {
123+
logger := logger.GetLoggerFromContext(ctx)
124+
// Create status file directory - using runtime directory for service or temp for development
125+
statusFilePath := status.GetStatusFilePath()
126+
statusDir := filepath.Dir(statusFilePath)
127+
if err := os.MkdirAll(statusDir, 0750); err != nil {
128+
return fmt.Errorf("failed to create status directory %s: %w", statusDir, err)
129+
}
130+
131+
// Clean up any stale status file on daemon startup
132+
if _, err := os.Stat(statusFilePath); err == nil {
133+
logger.Info("Removing stale status file from previous daemon session...")
134+
if err := os.Remove(statusFilePath); err != nil {
135+
logger.Warnf("Failed to remove stale status file: %v", err)
136+
} else {
137+
logger.Info("Stale status file removed successfully")
138+
}
139+
}
140+
141+
logger.Info("Starting periodic status collection daemon (status: 1 minutes, bootstrap check: 2 minute)")
142+
143+
// Create tickers for different intervals
144+
statusTicker := time.NewTicker(1 * time.Minute)
145+
bootstrapTicker := time.NewTicker(2 * time.Minute)
146+
defer statusTicker.Stop()
147+
defer bootstrapTicker.Stop()
148+
149+
// Collect status immediately on start
150+
if err := collectAndWriteStatus(ctx, cfg, statusFilePath); err != nil {
151+
logger.Errorf("Failed to collect initial status: %v", err)
152+
}
153+
154+
// Run the periodic collection and monitoring loop
155+
for {
156+
select {
157+
case <-ctx.Done():
158+
logger.Info("Daemon shutting down due to context cancellation")
159+
return ctx.Err()
160+
case <-statusTicker.C:
161+
logger.Infof("Starting periodic status collection at %s...", time.Now().Format("2006-01-02 15:04:05"))
162+
if err := collectAndWriteStatus(ctx, cfg, statusFilePath); err != nil {
163+
logger.Errorf("Failed to collect status at %s: %v", time.Now().Format("2006-01-02 15:04:05"), err)
164+
// Continue running even if status collection fails
165+
} else {
166+
logger.Infof("Status collection completed successfully at %s", time.Now().Format("2006-01-02 15:04:05"))
167+
}
168+
case <-bootstrapTicker.C:
169+
logger.Infof("Starting bootstrap health check at %s...", time.Now().Format("2006-01-02 15:04:05"))
170+
if err := checkAndBootstrap(ctx, cfg); err != nil {
171+
logger.Errorf("Auto-bootstrap check failed at %s: %v", time.Now().Format("2006-01-02 15:04:05"), err)
172+
// Continue running even if bootstrap check fails
173+
} else {
174+
logger.Infof("Bootstrap health check completed at %s", time.Now().Format("2006-01-02 15:04:05"))
175+
}
176+
}
177+
}
178+
}
179+
180+
// checkAndBootstrap checks if the node needs re-bootstrapping and performs it if necessary
181+
func checkAndBootstrap(ctx context.Context, cfg *config.Config) error {
182+
logger := logger.GetLoggerFromContext(ctx)
183+
// Create status collector to check bootstrap requirements
184+
collector := status.NewCollector(cfg, logger, Version)
185+
186+
// Check if bootstrap is needed
187+
needsBootstrap := collector.NeedsBootstrap(ctx)
188+
if !needsBootstrap {
189+
return nil // All good, no action needed
190+
}
191+
192+
logger.Info("Node requires re-bootstrapping, initiating auto-bootstrap...")
193+
194+
// Perform bootstrap
195+
bootstrapExecutor := bootstrapper.New(cfg, logger)
196+
result, err := bootstrapExecutor.Bootstrap(ctx)
197+
if err != nil {
198+
// Bootstrap failed - remove status file so next check will detect the problem
199+
removeStatusFile(ctx)
200+
return fmt.Errorf("auto-bootstrap failed: %s", err)
201+
}
202+
203+
// Handle and log the bootstrap result
204+
if err := handleExecutionResult(result, "auto-bootstrap", logger); err != nil {
205+
// Bootstrap execution failed - remove status file so next check will detect the problem
206+
removeStatusFile(ctx)
207+
return fmt.Errorf("auto-bootstrap execution failed: %s", err)
208+
}
209+
210+
logger.Info("Auto-bootstrap completed successfully")
211+
return nil
212+
}
213+
214+
func removeStatusFile(ctx context.Context) {
215+
logger := logger.GetLoggerFromContext(ctx)
216+
statusFilePath := status.GetStatusFilePath()
217+
if removeErr := os.Remove(statusFilePath); removeErr != nil {
218+
logger.Debugf("Failed to remove status file: %s", removeErr)
219+
} else {
220+
logger.Debug("Removed status file successfully")
221+
}
222+
}
223+
224+
// collectAndWriteStatus collects current node status and writes it to the status file
225+
func collectAndWriteStatus(ctx context.Context, cfg *config.Config, statusFilePath string) error {
226+
logger := logger.GetLoggerFromContext(ctx)
227+
228+
// Create status collector
229+
collector := status.NewCollector(cfg, logger, Version)
230+
231+
// Collect comprehensive status
232+
nodeStatus, err := collector.CollectStatus(ctx)
233+
if err != nil {
234+
return fmt.Errorf("failed to collect node status: %w", err)
235+
}
236+
237+
// Write status to JSON file
238+
statusData, err := json.MarshalIndent(nodeStatus, "", " ")
239+
if err != nil {
240+
return fmt.Errorf("failed to marshal status to JSON: %w", err)
241+
}
242+
243+
// Write to temporary file first, then rename (atomic operation)
244+
tempFile := statusFilePath + ".tmp"
245+
if err := os.WriteFile(tempFile, statusData, 0600); err != nil {
246+
return fmt.Errorf("failed to write status to temp file: %w", err)
247+
}
248+
249+
if err := os.Rename(tempFile, statusFilePath); err != nil {
250+
return fmt.Errorf("failed to rename temp status file: %w", err)
251+
}
252+
253+
logger.Debugf("Status written to %s", statusFilePath)
254+
return nil
255+
}
256+
110257
// handleExecutionResult processes and logs execution results
111258
func handleExecutionResult(result *bootstrapper.ExecutionResult, operation string, logger *logrus.Logger) error {
112259
if result == nil {

0 commit comments

Comments
 (0)