Skip to content

Commit 15a5b9d

Browse files
authored
Merge pull request #226 from rancher-sandbox/seamless-upgrade
feat: make agent protection persistent during rolling update
2 parents 0ca6780 + 2561bfd commit 15a5b9d

9 files changed

Lines changed: 318 additions & 21 deletions

File tree

charts/runtime-enforcer/templates/agent/daemonset.yaml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,11 @@ spec:
99
matchLabels:
1010
app.kubernetes.io/component: agent
1111
{{- include "runtime-enforcer.selectorLabels" . | nindent 6 }}
12+
updateStrategy:
13+
type: RollingUpdate
14+
rollingUpdate:
15+
maxUnavailable: 0
16+
maxSurge: 1
1217
template:
1318
metadata:
1419
labels:
@@ -59,6 +64,12 @@ spec:
5964
| default .Chart.AppVersion }}
6065
imagePullPolicy: {{ .Values.agent.agent.image.pullPolicy }}
6166
name: agent
67+
startupProbe:
68+
httpGet:
69+
path: /readyz
70+
port: 8081
71+
initialDelaySeconds: 5
72+
periodSeconds: 3
6273
resources: {{- toYaml .Values.agent.agent.resources | nindent 10 }}
6374
securityContext: {{- toYaml .Values.agent.agent.containerSecurityContext | nindent 10 }}
6475
volumeMounts:

cmd/agent/main.go

Lines changed: 33 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,10 @@ package main
22

33
import (
44
"context"
5+
"errors"
56
"flag"
67
"fmt"
8+
"net/http"
79
"os"
810

911
"github.com/cilium/ebpf"
@@ -32,16 +34,20 @@ type Config struct {
3234
enableLearning bool
3335
nriSocketPath string
3436
nriPluginIdx string
37+
probeAddr string
3538
}
3639

3740
// +kubebuilder:rbac:groups=security.rancher.io,resources=workloadpolicies,verbs=get;list;watch
3841

39-
func newControllerManager() (manager.Manager, error) {
42+
func newControllerManager(config Config) (manager.Manager, error) {
4043
scheme := runtime.NewScheme()
4144
utilruntime.Must(v1alpha1.AddToScheme(scheme))
4245
utilruntime.Must(clientgoscheme.AddToScheme(scheme))
4346
utilruntime.Must(securityv1alpha1.AddToScheme(scheme))
44-
controllerOptions := ctrl.Options{Scheme: scheme}
47+
controllerOptions := ctrl.Options{
48+
Scheme: scheme,
49+
HealthProbeBindAddress: config.probeAddr,
50+
}
4551
mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), controllerOptions)
4652
if err != nil {
4753
return nil, fmt.Errorf("unable to start manager: %w", err)
@@ -55,7 +61,7 @@ func startAgent(ctx context.Context, logger *slog.Logger, config Config) error {
5561
//////////////////////
5662
// Create controller manager
5763
//////////////////////
58-
ctrlMgr, err := newControllerManager()
64+
ctrlMgr, err := newControllerManager(config)
5965
if err != nil {
6066
return fmt.Errorf("cannot create manager: %w", err)
6167
}
@@ -118,6 +124,12 @@ func startAgent(ctx context.Context, logger *slog.Logger, config Config) error {
118124
return fmt.Errorf("failed to add NRI handler to controller manager: %w", err)
119125
}
120126

127+
// controller-runtime doesn't support a separate startup probe, so we use the readiness probe instead.
128+
// See https://github.com/kubernetes-sigs/controller-runtime/issues/2644 for more details.
129+
if err = ctrlMgr.AddReadyzCheck("resolver readyz", resolver.Ping); err != nil {
130+
return fmt.Errorf("failed to add resolver's readiness probe: %w", err)
131+
}
132+
121133
//////////////////////
122134
// Create the scraper
123135
//////////////////////
@@ -139,7 +151,23 @@ func startAgent(ctx context.Context, logger *slog.Logger, config Config) error {
139151
if err != nil {
140152
return fmt.Errorf("cannot get workload policy informer: %w", err)
141153
}
142-
_, _ = workloadPolicyInformer.AddEventHandler(resolver.PolicyEventHandlers())
154+
handlerRegistration, err := workloadPolicyInformer.AddEventHandler(resolver.PolicyEventHandlers())
155+
if err != nil {
156+
return fmt.Errorf("failed to add event handler for workload policy: %w", err)
157+
}
158+
159+
if err = ctrlMgr.AddReadyzCheck("policy readyz", func(_ *http.Request) error {
160+
// Instead of informer.HasSynced(), which checks if the internal storage is synced,
161+
// we use ResourceEventHandlerRegistration.HasSynced() to ensure that
162+
// the event handlers have been synced.
163+
if !handlerRegistration.HasSynced() {
164+
logger.Warn("workload policy informer has not yet synced")
165+
return errors.New("workload policy informer has not yet synced")
166+
}
167+
return nil
168+
}); err != nil {
169+
return fmt.Errorf("failed to add NRI handler's readiness probe: %w", err)
170+
}
143171

144172
logger.InfoContext(ctx, "starting manager")
145173
if err = ctrlMgr.Start(ctx); err != nil {
@@ -167,6 +195,7 @@ func main() {
167195
flag.BoolVar(&config.enableLearning, "enable-learning", false, "Enable learning mode")
168196
flag.StringVar(&config.nriSocketPath, "nri-socket-path", "/var/run/nri/nri.sock", "NRI socket path")
169197
flag.StringVar(&config.nriPluginIdx, "nri-plugin-index", "00", "NRI plugin index")
198+
flag.StringVar(&config.probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
170199

171200
flag.Parse()
172201

internal/nri/handler.go

Lines changed: 24 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,24 @@ type Handler struct {
2424
resolver *resolver.Resolver
2525
}
2626

27+
func newNRIPlugin(logger *slog.Logger, resolver *resolver.Resolver, opts ...stub.Option) (*plugin, error) {
28+
var err error
29+
p := &plugin{
30+
logger: logger.With("component", "nri-plugin"),
31+
resolver: resolver,
32+
}
33+
34+
p.stub, err = stub.New(p, opts...)
35+
if err != nil {
36+
return nil, fmt.Errorf("failed to create NRI plugin stub: %w", err)
37+
}
38+
return p, nil
39+
}
40+
41+
func (p *plugin) Run(ctx context.Context) error {
42+
return p.stub.Run(ctx)
43+
}
44+
2745
func NewNRIHandler(socketPath, pluginIndex string, logger *slog.Logger, r *resolver.Resolver) (*Handler, error) {
2846
h := &Handler{
2947
socketPath: socketPath,
@@ -71,24 +89,17 @@ func (h *Handler) checkNRISupport() error {
7189
}
7290

7391
func (h *Handler) startNRIPlugin(ctx context.Context) error {
74-
var err error
75-
76-
p := &plugin{
77-
logger: h.logger.With("component", "nri-plugin"),
78-
resolver: h.resolver,
79-
}
80-
81-
opts := []stub.Option{
92+
p, err := newNRIPlugin(
93+
h.logger,
94+
h.resolver,
8295
stub.WithPluginIdx(h.pluginIndex),
8396
stub.WithSocketPath(h.socketPath),
84-
}
85-
86-
p.stub, err = stub.New(p, opts...)
97+
)
8798
if err != nil {
88-
return fmt.Errorf("failed to create NRI plugin stub: %w", err)
99+
return fmt.Errorf("failed to create NRI plugin: %w", err)
89100
}
90101

91-
err = p.stub.Run(ctx)
102+
err = p.Run(ctx)
92103
if err != nil {
93104
return fmt.Errorf("NRI plugin exited with error: %w", err)
94105
}

internal/nri/plugin.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,8 @@ func (p *plugin) Synchronize(
9393
"error", err)
9494
}
9595
}
96+
// Mark resolver as synchronized, so old agent can be safely removed.
97+
p.resolver.NRISynchronized()
9698
return nil, nil
9799
}
98100

internal/resolver/nri_api.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
package resolver
22

33
import (
4+
"errors"
45
"fmt"
6+
"net/http"
57

68
"github.com/rancher-sandbox/runtime-enforcer/internal/bpf"
79
)
@@ -123,3 +125,15 @@ func (r *Resolver) RemovePodContainerFromNri(podID PodID, containerID ContainerI
123125

124126
return r.cgroupToPolicyMapUpdateFunc(PolicyIDNone, []CgroupID{container.cgID}, bpf.RemoveCgroups)
125127
}
128+
129+
func (r *Resolver) NRISynchronized() {
130+
r.nriSynchronized.Store(true)
131+
}
132+
133+
func (r *Resolver) Ping(_ *http.Request) error {
134+
if !r.nriSynchronized.Load() {
135+
r.logger.Warn("NRI handler has not yet synchronized")
136+
return errors.New("NRI handler has not yet synchronized")
137+
}
138+
return nil
139+
}

internal/resolver/resolver.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package resolver
33
import (
44
"log/slog"
55
"sync"
6+
"sync/atomic"
67

78
"github.com/rancher-sandbox/runtime-enforcer/internal/bpf"
89
"github.com/rancher-sandbox/runtime-enforcer/internal/types/policymode"
@@ -15,8 +16,9 @@ type ContainerName = string
1516

1617
type Resolver struct {
1718
// let's see if we can split this unique lock in multiple locks later
18-
mu sync.Mutex
19-
logger *slog.Logger
19+
mu sync.Mutex
20+
logger *slog.Logger
21+
nriSynchronized atomic.Bool
2022
// todo!: we should add a cache with deleted pods/containers so that we can resolve also recently deleted ones
2123
podCache map[PodID]*podState
2224
cgroupIDToPodID map[CgroupID]PodID

test/e2e/e2e_test.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,3 +107,9 @@ func TestValidatingAdmissionPolicyPodPolicyLabel(t *testing.T) {
107107

108108
testEnv.Test(t, getValidatingAdmissionPolicyPodPolicyLabelTest())
109109
}
110+
111+
func TestRollingUpdate(t *testing.T) {
112+
t.Log("test rolling update")
113+
114+
testEnv.Test(t, getRollingUpdateTest())
115+
}

test/e2e/enforcement_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ func getEnforcementOnExistingPodsTest() types.Feature {
105105
Spec: v1alpha1.WorkloadPolicySpec{
106106
Mode: "protect",
107107
RulesByContainer: map[string]*v1alpha1.WorkloadPolicyRules{
108-
"ubuntu": &v1alpha1.WorkloadPolicyRules{
108+
"ubuntu": {
109109
Executables: tc.AllowedExecutables,
110110
},
111111
},
@@ -210,7 +210,7 @@ func getEnforcementOnNewPodsTest() types.Feature {
210210
Spec: v1alpha1.WorkloadPolicySpec{
211211
Mode: "protect",
212212
RulesByContainer: map[string]*v1alpha1.WorkloadPolicyRules{
213-
"ubuntu": &v1alpha1.WorkloadPolicyRules{
213+
"ubuntu": {
214214
Executables: tc.AllowedExecutables,
215215
},
216216
},

0 commit comments

Comments
 (0)