Skip to content

Commit 45cfd00

Browse files
committed
feat: prevent containers from running in NRI
As part of error handling, when we fail to apply protection to a container, we fail the container creation flow by default. Users can use the NRI_FAILOPEN envvar to change this behavior. Signed-off-by: Sam Wang (holyspectral) <sam.wang@suse.com>
1 parent ab69ca9 commit 45cfd00

File tree

5 files changed

+77
-25
lines changed

5 files changed

+77
-25
lines changed

cspell.json

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -84,8 +84,10 @@
8484
"printk",
8585
"ringbuf",
8686
"cgtracker",
87-
"runtimeenforcer",
88-
"agentv1"
87+
"runtimeenforcer",
88+
"agentv1",
89+
"failopen",
90+
"ttrpc"
8991
],
9092
"ignoreWords": [
9193
"clientgoscheme",
@@ -118,7 +120,9 @@
118120
"workloadpolicyhandler",
119121
"rawtime",
120122
"exepath",
121-
"pexepath"
123+
"pexepath",
124+
"workloadkind",
125+
"policymode"
122126
],
123127
"import": []
124-
}
128+
}

internal/nri/handler.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"fmt"
77
"log/slog"
88
"net"
9+
"os"
910
"time"
1011

1112
retry "github.com/avast/retry-go/v4"
@@ -33,6 +34,7 @@ func newNRIPlugin(
3334
p := &plugin{
3435
logger: logger.With("component", "nri-plugin"),
3536
resolver: resolver,
37+
failOpen: os.Getenv("NRI_FAILOPEN") == "true",
3638
}
3739

3840
p.stub, err = stub.New(p, opts...)

internal/nri/plugin.go

Lines changed: 46 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ type plugin struct {
1818
logger *slog.Logger
1919
resolver *resolver.Resolver
2020
lastErr error
21+
failOpen bool
2122
}
2223

2324
func (p *plugin) getWorkloadInfoAndLog(ctx context.Context, pod *api.PodSandbox) (string, workloadkind.Kind) {
@@ -133,8 +134,13 @@ func (p *plugin) Synchronize(
133134
"containers", containers,
134135
)
135136
if err := p.resolver.AddPodContainerFromNri(podData); err != nil {
137+
// This could be recoverable. Returning an error so we can retry.
136138
p.logger.ErrorContext(ctx, "failed to add pod container from NRI",
137-
"error", err)
139+
"pod", pod.GetName(),
140+
"namespace", pod.GetNamespace(),
141+
"error", err,
142+
)
143+
return nil, fmt.Errorf("failed to add pod container from NRI: %w", err)
138144
}
139145
}
140146
// Mark resolver as synchronized, so old agent can be safely removed.
@@ -152,13 +158,46 @@ func (p *plugin) StartContainer(
152158
"containerName", container.GetName(),
153159
)
154160

161+
handleError := func(reason string, err error) error {
162+
if p.failOpen {
163+
p.logger.WarnContext(
164+
ctx,
165+
"container is starting WITHOUT enforcement due to NRI_FAILOPEN",
166+
"reason", reason,
167+
"error", err,
168+
"containerID", container.GetId(),
169+
"containerName", container.GetName(),
170+
"podName", pod.GetName(),
171+
"podID", pod.GetUid(),
172+
)
173+
return nil
174+
}
175+
p.logger.ErrorContext(
176+
ctx,
177+
"The container is prevented from starting. To change this behavior, set environment variable NRI_FAILOPEN to true",
178+
"reason",
179+
reason,
180+
"containerName",
181+
container.GetName(),
182+
"podName",
183+
pod.GetName(),
184+
"error",
185+
err,
186+
)
187+
return fmt.Errorf(
188+
"%s: %w. The container '%s/%s' is prevented from starting. To change this behavior, set environment variable NRI_FAILOPEN to true",
189+
reason,
190+
err,
191+
pod.GetName(),
192+
container.GetName(),
193+
)
194+
}
195+
155196
cgroupID, err := cgroupFromContainer(container)
156197
if err != nil {
157-
// this should never happen but if we are not able to obtain the cgroup ID, it's useless to add the container
158-
// to the cache, nobody will ever query this entry into the cache.
159-
p.logger.ErrorContext(ctx, "failed to get cgroup ID from container",
160-
"error", err)
161-
return nil
198+
// this should never happen because we've succeeded before in Synchronize() call.
199+
// When this happens, it indicates a serious inconsistency in the system.
200+
return handleError("failed to get cgroup ID from container", err)
162201
}
163202

164203
workloadName, workloadKind := p.getWorkloadInfoAndLog(ctx, pod)
@@ -174,8 +213,7 @@ func (p *plugin) StartContainer(
174213
}
175214

176215
if err = p.resolver.AddPodContainerFromNri(podData); err != nil {
177-
p.logger.ErrorContext(ctx, "failed to add pod container from NRI",
178-
"error", err)
216+
return handleError("failed to add pod container from NRI", err)
179217
}
180218
return nil
181219
}

internal/resolver/nri_api.go

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ func (r *Resolver) AddPodContainerFromNri(pod PodInput) error {
3535
// If everything is identical, as expected, we can just continue
3636
continue
3737
}
38+
// If it's different, this is unexpected and we return an error to avoid potential issues
39+
// with wrong cgroupID -> pod association in the cache.
3840
return fmt.Errorf("containerID %s for pod %s already exists. old (name: %s,cID: %d) new (name: %s,cID: %d)",
3941
containerID,
4042
pod.Meta.Name,
@@ -51,21 +53,20 @@ func (r *Resolver) AddPodContainerFromNri(pod PodInput) error {
5153

5254
// update the cgtracker map
5355
if err := r.cgTrackerUpdateFunc(container.CgroupID, ""); err != nil {
54-
r.logger.Error("failed to update cgroup tracker map",
55-
"pod", podID,
56-
"containerID", containerID,
57-
"error", err)
58-
continue
56+
return fmt.Errorf(
57+
"failed to update cgroup tracker map for pod %s, container %s: %w",
58+
pod.Meta.Name,
59+
container.Name,
60+
err,
61+
)
5962
}
6063
}
6164

6265
// we update back the cache
6366
r.podCache[podID] = state
6467

6568
if err := r.applyPolicyToPodIfPresent(state); err != nil {
66-
r.logger.Error("failed to apply policy to pod",
67-
"error", err,
68-
)
69+
return fmt.Errorf("failed to apply policy to pod: %w", err)
6970
}
7071
return nil
7172
}

internal/resolver/policy.go

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -132,12 +132,19 @@ func (r *Resolver) applyPolicyToPodIfPresent(state *podEntry) error {
132132
key := fmt.Sprintf("%s/%s", state.podNamespace(), policyName)
133133
info := r.wpState[key]
134134
if info == nil {
135-
return fmt.Errorf(
136-
"pod has policy label but policy does not exist. pod-name: %s, pod-namespace: %s, policy-name: %s",
137-
state.podName(),
138-
state.podNamespace(),
139-
policyName,
135+
// This can happen when the pod runs before the policy is created/reconciled when using GitOps to deploy.
136+
// After the policy is reconciled, the policy will be applied, so we can safely ignore it for now.
137+
//
138+
// Another case is that the policy is just not created at all, which is likely an user error.
139+
// We log a warning for both cases and return without applying any policy.
140+
// This is to avoid the risk of blocking the pod creation unexpectedly.
141+
r.logger.Warn(
142+
"pod has policy label but policy does not exist. .",
143+
"pod-name", state.podName(),
144+
"pod-namespace", state.podNamespace(),
145+
"policy-name", policyName,
140146
)
147+
return nil
141148
}
142149

143150
return r.applyPolicyToPod(state, info.polByContainer)

0 commit comments

Comments
 (0)