|
| 1 | +# Dirty Frag Kubernetes mitigation |
| 2 | +# |
| 3 | +# Disclosure: https://github.com/V4bel/dirtyfrag |
| 4 | +# |
| 5 | +# This manifest applies the Dirty Frag mitigation recommended in the disclosure |
| 6 | +# README to every Linux node in a Kubernetes cluster: |
| 7 | +# |
| 8 | +# printf 'install esp4 /bin/false\ninstall esp6 /bin/false\ninstall rxrpc /bin/false\n' \ |
| 9 | +# > /etc/modprobe.d/dirtyfrag.conf |
| 10 | +# rmmod esp4 esp6 rxrpc 2>/dev/null |
| 11 | +# echo 3 > /proc/sys/vm/drop_caches |
| 12 | +# |
| 13 | +# It runs as a DaemonSet so that: |
| 14 | +# - The mitigation is applied on every existing node, and |
| 15 | +# - It is automatically re-applied to any new node that joins the cluster |
| 16 | +# (autoscaling, node-image upgrade, scale-set rolling update, etc.) as soon |
| 17 | +# as its pod starts there. It does not gate scheduling of other workloads. |
| 18 | +# |
| 19 | +# How it works: |
| 20 | +# - An init container enters the host's PID, mount, network, IPC and UTS |
| 21 | +# namespaces with `nsenter -t 1 -m -u -i -n -p` and: |
| 22 | +# 1. Writes /etc/modprobe.d/disable-dirtyfrag.conf so esp4, esp6 and |
| 23 | +# rxrpc cannot be loaded on demand. |
| 24 | +# 2. For each module currently loaded with refcnt=0, runs `modprobe -r` |
| 25 | +# to unload it from the live kernel. |
| 26 | +# 3. Runs `sync; echo 3 > /proc/sys/vm/drop_caches` to clear any |
| 27 | +# contaminated cached pages (gated on DROP_CACHES, default true). |
| 28 | +# 4. If any module remains loaded with refcnt > 0, emits a single |
| 29 | +# aggregated Warning Kubernetes Event (reason=DirtyFragModulesInUse) |
| 30 | +# on the Node listing the in-use modules so operators can drain and |
| 31 | +# reboot/replace the node. This DaemonSet does NOT auto-cordon. |
| 32 | +# - A long-running `pause` container keeps the pod in Running state so the |
| 33 | +# init container is only re-executed when the pod is recreated (e.g. on |
| 34 | +# each new node, or after a `kubectl rollout restart` of the DaemonSet). |
| 35 | +# |
| 36 | +# Compatibility note: |
| 37 | +# esp4 and esp6 provide IPsec ESP transforms; rxrpc provides the RxRPC |
| 38 | +# socket family used by AFS. If any of your workloads (or the host network) |
| 39 | +# require these modules, do NOT apply this manifest as-is — either remove |
| 40 | +# the affected module(s) from the MODULES env var below, or label-exclude |
| 41 | +# the affected node pool. On a typical workload-only Kubernetes cluster |
| 42 | +# none of these modules are in use. |
| 43 | +# |
| 44 | +# Reverting once upstream kernel patches roll out: |
| 45 | +# 1. Run a cleanup pass first to remove the modprobe drop-in from live |
| 46 | +# nodes (the init container's CLEANUP_MODE branch removes the file, |
| 47 | +# runs `depmod -a`, and attempts a best-effort `modprobe -r` per module): |
| 48 | +# |
| 49 | +# kubectl -n kube-system set env ds/dirtyfrag-mitigation CLEANUP_MODE=true |
| 50 | +# kubectl -n kube-system rollout restart ds/dirtyfrag-mitigation |
| 51 | +# kubectl -n kube-system rollout status ds/dirtyfrag-mitigation |
| 52 | +# |
| 53 | +# 2. Then delete the resources: |
| 54 | +# |
| 55 | +# kubectl delete -f dirtyfrag-mitigation.yaml |
| 56 | +# |
| 57 | +# If you skip step 1, the modprobe drop-in remains on existing nodes until |
| 58 | +# each is recycled (node-image upgrade, scale-down, or manual drain+delete). |
| 59 | +# |
| 60 | +# Tested with Kubernetes 1.27+ on AKS, EKS, and GKE (Linux nodes only). |
| 61 | +--- |
| 62 | +apiVersion: v1 |
| 63 | +kind: ServiceAccount # identity the dirtyfrag-mitigation DaemonSet pods run as (serviceAccountName) |
| 64 | +metadata: |
| 65 | + name: dirtyfrag-mitigation |
| 66 | + namespace: kube-system |
| 67 | + labels: |
| 68 | + app.kubernetes.io/name: dirtyfrag-mitigation |
| 69 | + app.kubernetes.io/component: cve-mitigation |
| 70 | +--- |
| 71 | +apiVersion: rbac.authorization.k8s.io/v1 |
| 72 | +kind: ClusterRole |
| 73 | +metadata: |
| 74 | + name: dirtyfrag-mitigation |
| 75 | + labels: |
| 76 | + app.kubernetes.io/name: dirtyfrag-mitigation |
| 77 | + app.kubernetes.io/component: cve-mitigation |
| 78 | +rules: |
| 79 | + # Read the Node object so its UID can be placed in the Event's involvedObject. |
| 80 | + - apiGroups: [""] |
| 81 | + resources: ["nodes"] |
| 82 | + verbs: ["get"] |
| 83 | + # Create Warning Events (core v1 API, POST only) when a module is in use (refcount > 0). |
| 84 | + - apiGroups: [""] |
| 85 | + resources: ["events"] |
| 86 | + verbs: ["create"] |
| 90 | +--- |
| 91 | +apiVersion: rbac.authorization.k8s.io/v1 |
| 92 | +kind: ClusterRoleBinding # grants the dirtyfrag-mitigation ClusterRole to the kube-system/dirtyfrag-mitigation ServiceAccount |
| 93 | +metadata: |
| 94 | + name: dirtyfrag-mitigation |
| 95 | + labels: |
| 96 | + app.kubernetes.io/name: dirtyfrag-mitigation |
| 97 | + app.kubernetes.io/component: cve-mitigation |
| 98 | +roleRef: |
| 99 | + apiGroup: rbac.authorization.k8s.io |
| 100 | + kind: ClusterRole |
| 101 | + name: dirtyfrag-mitigation |
| 102 | +subjects: |
| 103 | + - kind: ServiceAccount |
| 104 | + name: dirtyfrag-mitigation |
| 105 | + namespace: kube-system |
| 106 | +--- |
| 107 | +apiVersion: apps/v1 |
| 108 | +kind: DaemonSet |
| 109 | +metadata: |
| 110 | + name: dirtyfrag-mitigation |
| 111 | + namespace: kube-system |
| 112 | + labels: |
| 113 | + app.kubernetes.io/name: dirtyfrag-mitigation |
| 114 | + app.kubernetes.io/component: cve-mitigation |
| 115 | +spec: |
| 116 | + selector: |
| 117 | + matchLabels: |
| 118 | + app.kubernetes.io/name: dirtyfrag-mitigation |
| 119 | + updateStrategy: |
| 120 | + type: RollingUpdate |
| 121 | + rollingUpdate: |
| 122 | + maxUnavailable: 100% # init container is fast; roll the whole fleet at once |
| 123 | + template: |
| 124 | + metadata: |
| 125 | + labels: |
| 126 | + app.kubernetes.io/name: dirtyfrag-mitigation |
| 127 | + app.kubernetes.io/component: cve-mitigation |
| 128 | + spec: |
| 129 | + hostPID: true # share the host PID namespace so the init container's `nsenter -t 1` targets the host's PID 1 |
| 130 | + priorityClassName: system-node-critical |
| 131 | + serviceAccountName: dirtyfrag-mitigation |
| 132 | + automountServiceAccountToken: true # the init container reads this token to call the API server |
| 133 | + # Run on every Linux node, including system/critical pools. |
| 134 | + nodeSelector: |
| 135 | + kubernetes.io/os: linux |
| 136 | + tolerations: |
| 137 | + - operator: Exists # no key given: tolerate every taint |
| 138 | + terminationGracePeriodSeconds: 5 |
| 139 | + initContainers: |
| 140 | + - name: apply-mitigation # one-shot: write modprobe drop-in, unload idle modules, optionally drop caches, report in-use modules |
| 141 | + image: busybox:1.36.1 |
| 142 | + imagePullPolicy: IfNotPresent |
| 143 | + securityContext: |
| 144 | + privileged: true |
| 145 | + runAsUser: 0 |
| 146 | + env: |
| 147 | + - name: NODE_NAME |
| 148 | + valueFrom: |
| 149 | + fieldRef: |
| 150 | + fieldPath: spec.nodeName |
| 151 | + # Node Events follow the kubelet convention of being created in |
| 152 | + # the `default` namespace; cluster-scoped objects like Nodes |
| 153 | + # cannot have a namespaced involvedObject reference. |
| 154 | + - name: EVENT_NAMESPACE |
| 155 | + value: "default" |
| 156 | + # Set CLEANUP_MODE=true (e.g. via `kubectl set env`) to flip the |
| 157 | + # init container into removing the modprobe drop-in instead of |
| 158 | + # writing it. Use this for a full rollout pass before deleting |
| 159 | + # the DaemonSet, to clean up live nodes. |
| 160 | + - name: CLEANUP_MODE |
| 161 | + value: "false" |
| 162 | + # Set DROP_CACHES=false to skip `echo 3 > /proc/sys/vm/drop_caches` |
| 163 | + # (the page-cache flush after unloading modules). Default true, |
| 164 | + # matching the disclosure's recommended mitigation. |
| 165 | + - name: DROP_CACHES |
| 166 | + value: "true" |
| 167 | + # Space-separated list of modules to blacklist + unload. Edit this |
| 168 | + # if you need to keep one of these modules available (e.g. IPsec |
| 169 | + # via esp4/esp6, AFS via rxrpc). |
| 170 | + - name: MODULES |
| 171 | + value: "esp4 esp6 rxrpc" |
| 172 | + command: ["/bin/sh", "-c"] |
| 173 | + args: |
| 174 | + - | |
| 175 | + set -eu |
| 176 | +
|
| 177 | + MODPROBE_FILE=/etc/modprobe.d/disable-dirtyfrag.conf |
| 178 | +
|
| 179 | + if [ "${CLEANUP_MODE}" = "true" ]; then |
| 180 | + echo "[dirtyfrag] CLEANUP mode on node ${NODE_NAME}: removing mitigation" |
| 181 | + nsenter -t 1 -m -u -i -n -p -- sh -c "rm -f ${MODPROBE_FILE}; depmod -a 2>/dev/null || true; for m in ${MODULES}; do modprobe -r \$m 2>/dev/null || true; done; true" |
| 182 | + echo "[dirtyfrag] cleanup complete on ${NODE_NAME}" |
| 183 | + exit 0 |
| 184 | + fi |
| 185 | +
|
| 186 | + echo "[dirtyfrag] applying mitigation on node ${NODE_NAME} for modules: ${MODULES}" |
| 187 | +
|
| 188 | + # 1. Persist modprobe blacklist so the modules cannot be loaded on demand. |
| 189 | + # Rewrite the file from scratch (idempotent) to keep ordering stable |
| 190 | + # and match the disclosure's recommended single-file form. |
| 191 | + nsenter -t 1 -m -u -i -n -p -- sh -c " |
| 192 | + set -eu |
| 193 | + TMP=\$(mktemp ${MODPROBE_FILE}.XXXXXX) |
| 194 | + for m in ${MODULES}; do |
| 195 | + printf 'install %s /bin/false\n' \"\$m\" >> \"\$TMP\" |
| 196 | + done |
| 197 | + if [ -f ${MODPROBE_FILE} ] && cmp -s \"\$TMP\" ${MODPROBE_FILE}; then |
| 198 | + rm -f \"\$TMP\" |
| 199 | + echo '[dirtyfrag] ${MODPROBE_FILE} already up to date' |
| 200 | + else |
| 201 | + mv \"\$TMP\" ${MODPROBE_FILE} |
| 202 | + chmod 0644 ${MODPROBE_FILE} |
| 203 | + echo '[dirtyfrag] wrote ${MODPROBE_FILE}' |
| 204 | + fi |
| 205 | + depmod -a 2>/dev/null || true |
| 206 | + " |
| 207 | +
|
| 208 | + # 2. For each module: if currently loaded, try to unload. Track in-use |
| 209 | + # modules so we can emit a single aggregated Warning Event. |
| 210 | + IN_USE="" |
| 211 | + for m in ${MODULES}; do |
| 212 | + REFCNT_PATH=/sys/module/${m}/refcnt |
| 213 | + if nsenter -t 1 -m -u -i -n -p -- test -f "${REFCNT_PATH}"; then |
| 214 | + REFCNT=$(nsenter -t 1 -m -u -i -n -p -- cat "${REFCNT_PATH}") |
| 215 | + echo "[dirtyfrag] ${m} is loaded with refcnt=${REFCNT}" |
| 216 | +
|
| 217 | + if [ "${REFCNT}" = "0" ]; then |
| 218 | + if nsenter -t 1 -m -u -i -n -p -- modprobe -r ${m} 2>&1; then |
| 219 | + echo "[dirtyfrag] successfully unloaded ${m}" |
| 220 | + else |
| 221 | + echo "[dirtyfrag] WARNING: modprobe -r ${m} failed despite refcnt=0" |
| 222 | + IN_USE="${IN_USE}${IN_USE:+,}${m}(rmmod-failed)" |
| 223 | + fi |
| 224 | + else |
| 225 | + echo "[dirtyfrag] WARNING: ${m} in use (refcnt=${REFCNT}); node ${NODE_NAME} requires drain+reboot for full mitigation" |
| 226 | + IN_USE="${IN_USE}${IN_USE:+,}${m}(refcnt=${REFCNT})" |
| 227 | + fi |
| 228 | + else |
| 229 | + echo "[dirtyfrag] ${m} is not loaded; modprobe blacklist will prevent future loads" |
| 230 | + fi |
| 231 | + done |
| 232 | +
|
| 233 | + # 3. Drop page caches to clear any contaminated cached pages, per the |
| 234 | + # disclosure's mitigation guidance. Best-effort. |
| 235 | + if [ "${DROP_CACHES}" = "true" ]; then |
| 236 | + if nsenter -t 1 -m -u -i -n -p -- sh -c 'sync; echo 3 > /proc/sys/vm/drop_caches' 2>/dev/null; then |
| 237 | + echo "[dirtyfrag] dropped page caches" |
| 238 | + else |
| 239 | + echo "[dirtyfrag] WARNING: failed to drop page caches" |
| 240 | + fi |
| 241 | + fi |
| 242 | +
|
| 243 | + # 4. If any module was in-use, emit a single aggregated Warning Event |
| 244 | + # on the Node so operators get an actionable signal. |
| 245 | + # Best-effort: do not fail the init container if the token read or API call fails. |
| 246 | + # BusyBox `wget --no-check-certificate` is used because BusyBox wget |
| 247 | + # does not support `--ca-certificate`; the bearer token still |
| 248 | + # authenticates us to the API server, and the endpoint is the |
| 249 | + # in-cluster `kubernetes.default.svc` ClusterIP, so skipping TLS |
| 250 | + # chain validation is an accepted trade-off for a best-effort emitter. |
| 251 | + if [ -n "${IN_USE}" ]; then |
| 252 | + TOKEN=$(cat /var/run/secrets/kubernetes.io/serviceaccount/token 2>/dev/null || true) |
| 253 | + APISERVER=https://kubernetes.default.svc |
| 254 | + NODE_UID=$(wget -qO- --no-check-certificate \ |
| 255 | + --header="Authorization: Bearer ${TOKEN}" \ |
| 256 | + "${APISERVER}/api/v1/nodes/${NODE_NAME}" 2>/dev/null | \ |
| 257 | + sed -n 's/.*"uid":[[:space:]]*"\([^"]*\)".*/\1/p' | head -1 || true) |
| 258 | + TS=$(date -u +%Y-%m-%dT%H:%M:%SZ) |
| 259 | + EVENT_NAME="dirtyfrag-mitigation.${NODE_NAME}.$(date +%s)" |
| 260 | + EVENT_BODY=$(cat <<EOF |
| 261 | + {"apiVersion":"v1","kind":"Event","metadata":{"name":"${EVENT_NAME}","namespace":"${EVENT_NAMESPACE}"},"involvedObject":{"apiVersion":"v1","kind":"Node","name":"${NODE_NAME}","uid":"${NODE_UID}"},"reason":"DirtyFragModulesInUse","message":"Dirty Frag: the following kernel modules are in use and could not be unloaded: ${IN_USE}. Drain and reboot/replace this node to fully mitigate.","type":"Warning","firstTimestamp":"${TS}","lastTimestamp":"${TS}","count":1,"source":{"component":"dirtyfrag-mitigation"}} |
| 262 | + EOF |
| 263 | + ) |
| 264 | + if wget -qO- --no-check-certificate \ |
| 265 | + --header="Authorization: Bearer ${TOKEN}" \ |
| 266 | + --header="Content-Type: application/json" \ |
| 267 | + --post-data="${EVENT_BODY}" \ |
| 268 | + "${APISERVER}/api/v1/namespaces/${EVENT_NAMESPACE}/events" >/dev/null 2>&1; then |
| 269 | + echo "[dirtyfrag] emitted Warning Event ${EVENT_NAME} (in-use: ${IN_USE})" |
| 270 | + else |
| 271 | + echo "[dirtyfrag] WARNING: failed to emit Kubernetes Event" |
| 272 | + fi |
| 273 | + fi |
| 274 | +
|
| 275 | + echo "[dirtyfrag] mitigation complete on ${NODE_NAME}" |
| 276 | + resources: |
| 277 | + requests: |
| 278 | + cpu: 10m |
| 279 | + memory: 16Mi |
| 280 | + limits: |
| 281 | + cpu: 100m |
| 282 | + memory: 64Mi |
| 283 | + containers: |
| 284 | + # Long-running placeholder so the pod stays Running and the init container |
| 285 | + # is re-executed only when the pod is recreated (new node or DaemonSet rollout). |
| 286 | + - name: pause |
| 287 | + image: registry.k8s.io/pause:3.10.1 |
| 288 | + imagePullPolicy: IfNotPresent |
| 289 | + resources: |
| 290 | + requests: |
| 291 | + cpu: 1m |
| 292 | + memory: 8Mi |
| 293 | + limits: |
| 294 | + cpu: 10m |
| 295 | + memory: 16Mi |
| 296 | + securityContext: |
| 297 | + allowPrivilegeEscalation: false |
| 298 | + readOnlyRootFilesystem: true |
| 299 | + capabilities: |
| 300 | + drop: ["ALL"] |
0 commit comments