Skip to content

Commit 70733ca

Browse files
authored
feat: add OOM stress test framework with Terraform and Kubernetes resources (#1528)
- Introduced `run.sh` script to manage OOM stress test lifecycle (apply, watch, destroy). - Created Terraform configuration for Kubernetes resources including namespaces, pods, and MondooAuditConfig. - Implemented validation script `validate.sh` to monitor scan job outcomes. - Developed integration tests in `oom_stress_test.go` to verify OOM conditions and successful scans. - Added necessary Terraform variables and outputs for configuration management. - Included example Terraform variable file for user setup. - Established `.gitignore` for Terraform state files and directories.
1 parent 2c528fc commit 70733ca

10 files changed

Lines changed: 1137 additions & 0 deletions

File tree

Lines changed: 206 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,206 @@
1+
#!/usr/bin/env bash
2+
# Copyright Mondoo, Inc. 2026
3+
# SPDX-License-Identifier: BUSL-1.1
4+
5+
# OOM Stress Test Runner
6+
#
7+
# Sets up everything via Terraform, then watches for the scan pod to OOM.
8+
#
9+
# Prerequisites:
10+
# - Cluster running with mondoo-operator installed
11+
# - terraform, kubectl available
12+
# - MONDOO_API_TOKEN or ~/.config/mondoo/mondoo.yml configured
13+
#
14+
# Usage:
15+
# cd tests/integration/oom-stress
16+
# cp terraform/terraform.example.tfvars terraform/terraform.tfvars
17+
# # edit terraform.tfvars with your mondoo_org_id
18+
# ./run.sh [apply|watch|destroy|status]
19+
20+
# Strict mode: abort on a failing command (-e), on use of an unset variable
# (-u), and treat a pipeline as failed if any stage fails (pipefail).
set -euo pipefail
21+
22+
# Absolute path of the directory containing this script, so the script works
# regardless of the caller's current directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
23+
# Terraform configuration directory; every subcommand cd's into it.
TF_DIR="${SCRIPT_DIR}/terraform"
24+
25+
# Upper bound, in seconds, on how long cmd_watch polls before giving up.
# Override with the OOM_STRESS_TIMEOUT environment variable.
TIMEOUT_SECONDS="${OOM_STRESS_TIMEOUT:-1500}" # 25 minutes
26+
# Seconds cmd_watch sleeps between polls.
POLL_INTERVAL=5
27+
28+
# Print all arguments as one line, prefixed with the current time as
# "[HH:MM:SS] ". Output goes to stdout.
log() {
  local stamp
  stamp="$(date '+%H:%M:%S')"
  printf '[%s] %s\n' "$stamp" "$*"
}
29+
30+
#######################################
# Provision the stress-test infrastructure with Terraform, print a short
# summary of what was created, then wait for the target pods to become Ready.
# Globals:
#   TF_DIR      (read) Terraform configuration directory
#   SCRIPT_DIR  (read) directory the function returns to before waiting
# Arguments:
#   None
# Outputs:
#   Progress and summary lines on stdout (via log); terraform/kubectl output.
# Returns:
#   0 on success. Terraform failures abort the script (set -e). Pods that are
#   not Ready within 300s only produce a warning.
#######################################
cmd_apply() {
  # Function-local so nothing leaks into the other subcommands, each of which
  # resolves its own namespaces.
  local memory_limit image_count target_ns

  log "Running terraform apply..."
  cd "$TF_DIR"
  terraform init -upgrade
  terraform apply -auto-approve

  # Read every output while still inside the Terraform directory.
  memory_limit=$(terraform output -raw scanner_memory_limit)
  image_count=$(terraform output -raw stress_image_count)
  target_ns=$(terraform output -raw target_namespace)

  log ""
  log "Infrastructure ready:"
  log " Space: $(terraform output -raw mondoo_space_id)"
  log " Memory limit: ${memory_limit}"
  log " Images: ${image_count}"
  log ""
  log "Waiting for target pods to be ready..."
  cd "$SCRIPT_DIR"

  # Best effort: slow image pulls must not fail the whole run, so a timeout
  # here is downgraded to a warning (kubectl's stderr is intentionally hidden).
  if ! kubectl wait --for=condition=Ready pods \
    -l app.kubernetes.io/part-of=oom-stress-test \
    -n "$target_ns" \
    --timeout=300s 2>/dev/null; then
    log "WARNING: some pods may not be ready yet"
  fi

  log "Run './run.sh watch' to monitor the scan, or it will start automatically."
}
57+
58+
#######################################
# Poll the cluster until a container-scan pod of the oom-stress-test
# MondooAuditConfig reaches a terminal outcome, or the timeout elapses.
# Globals:
#   TF_DIR           (read) Terraform directory, used to resolve the namespace
#   TIMEOUT_SECONDS  (read) maximum time to keep polling
#   POLL_INTERVAL    (read) seconds to sleep between polls
# Arguments:
#   None
# Outputs:
#   Progress and verdict lines on stdout (via log).
# Exits (the function terminates the script):
#   0  a scan pod was OOM-killed (test passed)
#   1  a scan pod succeeded without OOM, or the timeout was reached
#   2  a scan pod failed for a reason other than OOM (inconclusive)
#######################################
cmd_watch() {
  local operator_ns="mondoo-operator"
  local -r scan_label="app=mondoo-container-scan,mondoo_cr=oom-stress-test"
  local start_time elapsed pod_names cronjobs
  local pod_name pod_phase="" term_reason exit_code last_reason completed

  cd "$TF_DIR"
  if [[ -f terraform.tfstate ]]; then
    operator_ns=$(terraform output -raw operator_namespace 2>/dev/null || echo "mondoo-operator")
  fi

  start_time=$(date +%s)

  log "Watching for scan pod (label: ${scan_label})..."
  log "Timeout: ${TIMEOUT_SECONDS}s"
  log ""

  while true; do
    elapsed=$(($(date +%s) - start_time))
    if [ "$elapsed" -gt "$TIMEOUT_SECONDS" ]; then
      log "TIMEOUT: No scan pod terminated within ${TIMEOUT_SECONDS}s"
      # The status dump is purely diagnostic; never let it change the exit code.
      cmd_status || true
      exit 1
    fi

    # One listing serves both the "any pods yet?" check and the iteration
    # below. `|| true` keeps a transient kubectl/API error from aborting the
    # watcher under `set -e`/`pipefail`; it is simply retried on the next poll.
    pod_names=$(kubectl get pods -n "$operator_ns" -l "$scan_label" --no-headers \
      -o custom-columns='NAME:.metadata.name' 2>/dev/null || true)

    if [[ -z "$pod_names" ]]; then
      cronjobs=$(kubectl get cronjob -n "$operator_ns" -l "mondoo_cr=oom-stress-test" \
        --no-headers 2>/dev/null || true)
      if [[ -z "$cronjobs" ]]; then
        log " No CronJob yet... (${elapsed}s) — check operator logs if this persists"
      else
        log " CronJob exists, waiting for Job to spawn... (${elapsed}s)"
      fi
      sleep "$POLL_INTERVAL"
      continue
    fi

    while IFS= read -r pod_name; do
      [[ -n "$pod_name" ]] || continue

      pod_phase=$(kubectl get pod -n "$operator_ns" "$pod_name" \
        -o jsonpath='{.status.phase}' 2>/dev/null || echo "Unknown")
      term_reason=$(kubectl get pod -n "$operator_ns" "$pod_name" \
        -o jsonpath='{.status.containerStatuses[0].state.terminated.reason}' 2>/dev/null || true)
      exit_code=$(kubectl get pod -n "$operator_ns" "$pod_name" \
        -o jsonpath='{.status.containerStatuses[0].state.terminated.exitCode}' 2>/dev/null || true)
      last_reason=$(kubectl get pod -n "$operator_ns" "$pod_name" \
        -o jsonpath='{.status.containerStatuses[0].lastState.terminated.reason}' 2>/dev/null || true)

      # OOM is recognised either by the explicit reason (current or previous
      # container state) or by exit code 137 (128 + SIGKILL).
      if [[ "$term_reason" == "OOMKilled" || "$last_reason" == "OOMKilled" || "$exit_code" == "137" ]]; then
        # `grep -c` already prints 0 when nothing matches (and exits 1), so
        # the fallback must not print another value.
        completed=$(kubectl logs -n "$operator_ns" "$pod_name" --tail=10000 2>/dev/null \
          | grep -c "successfully uploaded" || true)
        completed=${completed:-0}
        log ""
        log "=== OOM DETECTED ==="
        log "Pod: $pod_name"
        log "Phase: $pod_phase"
        log "Elapsed: ${elapsed}s"
        log "Images: $completed completed before OOM"
        log ""
        log "PASSED: scanner OOM-killed as expected."
        exit 0
      fi

      if [[ "$pod_phase" == "Succeeded" ]]; then
        log ""
        log "=== SCAN COMPLETED WITHOUT OOM ==="
        log "Pod $pod_name completed successfully."
        log "Lower scanner_memory_limit or add more images."
        log ""
        log "FAILED: no OOM observed."
        exit 1
      fi

      # OOM-killed pods have already exited above, so a Failed pod here
      # failed for some other reason.
      if [[ "$pod_phase" == "Failed" ]]; then
        log ""
        log "=== SCAN FAILED (not OOM) ==="
        kubectl describe pod -n "$operator_ns" "$pod_name" 2>/dev/null | tail -15 || true
        log ""
        log "INCONCLUSIVE: pod failed for non-OOM reason."
        exit 2
      fi
    done <<<"$pod_names"

    log " Scan running... (${elapsed}s, phase: ${pod_phase:-unknown})"
    sleep "$POLL_INTERVAL"
  done
}
146+
147+
#######################################
# Print a diagnostic snapshot of the stress test: CronJobs, scan pods, number
# of target pods, the MondooAuditConfig and recent operator logs.
# Every section is best effort: a failing kubectl call prints a placeholder
# (or a zero count) instead of aborting the dump.
# Globals:
#   TF_DIR  (read) Terraform directory, used to resolve the namespaces
# Arguments:
#   None
# Outputs:
#   Human-readable report on stdout.
#######################################
cmd_status() {
  local operator_ns target_ns target_count audit_yaml

  cd "$TF_DIR"
  # Resolve namespaces the same way cmd_apply/cmd_watch do, falling back to
  # the defaults when no Terraform output is available.
  operator_ns=$(terraform output -raw operator_namespace 2>/dev/null || echo "mondoo-operator")
  operator_ns=${operator_ns:-mondoo-operator}
  target_ns=$(terraform output -raw target_namespace 2>/dev/null || echo "oom-stress-targets")
  target_ns=${target_ns:-oom-stress-targets}

  log "=== Cluster State ==="
  echo ""
  echo "--- CronJobs ---"
  kubectl get cronjobs -n "$operator_ns" -l "mondoo_cr=oom-stress-test" 2>/dev/null || echo "(none)"
  echo ""
  echo "--- Scan Pods ---"
  kubectl get pods -n "$operator_ns" -l "app=mondoo-container-scan,mondoo_cr=oom-stress-test" -o wide 2>/dev/null || echo "(none)"
  echo ""
  echo "--- Target Pods ---"
  # Guard kubectl inside the pipeline so a failure yields "0 target pods"
  # rather than tripping pipefail + set -e.
  target_count=$({ kubectl get pods -n "$target_ns" \
    -l "app.kubernetes.io/part-of=oom-stress-test" --no-headers 2>/dev/null || true; } \
    | wc -l | tr -d ' ')
  echo "${target_count} target pods"
  echo ""
  echo "--- MondooAuditConfig ---"
  # Capture first, then truncate: piping kubectl straight into `head` can
  # report a spurious failure (SIGPIPE) under pipefail.
  if audit_yaml=$(kubectl get mondooauditconfig -n "$operator_ns" oom-stress-test -o yaml 2>/dev/null); then
    head -n 30 <<<"$audit_yaml"
  else
    echo "(not found)"
  fi
  echo ""
  echo "--- Operator Logs (last 10) ---"
  kubectl logs -n "$operator_ns" deploy/mondoo-operator-controller-manager --tail=10 2>/dev/null || echo "(no operator)"
}
169+
170+
#######################################
# Tear down everything the Terraform configuration created.
# Globals:
#   TF_DIR  (read) Terraform configuration directory
# Arguments:
#   None
# Outputs:
#   Progress lines on stdout (via log) plus terraform's own output.
#######################################
cmd_destroy() {
  log "Destroying terraform resources..."
  cd -- "${TF_DIR}"
  terraform destroy -auto-approve
  log "Done."
}
176+
177+
# --- Main ---
178+
179+
# Dispatch on the first argument; with no argument, provision and then watch.
case "${1:-apply-and-watch}" in
  apply | watch | status | destroy)
    # Single-step subcommands map 1:1 onto a cmd_<name> function.
    "cmd_${1}"
    ;;
  apply-and-watch)
    cmd_apply
    cmd_watch
    ;;
  *)
    # Unknown subcommand: show usage and fail.
    printf '%s\n' \
      "Usage: $0 [apply|watch|apply-and-watch|status|destroy]" \
      "" \
      " apply-and-watch (default) Provision + watch for OOM" \
      " apply Provision infrastructure only" \
      " watch Watch for scan pod OOM (after apply)" \
      " status Show current cluster state" \
      " destroy Tear down all resources"
    exit 1
    ;;
esac
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
.terraform/
2+
.terraform.lock.hcl
3+
terraform.tfstate
4+
terraform.tfstate.backup
5+
terraform.tfvars
Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
# Copyright Mondoo, Inc. 2026
2+
# SPDX-License-Identifier: BUSL-1.1
3+
4+
# Kubernetes resources for the OOM stress test:
5+
# - Target namespace with package-heavy pods for scanning
6+
# - Mondoo credentials secret
7+
# - MondooAuditConfig with reduced memory limits
8+
9+
# --- Target namespace + pods ---
10+
11+
# Namespace for the scan targets. Its name comes from var.target_namespace;
# the stress_target pods are created inside it and the MondooAuditConfig
# filter includes that same namespace name.
resource "kubernetes_namespace_v1" "targets" {
12+
metadata {
13+
name = var.target_namespace
14+
}
15+
}
16+
17+
# One pod per entry in var.stress_images, keyed by the entry's `name`
# (for_each requires those names to be unique). Each entry supplies the pod
# name and the container image.
resource "kubernetes_pod_v1" "stress_target" {
18+
for_each = { for img in var.stress_images : img.name => img }
19+
20+
metadata {
21+
name = each.value.name
22+
namespace = kubernetes_namespace_v1.targets.metadata[0].name
23+
24+
# The part-of label is what run.sh selects on when waiting for / counting
# target pods.
labels = {
25+
"app.kubernetes.io/name" = each.value.name
26+
"app.kubernetes.io/part-of" = "oom-stress-test"
27+
}
28+
}
29+
30+
spec {
31+
container {
32+
name = "target"
33+
image = each.value.image
34+
# Override the image's command so the container just idles instead of
# running its normal workload.
command = ["sleep", "infinity"]
35+
}
36+
}
37+
}
38+
39+
# --- Mondoo credentials ---
40+
41+
# Secret "mondoo-client" in the operator namespace. The MondooAuditConfig
# below references it by name through mondooCredsSecretRef.
resource "kubernetes_secret_v1" "mondoo_client" {
42+
metadata {
43+
name = "mondoo-client"
44+
namespace = var.operator_namespace
45+
}
46+
47+
# The service-account credential is base64-decoded before being stored
# under the `config` key.
data = {
48+
config = base64decode(mondoo_service_account.oom_stress.credential)
49+
}
50+
51+
lifecycle {
52+
# NOTE: prevent_destroy = false is Terraform's default, so this block does
53+
# NOT protect the secret: `terraform destroy` will delete it. Deleting it
# while the operator is still running causes reconcile errors; set this to
# true (or remove the operator first) if that matters.
prevent_destroy = false
55+
}
56+
}
57+
58+
# --- Docker Hub pull secret (avoids rate limits) ---
59+
60+
# Optional Docker Hub pull secret in the operator namespace. It is created
# only when var.docker_hub_username is non-empty.
# NOTE(review): the name "mondoo-private-registries-secrets" is presumably
# the secret name the operator looks up for registry credentials — confirm.
resource "kubernetes_secret_v1" "docker_pull" {
61+
count = var.docker_hub_username != "" ? 1 : 0
62+
63+
metadata {
64+
name = "mondoo-private-registries-secrets"
65+
namespace = var.operator_namespace
66+
}
67+
68+
type = "kubernetes.io/dockerconfigjson"
69+
70+
# Docker config JSON with a single registry entry for Docker Hub; `auth` is
# the base64 encoding of "username:password".
data = {
71+
".dockerconfigjson" = jsonencode({
72+
auths = {
73+
"https://index.docker.io/v1/" = {
74+
username = var.docker_hub_username
75+
password = var.docker_hub_password
76+
auth = base64encode("${var.docker_hub_username}:${var.docker_hub_password}")
77+
}
78+
}
79+
})
80+
}
81+
}
82+
83+
# --- MondooAuditConfig ---
84+
85+
# MondooAuditConfig "oom-stress-test" in the operator namespace. It enables
# container scanning with the configured schedule and memory request/limit,
# restricted to the target namespace.
# NOTE(review): kubernetes_manifest generally needs the CRD to exist in the
# cluster already, which matches the script's prerequisite that
# mondoo-operator is installed — confirm for your provider version.
resource "kubernetes_manifest" "audit_config" {
86+
manifest = {
87+
apiVersion = "k8s.mondoo.com/v1alpha2"
88+
kind = "MondooAuditConfig"
89+
90+
metadata = {
91+
name = "oom-stress-test"
92+
namespace = var.operator_namespace
93+
}
94+
95+
spec = {
96+
# Credentials come from the mondoo-client secret defined in this file.
mondooCredsSecretRef = {
97+
name = kubernetes_secret_v1.mondoo_client.metadata[0].name
98+
}
99+
100+
# Using service account directly, not a console integration
101+
consoleIntegration = {
102+
enable = false
103+
}
104+
105+
# Container-image scanning: the memory limit set here is the knob the
# stress test relies on to provoke an OOM kill.
containers = {
106+
enable = true
107+
schedule = var.scan_schedule
108+
resources = {
109+
limits = {
110+
memory = var.scanner_memory_limit
111+
}
112+
requests = {
113+
memory = var.scanner_memory_request
114+
}
115+
}
116+
}
117+
118+
# Only scan workloads in the stress-target namespace.
filtering = {
119+
namespaces = {
120+
include = [var.target_namespace]
121+
}
122+
}
123+
}
124+
}
125+
126+
# Create the audit config only after the credentials secret, the target
# namespace and all target pods exist.
depends_on = [
127+
kubernetes_secret_v1.mondoo_client,
128+
kubernetes_namespace_v1.targets,
129+
kubernetes_pod_v1.stress_target,
130+
]
131+
}

0 commit comments

Comments
 (0)