Skip to content

Commit 111223b

Browse files
committed
Add test for noisy neighbors
1 parent 864ea3e commit 111223b

1 file changed

Lines changed: 108 additions & 0 deletions

File tree

.github/workflows/e2e-ci.yml

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
name: E2E Fleet
33

44
on:
5+
workflow_dispatch:
56
pull_request:
67
push:
78
branches:
@@ -48,6 +49,113 @@ jobs:
4849
with:
4950
go-version-file: 'go.mod'
5051
check-latest: true
52+
-
53+
name: Noisy Neighbors
54+
run: |
55+
#!/usr/bin/env bash
# Noisy-neighbor probe: configuration and banner.
# Env vars:
#   DURATION - seconds to run the probe (default 10)
#   WORKERS  - number of parallel workers; 0 (default) means one per CPU
set -euo pipefail

#######################################
# Normalize a non-negative integer setting.
# Arguments: $1 - candidate value
#            $2 - fallback used when $1 is not a plain decimal integer
#            $3 - setting name, used only in the warning message
# Outputs:   the normalized value (leading zeros removed) or the fallback
#            on stdout; a warning on stderr when the fallback is used
#######################################
sanitize_uint() {
  local value="$1" fallback="$2" label="$3"
  if [[ "$value" =~ ^[0-9]+$ ]]; then
    # Force base 10 so values such as "08" are not rejected as bad octal
    # by later arithmetic comparisons.
    printf '%d\n' "$((10#$value))"
  else
    printf 'warning: %s=%q is not a non-negative integer; using %s\n' \
      "$label" "$value" "$fallback" >&2
    printf '%s\n' "$fallback"
  fi
}

DURATION="$(sanitize_uint "${DURATION:-10}" 10 DURATION)" # seconds to run the probe
WORKERS="$(sanitize_uint "${WORKERS:-0}" 0 WORKERS)"      # default: use all CPUs; set to cap parallelism

echo "=== Noisy neighbor probe ==="
date
uname -a || true

# Fall back to a single CPU when nproc is unavailable or fails.
CPUS="$(nproc || echo 1)"
[[ "$WORKERS" -gt 0 ]] || WORKERS="$CPUS"
echo "CPUs: $CPUS | Workers: $WORKERS | Duration: ${DURATION}s"
68+
69+
# Resolve cgroup v2 cpu.stat for this process.
# The awk filter prints the third ':'-separated field of the /proc/self/cgroup
# line whose first field is "0"; that path is appended to /sys/fs/cgroup.
# A missing or unreadable /proc/self/cgroup simply yields an empty sub-path.
CGROUP_SUBPATH="$(awk -F: '$1=="0"{print $3}' /proc/self/cgroup 2>/dev/null || true)"
CG_BASE="/sys/fs/cgroup${CGROUP_SUBPATH}"
CPU_STAT_PATH="${CG_BASE}/cpu.stat"
if [[ ! -f "$CPU_STAT_PATH" ]]; then
  # Fallbacks for odd layouts: use the cpu.stat at the cgroup mount root.
  if [[ -f "/sys/fs/cgroup/cpu.stat" ]]; then
    CPU_STAT_PATH="/sys/fs/cgroup/cpu.stat"
  fi
fi
77+
78+
#######################################
# Print the cgroup cpu.stat file, or a marker line when it is absent.
# Globals:   CPU_STAT_PATH (read; may be unset)
# Outputs:   a header naming the file followed by its contents, or
#            "--- cgroup cpu.stat not found ---"
#######################################
print_cpu_stat() {
  # Default to empty so the function is safe under `set -u` even when the
  # path was never resolved.
  local stat_path="${CPU_STAT_PATH:-}"
  if [[ -f "$stat_path" ]]; then
    echo "--- cgroup cpu.stat ($stat_path) ---"
    cat "$stat_path"
  else
    echo "--- cgroup cpu.stat not found ---"
  fi
}
86+
87+
#######################################
# Print pressure-stall information for cpu, io and memory.
# Resources whose /proc/pressure/<name> file does not exist are skipped.
# Outputs:   for each available resource, a "--- PSI <name> ---" header
#            followed by the file contents
#######################################
print_psi() {
  # Both names are local so the caller's variables are never clobbered.
  local resource path
  for resource in cpu io memory; do
    path="/proc/pressure/$resource"
    if [[ -f "$path" ]]; then
      echo "--- PSI $resource ---"
      # The file may exist yet fail to read (presumably when PSI is disabled
      # on the host); keep the probe best-effort instead of aborting.
      cat "$path" 2>/dev/null || echo "(unavailable)"
    fi
  done
}
96+
97+
#######################################
# Print the system load averages.
# Outputs:   a header line followed by the contents of /proc/loadavg, or
#            "(unavailable)" when that file cannot be read
#######################################
print_load() {
  echo "--- Load /proc/loadavg ---"
  # Best-effort: a missing /proc/loadavg must not abort the whole probe.
  if [[ -r /proc/loadavg ]]; then
    cat /proc/loadavg
  else
    echo "(unavailable)"
  fi
}
101+
102+
# Snapshot load, PSI and cgroup CPU stats before any workload is started so
# they can be compared with the post-run snapshot.
echo
echo "== Baseline =="
print_load
print_psi
print_cpu_stat

echo
echo "== Running CPU workload for ${DURATION}s with ${WORKERS} worker(s)... =="
110+
111+
#######################################
# Worker: CPU-bound hash of an endless stream; avoids disk and is available
# on runners.
# Globals:   DURATION (read when no argument is given)
# Arguments: $1 - optional run time in seconds (defaults to DURATION)
# Returns:   the exit status of `timeout`
#######################################
run_worker() {
  local duration="${1:-${DURATION}}"
  timeout "${duration}s" bash -c 'yes | md5sum > /dev/null'
}
115+
116+
# Start WORKERS background workers and remember their PIDs for the later wait.
PIDS=()
for ((i = 0; i < WORKERS; i++)); do
  run_worker &
  PIDS+=("$!")
done

# While workers run, sample PSI every second (lightweight).
# One iteration per second of DURATION keeps sampling aligned with the
# workers' own timeout.
SAMPLES=()
for ((i = 0; i < DURATION; i++)); do
  if [[ -f /proc/pressure/cpu ]]; then
    # First line of the file is the "some ..." line. A failed read is skipped
    # rather than allowed to abort the probe under `set -e`.
    if line="$(sed -n '1p' /proc/pressure/cpu 2>/dev/null)"; then
      SAMPLES+=("$(date +%H:%M:%S) $line")
    fi
  fi
  sleep 1
done

# Wait for workers; their exit status is deliberately ignored because a
# worker stopped by its timeout does not exit with status 0.
for p in "${PIDS[@]}"; do wait "$p" 2>/dev/null || true; done
134+
135+
# Second snapshot, taken after all workers have finished, for comparison
# with the baseline.
echo
echo "== After workload =="
print_load
print_psi
print_cpu_stat

# Only print the sample section when at least one PSI sample was collected.
if [[ ${#SAMPLES[@]} -gt 0 ]]; then
  echo
  echo "== PSI cpu 'some' samples during run (1/sec) =="
  printf '%s\n' "${SAMPLES[@]}"
fi
146+
147+
# Print static guidance for reading the numbers above. The quoted delimiter
# keeps the text literal (no variable or command expansion).
echo
echo "== Interpretation hints =="
cat <<'HINTS'
• If loadavg (first number) >> CPU count while running, the run queue is saturated. That can be your own parallelism, platform limits, or neighbors.
• In PSI:
- cpu: 'some' rising toward 100% means tasks frequently stalled on CPU; 'full' > 0% means the whole workload often couldn't run at all.
- io/memory pressure > 0% implies stalls waiting on disk/cache/mem — noisy neighbors or shared resource contention.
• In cpu.stat:
- nr_throttled / throttled_usec increasing => cgroup CPU quota throttling (platform-imposed). That looks like "steal", but it's policy not hypervisor steal.
• If PSI stays near zero, load ~= workers, and no throttling, the bottleneck is likely your code or chosen parallelism, not neighbors.
HINTS
158+
51159
-
52160
name: Install Ginkgo CLI
53161
run: go install github.com/onsi/ginkgo/v2/ginkgo

0 commit comments

Comments
 (0)