|
2 | 2 | name: E2E Fleet |
3 | 3 |
|
4 | 4 | on: |
| 5 | + workflow_dispatch: |
5 | 6 | pull_request: |
6 | 7 | push: |
7 | 8 | branches: |
@@ -48,6 +49,113 @@ jobs: |
48 | 49 | with: |
49 | 50 | go-version-file: 'go.mod' |
50 | 51 | check-latest: true |
| 52 | + - |
| 53 | + name: Noisy Neighbors |
| 54 | + run: | |
#!/usr/bin/env bash
# Noisy-neighbor probe: prints load, PSI and cgroup CPU statistics around a
# short CPU-bound workload so slow CI runs can be diagnosed.
#
# Environment:
#   DURATION  seconds to run the probe (default: 10)
#   WORKERS   number of parallel workers; 0 means one per CPU (default: 0)
set -euo pipefail

#######################################
# Print a setting as a base-10 non-negative integer.
# Arguments: $1 - raw value
#            $2 - fallback printed when $1 is not a non-negative integer
#            $3 - setting name, used only in the warning
# Outputs:   the normalised value on stdout; a warning on stderr on fallback
#######################################
to_uint() {
  local raw=$1 fallback=$2 label=$3
  if [[ "$raw" =~ ^[0-9]+$ ]]; then
    # Force base 10 so a value such as "08" is not rejected as invalid octal.
    printf '%d\n' "$((10#$raw))"
  else
    printf 'warning: %s=%q is not a non-negative integer; using %s\n' \
      "$label" "$raw" "$fallback" >&2
    printf '%s\n' "$fallback"
  fi
}

DURATION="$(to_uint "${DURATION:-10}" 10 DURATION)" # seconds to run the probe
WORKERS="$(to_uint "${WORKERS:-0}" 0 WORKERS)"      # default: use all CPUs; set to cap parallelism

echo "=== Noisy neighbor probe ==="
date
uname -a || true

# Fall back to a single CPU when nproc is unavailable.
CPUS="$(nproc || echo 1)"
[[ "$WORKERS" -gt 0 ]] || WORKERS="$CPUS"
echo "CPUs: $CPUS | Workers: $WORKERS | Duration: ${DURATION}s"

# Resolve cgroup v2 cpu.stat for this process: take the path field of the
# /proc/self/cgroup entry whose hierarchy ID is 0.
CGROUP_SUBPATH="$(awk -F: '$1=="0"{print $3}' /proc/self/cgroup || true)"
CG_BASE="/sys/fs/cgroup${CGROUP_SUBPATH}"
CPU_STAT_PATH="${CG_BASE}/cpu.stat"
if [[ ! -f "$CPU_STAT_PATH" && -f /sys/fs/cgroup/cpu.stat ]]; then
  # Fallback for odd layouts
  CPU_STAT_PATH="/sys/fs/cgroup/cpu.stat"
fi
| 77 | +
|
#######################################
# Print the cgroup cpu.stat file, or a "not found" marker when the path does
# not name a regular file.
# Globals:   CPU_STAT_PATH (read) - path of the cpu.stat file to dump
# Outputs:   a header line followed by the file contents, on stdout
# Returns:   0 on the normal paths; a read failure is reported inline instead
#            of failing, so a `set -e` caller is not aborted by a diagnostic
#######################################
print_cpu_stat() {
  local stat_file="${CPU_STAT_PATH:-}" # tolerate an unset path under `set -u`
  if [[ ! -f "$stat_file" ]]; then
    echo "--- cgroup cpu.stat not found ---"
    return 0
  fi
  echo "--- cgroup cpu.stat ($stat_file) ---"
  cat -- "$stat_file" || echo "(unreadable)"
}
| 86 | +
|
#######################################
# Print pressure-stall information (PSI) for cpu, io and memory.
# Arguments: $1 - directory holding the PSI files (default: /proc/pressure)
# Outputs:   for each resource whose file exists, a "--- PSI <name> ---"
#            header followed by the file contents, on stdout
# Returns:   0; resources without a file are skipped silently
#######################################
print_psi() {
  local psi_dir="${1:-/proc/pressure}"
  local resource psi_file
  for resource in cpu io memory; do
    psi_file="${psi_dir}/${resource}"
    [[ -f "$psi_file" ]] || continue
    echo "--- PSI $resource ---"
    # NOTE(review): the file can exist and still fail to read (reportedly when
    # PSI is disabled at boot); report that inline rather than abort the probe.
    cat -- "$psi_file" || echo "(unreadable)"
  done
}
| 96 | +
|
#######################################
# Print the system load averages from /proc/loadavg.
# Outputs:   a header line, then the file contents or a placeholder when the
#            file is missing or unreadable, on stdout
# Returns:   0; like the other print_* helpers, a missing or unreadable file
#            does not fail the caller
#######################################
print_load() {
  echo "--- Load /proc/loadavg ---"
  if [[ -f /proc/loadavg ]]; then
    cat /proc/loadavg || echo "(unreadable)"
  else
    echo "(not available)"
  fi
}
| 101 | +
|
# Snapshot load, PSI and cgroup CPU stats before any load is generated, so the
# post-run numbers have something to be compared against.
printf '\n'
printf '%s\n' "== Baseline =="
print_load
print_psi
print_cpu_stat

# Announce the workload that is about to start.
printf '\n'
printf '== Running CPU workload for %ss with %s worker(s)... ==\n' "$DURATION" "$WORKERS"
| 110 | +
|
#######################################
# Worker: keep one CPU busy by hashing an endless stream. Avoids disk and uses
# only tools that are available on runners.
# Globals:   DURATION (read) - default run time in seconds
# Arguments: $1 - run time in seconds (default: $DURATION)
# Returns:   0 when the workload ran until the time limit; otherwise the
#            non-zero status reported by `timeout`
#######################################
run_worker() {
  local seconds="${1:-$DURATION}"
  local rc=0
  timeout "${seconds}s" bash -c 'yes | md5sum > /dev/null' || rc=$?
  # 124 is timeout's "time limit reached" status, which is the expected way
  # for this endless workload to end; anything else is a real failure.
  if ((rc == 124)); then
    return 0
  fi
  return "$rc"
}
| 115 | +
|
# Launch one background worker per requested slot and remember the PIDs so
# they can be reaped below.
PIDS=()
for ((w = 0; w < WORKERS; w++)); do
  run_worker &
  PIDS+=("$!")
done

# While workers run, sample PSI every second (lightweight): keep only the
# first line of /proc/pressure/cpu, i.e. the "some ..." line.
SAMPLES=()
for ((i = 0; i < DURATION; i++)); do
  # Best effort: a missing or unreadable PSI file just yields no sample
  # instead of aborting the probe under `set -e`.
  if [[ -f /proc/pressure/cpu ]] &&
    { IFS= read -r line </proc/pressure/cpu; } 2>/dev/null; then
    SAMPLES+=("$(date +%H:%M:%S) $line")
  fi
  sleep 1
done

# Wait for workers. Their exit status is deliberately ignored: the probe is
# diagnostic and must not fail the job.
for p in "${PIDS[@]}"; do wait "$p" 2>/dev/null || true; done
| 134 | +
|
# Second snapshot, taken once every worker has been waited for.
printf '\n== After workload ==\n'
print_load
print_psi
print_cpu_stat

# Replay the per-second PSI samples collected during the run, if any.
if ((${#SAMPLES[@]} > 0)); then
  printf '\n'
  printf '%s\n' "== PSI cpu 'some' samples during run (1/sec) =="
  printf '%s\n' "${SAMPLES[@]}"
fi

# Static reading guide; the quoted delimiter keeps the text literal.
printf '\n%s\n' "== Interpretation hints =="
cat <<'HINTS'
• If loadavg (first number) >> CPU count while running, the run queue is saturated. That can be your own parallelism, platform limits, or neighbors.
• In PSI:
- cpu: 'some' rising toward 100% means tasks frequently stalled on CPU; 'full' > 0% means the whole workload often couldn't run at all.
- io/memory pressure > 0% implies stalls waiting on disk/cache/mem — noisy neighbors or shared resource contention.
• In cpu.stat:
- nr_throttled / throttled_usec increasing => cgroup CPU quota throttling (platform-imposed). That looks like "steal", but it's policy not hypervisor steal.
• If PSI stays near zero, load ~= workers, and no throttling, the bottleneck is likely your code or chosen parallelism, not neighbors.
HINTS
| 158 | +
|
51 | 159 | - |
52 | 160 | name: Install Ginkgo CLI |
53 | 161 | run: go install github.com/onsi/ginkgo/v2/ginkgo |
|
0 commit comments