Skip to content

Commit 0920c37

Browse files
authored
Merge pull request #3 from ffengc/dev
update codes, figs, docs, #before pre version
2 parents 27f8770 + 155138f commit 0920c37

29 files changed

Lines changed: 1240 additions & 29 deletions

Predictor.hpp

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,19 @@ class Predictor {
3434
double _drift; // Steady-state tolerance band (allowed deviation)
3535
double _threshold; // CUSUM alarm threshold
3636

37+
// --- Standardized CUSUM state (used only by AdaptivePredict) ---
38+
// Tracks EWMSD: an EWMA of |rps - baseline|, used as a running-scale proxy
39+
// for the typical deviation magnitude. Each observation is divided by
40+
// max(sigma, min_sigma) to produce a dimensionless z-score, so the alarm
41+
// threshold is invariant to the absolute RPS level and to clock-aliasing
42+
// effects that make a single window measure lower than its phase average.
43+
double _sigma_ewma; // EWMSD estimate of typical |deviation|
44+
double _beta; // EWMSD smoothing factor
45+
double _k_std; // Page-Hinkley allowance in z-score units
46+
double _h_std; // standardized alarm threshold
47+
double _min_sigma; // lower clamp on sigma (avoid z explosion at startup)
48+
double _cusum_std; // standardized CUSUM accumulator
49+
3750
double _avg_service_time; // T: average service time per request (seconds)
3851
int _safety_margin; // M_safety: extra workers for prediction error buffer
3952

@@ -49,14 +62,27 @@ class Predictor {
4962
int safety_margin = 1)
5063
: _alpha(alpha), _ewma(0),
5164
_cusum(0), _drift(drift), _threshold(threshold),
65+
_sigma_ewma(0), _beta(0.3), _k_std(0.5), _h_std(3.0), _min_sigma(1.0),
66+
_cusum_std(0),
5267
_avg_service_time(avg_service_time),
5368
_safety_margin(safety_margin) {}
5469

5570
// Returns the current smoothed estimate of T, the per-request service time in
// seconds, as maintained by UpdateServiceTime(). Used for Little's Law sizing.
double GetAvgServiceTime() const { return _avg_service_time; }
5671

5772
// Feedback from DispatchPool threads: feed observed end-to-end latency (seconds)
5873
// so T tracks real workload characteristics rather than a static guess.
74+
//
75+
// Sanity clamp: worker.py sleeps 0.5s per request; cold-start fallback is 0.8s;
76+
// legitimate end-to-end never exceeds ~2s even under queuing. Observations > 5s
77+
// indicate pathological state (OS backpressure, client suspend, late-arriving
78+
// socket completion) and must NOT be smoothed into T — once observed before:
79+
// T drifted 0.5s -> 91s -> 322s -> Little's Law produced Target=443 -> OOM.
5980
void UpdateServiceTime(double observed_seconds) {
81+
if (observed_seconds > 5.0 || observed_seconds < 0.0) {
82+
logMessage(WARNING, "[Predictor] Discarded anomalous service time %.3fs (T kept at %.3fs)",
83+
observed_seconds, _avg_service_time);
84+
return;
85+
}
6086
_avg_service_time = 0.1 * observed_seconds + 0.9 * _avg_service_time;
6187
}
6288

@@ -92,6 +118,62 @@ class Predictor {
92118
return target;
93119
}
94120

121+
// E7 — Standardized (adaptive) CUSUM:
122+
// sigma_t = beta * |rps - ewma| + (1 - beta) * sigma_{t-1} (EWMSD)
123+
// z_t = (rps - ewma) / max(sigma_t, min_sigma)
124+
// cusum = max(0, cusum + z_t - k_std)
125+
// alarm if cusum > h_std
126+
//
127+
// Motivation: the fixed drift/threshold variant is brittle when the true
128+
// Ramp boundary lands mid-window and the observed RPS is artificially
129+
// low. Normalising by a running sigma removes the RPS-magnitude
130+
// dependence, so C2/C4 fire at the same Ramp window as C1/C3.
131+
int AdaptivePredict(int current_rps) {
132+
// Step 1: EWMA baseline (same as UpdateAndPredict).
133+
if (_ewma == 0.0) {
134+
_ewma = current_rps;
135+
} else {
136+
_ewma = _alpha * current_rps + (1.0 - _alpha) * _ewma;
137+
}
138+
139+
double deviation = current_rps - _ewma;
140+
141+
// Step 2: EWMSD — track typical |deviation| as a running scale estimate.
142+
// We deliberately do NOT lazy-init _sigma_ewma to the first abs_dev:
143+
// that makes the first large deviation self-normalise to z=1, which
144+
// is insufficient to fire within the Ramp window (observed in the
145+
// warmup-sweep: adaptive C1 missed SPIKE at W=35s,60s because sigma
146+
// bootstrapped from the ramp itself). Always applying the EWMA
147+
// update means after a flat warmup (sigma≈0), the first ramp
148+
// window's abs_dev≈10 gives sigma = beta·abs_dev ≈ 3, so
149+
// z = abs_dev/sigma = 1/beta ≈ 3.3 — strong enough to cross
150+
// cusum_std = h=3.0 within two windows.
151+
double abs_dev = std::fabs(deviation);
152+
_sigma_ewma = _beta * abs_dev + (1.0 - _beta) * _sigma_ewma;
153+
154+
double sigma_safe = std::max(_sigma_ewma, _min_sigma);
155+
double z = deviation / sigma_safe;
156+
157+
// Step 3: Standardized CUSUM — accumulate z-scores above k_std only.
158+
_cusum_std = std::max(0.0, _cusum_std + z - _k_std);
159+
160+
double predicted_lambda = _ewma;
161+
162+
if (_cusum_std > _h_std) {
163+
logMessage(WARNING, "[Predictor/adaptive] SPIKE DETECTED CUSUM_std=%.2f z=%.2f sigma=%.2f RPS=%d -> predicted_lambda=%.1f",
164+
_cusum_std, z, sigma_safe, current_rps, current_rps * 1.5);
165+
predicted_lambda = current_rps * 1.5;
166+
_cusum_std = 0; // reset after acting on the alarm
167+
}
168+
169+
int target = (int)std::ceil(predicted_lambda * _avg_service_time) + _safety_margin;
170+
171+
logMessage(DEBUG, "[Predictor/adaptive] RPS=%d EWMA=%.2f sigma=%.2f z=%.2f CUSUM_std=%.2f T=%.3fs Target=%d",
172+
current_rps, _ewma, sigma_safe, z, _cusum_std, _avg_service_time, target);
173+
174+
return target;
175+
}
176+
95177
// E2 — Reactive baseline:
96178
// No EWMA smoothing, no CUSUM spike detection.
97179
// Target is computed directly from the current observed RPS via Little's Law.

README-CN.md

Lines changed: 100 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,14 @@
2929
- [Check #2 Checklist](#check-2-checklist)
3030
- [如何运行](#如何运行)
3131
- [目前的发现与问题](#目前的发现与问题)
32+
- [Before Pre — 演示前最终结果](#before-pre--演示前最终结果)
33+
- [相比 Check #2 的新增工作](#相比-check-2-的新增工作)
34+
- [主结果(5 模式 × 4 cycle,2026-04-20 单 trial)](#主结果5-模式--4-cycle2026-04-20-单-trial)
35+
- [CoW Template — 9× 冷启动提速](#cow-template--9-冷启动提速)
36+
- [CUSUM 实测 trace — 报警落在 ramp,不在 peak](#cusum-实测-trace--报警落在-ramp不在-peak)
37+
- [Workload 设计 — Bursty-Ramp × 4 cycles](#workload-设计--bursty-ramp--4-cycles)
38+
- [Warmup-Sweep 消融 — Fixed vs Adaptive 各有 failure mode](#warmup-sweep-消融--fixed-vs-adaptive-各有-failure-mode)
39+
- [已知局限 / Final Report 待办](#已知局限--final-report-待办)
3240

3341

3442
## Proposal
@@ -243,4 +251,95 @@ python3 load_tester.py
243251
**已知问题 / 局限:**
244252
- CUSUM 固定阈值导致每隔一个周期退化,需要自适应阈值改进
245253
- 目前在本机 loopback 测试,无真实网络延迟,CloudLab 结果可能不同
246-
- RSS 数值是各进程独立 RSS 之和,CoW 共享页被重复计算,真实物理内存(PSS)更低
254+
- RSS 数值是各进程独立 RSS 之和,CoW 共享页被重复计算,真实物理内存(PSS)更低
255+
256+
## Before Pre — 演示前最终结果
257+
258+
> 本节是 2026-04-30 课堂演示(CSCI 599)前整理的最终结果,对 Check #2 之后新增的 **CoW 量化、Adaptive CUSUM 基线、Warmup Sweep 消融** 做了系统化补充。所有图见 [`figures/pre/`](./figures/pre/) 目录及 [`MANIFEST.md`](./figures/pre/MANIFEST.md)
259+
260+
### 相比 Check #2 的新增工作
261+
262+
| 新增 | 文件 / 命令 | 说明 |
263+
|---|---|---|
264+
| **CoW 冷启动量化** | `figures/plot_cow.py` → `slide05_cow.png` | 从 worker.py 模拟 cold start + server.log "Worker N ready (CoW fork)" 日志手测:`Naive (exec Python + import) ≈ 900 ms` vs `CoW (fork from warm parent) ≈ 100 ms` —— **9× 提速 · 无 runtime 依赖** |
265+
| **CUSUM 实测 trace** | `figures/plot_rps_cusum.py` → `slide06_rps_cusum.png` | 用 sweep #3 真实 server.log(90 个 predictor tick、11 个 SPIKE DETECTED)重建 CUSUM 累积轨迹(drift=5, h=8),验证报警**全部落在 ramp 爬升段** |
266+
| **Adaptive CUSUM 基线** | `./server ewma_adaptive` | 用 EWMSD(running σ)做 z-score 归一化,作为 fixed-drift CUSUM 的对照 |
267+
| **Workload 设计可视化** | `figures/plot_workload.py` → `slide07_workload.png` | 将 `load_tester.py` 的 4-cycle Bursty-Ramp 参数画成时间轴,标出 Ramp = CUSUM 检测窗口 |
268+
| **5-mode 主结果** | `figures/plot_main_result.py` → `slide08_main_result.png` | 2026-04-20 重跑 5 个模式 × 4 cycle,cold counts 从各 `load_tester_output.txt` 的 SPIKE COMPARISON 表解析 |
269+
| **Warmup-Sweep 消融** | `figures/plot_sweep.py` → `slide10_sweep.png` | sweep #1(W=5, 120 s 端点)+ sweep #3(W=10, 20, 35, 60 s 内点),对比 fixed vs adaptive |
270+
271+
### 主结果(5 模式 × 4 cycle,2026-04-20 单 trial)
272+
273+
![](./figures/pre/slide08_main_result.png)
274+
275+
| 模式 | C1 | C2 | C3 | C4 | 总计 | 说明 |
276+
|---|---:|---:|---:|---:|---:|---|
277+
| Static-15(过度配置) | 0 | 0 | 0 | 0 | **0** | 15 个 worker 全程钉死 —— 上界参照线 |
278+
| Adaptive CUSUM(EWMSD z-score) | 0 | 0 | 0 | 0 | **0** | W=35 落在 sweet spot |
279+
| **Fixed CUSUM(我们)** | 0 | 14 | **33** | 0 | **47** | C3 出现 clock-aliasing 事件 |
280+
| Reactive(按 backlog 扩缩) | 20 | 15 | 20 | 12 | **67** | 反应式基线 |
281+
| ARIMA(smoothed Target) | 20 | 18 | 31 | 16 | **85** | 历史时序预测 |
282+
283+
> 总冷启动数 = 4 个 cycle 中 600 个 spike 请求里被判为 cold(RTT > 700 ms)的总数。
284+
285+
**主要发现:**
286+
287+
- **预测式(Fixed CUSUM)明显赢反应式**:比 Reactive 少 **30%** cold starts,比 ARIMA 少 **45%**
288+
- **47 里有 33 来自 C3 一次 clock-aliasing 事件**:2 秒测量窗口正好把 ramp 切散,CUSUM 累加恰好过不了阈值,第一次 SPIKE DETECTED 推迟 4 秒(`server.log` 在 t=1776671064 时 CUSUM=18.24,而非预期 ~8–10)。这是 fixed-drift CUSUM 的**已知 failure mode,不是 bug**。不计这次事件,total ≈ 14,几乎贴着 Static-15 的 floor —— 但我们没钉死 15 个 worker
289+
- **CoW Template** 把每个新 worker 的 spin-up 从 **900 ms** 降到 **100 ms**(9× 提速、无 runtime 依赖)
290+
- **CUSUM 在 ramp 阶段触发,不是 peak**:单次 200 s 的 4-cycle run 共触发 11 次 SPIKE DETECTED,**全部落在 ramp 爬升段**
291+
292+
### CoW Template — 9× 冷启动提速
293+
294+
![](./figures/pre/slide05_cow.png)
295+
296+
启动一次 template Python 进程预先 import 好 Pillow + 建好 socket 骨架,后续每个 worker 通过 `fork()` 从 template 复制。Linux 的 copy-on-write 让 fork 几乎免费 —— Pillow 代码 / import 表是只读的,不会触发页复制。
297+
298+
**No image. No snapshot. No registry. Just `fork()` from a warm parent.**
299+
300+
### CUSUM 实测 trace — 报警落在 ramp,不在 peak
301+
302+
![](./figures/pre/slide06_rps_cusum.png)
303+
304+
上面 panel:蓝线是测得 RPS,橙虚线是 EWMA baseline(α=0.2)—— 故意滞后让 RPS 一脱离就显出 gap。
305+
306+
下面 panel:绿色是 CUSUM 累加器,越过红色虚线 `h = 8` 时 ★ 报警 —— **11 次报警全部落在 ramp 爬升段,0 次落在 peak 之后**。这就是 "catch the ramp, not the peak" 的实证。
307+
308+
### Workload 设计 — Bursty-Ramp × 4 cycles
309+
310+
![](./figures/pre/slide07_workload.png)
311+
312+
每个 cycle 模拟一次"列车到站"周期:`Warmup → Ramp(30 s) → Spike(30 RPS × 5 s) → Cooldown → Drain`
313+
314+
- **C1 warmup = 8 s**:测真正冷启动(无任何历史)
315+
- **C2–C4 warmup = 35 s**:长到让 scavenger 把 worker 全部缩回去,**但 EWMA baseline 还记得上次 spike** —— 测 predictor 跨 cycle 的记忆
316+
- **橙色 Ramp 即 CUSUM 的 30 s 检测窗口**:所有想在 peak 那一刻拿到的 worker,都必须在这 30 s 内 fork 好
317+
318+
### Warmup-Sweep 消融 — Fixed vs Adaptive 各有 failure mode
319+
320+
![](./figures/pre/slide10_sweep.png)
321+
322+
| W (s) | Fixed CUSUM | Adaptive CUSUM |
323+
|---:|---:|---:|
324+
| 5 | 48 | **287** |
325+
| 10 | 45 | 135 |
326+
| 20 | 0 | 0 |
327+
| 35 | 0 | 0 |
328+
| 60 | 0 | 0 |
329+
| 120 | **32** | 0 |
330+
331+
- **Adaptive 在短 W 翻车(τ_σ cliff)**:running σ 在背靠背 burst 之间降不下来,z-score 永远过不了阈值。τ_σ ≈ 6.6 s 是已实测的衰减常数
332+
- **Fixed 在长 W 翻车(aliasing miss)**:阈值 h=8 是为典型 ramp 调的,2 s 测量窗口在边界 case 下会切散 ramp
333+
- **Sweet spot: W = 20 ~ 60 s**:两种都工作。**主结果 W=35 正落在这里**,所以两种都接近 0
334+
- 网格总数:fixed=131 < adaptive=423 —— **raw number 反而 fixed 赢**
335+
336+
> **Framing****两个 failure modes,没有赢家**。Fixed 适合紧节奏,Adaptive 适合松节奏。Predictor 不是一个选项 —— **是一个 knob**。Adaptive 真正的贡献是 **scale invariance + aliasing robustness**,而不是更少的 cold starts。
337+
338+
### 已知局限 / Final Report 待办
339+
340+
- **n = 1 per sweep point**:受演示前时间预算限制,sweep 网格未做多 trial → CloudLab multi-trial(n ≥ 5)放在 final report
341+
- **当前 Python load_tester 上限 ~300 RPS**,不足以模拟真实 edge burst(目标 2 k+,需切到 `wrk` 或 Rust async)
342+
- **Regime-aware ensemble** 是 sweep 结果最直接的研究延伸:fixed + adaptive + meta-controller,自动按 workload 节奏选 —— 补上 W ≤ 10 s 的 gap,同时不丢 adaptive 的 scale invariance
343+
- 本机 loopback 测试,无真实网络延迟
344+
- RSS 是各进程独立 RSS 之和,CoW 共享页被重复计算,真实 PSS 更低
345+
- 演示交付物:[`docs/pre_how_4.md`](./docs/pre_how_4.md)(13 页 Slide 稿,中英双语)+ [`figures/pre/`](./figures/pre/) 5 张图

0 commit comments

Comments
 (0)