|
| 1 | +"""BTC Fib SELECTION-LEARNING — learning-curve diagnostic (facit data-sensitivity). |
| 2 | +
|
| 3 | +Is the current Stage-2 selection model **data-starved or saturated** w.r.t. facit size — i.e. would |
| 4 | +more human-labeled 4h legs plausibly raise pooled OOS Average Precision against the human facit? A |
| 5 | +**data-sensitivity diagnostic** toward the north star (does the engine select legs like the human), |
| 6 | +fixed blind in the learning-curve LOCK before any AP at any fraction existed: |
| 7 | + docs/research_wiki/reviews/btc-fib-selection-learning-learning-curve-lock-20260625.md |
| 8 | +
|
| 9 | +Method (LOCK L2/L3): reuse the Stage-2 cell **verbatim** (universe, ε, purged split, the 5 live |
| 10 | +features, the interpretable logreg, pooled AP); **fix the held-out test set** so AP is comparable |
| 11 | +across fractions; **vary only the training facit** — drop whole human legs and relabel train rows |
| 12 | +(a train candidate is positive iff it ε-matches a *retained* human leg). **Build-once-vary-labels:** |
| 13 | +the candidate universe and feature matrices are built once; per (fraction, repeat) only the train |
| 14 | +labels, the logreg fit, and the test AP recompute. |
| 15 | +
|
| 16 | +Verdict is ASYMMETRIC (LOCK L4): the Stage-2 lift is carried almost entirely by ONE feature |
| 17 | +(``cleanliness``) → ≈1 effective parameter → **saturation is the EXPECTED default**. So a flat curve |
| 18 | +means "this 1-feature model is capacity-bound → back to the feature/crux", NOT "don't grow facit". |
| 19 | +Only a curve still rising at full facit (``data_starved``) is a genuine green light. With 65 test |
| 20 | +positives ``inconclusive_underpowered`` is a live, likely outcome. |
| 21 | +
|
| 22 | +**Diagnostic only — no edge / behaviour / PnL / backtest / Genesis / auto-fib-as-truth / 1H / ETH |
| 23 | +/ label-mutation (LOCK L7).** Does NOT resolve the ``cleanliness`` crux. Frozen data, no refresh. |
| 24 | +
|
| 25 | +Run (own CLI): |
| 26 | + uv run python -m fibengine.research.selection_learning_curve --curve-preflight |
| 27 | + uv run python -m fibengine.research.selection_learning_curve --curve |
| 28 | +""" |
| 29 | + |
| 30 | +from __future__ import annotations |
| 31 | + |
| 32 | +import argparse |
| 33 | +import json |
| 34 | +import sys |
| 35 | +import time |
| 36 | +from dataclasses import replace |
| 37 | +from pathlib import Path |
| 38 | +from typing import Any |
| 39 | + |
| 40 | +import numpy as np |
| 41 | + |
| 42 | +from fibengine.core.config import load_settings |
| 43 | +from fibengine.data.loader import load_candles |
| 44 | +from fibengine.research.selection_learning import ( |
| 45 | + PRIMARY_K, |
| 46 | + RESULTS_DIR, |
| 47 | + Candidate, |
| 48 | + SelectionConfig, |
| 49 | + _progress, |
| 50 | + average_precision, |
| 51 | + build_candidates, |
| 52 | + fit_logreg, |
| 53 | + live_feature_names, |
| 54 | + load_human_legs, |
| 55 | + predict_proba, |
| 56 | + roc_auc, |
| 57 | + window_of, |
| 58 | +) |
| 59 | +from fibengine.research.selection_learning_gap import run_preflight |
| 60 | + |
# LOCK L3: finer grid near the top — the local slope at f=1.0 is what speaks to "would the NEXT
# labels help". f=1.0 is the single full-facit point (deterministic; reproduces the Stage-2 AP).
FRACTIONS: tuple[float, ...] = (0.25, 0.50, 0.75, 0.80, 0.90, 0.95, 1.00)
REPEATS = 64  # independent subsamples per fraction (<1.0); build is once, relabel+refit is cheap
CONTEXT_TIMEFRAMES = ("1M", "1w", "1d")  # underpowered context only (LOCK L2), at primary k=3
# Absolute tolerance for comparing fraction values: used to look up a per-fraction stats row and
# to recognise the full-facit fraction (frac >= 1.0 - _FTOL).
_FTOL = 1e-9
| 67 | + |
| 68 | + |
| 69 | +def _seed_for(base_seed: int, frac_idx: int, repeat: int) -> int: |
| 70 | + """Deterministic per-(fraction, repeat) seed (LOCK L3): base + frac_idx*1000 + repeat.""" |
| 71 | + return base_seed + frac_idx * 1000 + repeat |
| 72 | + |
| 73 | + |
| 74 | +def _band_halfwidth(stats: dict[str, Any]) -> float: |
| 75 | + """Half the [p5, p95] AP band for a fraction — the which-legs-dropped train-subsample noise.""" |
| 76 | + return (stats["ap_p95"] - stats["ap_p5"]) / 2.0 |
| 77 | + |
| 78 | + |
| 79 | +def _frac_stats(per_fraction: list[dict[str, Any]], frac: float) -> dict[str, Any] | None: |
| 80 | + return next((s for s in per_fraction if abs(s["fraction"] - frac) < _FTOL), None) |
| 81 | + |
| 82 | + |
| 83 | +# --- per-cell driver (build once, vary only the training-facit fraction) ----------------------- |
| 84 | + |
| 85 | + |
def run_curve_cell(timeframe: str, k: int, cfg_in: SelectionConfig, settings: Any) -> dict:
    """Compute one learning-curve cell for ``timeframe`` at viewport ``k`` (LOCK L2/L3).

    The candidate universe and both feature matrices are built exactly once. The held-out test
    set keeps its full-facit labels and is never subsampled. For every fraction in ``FRACTIONS``
    and every repeat, ``round(fraction * n_train_legs)`` whole human legs are drawn (uniform, no
    replacement, seeded via ``_seed_for``), the train rows are relabelled against the retained
    legs, the logreg is refit, and AP / ROC-AUC are recomputed on the fixed test set.

    Args:
        timeframe: candle timeframe to load (written into a copy of ``settings.data``).
        k: viewport size; overrides ``cfg_in.k`` and is also used as the split reach.
        cfg_in: base selection config; a ``replace`` copy with ``k`` set is used.
        settings: loaded project settings; ``data``, ``pivots`` and ``scoring`` are read.

    Returns:
        A dict with the cell metadata, the ``per_fraction`` stats rows (empty when there are no
        train rows, no test rows, no test positives or no train-period facit legs),
        ``ap_full_facit`` and the LOCK L6 ``addable_supply`` context.

    Raises:
        ValueError: if the loaded candle frame is empty (fail-closed).
    """
    cfg = replace(cfg_in, k=k)
    started = time.perf_counter()
    feature_names = live_feature_names(k)
    _progress(f"cell START tf={timeframe} k={k} features={feature_names}")
    candles = load_candles(
        settings.data.model_copy(update={"timeframe": timeframe}),
        fetch_if_missing=False,
        strict=False,
    )
    if candles.empty:
        raise ValueError(f"empty candle frame for {timeframe} — fail-closed")
    human_legs = load_human_legs(timeframe)
    candidates = build_candidates(candles, human_legs, settings.pivots, settings.scoring, cfg)

    n_bars = len(candles)
    split_at = int(n_bars * cfg.train_frac)
    # Reach is cfg.k: live viewport — Stage-2 parity (no W embargo). Candidates whose window is
    # neither "train" nor "test" are dropped.
    buckets: dict[str, list[Candidate]] = {"train": [], "test": []}
    for cand in candidates:
        bucket = buckets.get(window_of(cand.anchor_b_pos, split_at, n_bars, cfg.k))
        if bucket is not None:
            bucket.append(cand)
    train_rows, test_rows = buckets["train"], buckets["test"]

    def _matrix(rows: list[Candidate]) -> np.ndarray:
        # One row per candidate, one column per live feature (in feature_names order).
        if rows:
            return np.array(
                [[row.features[name] for name in feature_names] for row in rows], dtype=float
            )
        return np.zeros((0, len(feature_names)))

    # FIXED test set (full facit labels — never subsampled, LOCK L2)
    x_te = _matrix(test_rows)
    y_te = np.array([row.label for row in test_rows], dtype=float)
    # Train feature matrix is fixed as well; only the labels vary across subsamples.
    x_tr = _matrix(train_rows)
    tr_leg_ids = np.array([row.human_idx for row in train_rows], dtype=int)  # -1 = negative row
    train_legs = sorted({int(leg) for leg in tr_leg_ids if leg >= 0})  # facit legs in train period
    n_train_legs = len(train_legs)
    n_test_pos = int(y_te.sum())
    powered = n_test_pos >= cfg.min_test_positives

    matched_humans = {cand.human_idx for cand in candidates if cand.human_idx >= 0}
    reachable_fraction = len(matched_humans) / len(human_legs) if human_legs else None

    per_fraction: list[dict[str, Any]] = []
    runnable = bool(train_rows and test_rows and n_test_pos > 0 and n_train_legs > 0)
    if runnable:
        leg_pool = np.array(train_legs, dtype=int)
        for frac_idx, frac in enumerate(FRACTIONS):
            n_retain = max(0, min(n_train_legs, round(frac * n_train_legs)))
            is_full = frac >= 1.0 - _FTOL
            # f=1.0 is the deterministic single full-facit point
            n_reps = 1 if is_full else REPEATS
            ap_values: list[float] = []
            auc_values: list[float] = []
            pos_counts: list[int] = []
            for rep in range(n_reps):
                if is_full:
                    kept = leg_pool
                else:
                    rng = np.random.default_rng(_seed_for(cfg.seed, frac_idx, rep))
                    kept = rng.choice(leg_pool, size=n_retain, replace=False)
                # A train row is positive iff its matched leg is retained. Negative rows carry
                # -1, which is never in the (non-negative) leg pool.
                y_tr = np.isin(tr_leg_ids, kept).astype(float)
                pos_counts.append(int(y_tr.sum()))
                model = fit_logreg(x_tr, y_tr, cfg)
                p_te = predict_proba(model, x_te)
                ap_value = average_precision(y_te, p_te)
                auc_value = roc_auc(y_te, p_te)
                if ap_value is not None:
                    ap_values.append(ap_value)
                if auc_value is not None:
                    auc_values.append(auc_value)
            ap_arr = np.array(ap_values, dtype=float)
            have_ap = bool(ap_arr.size)
            row_stats = {
                "fraction": float(frac),
                "repeats": n_reps,
                "n_retain": int(n_retain),
                "n_train_pos_mean": float(np.mean(pos_counts)) if pos_counts else None,
                "ap_mean": float(ap_arr.mean()) if have_ap else None,
                "ap_p5": float(np.percentile(ap_arr, 5)) if have_ap else None,
                "ap_p95": float(np.percentile(ap_arr, 95)) if have_ap else None,
                "ap_std": float(ap_arr.std()) if have_ap else None,
                "auc_mean": float(np.mean(auc_values)) if auc_values else None,
            }
            per_fraction.append(row_stats)
            _progress(
                f" f={frac:.2f} reps={n_reps} n_retain={n_retain} "
                f"ap_mean={row_stats['ap_mean']}"
            )

    full_stats = _frac_stats(per_fraction, 1.0)
    ap_full = full_stats["ap_mean"] if full_stats else None
    _progress(
        f"cell DONE tf={timeframe} k={k} in {time.perf_counter() - started:.0f}s "
        f"(test_pos={n_test_pos} powered={powered} "
        f"ap_full={ap_full})"
    )
    return {
        "timeframe": timeframe,
        "k": k,
        "features": feature_names,
        "n_bars": n_bars,
        "n_human_legs": len(human_legs),
        "n_candidates": len(candidates),
        "n_train": len(train_rows),
        "n_test": len(test_rows),
        "n_test_positives": n_test_pos,
        "n_train_legs": n_train_legs,  # facit legs available to subsample in the train period
        "reachable_fraction": reachable_fraction,
        "powered": powered,
        "repeats": REPEATS,
        "fractions": list(FRACTIONS),
        "per_fraction": per_fraction,
        "ap_full_facit": ap_full,
        # LOCK L6 addable-supply context (reported, not a gate)
        "addable_supply": {
            "labeled_human_legs": len(human_legs),
            "candidate_universe": len(candidates),
            "train_period_facit_legs": n_train_legs,
            "note": "true human-meaningful count needs a human pass; capped by available history",
        },
    }
| 212 | + |
| 213 | + |
def curve_verdict(cell: dict[str, Any]) -> str:
    """ASYMMETRIC blind verdict (LOCK L4), read from the 4h primary. Reads DIFFERENCES of fraction
    means against the f=0.95 train-subsample band (LOCK L5), not absolute AP levels.

    - ``inconclusive_underpowered`` — not powered, or the f=0.95 band half-width is >= the full
      spread of fraction means (per-fraction noise dwarfs the curve; likely at 65 test positives).
      Also returned when the f=1.0 or f=0.95 row is missing, when either row lacks the AP
      statistics the comparison needs (``None`` values), or when fewer than two fraction means
      exist.
    - ``data_starved`` — ``AP(1.0) − AP(0.95)`` exceeds the f=0.95 band half-width and the curve is
      increasing: genuinely informative green light (more facit helps even this 1-feature model).
    - ``saturated`` — last increment within the band (flat): ambiguous AND EXPECTED → routes to the
      feature / ``cleanliness`` crux, NOT away from labeling.

    Args:
        cell: a cell dict as produced by ``run_curve_cell``; ``powered`` and ``per_fraction``
            are read.

    Returns:
        One of ``"inconclusive_underpowered"``, ``"data_starved"`` or ``"saturated"``.
    """
    inconclusive = "inconclusive_underpowered"
    if not cell.get("powered"):
        return inconclusive
    per = cell.get("per_fraction") or []
    s_top = _frac_stats(per, 1.0)
    s_95 = _frac_stats(per, 0.95)
    if s_top is None or s_95 is None:
        return inconclusive
    # Every statistic used below must be present. A row whose repeats produced no AP carries
    # None values, which would otherwise raise TypeError in the arithmetic.
    needed = (
        s_top.get("ap_mean"),
        s_95.get("ap_mean"),
        s_95.get("ap_p5"),
        s_95.get("ap_p95"),
    )
    if any(value is None for value in needed):
        return inconclusive
    means = [s["ap_mean"] for s in per if s.get("ap_mean") is not None]
    if len(means) < 2:
        return inconclusive
    spread = max(means) - min(means)
    hw95 = _band_halfwidth(s_95)
    if hw95 >= spread:  # noise dominates the whole curve range
        return inconclusive
    last_increment = s_top["ap_mean"] - s_95["ap_mean"]
    if last_increment > hw95 and s_top["ap_mean"] > s_95["ap_mean"]:
        return "data_starved"
    return "saturated"
| 241 | + |
| 242 | + |
| 243 | +# --- checkpointed study ----------------------------------------------------------------------- |
| 244 | + |
| 245 | + |
| 246 | +def _json_default(o: Any) -> Any: |
| 247 | + if isinstance(o, np.generic): |
| 248 | + return o.item() |
| 249 | + raise TypeError(f"not JSON-serializable: {type(o)}") |
| 250 | + |
| 251 | + |
def _run_or_load_cell(
    timeframe: str, k: int, cfg: SelectionConfig, settings: Any, ckpt_dir: Path
) -> dict:
    """Return the cell for ``(timeframe, k)``, resuming from a same-seed checkpoint if present.

    The checkpoint lives at ``ckpt_dir / "<timeframe>_k<k>.json"`` and stores the seed next to the
    cell. A checkpoint whose seed differs from ``cfg.seed`` is treated as stale and recomputed.
    A fresh result is written to a ``.tmp`` sibling and then moved over the final path, mirroring
    the enrich/W-gap resume pattern so an interrupted long 4h build loses at most the in-flight
    cell. The returned cell is always the JSON-decoded form read back from disk.
    """
    ckpt_path = ckpt_dir / f"{timeframe}_k{k}.json"
    if ckpt_path.exists():
        saved = json.loads(ckpt_path.read_text(encoding="utf-8"))
        saved_seed = saved.get("seed")
        if saved_seed == cfg.seed:
            _progress(f"RESUME tf={timeframe} k={k}: loaded checkpoint {ckpt_path.name}")
            return saved["cell"]
        _progress(f"stale ckpt {ckpt_path.name}: seed {saved_seed}!={cfg.seed}, recompute")

    cell = run_curve_cell(timeframe, k, cfg, settings)
    ckpt_dir.mkdir(parents=True, exist_ok=True)
    tmp_path = ckpt_path.with_name(ckpt_path.name + ".tmp")
    payload = json.dumps(
        {"seed": cfg.seed, "cell": cell}, indent=2, sort_keys=True, default=_json_default
    )
    tmp_path.write_text(payload, encoding="utf-8")
    tmp_path.replace(ckpt_path)  # swap the finished file into place
    _progress(f"checkpoint written {ckpt_path.name}")
    reloaded = json.loads(ckpt_path.read_text(encoding="utf-8"))
    return reloaded["cell"]
| 276 | + |
| 277 | + |
def run_curve_study(
    config_path: str | None, cfg: SelectionConfig, ckpt_dir: Path | None = None
) -> dict:
    """Run the learning-curve study (LOCK L2) and assemble the report dict.

    The 4h cell at ``PRIMARY_K`` is the single powered, verdict-bearing cell; the timeframes in
    ``CONTEXT_TIMEFRAMES`` run at the same k as underpowered context only, never refuted. The
    verdict is taken from the 4h primary cell alone. Cells are checkpointed under ``ckpt_dir``
    (default ``RESULTS_DIR / "curve" / "cells"``).
    """
    if config_path:
        settings = load_settings(config_path)
    else:
        settings = load_settings()
    cells_dir = RESULTS_DIR / "curve" / "cells" if ckpt_dir is None else ckpt_dir

    primary = _run_or_load_cell("4h", PRIMARY_K, cfg, settings, cells_dir)
    context = []
    for context_tf in CONTEXT_TIMEFRAMES:
        context.append(_run_or_load_cell(context_tf, PRIMARY_K, cfg, settings, cells_dir))

    verdict = curve_verdict(primary)
    return {
        "generated_by": "fib_selection_learning_curve",
        "stage": "facit_data_sensitivity_learning_curve",
        "metric": "pooled_test_average_precision",
        "design": "fixed test set; vary only training-facit fraction (whole legs); build-once",
        "seed": cfg.seed,
        "primary_timeframe": "4h",
        "primary_k": PRIMARY_K,
        "learning_curve_verdict": verdict,
        "results_4h_primary": primary,
        "results_context_underpowered": context,
    }
| 302 | + |
| 303 | + |
def print_curve(report: dict, path: Any) -> None:
    """Print a human-readable digest of a study report to stdout.

    For the 4h primary cell (tag ``4h``) and every context cell (tag ``ctx``) this prints one
    header line, one line per fraction row and the addable-supply context, followed by a final
    line with the verdict and the summary ``path``.
    """
    tagged_cells = [("4h", report["results_4h_primary"])]
    tagged_cells.extend(("ctx", cell) for cell in report["results_context_underpowered"])
    for tag, cell in tagged_cells:
        header_fields = (
            f"[{tag} tf={cell['timeframe']} k={cell['k']}]",
            f"n_cands={cell['n_candidates']}",
            f"n_train_legs={cell['n_train_legs']}",
            f"test_pos={cell['n_test_positives']}",
            f"powered={cell['powered']}",
            f"ap_full={cell['ap_full_facit']}",
        )
        print(" ".join(header_fields))
        for row in cell["per_fraction"]:
            band = f"[p5={row['ap_p5']}, p95={row['ap_p95']}]"
            row_fields = (
                f"f={row['fraction']:.2f}",
                f"reps={row['repeats']}",
                f"n_retain={row['n_retain']}",
                f"n_pos~{row['n_train_pos_mean']}",
                f"ap_mean={row['ap_mean']}",
                band,
                f"auc={row['auc_mean']}",
            )
            print(" " + " ".join(row_fields))
        print(f" addable_supply={cell['addable_supply']}")
    print(f"learning_curve_verdict={report['learning_curve_verdict']} summary={path}")
| 321 | + |
| 322 | + |
def _write_summary(report: dict, out_dir: Path) -> Path:
    """Serialise ``report`` to ``out_dir / "summary.json"`` and return that path.

    ``out_dir`` is created (with parents) when missing. The JSON is indented, key-sorted and
    UTF-8 encoded; NumPy scalars are converted through ``_json_default``.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    summary_path = out_dir / "summary.json"
    payload = json.dumps(report, indent=2, sort_keys=True, default=_json_default)
    summary_path.write_text(payload, encoding="utf-8")
    return summary_path
| 330 | + |
| 331 | + |
def main(argv: list[str] | None = None) -> int:
    """CLI entry point for the learning-curve diagnostic.

    ``--curve-preflight`` runs the frozen-data parity / facit fail-fast preflight and returns its
    exit code. ``--curve`` runs the study, writes ``summary.json`` under ``--out`` and prints the
    digest. With neither flag the help text is printed.

    Cell checkpoints are kept in ``<out>/cells`` so that a run redirected with ``--out`` keeps its
    checkpoints next to its own summary instead of sharing the default directory with other runs.
    For the default ``--out`` this is the same location ``run_curve_study`` uses by default.

    Args:
        argv: argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        Process exit code (0 on the curve and help paths; the preflight's code otherwise).
    """
    for _s in (sys.stdout, sys.stderr):  # UTF-8 console (long-run safety)
        _rc = getattr(_s, "reconfigure", None)
        if _rc is not None:
            _rc(encoding="utf-8")
    parser = argparse.ArgumentParser(
        description="BTC Fib selection-learning learning-curve diagnostic"
    )
    parser.add_argument("--config", default="config/settings.expansion.yaml")
    parser.add_argument("--out", default=str(RESULTS_DIR / "curve"))
    parser.add_argument(
        "--curve", action="store_true", help="run the facit learning-curve diagnostic"
    )
    parser.add_argument(
        "--curve-preflight",
        action="store_true",
        help="frozen-data parity + facit fail-fast (reuses W-gap preflight; no run)",
    )
    args = parser.parse_args(argv)
    if args.curve_preflight:
        return run_preflight(args.config)
    if args.curve:
        out_dir = Path(args.out)
        # Checkpoints follow --out so separate output directories never share cell files.
        report = run_curve_study(args.config, SelectionConfig(), ckpt_dir=out_dir / "cells")
        path = _write_summary(report, out_dir)
        print_curve(report, path)
        return 0
    parser.print_help()
    return 0


if __name__ == "__main__":  # pragma: no cover
    raise SystemExit(main())
0 commit comments