Skip to content

Commit c4bd330

Browse files
JohnCCarter and claude committed
feat(research): learning-curve diagnostic harness (facit data-sensitivity, Commit 2)
Build per the learning-curve LOCK (d1d0845). New selection_learning_curve.py (own CLI --curve/--curve-preflight; no code in byte-capped selection_learning.py) reusing build_candidates/fit_logreg/AP/AUC + the W-gap frozen preflight. Build-once-vary-labels: candidate universe + feature matrices built once, fixed held-out test set, vary only the training-facit fraction (whole human legs dropped), R=64, finer grid near f=1.0. ASYMMETRIC blind verdict (data_starved / saturated-but-expected / inconclusive_underpowered) reading fraction-mean DIFFERENCES against the f=0.95 band. +12 tests (build-once, fixed test, whole-leg subsample, seed determinism, all verdict branches, checkpoint resume). Diagnostic only, no edge/behaviour/PnL/Genesis claim; does not resolve the cleanliness crux. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent d1d0845 commit c4bd330

2 files changed

Lines changed: 594 additions & 0 deletions

File tree

Lines changed: 359 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,359 @@
1+
"""BTC Fib SELECTION-LEARNING — learning-curve diagnostic (facit data-sensitivity).
2+
3+
Is the current Stage-2 selection model **data-starved or saturated** w.r.t. facit size — i.e. would
4+
more human-labeled 4h legs plausibly raise pooled OOS Average Precision against the human facit? A
5+
**data-sensitivity diagnostic** toward the north star (does the engine select legs like the human),
6+
fixed blind in the learning-curve LOCK before any AP at any fraction existed:
7+
docs/research_wiki/reviews/btc-fib-selection-learning-learning-curve-lock-20260625.md
8+
9+
Method (LOCK L2/L3): reuse the Stage-2 cell **verbatim** (universe, ε, purged split, the 5 live
10+
features, the interpretable logreg, pooled AP); **fix the held-out test set** so AP is comparable
11+
across fractions; **vary only the training facit** — drop whole human legs and relabel train rows
12+
(a train candidate is positive iff it ε-matches a *retained* human leg). **Build-once-vary-labels:**
13+
the candidate universe and feature matrices are built once; per (fraction, repeat) only the train
14+
labels, the logreg fit, and the test AP recompute.
15+
16+
Verdict is ASYMMETRIC (LOCK L4): the Stage-2 lift is carried almost entirely by ONE feature
17+
(``cleanliness``) → ≈1 effective parameter → **saturation is the EXPECTED default**. So a flat curve
18+
means "this 1-feature model is capacity-bound → back to the feature/crux", NOT "don't grow facit".
19+
Only a curve still rising at full facit (``data_starved``) is a genuine green light. With 65 test
20+
positives ``inconclusive_underpowered`` is a live, likely outcome.
21+
22+
**Diagnostic only — no edge / behaviour / PnL / backtest / Genesis / auto-fib-as-truth / 1H / ETH
23+
/ label-mutation (LOCK L7).** Does NOT resolve the ``cleanliness`` crux. Frozen data, no refresh.
24+
25+
Run (own CLI):
26+
uv run python -m fibengine.research.selection_learning_curve --curve-preflight
27+
uv run python -m fibengine.research.selection_learning_curve --curve
28+
"""
29+
30+
from __future__ import annotations
31+
32+
import argparse
33+
import json
34+
import sys
35+
import time
36+
from dataclasses import replace
37+
from pathlib import Path
38+
from typing import Any
39+
40+
import numpy as np
41+
42+
from fibengine.core.config import load_settings
43+
from fibengine.data.loader import load_candles
44+
from fibengine.research.selection_learning import (
45+
PRIMARY_K,
46+
RESULTS_DIR,
47+
Candidate,
48+
SelectionConfig,
49+
_progress,
50+
average_precision,
51+
build_candidates,
52+
fit_logreg,
53+
live_feature_names,
54+
load_human_legs,
55+
predict_proba,
56+
roc_auc,
57+
window_of,
58+
)
59+
from fibengine.research.selection_learning_gap import run_preflight
60+
61+
# Training-facit fractions swept by the curve (LOCK L3). The grid is denser near the top because
# the local slope at f=1.0 is what answers "would the NEXT labels help". f=1.0 is evaluated once:
# it is the deterministic full-facit point.
FRACTIONS: tuple[float, ...] = (
    0.25,
    0.50,
    0.75,
    0.80,
    0.90,
    0.95,
    1.00,
)
# Independent subsamples drawn per fraction below 1.0; the build happens once, so each repeat
# only pays for a relabel and a refit.
REPEATS = 64
# Context-only timeframes (LOCK L2), run at the primary k and reported as underpowered.
CONTEXT_TIMEFRAMES = ("1M", "1w", "1d")
# Absolute tolerance used when comparing fraction values.
_FTOL = 1e-9
67+
68+
69+
def _seed_for(base_seed: int, frac_idx: int, repeat: int) -> int:
    """Return the deterministic RNG seed for one (fraction, repeat) subsample (LOCK L3).

    The seed is ``base_seed + frac_idx * 1000 + repeat``: each fraction index owns a stride of
    1000 seeds and the repeat number selects a seed inside that stride.
    """
    offset = frac_idx * 1000 + repeat
    return base_seed + offset
72+
73+
74+
def _band_halfwidth(stats: dict[str, Any]) -> float:
    """Return half of the [p5, p95] AP band width for one fraction's stats.

    The band captures the which-legs-dropped train-subsample noise. ``stats`` must carry numeric
    ``ap_p5`` and ``ap_p95`` entries.
    """
    lower = stats["ap_p5"]
    upper = stats["ap_p95"]
    return (upper - lower) / 2.0
77+
78+
79+
def _frac_stats(per_fraction: list[dict[str, Any]], frac: float) -> dict[str, Any] | None:
    """Return the first per-fraction entry whose ``fraction`` equals ``frac`` within ``_FTOL``.

    Returns ``None`` when no entry matches (including an empty list).
    """
    for entry in per_fraction:
        if abs(entry["fraction"] - frac) < _FTOL:
            return entry
    return None
81+
82+
83+
# --- per-cell driver (build once, vary only the training-facit fraction) -----------------------
84+
85+
86+
def run_curve_cell(timeframe: str, k: int, cfg_in: SelectionConfig, settings: Any) -> dict:
    """Compute one learning-curve cell for ``timeframe`` at viewport ``k`` (LOCK L2/L3).

    The candidate universe and both feature matrices are built exactly once. The held-out test
    set keeps its full-facit labels and is never subsampled. For every fraction in ``FRACTIONS``
    and every repeat, ``round(f * n_train_legs)`` whole human legs are retained (uniform, without
    replacement, seeded via ``_seed_for``), the train rows are relabelled (positive iff the row's
    matched human leg was retained), the logreg is refit and AP/AUC are recomputed on the fixed
    test set. The full-facit fraction is evaluated once, without sampling.

    Args:
        timeframe: Candle timeframe to load; also selects the human legs.
        k: Viewport; overrides ``cfg_in.k`` on a copy and selects the live feature names.
        cfg_in: Base selection config (left unmodified).
        settings: Loaded project settings; ``data``, ``pivots`` and ``scoring`` are read.

    Returns:
        A dict with cell metadata, the per-fraction AP/AUC statistics (empty when the cell
        cannot be run), the full-facit AP and the addable-supply context.

    Raises:
        ValueError: If the candle frame for ``timeframe`` is empty (fail-closed).
    """
    cfg = replace(cfg_in, k=k)
    started = time.perf_counter()
    feat_names = live_feature_names(k)
    _progress(f"cell START tf={timeframe} k={k} features={feat_names}")

    candles = load_candles(
        settings.data.model_copy(update={"timeframe": timeframe}),
        fetch_if_missing=False,
        strict=False,
    )
    if candles.empty:
        raise ValueError(f"empty candle frame for {timeframe} — fail-closed")
    human_legs = load_human_legs(timeframe)
    cands = build_candidates(candles, human_legs, settings.pivots, settings.scoring, cfg)

    # Split candidates into train / test windows; anything else is dropped.
    n_bars = len(candles)
    split_idx = int(n_bars * cfg.train_frac)
    reach = cfg.k  # live viewport — Stage-2 parity (no W embargo)
    train_rows: list[Candidate] = []
    test_rows: list[Candidate] = []
    for cand in cands:
        side = window_of(cand.anchor_b_pos, split_idx, n_bars, reach)
        if side == "train":
            train_rows.append(cand)
            continue
        if side == "test":
            test_rows.append(cand)

    def _features_of(rows: list[Candidate]) -> np.ndarray:
        """Stack the live features of ``rows`` into an (n_rows, n_features) float matrix."""
        if rows:
            return np.array([[c.features[f] for f in feat_names] for c in rows], dtype=float)
        return np.zeros((0, len(feat_names)))

    # FIXED test set (full facit labels — never subsampled, LOCK L2).
    x_te = _features_of(test_rows)
    y_te = np.array([c.label for c in test_rows], dtype=float)
    # Train features are fixed too; only the train labels change between subsamples.
    x_tr = _features_of(train_rows)
    tr_human_idx = np.array([c.human_idx for c in train_rows], dtype=int)  # -1 = negative row
    train_legs = sorted({int(h) for h in tr_human_idx if h >= 0})  # facit legs in train period
    n_train_legs = len(train_legs)
    n_test_pos = int(y_te.sum())
    powered = n_test_pos >= cfg.min_test_positives

    matched_humans = {c.human_idx for c in cands if c.human_idx >= 0}
    reachable_fraction = len(matched_humans) / len(human_legs) if human_legs else None

    per_fraction: list[dict[str, Any]] = []
    runnable = bool(train_rows) and bool(test_rows) and n_test_pos > 0 and n_train_legs > 0
    if runnable:
        legs_arr = np.array(train_legs, dtype=int)
        for f_idx, frac in enumerate(FRACTIONS):
            n_retain = max(0, min(n_train_legs, round(frac * n_train_legs)))
            is_full = frac >= 1.0 - _FTOL
            # f=1.0 is the deterministic single full-facit point.
            reps = 1 if is_full else REPEATS
            aps: list[float] = []
            aucs: list[float] = []
            n_pos_list: list[int] = []
            for rep in range(reps):
                if is_full:
                    kept = legs_arr
                else:
                    rng = np.random.default_rng(_seed_for(cfg.seed, f_idx, rep))
                    kept = rng.choice(legs_arr, size=n_retain, replace=False)
                # Retained legs are all >= 0, so the -1 (negative) rows can never match.
                y_tr = np.isin(tr_human_idx, kept).astype(float)
                n_pos_list.append(int(y_tr.sum()))
                model = fit_logreg(x_tr, y_tr, cfg)
                p_te = predict_proba(model, x_te)
                ap = average_precision(y_te, p_te)
                auc = roc_auc(y_te, p_te)
                if ap is not None:
                    aps.append(ap)
                if auc is not None:
                    aucs.append(auc)

            ap_arr = np.array(aps, dtype=float)
            have_ap = bool(ap_arr.size)
            summary: dict[str, Any] = {
                "fraction": float(frac),
                "repeats": reps,
                "n_retain": int(n_retain),
                "n_train_pos_mean": float(np.mean(n_pos_list)) if n_pos_list else None,
                "ap_mean": float(ap_arr.mean()) if have_ap else None,
                "ap_p5": float(np.percentile(ap_arr, 5)) if have_ap else None,
                "ap_p95": float(np.percentile(ap_arr, 95)) if have_ap else None,
                "ap_std": float(ap_arr.std()) if have_ap else None,
                "auc_mean": float(np.mean(aucs)) if aucs else None,
            }
            per_fraction.append(summary)
            _progress(
                f" f={frac:.2f} reps={reps} n_retain={n_retain} "
                f"ap_mean={summary['ap_mean']}"
            )

    full_stats = _frac_stats(per_fraction, 1.0)
    ap_full = full_stats["ap_mean"] if full_stats else None
    _progress(
        f"cell DONE tf={timeframe} k={k} in {time.perf_counter() - started:.0f}s "
        f"(test_pos={n_test_pos} powered={powered} "
        f"ap_full={ap_full})"
    )
    return {
        "timeframe": timeframe,
        "k": k,
        "features": feat_names,
        "n_bars": n_bars,
        "n_human_legs": len(human_legs),
        "n_candidates": len(cands),
        "n_train": len(train_rows),
        "n_test": len(test_rows),
        "n_test_positives": n_test_pos,
        "n_train_legs": n_train_legs,  # facit legs available to subsample in the train period
        "reachable_fraction": reachable_fraction,
        "powered": powered,
        "repeats": REPEATS,
        "fractions": list(FRACTIONS),
        "per_fraction": per_fraction,
        "ap_full_facit": ap_full,
        # LOCK L6 addable-supply context (reported, not a gate)
        "addable_supply": {
            "labeled_human_legs": len(human_legs),
            "candidate_universe": len(cands),
            "train_period_facit_legs": n_train_legs,
            "note": "true human-meaningful count needs a human pass; capped by available history",
        },
    }
212+
213+
214+
def curve_verdict(cell: dict[str, Any]) -> str:
    """ASYMMETRIC blind verdict (LOCK L4), read from the 4h primary cell.

    Reads DIFFERENCES of fraction means against the f=0.95 train-subsample band (LOCK L5), not
    absolute AP levels.

    - ``inconclusive_underpowered`` — the cell is not powered, fewer than two fractions have an
      AP mean, the f=1.0 or f=0.95 statistics are missing or incomplete, or the f=0.95 band
      half-width is >= the full spread of fraction means (per-fraction noise dwarfs the curve).
    - ``data_starved`` — ``AP(1.0) − AP(0.95)`` exceeds the f=0.95 band half-width and the last
      increment is positive: genuinely informative green light (more facit helps).
    - ``saturated`` — last increment within the band (flat): ambiguous AND EXPECTED → routes to
      the feature / ``cleanliness`` crux, NOT away from labeling.

    Args:
        cell: A cell dict as produced by ``run_curve_cell`` (possibly reloaded from JSON); the
            ``powered`` and ``per_fraction`` entries are read.

    Returns:
        One of ``"inconclusive_underpowered"``, ``"data_starved"`` or ``"saturated"``.
    """
    if not cell.get("powered"):
        return "inconclusive_underpowered"
    per = cell.get("per_fraction") or []
    means = [s["ap_mean"] for s in per if s.get("ap_mean") is not None]
    if len(means) < 2:
        return "inconclusive_underpowered"
    s_top = _frac_stats(per, 1.0)
    s_95 = _frac_stats(per, 0.95)
    if s_top is None or s_95 is None:
        return "inconclusive_underpowered"
    # Every statistic used below must be present. The f=0.95 entry carries None for ap_mean /
    # ap_p5 / ap_p95 when no repeat produced an AP; without this guard the band computation or
    # the subtraction would raise TypeError instead of yielding a verdict.
    required = (
        s_top.get("ap_mean"),
        s_95.get("ap_mean"),
        s_95.get("ap_p5"),
        s_95.get("ap_p95"),
    )
    if any(value is None for value in required):
        return "inconclusive_underpowered"
    spread = max(means) - min(means)
    hw95 = _band_halfwidth(s_95)
    if hw95 >= spread:  # noise dominates the whole curve range
        return "inconclusive_underpowered"
    last_increment = s_top["ap_mean"] - s_95["ap_mean"]
    if last_increment > hw95 and s_top["ap_mean"] > s_95["ap_mean"]:
        return "data_starved"
    return "saturated"
241+
242+
243+
# --- checkpointed study -----------------------------------------------------------------------
244+
245+
246+
def _json_default(o: Any) -> Any:
    """``json.dumps`` fallback: unwrap NumPy scalars to their native Python value.

    Raises:
        TypeError: For any object that is not a NumPy scalar.
    """
    if not isinstance(o, np.generic):
        raise TypeError(f"not JSON-serializable: {type(o)}")
    return o.item()
250+
251+
252+
def _run_or_load_cell(
    timeframe: str, k: int, cfg: SelectionConfig, settings: Any, ckpt_dir: Path
) -> dict:
    """Run one cell, or load it from a matching checkpoint (atomic write).

    Mirrors the enrich/W-gap resume pattern so an interrupted long 4h build loses at most the
    in-flight cell. A checkpoint at ``ckpt_dir/{timeframe}_k{k}.json`` is reused only when it is
    well-formed, was written with the same seed, and — where the stored cell records them — was
    computed on the current ``FRACTIONS`` grid with the current ``REPEATS``. Anything else is
    recomputed and the checkpoint overwritten.

    Args:
        timeframe: Timeframe of the cell; part of the checkpoint file name.
        k: Viewport of the cell; part of the checkpoint file name.
        cfg: Selection config; ``cfg.seed`` keys the checkpoint.
        settings: Loaded project settings, forwarded to ``run_curve_cell``.
        ckpt_dir: Directory holding the per-cell checkpoints (created on write).

    Returns:
        The cell dict as read back from JSON, so a fresh run and a resumed run return
        identically-typed data.
    """
    path = ckpt_dir / f"{timeframe}_k{k}.json"
    if path.exists():
        try:
            saved = json.loads(path.read_text(encoding="utf-8"))
        except ValueError as exc:  # invalid JSON or undecodable bytes
            saved = None
            _progress(f"unreadable ckpt {path.name}: {exc}, recompute")
        else:
            if not (isinstance(saved, dict) and isinstance(saved.get("cell"), dict)):
                saved = None
                _progress(f"malformed ckpt {path.name}: no cell payload, recompute")
        if saved is not None:
            cell = saved["cell"]
            if saved.get("seed") != cfg.seed:
                _progress(f"stale ckpt {path.name}: seed {saved.get('seed')}!={cfg.seed}, recompute")
            elif (
                cell.get("fractions", list(FRACTIONS)) != list(FRACTIONS)
                or cell.get("repeats", REPEATS) != REPEATS
            ):
                # Same seed but a different fraction grid / repeat count: the stored curve is
                # not the curve this module would compute now.
                _progress(f"stale ckpt {path.name}: fraction grid/repeats changed, recompute")
            else:
                _progress(f"RESUME tf={timeframe} k={k}: loaded checkpoint {path.name}")
                return cell
    result = run_curve_cell(timeframe, k, cfg, settings)
    ckpt_dir.mkdir(parents=True, exist_ok=True)
    # Write to a sibling temp file, then rename over the target.
    tmp = path.with_name(path.name + ".tmp")
    tmp.write_text(
        json.dumps(
            {"seed": cfg.seed, "cell": result}, indent=2, sort_keys=True, default=_json_default
        ),
        encoding="utf-8",
    )
    tmp.replace(path)
    _progress(f"checkpoint written {path.name}")
    # Round-trip through JSON so the fresh result matches what a later resume would load.
    return json.loads(path.read_text(encoding="utf-8"))["cell"]
276+
277+
278+
def run_curve_study(
    config_path: str | None, cfg: SelectionConfig, ckpt_dir: Path | None = None
) -> dict:
    """Run the learning-curve study (LOCK L2) and assemble its report.

    The 4h cell at ``PRIMARY_K`` is the single verdict-bearing cell; the ``CONTEXT_TIMEFRAMES``
    cells at the same k are reported as underpowered context only. Every cell goes through the
    checkpointed ``_run_or_load_cell``.

    Args:
        config_path: Settings file to load; a falsy value loads the default settings.
        cfg: Selection config shared by all cells.
        ckpt_dir: Checkpoint directory; defaults to ``RESULTS_DIR / "curve" / "cells"``.

    Returns:
        The report dict: study metadata, the verdict and the primary and context cells.
    """
    if config_path:
        settings = load_settings(config_path)
    else:
        settings = load_settings()
    cells_dir = RESULTS_DIR / "curve" / "cells" if ckpt_dir is None else ckpt_dir

    primary_cell = _run_or_load_cell("4h", PRIMARY_K, cfg, settings, cells_dir)
    context_cells = []
    for tf in CONTEXT_TIMEFRAMES:
        context_cells.append(_run_or_load_cell(tf, PRIMARY_K, cfg, settings, cells_dir))

    report = {
        "generated_by": "fib_selection_learning_curve",
        "stage": "facit_data_sensitivity_learning_curve",
        "metric": "pooled_test_average_precision",
        "design": "fixed test set; vary only training-facit fraction (whole legs); build-once",
        "seed": cfg.seed,
        "primary_timeframe": "4h",
        "primary_k": PRIMARY_K,
        "learning_curve_verdict": curve_verdict(primary_cell),
        "results_4h_primary": primary_cell,
        "results_context_underpowered": context_cells,
    }
    return report
302+
303+
304+
def print_curve(report: dict, path: Any) -> None:
    """Print a human-readable summary of a learning-curve report to stdout.

    For the 4h primary cell and then each context cell: one header line, one line per fraction
    and the addable-supply context. The last line carries the verdict and the summary ``path``.
    """
    labelled_cells = [
        ("4h", report["results_4h_primary"]),
        *(("ctx", cell) for cell in report["results_context_underpowered"]),
    ]
    for label, cell in labelled_cells:
        header = (
            f"[{label} tf={cell['timeframe']} k={cell['k']}] n_cands={cell['n_candidates']} "
            f"n_train_legs={cell['n_train_legs']} test_pos={cell['n_test_positives']} "
            f"powered={cell['powered']} ap_full={cell['ap_full_facit']}"
        )
        print(header)
        for stats in cell["per_fraction"]:
            line = (
                f" f={stats['fraction']:.2f} reps={stats['repeats']} n_retain={stats['n_retain']} "
                f"n_pos~{stats['n_train_pos_mean']} ap_mean={stats['ap_mean']} "
                f"[p5={stats['ap_p5']}, p95={stats['ap_p95']}] auc={stats['auc_mean']}"
            )
            print(line)
        print(f" addable_supply={cell['addable_supply']}")
    print(f"learning_curve_verdict={report['learning_curve_verdict']} summary={path}")
321+
322+
323+
def _write_summary(report: dict, out_dir: Path) -> Path:
    """Serialize ``report`` to ``out_dir/summary.json`` and return that path.

    ``out_dir`` is created if missing. The JSON is indented, key-sorted and UTF-8 encoded;
    NumPy scalars are converted through ``_json_default``.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(report, indent=2, sort_keys=True, default=_json_default)
    target = out_dir / "summary.json"
    target.write_text(payload, encoding="utf-8")
    return target
330+
331+
332+
def main(argv: list[str] | None = None) -> int:
    """CLI entry point for the learning-curve diagnostic.

    ``--curve-preflight`` runs the reused W-gap preflight and returns its exit code; it takes
    precedence over ``--curve``. ``--curve`` runs the study, writes ``summary.json`` under
    ``--out`` and prints the curve. With neither flag the help text is printed.

    Per-cell checkpoints live under ``<--out>/cells``, so a run redirected with ``--out`` neither
    resumes from nor overwrites the checkpoints of the default output directory. For the default
    ``--out`` this is the same location ``run_curve_study`` uses on its own.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        The preflight's return value for ``--curve-preflight``, otherwise 0.
    """
    for _s in (sys.stdout, sys.stderr):  # UTF-8 console (long-run safety)
        _rc = getattr(_s, "reconfigure", None)
        if _rc is not None:
            _rc(encoding="utf-8")
    ap = argparse.ArgumentParser(description="BTC Fib selection-learning learning-curve diagnostic")
    ap.add_argument("--config", default="config/settings.expansion.yaml")
    ap.add_argument("--out", default=str(RESULTS_DIR / "curve"))
    ap.add_argument("--curve", action="store_true", help="run the facit learning-curve diagnostic")
    ap.add_argument(
        "--curve-preflight",
        action="store_true",
        help="frozen-data parity + facit fail-fast (reuses W-gap preflight; no run)",
    )
    args = ap.parse_args(argv)
    if args.curve_preflight:
        return run_preflight(args.config)
    if args.curve:
        out_dir = Path(args.out)
        # Keep checkpoints next to the summary they belong to instead of always using the
        # default results directory.
        report = run_curve_study(args.config, SelectionConfig(), ckpt_dir=out_dir / "cells")
        path = _write_summary(report, out_dir)
        print_curve(report, path)
        return 0
    ap.print_help()
    return 0
356+
357+
358+
if __name__ == "__main__":  # pragma: no cover
    # Script entry point: exit the process with main()'s return code.
    sys.exit(main())

0 commit comments

Comments
 (0)