|
| 1 | +"""EXP-2860 — Cross-validate bootstrap Simpson against other audition signals. |
| 2 | +
|
| 3 | +Bootstrap P(simpson) from EXP-2859 sorts patients into 3 groups: |
| 4 | +high-conf Simpson (P>=0.9), boundary (0.1<P<0.9), confidently clean |
| 5 | +(P<=0.1). Test whether these groups differ on OTHER audition inputs: |
| 6 | + - phenotype (EXP-2845) |
| 7 | + - structural basal step (β_slow proxy: actual_basal_s1 - actual_basal_s0 |
| 8 | + from EXP-2843) |
| 9 | + - SMB share (EXP-2845) |
| 10 | + - mean basal level (EXP-2853) |
| 11 | +
|
| 12 | +If groups differ on these features, Simpson is correlated with |
| 13 | +existing signals (partial duplicate). If groups are indistinguishable, |
| 14 | +Simpson is a genuinely orthogonal signal worth carrying. |
| 15 | +
|
| 16 | +Charter B compliant. |
| 17 | +""" |
| 18 | +from __future__ import annotations |
| 19 | + |
| 20 | +import json |
| 21 | +from pathlib import Path |
| 22 | + |
| 23 | +import numpy as np |
| 24 | +import pandas as pd |
| 25 | + |
| 26 | +REPO = Path(__file__).resolve().parents[2] |
| 27 | +EXPDIR = REPO / "externals" / "experiments" |
| 28 | +FIGDIR = REPO / "docs" / "60-research" / "figures" |
| 29 | + |
| 30 | + |
| 31 | +def main() -> None: |
| 32 | + boot = pd.read_parquet(EXPDIR / "exp-2859_bootstrap_simpson.parquet") |
| 33 | + decomp = pd.read_parquet(EXPDIR / "exp-2853_simpson_decomposition.parquet") |
| 34 | + state = pd.read_parquet(EXPDIR / "exp-2843_state_basal_coupling.parquet") |
| 35 | + route = pd.read_parquet(EXPDIR / "exp-2845_per_patient_route.parquet") |
| 36 | + |
| 37 | + # State-coupling features |
| 38 | + state["d_basal_state"] = state["actual_basal_s1"] - state["actual_basal_s0"] |
| 39 | + state["d_glucose_state"] = state["glucose_s1"] - state["glucose_s0"] |
| 40 | + |
| 41 | + df = ( |
| 42 | + boot[["patient_id", "p_simpson", "beta_fast_mean", "beta_slow_mean", |
| 43 | + "point_simpson"]] |
| 44 | + .merge(decomp[["patient_id", "mean_basal_uph", "frac_variance_within_window"]], |
| 45 | + on="patient_id", how="left") |
| 46 | + .merge(state[["patient_id", "d_basal_state", "d_glucose_state", |
| 47 | + "n_s0_cells", "n_s1_cells"]], |
| 48 | + on="patient_id", how="left") |
| 49 | + .merge(route[["patient_id", "phenotype", "d_basal_uph", "d_smb_uph", |
| 50 | + "smb_share_s1"]], |
| 51 | + on="patient_id", how="left") |
| 52 | + ) |
| 53 | + |
| 54 | + # Categorize |
| 55 | + df["band"] = pd.cut( |
| 56 | + df["p_simpson"], |
| 57 | + bins=[-0.001, 0.1, 0.9, 1.001], |
| 58 | + labels=["clean", "boundary", "simpson"], |
| 59 | + ) |
| 60 | + |
| 61 | + summary = { |
| 62 | + "exp": "EXP-2860", |
| 63 | + "method": ( |
| 64 | + "Cross-tab bootstrap-Simpson bands (clean P<=0.1, boundary " |
| 65 | + "0.1<P<0.9, simpson P>=0.9) against other audition signals " |
| 66 | + "(phenotype, state-basal step, SMB share, mean basal)." |
| 67 | + ), |
| 68 | + "n_patients_total": int(len(df)), |
| 69 | + "by_band": {}, |
| 70 | + } |
| 71 | + for band, g in df.groupby("band"): |
| 72 | + summary["by_band"][str(band)] = { |
| 73 | + "n": int(len(g)), |
| 74 | + "phenotype_counts": ( |
| 75 | + g["phenotype"].value_counts(dropna=False).to_dict() |
| 76 | + if "phenotype" in g and g["phenotype"].notna().any() |
| 77 | + else {} |
| 78 | + ), |
| 79 | + "median_mean_basal_uph": ( |
| 80 | + float(g["mean_basal_uph"].median()) |
| 81 | + if g["mean_basal_uph"].notna().any() else None |
| 82 | + ), |
| 83 | + "median_d_basal_state_uph": ( |
| 84 | + float(g["d_basal_state"].median()) |
| 85 | + if g["d_basal_state"].notna().any() else None |
| 86 | + ), |
| 87 | + "median_d_glucose_state_mgdl": ( |
| 88 | + float(g["d_glucose_state"].median()) |
| 89 | + if g["d_glucose_state"].notna().any() else None |
| 90 | + ), |
| 91 | + "median_smb_share_s1": ( |
| 92 | + float(g["smb_share_s1"].median()) |
| 93 | + if "smb_share_s1" in g and g["smb_share_s1"].notna().any() else None |
| 94 | + ), |
| 95 | + "median_frac_variance_within_window": ( |
| 96 | + float(g["frac_variance_within_window"].median()) |
| 97 | + if g["frac_variance_within_window"].notna().any() else None |
| 98 | + ), |
| 99 | + } |
| 100 | + |
| 101 | + # Mann-Whitney clean vs boundary+simpson where N permits |
| 102 | + try: |
| 103 | + from scipy import stats as sst |
| 104 | + clean = df[df["band"] == "clean"] |
| 105 | + flagged = df[df["band"].isin(["boundary", "simpson"])] |
| 106 | + mw = {} |
| 107 | + for feat in ["mean_basal_uph", "d_basal_state", "d_glucose_state", |
| 108 | + "smb_share_s1", "frac_variance_within_window"]: |
| 109 | + a = clean[feat].dropna().to_numpy() if feat in clean else [] |
| 110 | + b = flagged[feat].dropna().to_numpy() if feat in flagged else [] |
| 111 | + if len(a) >= 4 and len(b) >= 4: |
| 112 | + u = sst.mannwhitneyu(a, b, alternative="two-sided") |
| 113 | + mw[feat] = { |
| 114 | + "p": float(u.pvalue), |
| 115 | + "n_clean": int(len(a)), |
| 116 | + "n_flagged": int(len(b)), |
| 117 | + } |
| 118 | + summary["mannwhitney_clean_vs_flagged"] = mw |
| 119 | + except Exception as e: # noqa: BLE001 |
| 120 | + summary["mw_error"] = str(e) |
| 121 | + |
| 122 | + df.to_parquet(EXPDIR / "exp-2860_simpson_xref.parquet", index=False) |
| 123 | + (EXPDIR / "exp-2860_summary.json").write_text(json.dumps(summary, indent=2, default=str)) |
| 124 | + |
| 125 | + # Visualization |
| 126 | + try: |
| 127 | + import matplotlib.pyplot as plt |
| 128 | + feats = [ |
| 129 | + ("mean_basal_uph", "mean basal (U/hr)"), |
| 130 | + ("d_basal_state", "Δ basal S1−S0 (U/hr)"), |
| 131 | + ("d_glucose_state", "Δ glucose S1−S0 (mg/dL)"), |
| 132 | + ("smb_share_s1", "SMB share in S1"), |
| 133 | + ] |
| 134 | + fig, axes = plt.subplots(1, 4, figsize=(18, 4.5)) |
| 135 | + bands = ["clean", "boundary", "simpson"] |
| 136 | + colors = {"clean": "#43AA8B", "boundary": "#F8961E", "simpson": "#F94144"} |
| 137 | + for ax, (feat, label) in zip(axes, feats): |
| 138 | + data = [] |
| 139 | + labels = [] |
| 140 | + for band in bands: |
| 141 | + vals = df.loc[df["band"] == band, feat].dropna() |
| 142 | + if len(vals) > 0: |
| 143 | + data.append(vals) |
| 144 | + labels.append(f"{band}\nn={len(vals)}") |
| 145 | + if data: |
| 146 | + bp = ax.boxplot(data, labels=labels, showfliers=False, patch_artist=True) |
| 147 | + for patch, band_label in zip(bp["boxes"], bands[: len(data)]): |
| 148 | + patch.set_facecolor(colors[band_label]) |
| 149 | + patch.set_alpha(0.7) |
| 150 | + ax.set_ylabel(label) |
| 151 | + p = summary.get("mannwhitney_clean_vs_flagged", {}).get(feat, {}).get("p") |
| 152 | + if p is not None: |
| 153 | + ax.set_title(f"{label}\nclean vs flagged p={p:.3f}", fontsize=10) |
| 154 | + else: |
| 155 | + ax.set_title(label, fontsize=10) |
| 156 | + fig.suptitle( |
| 157 | + "EXP-2860: bootstrap Simpson bands vs other audition signals", |
| 158 | + fontsize=12, |
| 159 | + ) |
| 160 | + plt.tight_layout() |
| 161 | + FIGDIR.mkdir(parents=True, exist_ok=True) |
| 162 | + fig.savefig(FIGDIR / "exp-2860_simpson_xref.png", dpi=120) |
| 163 | + plt.close(fig) |
| 164 | + except Exception as e: # noqa: BLE001 |
| 165 | + print("viz failed:", e) |
| 166 | + |
| 167 | + print(json.dumps(summary, indent=2, default=str)) |
| 168 | + |
| 169 | + |
| 170 | +if __name__ == "__main__": |
| 171 | + main() |
0 commit comments