|
| 1 | +"""EXP-2868: Do EGP / fasting-window findings change when we gate carb |
| 2 | +events to real carb events (>=5g) instead of any non-zero carb? |
| 3 | +
|
| 4 | +Prior EGP experiments (EXP-2739/2740/2757/2758) and the stratified |
| 5 | +basal experiment (EXP-2865) all declare "fasting equilibrium" using |
| 6 | +`time_since_carb_min >= 240`. But `time_since_carb_min` is reset by |
| 7 | +ANY carb event, including the 30% of <5g events that EXP-2866 |
| 8 | +identified as treat-of-low / detector noise. |
| 9 | +
|
| 10 | +For messy-log patients (EXP-2866: patient `b` 79% <5g, 38/day), |
| 11 | +noise-carb events never let `time_since_carb_min` grow to 240 min. |
| 12 | +Those patients may have been entirely excluded from EGP/basal |
| 13 | +findings — a coverage gap, not a value bias. |
| 14 | +
|
| 15 | +This experiment: |
| 16 | + 1. Recompute `time_since_real_carb_min` where resets happen only |
| 17 | + on events with carbs >= REAL_CARB_EVENT_THRESHOLD_G (5 g). |
| 18 | + 2. Apply the EXP-2865 fasting+equilibrium filter under both |
| 19 | + definitions. Compare row counts and patient coverage. |
| 20 | + 3. For each patient, compute median net glucose drift |
| 21 | + (mg/dL / 5min) in fasting equilibrium — the "ISF-independent |
| 22 | + EGP" proxy from EXP-2758. Compare naive vs real-carb gating. |
| 23 | + 4. Check whether `b` and `odc-39819048` gain fasting coverage. |
| 24 | +""" |
| 25 | +from __future__ import annotations |
| 26 | + |
| 27 | +import json |
| 28 | +import sys |
| 29 | +from pathlib import Path |
| 30 | + |
| 31 | +import numpy as np |
| 32 | +import pandas as pd |
| 33 | + |
| 34 | +REPO = Path(__file__).resolve().parents[2] |
| 35 | +sys.path.insert(0, str(REPO)) |
| 36 | + |
| 37 | +from tools.cgmencode.production.meal_filter import ( |
| 38 | + REAL_CARB_EVENT_THRESHOLD_G, |
| 39 | +) |
| 40 | + |
| 41 | +GRID = REPO / "externals" / "ns-parquet" / "training" / "grid.parquet" |
| 42 | +OUT = REPO / "externals" / "experiments" |
| 43 | +OUT.mkdir(parents=True, exist_ok=True) |
| 44 | + |
| 45 | + |
| 46 | +def _build_real_tsc(g: pd.DataFrame) -> pd.Series: |
| 47 | + """Per-patient: time_since_real_carb_min (resets only on carbs >= 5g).""" |
| 48 | + g = g.sort_values(["patient_id", "time"]).reset_index(drop=True) |
| 49 | + # event_time marks rows with a real-carb event |
| 50 | + real_event = g["carbs"] >= REAL_CARB_EVENT_THRESHOLD_G |
| 51 | + # last real carb time per row via forward-fill |
| 52 | + last_real_time = g["time"].where(real_event).groupby(g["patient_id"]).ffill() |
| 53 | + delta = (g["time"] - last_real_time).dt.total_seconds() / 60.0 |
| 54 | + return delta |
| 55 | + |
| 56 | + |
| 57 | +def main() -> None: |
| 58 | + g = pd.read_parquet(GRID) |
| 59 | + g["time"] = pd.to_datetime(g["time"], utc=True) |
| 60 | + g["time_since_real_carb_min"] = _build_real_tsc(g) |
| 61 | + |
| 62 | + # common filters |
| 63 | + base = ( |
| 64 | + (g["cob"].fillna(0) == 0) |
| 65 | + & (g["time_since_bolus_min"].fillna(0) >= 240) |
| 66 | + & (g["exercise_active"].fillna(False) == False) |
| 67 | + & (g["override_active"].fillna(False) == False) |
| 68 | + & g["glucose_roc"].notna() |
| 69 | + & g["actual_basal_rate"].notna() |
| 70 | + & g["scheduled_basal_rate"].notna() |
| 71 | + & (g["scheduled_basal_rate"] > 0) |
| 72 | + ) |
| 73 | + # equilibrium |
| 74 | + equil = base & (g["glucose_roc"].abs() <= 0.5) |
| 75 | + |
| 76 | + naive_fasting = equil & (g["time_since_carb_min"].fillna(0) >= 240) |
| 77 | + real_fasting = equil & (g["time_since_real_carb_min"].fillna(0) >= 240) |
| 78 | + |
| 79 | + def summarize(mask: pd.Series, label: str) -> dict: |
| 80 | + sub = g.loc[mask].copy() |
| 81 | + per_pat = ( |
| 82 | + sub.groupby("patient_id") |
| 83 | + .agg( |
| 84 | + n_rows=("glucose", "size"), |
| 85 | + median_drift=("glucose_roc", "median"), |
| 86 | + median_basal_mult=( |
| 87 | + "actual_basal_rate", |
| 88 | + lambda s: float( |
| 89 | + (s / g.loc[s.index, "scheduled_basal_rate"]).median() |
| 90 | + ), |
| 91 | + ), |
| 92 | + ) |
| 93 | + .reset_index() |
| 94 | + ) |
| 95 | + return { |
| 96 | + "label": label, |
| 97 | + "n_rows": int(mask.sum()), |
| 98 | + "n_patients": int(sub["patient_id"].nunique()), |
| 99 | + "cohort_median_drift_mg_dl_per_5min": float(per_pat["median_drift"].median()) |
| 100 | + if len(per_pat) |
| 101 | + else None, |
| 102 | + "cohort_iqr_drift": [ |
| 103 | + float(per_pat["median_drift"].quantile(0.25)) |
| 104 | + if len(per_pat) |
| 105 | + else None, |
| 106 | + float(per_pat["median_drift"].quantile(0.75)) |
| 107 | + if len(per_pat) |
| 108 | + else None, |
| 109 | + ], |
| 110 | + "cohort_median_basal_mult": float(per_pat["median_basal_mult"].median()) |
| 111 | + if len(per_pat) |
| 112 | + else None, |
| 113 | + "patients_covered": sorted(per_pat["patient_id"].tolist()), |
| 114 | + "_per_patient": per_pat, |
| 115 | + } |
| 116 | + |
| 117 | + naive = summarize(naive_fasting, "naive (any carb reset)") |
| 118 | + real = summarize(real_fasting, "real (>=5g reset only)") |
| 119 | + |
| 120 | + covered_naive = set(naive["patients_covered"]) |
| 121 | + covered_real = set(real["patients_covered"]) |
| 122 | + newly_covered = sorted(covered_real - covered_naive) |
| 123 | + lost = sorted(covered_naive - covered_real) |
| 124 | + |
| 125 | + # per-patient delta for patients present in both |
| 126 | + merged = naive["_per_patient"].merge( |
| 127 | + real["_per_patient"], on="patient_id", suffixes=("_naive", "_real") |
| 128 | + ) |
| 129 | + merged["drift_delta"] = merged["median_drift_real"] - merged["median_drift_naive"] |
| 130 | + merged["basal_mult_delta"] = ( |
| 131 | + merged["median_basal_mult_real"] - merged["median_basal_mult_naive"] |
| 132 | + ) |
| 133 | + merged["row_ratio"] = merged["n_rows_real"] / merged["n_rows_naive"].clip(lower=1) |
| 134 | + |
| 135 | + summary = { |
| 136 | + "exp": "EXP-2868", |
| 137 | + "method": ( |
| 138 | + "Redefine time_since_real_carb_min to reset only on carbs>=" |
| 139 | + f"{REAL_CARB_EVENT_THRESHOLD_G}g. Apply EXP-2865 fasting+" |
| 140 | + "equilibrium filter. Compare cohort EGP-proxy (median drift) " |
| 141 | + "and basal multiplier under naive vs real-carb gating." |
| 142 | + ), |
| 143 | + "naive": {k: v for k, v in naive.items() if k != "_per_patient"}, |
| 144 | + "real": {k: v for k, v in real.items() if k != "_per_patient"}, |
| 145 | + "coverage": { |
| 146 | + "newly_covered_patients": newly_covered, |
| 147 | + "lost_patients": lost, |
| 148 | + "n_both": len(merged), |
| 149 | + }, |
| 150 | + "per_patient_delta": { |
| 151 | + "median_drift_delta_mg_dl_per_5min": float(merged["drift_delta"].median()) |
| 152 | + if len(merged) |
| 153 | + else None, |
| 154 | + "max_abs_drift_delta": float(merged["drift_delta"].abs().max()) |
| 155 | + if len(merged) |
| 156 | + else None, |
| 157 | + "median_basal_mult_delta": float(merged["basal_mult_delta"].median()) |
| 158 | + if len(merged) |
| 159 | + else None, |
| 160 | + "median_row_ratio_real_over_naive": float(merged["row_ratio"].median()) |
| 161 | + if len(merged) |
| 162 | + else None, |
| 163 | + }, |
| 164 | + "messy_log_patients_check": { |
| 165 | + "b_in_naive": "b" in covered_naive, |
| 166 | + "b_in_real": "b" in covered_real, |
| 167 | + "odc-39819048_in_naive": "odc-39819048" in covered_naive, |
| 168 | + "odc-39819048_in_real": "odc-39819048" in covered_real, |
| 169 | + }, |
| 170 | + } |
| 171 | + |
| 172 | + merged.to_parquet(OUT / "exp-2868_fasting_compare.parquet", index=False) |
| 173 | + with open(OUT / "exp-2868_summary.json", "w") as f: |
| 174 | + json.dump(summary, f, indent=2) |
| 175 | + print(json.dumps(summary, indent=2)) |
| 176 | + |
| 177 | + |
| 178 | +if __name__ == "__main__": |
| 179 | + main() |
0 commit comments