226 changes: 217 additions & 9 deletions src/edvise/modeling/h2o_ml/inference.py
@@ -26,6 +26,196 @@
LOGGER = logging.getLogger(__name__)


def prepare_glm_enum_alignment_inputs(
    model: H2OEstimator,
    df: pd.DataFrame,
    background_data: t.Optional[pd.DataFrame],
    *,
    cip_like_cols: t.Optional[t.Sequence[str]] = None,
) -> t.Tuple[pd.DataFrame, t.Optional[pd.DataFrame]]:
    """
    GLM-only preprocessing to avoid H2O domain issues when computing
    contributions with a background_frame:
    1) Normalize CIP-like columns by stripping trailing '.0' (e.g. '131202.0' -> '131202')
    2) Recode Enum levels unseen in the model's training domains to a baseline category

    Returns:
        (df_proc, bg_proc)
    """
    if cip_like_cols is None:
        cip_like_cols = (
            "term_program_of_study",
            "program_of_study_year_1",
            "program_of_study_term_1",
        )

    out = model._model_json["output"]
    names = list(out["names"])
    col_types = list(out["column_types"])
    domains = list(out["domains"])

    params = model.actual_params
    response = (
        (out.get("response_column") or {}).get("name")
        or params.get("response_column")
        or params.get("y")
    )

    non_predictors = set()
    if response:
        non_predictors.add(response)
    for k in ("weights_column", "offset_column", "fold_column"):
        v = params.get(k)
        if isinstance(v, dict):
            v = v.get("column_name")
        if v:
            non_predictors.add(v)

    enum_predictors = [
        (col, domains[idx])
        for idx, (col, typ) in enumerate(zip(names, col_types))
        if typ == "Enum" and col not in non_predictors and domains[idx] is not None
    ]
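    # Illustrative shape of enum_predictors (hypothetical values; the real
    # domains come from the model JSON), e.g.:
    #   [("term_program_of_study", ["131202", "240101", ...]),
    #    ("program_of_study_year_1", ["520301", ...])]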

    def _normalize_cip_like_columns(
        df_in: t.Optional[pd.DataFrame], label: str
    ) -> t.Optional[pd.DataFrame]:
        if df_in is None:
            return None

        df_out = df_in.copy()
        for col in cip_like_cols:
            if col not in df_out.columns:
                continue

            s = df_out[col].astype(str)

            # only values that look like '123456.0' / '123456.000'
            mask = s.str.match(r"^-?\d+\.0+$")
            if not mask.any():
                continue

            before = s[mask].head().tolist()
            s.loc[mask] = s.loc[mask].str.replace(r"\.0+$", "", regex=True)
            after = s[mask].head().tolist()
            df_out[col] = s

            LOGGER.info(
                "[%s] Normalized CIP-like column '%s': %d value(s) stripped '.0'; "
                "examples before=%s after=%s",
                label,
                col,
                int(mask.sum()),
                before,
                after,
            )
        return df_out
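    # Worked example for the helper above (hypothetical values):
    #   ["131202.0", "520301.000", "24.0101"] -> ["131202", "520301", "24.0101"]
    # Only all-zero decimal tails match r"^-?\d+\.0+$", so genuine CIP codes
    # with nonzero decimals (e.g. "24.0101") are left untouched.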

    def _pick_mode_in_domain(
        series: pd.Series,
        train_dom_set: set[str],
    ) -> t.Optional[str]:
        """
        Pick the most frequent value in `series` that is also in the training domain.
        Returns None if no in-domain values exist.
        """
        if series is None:
            return None

        s = series.astype(str)
        in_dom = s[s.isin(train_dom_set)]
        if in_dom.empty:
            return None

        # Try mode first
        try:
            mode_vals = in_dom.mode(dropna=True)
            if not mode_vals.empty:
                val = mode_vals.iloc[0]
                return val if isinstance(val, str) else None
        except Exception:
            pass

        # Fallback: value_counts
        try:
            val = in_dom.value_counts(dropna=True).idxmax()
            return val if isinstance(val, str) else None
        except Exception:
            return None
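    # Example (hypothetical values): with ["A", "A", "B", "Z"] and training
    # domain {"A", "B"}, "Z" is filtered out and the in-domain mode "A" is
    # returned; with ["Z", "Q"] nothing is in-domain and the result is None.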

    def _recode_unseen_to_baseline(
        df_in: t.Optional[pd.DataFrame],
        label: str,
        *,
        baseline_source_df: t.Optional[pd.DataFrame] = None,
    ) -> t.Optional[pd.DataFrame]:
        """
        Recode values not in the model's training domain for Enum predictors.
        Baseline is chosen as:
        1) mode(in-domain) from `baseline_source_df[col]` if provided,
        2) else mode(in-domain) from `df_in[col]`,
        3) else train_dom[0].
        """
        if df_in is None:
            return None

        df_out = df_in.copy()

        for col, train_dom in enum_predictors:
            if col not in df_out.columns:
                continue

            train_dom = [str(x) for x in train_dom]
            train_dom_set = set(train_dom)

            s = df_out[col].astype(str)
            mask = ~s.isin(train_dom_set)
            if not mask.any():
                df_out[col] = s
                continue

            bad_vals = sorted(s[mask].unique())

            # Pick baseline using the most representative in-domain value
            baseline = None
            if baseline_source_df is not None and col in baseline_source_df.columns:
                baseline = _pick_mode_in_domain(baseline_source_df[col], train_dom_set)

            if baseline is None:
                baseline = _pick_mode_in_domain(s, train_dom_set)

            if baseline is None:
                baseline = train_dom[0]  # last-resort fallback

            LOGGER.warning(
                "[%s] Column '%s': %d value(s) outside training domain; "
                "example unseen values: %s. Recoding to baseline '%s'.",
                label,
                col,
                int(mask.sum()),
                bad_vals[:5],
                baseline,
            )
            s.loc[mask] = baseline
            df_out[col] = s

        return df_out
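    # Example (hypothetical domains/values): if the training domain for
    # "program_of_study_term_1" is {"131202", "240101"} and a row holds
    # "999999", that row is recoded to the in-domain mode of the baseline
    # source (or of df_in itself), falling back to train_dom[0] only when
    # no in-domain value exists anywhere.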

    df_proc = _normalize_cip_like_columns(df, "features")
    bg_proc = _normalize_cip_like_columns(background_data, "background")

    # Baseline selection strategy:
    # - When recoding features, prefer the background distribution (if present)
    # - When recoding background, prefer itself
    df_proc = _recode_unseen_to_baseline(
        df_proc, "features", baseline_source_df=bg_proc
    )
    bg_proc = _recode_unseen_to_baseline(
        bg_proc, "background", baseline_source_df=bg_proc
    )

    return df_proc, bg_proc
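# Usage sketch (hedged; `glm_model`, `scoring_df`, and `background_df` are
# illustrative names, not part of this module):
#
#     df_proc, bg_proc = prepare_glm_enum_alignment_inputs(
#         glm_model, scoring_df, background_df
#     )
#     # Both frames now contain only Enum levels seen at training time, so
#     # GLM predict_contributions with a background_frame should not hit
#     # unseen-level domain errors.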


def get_h2o_used_features(model: H2OEstimator) -> t.List[str]:
"""
Extracts the actual feature names used by the H2O model (excluding dropped/constant columns).
@@ -250,20 +440,38 @@ def compute_h2o_shap_contributions(

LOGGER.info("Preparing data for H2O inference...")

algo = getattr(model, "algo", None)
is_glm = algo == "glm"
LOGGER.info("algo=%s (is_glm=%s)", algo, is_glm)

df_proc = df.copy()
bg_proc = background_data.copy() if background_data is not None else None

# GLM-only alignment
# NOTE: GLMs due to not handle unseen categories well, so we need to impute
if is_glm:
if background_data is None:
raise ValueError(
"GLM predict_contributions requires background_data/background_frame."
)
else:
df_proc, bg_proc = prepare_glm_enum_alignment_inputs(
model, df_proc, bg_proc
)
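    # Why the guard matters (per the error above): GLM predict_contributions
    # needs a background frame, and unseen Enum levels in either frame can
    # trigger the domain mismatches the alignment helper is written to avoid;
    # aligning both frames first covers both failure modes.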

    # Convert features & background data to H2OFrames
    missing_flags = [c for c in df_proc.columns if c.endswith("_missing_flag")]
    h2o_features = utils._to_h2o(df_proc, force_enum_cols=missing_flags)

    if bg_proc is not None:
        missing_flags_bg = [c for c in bg_proc.columns if c.endswith("_missing_flag")]
        h2o_bd = utils._to_h2o(bg_proc, force_enum_cols=missing_flags_bg)
    else:
        h2o_bd = None

    # Select only the features the model actually uses
    used_features = get_h2o_used_features(model)
    LOGGER.info("Model used_features (%d)", len(used_features))

    if used_features:
        hf_subset = h2o_features[used_features]
        bg_subset = h2o_bd[used_features] if h2o_bd is not None else None
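        # Subsetting both frames to used_features keeps the columns aligned
        # with what the model was actually trained on; e.g. a constant column
        # H2O dropped during training never reaches predict_contributions.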