churn/src/utils.py at master · mylastresort/churn · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import numpy as np
import pandas as pd

# Recognised values for the `num_impute` argument used in this module.
# In `_process`, "mean" and "median" fit a SimpleImputer with that strategy;
# any other value (including "drop") performs no imputation there.
# NOTE(review): "drop" presumably means rows with missing numerics are removed
# by the caller before `_process` runs — confirm against the call site.
NUM_IMPUTE_STRATEGIES = ("mean", "median", "drop")


def _impute_numerics(x_tr_raw, x_other_raw, num_cols, strategy):
    """
    Fill missing numeric values using statistics learned from the training split.

    A SimpleImputer configured with `strategy` ('mean' | 'median') is fitted on
    x_tr_raw[num_cols] only and then applied to x_other_raw[num_cols], so the
    second split never influences the fill values.

    Returns a 3-tuple: (imputed train DataFrame, imputed other DataFrame,
    fitted imputer). Both DataFrames contain just `num_cols` and keep the row
    index of the split they were derived from.
    """

    def _to_frame(values, like):
        # The imputer output is re-wrapped as a DataFrame so that the column
        # labels and the row index of the source split are carried over.
        return pd.DataFrame(values, columns=num_cols, index=like.index)

    fitted = SimpleImputer(strategy=strategy)

    train_filled = _to_frame(fitted.fit_transform(x_tr_raw[num_cols]), x_tr_raw)
    other_filled = _to_frame(fitted.transform(x_other_raw[num_cols]), x_other_raw)

    return train_filled, other_filled, fitted


def _signed_log1p_frame(frame, cols):
    """Return a new DataFrame holding sign(x) * log1p(|x|) of frame[cols]; `frame` is not modified."""
    values = frame[cols].values
    return pd.DataFrame(
        np.sign(values) * np.log1p(np.abs(values)),
        columns=cols,
        index=frame.index,
    )


def _process(x_tr_raw, x_other_raw, y_tr, num_cols, cat_cols, num_impute):
    """
    Fit the preprocessing steps on the training split and apply them to both splits.

    Steps, in order: optional numeric imputation -> signed log1p -> standard
    scaling -> one-hot encoding of categoricals. Every fitted object is fitted
    on x_tr_raw only.

    Parameters
    ----------
    x_tr_raw, x_other_raw : DataFrames containing at least `num_cols` and `cat_cols`.
    y_tr : not used by this function; kept so the call signature stays unchanged.
    num_cols : labels of the numeric columns.
    cat_cols : labels of the categorical columns. May be a list, tuple,
        pandas Index or ndarray; None or an empty collection skips encoding.
    num_impute : 'mean' or 'median' fits a SimpleImputer with that strategy.
        Any other value (e.g. 'drop') skips imputation in this function.

    Returns
    -------
    (x_tr, x_other, scaler, imputer, ohe) where `imputer` is None when no
    imputation was performed and `ohe` is None when there are no categoricals.
    """
    imputer = None
    ohe = None

    # ── Impute missing numerics (fit on train only) ───────────────────────────
    if num_impute in ("mean", "median"):
        x_tr_num_raw, x_other_num_raw, imputer = _impute_numerics(
            x_tr_raw, x_other_raw, num_cols, strategy=num_impute
        )
    else:
        # No imputation here: any NaNs present are passed on unchanged.
        # NOTE(review): presumably the caller has already dropped such rows
        # for the "drop" strategy — confirm against the call site.
        x_tr_num_raw = x_tr_raw[num_cols]
        x_other_num_raw = x_other_raw[num_cols]

    # ── Signed log1p transform — compress heavy tails ─────────────────────────
    # (the original author reports skewness up to 187 for this data).
    # Applied to every column in num_cols. The helper builds fresh frames, so
    # the caller's inputs are never mutated and no defensive copy is needed.
    x_tr_num_raw = _signed_log1p_frame(x_tr_num_raw, num_cols)
    x_other_num_raw = _signed_log1p_frame(x_other_num_raw, num_cols)

    # ── Scale numerics (fit on train only) ────────────────────────────────────
    scaler = StandardScaler()
    x_tr_num = pd.DataFrame(
        scaler.fit_transform(x_tr_num_raw),
        columns=num_cols,
        index=x_tr_raw.index,
    )
    x_other_num = pd.DataFrame(
        scaler.transform(x_other_num_raw),
        columns=num_cols,
        index=x_other_raw.index,
    )

    # ── Encode categoricals (fit on train only) ───────────────────────────────
    # Explicit length check instead of truthiness: `if cat_cols:` raises for a
    # pandas Index / ndarray of column names.
    if cat_cols is None or len(cat_cols) == 0:
        return x_tr_num, x_other_num, scaler, imputer, ohe

    # handle_unknown="ignore": categories seen only in the other split do not
    # raise at transform time.
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    ohe_tr = ohe.fit_transform(x_tr_raw[cat_cols])
    ohe_other = ohe.transform(x_other_raw[cat_cols])

    ohe_cols = ohe.get_feature_names_out(cat_cols).tolist()

    x_tr_cat = pd.DataFrame(ohe_tr, columns=ohe_cols, index=x_tr_raw.index)
    x_other_cat = pd.DataFrame(ohe_other, columns=ohe_cols, index=x_other_raw.index)

    x_tr = pd.concat([x_tr_num, x_tr_cat], axis=1)
    x_other = pd.concat([x_other_num, x_other_cat], axis=1)

    return x_tr, x_other, scaler, imputer, ohe


def _apply_num_transform(x_raw, num_cols, cat_cols, scaler, imputer, ohe, index):
    """
    Apply already-fitted imputer → signed log1p → scaler → OHE to a raw split
    (val or holdout). Nothing is fitted here and `x_raw` is not modified.

    Parameters
    ----------
    x_raw : DataFrame containing at least `num_cols` and `cat_cols`.
    num_cols : labels of the numeric columns.
    cat_cols : labels of the categorical columns. May be a list, tuple,
        pandas Index or ndarray; None or an empty collection skips encoding.
    scaler : fitted object exposing `transform` for the numeric block.
    imputer : fitted object exposing `transform`, or None to skip imputation.
    ohe : fitted encoder exposing `transform` and `get_feature_names_out`,
        or None to skip encoding.
    index : row index assigned to the returned frame(s).
        NOTE(review): presumably x_raw.index — confirm with the caller.

    Returns
    -------
    DataFrame with the scaled numeric columns, followed by the one-hot columns
    when both `cat_cols` is non-empty and `ohe` is provided.
    """
    # Impute (only when a fitted imputer was supplied)
    num_data = x_raw[num_cols]
    if imputer is not None:
        num_data = pd.DataFrame(
            imputer.transform(num_data),
            columns=num_cols,
            index=index,
        )

    # Signed log1p transform (must match _process). A fresh frame is built, so
    # the caller's data is left untouched without an extra defensive copy.
    raw_values = num_data.values
    num_data = pd.DataFrame(
        np.sign(raw_values) * np.log1p(np.abs(raw_values)),
        columns=num_cols,
        index=num_data.index,
    )

    # Scale numerics
    x_num = pd.DataFrame(
        scaler.transform(num_data),
        columns=num_cols,
        index=index,
    )

    # Categoricals — explicit length check instead of truthiness, because
    # `if cat_cols:` raises for a pandas Index / ndarray of column names.
    if ohe is None or cat_cols is None or len(cat_cols) == 0:
        return x_num

    ohe_arr = ohe.transform(x_raw[cat_cols])
    ohe_cols = ohe.get_feature_names_out(cat_cols).tolist()
    x_cat = pd.DataFrame(ohe_arr, columns=ohe_cols, index=index)
    return pd.concat([x_num, x_cat], axis=1)