Skip to content

Commit 3084304

Browse files
committed
Prevent intercept adjustments in differentially private (DP) models
1 parent c933900 commit 3084304

File tree

1 file changed

+126
-121
lines changed
  • python/interpret-core/interpret/glassbox/_ebm

1 file changed

+126
-121
lines changed

python/interpret-core/interpret/glassbox/_ebm/_ebm.py

Lines changed: 126 additions & 121 deletions
Original file line numberDiff line numberDiff line change
@@ -921,53 +921,20 @@ def fit(self, X, y, sample_weight=None, bags=None, init_score=None):
921921
provider = JobLibProvider(n_jobs=self.n_jobs)
922922

923923
bagged_intercept = None
924-
if n_classes == Native.Task_MonoClassification:
925-
bagged_intercept = np.full((self.outer_bags, 1), -np.inf, np.float64)
926-
intercept_correction = None
927-
elif objective_code == Native.Objective_Rmse:
928-
bagged_intercept = np.empty((self.outer_bags, 1), np.float64)
929-
930-
# RMSE is very special and we can do closed form even with init_scores
931-
y_shifted = y if init_score is None else y - init_score
924+
if not is_differential_privacy:
925+
if n_classes == Native.Task_MonoClassification:
926+
bagged_intercept = np.full((self.outer_bags, 1), -np.inf, np.float64)
927+
intercept_correction = None
928+
elif objective_code == Native.Objective_Rmse:
929+
bagged_intercept = np.empty((self.outer_bags, 1), np.float64)
932930

933-
for idx in range(self.outer_bags):
934-
bag = internal_bags[idx]
935-
sample_weight_local = sample_weight
936-
y_local = y_shifted
937-
if bag is not None:
938-
include_samples = 0 < bag
939-
y_local = y_local[include_samples]
940-
if sample_weight_local is None:
941-
sample_weight_local = bag[include_samples]
942-
else:
943-
sample_weight_local = (
944-
sample_weight_local[include_samples] * bag[include_samples]
945-
)
946-
947-
bagged_intercept[idx, :] = np.average(
948-
y_local, weights=sample_weight_local
949-
)
950-
951-
sample_weight_local = sample_weight
952-
y_local = y_shifted
953-
if visible_samples is not None:
954-
y_local = y_local[visible_samples]
955-
if sample_weight_local is not None:
956-
sample_weight_local = sample_weight_local[visible_samples]
957-
958-
intercept_correction = np.average(y_local, weights=sample_weight_local)
959-
intercept_correction -= bagged_intercept.mean(axis=0)
960-
elif init_score is None:
961-
if (
962-
objective_code == Native.Objective_LogLossBinary
963-
or objective_code == Native.Objective_LogLossMulticlass
964-
):
965-
bagged_intercept = np.empty((self.outer_bags, n_scores), np.float64)
931+
# RMSE is very special and we can do closed form even with init_scores
932+
y_shifted = y if init_score is None else y - init_score
966933

967934
for idx in range(self.outer_bags):
968935
bag = internal_bags[idx]
969936
sample_weight_local = sample_weight
970-
y_local = y
937+
y_local = y_shifted
971938
if bag is not None:
972939
include_samples = 0 < bag
973940
y_local = y_local[include_samples]
@@ -979,26 +946,61 @@ def fit(self, X, y, sample_weight=None, bags=None, init_score=None):
979946
* bag[include_samples]
980947
)
981948

982-
probs = np.bincount(y_local, weights=sample_weight_local)
983-
total = probs.sum()
984-
probs = probs.astype(np.float64, copy=False)
985-
probs /= total
986-
bagged_intercept[idx, :] = link_func(probs, link, link_param)
949+
bagged_intercept[idx, :] = np.average(
950+
y_local, weights=sample_weight_local
951+
)
987952

988953
sample_weight_local = sample_weight
989-
y_local = y
954+
y_local = y_shifted
990955
if visible_samples is not None:
991956
y_local = y_local[visible_samples]
992957
if sample_weight_local is not None:
993958
sample_weight_local = sample_weight_local[visible_samples]
994959

995-
probs = np.bincount(y_local, weights=sample_weight_local)
996-
total = probs.sum()
997-
probs = probs.astype(np.float64, copy=False)
998-
probs /= total
999-
1000-
intercept_correction = link_func(probs, link, link_param)
960+
intercept_correction = np.average(y_local, weights=sample_weight_local)
1001961
intercept_correction -= bagged_intercept.mean(axis=0)
962+
elif init_score is None:
963+
if (
964+
objective_code == Native.Objective_LogLossBinary
965+
or objective_code == Native.Objective_LogLossMulticlass
966+
):
967+
bagged_intercept = np.empty((self.outer_bags, n_scores), np.float64)
968+
969+
for idx in range(self.outer_bags):
970+
bag = internal_bags[idx]
971+
sample_weight_local = sample_weight
972+
y_local = y
973+
if bag is not None:
974+
include_samples = 0 < bag
975+
y_local = y_local[include_samples]
976+
if sample_weight_local is None:
977+
sample_weight_local = bag[include_samples]
978+
else:
979+
sample_weight_local = (
980+
sample_weight_local[include_samples]
981+
* bag[include_samples]
982+
)
983+
984+
probs = np.bincount(y_local, weights=sample_weight_local)
985+
total = probs.sum()
986+
probs = probs.astype(np.float64, copy=False)
987+
probs /= total
988+
bagged_intercept[idx, :] = link_func(probs, link, link_param)
989+
990+
sample_weight_local = sample_weight
991+
y_local = y
992+
if visible_samples is not None:
993+
y_local = y_local[visible_samples]
994+
if sample_weight_local is not None:
995+
sample_weight_local = sample_weight_local[visible_samples]
996+
997+
probs = np.bincount(y_local, weights=sample_weight_local)
998+
total = probs.sum()
999+
probs = probs.astype(np.float64, copy=False)
1000+
probs /= total
1001+
1002+
intercept_correction = link_func(probs, link, link_param)
1003+
intercept_correction -= bagged_intercept.mean(axis=0)
10021004

10031005
if bagged_intercept is None:
10041006
# TODO: get the intercept for these non-default options by boosting on the intercept
@@ -1392,79 +1394,82 @@ def fit(self, X, y, sample_weight=None, bags=None, init_score=None):
13921394
bagged_intercept, bagged_scores, bin_weights, bag_weights
13931395
)
13941396

1395-
if objective_code == Native.Objective_Rmse:
1396-
scores = ebm_predict_scores(
1397-
X,
1398-
n_samples,
1399-
feature_names_in,
1400-
feature_types_in,
1401-
bins,
1402-
intercept,
1403-
term_scores,
1404-
term_features,
1405-
init_score,
1406-
)
1397+
if not is_differential_privacy:
1398+
if objective_code == Native.Objective_Rmse:
1399+
scores = ebm_predict_scores(
1400+
X,
1401+
n_samples,
1402+
feature_names_in,
1403+
feature_types_in,
1404+
bins,
1405+
intercept,
1406+
term_scores,
1407+
term_features,
1408+
init_score,
1409+
)
14071410

1408-
sample_weight_local = sample_weight
1409-
y_local = y
1410-
if visible_samples is not None:
1411-
scores = scores[visible_samples]
1412-
y_local = y_local[visible_samples]
1413-
if sample_weight_local is not None:
1414-
sample_weight_local = sample_weight_local[visible_samples]
1415-
1416-
correction = np.average(y_local - scores, weights=sample_weight_local)
1417-
intercept += correction
1418-
bagged_intercept += correction
1419-
elif (
1420-
objective_code == Native.Objective_LogLossBinary
1421-
or objective_code == Native.Objective_LogLossMulticlass
1422-
):
1423-
scores = ebm_predict_scores(
1424-
X,
1425-
n_samples,
1426-
feature_names_in,
1427-
feature_types_in,
1428-
bins,
1429-
intercept,
1430-
term_scores,
1431-
term_features,
1432-
init_score,
1433-
)
1411+
sample_weight_local = sample_weight
1412+
y_local = y
1413+
if visible_samples is not None:
1414+
scores = scores[visible_samples]
1415+
y_local = y_local[visible_samples]
1416+
if sample_weight_local is not None:
1417+
sample_weight_local = sample_weight_local[visible_samples]
14341418

1435-
sample_weight_local = sample_weight
1436-
y_local = y
1437-
if visible_samples is not None:
1438-
scores = scores[visible_samples]
1439-
y_local = y_local[visible_samples]
1440-
if sample_weight_local is not None:
1441-
sample_weight_local = sample_weight_local[visible_samples]
1442-
1443-
probs = np.bincount(y_local, weights=sample_weight_local)
1444-
total = probs.sum()
1445-
probs = probs.astype(np.float64, copy=False)
1446-
probs /= total
1447-
actual_scores = link_func(probs, link, link_param)
1448-
1449-
n_correction_iterations = 25
1450-
for _ in range(n_correction_iterations):
1451-
pred_prob = inv_link(scores, link, link_param)
1452-
pred_prob = np.average(pred_prob, axis=0, weights=sample_weight_local)
1453-
pred_scores = link_func(pred_prob, link, link_param)
1454-
correction = actual_scores - pred_scores
1419+
correction = np.average(y_local - scores, weights=sample_weight_local)
14551420
intercept += correction
14561421
bagged_intercept += correction
1457-
scores += correction
1458-
1459-
if bagged_intercept.ndim == 2:
1460-
# multiclass
1461-
# pick the class that we're going to zero
1462-
zero_index = np.argmax(intercept)
1463-
intercept -= intercept[zero_index]
1464-
bagged_intercept -= np.expand_dims(
1465-
bagged_intercept[..., zero_index], -1
1422+
elif (
1423+
objective_code == Native.Objective_LogLossBinary
1424+
or objective_code == Native.Objective_LogLossMulticlass
1425+
):
1426+
scores = ebm_predict_scores(
1427+
X,
1428+
n_samples,
1429+
feature_names_in,
1430+
feature_types_in,
1431+
bins,
1432+
intercept,
1433+
term_scores,
1434+
term_features,
1435+
init_score,
14661436
)
14671437

1438+
sample_weight_local = sample_weight
1439+
y_local = y
1440+
if visible_samples is not None:
1441+
scores = scores[visible_samples]
1442+
y_local = y_local[visible_samples]
1443+
if sample_weight_local is not None:
1444+
sample_weight_local = sample_weight_local[visible_samples]
1445+
1446+
probs = np.bincount(y_local, weights=sample_weight_local)
1447+
total = probs.sum()
1448+
probs = probs.astype(np.float64, copy=False)
1449+
probs /= total
1450+
actual_scores = link_func(probs, link, link_param)
1451+
1452+
n_correction_iterations = 25
1453+
for _ in range(n_correction_iterations):
1454+
pred_prob = inv_link(scores, link, link_param)
1455+
pred_prob = np.average(
1456+
pred_prob, axis=0, weights=sample_weight_local
1457+
)
1458+
pred_scores = link_func(pred_prob, link, link_param)
1459+
correction = actual_scores - pred_scores
1460+
intercept += correction
1461+
bagged_intercept += correction
1462+
scores += correction
1463+
1464+
if bagged_intercept.ndim == 2:
1465+
# multiclass
1466+
# pick the class that we're going to zero
1467+
zero_index = np.argmax(intercept)
1468+
intercept -= intercept[zero_index]
1469+
bagged_intercept -= np.expand_dims(
1470+
bagged_intercept[..., zero_index], -1
1471+
)
1472+
14681473
if n_classes < Native.Task_GeneralClassification:
14691474
# scikit-learn requires intercept to be float for RegressorMixin, not numpy
14701475
intercept = float(intercept[0])

0 commit comments

Comments
 (0)