Skip to content

Commit 2d405c1

Browse files
committed
small optimizations in unify_columns
1 parent ad5e863 commit 2d405c1

2 files changed

Lines changed: 22 additions & 10 deletions

File tree

python/interpret-core/interpret/glassbox/_ebm/_bin.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,10 @@
1919

2020

2121
def eval_terms(X, n_samples, feature_names_in, feature_types_in, bins, term_features):
22+
# TODO: modify this function to do a single sweep of the term_features where
23+
# we cache extracting the raw data from the dataframe and also cache the discretized
24+
# values using a dict with keys (feature_index, id(feature_bins)).
25+
2226
# prior to calling this function, call remove_extra_bins which will eliminate extra work in this function
2327

2428
# This generator function returns data as the feature data within terms gets read. Normally for

python/interpret-core/interpret/utils/_clean_x.py

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -419,7 +419,8 @@ def _densify_object_ndarray(X_col):
419419
if is_float_conversion:
420420
# TODO: handle ints here too which need to be checked if they are larger than the safe int max value
421421

422-
X_col = X_col.copy()
422+
if not X_col.flags.owndata:
423+
X_col = X_col.copy() # we place into this array below so we need to own it
423424
places = np.fromiter(
424425
map(isinstance, X_col, repeat(float)), np.bool_, count=len(X_col)
425426
)
@@ -480,7 +481,7 @@ def _process_column_initial(X_col, nonmissings, processing, min_unique_continuou
480481
if issubclass(X_col.dtype.type, np.floating):
481482
missings = np.isnan(X_col)
482483
if missings.any():
483-
nonmissings = ~missings
484+
nonmissings = np.logical_not(missings, out=missings)
484485
X_col = X_col[nonmissings]
485486
elif X_col.dtype.type is np.object_:
486487
X_col = _densify_object_ndarray(X_col)
@@ -575,7 +576,7 @@ def _encode_categorical_existing(X_col, nonmissings):
575576
if issubclass(X_col.dtype.type, np.floating):
576577
missings = np.isnan(X_col)
577578
if missings.any():
578-
nonmissings = ~missings
579+
nonmissings = np.logical_not(missings, out=missings)
579580
X_col = X_col[nonmissings]
580581
elif X_col.dtype.type is np.object_:
581582
X_col = _densify_object_ndarray(X_col)
@@ -881,12 +882,15 @@ def _process_numpy_column(X_col, is_initial, feature_type, min_unique_continuous
881882

882883
if X_col.dtype.type is np.object_:
883884
if _pandas_installed:
884-
# pandas also has the pd.NA value that indicates missing. If Pandas is available though
885-
# we can use it's function that checks for pd.NA, np.nan, and None
885+
# pandas also has the pd.NA value that indicates missing. If Pandas is
886+
# available we can use the pd.notna function that checks for
887+
# pd.NA, np.nan, math.nan, and None. pd.notna is also faster than the
888+
# alternative (X_col == X_col) & (X_col != np.array(None)) below
886889
nonmissings2 = pd.notna(X_col)
887890
else:
888891
# X_col == X_col is a check for nan that works even with mixed types, since nan != nan
889-
nonmissings2 = np.logical_and(X_col != _none_ndarray, X_col == X_col)
892+
nonmissings2 = X_col == X_col
893+
nonmissings2 &= X_col != _none_ndarray
890894
if not nonmissings2.all():
891895
X_col = X_col[nonmissings2]
892896
if nonmissings is None:
@@ -924,7 +928,7 @@ def _process_pandas_column(X_col, is_initial, feature_type, min_unique_continuou
924928
)
925929
elif isinstance(X_col.dtype, pd.CategoricalDtype):
926930
# unlike other missing value types, we get back -1's for missing here, so no need to drop them
927-
X_col = X_col.values
931+
X_col = X_col.values # pandas 1.0 introduced .cat but .values is older
928932
is_ordered = X_col.ordered
929933
pd_categories = X_col.categories.values.astype(dtype=np.str_, copy=False)
930934
X_col = X_col.codes
@@ -962,6 +966,7 @@ def _process_pandas_column(X_col, is_initial, feature_type, min_unique_continuou
962966
nonmissings = X_col.notna().values
963967
X_col = X_col.dropna()
964968
X_col = X_col.values
969+
# if X_col is a special type like UInt64Dtype convert it to numpy
965970
X_col = X_col.astype(dtype=X_col.dtype.type, copy=False)
966971
return _process_ndarray(
967972
X_col, nonmissings, is_initial, feature_type, min_unique_continuous
@@ -981,12 +986,15 @@ def _process_sparse_column(X_col, is_initial, feature_type, min_unique_continuou
981986
nonmissings = None
982987
if X_col.dtype.type is np.object_:
983988
if _pandas_installed:
984-
# pandas also has the pd.NA value that indicates missing. If Pandas is available though
985-
# we can use it's function that checks for pd.NA, np.nan, and None
989+
# pandas also has the pd.NA value that indicates missing. If Pandas is
990+
# available we can use the pd.notna function that checks for
991+
# pd.NA, np.nan, math.nan, and None. pd.notna is also faster than the
992+
# alternative (X_col == X_col) & (X_col != np.array(None)) below
986993
nonmissings = pd.notna(X_col)
987994
else:
988995
# X_col == X_col is a check for nan that works even with mixed types, since nan != nan
989-
nonmissings = np.logical_and(X_col != _none_ndarray, X_col == X_col)
996+
nonmissings = X_col == X_col
997+
nonmissings &= X_col != _none_ndarray
990998

991999
if nonmissings.all():
9921000
nonmissings = None

0 commit comments

Comments (0)