@@ -419,7 +419,8 @@ def _densify_object_ndarray(X_col):
419419 if is_float_conversion :
420420 # TODO: also handle ints here; they need to be checked for being larger than the safe int max value
421421
422- X_col = X_col .copy ()
422+ if not X_col .flags .owndata :
423+ X_col = X_col .copy () # we place into this array below so we need to own it
423424 places = np .fromiter (
424425 map (isinstance , X_col , repeat (float )), np .bool_ , count = len (X_col )
425426 )
@@ -480,7 +481,7 @@ def _process_column_initial(X_col, nonmissings, processing, min_unique_continuou
480481 if issubclass (X_col .dtype .type , np .floating ):
481482 missings = np .isnan (X_col )
482483 if missings .any ():
483- nonmissings = ~ missings
484+ nonmissings = np . logical_not ( missings , out = missings )
484485 X_col = X_col [nonmissings ]
485486 elif X_col .dtype .type is np .object_ :
486487 X_col = _densify_object_ndarray (X_col )
@@ -575,7 +576,7 @@ def _encode_categorical_existing(X_col, nonmissings):
575576 if issubclass (X_col .dtype .type , np .floating ):
576577 missings = np .isnan (X_col )
577578 if missings .any ():
578- nonmissings = ~ missings
579+ nonmissings = np . logical_not ( missings , out = missings )
579580 X_col = X_col [nonmissings ]
580581 elif X_col .dtype .type is np .object_ :
581582 X_col = _densify_object_ndarray (X_col )
@@ -881,12 +882,15 @@ def _process_numpy_column(X_col, is_initial, feature_type, min_unique_continuous
881882
882883 if X_col .dtype .type is np .object_ :
883884 if _pandas_installed :
884- # pandas also has the pd.NA value that indicates missing. If Pandas is available though
885- # we can use it's function that checks for pd.NA, np.nan, and None
885+ # pandas also has the pd.NA value that indicates missing. If Pandas is
886+ # available we can use the pd.notna function that checks for
887+ # pd.NA, np.nan, math.nan, and None. pd.notna is also faster than the
888+ # alternative (X_col == X_col) & (X_col != np.array(None)) below
886889 nonmissings2 = pd .notna (X_col )
887890 else :
888891 # X_col == X_col is a check for nan that works even with mixed types, since nan != nan
889- nonmissings2 = np .logical_and (X_col != _none_ndarray , X_col == X_col )
892+ nonmissings2 = X_col == X_col
893+ nonmissings2 &= X_col != _none_ndarray
890894 if not nonmissings2 .all ():
891895 X_col = X_col [nonmissings2 ]
892896 if nonmissings is None :
@@ -924,7 +928,7 @@ def _process_pandas_column(X_col, is_initial, feature_type, min_unique_continuou
924928 )
925929 elif isinstance (X_col .dtype , pd .CategoricalDtype ):
926930 # unlike other missing value types, we get back -1's for missing here, so no need to drop them
927- X_col = X_col .values
931+ X_col = X_col .values # pandas 1.0 introduced .cat but .values is older
928932 is_ordered = X_col .ordered
929933 pd_categories = X_col .categories .values .astype (dtype = np .str_ , copy = False )
930934 X_col = X_col .codes
@@ -962,6 +966,7 @@ def _process_pandas_column(X_col, is_initial, feature_type, min_unique_continuou
962966 nonmissings = X_col .notna ().values
963967 X_col = X_col .dropna ()
964968 X_col = X_col .values
969+ # if X_col is a special type like UInt64Dtype convert it to numpy
965970 X_col = X_col .astype (dtype = X_col .dtype .type , copy = False )
966971 return _process_ndarray (
967972 X_col , nonmissings , is_initial , feature_type , min_unique_continuous
@@ -981,12 +986,15 @@ def _process_sparse_column(X_col, is_initial, feature_type, min_unique_continuou
981986 nonmissings = None
982987 if X_col .dtype .type is np .object_ :
983988 if _pandas_installed :
984- # pandas also has the pd.NA value that indicates missing. If Pandas is available though
985- # we can use it's function that checks for pd.NA, np.nan, and None
989+ # pandas also has the pd.NA value that indicates missing. If Pandas is
990+ # available we can use the pd.notna function that checks for
991+ # pd.NA, np.nan, math.nan, and None. pd.notna is also faster than the
992+ # alternative (X_col == X_col) & (X_col != np.array(None)) below
986993 nonmissings = pd .notna (X_col )
987994 else :
988995 # X_col == X_col is a check for nan that works even with mixed types, since nan != nan
989- nonmissings = np .logical_and (X_col != _none_ndarray , X_col == X_col )
996+ nonmissings = X_col == X_col
997+ nonmissings &= X_col != _none_ndarray
990998
991999 if nonmissings .all ():
9921000 nonmissings = None
0 commit comments