7272 remove_extra_bins ,
7373)
7474from ...utils ._shared_dataset import SharedDataset
75+ from ...utils ._measure_mem import total_bytes
7576
7677_log = logging .getLogger (__name__ )
7778
@@ -496,8 +497,8 @@ def fit(self, X, y, sample_weight=None, bags=None, init_score=None):
496497 """Fit model to provided samples.
497498
498499 Args:
499- X: NumPy array for training samples .
500- y: NumPy array as training labels .
500+ X: { array-like, sparse matrix} of shape (n_samples, n_features). Training data .
501+ y: array-like of shape (n_samples,). Target values .
501502 sample_weight: Optional array of weights per sample. Should be same length as X and y.
502503 bags: Optional bag definitions. The first dimension should have length equal to the number of samples.
503504 The second dimension should have length equal to the number of outer_bags. The contents should be
@@ -1695,18 +1696,33 @@ def fit(self, X, y, sample_weight=None, bags=None, init_score=None):
16951696
16961697 return self
16971698
1698- def estimate_mem (self , X , y = None ):
1699+ def estimate_mem (self , X , y = None , data_multiplier = 0.0 ):
16991700 """Estimate memory usage of the model.
17001701 Args:
1701- X: dataset
1702+ X: {array-like, sparse matrix} of shape (n_samples, n_features). Training data.
1703+ y: array-like of shape (n_samples,). Target values.
1704+ data_multiplier: The data in X needs to be allocated by the caller.
1705+ If data_multiplier is set to 0.0 then this function only estimates the additional
1706+ memory consumed by the fit function. If data_multiplier is set to 1.0 then
1707+ it will include the memory allocated to X by the caller. Often the caller will make
1708+ copies of X before calling fit, and in that case the data_multiplier could be set to a
1709+ value above 1.0 if the caller would like this function to include that in the memory estimate.
1710+
17021711 Returns:
17031712 Estimated memory usage in bytes.
17041713 The estimate does not include the memory from the
17051714 caller's copy of X, nor the process's code or other data.
17061715 The estimate will be more accurate for larger datasets.
17071716 """
17081717
1718+ n_bytes = total_bytes (X )
1719+ if y is not None :
1720+ n_bytes += total_bytes (y )
1721+
1722+ n_bytes = int (n_bytes * data_multiplier )
1723+
17091724 if y is not None :
1725+ y_id = id (y )
17101726 n_classes = Native .Task_Unknown
17111727 y = clean_dimensions (y , "y" )
17121728 if y .ndim != 1 :
@@ -1757,10 +1773,18 @@ def estimate_mem(self, X, y=None):
17571773 _log .error (msg )
17581774 raise ValueError (msg )
17591775
1776+ if y_id != id (y ):
1777+ # in fit we'll also make a copy of y that cannot be deleted until the end
1778+ n_bytes += total_bytes (y )
1779+
17601780 n_samples = None if y is None else len (y )
1781+ X_id = id (X )
17611782 X , n_samples = preclean_X (
17621783 X , self .feature_names , self .feature_types , n_samples , "y"
17631784 )
1785+ if X_id != id (X ):
1786+ # a copy was made, and we'll need to also do this on fit, so add the new memory too
1787+ n_bytes += total_bytes (X )
17641788
17651789 if y is None :
17661790 n_classes = Native .Task_Regression
@@ -1794,11 +1818,19 @@ def estimate_mem(self, X, y=None):
17941818 feature_types_in ,
17951819 None ,
17961820 )
1797-
1798- bin_lengths = [
1799- len (x [0 ]) + 2 if isinstance (x [0 ], dict ) else len (x [0 ]) + 3 for x in bins
1800- ]
1801- n_tensor_bytes = sum (bin_lengths ) * np .float64 ().nbytes * self .outer_bags * 2
1821+ # first calculate the number of cells in the mains for all features
1822+ n_tensor_bytes = sum (
1823+ 2
1824+ if len (x [0 ]) == 0
1825+ else max (x [0 ].values ()) + 2
1826+ if isinstance (x [0 ], dict )
1827+ else len (x [0 ]) + 3
1828+ for x in bins
1829+ if len (x ) != 0
1830+ )
1831+ # We have 2 copies of the update tensors in C++ (current and best) and we extract
1832+ # one more in python for the update before tearing down the C++ data.
1833+ n_tensor_bytes = n_tensor_bytes * np .float64 ().nbytes * self .outer_bags * 3
18021834
18031835 # One shared memory copy of the data mapped into all processes, plus a copy of
18041836 # the test and train data for each outer bag. Assume all processes are started
@@ -1831,6 +1863,19 @@ def estimate_mem(self, X, y=None):
18311863 None ,
18321864 )
18331865
1866+ bin_lengths = [x [0 ] if len (x ) == 1 else x [1 ] for x in bins if len (x ) != 0 ]
1867+ bin_lengths = [
1868+ 2
1869+ if len (x ) == 0
1870+ else max (x .values ()) + 2
1871+ if isinstance (x , dict )
1872+ else len (x ) + 3
1873+ for x in bin_lengths
1874+ ]
1875+ bin_lengths .sort ()
1876+ # we use the 75th percentile bin length to estimate the number of bins
1877+ n_bad_case_bins = bin_lengths [len (bin_lengths ) // 4 * 3 ]
1878+
18341879 # each outer bag makes a copy of the features. Only the training features
18351880 # are kept for interaction detection, but don't estimate that for now.
18361881 interaction_detection_bytes = (
@@ -1839,15 +1884,15 @@ def estimate_mem(self, X, y=None):
18391884
18401885 max_bytes = max (max_bytes , interaction_detection_bytes )
18411886
1842- bin_lengths . sort ()
1843- n_bad_case_bins = bin_lengths [ len ( bin_lengths ) // 4 * 3 ]
1887+ # We have 2 copies of the update tensors in C++ (current and best) and we extract
1888+ # one more in python for the update before tearing down the C++ data.
18441889 interaction_boosting_bytes = (
18451890 n_bad_case_bins
18461891 * n_bad_case_bins
18471892 * np .float64 ().nbytes
18481893 * self .outer_bags
18491894 * interactions
1850- * 2
1895+ * 3
18511896 )
18521897
18531898 # We merge the interactions together to make a combined interaction
@@ -1866,7 +1911,7 @@ def estimate_mem(self, X, y=None):
18661911
18671912 max_bytes = max (max_bytes , interaction_boosting_bytes )
18681913
1869- return max_bytes
1914+ return int ( n_bytes + max_bytes )
18701915
18711916 def to_jsonable (self , detail = "all" ):
18721917 """Convert the model to a JSONable representation.
0 commit comments