@@ -16,7 +16,7 @@
                      _LGBMCheckClassificationTargets, _LGBMCheckSampleWeight, _LGBMCheckXY, _LGBMClassifierBase,
                      _LGBMComputeSampleWeight, _LGBMCpuCount, _LGBMLabelEncoder, _LGBMModelBase, _LGBMRegressorBase,
                      dt_DataTable, pd_DataFrame)
-from .engine import train
+from .engine import _make_n_folds, train

 __all__ = [
     'LGBMClassifier',
@@ -412,6 +412,7 @@ def __init__(
         random_state: Optional[Union[int, np.random.RandomState]] = None,
         n_jobs: Optional[int] = None,
         importance_type: str = 'split',
+        validation_fraction: Optional[float] = 0.1,
         **kwargs
     ):
         r"""Construct a gradient boosting model.
@@ -491,6 +492,10 @@ def __init__(
             The type of feature importance to be filled into ``feature_importances_``.
             If 'split', result contains numbers of times the feature is used in a model.
             If 'gain', result contains total gains of splits which use the feature.
+        validation_fraction : float or None, optional (default=0.1)
+            Proportion of training data to set aside as validation data
+            for early stopping. If None, early stopping is done on the
+            training data. Only used if early stopping is performed.
         **kwargs
             Other parameters for the model.
             Check http://lightgbm.readthedocs.io/en/latest/Parameters.html for more parameters.
@@ -553,6 +558,7 @@ def __init__(
         self.random_state = random_state
         self.n_jobs = n_jobs
         self.importance_type = importance_type
+        self.validation_fraction = validation_fraction
         self._Booster: Optional[Booster] = None
         self._evals_result: _EvalResultDict = {}
         self._best_score: _LGBM_BoosterBestScoreType = {}
@@ -668,9 +674,27 @@ def _process_params(self, stage: str) -> Dict[str, Any]:
         params.pop('importance_type', None)
         params.pop('n_estimators', None)
         params.pop('class_weight', None)
+        params.pop("validation_fraction", None)

         if isinstance(params['random_state'], np.random.RandomState):
             params['random_state'] = params['random_state'].randint(np.iinfo(np.int32).max)
+
+        params = _choose_param_value(
+            main_param_name="early_stopping_round",
+            params=params,
+            default_value="auto",
+        )
+        if params["early_stopping_round"] == "auto":
+            if hasattr(self, "_n_rows_train") and self._n_rows_train > 10_000:
+                params["early_stopping_round"] = 10
+            else:
+                params["early_stopping_round"] = None
+
+        if params["early_stopping_round"] is True:
+            params["early_stopping_round"] = 10
+        elif params["early_stopping_round"] is False:
+            params["early_stopping_round"] = None
+
         if self._n_classes > 2:
             for alias in _ConfigAliases.get('num_class'):
                 params.pop(alias, None)
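The hunk above defines how the new `early_stopping_round` default is resolved: `"auto"` enables a 10-round patience only when the estimator has seen more than 10,000 training rows (recorded as `_n_rows_train` in `fit`, below), and plain booleans are normalized to 10 rounds or disabled. A standalone sketch of that decision table; `resolve_early_stopping_round` is an illustrative name, not part of the patch:

    # Hypothetical mirror of the resolution logic in _process_params above.
    def resolve_early_stopping_round(value, n_rows_train):
        if value == "auto":
            # New default: enable early stopping only for "large" datasets.
            return 10 if n_rows_train > 10_000 else None
        if value is True:    # True means "use the default patience of 10"
            return 10
        if value is False:   # False disables early stopping
            return None
        return value         # an explicit int (or None) passes through unchanged

    assert resolve_early_stopping_round("auto", 50_000) == 10
    assert resolve_early_stopping_round("auto", 5_000) is None
    assert resolve_early_stopping_round(True, 100) == 10
    assert resolve_early_stopping_round(25, 100) == 25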
@@ -745,6 +769,19 @@ def fit(
         init_model: Optional[Union[str, Path, Booster, "LGBMModel"]] = None
     ) -> "LGBMModel":
         """Docstring is set after definition, using a template."""
+        if not isinstance(X, (pd_DataFrame, dt_DataTable)):
+            _X, _y = _LGBMCheckXY(X, y, accept_sparse=True, force_all_finite=False, ensure_min_samples=2)
+            if sample_weight is not None:
+                sample_weight = _LGBMCheckSampleWeight(sample_weight, _X)
+        else:
+            _X, _y = X, y
+
+        self._n_features = _X.shape[1]
+        # copy for consistency
+        self._n_features_in = self._n_features
+
+        self._n_rows_train = _X.shape[0]
+
         params = self._process_params(stage="fit")

         # Do not modify original args in fit function
@@ -766,13 +803,6 @@ def fit(
         params['metric'] = [e for e in eval_metrics_builtin if e not in params['metric']] + params['metric']
         params['metric'] = [metric for metric in params['metric'] if metric is not None]

-        if not isinstance(X, (pd_DataFrame, dt_DataTable)):
-            _X, _y = _LGBMCheckXY(X, y, accept_sparse=True, force_all_finite=False, ensure_min_samples=2)
-            if sample_weight is not None:
-                sample_weight = _LGBMCheckSampleWeight(sample_weight, _X)
-        else:
-            _X, _y = X, y
-
         if self._class_weight is None:
             self._class_weight = self.class_weight
         if self._class_weight is not None:
@@ -782,51 +812,61 @@ def fit(
         else:
             sample_weight = np.multiply(sample_weight, class_sample_weight)

-        self._n_features = _X.shape[1]
-        # copy for consistency
-        self._n_features_in = self._n_features
-
         train_set = Dataset(data=_X, label=_y, weight=sample_weight, group=group,
                             init_score=init_score, categorical_feature=categorical_feature,
                             params=params)
+        if params["early_stopping_round"] is not None and eval_set is None:
+            if self.validation_fraction is not None:
+                n_splits = max(int(np.ceil(1 / self.validation_fraction)), 2)
+                stratified = isinstance(self, LGBMClassifier)
+                cvfolds = _make_n_folds(full_data=train_set, folds=None, nfold=n_splits,
+                                        params=params, seed=self.random_state,
+                                        stratified=stratified, shuffle=True)
+                train_idx, val_idx = next(cvfolds)
+                valid_set = train_set.subset(sorted(val_idx))
+                train_set = train_set.subset(sorted(train_idx))
+            else:
+                valid_set = train_set
+            valid_set = valid_set.construct()
+            valid_sets = [valid_set]

-        valid_sets: List[Dataset] = []
-        if eval_set is not None:
-
-            def _get_meta_data(collection, name, i):
-                if collection is None:
-                    return None
-                elif isinstance(collection, list):
-                    return collection[i] if len(collection) > i else None
-                elif isinstance(collection, dict):
-                    return collection.get(i, None)
-                else:
-                    raise TypeError(f"{name} should be dict or list")
-
-            if isinstance(eval_set, tuple):
-                eval_set = [eval_set]
-            for i, valid_data in enumerate(eval_set):
-                # reduce cost for prediction training data
-                if valid_data[0] is X and valid_data[1] is y:
-                    valid_set = train_set
-                else:
-                    valid_weight = _get_meta_data(eval_sample_weight, 'eval_sample_weight', i)
-                    valid_class_weight = _get_meta_data(eval_class_weight, 'eval_class_weight', i)
-                    if valid_class_weight is not None:
-                        if isinstance(valid_class_weight, dict) and self._class_map is not None:
-                            valid_class_weight = {self._class_map[k]: v for k, v in valid_class_weight.items()}
-                        valid_class_sample_weight = _LGBMComputeSampleWeight(valid_class_weight, valid_data[1])
-                        if valid_weight is None or len(valid_weight) == 0:
-                            valid_weight = valid_class_sample_weight
-                        else:
-                            valid_weight = np.multiply(valid_weight, valid_class_sample_weight)
-                    valid_init_score = _get_meta_data(eval_init_score, 'eval_init_score', i)
-                    valid_group = _get_meta_data(eval_group, 'eval_group', i)
-                    valid_set = Dataset(data=valid_data[0], label=valid_data[1], weight=valid_weight,
-                                        group=valid_group, init_score=valid_init_score,
-                                        categorical_feature='auto', params=params)
-
-                valid_sets.append(valid_set)
+        else:
+            valid_sets: List[Dataset] = []
+            if eval_set is not None:
+                def _get_meta_data(collection, name, i):
+                    if collection is None:
+                        return None
+                    elif isinstance(collection, list):
+                        return collection[i] if len(collection) > i else None
+                    elif isinstance(collection, dict):
+                        return collection.get(i, None)
+                    else:
+                        raise TypeError(f"{name} should be dict or list")
+
+                if isinstance(eval_set, tuple):
+                    eval_set = [eval_set]
+                for i, valid_data in enumerate(eval_set):
+                    # reduce cost for prediction training data
+                    if valid_data[0] is X and valid_data[1] is y:
+                        valid_set = train_set
+                    else:
+                        valid_weight = _get_meta_data(eval_sample_weight, 'eval_sample_weight', i)
+                        valid_class_weight = _get_meta_data(eval_class_weight, 'eval_class_weight', i)
+                        if valid_class_weight is not None:
+                            if isinstance(valid_class_weight, dict) and self._class_map is not None:
+                                valid_class_weight = {self._class_map[k]: v for k, v in valid_class_weight.items()}
+                            valid_class_sample_weight = _LGBMComputeSampleWeight(valid_class_weight, valid_data[1])
+                            if valid_weight is None or len(valid_weight) == 0:
+                                valid_weight = valid_class_sample_weight
+                            else:
+                                valid_weight = np.multiply(valid_weight, valid_class_sample_weight)
+                        valid_init_score = _get_meta_data(eval_init_score, 'eval_init_score', i)
+                        valid_group = _get_meta_data(eval_group, 'eval_group', i)
+                        valid_set = Dataset(data=valid_data[0], label=valid_data[1], weight=valid_weight,
+                                            group=valid_group, init_score=valid_init_score,
+                                            categorical_feature='auto', params=params)
+
+                    valid_sets.append(valid_set)

         if isinstance(init_model, LGBMModel):
             init_model = init_model.booster_
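Taken together, these changes make early stopping a default behavior of the sklearn interface rather than strictly opt-in: when `early_stopping_round` resolves to a number and no `eval_set` was passed, `fit` splits off a holdout via `_make_n_folds` from `lightgbm.engine`, used here as a source of `(train_idx, val_idx)` index pairs. With `validation_fraction=0.1`, `n_splits = max(ceil(1 / 0.1), 2) = 10`, so the first fold's validation indices cover roughly 10% of the rows, stratified for classifiers. A minimal usage sketch, assuming a build of this branch (`validation_fraction` and the `"auto"` default do not exist in released LightGBM):

    import numpy as np
    from lightgbm import LGBMRegressor

    # 20,000 rows clears the >10,000 threshold, so under this patch early
    # stopping would be enabled automatically with a patience of 10 rounds.
    rng = np.random.default_rng(0)
    X = rng.random((20_000, 5))
    y = 2 * X[:, 0] + rng.normal(size=20_000)

    # Default: ~10% of the rows are held out internally for early stopping;
    # no eval_set is required.
    model = LGBMRegressor(n_estimators=1_000).fit(X, y)

    # validation_fraction=None: early stopping is evaluated on the training
    # data itself (no holdout).
    model = LGBMRegressor(n_estimators=1_000, validation_fraction=None).fit(X, y)

    # early_stopping_round=False: opt out of early stopping entirely.
    model = LGBMRegressor(n_estimators=1_000, early_stopping_round=False).fit(X, y)

Passing an explicit `eval_set` keeps the long-standing behavior: the user-supplied validation sets are used and no internal holdout is created.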