fixed typos

Diyago · Diyago · commit 3eab5a14535b · 2021-12-16T21:52:08.000+03:00
Pin the scikit-learn version in setup.cfg (addresses issues #24, #23, #22)
diff --git a/README.md b/README.md
@@ -33,7 +33,7 @@ new_train2, new_target2 = GANGenerator().generate_data_pipe(train, target, test,
 # example with all params defined
 new_train3, new_target3 = GANGenerator(gen_x_times=1.1, cat_cols=None,
            bot_filter_quantile=0.001, top_filter_quantile=0.999, is_post_process=True,
-           adversaial_model_params={
+           adversarial_model_params={
                "metrics": "AUC", "max_depth": 2, "max_bin": 100, 
                "learning_rate": 0.02, "random_state": 42, "n_estimators": 500,
            }, pregeneration_frac=2, only_generated_data=False,
@@ -50,7 +50,7 @@ Both samplers `OriginalGenerator` and `GANGenerator` have same input parameters:
 * **top_filter_quantile**: float = 0.999 - bottom quantile for postprocess filtering
 * **is_post_process**: bool = True - perform or not post-filtering, if false bot_filter_quantile and top_filter_quantile
   ignored
-* **adversaial_model_params**: dict params for adversarial filtering model, default values for binary task
+* **adversarial_model_params**: dict params for adversarial filtering model, default values for binary task
 * **pregeneration_frac**: float = 2 - for generataion step gen_x_times * pregeneration_frac amount of data will
   generated. However in postprocessing (1 + gen_x_times) % of original data will be returned
 * **gan_params**: dict params for GAN training
diff --git a/pip_desc.md b/pip_desc.md
@@ -39,7 +39,7 @@ new_train1, new_target1 = GANGenerator().generate_data_pipe(train, target, test,
 # example with all params defined
 new_train3, new_target3 = GANGenerator(gen_x_times=1.1, cat_cols=None,
            bot_filter_quantile=0.001, top_filter_quantile=0.999, is_post_process=True,
-           adversaial_model_params={
+           adversarial_model_params={
                "metrics": "AUC", "max_depth": 2, "max_bin": 100, 
                "learning_rate": 0.02, "random_state": 42, "n_estimators": 500,
            }, pregeneration_frac=2, only_generated_data=False,
@@ -56,9 +56,9 @@ adversarial filtering
 * **top_filter_quantile**: float = 0.999 - bottom quantile for postprocess filtering
 * **is_post_process**: bool = True - perform or not postfiltering, if false bot_filter_quantile
  and top_filter_quantile ignored
-* **adversaial_model_params**: dict params for adversarial filtering model, default values for binary task
+* **adversarial_model_params**: dict params for adversarial filtering model, default values for binary task
 * **pregeneration_frac**: float = 2 - for generataion step gen_x_times * pregeneration_frac amount of data
-will generated. However in postprocessing (1 + gen_x_times) % of original data will be returned
+will be generated. However, in postprocessing (1 + gen_x_times) % of original data will be returned
 * **gan_params**: dict params for GAN training
 
 
diff --git a/setup.cfg b/setup.cfg
@@ -42,7 +42,7 @@ install_requires =
     category_encoders
     torch
     lightgbm
-    scikit_learn
+    scikit_learn==0.23.2
     torchvision
     python-dateutil
     tqdm
diff --git a/src/tabgan/sampler.py b/src/tabgan/sampler.py
@@ -45,23 +45,23 @@ def get_object_generator(self) -> Sampler:
 
 class SamplerOriginal(Sampler):
     def __init__(
-        self,
-        gen_x_times: float = 1.1,
-        cat_cols: list = None,
-        bot_filter_quantile: float = 0.001,
-        top_filter_quantile: float = 0.999,
-        is_post_process: bool = True,
-        adversaial_model_params: dict = {
-            "metrics": "AUC",
-            "max_depth": 2,
-            "max_bin": 100,
-            "n_estimators": 500,
-            "learning_rate": 0.02,
-            "random_state": 42,
-        },
-        pregeneration_frac: float = 2,
-        only_generated_data: bool = False,
-        gan_params: dict = {'batch_size': 500, 'patience': 25, "epochs" : 500,}
+            self,
+            gen_x_times: float = 1.1,
+            cat_cols: list = None,
+            bot_filter_quantile: float = 0.001,
+            top_filter_quantile: float = 0.999,
+            is_post_process: bool = True,
+            adversarial_model_params: dict = {
+                "metrics": "AUC",
+                "max_depth": 2,
+                "max_bin": 100,
+                "n_estimators": 500,
+                "learning_rate": 0.02,
+                "random_state": 42,
+            },
+            pregeneration_frac: float = 2,
+            only_generated_data: bool = False,
+            gan_params: dict = {'batch_size': 500, 'patience': 25, "epochs": 500, }
     ):
         """
 
@@ -75,7 +75,8 @@ def __init__(
         @param adversarial_model_params: dict params for adversarial filtering model, default values for binary task
         @param pregeneration_frac: float = 2 - for generation step gen_x_times * pregeneration_frac amount of data
         will generated. However in postprocessing (1 + gen_x_times) % of original data will be returned
-        @param only_generated_data: bool = False If True after generation get only newly generated, without concating input train dataframe.
+        @param only_generated_data: bool = False - if True, return only the newly generated data, without
+        concatenating the input train dataframe.
         @param gan_params: dict params for GAN training
         Only works for SamplerGAN.
         """
@@ -84,13 +85,14 @@ def __init__(
         self.is_post_process = is_post_process
         self.bot_filter_quantile = bot_filter_quantile
         self.top_filter_quantile = top_filter_quantile
-        self.adversarial_model_params = adversaial_model_params
+        self.adversarial_model_params = adversarial_model_params
         self.pregeneration_frac = pregeneration_frac
         self.only_generated_data = only_generated_data
         self.gan_params = gan_params
         self.TEMP_TARGET = "TEMP_TARGET"
 
-    def preprocess_data_df(self, df) -> pd.DataFrame:
+    @staticmethod
+    def preprocess_data_df(df) -> pd.DataFrame:
         logging.info("Input shape: {}".format(df.shape))
         if isinstance(df, pd.DataFrame) is False:
             raise ValueError(
@@ -99,7 +101,7 @@ def preprocess_data_df(self, df) -> pd.DataFrame:
         return df
 
     def preprocess_data(
-        self, train, target, test_df
+            self, train, target, test_df
     ) -> Tuple[pd.DataFrame, pd.DataFrame]:
         train = self.preprocess_data_df(train)
         target = self.preprocess_data_df(target)
@@ -119,10 +121,10 @@ def preprocess_data(
         return train, target, test_df
 
     def generate_data(
-        self, train_df, target, test_df, only_generated_data
+            self, train_df, target, test_df, only_generated_data
     ) -> Tuple[pd.DataFrame, pd.DataFrame]:
         if only_generated_data:
-            Warning.warn(
+            Warning(
                 "For SamplerOriginal setting only_generated_data doesn't change anything, "
                 "because generated data sampled from the train!"
             )
@@ -158,7 +160,7 @@ def postprocess_data(self, train_df, target, test_df):
                 max_val = test_df[num_col].quantile(self.top_filter_quantile)
                 filtered_df = train_df.loc[
                     (train_df[num_col] >= min_val) & (train_df[num_col] <= max_val)
-                ]
+                    ]
                 if filtered_df.shape[0] < 10:
                     raise ValueError(
                         "After post-processing generated data's shape less than 10. For columns {} test "
@@ -236,7 +238,7 @@ def _validate_data(train_df, target, test_df):
 
 class SamplerGAN(SamplerOriginal):
     def generate_data(
-        self, train_df, target, test_df, only_generated_data: bool
+            self, train_df, target, test_df, only_generated_data: bool
     ) -> Tuple[pd.DataFrame, pd.DataFrame]:
         self._validate_data(train_df, target, test_df)
         if target is not None:
@@ -298,7 +300,7 @@ def _sampler(creator: SampleData, in_train, in_target, in_test) -> None:
 
 def _drop_col_if_exist(df, col_to_drop) -> pd.DataFrame:
     """
-    Drops col_to_drop from input dataframe df if sucj column exists
+    Drops col_to_drop from input dataframe df if such column exists
     """
     if col_to_drop in df.columns:
         return df.drop(col_to_drop, axis=1)