adasegroup · khakhulin · May 13, 2021
diff --git a/.ipython/profile_default/startup/00-kedro-init.py b/.ipython/profile_default/startup/00-kedro-init.py
diff --git a/README.md b/README.md
@@ -14,6 +14,10 @@ To reproduce results presented in the repository you have to make sure [Kedro](h
 
 From the root of the project run the following commands in the terminal
 
+```console
+export PYTHONPATH=$PWD
+```
+
 to reproduce **Test 1**:
 
 ```console

diff --git a/src/fes/datasets/synthetic_dataset.py b/src/fes/datasets/synthetic_dataset.py
@@ -1,14 +1,15 @@
 from typing import Dict, Any
 
 import numpy as np
-from numpy import random
-
 from kedro.io.core import AbstractDataSet
 
+from src.fes.utils import calculate_snr
+
 
 class SyntheticDataset(AbstractDataSet):
 
-    def __init__(self, option, n, m, noise_std, redundancy_rate, features_fill, poly_degree=None, num_groups=None, seed=None):
+    def __init__(self, option, n, m, noise_std, redundancy_rate, features_fill, poly_degree=None, num_groups=None,
+                 seed=None):
         """
         Parameters
         ----------
@@ -38,11 +39,12 @@ def __init__(self, option, n, m, noise_std, redundancy_rate, features_fill, poly
 
         if seed is not None:
             print(f"The seed for the synthetic dataset generation is set to {seed}", end='\n\n')
-            random.seed(seed)
+            np.random.seed(seed)
 
     def _load(self) -> Any:
         if self.option == 'sparse':
-            return generate_sparse_data(self.n, self.m, self.noise_std, self.redundancy_rate, self.features_fill, self.poly_degree)
+            return _generate_sparse_data(self.n, self.m, self.noise_std, self.redundancy_rate, self.features_fill,
+                                        self.poly_degree)
 
         # TODO. Test when ista_sght will be ready
         # elif self.option == 'grouped':
@@ -58,7 +60,7 @@ def _describe(self) -> Dict[str, Any]:
         pass
 
 
-def generate_sparse_data(n, m, noise_std, redundancy_rate, features_fill, poly_degree):
+def _generate_sparse_data(n, m, noise_std, redundancy_rate, features_fill, poly_degree):
     """
     Returns y: vector of observations (n,1),
             X: design matrix (n, m)
@@ -70,9 +72,9 @@ def generate_sparse_data(n, m, noise_std, redundancy_rate, features_fill, poly_d
     w = np.zeros((m, 1))
 
     # Decide the number of features and their locations
-    num_sparse_feat = np.clip(random.binomial(m, 1 - redundancy_rate), a_min=1, a_max=None)
+    num_sparse_feat = np.clip(np.random.binomial(m, 1 - redundancy_rate), a_min=1, a_max=None)
 
-    sparse_feat_idx = random.choice(m, num_sparse_feat, replace=False)
+    sparse_feat_idx = np.random.choice(m, num_sparse_feat, replace=False)
 
     # Trim idx to poly_degree
     if poly_degree is not None:
@@ -81,12 +83,12 @@ def generate_sparse_data(n, m, noise_std, redundancy_rate, features_fill, poly_d
 
     # Fill features with values
     if features_fill == 'const':
-        w[sparse_feat_idx] = random.randint(1, 4 * m, (num_sparse_feat, 1))
+        w[sparse_feat_idx] = np.random.randint(1, 4 * m, (num_sparse_feat, 1))
 
     elif features_fill == 'normal':
         for i in range(poly_degree):
             if i == 0:
-                w[sparse_feat_idx[:, i]] = random.standard_normal((num_poly, 1))
+                w[sparse_feat_idx[:, i]] = np.random.standard_normal((num_poly, 1))
 
             else:
                 w[sparse_feat_idx[:, i]] = w[sparse_feat_idx[:, i - 1]] * w[sparse_feat_idx[:, 0]]
@@ -97,61 +99,53 @@ def generate_sparse_data(n, m, noise_std, redundancy_rate, features_fill, poly_d
     features_mask = w != 0
 
     # Generate observations
-    X = random.standard_normal((n, m))
+    X = np.random.standard_normal((n, m))
 
     y_true = X @ w
     y = y_true + np.random.standard_normal((n, 1)) * noise_std
 
     print("Synthetic sparse test dataset is generated")
-    print(f"Number of observations: {n}, features dim. {m}, number of informative features {sum(features_mask.reshape(-1))}")
+    print(
+        f"Number of observations: {n}, features dim. {m}, number of informative features {sum(features_mask.reshape(-1))}")
     print(f"Observations SNR: {calculate_snr(y_true, noise_std):.3f} dB")
     print(f"Features fill: {features_fill}", end='\n\n')
 
     return y, X, w, y_true, features_mask
 
 
-def generate_grouped_data(n, m, noise_std, redundancy_rate, features_fill, num_groups):
+def _generate_grouped_data(n, m, noise_std, redundancy_rate, features_fill, num_groups):
     if num_groups is None or num_groups < 2:
         raise ValueError("The number of groups cannot be None or less than 2")
 
     # Split x_hat into groups
-    group_end_idx = random.choice(m - 2, num_groups - 1, replace=False) + 1
+    group_end_idx = np.random.choice(m - 2, num_groups - 1, replace=False) + 1
     group_end_idx.sort()
 
     w, groups_labels = np.zeros((m, 1)), np.zeros((m, 1))
 
-    _x_hat, _groups_labels = np.split(w, group_end_idx),\
+    _x_hat, _groups_labels = np.split(w, group_end_idx), \
                              np.split(groups_labels, group_end_idx)
 
     # Decide whether to keep the group and fill it with values if needed
     for i, (x_hat_group, group_labels) in enumerate(zip(_x_hat, _groups_labels)):
-        if i == 0 or random.binomial(1, 1 - redundancy_rate) == 1:
+        if i == 0 or np.random.binomial(1, 1 - redundancy_rate) == 1:
             group_labels[:] = i + 1
 
             if features_fill == 'const':
-                x_hat_group[:] = random.randint(1, 4 * num_groups)
+                x_hat_group[:] = np.random.randint(1, 4 * num_groups)
 
             elif features_fill == 'normal':
-                x_hat_group[:] = random.standard_normal(x_hat_group.shape)
+                x_hat_group[:] = np.random.standard_normal(x_hat_group.shape)
 
             else:
                 raise ValueError(f"Unknown fill value: {features_fill}")
 
     features_mask = w != 0
 
     # Generate observations
-    X = random.standard_normal((n, m))
+    X = np.random.standard_normal((n, m))
 
     y_true = X @ w
-    y = y_true + np.random.standard_normal((n)) * noise_std
+    y = y_true + np.random.standard_normal((n, 1)) * noise_std  # (n, 1) like y_true; an (n,) draw would broadcast y to (n, n)
 
     return y, X, w, y_true, features_mask, groups_labels
-
-
-"""
-Support utils
-"""
-
-
-def calculate_snr(y_true, noise_std):
-    return (20 * np.log10(abs(np.where(noise_std == 0, 0, y_true / noise_std)))).mean()
diff --git a/src/fes/pipeline_registry.py b/src/fes/pipeline_registry.py
@@ -31,8 +31,8 @@
 
 from kedro.pipeline import Pipeline
 
-import fes.pipelines.data_processing.pipeline as dpp
-import fes.pipelines.data_science.pipeline as dsp
+from src.fes.pipelines.data_processing import *
+from src.fes.pipelines.data_science import perm_importance_pipeline
 
 
 def register_pipelines() -> Dict[str, Pipeline]:
@@ -41,15 +41,16 @@ def register_pipelines() -> Dict[str, Pipeline]:
     Returns:
         A mapping from a pipeline name to a ``Pipeline`` object.
     """
-    synth_dataset = dpp.sparse_synth_test_data_pipeline()
-    synth_dataset_poly = dpp.sparse_synth_test_data_poly_pipeline()
-    synth_dataset_noise = dpp.sparse_synth_test_data_noise_pipeline()
-    synth_dataset_rr = dpp.sparse_synth_test_data_rr_pipeline()
-    perm_importance = dsp.perm_importance_pipeline()
+
+    synth_dataset = sparse_synth_test_data_pipeline()
+    synth_dataset_poly = sparse_synth_test_data_poly_pipeline()
+    synth_dataset_noise = sparse_synth_test_data_noise_pipeline()
+    synth_dataset_rr = sparse_synth_test_data_rr_pipeline()
+    perm_importance = perm_importance_pipeline()
 
     return {
-        "__default__": synth_dataset + perm_importance, # Test 1
-        "synth_poly_pi": synth_dataset_poly + perm_importance, # Test 2
-        "synth_noise_pi": synth_dataset_noise + perm_importance, # Test 3
-        "synth_rr_pi": synth_dataset_rr + perm_importance # Test 4
+        "__default__": synth_dataset + perm_importance,  # Test 1
+        "synth_poly_pi": synth_dataset_poly + perm_importance,  # Test 2
+        "synth_noise_pi": synth_dataset_noise + perm_importance,  # Test 3
+        "synth_rr_pi": synth_dataset_rr + perm_importance  # Test 4
     }
diff --git a/src/fes/pipelines/data_processing/__init__.py b/src/fes/pipelines/data_processing/__init__.py
@@ -0,0 +1 @@
+from .pipeline import *
diff --git a/src/fes/pipelines/data_processing/nodes.py b/src/fes/pipelines/data_processing/nodes.py
@@ -3,4 +3,4 @@ def arrange_sparse_synth_test_data(sparse_synth_test_data):
     y, X, w, y_true, features_mask = sparse_synth_test_data
     return y, X, w, y_true, features_mask
 
-# TODO. Grouped data
+# TODO. Grouped data
diff --git a/src/fes/pipelines/data_processing/pipeline.py b/src/fes/pipelines/data_processing/pipeline.py
@@ -1,3 +1,9 @@
+__all__ = ['sparse_synth_test_data_pipeline',
+           'sparse_synth_test_data_poly_pipeline',
+           'sparse_synth_test_data_noise_pipeline',
+           'sparse_synth_test_data_rr_pipeline',
+           ]
+
 from kedro.pipeline import Pipeline, node
 
 from .nodes import arrange_sparse_synth_test_data

diff --git a/src/fes/pipelines/data_science/__init__.py b/src/fes/pipelines/data_science/__init__.py
@@ -0,0 +1 @@
+from .pipeline import *
diff --git a/src/fes/pipelines/data_science/nodes.py b/src/fes/pipelines/data_science/nodes.py
@@ -1,17 +1,18 @@
 import numpy as np
+from sklearn.base import RegressorMixin
 
 from sklearn.linear_model import LinearRegression
 from sklearn.inspection import permutation_importance
 from sklearn.metrics import mean_squared_error, r2_score
 
 
-def fit_model(y, X):
+def fit_model(y: np.ndarray, X: np.ndarray) -> LinearRegression:
     """
     Parameters
     ----------
     y: (n,1) vector of observations
     X: (n,m) design matrix
-    Returns regressor: fitted regressor compatible with sklearn interface
+    Returns linear regressor: fitted regressor compatible with sklearn interface
     -------
 
     """
@@ -20,7 +21,8 @@ def fit_model(y, X):
     return regressor
 
 
-def evaluate_perm_importance(regressor, y, X, w, y_true, features_mask, parameters):
+def evaluate_perm_importance(regressor: RegressorMixin, y: np.ndarray, X: np.ndarray, w: np.ndarray,
+                             y_true: np.ndarray, features_mask: np.ndarray, parameters: dict):
     """
     Parameters
     ----------

diff --git a/src/fes/pipelines/data_science/pipeline.py b/src/fes/pipelines/data_science/pipeline.py
@@ -1,3 +1,5 @@
+__all__ = ['perm_importance_pipeline']
+
 from kedro.pipeline import Pipeline, node
 
 from .nodes import fit_model, evaluate_perm_importance

diff --git a/src/fes/utils/__init__.py b/src/fes/utils/__init__.py
@@ -0,0 +1 @@
+from .dataset import *
diff --git a/src/fes/utils/dataset.py b/src/fes/utils/dataset.py
@@ -0,0 +1,32 @@
+"""
+Dataset support utilities.
+"""
+
+import numpy as np
+
+
+def calculate_snr(y_true, noise_std):
+    """
+    Return the mean per-observation signal-to-noise ratio in decibels.
+
+    Parameters
+    ----------
+    y_true: noise-free observations (array-like)
+    noise_std: standard deviation of the additive noise (scalar or array-like
+        broadcastable against y_true)
+
+    Returns
+    -------
+    Mean of 20 * log10(|y_true / noise_std|); entries whose noise_std is zero
+    contribute +inf, since a noiseless observation has unbounded SNR.
+    """
+    y_true = np.asarray(y_true, dtype=float)
+    noise_std = np.asarray(noise_std, dtype=float)
+    noiseless = noise_std == 0
+
+    # Divide by 1 instead of 0 on noiseless entries so no division warning is
+    # raised; those entries are replaced by +inf right below.
+    ratio = np.abs(y_true / np.where(noiseless, 1.0, noise_std))
+    ratio = np.where(noiseless, np.inf, ratio)
+
+    return (20 * np.log10(ratio)).mean()