diff --git a/.ipython/profile_default/startup/00-kedro-init.py b/.ipython/profile_default/startup/00-kedro-init.py deleted file mode 100644 index 702b2c8..0000000 --- a/.ipython/profile_default/startup/00-kedro-init.py +++ /dev/null @@ -1,79 +0,0 @@ -import logging.config -import sys -from pathlib import Path -from typing import Any, Dict - -from IPython.core.magic import needs_local_scope, register_line_magic - -# Find the project root (./../../../) -from kedro.framework.startup import _get_project_metadata - -startup_error = None -project_path = Path(__file__).parents[3].resolve() - - -@register_line_magic -def reload_kedro(path, line=None, env: str = None, extra_params: Dict[str, Any] = None): - """Line magic which reloads all Kedro default variables.""" - global startup_error - global context - global catalog - global session - - try: - import kedro.config.default_logger - from kedro.framework.hooks import get_hook_manager - from kedro.framework.project import configure_project - from kedro.framework.session import KedroSession - from kedro.framework.session.session import _activate_session - from kedro.framework.cli.jupyter import collect_line_magic - except ImportError: - logging.error( - "Kedro appears not to be installed in your current environment " - "or your current IPython session was not started in a valid Kedro project." - ) - raise - - try: - path = path or project_path - - # clear hook manager - hook_manager = get_hook_manager() - name_plugin_pairs = hook_manager.list_name_plugin() - for name, plugin in name_plugin_pairs: - hook_manager.unregister(name=name, plugin=plugin) - - # remove cached user modules - metadata = _get_project_metadata(path) - to_remove = [ - mod for mod in sys.modules if mod.startswith(metadata.package_name) - ] - # `del` is used instead of `reload()` because: If the new version of a module does not - # define a name that was defined by the old version, the old definition remains. - for module in to_remove: - del sys.modules[module] - - configure_project(metadata.package_name) - session = KedroSession.create( - metadata.package_name, path, env=env, extra_params=extra_params - ) - _activate_session(session, force=True) - logging.debug("Loading the context from %s", str(path)) - context = session.load_context() - catalog = context.catalog - - logging.info("** Kedro project %s", str(metadata.project_name)) - logging.info("Defined global variable `context`, `session` and `catalog`") - - for line_magic in collect_line_magic(): - register_line_magic(needs_local_scope(line_magic)) - logging.info("Registered line magic `%s`", line_magic.__name__) - except Exception as err: - startup_error = err - logging.exception( - "Kedro's ipython session startup script failed:\n%s", str(err) - ) - raise err - - -reload_kedro(project_path) diff --git a/README.md b/README.md index 6f29a8b..f3a5c95 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,10 @@ To reproduce results presented in the repository you have to make sure [Kedro](h From the root of the project run the following commands in the terminal +```console +export PYTHONPATH=$PWD +``` + to reproduce **Test 1**: ```console diff --git a/src/fes/datasets/synthetic_dataset.py b/src/fes/datasets/synthetic_dataset.py index 695e246..cb37f70 100644 --- a/src/fes/datasets/synthetic_dataset.py +++ b/src/fes/datasets/synthetic_dataset.py @@ -1,14 +1,15 @@ from typing import Dict, Any import numpy as np -from numpy import random - from kedro.io.core import AbstractDataSet +from src.fes.utils import calculate_snr + class SyntheticDataset(AbstractDataSet): - def __init__(self, option, n, m, noise_std, redundancy_rate, features_fill, poly_degree=None, num_groups=None, seed=None): + def __init__(self, option, n, m, noise_std, redundancy_rate, features_fill, poly_degree=None, num_groups=None, + seed=None): """ Parameters ---------- @@ -38,11 +39,12 @@ def __init__(self, option, n, m, noise_std, redundancy_rate, features_fill, poly if seed is not None: print(f"The seed for the synthetic dataset generation is set to {seed}", end='\n\n') - random.seed(seed) + np.random.seed(seed) def _load(self) -> Any: if self.option == 'sparse': - return generate_sparse_data(self.n, self.m, self.noise_std, self.redundancy_rate, self.features_fill, self.poly_degree) + return _generate_sparse_data(self.n, self.m, self.noise_std, self.redundancy_rate, self.features_fill, + self.poly_degree) # TODO. Test when ista_sght will be ready # elif self.option == 'grouped': @@ -58,7 +60,7 @@ def _describe(self) -> Dict[str, Any]: pass -def generate_sparse_data(n, m, noise_std, redundancy_rate, features_fill, poly_degree): +def _generate_sparse_data(n, m, noise_std, redundancy_rate, features_fill, poly_degree): """ Returns y: vector of observations (n,1), X: design matrix (n, m) @@ -70,9 +72,9 @@ def generate_sparse_data(n, m, noise_std, redundancy_rate, features_fill, poly_d w = np.zeros((m, 1)) # Decide the number of features and their locations - num_sparse_feat = np.clip(random.binomial(m, 1 - redundancy_rate), a_min=1, a_max=None) + num_sparse_feat = np.clip(np.random.binomial(m, 1 - redundancy_rate), a_min=1, a_max=None) - sparse_feat_idx = random.choice(m, num_sparse_feat, replace=False) + sparse_feat_idx = np.random.choice(m, num_sparse_feat, replace=False) # Trim idx to poly_degree if poly_degree is not None: @@ -81,12 +83,12 @@ def generate_sparse_data(n, m, noise_std, redundancy_rate, features_fill, poly_d # Fill features with values if features_fill == 'const': - w[sparse_feat_idx] = random.randint(1, 4 * m, (num_sparse_feat, 1)) + w[sparse_feat_idx] = np.random.randint(1, 4 * m, (num_sparse_feat, 1)) elif features_fill == 'normal': for i in range(poly_degree): if i == 0: - w[sparse_feat_idx[:, i]] = random.standard_normal((num_poly, 1)) + w[sparse_feat_idx[:, i]] = np.random.standard_normal((num_poly, 1)) else: w[sparse_feat_idx[:, i]] = w[sparse_feat_idx[:, i - 1]] * w[sparse_feat_idx[:, 0]] @@ -97,42 +99,43 @@ def generate_sparse_data(n, m, noise_std, redundancy_rate, features_fill, poly_d features_mask = w != 0 # Generate observations - X = random.standard_normal((n, m)) + X = np.random.standard_normal((n, m)) y_true = X @ w y = y_true + np.random.standard_normal((n, 1)) * noise_std print("Synthetic sparse test dataset is generated") - print(f"Number of observations: {n}, features dim. {m}, number of informative features {sum(features_mask.reshape(-1))}") + print( + f"Number of observations: {n}, features dim. {m}, number of informative features {sum(features_mask.reshape(-1))}") print(f"Observations SNR: {calculate_snr(y_true, noise_std):.3f} dB") print(f"Features fill: {features_fill}", end='\n\n') return y, X, w, y_true, features_mask -def generate_grouped_data(n, m, noise_std, redundancy_rate, features_fill, num_groups): +def _generate_grouped_data(n, m, noise_std, redundancy_rate, features_fill, num_groups): if num_groups is None or num_groups < 2: raise ValueError("The number of groups cannot be None or less than 2") # Split x_hat into groups - group_end_idx = random.choice(m - 2, num_groups - 1, replace=False) + 1 + group_end_idx = np.random.choice(m - 2, num_groups - 1, replace=False) + 1 group_end_idx.sort() w, groups_labels = np.zeros((m, 1)), np.zeros((m, 1)) - _x_hat, _groups_labels = np.split(w, group_end_idx),\ + _x_hat, _groups_labels = np.split(w, group_end_idx), \ np.split(groups_labels, group_end_idx) # Decide whether to keep the group and fill it with values if needed for i, (x_hat_group, group_labels) in enumerate(zip(_x_hat, _groups_labels)): - if i == 0 or random.binomial(1, 1 - redundancy_rate) == 1: + if i == 0 or np.random.binomial(1, 1 - redundancy_rate) == 1: group_labels[:] = i + 1 if features_fill == 'const': - x_hat_group[:] = random.randint(1, 4 * num_groups) + x_hat_group[:] = np.random.randint(1, 4 * num_groups) elif features_fill == 'normal': - x_hat_group[:] = random.standard_normal(x_hat_group.shape) + x_hat_group[:] = np.random.standard_normal(x_hat_group.shape) else: raise ValueError(f"Unknown fill value: {features_fill}") @@ -140,18 +143,9 @@ def generate_grouped_data(n, m, noise_std, redundancy_rate, features_fill, num_g features_mask = w != 0 # Generate observations - X = random.standard_normal((n, m)) + X = np.random.standard_normal((n, m)) y_true = X @ w y = y_true + np.random.standard_normal((n)) * noise_std return y, X, w, y_true, features_mask, groups_labels - - -""" -Support utils -""" - - -def calculate_snr(y_true, noise_std): - return (20 * np.log10(abs(np.where(noise_std == 0, 0, y_true / noise_std)))).mean() \ No newline at end of file diff --git a/src/fes/pipeline_registry.py b/src/fes/pipeline_registry.py index 9925214..ff86e75 100644 --- a/src/fes/pipeline_registry.py +++ b/src/fes/pipeline_registry.py @@ -31,8 +31,8 @@ from kedro.pipeline import Pipeline -import fes.pipelines.data_processing.pipeline as dpp -import fes.pipelines.data_science.pipeline as dsp +from src.fes.pipelines.data_processing import * +from src.fes.pipelines.data_science import perm_importance_pipeline def register_pipelines() -> Dict[str, Pipeline]: @@ -41,15 +41,16 @@ def register_pipelines() -> Dict[str, Pipeline]: Returns: A mapping from a pipeline name to a ``Pipeline`` object. """ - synth_dataset = dpp.sparse_synth_test_data_pipeline() - synth_dataset_poly = dpp.sparse_synth_test_data_poly_pipeline() - synth_dataset_noise = dpp.sparse_synth_test_data_noise_pipeline() - synth_dataset_rr = dpp.sparse_synth_test_data_rr_pipeline() - perm_importance = dsp.perm_importance_pipeline() + + synth_dataset = sparse_synth_test_data_pipeline() + synth_dataset_poly = sparse_synth_test_data_poly_pipeline() + synth_dataset_noise = sparse_synth_test_data_noise_pipeline() + synth_dataset_rr = sparse_synth_test_data_rr_pipeline() + perm_importance = perm_importance_pipeline() return { - "__default__": synth_dataset + perm_importance, # Test 1 - "synth_poly_pi": synth_dataset_poly + perm_importance, # Test 2 - "synth_noise_pi": synth_dataset_noise + perm_importance, # Test 3 - "synth_rr_pi": synth_dataset_rr + perm_importance # Test 4 + "__default__": synth_dataset + perm_importance, # Test 1 + "synth_poly_pi": synth_dataset_poly + perm_importance, # Test 2 + "synth_noise_pi": synth_dataset_noise + perm_importance, # Test 3 + "synth_rr_pi": synth_dataset_rr + perm_importance # Test 4 } diff --git a/src/fes/pipelines/data_processing/__init__.py b/src/fes/pipelines/data_processing/__init__.py index e69de29..821dc6d 100644 --- a/src/fes/pipelines/data_processing/__init__.py +++ b/src/fes/pipelines/data_processing/__init__.py @@ -0,0 +1 @@ +from .pipeline import * diff --git a/src/fes/pipelines/data_processing/nodes.py b/src/fes/pipelines/data_processing/nodes.py index dab099f..84defd9 100644 --- a/src/fes/pipelines/data_processing/nodes.py +++ b/src/fes/pipelines/data_processing/nodes.py @@ -3,4 +3,4 @@ def arrange_sparse_synth_test_data(sparse_synth_test_data): y, X, w, y_true, features_mask = sparse_synth_test_data return y, X, w, y_true, features_mask -# TODO. Grouped data \ No newline at end of file +# TODO. Grouped data diff --git a/src/fes/pipelines/data_processing/pipeline.py b/src/fes/pipelines/data_processing/pipeline.py index a82d1ba..a1c226e 100644 --- a/src/fes/pipelines/data_processing/pipeline.py +++ b/src/fes/pipelines/data_processing/pipeline.py @@ -1,3 +1,9 @@ +__all__ = ['sparse_synth_test_data_pipeline', + 'sparse_synth_test_data_poly_pipeline', + 'sparse_synth_test_data_noise_pipeline', + 'sparse_synth_test_data_rr_pipeline', + ] + from kedro.pipeline import Pipeline, node from .nodes import arrange_sparse_synth_test_data diff --git a/src/fes/pipelines/data_science/__init__.py b/src/fes/pipelines/data_science/__init__.py index e69de29..821dc6d 100644 --- a/src/fes/pipelines/data_science/__init__.py +++ b/src/fes/pipelines/data_science/__init__.py @@ -0,0 +1 @@ +from .pipeline import * diff --git a/src/fes/pipelines/data_science/nodes.py b/src/fes/pipelines/data_science/nodes.py index 4428ae5..788d22a 100644 --- a/src/fes/pipelines/data_science/nodes.py +++ b/src/fes/pipelines/data_science/nodes.py @@ -1,17 +1,18 @@ import numpy as np +from sklearn.base import RegressorMixin from sklearn.linear_model import LinearRegression from sklearn.inspection import permutation_importance from sklearn.metrics import mean_squared_error, r2_score -def fit_model(y, X): +def fit_model(y: np.ndarray, X: np.ndarray) -> LinearRegression: """ Parameters ---------- y: (n,1) vector of observations X: (n,m) design matrix - Returns regressor: fitted regressor compatible with sklearn interface + Returns linear regressor: fitted regressor compatible with sklearn interface ------- """ @@ -20,7 +21,8 @@ def fit_model(y, X): return regressor -def evaluate_perm_importance(regressor, y, X, w, y_true, features_mask, parameters): +def evaluate_perm_importance(regressor: RegressorMixin, y: np.ndarray, X: np.ndarray, w: np.ndarray, + y_true: np.ndarray, features_mask: np.ndarray, parameters: dict): """ Parameters ---------- diff --git a/src/fes/pipelines/data_science/pipeline.py b/src/fes/pipelines/data_science/pipeline.py index 7e97e8c..d73908c 100644 --- a/src/fes/pipelines/data_science/pipeline.py +++ b/src/fes/pipelines/data_science/pipeline.py @@ -1,3 +1,5 @@ +__all__ = ['perm_importance_pipeline'] + from kedro.pipeline import Pipeline, node from .nodes import fit_model, evaluate_perm_importance diff --git a/src/fes/utils/__init__.py b/src/fes/utils/__init__.py new file mode 100644 index 0000000..c7b28ee --- /dev/null +++ b/src/fes/utils/__init__.py @@ -0,0 +1 @@ +from .dataset import * \ No newline at end of file diff --git a/src/fes/utils/dataset.py b/src/fes/utils/dataset.py new file mode 100644 index 0000000..b33e91a --- /dev/null +++ b/src/fes/utils/dataset.py @@ -0,0 +1,9 @@ +""" +Support datasets utils +""" + +import numpy as np + + +def calculate_snr(y_true, noise_std): + return (20 * np.log10(abs(np.where(noise_std == 0, 0, y_true / noise_std)))).mean()