Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 0 additions & 79 deletions .ipython/profile_default/startup/00-kedro-init.py

This file was deleted.

4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@ To reproduce results presented in the repository you have to make sure [Kedro](h

From the root of the project run the following commands in the terminal

```console
export PYTHONPATH=$PWD
```

to reproduce **Test 1**:

```console
Expand Down
50 changes: 22 additions & 28 deletions src/fes/datasets/synthetic_dataset.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
from typing import Dict, Any

import numpy as np
from numpy import random

from kedro.io.core import AbstractDataSet

from src.fes.utils import calculate_snr


class SyntheticDataset(AbstractDataSet):

def __init__(self, option, n, m, noise_std, redundancy_rate, features_fill, poly_degree=None, num_groups=None, seed=None):
def __init__(self, option, n, m, noise_std, redundancy_rate, features_fill, poly_degree=None, num_groups=None,
seed=None):
"""
Parameters
----------
Expand Down Expand Up @@ -38,11 +39,12 @@ def __init__(self, option, n, m, noise_std, redundancy_rate, features_fill, poly

if seed is not None:
print(f"The seed for the synthetic dataset generation is set to {seed}", end='\n\n')
random.seed(seed)
np.random.seed(seed)

def _load(self) -> Any:
if self.option == 'sparse':
return generate_sparse_data(self.n, self.m, self.noise_std, self.redundancy_rate, self.features_fill, self.poly_degree)
return _generate_sparse_data(self.n, self.m, self.noise_std, self.redundancy_rate, self.features_fill,
self.poly_degree)

# TODO. Test when ista_sght will be ready
# elif self.option == 'grouped':
Expand All @@ -58,7 +60,7 @@ def _describe(self) -> Dict[str, Any]:
pass


def generate_sparse_data(n, m, noise_std, redundancy_rate, features_fill, poly_degree):
def _generate_sparse_data(n, m, noise_std, redundancy_rate, features_fill, poly_degree):
"""
Returns y: vector of observations (n,1),
X: design matrix (n, m)
Expand All @@ -70,9 +72,9 @@ def generate_sparse_data(n, m, noise_std, redundancy_rate, features_fill, poly_d
w = np.zeros((m, 1))

# Decide the number of features and their locations
num_sparse_feat = np.clip(random.binomial(m, 1 - redundancy_rate), a_min=1, a_max=None)
num_sparse_feat = np.clip(np.random.binomial(m, 1 - redundancy_rate), a_min=1, a_max=None)

sparse_feat_idx = random.choice(m, num_sparse_feat, replace=False)
sparse_feat_idx = np.random.choice(m, num_sparse_feat, replace=False)

# Trim idx to poly_degree
if poly_degree is not None:
Expand All @@ -81,12 +83,12 @@ def generate_sparse_data(n, m, noise_std, redundancy_rate, features_fill, poly_d

# Fill features with values
if features_fill == 'const':
w[sparse_feat_idx] = random.randint(1, 4 * m, (num_sparse_feat, 1))
w[sparse_feat_idx] = np.random.randint(1, 4 * m, (num_sparse_feat, 1))

elif features_fill == 'normal':
for i in range(poly_degree):
if i == 0:
w[sparse_feat_idx[:, i]] = random.standard_normal((num_poly, 1))
w[sparse_feat_idx[:, i]] = np.random.standard_normal((num_poly, 1))

else:
w[sparse_feat_idx[:, i]] = w[sparse_feat_idx[:, i - 1]] * w[sparse_feat_idx[:, 0]]
Expand All @@ -97,61 +99,53 @@ def generate_sparse_data(n, m, noise_std, redundancy_rate, features_fill, poly_d
features_mask = w != 0

# Generate observations
X = random.standard_normal((n, m))
X = np.random.standard_normal((n, m))

y_true = X @ w
y = y_true + np.random.standard_normal((n, 1)) * noise_std

print("Synthetic sparse test dataset is generated")
print(f"Number of observations: {n}, features dim. {m}, number of informative features {sum(features_mask.reshape(-1))}")
print(
f"Number of observations: {n}, features dim. {m}, number of informative features {sum(features_mask.reshape(-1))}")
print(f"Observations SNR: {calculate_snr(y_true, noise_std):.3f} dB")
print(f"Features fill: {features_fill}", end='\n\n')

return y, X, w, y_true, features_mask


def _generate_grouped_data(n, m, noise_std, redundancy_rate, features_fill, num_groups):
    """
    Generate a synthetic regression dataset whose weights are organised in contiguous groups.

    Parameters
    ----------
    n: number of observations (rows of X)
    m: number of features (columns of X)
    noise_std: multiplier applied to the standard normal noise added to y_true
    redundancy_rate: a group other than the first one is kept with probability 1 - redundancy_rate
    features_fill: 'const' (one random integer per kept group) or
        'normal' (standard normal values for every weight of a kept group)
    num_groups: number of contiguous groups the weight vector is split into, at least 2

    Returns y: vector of observations (n,1),
            X: design matrix (n, m),
            w: weights (m,1),
            y_true: noise-free observations (n,1),
            features_mask: boolean mask (m,1) of non-zero weights,
            groups_labels: (m,1) labels, i + 1 for kept group i and 0 for dropped groups
    -------

    Raises ValueError if num_groups is None or less than 2, or if features_fill is unknown.
    """
    if num_groups is None or num_groups < 2:
        raise ValueError("The number of groups cannot be None or less than 2")

    # Draw the inner split points of the weight vector, values lie in [1, m - 2]
    group_end_idx = np.random.choice(m - 2, num_groups - 1, replace=False) + 1
    group_end_idx.sort()

    w, groups_labels = np.zeros((m, 1)), np.zeros((m, 1))

    # The in-place writes below rely on np.split returning views of w / groups_labels
    _x_hat = np.split(w, group_end_idx)
    _groups_labels = np.split(groups_labels, group_end_idx)

    # Decide whether to keep the group and fill it with values if needed.
    # The first group is always kept.
    for i, (x_hat_group, group_labels) in enumerate(zip(_x_hat, _groups_labels)):
        if i == 0 or np.random.binomial(1, 1 - redundancy_rate) == 1:
            group_labels[:] = i + 1

            if features_fill == 'const':
                x_hat_group[:] = np.random.randint(1, 4 * num_groups)

            elif features_fill == 'normal':
                x_hat_group[:] = np.random.standard_normal(x_hat_group.shape)

            else:
                raise ValueError(f"Unknown fill value: {features_fill}")

    features_mask = w != 0

    # Generate observations
    X = np.random.standard_normal((n, m))

    y_true = X @ w
    # The noise must be a column vector: a flat (n,) vector would broadcast
    # against y_true (n, 1) and silently turn y into an (n, n) matrix.
    y = y_true + np.random.standard_normal((n, 1)) * noise_std

    return y, X, w, y_true, features_mask, groups_labels


"""
Support utils
"""


def calculate_snr(y_true, noise_std):
    """
    Mean per-observation signal-to-noise ratio in dB.

    Parameters
    ----------
    y_true: noise-free observations (scalar or array)
    noise_std: noise multiplier, scalar or array broadcastable against y_true

    Returns the mean of 20 * log10(|y_true / noise_std|) over all entries.
    Entries where noise_std is zero count as +inf dB (noise-free signal),
    entries where y_true is zero and noise_std is not count as -inf dB.
    -------
    """
    amplitude = np.abs(np.asarray(y_true, dtype=float))
    sigma = np.abs(np.asarray(noise_std, dtype=float))

    # Division by zero and log10(0) are expected here and resolved explicitly below
    with np.errstate(divide='ignore', invalid='ignore'):
        snr_db = 20 * np.log10(amplitude / sigma)

    # Zero noise means an unbounded ratio, not the -inf that log10(0) would give
    snr_db = np.where(sigma == 0, np.inf, snr_db)

    return snr_db.mean()
23 changes: 12 additions & 11 deletions src/fes/pipeline_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@

from kedro.pipeline import Pipeline

import fes.pipelines.data_processing.pipeline as dpp
import fes.pipelines.data_science.pipeline as dsp
from src.fes.pipelines.data_processing import *
from src.fes.pipelines.data_science import perm_importance_pipeline


def register_pipelines() -> Dict[str, Pipeline]:
Expand All @@ -41,15 +41,16 @@ def register_pipelines() -> Dict[str, Pipeline]:
Returns:
A mapping from a pipeline name to a ``Pipeline`` object.
"""
synth_dataset = dpp.sparse_synth_test_data_pipeline()
synth_dataset_poly = dpp.sparse_synth_test_data_poly_pipeline()
synth_dataset_noise = dpp.sparse_synth_test_data_noise_pipeline()
synth_dataset_rr = dpp.sparse_synth_test_data_rr_pipeline()
perm_importance = dsp.perm_importance_pipeline()

synth_dataset = sparse_synth_test_data_pipeline()
synth_dataset_poly = sparse_synth_test_data_poly_pipeline()
synth_dataset_noise = sparse_synth_test_data_noise_pipeline()
synth_dataset_rr = sparse_synth_test_data_rr_pipeline()
perm_importance = perm_importance_pipeline()

return {
"__default__": synth_dataset + perm_importance, # Test 1
"synth_poly_pi": synth_dataset_poly + perm_importance, # Test 2
"synth_noise_pi": synth_dataset_noise + perm_importance, # Test 3
"synth_rr_pi": synth_dataset_rr + perm_importance # Test 4
"__default__": synth_dataset + perm_importance, # Test 1
"synth_poly_pi": synth_dataset_poly + perm_importance, # Test 2
"synth_noise_pi": synth_dataset_noise + perm_importance, # Test 3
"synth_rr_pi": synth_dataset_rr + perm_importance # Test 4
}
1 change: 1 addition & 0 deletions src/fes/pipelines/data_processing/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .pipeline import *
2 changes: 1 addition & 1 deletion src/fes/pipelines/data_processing/nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ def arrange_sparse_synth_test_data(sparse_synth_test_data):
y, X, w, y_true, features_mask = sparse_synth_test_data
return y, X, w, y_true, features_mask

# TODO. Grouped data
# TODO. Grouped data
6 changes: 6 additions & 0 deletions src/fes/pipelines/data_processing/pipeline.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
__all__ = ['sparse_synth_test_data_pipeline',
'sparse_synth_test_data_poly_pipeline',
'sparse_synth_test_data_noise_pipeline',
'sparse_synth_test_data_rr_pipeline',
]

from kedro.pipeline import Pipeline, node

from .nodes import arrange_sparse_synth_test_data
Expand Down
1 change: 1 addition & 0 deletions src/fes/pipelines/data_science/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .pipeline import *
8 changes: 5 additions & 3 deletions src/fes/pipelines/data_science/nodes.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,18 @@
import numpy as np
from sklearn.base import RegressorMixin

from sklearn.linear_model import LinearRegression
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error, r2_score


def fit_model(y, X):
def fit_model(y: np.ndarray, X: np.ndarray) -> LinearRegression:
"""
Parameters
----------
y: (n,1) vector of observations
X: (n,m) design matrix
Returns regressor: fitted regressor compatible with sklearn interface
Returns linear regressor: fitted regressor compatible with sklearn interface
-------

"""
Expand All @@ -20,7 +21,8 @@ def fit_model(y, X):
return regressor


def evaluate_perm_importance(regressor, y, X, w, y_true, features_mask, parameters):
def evaluate_perm_importance(regressor: RegressorMixin, y: np.ndarray, X: np.ndarray, w: np.ndarray,
y_true: np.ndarray, features_mask: np.ndarray, parameters: dict):
"""
Parameters
----------
Expand Down
2 changes: 2 additions & 0 deletions src/fes/pipelines/data_science/pipeline.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
__all__ = ['perm_importance_pipeline']

from kedro.pipeline import Pipeline, node

from .nodes import fit_model, evaluate_perm_importance
Expand Down
1 change: 1 addition & 0 deletions src/fes/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .dataset import *
9 changes: 9 additions & 0 deletions src/fes/utils/dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
"""
Support datasets utils
"""

import numpy as np


def calculate_snr(y_true, noise_std):
    """
    Mean per-observation signal-to-noise ratio in dB.

    Parameters
    ----------
    y_true: noise-free observations (scalar or array)
    noise_std: noise multiplier, scalar or array broadcastable against y_true

    Returns the mean of 20 * log10(|y_true / noise_std|) over all entries.
    Entries where noise_std is zero count as +inf dB (noise-free signal),
    entries where y_true is zero and noise_std is not count as -inf dB.
    -------
    """
    amplitude = np.abs(np.asarray(y_true, dtype=float))
    sigma = np.abs(np.asarray(noise_std, dtype=float))

    # Division by zero and log10(0) are expected here and resolved explicitly below
    with np.errstate(divide='ignore', invalid='ignore'):
        snr_db = 20 * np.log10(amplitude / sigma)

    # Zero noise means an unbounded ratio, not the -inf that log10(0) would give
    snr_db = np.where(sigma == 0, np.inf, snr_db)

    return snr_db.mean()