Skip to content

Commit db3ef79

Browse files
adding missing values for time series, fixing outliers (2d and more),… (#21)
… refactoring notebooks
2 parents b7be474 + 0e8b6b6 commit db3ef79

24 files changed

+847
-445
lines changed
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
import abc
2+
from typing import Tuple
3+
4+
import numpy as np
5+
from numpy.random import default_rng
6+
7+
from badgers.core.base import GeneratorMixin
8+
from badgers.core.decorators.time_series import preprocess_inputs
9+
10+
11+
class MissingValuesGenerator(GeneratorMixin):
    """
    Base class for generators that introduce missing values in time-series data
    """

    def __init__(self, random_generator=None):
        """
        :param random_generator: a random number generator; when None, a fresh
            ``default_rng(seed=0)`` is created for this instance
        """
        # A generator created in the signature would be built once, at definition
        # time, and its state would be shared by every instance relying on the
        # default. Creating it here keeps instances independent and reproducible.
        if random_generator is None:
            random_generator = default_rng(seed=0)
        self.random_generator = random_generator
        # Empty until a concrete generate() records which entries it set to missing.
        self.missing_indices_ = []

    @abc.abstractmethod
    def generate(self, X, y, **params) -> Tuple:
        """
        Introduce missing values; to be implemented by subclasses.

        :param X: the input features
        :param y: the target
        :param params: generator-specific parameters
        :return: a tuple (X, y)
        """
        # NOTE(review): abstractmethod is only enforced if GeneratorMixin uses
        # abc.ABCMeta as its metaclass - confirm against badgers.core.base.
        pass
27+
28+
29+
class MissingAtRandomGenerator(MissingValuesGenerator):
    """
    Randomly set data points to nan (missing)
    """

    def __init__(self, random_generator=None):
        """
        :param random_generator: a random number generator; when None, a fresh
            ``default_rng(seed=0)`` is created for this instance
        """
        # Resolve the default here rather than in the signature: a generator built
        # in the signature is created once and shared by all default instances.
        if random_generator is None:
            random_generator = default_rng(seed=0)
        super().__init__(random_generator=random_generator)

    @preprocess_inputs
    def generate(self, X, y, n_missing: int = 10) -> Tuple:
        """
        Randomly set values to np.nan (missing)

        Each selected row receives exactly one missing value, placed in a
        randomly chosen column. The chosen (row, column) positions are stored
        in ``self.missing_indices_``.

        :param X: the input data, indexed positionally through ``X.iloc``
            (presumably a pandas DataFrame produced by ``preprocess_inputs`` - TODO confirm)
        :param y: the target, returned unchanged
        :param n_missing: the number of missing values to generate; cannot
            exceed the number of rows of X since rows are drawn without replacement
        :return: a tuple (X, y) where X contains the generated missing values
        :raises ValueError: if n_missing is larger than the number of rows of X
        """
        n_rows, n_cols = X.shape[0], X.shape[1]

        # Rows are sampled without replacement, so asking for more missing values
        # than rows can never succeed; fail early with an explicit message.
        if n_missing > n_rows:
            raise ValueError(
                f"n_missing ({n_missing}) cannot be larger than the number of rows of X ({n_rows})"
            )

        # generate missing values indices: distinct rows, one random column per row
        rows = self.random_generator.choice(n_rows, size=n_missing, replace=False, p=None)
        cols = self.random_generator.integers(low=0, high=n_cols, size=n_missing)

        self.missing_indices_ = list(zip(rows, cols))

        for r, c in self.missing_indices_:
            X.iloc[r, c] = np.nan

        return X, y

badgers/generators/time_series/outliers.py

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
import abc
22
from typing import Tuple
33

4+
import numpy as np
45
from numpy.random import default_rng
56

67
from badgers.core.base import GeneratorMixin
7-
from badgers.core.utils import random_sign
88
from badgers.core.decorators.time_series import preprocess_inputs
9+
from badgers.core.utils import random_sign
10+
911

1012
class OutliersGenerator(GeneratorMixin):
1113
"""
@@ -48,10 +50,13 @@ def generate(self, X, y, n_outliers: int = 10) -> Tuple:
4850
:return:
4951
"""
5052
# generate extreme values indices and values
51-
self.outliers_indices_ = self.random_generator.choice(X.shape[0], size=n_outliers, replace=False, p=None)
53+
rows = self.random_generator.choice(X.shape[0], size=n_outliers, replace=False, p=None)
54+
cols = self.random_generator.integers(low=0, high=X.shape[1], size=n_outliers)
5255

53-
for idx in self.outliers_indices_:
54-
X.iloc[idx, :] = 0
56+
self.outliers_indices_ = list(zip(rows, cols))
57+
58+
for r, c in self.outliers_indices_:
59+
X.iloc[r, c] = 0
5560

5661
return X, y
5762

@@ -84,15 +89,21 @@ def generate(self, X, y, n_outliers: int = 10,
8489
:return: the transformed array
8590
"""
8691
# generate extreme values indices and values
87-
self.outliers_indices_ = self.random_generator.choice(X.shape[0], size=n_outliers, replace=False, p=None)
92+
delta = int(local_window_size / 2)
93+
94+
rows = self.random_generator.choice(
95+
np.arange(delta, X.shape[0] - delta, dtype=int), size=n_outliers, replace=False, p=None)
96+
cols = self.random_generator.integers(low=0, high=X.shape[1], size=n_outliers)
97+
98+
self.outliers_indices_ = list(zip(rows, cols))
8899

89-
for idx in self.outliers_indices_:
90-
local_window = X.iloc[idx - int(local_window_size / 2):idx + int(local_window_size / 2), :]
100+
for r, c in self.outliers_indices_:
101+
local_window = X.iloc[r - delta:r + delta, c]
91102
local_mean = local_window.mean(axis=0)
92103
local_std = local_window.std(axis=0)
93-
value = local_mean + random_sign(self.random_generator, size=X.shape[1]) * (
94-
3. * local_std + self.random_generator.exponential(size=X.shape[1]))
104+
value = local_mean + random_sign(self.random_generator) * (
105+
3. * local_std + self.random_generator.exponential())
95106
# updating with new outliers
96-
X.iloc[idx, :] = value
107+
X.iloc[r, c] = value
97108

98109
return X, y

docs/dev/Create-New-Tabular-Generators.ipynb

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -228,7 +228,7 @@
228228
"\n",
229229
"First, we generate some data using sklearn make_blob utility.\n",
230230
"\n",
231-
"Second, we instanciate our generator and apply the generate function on our data.\n",
231+
"Second, we instantiate our generator and apply the generate function on our data.\n",
232232
"\n",
233233
"Finally, we plot the data using seaborn (or matplotlib or plotly, or bokeh...)"
234234
]
@@ -319,7 +319,7 @@
319319
"id": "5b40fee6-c51a-4948-9cd5-4e4ea4c2a1ce",
320320
"metadata": {},
321321
"source": [
322-
"## Instanciate our generator"
322+
"## Instantiate our generator"
323323
]
324324
},
325325
{
@@ -470,7 +470,7 @@
470470
"id": "5ce36322-a02e-4c95-a181-deefc3473a49",
471471
"metadata": {},
472472
"source": [
473-
"## Instanciate the generator, generate data with different `lam` values and plot the results"
473+
"## Instantiate the generator, generate data with different `lam` values and plot the results"
474474
]
475475
},
476476
{

docs/tutorials/Changepoints-Time-Series.ipynb

Lines changed: 0 additions & 179 deletions
This file was deleted.

docs/tutorials/Outliers-Time-Series.ipynb

Lines changed: 0 additions & 241 deletions
This file was deleted.

0 commit comments

Comments
 (0)