Skip to content

Commit cd77477

Browse files
Dev 0 0 12 (#31)
Refactoring outliers, adding novel outlier generation techniques from Steinbuss et al.
2 parents 69b901a + 61db13f commit cd77477

File tree

16 files changed

+391
-248
lines changed

16 files changed

+391
-248
lines changed

.github/workflows/documentation.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,5 +21,5 @@ jobs:
2121
path: .cache
2222
restore-keys: |
2323
mkdocs-material-
24-
- run: pip install mkdocs mkdocstrings[python] mkdocs-gen-files mkdocs-material mkdocs-literate-nav mkdocs-jupyter
24+
- run: pip install -U mkdocs mkdocstrings[python] mkdocs-gen-files mkdocs-material mkdocs-literate-nav mkdocs-jupyter
2525
- run: mkdocs gh-deploy --force

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,3 +165,6 @@ cython_debug/
165165
/badgers/uncertainty-main-uncertainty-generate-augmentation/
166166
/experiments/
167167
/.continue/
168+
/uncertainty-main-uncertainty-generate-augmentation/
169+
/mcp/
170+
/profiling_tests/
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
import abc
2+
3+
import numpy as np
4+
import sklearn.base
5+
from numpy.random import default_rng
6+
from sklearn.pipeline import make_pipeline
7+
from sklearn.preprocessing import StandardScaler
8+
9+
from badgers.core.base import GeneratorMixin
10+
from badgers.core.decorators.tabular_data import preprocess_inputs
11+
12+
13+
class OutliersGenerator(GeneratorMixin):
    """
    Base class for transformers that add outliers to tabular data
    """

    def __init__(self, random_generator: np.random.Generator = None):
        """
        Initialize the OutliersGenerator with a random number generator.

        :param random_generator: An instance of numpy's random number generator.
            If None (the default), a new generator with seed 0 is created for this instance.
        """
        # The default generator is built here rather than in the signature: a default such as
        # `default_rng(seed=0)` is evaluated only once, when the function is defined, so every
        # instance created without an explicit generator would share (and advance) the same state.
        if random_generator is None:
            random_generator = default_rng(seed=0)
        self.random_generator = random_generator

    @abc.abstractmethod
    def generate(self, X, y=None, **params):
        """
        Abstract method to generate outliers. Must be implemented by subclasses.

        :param X: Input features (pandas DataFrame or numpy array).
        :param y: Target variable (pandas Series or numpy array).
        :param params: Additional parameters required for outlier generation.
        """
        pass
36+
37+
38+
class DecompositionAndOutlierGenerator(OutliersGenerator):
    """
    Generates outliers in a standardized, dimensionality-reduced space and maps them back
    to the original feature space.
    """

    def __init__(self, decomposition_transformer: sklearn.base.TransformerMixin, outlier_generator: OutliersGenerator):
        """
        Initialize the DecompositionAndOutlierGenerator with a decomposition transformer and an outlier generator.

        The random generator of `outlier_generator` is reused as this object's random generator.

        :param decomposition_transformer: The dimensionality reduction transformer applied to the data before
            generating outliers. It must expose an `inverse_transform` attribute.
        :param outlier_generator: The outlier generator applied to the transformed data.
        :raises AssertionError: if `decomposition_transformer` has no `inverse_transform` attribute.
        """
        can_invert = hasattr(decomposition_transformer, 'inverse_transform')
        assert can_invert, \
            f'the decomposition transformer class must implement the inverse_transform function.' \
            f'\nUnfortunately the class {decomposition_transformer} does not'
        super().__init__(random_generator=outlier_generator.random_generator)

        self.outlier_generator = outlier_generator
        self.decomposition_transformer = decomposition_transformer

    @preprocess_inputs
    def generate(self, X, y=None, **params):
        """
        Randomly generate outliers by combining a dimensionality reduction technique
        with an outlier generator.

        Steps:

        1. Standardize the input data with a StandardScaler
        2. Apply the dimensionality reduction transformer
        3. Generate outliers in the reduced space with the wrapped outlier generator
        4. Invert the dimensionality reduction and the standardization

        :param X: the input features
        :param y: the regression target, class labels, or None; forwarded as-is to the wrapped outlier generator
        :param params: keyword arguments forwarded to the `generate` method of the wrapped outlier generator
        :return: a tuple made of the generated outliers mapped back to the original feature space
            and the second value returned by the wrapped outlier generator
        """
        # one pipeline for both directions: scaling followed by the decomposition
        scale_then_reduce = make_pipeline(StandardScaler(), self.decomposition_transformer)
        reduced = scale_then_reduce.fit_transform(X)

        # outliers are created in the reduced space by the wrapped generator
        reduced_outliers, outlier_labels = self.outlier_generator.generate(reduced, y, **params)

        # undo the decomposition and the standardization
        restored = scale_then_reduce.inverse_transform(reduced_outliers)
        return restored, outlier_labels
Lines changed: 206 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,206 @@
1+
import numpy as np
2+
from numpy.random import default_rng
3+
from sklearn.preprocessing import StandardScaler, MinMaxScaler
4+
5+
from badgers.core.decorators.tabular_data import preprocess_inputs
6+
from badgers.core.utils import random_sign, random_spherical_coordinate
7+
from badgers.generators.tabular_data.outliers import OutliersGenerator
8+
9+
10+
class HyperCubeSampling(OutliersGenerator):
    """
    Sampling uniformly at random within a hypercube encapsulating all the instances


    See section 6.1.1 in [1]

    [1] Georg Steinbuss and Klemens Böhm. 2021.
        Generating Artificial Outliers in the Absence of Genuine Ones — A Survey.
        ACM Trans. Knowl. Discov. Data 15, 2, Article 30 (April 2021), 37 pages.
        https://doi.org/10.1145/3447822
    """

    def __init__(self, random_generator=None):
        """
        Initialize the HyperCubeSampling with a random number generator.

        :param random_generator: An instance of numpy's random number generator.
            If None (the default), a new generator with seed 0 is created for this instance.
        """
        # Create the default generator per instance: a `default_rng(seed=0)` default in the
        # signature is evaluated only once and would be shared by every instance.
        if random_generator is None:
            random_generator = default_rng(seed=0)
        super().__init__(random_generator)

    @preprocess_inputs
    def generate(self, X, y=None, n_outliers: int = 10, expansion: float = 0.0):
        """
        Sample `n_outliers` points uniformly at random in a hypercube built from the
        per-attribute minimum and maximum of X.

        How to set the values for expansion.
        By default expansion = 0: the hypercube covers all the instances, using min and max as boundaries.
        It is possible to make the hypercube bigger, as proposed in [1] section 6.1.1:

            Instances from Data usually determine the bounds a, b of the hyper-rectangle. For this reason,
            this approach needs them as input. Tax and Duin and Fan et al. state only that these bounds
            should be chosen so that the hyper-rectangle encapsulates all genuine instances. Other work uses
            the minimum and maximum for each attribute obtained from Data. Theiler and Cai mention that the
            boundary does not need to be far beyond these boundaries. Abe et al. propose the rule that the
            boundary should expand the minimum and maximum by 10%. Désir et al. propose to expand
            the boundary by 20%.

        For expanding the hypercube by 10% use expansion = 0.1, for 20% use 0.2, etc.
        The expansion is applied in min-max scaled space, i.e., for each attribute both the lower and the
        upper bound are pushed outwards by `expansion` times the (max - min) range of that attribute.

        :param X: the input features (pandas DataFrame or numpy array).
        :param y: the class labels, target values, or None; not used in the body of this method.
        :param n_outliers: The number of outliers to generate.
        :param expansion: how much the hypercube shall be expanded beyond the (min, max) range (0.1 == 10%).
            Must be non-negative.
        :return: A tuple (outliers, yt). `outliers` holds only the generated outliers (not the input data),
            mapped back to the original scale with `MinMaxScaler.inverse_transform`. `yt` is a numpy array
            containing the label "outliers" once per generated outlier.
        :raises AssertionError: if expansion is negative.
        """
        assert expansion >= 0, 'expansion must be non-negative'
        # bounds of the hypercube expressed in min-max scaled space, where the data spans [0, 1]
        low = 0 - expansion
        high = 1 + expansion

        # the scaler is only fitted: it is used to map the sampled points back to the original scale
        scaler = MinMaxScaler()
        scaler.fit(X)

        outliers = self.random_generator.uniform(low=low, high=high, size=(n_outliers, X.shape[1]))

        # add "outliers" as labels for outliers
        yt = np.array(["outliers"] * len(outliers))

        return scaler.inverse_transform(outliers), yt
69+
70+
71+
class ZScoreSamplingGenerator(OutliersGenerator):
    """
    Randomly generates outliers as data points with an absolute z-score > 3.

    Very similar to "GaussTail" in section 6.1.5 in [1]

    [1] Georg Steinbuss and Klemens Böhm. 2021.
        Generating Artificial Outliers in the Absence of Genuine Ones — A Survey.
        ACM Trans. Knowl. Discov. Data 15, 2, Article 30 (April 2021), 37 pages.
        https://doi.org/10.1145/3447822
    """

    def __init__(self, random_generator=None):
        """
        Initialize the ZScoreSamplingGenerator with a random number generator.

        :param random_generator: An instance of numpy's random number generator.
            If None (the default), a new generator with seed 0 is created for this instance.
        """
        # Create the default generator per instance: a `default_rng(seed=0)` default in the
        # signature is evaluated only once and would be shared by every instance.
        if random_generator is None:
            random_generator = default_rng(seed=0)
        super().__init__(random_generator)

    @preprocess_inputs
    def generate(self, X, y=None, n_outliers: int = 10, scale: float = 1.0):
        r"""
        Randomly generates outliers as data points with an absolute z-score > 3.

        The process involves the following steps:
        1. Fit a standardization (mean of 0 and variance of 1) on the input data.
        2. Generate outliers in standardized space by, for each dimension of each outlier:
            - drawing a random sign with `random_sign`,
            - setting the magnitude to 3 plus a random number drawn from an exponential distribution
            (see https://numpy.org/doc/stable/reference/random/generated/numpy.random.Generator.exponential.html).
        3. Apply the inverse of the standardization to convert the generated outliers back to the original scale.

        :param X: the input features (pandas DataFrame or numpy array).
        :param y: the class labels, target values, or None; not used in the body of this method.
        :param n_outliers: The number of outliers to generate.
        :param scale: float or array_like of floats (the scale parameter from https://numpy.org/doc/stable/reference/random/generated/numpy.random.Generator.exponential.html)
            The scale parameter, :math:`\beta = 1/\lambda`. Must be
            non-negative.
        :return: A tuple (outliers, yt). `outliers` holds only the generated outliers (not the input data),
            mapped back to the original scale with `StandardScaler.inverse_transform`. `yt` is a numpy array
            containing the label "outliers" once per generated outlier.
        """

        # the scaler is only fitted: it is used to map the sampled points back to the original scale
        scaler = StandardScaler()
        scaler.fit(X)
        n_features = X.shape[1]

        # generate outliers, one row at a time (the order of the random draws is kept:
        # signs first, then magnitudes, for each outlier)
        outliers = np.array([
            random_sign(self.random_generator, size=n_features) * (
                    3. + self.random_generator.exponential(size=n_features, scale=scale))
            for _ in range(n_outliers)
        ])

        # add "outliers" as labels for outliers
        yt = np.array(["outliers"] * len(outliers))

        return scaler.inverse_transform(outliers), yt
136+
137+
138+
class HypersphereSamplingGenerator(OutliersGenerator):
    """
    Generates outliers by sampling points from a hypersphere with radius at least 3 sigma

    Very similar to "GaussTail" in section 6.1.5 in [1]

    [1] Georg Steinbuss and Klemens Böhm. 2021.
        Generating Artificial Outliers in the Absence of Genuine Ones — A Survey.
        ACM Trans. Knowl. Discov. Data 15, 2, Article 30 (April 2021), 37 pages.
        https://doi.org/10.1145/3447822
    """

    def __init__(self, random_generator=None):
        """
        Initialize the HypersphereSamplingGenerator with a random number generator.

        :param random_generator: An instance of numpy's random number generator.
            If None (the default), a new generator with seed 0 is created for this instance.
        """
        # Create the default generator per instance: a `default_rng(seed=0)` default in the
        # signature is evaluated only once and would be shared by every instance.
        if random_generator is None:
            random_generator = default_rng(seed=0)
        super().__init__(random_generator)

    @preprocess_inputs
    def generate(self, X, y=None, n_outliers: int = 10, scale: float = 1.0):
        r"""
        Randomly generates outliers by sampling points from a hypersphere.

        The process involves the following steps:
        1. Fit a standardization (mean of 0 and variance of 1) on the input data.
        2. For each outlier, set the radius to be 3 plus a random number drawn from an exponential distribution
            (see https://numpy.org/doc/stable/reference/random/generated/numpy.random.Generator.exponential.html).
        3. Obtain a point for that radius from `random_spherical_coordinate`, which is given the random
            generator, the number of features and the radius.
        4. Apply the inverse of the standardization to convert the generated outliers back to the original scale.

        :param X: the input features (pandas DataFrame or numpy array).
        :param y: the class labels, target values, or None; not used in the body of this method.
        :param n_outliers: The number of outliers to generate.
        :param scale: float (the scale parameter from https://numpy.org/doc/stable/reference/random/generated/numpy.random.Generator.exponential.html)
            The scale parameter, :math:`\beta = 1/\lambda`. Must be
            non-negative.
        :return: A tuple (outliers, yt). `outliers` holds only the generated outliers (not the input data),
            mapped back to the original scale with `StandardScaler.inverse_transform`. `yt` is a numpy array
            containing the label "outliers" once per generated outlier.
        """

        # the scaler is only fitted: it is used to map the sampled points back to the original scale
        scaler = StandardScaler()
        scaler.fit(X)
        n_features = X.shape[1]

        # computing outliers, one row at a time (the radius is drawn before the helper is called,
        # which keeps the order of the random draws unchanged)
        outliers = np.array([
            random_spherical_coordinate(
                random_generator=self.random_generator,
                size=n_features,
                radius=3. + self.random_generator.exponential(scale=scale)
            )
            for _ in range(n_outliers)
        ])

        # add "outliers" as labels for outliers
        yt = np.array(["outliers"] * len(outliers))

        return scaler.inverse_transform(outliers), yt
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
import numpy as np
2+
import pandas as pd
3+
from numpy.random import default_rng
4+
5+
from badgers.core.decorators.tabular_data import preprocess_inputs
6+
from badgers.generators.tabular_data.outliers import OutliersGenerator
7+
8+
9+
class UniformInstanceAttributeSampling(OutliersGenerator):
    """
    Randomly generates outliers by sampling from existing instances attributes uniformly at random
    """

    def __init__(self, random_generator=None):
        """
        Initialize the UniformInstanceAttributeSampling with a random number generator.

        :param random_generator: An instance of numpy's random number generator.
            If None (the default), a new generator with seed 0 is created for this instance.
        """
        # Create the default generator per instance: a `default_rng(seed=0)` default in the
        # signature is evaluated only once and would be shared by every instance.
        if random_generator is None:
            random_generator = default_rng(seed=0)
        super().__init__(random_generator)

    @preprocess_inputs
    def generate(self, X, y=None, n_outliers: int = 10):
        """
        Build each outlier by drawing, independently for every attribute, one of the values
        observed for that attribute in X.

        Each column is sampled on its own with `Generator.choice`, so a generated row usually combines
        values coming from different instances.

        :param X: the input features (pandas DataFrame or numpy array).
            NOTE(review): the body uses `X.iloc` and `X.columns`, so X must be a DataFrame at this point;
            presumably `preprocess_inputs` takes care of the conversion, to be confirmed.
        :param y: the class labels, target values, or None; not used in the body of this method.
        :param n_outliers: The number of outliers to generate.
        :return: A tuple (outliers, yt). `outliers` is a pandas DataFrame holding only the generated outliers
            (not the input data), with the same columns as X. `yt` is a numpy array containing the label
            "outliers" once per generated outlier.
        """

        # one draw of n_outliers values per column, in column order
        sampled_columns = [
            self.random_generator.choice(X.iloc[:, i], size=n_outliers)
            for i in range(X.shape[1])
        ]

        # stacked result is (n_columns, n_outliers): transpose to get one outlier per row
        outliers = pd.DataFrame(
            data=np.stack(sampled_columns).T,
            columns=X.columns
        )

        # add "outliers" as labels for outliers
        yt = np.array(["outliers"] * len(outliers))

        return outliers, yt

0 commit comments

Comments
 (0)