import numpy as np
from numpy.random import default_rng
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from badgers.core.decorators.tabular_data import preprocess_inputs
from badgers.core.utils import random_sign, random_spherical_coordinate
from badgers.generators.tabular_data.outliers import OutliersGenerator


class HyperCubeSampling(OutliersGenerator):
    """
    Sampling uniformly at random within a hypercube encapsulating all the instances.

    See section 6.1.1 in [1].

    [1] Georg Steinbuss and Klemens Böhm. 2021.
    Generating Artificial Outliers in the Absence of Genuine Ones — A Survey.
    ACM Trans. Knowl. Discov. Data 15, 2, Article 30 (April 2021), 37 pages.
    https://doi.org/10.1145/3447822
    """

    def __init__(self, random_generator=default_rng(seed=0)):
        """
        Initialize the HyperCubeSampling generator with a random number generator.

        :param random_generator: An instance of numpy's random number generator (default is a new generator with seed 0).
        """
        super().__init__(random_generator)

    @preprocess_inputs
    def generate(self, X, y, n_outliers: int = 10, expansion: float = 0.0):
        """
        Generates outliers by sampling uniformly at random within a hypercube encapsulating all the instances.

        How to set the value of `expansion`:
        by default expansion = 0.0, which means the hypercube covers exactly the instances, using the
        per-attribute minimum and maximum as boundaries.
        It is also possible to make the hypercube bigger, as proposed in [1], section 6.1.1 (the bracketed
        reference numbers below are those of the survey's bibliography):

            "Instances from Data usually determine the bounds a, b ∈ R^d. For this reason, this approach
            needs them as input. Tax and Duin [51] and Fan et al. [21] state only that these bounds should be
            chosen so that the hyper-rectangle encapsulates all genuine instances. [48] uses the minimum and
            maximum for each attribute obtained from Data. Theiler and Cai [52] mention that the
            boundary does not need to be far beyond these boundaries. Abe et al. [1] propose the rule that the
            boundary should expand the minimum and maximum by 10%. Désir et al. [17] propose to expand
            the boundary by 20%."

        To expand the hypercube by 10% use expansion = 0.1, by 20% use expansion = 0.2, etc.

        :param X: the input features (pandas DataFrame or numpy array).
        :param y: the class labels, target values, or None (if not provided).
        :param n_outliers: The number of outliers to generate.
        :param expansion: how much the hypercube is expanded beyond the (min, max) range of each attribute,
            as a fraction (0.1 == 10%).
        :return: A tuple (outliers, labels) containing the generated outliers (in the original scale) and an
            array of the same length filled with the label "outliers".
        """
        assert expansion >= 0
        # after min-max scaling the instances lie in [0, 1]^d, so sampling in
        # [-expansion, 1 + expansion]^d expands the hypercube accordingly
        low = 0 - expansion
        high = 1 + expansion

        scaler = MinMaxScaler()
        scaler.fit(X)

        outliers = self.random_generator.uniform(low=low, high=high, size=(n_outliers, X.shape[1]))

        # add "outliers" as labels for the generated outliers
        yt = np.array(["outliers"] * len(outliers))

        return scaler.inverse_transform(outliers), yt
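

# A minimal usage sketch, not part of the library code: it assumes sklearn's
# make_blobs for synthetic data and that, with numpy-array inputs, generate()
# returns numpy arrays as written above (the preprocess_inputs decorator may
# convert types). The name _demo_hyper_cube_sampling is hypothetical.
def _demo_hyper_cube_sampling():
    from sklearn.datasets import make_blobs

    # 100 genuine instances with 2 features
    X, y = make_blobs(n_samples=100, n_features=2, random_state=0)
    generator = HyperCubeSampling(random_generator=default_rng(seed=0))
    # expand the hypercube by 10% beyond the per-attribute (min, max) range
    outliers, labels = generator.generate(X, y, n_outliers=20, expansion=0.1)
    print(outliers.shape)  # expected: (20, 2)
    print(labels[0])       # expected: outliers

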
class ZScoreSamplingGenerator(OutliersGenerator):
    """
    Randomly generates outliers as data points with a z-score > 3.

    Very similar to "GaussTail" in section 6.1.5 in [1].

    [1] Georg Steinbuss and Klemens Böhm. 2021.
    Generating Artificial Outliers in the Absence of Genuine Ones — A Survey.
    ACM Trans. Knowl. Discov. Data 15, 2, Article 30 (April 2021), 37 pages.
    https://doi.org/10.1145/3447822
    """

    def __init__(self, random_generator=default_rng(seed=0)):
        """
        Initialize the ZScoreSamplingGenerator with a random number generator.

        :param random_generator: An instance of numpy's random number generator (default is a new generator with seed 0).
        """
        super().__init__(random_generator)

    @preprocess_inputs
    def generate(self, X, y, n_outliers: int = 10, scale: float = 1.0):
        r"""
        Randomly generates outliers as data points with a z-score > 3.

        The process involves the following steps:
        1. Standardize the input data so that it has a mean of 0 and a variance of 1.
        2. Generate outliers: for each dimension of each outlier, choose a random sign and set the
           absolute value to 3 plus a random number drawn from an exponential distribution
           (see https://numpy.org/doc/stable/reference/random/generated/numpy.random.Generator.exponential.html).
        3. Apply the inverse of the standardization transformation to map the generated outliers back to the original scale.

        :param X: the input features (pandas DataFrame or numpy array).
        :param y: the class labels, target values, or None (if not provided).
        :param n_outliers: The number of outliers to generate.
        :param scale: float or array_like of floats, the scale parameter of the exponential distribution,
            :math:`\beta = 1/\lambda`. Must be non-negative.
        :return: A tuple (outliers, labels) containing the generated outliers (in the original scale) and an
            array of the same length filled with the label "outliers".
        """

        # standardize X
        scaler = StandardScaler()
        scaler.fit(X)
        Xt = scaler.transform(X)

        # generate outliers: each coordinate is ±(3 + Exp(scale)),
        # i.e., its absolute z-score is strictly greater than 3;
        # np.array over the list already yields shape (n_outliers, n_features)
        outliers = np.array([
            random_sign(self.random_generator, size=Xt.shape[1]) * (
                3. + self.random_generator.exponential(size=Xt.shape[1], scale=scale))
            for _ in range(n_outliers)
        ])

        # add "outliers" as labels for the generated outliers
        yt = np.array(["outliers"] * len(outliers))

        return scaler.inverse_transform(outliers), yt
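

# A minimal usage sketch, not part of the library code: with numpy-array inputs
# and StandardScaler's default settings, every generated coordinate should have
# an absolute z-score above 3. The name _demo_z_score_sampling is hypothetical.
def _demo_z_score_sampling():
    from sklearn.datasets import make_blobs

    X, y = make_blobs(n_samples=100, n_features=2, random_state=0)
    generator = ZScoreSamplingGenerator(random_generator=default_rng(seed=0))
    outliers, labels = generator.generate(X, y, n_outliers=20, scale=1.0)
    # back in z-score space every coordinate lies outside [-3, 3]
    z = (outliers - X.mean(axis=0)) / X.std(axis=0)
    print(np.all(np.abs(z) > 3))  # expected: True

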
class HypersphereSamplingGenerator(OutliersGenerator):
    """
    Generates outliers by sampling points from a hypersphere with radius at least 3 sigma.

    Very similar to "GaussTail" in section 6.1.5 in [1].

    [1] Georg Steinbuss and Klemens Böhm. 2021.
    Generating Artificial Outliers in the Absence of Genuine Ones — A Survey.
    ACM Trans. Knowl. Discov. Data 15, 2, Article 30 (April 2021), 37 pages.
    https://doi.org/10.1145/3447822
    """

    def __init__(self, random_generator=default_rng(seed=0)):
        """
        Initialize the HypersphereSamplingGenerator with a random number generator.

        :param random_generator: An instance of numpy's random number generator (default is a new generator with seed 0).
        """
        super().__init__(random_generator)

    @preprocess_inputs
    def generate(self, X, y=None, n_outliers: int = 10, scale: float = 1.0):
        r"""
        Randomly generates outliers by sampling points from a hypersphere.

        The process involves the following steps:
        1. Standardize the input data so that it has a mean of 0 and a variance of 1.
        2. Generate outliers: choose angles uniformly at random for each dimension, and set the radius to
           3 plus a random number drawn from an exponential distribution
           (see https://numpy.org/doc/stable/reference/random/generated/numpy.random.Generator.exponential.html).
        3. Convert the spherical coordinates to Cartesian coordinates.
        4. Apply the inverse of the standardization transformation to map the generated outliers back to the original scale.

        :param X: the input features (pandas DataFrame or numpy array).
        :param y: the class labels, target values, or None (if not provided).
        :param n_outliers: The number of outliers to generate.
        :param scale: float, the scale parameter of the exponential distribution,
            :math:`\beta = 1/\lambda`. Must be non-negative.
        :return: A tuple (outliers, labels) containing the generated outliers (in the original scale) and an
            array of the same length filled with the label "outliers".
        """

        # standardize X
        scaler = StandardScaler()
        scaler.fit(X)
        Xt = scaler.transform(X)

        # compute outliers: a random direction on the unit hypersphere with
        # radius 3 + Exp(scale); np.array over the list already yields shape
        # (n_outliers, n_features)
        outliers = np.array([
            random_spherical_coordinate(
                random_generator=self.random_generator,
                size=Xt.shape[1],
                radius=3. + self.random_generator.exponential(scale=scale)
            )
            for _ in range(n_outliers)
        ])

        # add "outliers" as labels for the generated outliers
        yt = np.array(["outliers"] * len(outliers))

        return scaler.inverse_transform(outliers), yt
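

# A minimal usage sketch, not part of the library code: it assumes that
# random_spherical_coordinate returns a point at the given radius, so in
# standardized space every outlier should have a Euclidean norm of at least 3.
# The name _demo_hypersphere_sampling is hypothetical.
def _demo_hypersphere_sampling():
    from sklearn.datasets import make_blobs

    X, y = make_blobs(n_samples=100, n_features=2, random_state=0)
    generator = HypersphereSamplingGenerator(random_generator=default_rng(seed=0))
    outliers, labels = generator.generate(X, y, n_outliers=20, scale=1.0)
    # Euclidean norm in z-score space should be at least 3 for every outlier
    z = (outliers - X.mean(axis=0)) / X.std(axis=0)
    print(np.all(np.linalg.norm(z, axis=1) >= 3))  # expected: True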