Skip to content

Commit 69b901a

Browse files
Adding scaling factor as parameter #25 (#28)
2 parents 9817271 + adfccb6 commit 69b901a

28 files changed

+743
-330
lines changed

.github/workflows/tests.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,4 +37,4 @@ jobs:
3737
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
3838
- name: Run tests with tox
3939
# Run tox using the version of Python in `PATH`
40-
run: tox -e py
40+
run: tox -e py

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,3 +162,6 @@ cython_debug/
162162
.idea/
163163

164164
/notebooks/
165+
/badgers/uncertainty-main-uncertainty-generate-augmentation/
166+
/experiments/
167+
/.continue/

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,8 @@ from badgers.generators.tabular_data.noise import GaussianNoiseGenerator
2121

2222
X, y = make_blobs()
2323
trf = GaussianNoiseGenerator()
24-
Xt, yt = trf.generate(X,y,noise_std=0.5)
24+
25+
Xt, yt = trf.generate(X, y, noise_std=0.5)
2526
```
2627

2728
More examples are available in the [tutorials](https://fraunhofer-iese.github.io/badgers/tutorials/Imbalance-Tabular-Data/) section.

badgers/generators/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
"""
2-
Module containing all transformers
2+
Module containing all generators
33
"""
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
"""
2-
Module containing all the transformers that accept graph data as input
2+
This module contains all the generators that accept graph data as input.
33
"""

badgers/generators/graph/missingness.py

Lines changed: 40 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,9 @@ def __init__(self, random_generator: numpy.random.Generator = default_rng(seed=0
2424

2525
@abc.abstractmethod
2626
def generate(self, X, y=None, **params) -> Tuple:
27+
"""
28+
This method should be overridden by subclasses.
29+
"""
2730
pass
2831

2932

@@ -33,15 +36,28 @@ class NodesMissingCompletelyAtRandom(MissingGenerator):
3336
"""
3437

3538
def __init__(self, random_generator: numpy.random.Generator = default_rng(seed=0)):
39+
"""
40+
Initialize the missingness generator.
41+
42+
:param random_generator: A NumPy random number generator.
43+
Defaults to a default random number generator seeded with 0.
44+
:type random_generator: numpy.random.Generator
45+
"""
3646
super().__init__(random_generator=random_generator)
3747

3848
def generate(self, X, y=None, percentage_missing: float = 0.1) -> Tuple:
3949
"""
40-
41-
:param X:
42-
:param y:
43-
:param percentage_missing: The percentage of missing nodes (float value between 0 and 1 excluded)
44-
:return:
50+
Generate a graph with a specified percentage of missing nodes.
51+
52+
:param X: The input graph from which nodes will be removed.
53+
:type X: nx.Graph
54+
:param y: Optional target array associated with the nodes in the graph.
55+
If provided, the corresponding elements will also be removed.
56+
:type y: np.ndarray, optional
57+
:param percentage_missing: The proportion of nodes to be removed, given as a float strictly between 0 and 1 (both bounds excluded).
58+
:type percentage_missing: float
59+
:return: A tuple containing the modified graph with missing nodes and the modified target array (if provided).
60+
:rtype: Tuple[nx.Graph, Optional[np.ndarray]]
4561
"""
4662
assert 0 < percentage_missing < 1
4763
if not isinstance(X, nx.Graph):
@@ -70,15 +86,29 @@ class EdgesMissingCompletelyAtRandom(MissingGenerator):
7086
"""
7187

7288
def __init__(self, random_generator: numpy.random.Generator = default_rng(seed=0)):
89+
"""
90+
Initialize the missingness generator.
91+
92+
:param random_generator: A NumPy random number generator.
93+
Defaults to a default random number generator seeded with 0.
94+
:type random_generator: numpy.random.Generator
95+
"""
7396
super().__init__(random_generator=random_generator)
7497

7598
def generate(self, X, y=None, percentage_missing: float = 0.1) -> Tuple:
7699
"""
77-
78-
:param X:
79-
:param y:
80-
:param percentage_missing: The percentage of missing nodes (float value between 0 and 1 excluded)
81-
:return:
100+
Generate a graph with a specified percentage of missing edges.
101+
102+
:param X: The input graph from which edges will be removed.
103+
:type X: nx.Graph
104+
:param y: Optional target data associated with the edges in the graph.
105+
If provided, the corresponding elements will also be removed.
106+
Can be a dictionary where keys are edge tuples and values are target values.
107+
:type y: dict, optional
108+
:param percentage_missing: The proportion of edges to be removed, given as a float strictly between 0 and 1 (both bounds excluded).
109+
:type percentage_missing: float
110+
:return: A tuple containing the modified graph with missing edges and the modified target data (if provided).
111+
:rtype: Tuple[nx.Graph, Optional[dict]]
82112
"""
83113
assert 0 < percentage_missing < 1
84114
if not isinstance(X, nx.Graph):
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
"""
2-
Module containing all the transformers that accept tabular data as input
2+
This module contains all the generators that accept tabular data as input.
33
"""

badgers/generators/tabular_data/drift.py

Lines changed: 30 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,10 @@ class DriftGenerator(GeneratorMixin):
1717

1818
def __init__(self, random_generator=default_rng(seed=0)):
1919
"""
20-
:param random_generator: numpy.random.Generator, default default_rng(seed=0)
21-
A random generator
20+
Initialize the drift generator.
21+
:param random_generator: A NumPy random number generator used to generate random numbers.
22+
Defaults to a default random number generator seeded with 0.
23+
:type random_generator: numpy.random.Generator
2224
"""
2325
self.random_generator = random_generator
2426

@@ -27,7 +29,6 @@ def generate(self, X, y, **params):
2729
pass
2830

2931

30-
3132
class RandomShiftGenerator(DriftGenerator):
3233
"""
3334
Randomly shift (geometrical translation) values of each column independently of one another.
@@ -37,24 +38,27 @@ class RandomShiftGenerator(DriftGenerator):
3738

3839
def __init__(self, random_generator=default_rng(seed=0)):
3940
"""
41+
Initialize the RandomShiftGenerator.
4042
41-
:param random_generator: A random generator
42-
:param shift_std: The standard deviation of the amount of shift applied (shift is chosen from a normal distribution)
43+
:param random_generator: A NumPy random number generator used to generate random numbers.
44+
Defaults to a default random number generator seeded with 0.
45+
:type random_generator: numpy.random.Generator
4346
"""
4447
super().__init__(random_generator=random_generator)
4548

4649
@preprocess_inputs
47-
def generate(self, X, y=None, shift_std: Union[float,np.array] = 0.1):
50+
def generate(self, X, y=None, shift_std: Union[float, np.array] = 0.1):
4851
"""
4952
Randomly shift (geometrical translation) values of each column independently of one another.
50-
Data are first standardized (mean = 0, var = 1) and a random number is added to each column.
51-
The ith columns is simply translated: `$x_i \left arrow x_i + \epsilon_i$`
52-
53-
54-
:param X:
55-
:param y:
56-
:param shift_std:
57-
:return:
53+
Data are first standardized (mean = 0, var = 1), and a random number drawn from a normal distribution
54+
with mean 0 and standard deviation `shift_std` is added to each column.
55+
The ith column is simply translated: `$x_i \leftarrow x_i + \epsilon_i$`, where `$\epsilon_i \sim \mathcal{N}(0, \sigma_i^2)$` and `$\sigma_i$` is the `shift_std` value for column i.
56+
57+
:param X: Input features, a 2D array-like object (e.g., a Pandas DataFrame or a NumPy array).
58+
:param y: Target variable, a 1D array-like object (optional). Not used in this implementation.
59+
:param shift_std: Standard deviation of the normal distribution from which the random shifts are drawn.
60+
Can be a single float (applied to all columns) or an array of floats (one per column).
61+
:return: A tuple containing the modified feature matrix `X'` and the original target `y`.
5862
"""
5963
# normalize X
6064
scaler = StandardScaler()
@@ -77,20 +81,25 @@ class RandomShiftClassesGenerator(DriftGenerator):
7781

7882
def __init__(self, random_generator=default_rng(seed=0)):
7983
"""
80-
:param random_generator: A random generator
84+
Initialize the RandomShiftClassesGenerator.
85+
86+
:param random_generator: A NumPy random number generator used to generate random numbers.
87+
Defaults to a default random number generator seeded with 0.
88+
:type random_generator: numpy.random.Generator
8189
"""
8290
super().__init__(random_generator=random_generator)
8391

8492
@preprocess_inputs
85-
def generate(self, X, y, shift_std: Union[float,np.array] = 0.1):
93+
def generate(self, X, y, shift_std: Union[float, np.array] = 0.1):
8694
"""
8795
Randomly shift (geometrical translation) values of each class independently of one another.
88-
Data are first standardized (mean = 0, var = 1) and
89-
for each class a random number is added to all instances.
96+
Data are first standardized (mean = 0, var = 1) and for each class a random number is added to all instances.
9097
91-
:param X:
92-
:param y:
93-
:param shift_std: The standard deviation of the amount of shift applied (shift is chosen from a normal distribution)
98+
:param X: Input features, a 2D array-like object (e.g., a Pandas DataFrame or a NumPy array).
99+
:param y: Target variable, a 1D array-like object representing the class labels.
100+
:param shift_std: Standard deviation of the normal distribution from which the random shifts are drawn.
101+
Can be a single float (applied to all classes) or an array of floats (one per class).
102+
:return: A tuple containing the modified feature matrix `X'` and the original target `y`.
94103
"""
95104
# extract unique labels
96105
classes = np.unique(y)

badgers/generators/tabular_data/imbalance.py

Lines changed: 72 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -16,32 +16,61 @@ class ImbalanceGenerator(GeneratorMixin):
1616

1717
def __init__(self, random_generator=default_rng(seed=0)):
1818
"""
19-
:param random_generator: A random generator
19+
Initialize the ImbalanceGenerator with a specified random number generator.
20+
21+
:param random_generator: A NumPy random number generator used to generate random numbers.
22+
Defaults to a default random number generator seeded with 0.
23+
:type random_generator: numpy.random.Generator
2024
"""
2125
self.random_generator = random_generator
2226

2327
@abc.abstractmethod
2428
def generate(self, X, y=None, **params):
29+
"""
30+
Abstract method to generate imbalanced data from the input data.
31+
This method should be overridden by subclasses.
32+
33+
:param X: Input features, can be a pandas DataFrame or a numpy array.
34+
:type X: Union[pandas.DataFrame, numpy.ndarray]
35+
:param y: Target variable, can be a pandas Series or a numpy array.
36+
If None, it is assumed that the target is not provided.
37+
:type y: Union[pandas.Series, numpy.ndarray, None], optional
38+
:param params: Additional keyword arguments that might be required for specific implementations.
39+
:type params: dict
40+
"""
2541
pass
2642

2743

2844
class RandomSamplingFeaturesGenerator(ImbalanceGenerator):
2945

3046
def __init__(self, random_generator=default_rng(seed=0), ):
3147
"""
32-
:param random_generator: A random generator
48+
Initialize the RandomSamplingFeaturesGenerator with a specified random number generator.
49+
:param random_generator: A NumPy random number generator used to generate random numbers.
50+
Defaults to a default random number generator seeded with 0.
51+
:type random_generator: numpy.random.Generator
3352
"""
3453
super().__init__(random_generator=random_generator)
3554

3655
@preprocess_inputs
3756
def generate(self, X, y=None, sampling_proba_func=lambda X: normalize_proba(X.iloc[:, 0])):
3857
"""
39-
Randomly samples instances based on the features values in X
40-
41-
:param X:
42-
:param y:
43-
:param sampling_proba_func: A function that takes as input data and returns a sampling probability
44-
:return: Xt, yt
58+
Randomly samples instances based on the feature values in X using a specified sampling probability function.
59+
60+
The sampling probability function is applied to the input features X to determine the probability of each instance being sampled.
61+
By default, the first column of X is used to compute the normalized sampling probabilities.
62+
63+
:param X: Input features, can be a pandas DataFrame or a numpy array.
64+
:type X: Union[pandas.DataFrame, numpy.ndarray]
65+
:param y: Target variable, can be a pandas Series or a numpy array.
66+
If None, it is assumed that the target is not provided.
67+
:type y: Union[pandas.Series, numpy.ndarray, None], optional
68+
:param sampling_proba_func: A function that takes as input data (X) and returns a series of sampling probabilities.
69+
The function should ensure that the probabilities are normalized.
70+
:type sampling_proba_func: callable
71+
:return: A tuple containing the sampled features (Xt) and the corresponding target values (yt).
72+
If y is None, the second element of the returned tuple (yt) is None.
73+
:rtype: Tuple[Union[pandas.DataFrame, numpy.ndarray], Union[pandas.Series, numpy.ndarray, None]]
4574
"""
4675
# total number of instances that will be missing
4776
# sampling
@@ -59,23 +88,31 @@ class RandomSamplingClassesGenerator(ImbalanceGenerator):
5988

6089
def __init__(self, random_generator=default_rng(seed=0), ):
6190
"""
91+
Initialize the RandomSamplingClassesGenerator with a specified random number generator.
6292
63-
:param random_generator: A random generator
64-
93+
:param random_generator: A NumPy random number generator used to generate random numbers.
94+
Defaults to a default random number generator seeded with 0.
95+
:type random_generator: numpy.random.Generator
6596
"""
6697
super().__init__(random_generator=random_generator)
6798
self.transformed_labels_ = None
6899

69100
@preprocess_inputs
70101
def generate(self, X, y, proportion_classes: dict = None):
71102
"""
72-
Randomly samples instances for each classes
73-
74-
:param X:
75-
:param y:
76-
:param proportion_classes: Example for having in total 50% of class 'A', 30% of class 'B', and 20% of class 'C'
77-
proportion_classes={'A':0.5, 'B':0.3, 'C':0.2}
78-
:return:
103+
Randomly samples instances for each class based on the specified proportions.
104+
105+
:param X: Input features, can be a pandas DataFrame or a numpy array.
106+
:type X: Union[pandas.DataFrame, numpy.ndarray]
107+
:param y: Target variable, must be a pandas Series or a numpy array.
108+
:type y: Union[pandas.Series, numpy.ndarray]
109+
:param proportion_classes: A dictionary specifying the desired proportion of each class.
110+
The keys are class labels and the values are the desired proportions.
111+
For example, to have 50% of class 'A', 30% of class 'B', and 20% of class 'C',
112+
use `proportion_classes={'A': 0.5, 'B': 0.3, 'C': 0.2}`.
113+
:type proportion_classes: dict, optional
114+
:return: A tuple containing the sampled features (Xt) and the corresponding target values (yt).
115+
:rtype: Tuple[Union[pandas.DataFrame, numpy.ndarray], Union[pandas.Series, numpy.ndarray]]
79116
"""
80117
# local variables
81118
Xt = []
@@ -103,21 +140,32 @@ class RandomSamplingTargetsGenerator(ImbalanceGenerator):
103140

104141
def __init__(self, random_generator=default_rng(seed=0)):
105142
"""
143+
Initialize the RandomSamplingTargetsGenerator with a specified random number generator.
106144
107-
:param random_generator: A random generator
108-
:param sampling_proba_func: A function that takes y as input and returns a sampling probability
145+
:param random_generator: A NumPy random number generator used to generate random numbers.
146+
Defaults to a default random number generator seeded with 0.
147+
:type random_generator: numpy.random.Generator
109148
"""
110149
super().__init__(random_generator=random_generator)
111150
self.transformed_labels_ = None
112151

113152
@preprocess_inputs
114153
def generate(self, X, y, sampling_proba_func=lambda y: normalize_proba(y)):
115154
"""
116-
Randomly samples instances for each classes
117-
118-
:param X:
119-
:param y:
120-
:return:
155+
Randomly samples instances based on the target values in y using a specified sampling probability function.
156+
157+
The sampling probability function is applied to the target values y to determine the probability of each instance being sampled.
158+
By default, the target values are used to compute the normalized sampling probabilities.
159+
160+
:param X: Input features, can be a pandas DataFrame or a numpy array.
161+
:type X: Union[pandas.DataFrame, numpy.ndarray]
162+
:param y: Target variable, must be a pandas Series or a numpy array.
163+
:type y: Union[pandas.Series, numpy.ndarray]
164+
:param sampling_proba_func: A function that takes as input target values (y) and returns a series of sampling probabilities.
165+
The function should ensure that the probabilities are normalized.
166+
:type sampling_proba_func: callable
167+
:return: A tuple containing the sampled features (Xt) and the corresponding target values (yt).
168+
:rtype: Tuple[Union[pandas.DataFrame, numpy.ndarray], Union[pandas.Series, numpy.ndarray]]
121169
"""
122170
sampling_probabilities_ = sampling_proba_func(y)
123171
sampling_mask = self.random_generator.choice(X.shape[0], p=sampling_probabilities_, size=X.shape[0],

0 commit comments

Comments
 (0)