Skip to content

Commit bcf49f6

Browse files
Fix insects dataset (#1731)
* Refactor insects dataset and remove variants: incremental_abrupt_imbalanced incremental_imbalanced incremental_reoccurring_imbalanced * Reformat file * Remove unnecessary printouts * Refactor archive_path * add variants * Reformat * Add release note
1 parent 4880658 commit bcf49f6

File tree

3 files changed

+87
-30
lines changed

3 files changed

+87
-30
lines changed

docs/releases/unreleased.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
11
# Unreleased
22

3+
## datasets
4+
- Fixed the download of the `Insects` dataset. The variants `incremental_abrupt_imbalanced`, `incremental_imbalanced`, `incremental_reoccurring_imbalanced` and `out-of-control` are no longer supported.
5+
- Refactored `benchmarks` and added plotly dependency for interactive plots
6+
37
## stats
48

59
- Added `update_many` method to `stats.PearsonCorr`.
610
- Changed the calculation of the Kuiper statistic in `base.KolmogorovSmirnov` to correspond to the reference implementation. The Kuiper statistic uses the difference between the maximum value and the minimum value.
7-
- Refactored `benchmarks` and added plotly dependency for interactive plots
811

912
## tree
1013

river/datasets/base.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -267,7 +267,10 @@ def download(self, force=False, verbose=True):
267267
# Determine where to download the archive
268268
directory = self.path.parent
269269
directory.mkdir(parents=True, exist_ok=True)
270-
archive_path = directory.joinpath(os.path.basename(self.url))
270+
if self.unpack:
271+
archive_path = directory.joinpath(os.path.basename(self.url))
272+
else:
273+
archive_path = directory.joinpath(os.path.basename(self.filename))
271274

272275
with request.urlopen(self.url) as r:
273276
# Notify the user

river/datasets/insects.py

Lines changed: 79 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -8,27 +8,24 @@
88
class Insects(base.RemoteDataset):
99
"""Insects dataset.
1010
11-
This dataset has different variants, which are:
11+
This dataset has different variants for concept drift evaluation:
1212
1313
- abrupt_balanced
1414
- abrupt_imbalanced
1515
- gradual_balanced
1616
- gradual_imbalanced
17-
- incremental-abrupt_balanced
18-
- incremental-abrupt_imbalanced
19-
- incremental-reoccurring_balanced
20-
- incremental-reoccurring_imbalanced
17+
- incremental_abrupt_balanced
18+
- incremental_reoccurring_balanced
2119
- incremental_balanced
22-
- incremental_imbalanced
23-
- out-of-control
20+
2421
2522
The number of samples and the difficulty change from one variant to another. The number of
2623
classes is always the same (6).
2724
2825
Parameters
2926
----------
3027
variant
31-
Indicates which variant of the dataset to load.
28+
Indicates which variant of the dataset to load. Defaults to "abrupt_balanced".
3229
3330
References
3431
----------
@@ -37,43 +34,97 @@ class Insects(base.RemoteDataset):
3734
3835
"""
3936

40-
variant_sizes = {
41-
"abrupt_balanced": (52_848, 16_419_025),
42-
"abrupt_imbalanced": (355_275, 110_043_637),
43-
"gradual_balanced": (24_150, 7_503_750),
44-
"gradual_imbalanced": (143_323, 44_371_501),
45-
"incremental-abrupt_balanced": (79_986, 24_849_436),
46-
"incremental-abrupt_imbalanced": (452_044, 140_004_225),
47-
"incremental-reoccurring_balanced": (79_986, 24_849_092),
48-
"incremental-reoccurring_imbalanced": (452_044, 140_004_230),
49-
"incremental_balanced": (57_018, 17_713_574),
50-
"incremental_imbalanced": (452_044, 140_004_218),
51-
"out-of-control": (905_145, 277_777_854),
37+
variant_configs = {
38+
"abrupt_balanced": {
39+
"n_samples": 52_848,
40+
"size": 14_151_769,
41+
"url": "https://drive.google.com/uc?export=download&id=1WQoIuuVgiuXfzv4kvao6XuLQG37V923O&confirm=t",
42+
"filename": "abrupt_balanced.csv",
43+
},
44+
"abrupt_imbalanced": {
45+
"n_samples": 355_275,
46+
"size": 94_893_622,
47+
"url": "https://drive.google.com/uc?export=download&id=1Z9W2_mwawobXeTihZDTDnFOCtfdgmgDa&confirm=t",
48+
"filename": "abrupt_imbalanced.csv",
49+
},
50+
"gradual_balanced": {
51+
"n_samples": 24_150,
52+
"size": 6_474_831,
53+
"url": "https://drive.google.com/uc?export=download&id=1fepYkDxwMbuoRUaG_fsymSzkuapS4vJp&confirm=t",
54+
"filename": "gradual_balanced.csv",
55+
},
56+
"gradual_imbalanced": {
57+
"n_samples": 143_323,
58+
"size": 38_339_554,
59+
"url": "https://drive.google.com/uc?export=download&id=1_WJ7lxK2sx1i6kq9Nqv4KrjT0ZXB0QbC&confirm=t",
60+
"filename": "gradual_imbalanced.csv",
61+
},
62+
"incremental_abrupt_balanced": {
63+
"n_samples": 79_986,
64+
"size": 21_421_452,
65+
"url": "https://drive.google.com/uc?export=download&id=1-J5WIBN8_F_tomdcrOaiLCxk9nzxtFsf&confirm=t",
66+
"filename": "incremental_abrupt_balanced.csv",
67+
},
68+
# "incremental_abrupt_imbalanced": {
69+
# "n_samples": 452_044,
70+
# "size": 140_004_225,
71+
# "url": "https://drive.google.com/uc?export=download&id=1M6QfsernUlM0qvqXdbo9bPYAsHCcfuhb&confirm=t",
72+
# "filename": "incremental_abrupt_imbalanced.csv",
73+
# },
74+
"incremental_reoccurring_balanced": {
75+
"n_samples": 79_986,
76+
"size": 21_433_047,
77+
"url": "https://drive.google.com/uc?export=download&id=1mSKTSsxzYMjdV005AJqrcMGajuu7dUfW&confirm=t",
78+
"filename": "incremental_reoccurring_balanced.csv",
79+
},
80+
# "incremental_reoccurring_imbalanced": {
81+
# "n_samples": 452_044,
82+
# "size": 140_004_230,
83+
# "url": "https://drive.google.com/uc?export=download&id=1aSqdxvZvug-SwQw5NLY_9nTPhhEjB2ig&confirm=t",
84+
# "filename": "incremental_reoccurring_imbalanced.csv",
85+
# },
86+
"incremental_balanced": {
87+
"n_samples": 57_018,
88+
"size": 15_258_997,
89+
"url": "https://drive.google.com/uc?export=download&id=1tKQ2KL4m-ACHCVKUDLFPrM4cyhioiOpu&confirm=t",
90+
"filename": "incremental_balanced.csv",
91+
},
92+
# "incremental_imbalanced": {
93+
# "n_samples": 452_044,
94+
# "size": 140_004_218,
95+
# "url": "https://drive.google.com/uc?export=download&id=1K3vp0EjA4FPDeSgffiBe4CMosbB4CKbL&confirm=t",
96+
# "filename": "incremental_imbalanced.csv",
97+
# }
5298
}
53-
54-
variants = list(variant_sizes.keys())
99+
variants = variant_configs.keys()
55100

56101
def __init__(self, variant="abrupt_balanced"):
57-
try:
58-
n_samples, size = self.variant_sizes[variant]
59-
except KeyError:
60-
variants = "\n".join(f"- {v}" for v in self.variant_sizes)
102+
if variant not in self.variant_configs:
103+
variants = "\n".join(f"- {v}" for v in self.variant_configs)
61104
raise ValueError(f"Unknown variant, possible choices are:\n{variants}")
105+
106+
config = self.variant_configs[variant]
107+
n_samples = config["n_samples"]
108+
size = config["size"]
109+
url = config["url"]
110+
filename = config["filename"]
62111
n_classes = 24 if variant == "out-of-control" else 6
63112

64113
super().__init__(
65114
n_classes=n_classes,
66115
n_samples=n_samples,
67116
n_features=33,
68117
task=base.MULTI_CLF,
69-
url=f"http://sites.labic.icmc.usp.br/vsouza/repository/creme/INSECTS-{variant}_norm.arff",
118+
url=url,
70119
size=size,
71120
unpack=False,
121+
filename=filename,
72122
)
73123
self.variant = variant
74124

75125
def _iter(self):
76-
return stream.iter_arff(self.path, target="class")
126+
cols = [f"f{i}" for i in range(1, 34)] + ["class"]
127+
return stream.iter_csv(self.path, target="class", fieldnames=cols)
77128

78129
@property
79130
def _repr_content(self):

0 commit comments

Comments
 (0)