88class Insects (base .RemoteDataset ):
99 """Insects dataset.
1010
11- This dataset has different variants, which are :
11+ This dataset has different variants for concept drift evaluation :
1212
1313 - abrupt_balanced
1414 - abrupt_imbalanced
1515 - gradual_balanced
1616 - gradual_imbalanced
17- - incremental-abrupt_balanced
18- - incremental-abrupt_imbalanced
19- - incremental-reoccurring_balanced
20- - incremental-reoccurring_imbalanced
17+ - incremental_abrupt_balanced
18+ - incremental_reoccurring_balanced
2119 - incremental_balanced
22- - incremental_imbalanced
23- - out-of-control
20+
2421
The number of samples and the difficulty change from one variant to another. All currently
enabled variants have 6 classes.
2724
2825 Parameters
2926 ----------
3027 variant
31- Indicates which variant of the dataset to load.
28+ Indicates which variant of the dataset to load. Defaults to "abrupt_balanced".
3229
3330 References
3431 ----------
@@ -37,43 +34,97 @@ class Insects(base.RemoteDataset):
3734
3835 """
3936
# Per-variant download metadata. Each entry provides:
#   n_samples - number of rows in the file
#   size      - value forwarded as ``size`` to the base dataset constructor
#   url       - direct-download link for the CSV file
#   filename  - name forwarded as ``filename`` to the base dataset constructor
variant_configs = {
    "abrupt_balanced": {
        "n_samples": 52_848,
        "size": 14_151_769,
        "url": "https://drive.google.com/uc?export=download&id=1WQoIuuVgiuXfzv4kvao6XuLQG37V923O&confirm=t",
        "filename": "abrupt_balanced.csv",
    },
    "abrupt_imbalanced": {
        "n_samples": 355_275,
        "size": 94_893_622,
        "url": "https://drive.google.com/uc?export=download&id=1Z9W2_mwawobXeTihZDTDnFOCtfdgmgDa&confirm=t",
        "filename": "abrupt_imbalanced.csv",
    },
    "gradual_balanced": {
        "n_samples": 24_150,
        "size": 6_474_831,
        "url": "https://drive.google.com/uc?export=download&id=1fepYkDxwMbuoRUaG_fsymSzkuapS4vJp&confirm=t",
        "filename": "gradual_balanced.csv",
    },
    "gradual_imbalanced": {
        "n_samples": 143_323,
        "size": 38_339_554,
        "url": "https://drive.google.com/uc?export=download&id=1_WJ7lxK2sx1i6kq9Nqv4KrjT0ZXB0QbC&confirm=t",
        "filename": "gradual_imbalanced.csv",
    },
    "incremental_abrupt_balanced": {
        "n_samples": 79_986,
        "size": 21_421_452,
        "url": "https://drive.google.com/uc?export=download&id=1-J5WIBN8_F_tomdcrOaiLCxk9nzxtFsf&confirm=t",
        "filename": "incremental_abrupt_balanced.csv",
    },
    "incremental_reoccurring_balanced": {
        "n_samples": 79_986,
        "size": 21_433_047,
        "url": "https://drive.google.com/uc?export=download&id=1mSKTSsxzYMjdV005AJqrcMGajuu7dUfW&confirm=t",
        "filename": "incremental_reoccurring_balanced.csv",
    },
    "incremental_balanced": {
        "n_samples": 57_018,
        "size": 15_258_997,
        "url": "https://drive.google.com/uc?export=download&id=1tKQ2KL4m-ACHCVKUDLFPrM4cyhioiOpu&confirm=t",
        "filename": "incremental_balanced.csv",
    },
    # NOTE(review): the three imbalanced incremental variants below are disabled
    # without a stated reason. Their "size" values are the ones from the former
    # ARFF table, so they presumably still need to be re-measured for the CSV
    # files before being re-enabled -- confirm.
    # "incremental_abrupt_imbalanced": {
    #     "n_samples": 452_044,
    #     "size": 140_004_225,
    #     "url": "https://drive.google.com/uc?export=download&id=1M6QfsernUlM0qvqXdbo9bPYAsHCcfuhb&confirm=t",
    #     "filename": "incremental_abrupt_imbalanced.csv",
    # },
    # "incremental_reoccurring_imbalanced": {
    #     "n_samples": 452_044,
    #     "size": 140_004_230,
    #     "url": "https://drive.google.com/uc?export=download&id=1aSqdxvZvug-SwQw5NLY_9nTPhhEjB2ig&confirm=t",
    #     "filename": "incremental_reoccurring_imbalanced.csv",
    # },
    # "incremental_imbalanced": {
    #     "n_samples": 452_044,
    #     "size": 140_004_218,
    #     "url": "https://drive.google.com/uc?export=download&id=1K3vp0EjA4FPDeSgffiBe4CMosbB4CKbL&confirm=t",
    #     "filename": "incremental_imbalanced.csv",
    # },
}

# Names of the enabled variants. Kept as a real list (as it was before the
# switch to ``variant_configs``) so callers can index, slice and copy it; a
# bare ``dict.keys()`` view supports none of these.
variants = list(variant_configs)
55100
def __init__(self, variant="abrupt_balanced"):
    """Configure the dataset for one of the available variants.

    Parameters
    ----------
    variant
        Key of ``variant_configs`` selecting which file to load.

    Raises
    ------
    ValueError
        If ``variant`` is not a key of ``variant_configs``. The message lists
        every valid choice.

    """
    try:
        config = self.variant_configs[variant]
    except KeyError:
        choices = "\n".join(f"- {v}" for v in self.variant_configs)
        # ``from None``: the KeyError adds nothing to the message for the caller.
        raise ValueError(f"Unknown variant, possible choices are:\n{choices}") from None

    super().__init__(
        # Every configured variant has 6 classes. A config may override this
        # with an "n_classes" entry; the previous special case for
        # "out-of-control" could never trigger because that key is not in
        # ``variant_configs``.
        n_classes=config.get("n_classes", 6),
        n_samples=config["n_samples"],
        n_features=33,
        task=base.MULTI_CLF,
        url=config["url"],
        size=config["size"],
        unpack=False,
        filename=config["filename"],
    )
    self.variant = variant
74124
def _iter(self):
    """Stream the variant's CSV file through ``stream.iter_csv``.

    Column names are supplied explicitly: ``f1`` to ``f33`` for the features,
    followed by ``class``, which is used as the target.
    """
    # NOTE(review): names are passed via ``fieldnames``, so the file presumably
    # has no header row -- confirm against the hosted CSVs.
    feature_names = [f"f{i}" for i in range(1, 34)]
    return stream.iter_csv(self.path, target="class", fieldnames=[*feature_names, "class"])
77128
78129 @property
79130 def _repr_content (self ):
0 commit comments