@@ -84,7 +84,7 @@ def serialize(cls, obj):
84
84
orig_close = bstream .close
85
85
bstream .close = lambda : None
86
86
try :
87
- obj .astype ( dtype = object ). to_frame (name = "Target" ).to_parquet (
87
+ obj .to_frame (name = "Target" ).to_parquet (
88
88
bstream , compression = "Brotli" , index = False
89
89
)
90
90
finally :
@@ -1287,6 +1287,7 @@ def retrieve_openml(
1287
1287
suite = openml .study .get_suite (suite_id )
1288
1288
tasks = suite .tasks .copy ()
1289
1289
random .Random (1337 ).shuffle (tasks )
1290
+ cat_type = pd .CategoricalDtype (ordered = False )
1290
1291
for task_id in tqdm (tasks , desc = source ):
1291
1292
task = openml .tasks .get_task (
1292
1293
task_id ,
@@ -1313,48 +1314,23 @@ def retrieve_openml(
1313
1314
)
1314
1315
1315
1316
if task .task_type_id == openml .tasks .TaskType .SUPERVISED_CLASSIFICATION :
1316
- problem = (
1317
- "binary"
1318
- if dataset .qualities ["NumberOfClasses" ] == 2
1319
- else "multiclass"
1320
- )
1317
+ classes , y = np .unique (y .values , return_inverse = True )
1318
+ problem = "binary" if len (classes ) == 2 else "multiclass"
1321
1319
1322
1320
# for benchmarking we do not care about the original target strings
1323
- y = pd .Series (np . unique ( y , return_inverse = True )[ 1 ] )
1321
+ y = pd .Series (y , dtype = np . int16 )
1324
1322
elif task .task_type_id == openml .tasks .TaskType .SUPERVISED_REGRESSION :
1325
1323
problem = "regression"
1324
+ y = pd .Series (y , dtype = np .float64 )
1326
1325
else :
1327
1326
raise Exception (f"Unrecognized task_type_id { task .task_type_id } ." )
1328
1327
1329
1328
for col_name , cat in zip (X .columns , categorical_mask ):
1330
1329
col = X [col_name ]
1331
-
1332
- if pd .api .types .is_sparse (col ):
1333
- col = col .sparse .to_dense ()
1334
- X [col_name ] = col
1335
-
1336
- if col .dtype .name == "category" :
1337
- if not cat :
1338
- raise Exception (
1339
- f"Categorical type mismatch. Was CategoricalDtype but indicated non-categorical."
1340
- )
1341
- if col .cat .ordered :
1342
- # OpenMl incorrectly is indicating these as ordered
1343
- X [col_name ] = col .cat .as_unordered ()
1344
- elif col .dtype .name == "object" :
1345
- if cat :
1346
- X [col_name ] = col .astype (pd .CategoricalDtype (ordered = False ))
1347
- else :
1348
- X [col_name ] = col .astype (float )
1349
- elif np .issubdtype (col .dtype , np .floating ) or np .issubdtype (
1350
- col .dtype , np .integer
1351
- ):
1352
- if cat :
1353
- raise Exception (
1354
- f"Categorical type mismatch. Was continuous but indicated categorical."
1355
- )
1330
+ if cat :
1331
+ X [col_name ] = pd .Series (col , dtype = cat_type , name = col .name )
1356
1332
else :
1357
- raise Exception ( f"Unrecognized data type { col . dtype . name } ." )
1333
+ X [ col_name ] = pd . Series ( col , dtype = np . float64 , name = col . name )
1358
1334
1359
1335
meta = {
1360
1336
"name" : name ,
@@ -1480,6 +1456,7 @@ def retrieve_catboost_50k(
1480
1456
if cache_dir is not None :
1481
1457
cache_dir = pathlib .Path (cache_dir , "catboost_50k" )
1482
1458
1459
+ cat_type = pd .CategoricalDtype (ordered = False )
1483
1460
for dataset in tqdm (datasets , desc = "catboost_50k" ):
1484
1461
name = dataset ["name" ]
1485
1462
X_name = f"{ name } .X.parquet"
@@ -1492,14 +1469,34 @@ def retrieve_catboost_50k(
1492
1469
target = dataset ["target" ]
1493
1470
X = df .drop (target , axis = 1 )
1494
1471
y = df [target ]
1495
- problem = dataset ["problem" ]
1496
- if dataset ["problem" ] == "classification" :
1497
- problem = "binary" if len (y .unique ()) == 2 else "multiclass"
1472
+ problem_type = dataset ["problem" ]
1473
+
1474
+ if problem_type == "classification" :
1475
+ classes , y = np .unique (y .values , return_inverse = True )
1476
+ problem = "binary" if len (classes ) == 2 else "multiclass"
1477
+
1478
+ # for benchmarking we do not care about the original target strings
1479
+ y = pd .Series (y , dtype = np .int16 )
1480
+ elif problem_type == "regression" :
1481
+ problem = "regression"
1482
+ y = pd .Series (y , dtype = np .float64 )
1483
+ else :
1484
+ raise Exception (f"Unrecognized problem { problem_type } ." )
1485
+
1486
+ categorical_mask = [dt .kind == "O" for dt in X .dtypes ]
1487
+
1488
+ for col_name , cat in zip (X .columns , categorical_mask ):
1489
+ col = X [col_name ]
1490
+ if cat :
1491
+ X [col_name ] = pd .Series (col , dtype = cat_type , name = col .name )
1492
+ else :
1493
+ X [col_name ] = pd .Series (col , dtype = np .float64 , name = col .name )
1494
+
1498
1495
meta = {
1499
1496
"name" : name ,
1500
1497
"problem" : problem ,
1501
1498
"source" : "catboost_50k" ,
1502
- "categorical_mask" : [ dt . kind == "O" for dt in X . dtypes ] ,
1499
+ "categorical_mask" : categorical_mask ,
1503
1500
"feature_names" : list (X .columns ),
1504
1501
}
1505
1502
supervised = SupervisedDataset (X , y , meta )
@@ -1531,6 +1528,7 @@ def retrieve_pmlb(cache_dir: str = None) -> Generator[SupervisedDataset, None, N
1531
1528
)
1532
1529
dataset_names .extend ([("regression" , name ) for name in regression_dataset_names ])
1533
1530
1531
+ cat_type = pd .CategoricalDtype (ordered = False )
1534
1532
for problem_type , dataset_name in tqdm (dataset_names , desc = "pmlb" ):
1535
1533
name = dataset_name
1536
1534
X_name = f"{ name } .X.parquet"
@@ -1542,14 +1540,32 @@ def retrieve_pmlb(cache_dir: str = None) -> Generator[SupervisedDataset, None, N
1542
1540
df = fetch_data (dataset_name )
1543
1541
X = df .drop ("target" , axis = 1 )
1544
1542
y = df ["target" ]
1545
- problem = problem_type
1546
1543
if problem_type == "classification" :
1547
- problem = "binary" if len (y .unique ()) == 2 else "multiclass"
1544
+ classes , y = np .unique (y .values , return_inverse = True )
1545
+ problem = "binary" if len (classes ) == 2 else "multiclass"
1546
+
1547
+ # for benchmarking we do not care about the original target strings
1548
+ y = pd .Series (y , dtype = np .int16 )
1549
+ elif problem_type == "regression" :
1550
+ problem = "regression"
1551
+ y = pd .Series (y , dtype = np .float64 )
1552
+ else :
1553
+ raise Exception (f"Unrecognized problem_type { problem_type } ." )
1554
+
1555
+ categorical_mask = [dt .kind == "O" for dt in X .dtypes ]
1556
+
1557
+ for col_name , cat in zip (X .columns , categorical_mask ):
1558
+ col = X [col_name ]
1559
+ if cat :
1560
+ X [col_name ] = pd .Series (col , dtype = cat_type , name = col .name )
1561
+ else :
1562
+ X [col_name ] = pd .Series (col , dtype = np .float64 , name = col .name )
1563
+
1548
1564
meta = {
1549
1565
"name" : name ,
1550
1566
"problem" : problem ,
1551
1567
"source" : "pmlb" ,
1552
- "categorical_mask" : [ dt . kind == "O" for dt in X . dtypes ] ,
1568
+ "categorical_mask" : categorical_mask ,
1553
1569
"feature_names" : list (X .columns ),
1554
1570
}
1555
1571
supervised = SupervisedDataset (X , y , meta )
0 commit comments