@@ -84,7 +84,7 @@ def serialize(cls, obj):
84
84
orig_close = bstream .close
85
85
bstream .close = lambda : None
86
86
try :
87
- obj .astype ( dtype = object ). to_frame (name = "Target" ).to_parquet (
87
+ obj .to_frame (name = "Target" ).to_parquet (
88
88
bstream , compression = "Brotli" , index = False
89
89
)
90
90
finally :
@@ -1277,6 +1277,7 @@ def retrieve_openml(
1277
1277
suite = openml .study .get_suite (suite_id )
1278
1278
tasks = suite .tasks .copy ()
1279
1279
random .Random (1337 ).shuffle (tasks )
1280
+ cat_type = pd .CategoricalDtype (ordered = False )
1280
1281
for task_id in tqdm (tasks , desc = source ):
1281
1282
task = openml .tasks .get_task (
1282
1283
task_id ,
@@ -1303,48 +1304,23 @@ def retrieve_openml(
1303
1304
)
1304
1305
1305
1306
if task .task_type_id == openml .tasks .TaskType .SUPERVISED_CLASSIFICATION :
1306
- problem = (
1307
- "binary"
1308
- if dataset .qualities ["NumberOfClasses" ] == 2
1309
- else "multiclass"
1310
- )
1307
+ classes , y = np .unique (y .values , return_inverse = True )
1308
+ problem = "binary" if len (classes ) == 2 else "multiclass"
1311
1309
1312
1310
# for benchmarking we do not care about the original target strings
1313
- y = pd .Series (np . unique ( y , return_inverse = True )[ 1 ] )
1311
+ y = pd .Series (y , dtype = np . int16 )
1314
1312
elif task .task_type_id == openml .tasks .TaskType .SUPERVISED_REGRESSION :
1315
1313
problem = "regression"
1314
+ y = pd .Series (y , dtype = np .float64 )
1316
1315
else :
1317
1316
raise Exception (f"Unrecognized task_type_id { task .task_type_id } ." )
1318
1317
1319
1318
for col_name , cat in zip (X .columns , categorical_mask ):
1320
1319
col = X [col_name ]
1321
-
1322
- if pd .api .types .is_sparse (col ):
1323
- col = col .sparse .to_dense ()
1324
- X [col_name ] = col
1325
-
1326
- if col .dtype .name == "category" :
1327
- if not cat :
1328
- raise Exception (
1329
- f"Categorical type mismatch. Was CategoricalDtype but indicated non-categorical."
1330
- )
1331
- if col .cat .ordered :
1332
- # OpenMl incorrectly is indicating these as ordered
1333
- X [col_name ] = col .cat .as_unordered ()
1334
- elif col .dtype .name == "object" :
1335
- if cat :
1336
- X [col_name ] = col .astype (pd .CategoricalDtype (ordered = False ))
1337
- else :
1338
- X [col_name ] = col .astype (float )
1339
- elif np .issubdtype (col .dtype , np .floating ) or np .issubdtype (
1340
- col .dtype , np .integer
1341
- ):
1342
- if cat :
1343
- raise Exception (
1344
- f"Categorical type mismatch. Was continuous but indicated categorical."
1345
- )
1320
+ if cat :
1321
+ X [col_name ] = pd .Series (col , dtype = cat_type , name = col .name )
1346
1322
else :
1347
- raise Exception ( f"Unrecognized data type { col . dtype . name } ." )
1323
+ X [ col_name ] = pd . Series ( col , dtype = np . float64 , name = col . name )
1348
1324
1349
1325
meta = {
1350
1326
"name" : name ,
@@ -1470,6 +1446,7 @@ def retrieve_catboost_50k(
1470
1446
if cache_dir is not None :
1471
1447
cache_dir = pathlib .Path (cache_dir , "catboost_50k" )
1472
1448
1449
+ cat_type = pd .CategoricalDtype (ordered = False )
1473
1450
for dataset in tqdm (datasets , desc = "catboost_50k" ):
1474
1451
name = dataset ["name" ]
1475
1452
X_name = f"{ name } .X.parquet"
@@ -1482,14 +1459,34 @@ def retrieve_catboost_50k(
1482
1459
target = dataset ["target" ]
1483
1460
X = df .drop (target , axis = 1 )
1484
1461
y = df [target ]
1485
- problem = dataset ["problem" ]
1486
- if dataset ["problem" ] == "classification" :
1487
- problem = "binary" if len (y .unique ()) == 2 else "multiclass"
1462
+ problem_type = dataset ["problem" ]
1463
+
1464
+ if problem_type == "classification" :
1465
+ classes , y = np .unique (y .values , return_inverse = True )
1466
+ problem = "binary" if len (classes ) == 2 else "multiclass"
1467
+
1468
+ # for benchmarking we do not care about the original target strings
1469
+ y = pd .Series (y , dtype = np .int16 )
1470
+ elif problem_type == "regression" :
1471
+ problem = "regression"
1472
+ y = pd .Series (y , dtype = np .float64 )
1473
+ else :
1474
+ raise Exception (f"Unrecognized problem { problem_type } ." )
1475
+
1476
+ categorical_mask = [dt .kind == "O" for dt in X .dtypes ]
1477
+
1478
+ for col_name , cat in zip (X .columns , categorical_mask ):
1479
+ col = X [col_name ]
1480
+ if cat :
1481
+ X [col_name ] = pd .Series (col , dtype = cat_type , name = col .name )
1482
+ else :
1483
+ X [col_name ] = pd .Series (col , dtype = np .float64 , name = col .name )
1484
+
1488
1485
meta = {
1489
1486
"name" : name ,
1490
1487
"problem" : problem ,
1491
1488
"source" : "catboost_50k" ,
1492
- "categorical_mask" : [ dt . kind == "O" for dt in X . dtypes ] ,
1489
+ "categorical_mask" : categorical_mask ,
1493
1490
"feature_names" : list (X .columns ),
1494
1491
}
1495
1492
supervised = SupervisedDataset (X , y , meta )
@@ -1521,6 +1518,7 @@ def retrieve_pmlb(cache_dir: str = None) -> Generator[SupervisedDataset, None, N
1521
1518
)
1522
1519
dataset_names .extend ([("regression" , name ) for name in regression_dataset_names ])
1523
1520
1521
+ cat_type = pd .CategoricalDtype (ordered = False )
1524
1522
for problem_type , dataset_name in tqdm (dataset_names , desc = "pmlb" ):
1525
1523
name = dataset_name
1526
1524
X_name = f"{ name } .X.parquet"
@@ -1532,14 +1530,32 @@ def retrieve_pmlb(cache_dir: str = None) -> Generator[SupervisedDataset, None, N
1532
1530
df = fetch_data (dataset_name )
1533
1531
X = df .drop ("target" , axis = 1 )
1534
1532
y = df ["target" ]
1535
- problem = problem_type
1536
1533
if problem_type == "classification" :
1537
- problem = "binary" if len (y .unique ()) == 2 else "multiclass"
1534
+ classes , y = np .unique (y .values , return_inverse = True )
1535
+ problem = "binary" if len (classes ) == 2 else "multiclass"
1536
+
1537
+ # for benchmarking we do not care about the original target strings
1538
+ y = pd .Series (y , dtype = np .int16 )
1539
+ elif problem_type == "regression" :
1540
+ problem = "regression"
1541
+ y = pd .Series (y , dtype = np .float64 )
1542
+ else :
1543
+ raise Exception (f"Unrecognized problem_type { problem_type } ." )
1544
+
1545
+ categorical_mask = [dt .kind == "O" for dt in X .dtypes ]
1546
+
1547
+ for col_name , cat in zip (X .columns , categorical_mask ):
1548
+ col = X [col_name ]
1549
+ if cat :
1550
+ X [col_name ] = pd .Series (col , dtype = cat_type , name = col .name )
1551
+ else :
1552
+ X [col_name ] = pd .Series (col , dtype = np .float64 , name = col .name )
1553
+
1538
1554
meta = {
1539
1555
"name" : name ,
1540
1556
"problem" : problem ,
1541
1557
"source" : "pmlb" ,
1542
- "categorical_mask" : [ dt . kind == "O" for dt in X . dtypes ] ,
1558
+ "categorical_mask" : categorical_mask ,
1543
1559
"feature_names" : list (X .columns ),
1544
1560
}
1545
1561
supervised = SupervisedDataset (X , y , meta )
0 commit comments