@@ -84,7 +84,7 @@ def serialize(cls, obj):
8484 orig_close = bstream .close
8585 bstream .close = lambda : None
8686 try :
87- obj .astype ( dtype = object ). to_frame (name = "Target" ).to_parquet (
87+ obj .to_frame (name = "Target" ).to_parquet (
8888 bstream , compression = "Brotli" , index = False
8989 )
9090 finally :
@@ -1287,6 +1287,7 @@ def retrieve_openml(
12871287 suite = openml .study .get_suite (suite_id )
12881288 tasks = suite .tasks .copy ()
12891289 random .Random (1337 ).shuffle (tasks )
1290+ cat_type = pd .CategoricalDtype (ordered = False )
12901291 for task_id in tqdm (tasks , desc = source ):
12911292 task = openml .tasks .get_task (
12921293 task_id ,
@@ -1313,48 +1314,23 @@ def retrieve_openml(
13131314 )
13141315
13151316 if task .task_type_id == openml .tasks .TaskType .SUPERVISED_CLASSIFICATION :
1316- problem = (
1317- "binary"
1318- if dataset .qualities ["NumberOfClasses" ] == 2
1319- else "multiclass"
1320- )
1317+ classes , y = np .unique (y .values , return_inverse = True )
1318+ problem = "binary" if len (classes ) == 2 else "multiclass"
13211319
13221320 # for benchmarking we do not care about the original target strings
1323- y = pd .Series (np . unique ( y , return_inverse = True )[ 1 ] )
1321+ y = pd .Series (y , dtype = np . int16 )
13241322 elif task .task_type_id == openml .tasks .TaskType .SUPERVISED_REGRESSION :
13251323 problem = "regression"
1324+ y = pd .Series (y , dtype = np .float64 )
13261325 else :
13271326 raise Exception (f"Unrecognized task_type_id { task .task_type_id } ." )
13281327
13291328 for col_name , cat in zip (X .columns , categorical_mask ):
13301329 col = X [col_name ]
1331-
1332- if pd .api .types .is_sparse (col ):
1333- col = col .sparse .to_dense ()
1334- X [col_name ] = col
1335-
1336- if col .dtype .name == "category" :
1337- if not cat :
1338- raise Exception (
1339- f"Categorical type mismatch. Was CategoricalDtype but indicated non-categorical."
1340- )
1341- if col .cat .ordered :
1342- # OpenMl incorrectly is indicating these as ordered
1343- X [col_name ] = col .cat .as_unordered ()
1344- elif col .dtype .name == "object" :
1345- if cat :
1346- X [col_name ] = col .astype (pd .CategoricalDtype (ordered = False ))
1347- else :
1348- X [col_name ] = col .astype (float )
1349- elif np .issubdtype (col .dtype , np .floating ) or np .issubdtype (
1350- col .dtype , np .integer
1351- ):
1352- if cat :
1353- raise Exception (
1354- f"Categorical type mismatch. Was continuous but indicated categorical."
1355- )
1330+ if cat :
1331+ X [col_name ] = pd .Series (col , dtype = cat_type , name = col .name )
13561332 else :
1357- raise Exception ( f"Unrecognized data type { col . dtype . name } ." )
1333+ X [ col_name ] = pd . Series ( col , dtype = np . float64 , name = col . name )
13581334
13591335 meta = {
13601336 "name" : name ,
@@ -1480,6 +1456,7 @@ def retrieve_catboost_50k(
14801456 if cache_dir is not None :
14811457 cache_dir = pathlib .Path (cache_dir , "catboost_50k" )
14821458
1459+ cat_type = pd .CategoricalDtype (ordered = False )
14831460 for dataset in tqdm (datasets , desc = "catboost_50k" ):
14841461 name = dataset ["name" ]
14851462 X_name = f"{ name } .X.parquet"
@@ -1492,14 +1469,34 @@ def retrieve_catboost_50k(
14921469 target = dataset ["target" ]
14931470 X = df .drop (target , axis = 1 )
14941471 y = df [target ]
1495- problem = dataset ["problem" ]
1496- if dataset ["problem" ] == "classification" :
1497- problem = "binary" if len (y .unique ()) == 2 else "multiclass"
1472+ problem_type = dataset ["problem" ]
1473+
1474+ if problem_type == "classification" :
1475+ classes , y = np .unique (y .values , return_inverse = True )
1476+ problem = "binary" if len (classes ) == 2 else "multiclass"
1477+
1478+ # for benchmarking we do not care about the original target strings
1479+ y = pd .Series (y , dtype = np .int16 )
1480+ elif problem_type == "regression" :
1481+ problem = "regression"
1482+ y = pd .Series (y , dtype = np .float64 )
1483+ else :
1484+ raise Exception (f"Unrecognized problem { problem_type } ." )
1485+
1486+ categorical_mask = [dt .kind == "O" for dt in X .dtypes ]
1487+
1488+ for col_name , cat in zip (X .columns , categorical_mask ):
1489+ col = X [col_name ]
1490+ if cat :
1491+ X [col_name ] = pd .Series (col , dtype = cat_type , name = col .name )
1492+ else :
1493+ X [col_name ] = pd .Series (col , dtype = np .float64 , name = col .name )
1494+
14981495 meta = {
14991496 "name" : name ,
15001497 "problem" : problem ,
15011498 "source" : "catboost_50k" ,
1502- "categorical_mask" : [ dt . kind == "O" for dt in X . dtypes ] ,
1499+ "categorical_mask" : categorical_mask ,
15031500 "feature_names" : list (X .columns ),
15041501 }
15051502 supervised = SupervisedDataset (X , y , meta )
@@ -1531,6 +1528,7 @@ def retrieve_pmlb(cache_dir: str = None) -> Generator[SupervisedDataset, None, N
15311528 )
15321529 dataset_names .extend ([("regression" , name ) for name in regression_dataset_names ])
15331530
1531+ cat_type = pd .CategoricalDtype (ordered = False )
15341532 for problem_type , dataset_name in tqdm (dataset_names , desc = "pmlb" ):
15351533 name = dataset_name
15361534 X_name = f"{ name } .X.parquet"
@@ -1542,14 +1540,32 @@ def retrieve_pmlb(cache_dir: str = None) -> Generator[SupervisedDataset, None, N
15421540 df = fetch_data (dataset_name )
15431541 X = df .drop ("target" , axis = 1 )
15441542 y = df ["target" ]
1545- problem = problem_type
15461543 if problem_type == "classification" :
1547- problem = "binary" if len (y .unique ()) == 2 else "multiclass"
1544+ classes , y = np .unique (y .values , return_inverse = True )
1545+ problem = "binary" if len (classes ) == 2 else "multiclass"
1546+
1547+ # for benchmarking we do not care about the original target strings
1548+ y = pd .Series (y , dtype = np .int16 )
1549+ elif problem_type == "regression" :
1550+ problem = "regression"
1551+ y = pd .Series (y , dtype = np .float64 )
1552+ else :
1553+ raise Exception (f"Unrecognized problem_type { problem_type } ." )
1554+
1555+ categorical_mask = [dt .kind == "O" for dt in X .dtypes ]
1556+
1557+ for col_name , cat in zip (X .columns , categorical_mask ):
1558+ col = X [col_name ]
1559+ if cat :
1560+ X [col_name ] = pd .Series (col , dtype = cat_type , name = col .name )
1561+ else :
1562+ X [col_name ] = pd .Series (col , dtype = np .float64 , name = col .name )
1563+
15481564 meta = {
15491565 "name" : name ,
15501566 "problem" : problem ,
15511567 "source" : "pmlb" ,
1552- "categorical_mask" : [ dt . kind == "O" for dt in X . dtypes ] ,
1568+ "categorical_mask" : categorical_mask ,
15531569 "feature_names" : list (X .columns ),
15541570 }
15551571 supervised = SupervisedDataset (X , y , meta )
0 commit comments