@@ -84,7 +84,7 @@ def serialize(cls, obj):
8484 orig_close = bstream .close
8585 bstream .close = lambda : None
8686 try :
87- obj .astype ( dtype = object ). to_frame (name = "Target" ).to_parquet (
87+ obj .to_frame (name = "Target" ).to_parquet (
8888 bstream , compression = "Brotli" , index = False
8989 )
9090 finally :
@@ -1277,6 +1277,7 @@ def retrieve_openml(
12771277 suite = openml .study .get_suite (suite_id )
12781278 tasks = suite .tasks .copy ()
12791279 random .Random (1337 ).shuffle (tasks )
1280+ cat_type = pd .CategoricalDtype (ordered = False )
12801281 for task_id in tqdm (tasks , desc = source ):
12811282 task = openml .tasks .get_task (
12821283 task_id ,
@@ -1303,48 +1304,23 @@ def retrieve_openml(
13031304 )
13041305
13051306 if task .task_type_id == openml .tasks .TaskType .SUPERVISED_CLASSIFICATION :
1306- problem = (
1307- "binary"
1308- if dataset .qualities ["NumberOfClasses" ] == 2
1309- else "multiclass"
1310- )
1307+ classes , y = np .unique (y .values , return_inverse = True )
1308+ problem = "binary" if len (classes ) == 2 else "multiclass"
13111309
13121310 # for benchmarking we do not care about the original target strings
1313- y = pd .Series (np . unique ( y , return_inverse = True )[ 1 ] )
1311+ y = pd .Series (y , dtype = np . int16 )
13141312 elif task .task_type_id == openml .tasks .TaskType .SUPERVISED_REGRESSION :
13151313 problem = "regression"
1314+ y = pd .Series (y , dtype = np .float64 )
13161315 else :
13171316 raise Exception (f"Unrecognized task_type_id { task .task_type_id } ." )
13181317
13191318 for col_name , cat in zip (X .columns , categorical_mask ):
13201319 col = X [col_name ]
1321-
1322- if pd .api .types .is_sparse (col ):
1323- col = col .sparse .to_dense ()
1324- X [col_name ] = col
1325-
1326- if col .dtype .name == "category" :
1327- if not cat :
1328- raise Exception (
1329- f"Categorical type mismatch. Was CategoricalDtype but indicated non-categorical."
1330- )
1331- if col .cat .ordered :
1332- # OpenMl incorrectly is indicating these as ordered
1333- X [col_name ] = col .cat .as_unordered ()
1334- elif col .dtype .name == "object" :
1335- if cat :
1336- X [col_name ] = col .astype (pd .CategoricalDtype (ordered = False ))
1337- else :
1338- X [col_name ] = col .astype (float )
1339- elif np .issubdtype (col .dtype , np .floating ) or np .issubdtype (
1340- col .dtype , np .integer
1341- ):
1342- if cat :
1343- raise Exception (
1344- f"Categorical type mismatch. Was continuous but indicated categorical."
1345- )
1320+ if cat :
1321+ X [col_name ] = pd .Series (col , dtype = cat_type , name = col .name )
13461322 else :
1347- raise Exception ( f"Unrecognized data type { col . dtype . name } ." )
1323+ X [ col_name ] = pd . Series ( col , dtype = np . float64 , name = col . name )
13481324
13491325 meta = {
13501326 "name" : name ,
@@ -1470,6 +1446,7 @@ def retrieve_catboost_50k(
14701446 if cache_dir is not None :
14711447 cache_dir = pathlib .Path (cache_dir , "catboost_50k" )
14721448
1449+ cat_type = pd .CategoricalDtype (ordered = False )
14731450 for dataset in tqdm (datasets , desc = "catboost_50k" ):
14741451 name = dataset ["name" ]
14751452 X_name = f"{ name } .X.parquet"
@@ -1482,14 +1459,34 @@ def retrieve_catboost_50k(
14821459 target = dataset ["target" ]
14831460 X = df .drop (target , axis = 1 )
14841461 y = df [target ]
1485- problem = dataset ["problem" ]
1486- if dataset ["problem" ] == "classification" :
1487- problem = "binary" if len (y .unique ()) == 2 else "multiclass"
1462+ problem_type = dataset ["problem" ]
1463+
1464+ if problem_type == "classification" :
1465+ classes , y = np .unique (y .values , return_inverse = True )
1466+ problem = "binary" if len (classes ) == 2 else "multiclass"
1467+
1468+ # for benchmarking we do not care about the original target strings
1469+ y = pd .Series (y , dtype = np .int16 )
1470+ elif problem_type == "regression" :
1471+ problem = "regression"
1472+ y = pd .Series (y , dtype = np .float64 )
1473+ else :
1474+ raise Exception (f"Unrecognized problem { problem_type } ." )
1475+
1476+ categorical_mask = [dt .kind == "O" for dt in X .dtypes ]
1477+
1478+ for col_name , cat in zip (X .columns , categorical_mask ):
1479+ col = X [col_name ]
1480+ if cat :
1481+ X [col_name ] = pd .Series (col , dtype = cat_type , name = col .name )
1482+ else :
1483+ X [col_name ] = pd .Series (col , dtype = np .float64 , name = col .name )
1484+
14881485 meta = {
14891486 "name" : name ,
14901487 "problem" : problem ,
14911488 "source" : "catboost_50k" ,
1492- "categorical_mask" : [ dt . kind == "O" for dt in X . dtypes ] ,
1489+ "categorical_mask" : categorical_mask ,
14931490 "feature_names" : list (X .columns ),
14941491 }
14951492 supervised = SupervisedDataset (X , y , meta )
@@ -1521,6 +1518,7 @@ def retrieve_pmlb(cache_dir: str = None) -> Generator[SupervisedDataset, None, N
15211518 )
15221519 dataset_names .extend ([("regression" , name ) for name in regression_dataset_names ])
15231520
1521+ cat_type = pd .CategoricalDtype (ordered = False )
15241522 for problem_type , dataset_name in tqdm (dataset_names , desc = "pmlb" ):
15251523 name = dataset_name
15261524 X_name = f"{ name } .X.parquet"
@@ -1532,14 +1530,32 @@ def retrieve_pmlb(cache_dir: str = None) -> Generator[SupervisedDataset, None, N
15321530 df = fetch_data (dataset_name )
15331531 X = df .drop ("target" , axis = 1 )
15341532 y = df ["target" ]
1535- problem = problem_type
15361533 if problem_type == "classification" :
1537- problem = "binary" if len (y .unique ()) == 2 else "multiclass"
1534+ classes , y = np .unique (y .values , return_inverse = True )
1535+ problem = "binary" if len (classes ) == 2 else "multiclass"
1536+
1537+ # for benchmarking we do not care about the original target strings
1538+ y = pd .Series (y , dtype = np .int16 )
1539+ elif problem_type == "regression" :
1540+ problem = "regression"
1541+ y = pd .Series (y , dtype = np .float64 )
1542+ else :
1543+ raise Exception (f"Unrecognized problem_type { problem_type } ." )
1544+
1545+ categorical_mask = [dt .kind == "O" for dt in X .dtypes ]
1546+
1547+ for col_name , cat in zip (X .columns , categorical_mask ):
1548+ col = X [col_name ]
1549+ if cat :
1550+ X [col_name ] = pd .Series (col , dtype = cat_type , name = col .name )
1551+ else :
1552+ X [col_name ] = pd .Series (col , dtype = np .float64 , name = col .name )
1553+
15381554 meta = {
15391555 "name" : name ,
15401556 "problem" : problem ,
15411557 "source" : "pmlb" ,
1542- "categorical_mask" : [ dt . kind == "O" for dt in X . dtypes ] ,
1558+ "categorical_mask" : categorical_mask ,
15431559 "feature_names" : list (X .columns ),
15441560 }
15451561 supervised = SupervisedDataset (X , y , meta )
0 commit comments