Skip to content

Commit d7d5fd2

Browse files
committed
clean up X, y datatypes in powerlift
1 parent dd17bf9 commit d7d5fd2

File tree

3 files changed

+62
-45
lines changed

3 files changed

+62
-45
lines changed

docs/benchmarks/ebm-benchmark.ipynb

+2-4
Original file line numberDiff line numberDiff line change
@@ -294,12 +294,12 @@
294294
" elif trial.method.name == \"knn\":\n",
295295
" est = Pipeline([(\"ct\", ct), (\"est\", KNeighborsClassifier(**knn_params))])\n",
296296
" elif trial.method.name == \"aplr\":\n",
297+
" fit_params[\"y\"] = fit_params[\"y\"].astype(str)\n",
298+
" y_test = y_test.astype(str)\n",
297299
" ct.sparse_threshold = 0 # APLR only handles dense\n",
298300
" if trial.task.name in {\"CIFAR_10\", \"Fashion-MNIST\", \"Devnagari-Script\", \"mnist_784\"}:\n",
299301
" max_samples = 10000 # crashes or fit time too long without subsampling\n",
300302
" est = Pipeline([(\"ct\", ct), (\"est\", APLRClassifier(**aplr_params))])\n",
301-
" fit_params[\"y\"] = fit_params[\"y\"].astype(str).to_numpy()\n",
302-
" y_test = y_test.astype(str).to_numpy()\n",
303303
" else:\n",
304304
" raise Exception(f\"Unrecognized method name {trial.method.name}\")\n",
305305
"\n",
@@ -350,8 +350,6 @@
350350
" if trial.task.name in {\"Airlines_DepDelay_10M\"}:\n",
351351
" max_samples = 100000 # crashes or fit time too long without subsampling\n",
352352
" est = Pipeline([(\"ct\", ct), (\"est\", APLRRegressor(**aplr_params))])\n",
353-
" fit_params[\"y\"] = fit_params[\"y\"].astype(str).to_numpy()\n",
354-
" y_test = y_test.astype(str).to_numpy()\n",
355353
" else:\n",
356354
" raise Exception(f\"Unrecognized method name {trial.method.name}\")\n",
357355
"\n",

python/powerlift/powerlift/bench/experiment.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,15 @@
44
"""
55

66
import pandas as pd
7+
import numpy as np
78
from typing import Dict, Iterable
89
from typing import Type, TypeVar
910
from typing import Union, Optional, List
1011
from dataclasses import dataclass
1112
from numbers import Number
1213
import time
1314

14-
from powerlift.bench.store import Store
15+
from powerlift.bench.store import Store, MIMETYPE_SERIES
1516

1617

1718
@dataclass(frozen=True)
@@ -105,6 +106,8 @@ def data(self, aliases: Iterable[str]) -> List[object]:
105106
name = alias_map[alias]
106107
asset = name_to_asset[name]
107108
parsed = BytesParser.deserialize(asset.mimetype, asset.embedded)
109+
if asset.mimetype == MIMETYPE_SERIES:
110+
parsed = np.array(parsed)
108111
outputs.append(parsed)
109112
return outputs
110113

python/powerlift/powerlift/bench/store.py

+56-40
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ def serialize(cls, obj):
8484
orig_close = bstream.close
8585
bstream.close = lambda: None
8686
try:
87-
obj.astype(dtype=object).to_frame(name="Target").to_parquet(
87+
obj.to_frame(name="Target").to_parquet(
8888
bstream, compression="Brotli", index=False
8989
)
9090
finally:
@@ -1277,6 +1277,7 @@ def retrieve_openml(
12771277
suite = openml.study.get_suite(suite_id)
12781278
tasks = suite.tasks.copy()
12791279
random.Random(1337).shuffle(tasks)
1280+
cat_type = pd.CategoricalDtype(ordered=False)
12801281
for task_id in tqdm(tasks, desc=source):
12811282
task = openml.tasks.get_task(
12821283
task_id,
@@ -1303,48 +1304,23 @@ def retrieve_openml(
13031304
)
13041305

13051306
if task.task_type_id == openml.tasks.TaskType.SUPERVISED_CLASSIFICATION:
1306-
problem = (
1307-
"binary"
1308-
if dataset.qualities["NumberOfClasses"] == 2
1309-
else "multiclass"
1310-
)
1307+
classes, y = np.unique(y.values, return_inverse=True)
1308+
problem = "binary" if len(classes) == 2 else "multiclass"
13111309

13121310
# for benchmarking we do not care about the original target strings
1313-
y = pd.Series(np.unique(y, return_inverse=True)[1])
1311+
y = pd.Series(y, dtype=np.int16)
13141312
elif task.task_type_id == openml.tasks.TaskType.SUPERVISED_REGRESSION:
13151313
problem = "regression"
1314+
y = pd.Series(y, dtype=np.float64)
13161315
else:
13171316
raise Exception(f"Unrecognized task_type_id {task.task_type_id}.")
13181317

13191318
for col_name, cat in zip(X.columns, categorical_mask):
13201319
col = X[col_name]
1321-
1322-
if pd.api.types.is_sparse(col):
1323-
col = col.sparse.to_dense()
1324-
X[col_name] = col
1325-
1326-
if col.dtype.name == "category":
1327-
if not cat:
1328-
raise Exception(
1329-
f"Categorical type mismatch. Was CategoricalDtype but indicated non-categorical."
1330-
)
1331-
if col.cat.ordered:
1332-
# OpenMl incorrectly is indicating these as ordered
1333-
X[col_name] = col.cat.as_unordered()
1334-
elif col.dtype.name == "object":
1335-
if cat:
1336-
X[col_name] = col.astype(pd.CategoricalDtype(ordered=False))
1337-
else:
1338-
X[col_name] = col.astype(float)
1339-
elif np.issubdtype(col.dtype, np.floating) or np.issubdtype(
1340-
col.dtype, np.integer
1341-
):
1342-
if cat:
1343-
raise Exception(
1344-
f"Categorical type mismatch. Was continuous but indicated categorical."
1345-
)
1320+
if cat:
1321+
X[col_name] = pd.Series(col, dtype=cat_type, name=col.name)
13461322
else:
1347-
raise Exception(f"Unrecognized data type {col.dtype.name}.")
1323+
X[col_name] = pd.Series(col, dtype=np.float64, name=col.name)
13481324

13491325
meta = {
13501326
"name": name,
@@ -1470,6 +1446,7 @@ def retrieve_catboost_50k(
14701446
if cache_dir is not None:
14711447
cache_dir = pathlib.Path(cache_dir, "catboost_50k")
14721448

1449+
cat_type = pd.CategoricalDtype(ordered=False)
14731450
for dataset in tqdm(datasets, desc="catboost_50k"):
14741451
name = dataset["name"]
14751452
X_name = f"{name}.X.parquet"
@@ -1482,14 +1459,34 @@ def retrieve_catboost_50k(
14821459
target = dataset["target"]
14831460
X = df.drop(target, axis=1)
14841461
y = df[target]
1485-
problem = dataset["problem"]
1486-
if dataset["problem"] == "classification":
1487-
problem = "binary" if len(y.unique()) == 2 else "multiclass"
1462+
problem_type = dataset["problem"]
1463+
1464+
if problem_type == "classification":
1465+
classes, y = np.unique(y.values, return_inverse=True)
1466+
problem = "binary" if len(classes) == 2 else "multiclass"
1467+
1468+
# for benchmarking we do not care about the original target strings
1469+
y = pd.Series(y, dtype=np.int16)
1470+
elif problem_type == "regression":
1471+
problem = "regression"
1472+
y = pd.Series(y, dtype=np.float64)
1473+
else:
1474+
raise Exception(f"Unrecognized problem {problem_type}.")
1475+
1476+
categorical_mask = [dt.kind == "O" for dt in X.dtypes]
1477+
1478+
for col_name, cat in zip(X.columns, categorical_mask):
1479+
col = X[col_name]
1480+
if cat:
1481+
X[col_name] = pd.Series(col, dtype=cat_type, name=col.name)
1482+
else:
1483+
X[col_name] = pd.Series(col, dtype=np.float64, name=col.name)
1484+
14881485
meta = {
14891486
"name": name,
14901487
"problem": problem,
14911488
"source": "catboost_50k",
1492-
"categorical_mask": [dt.kind == "O" for dt in X.dtypes],
1489+
"categorical_mask": categorical_mask,
14931490
"feature_names": list(X.columns),
14941491
}
14951492
supervised = SupervisedDataset(X, y, meta)
@@ -1521,6 +1518,7 @@ def retrieve_pmlb(cache_dir: str = None) -> Generator[SupervisedDataset, None, N
15211518
)
15221519
dataset_names.extend([("regression", name) for name in regression_dataset_names])
15231520

1521+
cat_type = pd.CategoricalDtype(ordered=False)
15241522
for problem_type, dataset_name in tqdm(dataset_names, desc="pmlb"):
15251523
name = dataset_name
15261524
X_name = f"{name}.X.parquet"
@@ -1532,14 +1530,32 @@ def retrieve_pmlb(cache_dir: str = None) -> Generator[SupervisedDataset, None, N
15321530
df = fetch_data(dataset_name)
15331531
X = df.drop("target", axis=1)
15341532
y = df["target"]
1535-
problem = problem_type
15361533
if problem_type == "classification":
1537-
problem = "binary" if len(y.unique()) == 2 else "multiclass"
1534+
classes, y = np.unique(y.values, return_inverse=True)
1535+
problem = "binary" if len(classes) == 2 else "multiclass"
1536+
1537+
# for benchmarking we do not care about the original target strings
1538+
y = pd.Series(y, dtype=np.int16)
1539+
elif problem_type == "regression":
1540+
problem = "regression"
1541+
y = pd.Series(y, dtype=np.float64)
1542+
else:
1543+
raise Exception(f"Unrecognized problem_type {problem_type}.")
1544+
1545+
categorical_mask = [dt.kind == "O" for dt in X.dtypes]
1546+
1547+
for col_name, cat in zip(X.columns, categorical_mask):
1548+
col = X[col_name]
1549+
if cat:
1550+
X[col_name] = pd.Series(col, dtype=cat_type, name=col.name)
1551+
else:
1552+
X[col_name] = pd.Series(col, dtype=np.float64, name=col.name)
1553+
15381554
meta = {
15391555
"name": name,
15401556
"problem": problem,
15411557
"source": "pmlb",
1542-
"categorical_mask": [dt.kind == "O" for dt in X.dtypes],
1558+
"categorical_mask": categorical_mask,
15431559
"feature_names": list(X.columns),
15441560
}
15451561
supervised = SupervisedDataset(X, y, meta)

0 commit comments

Comments
 (0)