Skip to content

Commit f2f2a4c

Browse files
committed
clean up X, y datatypes in powerlift
1 parent b024b4f commit f2f2a4c

File tree

3 files changed

+62
-45
lines changed

3 files changed

+62
-45
lines changed

docs/benchmarks/ebm-benchmark.ipynb

+2-4
Original file line numberDiff line numberDiff line change
@@ -294,12 +294,12 @@
294294
" elif trial.method.name == \"knn\":\n",
295295
" est = Pipeline([(\"ct\", ct), (\"est\", KNeighborsClassifier(**knn_params))])\n",
296296
" elif trial.method.name == \"aplr\":\n",
297+
" fit_params[\"y\"] = fit_params[\"y\"].astype(str)\n",
298+
" y_test = y_test.astype(str)\n",
297299
" ct.sparse_threshold = 0 # APLR only handles dense\n",
298300
" if trial.task.name in {\"CIFAR_10\", \"Fashion-MNIST\", \"Devnagari-Script\", \"mnist_784\"}:\n",
299301
" max_samples = 10000 # crashes or fit time too long without subsampling\n",
300302
" est = Pipeline([(\"ct\", ct), (\"est\", APLRClassifier(**aplr_params))])\n",
301-
" fit_params[\"y\"] = fit_params[\"y\"].astype(str).to_numpy()\n",
302-
" y_test = y_test.astype(str).to_numpy()\n",
303303
" else:\n",
304304
" raise Exception(f\"Unrecognized method name {trial.method.name}\")\n",
305305
"\n",
@@ -350,8 +350,6 @@
350350
" if trial.task.name in {\"Airlines_DepDelay_10M\"}:\n",
351351
" max_samples = 100000 # crashes or fit time too long without subsampling\n",
352352
" est = Pipeline([(\"ct\", ct), (\"est\", APLRRegressor(**aplr_params))])\n",
353-
" fit_params[\"y\"] = fit_params[\"y\"].astype(str).to_numpy()\n",
354-
" y_test = y_test.astype(str).to_numpy()\n",
355353
" else:\n",
356354
" raise Exception(f\"Unrecognized method name {trial.method.name}\")\n",
357355
"\n",

python/powerlift/powerlift/bench/experiment.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,15 @@
44
"""
55

66
import pandas as pd
7+
import numpy as np
78
from typing import Dict, Iterable
89
from typing import Type, TypeVar
910
from typing import Union, Optional, List
1011
from dataclasses import dataclass
1112
from numbers import Number
1213
import time
1314

14-
from powerlift.bench.store import Store
15+
from powerlift.bench.store import Store, MIMETYPE_SERIES
1516

1617

1718
@dataclass(frozen=True)
@@ -105,6 +106,8 @@ def data(self, aliases: Iterable[str]) -> List[object]:
105106
name = alias_map[alias]
106107
asset = name_to_asset[name]
107108
parsed = BytesParser.deserialize(asset.mimetype, asset.embedded)
109+
if asset.mimetype == MIMETYPE_SERIES:
110+
parsed = np.array(parsed)
108111
outputs.append(parsed)
109112
return outputs
110113

python/powerlift/powerlift/bench/store.py

+56-40
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ def serialize(cls, obj):
8484
orig_close = bstream.close
8585
bstream.close = lambda: None
8686
try:
87-
obj.astype(dtype=object).to_frame(name="Target").to_parquet(
87+
obj.to_frame(name="Target").to_parquet(
8888
bstream, compression="Brotli", index=False
8989
)
9090
finally:
@@ -1287,6 +1287,7 @@ def retrieve_openml(
12871287
suite = openml.study.get_suite(suite_id)
12881288
tasks = suite.tasks.copy()
12891289
random.Random(1337).shuffle(tasks)
1290+
cat_type = pd.CategoricalDtype(ordered=False)
12901291
for task_id in tqdm(tasks, desc=source):
12911292
task = openml.tasks.get_task(
12921293
task_id,
@@ -1313,48 +1314,23 @@ def retrieve_openml(
13131314
)
13141315

13151316
if task.task_type_id == openml.tasks.TaskType.SUPERVISED_CLASSIFICATION:
1316-
problem = (
1317-
"binary"
1318-
if dataset.qualities["NumberOfClasses"] == 2
1319-
else "multiclass"
1320-
)
1317+
classes, y = np.unique(y.values, return_inverse=True)
1318+
problem = "binary" if len(classes) == 2 else "multiclass"
13211319

13221320
# for benchmarking we do not care about the original target strings
1323-
y = pd.Series(np.unique(y, return_inverse=True)[1])
1321+
y = pd.Series(y, dtype=np.int16)
13241322
elif task.task_type_id == openml.tasks.TaskType.SUPERVISED_REGRESSION:
13251323
problem = "regression"
1324+
y = pd.Series(y, dtype=np.float64)
13261325
else:
13271326
raise Exception(f"Unrecognized task_type_id {task.task_type_id}.")
13281327

13291328
for col_name, cat in zip(X.columns, categorical_mask):
13301329
col = X[col_name]
1331-
1332-
if pd.api.types.is_sparse(col):
1333-
col = col.sparse.to_dense()
1334-
X[col_name] = col
1335-
1336-
if col.dtype.name == "category":
1337-
if not cat:
1338-
raise Exception(
1339-
f"Categorical type mismatch. Was CategoricalDtype but indicated non-categorical."
1340-
)
1341-
if col.cat.ordered:
1342-
# OpenMl incorrectly is indicating these as ordered
1343-
X[col_name] = col.cat.as_unordered()
1344-
elif col.dtype.name == "object":
1345-
if cat:
1346-
X[col_name] = col.astype(pd.CategoricalDtype(ordered=False))
1347-
else:
1348-
X[col_name] = col.astype(float)
1349-
elif np.issubdtype(col.dtype, np.floating) or np.issubdtype(
1350-
col.dtype, np.integer
1351-
):
1352-
if cat:
1353-
raise Exception(
1354-
f"Categorical type mismatch. Was continuous but indicated categorical."
1355-
)
1330+
if cat:
1331+
X[col_name] = pd.Series(col, dtype=cat_type, name=col.name)
13561332
else:
1357-
raise Exception(f"Unrecognized data type {col.dtype.name}.")
1333+
X[col_name] = pd.Series(col, dtype=np.float64, name=col.name)
13581334

13591335
meta = {
13601336
"name": name,
@@ -1480,6 +1456,7 @@ def retrieve_catboost_50k(
14801456
if cache_dir is not None:
14811457
cache_dir = pathlib.Path(cache_dir, "catboost_50k")
14821458

1459+
cat_type = pd.CategoricalDtype(ordered=False)
14831460
for dataset in tqdm(datasets, desc="catboost_50k"):
14841461
name = dataset["name"]
14851462
X_name = f"{name}.X.parquet"
@@ -1492,14 +1469,34 @@ def retrieve_catboost_50k(
14921469
target = dataset["target"]
14931470
X = df.drop(target, axis=1)
14941471
y = df[target]
1495-
problem = dataset["problem"]
1496-
if dataset["problem"] == "classification":
1497-
problem = "binary" if len(y.unique()) == 2 else "multiclass"
1472+
problem_type = dataset["problem"]
1473+
1474+
if problem_type == "classification":
1475+
classes, y = np.unique(y.values, return_inverse=True)
1476+
problem = "binary" if len(classes) == 2 else "multiclass"
1477+
1478+
# for benchmarking we do not care about the original target strings
1479+
y = pd.Series(y, dtype=np.int16)
1480+
elif problem_type == "regression":
1481+
problem = "regression"
1482+
y = pd.Series(y, dtype=np.float64)
1483+
else:
1484+
raise Exception(f"Unrecognized problem {problem_type}.")
1485+
1486+
categorical_mask = [dt.kind == "O" for dt in X.dtypes]
1487+
1488+
for col_name, cat in zip(X.columns, categorical_mask):
1489+
col = X[col_name]
1490+
if cat:
1491+
X[col_name] = pd.Series(col, dtype=cat_type, name=col.name)
1492+
else:
1493+
X[col_name] = pd.Series(col, dtype=np.float64, name=col.name)
1494+
14981495
meta = {
14991496
"name": name,
15001497
"problem": problem,
15011498
"source": "catboost_50k",
1502-
"categorical_mask": [dt.kind == "O" for dt in X.dtypes],
1499+
"categorical_mask": categorical_mask,
15031500
"feature_names": list(X.columns),
15041501
}
15051502
supervised = SupervisedDataset(X, y, meta)
@@ -1531,6 +1528,7 @@ def retrieve_pmlb(cache_dir: str = None) -> Generator[SupervisedDataset, None, N
15311528
)
15321529
dataset_names.extend([("regression", name) for name in regression_dataset_names])
15331530

1531+
cat_type = pd.CategoricalDtype(ordered=False)
15341532
for problem_type, dataset_name in tqdm(dataset_names, desc="pmlb"):
15351533
name = dataset_name
15361534
X_name = f"{name}.X.parquet"
@@ -1542,14 +1540,32 @@ def retrieve_pmlb(cache_dir: str = None) -> Generator[SupervisedDataset, None, N
15421540
df = fetch_data(dataset_name)
15431541
X = df.drop("target", axis=1)
15441542
y = df["target"]
1545-
problem = problem_type
15461543
if problem_type == "classification":
1547-
problem = "binary" if len(y.unique()) == 2 else "multiclass"
1544+
classes, y = np.unique(y.values, return_inverse=True)
1545+
problem = "binary" if len(classes) == 2 else "multiclass"
1546+
1547+
# for benchmarking we do not care about the original target strings
1548+
y = pd.Series(y, dtype=np.int16)
1549+
elif problem_type == "regression":
1550+
problem = "regression"
1551+
y = pd.Series(y, dtype=np.float64)
1552+
else:
1553+
raise Exception(f"Unrecognized problem_type {problem_type}.")
1554+
1555+
categorical_mask = [dt.kind == "O" for dt in X.dtypes]
1556+
1557+
for col_name, cat in zip(X.columns, categorical_mask):
1558+
col = X[col_name]
1559+
if cat:
1560+
X[col_name] = pd.Series(col, dtype=cat_type, name=col.name)
1561+
else:
1562+
X[col_name] = pd.Series(col, dtype=np.float64, name=col.name)
1563+
15481564
meta = {
15491565
"name": name,
15501566
"problem": problem,
15511567
"source": "pmlb",
1552-
"categorical_mask": [dt.kind == "O" for dt in X.dtypes],
1568+
"categorical_mask": categorical_mask,
15531569
"feature_names": list(X.columns),
15541570
}
15551571
supervised = SupervisedDataset(X, y, meta)

0 commit comments

Comments
 (0)