Skip to content
107 changes: 86 additions & 21 deletions river/preprocessing/one_hot.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,11 @@ class OneHotEncoder(base.MiniBatchTransformer):
... ]
>>> pprint(X)
[{'c1': 'u', 'c2': 'd'},
{'c1': 'a', 'c2': 'x'},
{'c1': 'i', 'c2': 'h'},
{'c1': 'h', 'c2': 'e'}]
{'c1': 'a', 'c2': 'x'},
{'c1': 'i', 'c2': 'h'},
{'c1': 'h', 'c2': 'e'}]

e can now apply one-hot encoding. All the provided are one-hot encoded, there is therefore
We can now apply one-hot encoding. All the provided features are one-hot encoded, there is therefore
no need to specify which features to encode.

>>> from river import preprocessing
Expand Down Expand Up @@ -85,6 +85,29 @@ class OneHotEncoder(base.MiniBatchTransformer):
{'c2_h': 1}
{'c2_e': 1}

Like in `scikit-learn`, you can also specify the expected categories manually.
This is handy when you want to restrict the encoding space to a predetermined
subset of categories, e.g. the most frequent category values selected in advance.

>>> categories = {'c1': {'a', 'h'}, 'c2': {'x', 'e'}}
>>> oh = preprocessing.OneHotEncoder(categories=categories)
>>> for x in X:
... oh.learn_one(x)
... pprint(oh.transform_one(x))
{'c1_a': 0, 'c1_h': 0, 'c2_e': 0, 'c2_x': 0}
{'c1_a': 1, 'c1_h': 0, 'c2_e': 0, 'c2_x': 1}
{'c1_a': 0, 'c1_h': 0, 'c2_e': 0, 'c2_x': 0}
{'c1_a': 0, 'c1_h': 1, 'c2_e': 1, 'c2_x': 0}

>>> for key in sorted(oh.values.keys()):
... print(key)
... print(sorted(oh.values[key]))
c1
['a', 'h']
c2
['e', 'x']

A subset of the features can be one-hot encoded by piping a `compose.Select` into the
`OneHotEncoder`.

Expand Down Expand Up @@ -192,23 +215,43 @@ class OneHotEncoder(base.MiniBatchTransformer):
c2_x Sparse[uint8, 0]
dtype: object

Explicit categories:

>>> oh = preprocessing.OneHotEncoder(categories=categories)

>>> oh.learn_many(X)
>>> df = oh.transform_many(X)
>>> df.sort_index(axis="columns")
c1_a c1_h c2_e c2_x
0 0 0 0 0
1 1 0 0 1
2 0 0 0 0
"""

def __init__(self, drop_zeros=False, drop_first=False):
def __init__(self, categories="auto", drop_zeros=False, drop_first=False):
    """One-hot encoder.

    Parameters
    ----------
    categories
        Either ``"auto"`` (values are learned on the fly) or a dict mapping
        each feature name to the set of category values to encode. Explicit
        categories are kept fixed and never updated by ``learn_*``.
    drop_zeros
        Whether or not 0s should be omitted from the output.
    drop_first
        Whether to drop the first column (dummy encoding).
    """
    self.drop_zeros = drop_zeros
    self.drop_first = drop_first
    self.categories = categories

    if self.categories == "auto":
        # Values seen so far, learned incrementally: feature -> set of values.
        self.values = collections.defaultdict(set)
    else:
        # NOTE(review): this aliases the caller's dict rather than copying it;
        # safe as long as explicit categories are never mutated (learn_* skip them).
        self.values = self.categories

def learn_one(self, x):
    """Update the known category values with the features of ``x``.

    A feature value may be a single value or a list/set of values
    (multi-label case). This is a no-op when ``drop_zeros`` is set (values
    need not be tracked) or when categories were provided explicitly
    (they are intended to be kept fixed).
    """
    if self.drop_zeros:
        return

    # NOTE: explicitly provided category mappings are kept fixed.
    if self.categories == "auto":
        for feature, value in x.items():
            if isinstance(value, (list, set)):
                # Multi-valued feature: record every value.
                self.values[feature].update(value)
            else:
                self.values[feature].add(value)

def transform_one(self, x, y=None):
    """One-hot encode ``x``.

    Unless ``drop_zeros`` is set, the output contains an explicit 0 for
    every known (feature, value) pair that is absent from ``x``.
    """
    oh = {}

    # Add 0s for every known value.
    if not self.drop_zeros:
        oh = {f"{i}_{v}": 0 for i, values in self.values.items() for v in values}

    # Add 1s. If category mappings were provided explicitly, values outside
    # of them produce no output column — aligns with `sklearn` behavior.
    restrict = self.categories != "auto"
    for i, xi in x.items():
        # Normalize to an iterable: a feature may carry one value or many.
        candidates = xi if isinstance(xi, (list, set)) else (xi,)
        for xj in candidates:
            if restrict and xj not in self.values[i]:
                continue
            oh[f"{i}_{xj}"] = 1

    if self.drop_first:
        oh.pop(min(oh.keys()))

    return oh
def learn_many(self, X):
    """Update the known category values with the columns of ``X``.

    No-op when ``drop_zeros`` is set or when categories were provided
    explicitly (they are intended to be kept fixed).
    """
    if self.drop_zeros:
        return

    # NOTE: explicitly provided category mappings are kept fixed.
    if self.categories == "auto":
        for col in X.columns:
            self.values[col].update(X[col].unique())

def transform_many(self, X):
oh = pd.get_dummies(X, columns=X.columns, sparse=True, dtype="uint8")

# NOTE: assume if category mappings are explicitly provided,
# no other category values are allowed for output. Aligns with `sklearn` behavior.
if self.categories != "auto":
seen_in_the_past = {f"{col}_{val}" for col, vals in self.values.items() for val in vals}
to_remove = set(oh.columns) - seen_in_the_past
oh.drop(columns=list(to_remove), inplace=True)

if not self.drop_zeros:
seen_in_the_past = {f"{col}_{val}" for col, vals in self.values.items() for val in vals}
to_add = seen_in_the_past - set(oh.columns)
Expand Down
63 changes: 46 additions & 17 deletions river/preprocessing/ordinal.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,26 @@ class OrdinalEncoder(base.MiniBatchTransformer):
{'country': 2, 'place': 1}
{'country': -1, 'place': -1}

Like in `scikit-learn`, you can also specify the expected categories manually.
This is handy when you want to restrict the encoding space to a predetermined
subset of categories, e.g. the most frequent category values selected in advance.

>>> categories = {'country': {'France': 1},
... 'place': {'Burger King': 2, 'Starbucks': 3}}
>>> encoder = preprocessing.OrdinalEncoder(categories=categories)
>>> for x in X:
... print(encoder.transform_one(x))
... encoder.learn_one(x)
{'country': 1, 'place': 0}
{'country': -1, 'place': -1}
{'country': 0, 'place': 2}
{'country': 1, 'place': 2}
{'country': 0, 'place': 3}
{'country': 0, 'place': 3}
{'country': 0, 'place': 0}
{'country': -1, 'place': -1}

>>> import pandas as pd
>>> xb1 = pd.DataFrame(X[0:4], index=[0, 1, 2, 3])
>>> xb2 = pd.DataFrame(X[4:8], index=[4, 5, 6, 7])

Expand All @@ -87,39 +107,47 @@ class OrdinalEncoder(base.MiniBatchTransformer):

def __init__(
    self,
    categories="auto",
    unknown_value: int | None = 0,
    none_value: int = -1,
):
    """Ordinal encoder.

    Parameters
    ----------
    categories
        Either ``"auto"`` (codes are assigned on the fly) or a dict mapping
        each feature name to a ``{category: code}`` dict. Explicit mappings
        are kept fixed and never updated by ``learn_*``.
    unknown_value
        Code emitted for categories not seen during training.
    none_value
        Code emitted for ``None`` values.
    """
    self.unknown_value = unknown_value
    self.none_value = none_value
    self.categories = categories
    self.values: collections.defaultdict | dict | None = None

    if self.categories == "auto":
        # One auto-incrementing counter per feature; it generates the
        # category codes, skipping the reserved unknown/none codes.
        self._counters: collections.defaultdict = collections.defaultdict(
            functools.partial(make_counter, {unknown_value, none_value})
        )

        # Categories live in a dict of dicts: the outer dict maps each
        # feature to its inner dict; the inner dict maps category -> code.
        self.values = collections.defaultdict(dict)
    else:
        self.values = self.categories

def transform_one(self, x):
    """Encode each feature of ``x`` as its ordinal code.

    ``None`` maps to ``none_value``; categories with no known code map to
    ``unknown_value``.
    """
    return {
        i: self.none_value if xi is None else self.values[i].get(xi, self.unknown_value)
        for i, xi in x.items()
    }

def learn_one(self, x):
    """Assign a fresh code to every not-yet-seen category in ``x``.

    ``None`` values are ignored. No-op when categories were provided
    explicitly (they are intended to be kept fixed).
    """
    if self.categories == "auto":
        for i, xi in x.items():
            if xi is not None and xi not in self.values[i]:
                self.values[i][xi] = next(self._counters[i])

def transform_many(self, X):
    """Encode every column of ``X`` as ordinal codes.

    ``None``/missing values map to ``none_value``; unseen categories map to
    ``unknown_value``. Output columns use the ``int64`` dtype.
    """
    return pd.DataFrame(
        {
            i: pd.Series(
                X[i]
                .map({**self.values[i], None: self.none_value})
                .fillna(self.unknown_value),
                dtype=np.int64,
            )
            for i in X.columns
        }
    )

def learn_many(self, X, y=None):
    """Assign fresh codes to unseen categories in each column of ``X``.

    Missing values are dropped first. No-op when categories were provided
    explicitly (they are intended to be kept fixed).
    """
    if self.categories == "auto":
        for col in X.columns:
            for xi in X[col].dropna().unique():
                if xi not in self.values[col]:
                    self.values[col][xi] = next(self._counters[col])