diff --git a/docs/releases/unreleased.md b/docs/releases/unreleased.md index 4ed6dd48f1..6165c0739b 100644 --- a/docs/releases/unreleased.md +++ b/docs/releases/unreleased.md @@ -26,3 +26,7 @@ ## stream - `stream.iter_arff` now supports blank values (treated as missing values). + +## preprocessing + +- Add support for expected categories in `preprocessing.OneHotEncoder`, `preprocessing.OrdinalEncoder`, akin to scikit-learn API for respective encoders. diff --git a/river/preprocessing/one_hot.py b/river/preprocessing/one_hot.py index 59d3337fae..3272258d3b 100644 --- a/river/preprocessing/one_hot.py +++ b/river/preprocessing/one_hot.py @@ -19,6 +19,14 @@ class OneHotEncoder(base.MiniBatchTransformer): Parameters ---------- + categories + Categories (unique values) per feature: + `None` : Determine categories automatically from the training data. + + dict of dicts : Expected categories for each feature. The outer dict maps each feature to its inner dict. + The inner dict maps each category to its code. + + The used categories can be found in the `values` attribute. drop_zeros Whether or not 0s should be made explicit or not. drop_first @@ -26,6 +34,12 @@ class OneHotEncoder(base.MiniBatchTransformer): This is useful in some statistical models where perfectly collinear features cause problems. + Attributes + ---------- + values + A dict of dicts. The outer dict maps each feature to its inner dict. The inner dict maps + each category to its code. + Examples -------- @@ -46,11 +60,11 @@ class OneHotEncoder(base.MiniBatchTransformer): ... ] >>> pprint(X) [{'c1': 'u', 'c2': 'd'}, - {'c1': 'a', 'c2': 'x'}, - {'c1': 'i', 'c2': 'h'}, - {'c1': 'h', 'c2': 'e'}] + {'c1': 'a', 'c2': 'x'}, + {'c1': 'i', 'c2': 'h'}, + {'c1': 'h', 'c2': 'e'}] - e can now apply one-hot encoding. All the provided are one-hot encoded, there is therefore + We can now apply one-hot encoding. All the provided are one-hot encoded, there is therefore no need to specify which features to encode. >>> from river import preprocessing @@ -85,6 +99,28 @@ class OneHotEncoder(base.MiniBatchTransformer): {'c2_h': 1} {'c2_e': 1} + Like in `scikit-learn`, you can also specify the expected categories manually. + This is handy when you want to constrain category encoding space + to e.g. top 20% most popular category values you've picked in advance. + + >>> categories = {'c1': {'a', 'h'}, 'c2': {'x', 'e'}} + >>> oh = preprocessing.OneHotEncoder(categories=categories) + >>> for x in X: + ... oh.learn_one(x) + ... pprint(oh.transform_one(x)) + {'c1_a': 0, 'c1_h': 0, 'c2_e': 0, 'c2_x': 0} + {'c1_a': 1, 'c1_h': 0, 'c2_e': 0, 'c2_x': 1} + {'c1_a': 0, 'c1_h': 0, 'c2_e': 0, 'c2_x': 0} + {'c1_a': 0, 'c1_h': 1, 'c2_e': 1, 'c2_x': 0} + + >>> for key in sorted(oh.values.keys()): + ... print(key) + ... print(sorted(oh.values[key])) + c1 + ['a', 'h'] + c2 + ['e', 'x'] + A subset of the features can be one-hot encoded by piping a `compose.Select` into the `OneHotEncoder`. @@ -135,6 +171,7 @@ class OneHotEncoder(base.MiniBatchTransformer): >>> from pprint import pprint >>> import random >>> import string + >>> import pandas as pd >>> random.seed(42) >>> alphabet = list(string.ascii_lowercase) @@ -192,23 +229,45 @@ class OneHotEncoder(base.MiniBatchTransformer): c2_x Sparse[uint8, 0] dtype: object + Explicit categories: + + >>> oh = preprocessing.OneHotEncoder(categories=categories) + + + >>> oh.learn_many(X) + >>> df = oh.transform_many(X) + >>> df.sort_index(axis="columns") + c1_a c1_h c2_e c2_x + 0 0 0 0 0 + 1 1 0 0 1 + 2 0 0 0 0 + """ - def __init__(self, drop_zeros=False, drop_first=False): + def __init__(self, categories: dict | None = None, drop_zeros=False, drop_first=False): self.drop_zeros = drop_zeros self.drop_first = drop_first - self.values = collections.defaultdict(set) + self.categories = categories + self.values: collections.defaultdict | dict | None = None + + if self.categories is None: + self.values = collections.defaultdict(set) + else: + self.values = self.categories def learn_one(self, x): if self.drop_zeros: return - for i, xi in x.items(): - if isinstance(xi, list) or isinstance(xi, set): - for xj in xi: - self.values[i].add(xj) - else: - self.values[i].add(xi) + # NOTE: assume if category mappings are explicitly provided, + # they're intended to be kept fixed. + if self.categories is None: + for i, xi in x.items(): + if isinstance(xi, list) or isinstance(xi, set): + for xj in xi: + self.values[i].add(xj) + else: + self.values[i].add(xi) def transform_one(self, x, y=None): oh = {} @@ -217,13 +276,25 @@ def transform_one(self, x, y=None): if not self.drop_zeros: oh = {f"{i}_{v}": 0 for i, values in self.values.items() for v in values} - # Add 1s - for i, xi in x.items(): - if isinstance(xi, list) or isinstance(xi, set): - for xj in xi: - oh[f"{i}_{xj}"] = 1 - else: - oh[f"{i}_{xi}"] = 1 + # Add 1 + # NOTE: assume if category mappings are explicitly provided, + # no other category values are allowed for output. Aligns with `sklearn` behavior. + if self.categories is None: + for i, xi in x.items(): + if isinstance(xi, list) or isinstance(xi, set): + for xj in xi: + oh[f"{i}_{xj}"] = 1 + else: + oh[f"{i}_{xi}"] = 1 + else: + for i, xi in x.items(): + if isinstance(xi, list) or isinstance(xi, set): + for xj in xi: + if xj in self.values[i]: + oh[f"{i}_{xj}"] = 1 + else: + if xi in self.values[i]: + oh[f"{i}_{xi}"] = 1 if self.drop_first: oh.pop(min(oh.keys())) @@ -234,12 +305,22 @@ def learn_many(self, X): if self.drop_zeros: return - for col in X.columns: - self.values[col].update(X[col].unique()) + # NOTE: assume if category mappings are explicitly provided, + # they're intended to be kept fixed. + if self.categories is None: + for col in X.columns: + self.values[col].update(X[col].unique()) def transform_many(self, X): oh = pd.get_dummies(X, columns=X.columns, sparse=True, dtype="uint8") + # NOTE: assume if category mappings are explicitly provided, + # no other category values are allowed for output. Aligns with `sklearn` behavior. + if self.categories is not None: + seen_in_the_past = {f"{col}_{val}" for col, vals in self.values.items() for val in vals} + to_remove = set(oh.columns) - seen_in_the_past + oh.drop(columns=list(to_remove), inplace=True) + if not self.drop_zeros: seen_in_the_past = {f"{col}_{val}" for col, vals in self.values.items() for val in vals} to_add = seen_in_the_past - set(oh.columns) diff --git a/river/preprocessing/ordinal.py b/river/preprocessing/ordinal.py index 01b981ac86..60346fb10c 100644 --- a/river/preprocessing/ordinal.py +++ b/river/preprocessing/ordinal.py @@ -22,6 +22,14 @@ class OrdinalEncoder(base.MiniBatchTransformer): Parameters ---------- + categories + Categories (unique values) per feature: + `None` : Determine categories automatically from the training data. + + dict of dicts : Expected categories for each feature. The outer dict maps each feature to its inner dict. + The inner dict maps each category to its code. + + The used categories can be found in the `values` attribute. unknown_value The value to use for unknown categories seen during `transform_one`. Unknown categories will be mapped to an integer once they are seen during `learn_one`. This value can be set @@ -31,7 +39,7 @@ class OrdinalEncoder(base.MiniBatchTransformer): Attributes ---------- - categories + values A dict of dicts. The outer dict maps each feature to its inner dict. The inner dict maps each category to its code. @@ -64,6 +72,26 @@ class OrdinalEncoder(base.MiniBatchTransformer): {'country': 2, 'place': 1} {'country': -1, 'place': -1} + Like in `scikit-learn`, you can also specify the expected categories manually. + This is handy when you want to constrain category encoding space + to e.g. top 20% most popular category values you've picked in advance. + + >>> categories = {'country': {'France': 1}, + ... 'place': {'Burger King': 2, 'Starbucks': 3}} + >>> encoder = preprocessing.OrdinalEncoder(categories=categories) + >>> for x in X: + ... print(encoder.transform_one(x)) + ... encoder.learn_one(x) + {'country': 1, 'place': 0} + {'country': -1, 'place': -1} + {'country': 0, 'place': 2} + {'country': 1, 'place': 2} + {'country': 0, 'place': 3} + {'country': 0, 'place': 3} + {'country': 0, 'place': 0} + {'country': -1, 'place': -1} + + >>> import pandas as pd >>> xb1 = pd.DataFrame(X[0:4], index=[0, 1, 2, 3]) >>> xb2 = pd.DataFrame(X[4:8], index=[4, 5, 6, 7]) @@ -87,40 +115,45 @@ class OrdinalEncoder(base.MiniBatchTransformer): def __init__( self, + categories: dict | None = None, unknown_value: int | None = 0, none_value: int = -1, ): self.unknown_value = unknown_value self.none_value = none_value - - # We're going to have one auto-incrementing counter per feature. This counter will generate - # the category codes for each feature. - self._counters: collections.defaultdict = collections.defaultdict( - functools.partial(make_counter, {unknown_value, none_value}) - ) - - # We're going to store the categories in a dict of dicts. The outer dict will map each - # feature to its inner dict. The inner dict will map each category to its code. - self.categories: collections.defaultdict = collections.defaultdict(dict) + self.categories = categories + self.values: collections.defaultdict | dict | None = None + + if self.categories is None: + # We're going to have one auto-incrementing counter per feature. This counter will generate + # the category codes for each feature. + self._counters: collections.defaultdict = collections.defaultdict( + functools.partial(make_counter, {unknown_value, none_value}) + ) + + # We're going to store the categories in a dict of dicts. The outer dict will map each + # feature to its inner dict. The inner dict will map each category to its code. + self.values = collections.defaultdict(dict) + else: + self.values = self.categories def transform_one(self, x): return { - i: self.none_value if xi is None else self.categories[i].get(xi, self.unknown_value) + i: self.none_value if xi is None else self.values[i].get(xi, self.unknown_value) for i, xi in x.items() } def learn_one(self, x): - for i, xi in x.items(): - if xi is not None and xi not in self.categories[i]: - self.categories[i][xi] = next(self._counters[i]) + if self.categories is None: + for i, xi in x.items(): + if xi is not None and xi not in self.values[i]: + self.values[i][xi] = next(self._counters[i]) def transform_many(self, X): return pd.DataFrame( { i: pd.Series( - X[i] - .map({**self.categories[i], None: self.none_value}) - .fillna(self.unknown_value), + X[i].map({**self.values[i], None: self.none_value}).fillna(self.unknown_value), dtype=np.int64, ) for i in X.columns @@ -128,7 +161,8 @@ def transform_many(self, X): ) def learn_many(self, X, y=None): - for i in X.columns: - for xi in X[i].dropna().unique(): - if xi not in self.categories[i]: - self.categories[i][xi] = next(self._counters[i]) + if self.categories is None: + for i in X.columns: + for xi in X[i].dropna().unique(): + if xi not in self.values[i]: + self.values[i][xi] = next(self._counters[i])