From 2fce4f3ba5e7a441888668773fc4c27aecff79f6 Mon Sep 17 00:00:00 2001 From: Alexey C <54956904+ColdTeapot273K@users.noreply.github.com> Date: Sat, 24 Aug 2024 03:13:53 +0500 Subject: [PATCH 1/9] Add support for explicitly expected categories for OHE --- river/preprocessing/one_hot.py | 133 +++++++++++++++++++++++++++------ 1 file changed, 112 insertions(+), 21 deletions(-) diff --git a/river/preprocessing/one_hot.py b/river/preprocessing/one_hot.py index 59d3337fae..ad74d34fcd 100644 --- a/river/preprocessing/one_hot.py +++ b/river/preprocessing/one_hot.py @@ -46,11 +46,11 @@ class OneHotEncoder(base.MiniBatchTransformer): ... ] >>> pprint(X) [{'c1': 'u', 'c2': 'd'}, - {'c1': 'a', 'c2': 'x'}, - {'c1': 'i', 'c2': 'h'}, - {'c1': 'h', 'c2': 'e'}] + {'c1': 'a', 'c2': 'x'}, + {'c1': 'i', 'c2': 'h'}, + {'c1': 'h', 'c2': 'e'}] - e can now apply one-hot encoding. All the provided are one-hot encoded, there is therefore + We can now apply one-hot encoding. All the provided are one-hot encoded, there is therefore no need to specify which features to encode. >>> from river import preprocessing @@ -85,6 +85,45 @@ class OneHotEncoder(base.MiniBatchTransformer): {'c2_h': 1} {'c2_e': 1} + Like in `scikit-learn`, you can also specify the expected categories manually. + This is handy when you want to constrain category encoding space + to e.g. top 20% most popular category values you've picked in advance. + + X = [ + { + 'c1': random.choice(alphabet), + 'c2': random.choice(alphabet), + } + for _ in range(4) + ] + pprint(X) + + >>> categories = {'c1': {'a', 'h'}, 'c2': {'x', 'e'}} + >>> oh = preprocessing.OneHotEncoder(categories=categories) + >>> # oh = preprocessing.OneHotEncoder() + >>> for x in X: + ... oh.learn_one(x) + ... 
pprint(oh.transform_one(x)) + {'c1_a': 0, 'c1_h': 0, 'c2_e': 0, 'c2_x': 0} + {'c1_a': 1, 'c1_h': 0, 'c2_e': 0, 'c2_x': 1} + {'c1_a': 0, 'c1_h': 0, 'c2_e': 0, 'c2_x': 0} + {'c1_a': 0, 'c1_h': 1, 'c2_e': 1, 'c2_x': 0} + + >>> for key in sorted(oh.values.keys()): + ... print(key) + ... print(sorted(oh.values[key])) + c1 + ['a', 'h'] + c2 + ['e', 'x'] + + + oh.values.items() + [{'c1': {'a', 'h'}, 'c2': {'e', 'x'}}] + [{'c1': {'a', 'h'}, 'c2': {'e', 'x'}}] + + {'c1': {'a', 'h', 'i', 'u'}, 'c2': {'d', 'e', 'h', 'x'}} + A subset of the features can be one-hot encoded by piping a `compose.Select` into the `OneHotEncoder`. @@ -192,23 +231,53 @@ class OneHotEncoder(base.MiniBatchTransformer): c2_x Sparse[uint8, 0] dtype: object + Explicit categories: + + >>> oh = preprocessing.OneHotEncoder(categories=categories) + + # oh = preprocessing.OneHotEncoder() + >>> oh.learn_many(X) + >>> df = oh.transform_many(X) + >>> df.sort_index(axis="columns") + c1_a c1_h c2_e c2_x + 0 0 0 0 0 + 1 1 0 0 1 + 2 0 0 0 0 + + # c1_a c1_i c1_u c2_d c2_h c2_x + # 0 0 0 1 1 0 0 + # 1 1 0 0 0 0 1 + # 2 0 1 0 0 1 0 + + c1_a c1_h c2_e c2_x + 0 0 0 0 0 + 1 1 0 0 1 + 2 0 0 0 0 """ - def __init__(self, drop_zeros=False, drop_first=False): + def __init__(self, categories: str | dict = "auto", drop_zeros=False, drop_first=False): self.drop_zeros = drop_zeros self.drop_first = drop_first - self.values = collections.defaultdict(set) + self.categories = categories + + if self.categories == "auto": + self.values = collections.defaultdict(set) + else: + self.values = self.categories def learn_one(self, x): if self.drop_zeros: return - for i, xi in x.items(): - if isinstance(xi, list) or isinstance(xi, set): - for xj in xi: - self.values[i].add(xj) - else: - self.values[i].add(xi) + # NOTE: assume if category mappings are explicitly provided, + # they're intended to be kept fixed. 
+ if self.categories == "auto": + for i, xi in x.items(): + if isinstance(xi, list) or isinstance(xi, set): + for xj in xi: + self.values[i].add(xj) + else: + self.values[i].add(xi) def transform_one(self, x, y=None): oh = {} @@ -217,13 +286,25 @@ def transform_one(self, x, y=None): if not self.drop_zeros: oh = {f"{i}_{v}": 0 for i, values in self.values.items() for v in values} - # Add 1s - for i, xi in x.items(): - if isinstance(xi, list) or isinstance(xi, set): - for xj in xi: - oh[f"{i}_{xj}"] = 1 - else: - oh[f"{i}_{xi}"] = 1 + # Add 1 + # NOTE: assume if category mappings are explicitly provided, + # no other category values are allowed for output. Aligns with `sklearn` behavior. + if self.categories == "auto": + for i, xi in x.items(): + if isinstance(xi, list) or isinstance(xi, set): + for xj in xi: + oh[f"{i}_{xj}"] = 1 + else: + oh[f"{i}_{xi}"] = 1 + else: + for i, xi in x.items(): + if isinstance(xi, list) or isinstance(xi, set): + for xj in xi: + if xj in self.values[i]: + oh[f"{i}_{xj}"] = 1 + else: + if xi in self.values[i]: + oh[f"{i}_{xi}"] = 1 if self.drop_first: oh.pop(min(oh.keys())) @@ -234,12 +315,22 @@ def learn_many(self, X): if self.drop_zeros: return - for col in X.columns: - self.values[col].update(X[col].unique()) + # NOTE: assume if category mappings are explicitly provided, + # they're intended to be kept fixed. + if self.categories == "auto": + for col in X.columns: + self.values[col].update(X[col].unique()) def transform_many(self, X): oh = pd.get_dummies(X, columns=X.columns, sparse=True, dtype="uint8") + # NOTE: assume if category mappings are explicitly provided, + # no other category values are allowed for output. Aligns with `sklearn` behavior. 
+ if self.categories != "auto": + seen_in_the_past = {f"{col}_{val}" for col, vals in self.values.items() for val in vals} + to_remove = set(oh.columns) - seen_in_the_past + oh.drop(columns=list(to_remove), inplace=True) + if not self.drop_zeros: seen_in_the_past = {f"{col}_{val}" for col, vals in self.values.items() for val in vals} to_add = seen_in_the_past - set(oh.columns) From 537003daaa0d203b3bf24e54b35e7f5cc0d5011a Mon Sep 17 00:00:00 2001 From: Alexey C <54956904+ColdTeapot273K@users.noreply.github.com> Date: Sat, 24 Aug 2024 04:28:08 +0500 Subject: [PATCH 2/9] cleanup --- river/preprocessing/one_hot.py | 28 +--------------------------- 1 file changed, 1 insertion(+), 27 deletions(-) diff --git a/river/preprocessing/one_hot.py b/river/preprocessing/one_hot.py index ad74d34fcd..7d3dc556a8 100644 --- a/river/preprocessing/one_hot.py +++ b/river/preprocessing/one_hot.py @@ -89,15 +89,6 @@ class OneHotEncoder(base.MiniBatchTransformer): This is handy when you want to constrain category encoding space to e.g. top 20% most popular category values you've picked in advance. - X = [ - { - 'c1': random.choice(alphabet), - 'c2': random.choice(alphabet), - } - for _ in range(4) - ] - pprint(X) - >>> categories = {'c1': {'a', 'h'}, 'c2': {'x', 'e'}} >>> oh = preprocessing.OneHotEncoder(categories=categories) >>> # oh = preprocessing.OneHotEncoder() @@ -117,13 +108,6 @@ class OneHotEncoder(base.MiniBatchTransformer): c2 ['e', 'x'] - - oh.values.items() - [{'c1': {'a', 'h'}, 'c2': {'e', 'x'}}] - [{'c1': {'a', 'h'}, 'c2': {'e', 'x'}}] - - {'c1': {'a', 'h', 'i', 'u'}, 'c2': {'d', 'e', 'h', 'x'}} - A subset of the features can be one-hot encoded by piping a `compose.Select` into the `OneHotEncoder`. 
@@ -243,19 +227,9 @@ class OneHotEncoder(base.MiniBatchTransformer): 0 0 0 0 0 1 1 0 0 1 2 0 0 0 0 - - # c1_a c1_i c1_u c2_d c2_h c2_x - # 0 0 0 1 1 0 0 - # 1 1 0 0 0 0 1 - # 2 0 1 0 0 1 0 - - c1_a c1_h c2_e c2_x - 0 0 0 0 0 - 1 1 0 0 1 - 2 0 0 0 0 """ - def __init__(self, categories: str | dict = "auto", drop_zeros=False, drop_first=False): + def __init__(self, categories = "auto", drop_zeros=False, drop_first=False): self.drop_zeros = drop_zeros self.drop_first = drop_first self.categories = categories From 3dc5dd3cf88cb21d69086f476e8070988488bfb1 Mon Sep 17 00:00:00 2001 From: Alexey C <54956904+ColdTeapot273K@users.noreply.github.com> Date: Sat, 24 Aug 2024 17:54:14 +0500 Subject: [PATCH 3/9] Add expected category support for OrdinalEncoder --- river/preprocessing/ordinal.py | 62 ++++++++++++++++++++++++---------- 1 file changed, 45 insertions(+), 17 deletions(-) diff --git a/river/preprocessing/ordinal.py b/river/preprocessing/ordinal.py index 01b981ac86..79b757c189 100644 --- a/river/preprocessing/ordinal.py +++ b/river/preprocessing/ordinal.py @@ -64,6 +64,26 @@ class OrdinalEncoder(base.MiniBatchTransformer): {'country': 2, 'place': 1} {'country': -1, 'place': -1} + Like in `scikit-learn`, you can also specify the expected categories manually. + This is handy when you want to constrain category encoding space + to e.g. top 20% most popular category values you've picked in advance. + + >>> categories = {'country': {'France': 1}, + ... 'place': {'Burger King': 2, 'Starbucks': 3}} + >>> encoder = preprocessing.OrdinalEncoder(categories=categories) + >>> for x in X: + ... print(encoder.transform_one(x)) + ... 
encoder.learn_one(x) + {'country': 1, 'place': 0} + {'country': -1, 'place': -1} + {'country': 0, 'place': 2} + {'country': 1, 'place': 2} + {'country': 0, 'place': 3} + {'country': 0, 'place': 3} + {'country': 0, 'place': 0} + {'country': -1, 'place': -1} + + >>> import pandas as pd >>> xb1 = pd.DataFrame(X[0:4], index=[0, 1, 2, 3]) >>> xb2 = pd.DataFrame(X[4:8], index=[4, 5, 6, 7]) @@ -87,39 +107,46 @@ class OrdinalEncoder(base.MiniBatchTransformer): def __init__( self, + categories: str | dict = "auto", unknown_value: int | None = 0, none_value: int = -1, ): self.unknown_value = unknown_value self.none_value = none_value + self.categories = categories - # We're going to have one auto-incrementing counter per feature. This counter will generate - # the category codes for each feature. - self._counters: collections.defaultdict = collections.defaultdict( - functools.partial(make_counter, {unknown_value, none_value}) - ) + if self.categories == "auto": + # We're going to have one auto-incrementing counter per feature. This counter will generate + # the category codes for each feature. + self._counters: collections.defaultdict = collections.defaultdict( + functools.partial(make_counter, {unknown_value, none_value}) + ) + + # We're going to store the categories in a dict of dicts. The outer dict will map each + # feature to its inner dict. The inner dict will map each category to its code. + self.values: collections.defaultdict = collections.defaultdict(dict) + else: + self.values: dict = self.categories - # We're going to store the categories in a dict of dicts. The outer dict will map each - # feature to its inner dict. The inner dict will map each category to its code. 
- self.categories: collections.defaultdict = collections.defaultdict(dict) def transform_one(self, x): return { - i: self.none_value if xi is None else self.categories[i].get(xi, self.unknown_value) + i: self.none_value if xi is None else self.values[i].get(xi, self.unknown_value) for i, xi in x.items() } def learn_one(self, x): - for i, xi in x.items(): - if xi is not None and xi not in self.categories[i]: - self.categories[i][xi] = next(self._counters[i]) + if self.categories == "auto": + for i, xi in x.items(): + if xi is not None and xi not in self.values[i]: + self.values[i][xi] = next(self._counters[i]) def transform_many(self, X): return pd.DataFrame( { i: pd.Series( X[i] - .map({**self.categories[i], None: self.none_value}) + .map({**self.values[i], None: self.none_value}) .fillna(self.unknown_value), dtype=np.int64, ) @@ -128,7 +155,8 @@ def transform_many(self, X): ) def learn_many(self, X, y=None): - for i in X.columns: - for xi in X[i].dropna().unique(): - if xi not in self.categories[i]: - self.categories[i][xi] = next(self._counters[i]) + if self.categories == "auto": + for i in X.columns: + for xi in X[i].dropna().unique(): + if xi not in self.values[i]: + self.values[i][xi] = next(self._counters[i]) From 4ff7bb2f53715cd4cf25d0f6a524190c5a12ec72 Mon Sep 17 00:00:00 2001 From: Alexey C <54956904+ColdTeapot273K@users.noreply.github.com> Date: Sat, 24 Aug 2024 18:02:06 +0500 Subject: [PATCH 4/9] fix mypy complaints --- river/preprocessing/ordinal.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/river/preprocessing/ordinal.py b/river/preprocessing/ordinal.py index 79b757c189..62a93ddf68 100644 --- a/river/preprocessing/ordinal.py +++ b/river/preprocessing/ordinal.py @@ -82,7 +82,7 @@ class OrdinalEncoder(base.MiniBatchTransformer): {'country': 0, 'place': 3} {'country': 0, 'place': 0} {'country': -1, 'place': -1} - + >>> import pandas as pd >>> xb1 = pd.DataFrame(X[0:4], index=[0, 1, 2, 3]) >>> xb2 = pd.DataFrame(X[4:8], 
index=[4, 5, 6, 7]) @@ -107,13 +107,14 @@ class OrdinalEncoder(base.MiniBatchTransformer): def __init__( self, - categories: str | dict = "auto", + categories = "auto", unknown_value: int | None = 0, none_value: int = -1, ): self.unknown_value = unknown_value self.none_value = none_value self.categories = categories + self.values: collections.defaultdict | dict | None = None if self.categories == "auto": # We're going to have one auto-incrementing counter per feature. This counter will generate @@ -124,9 +125,9 @@ def __init__( # We're going to store the categories in a dict of dicts. The outer dict will map each # feature to its inner dict. The inner dict will map each category to its code. - self.values: collections.defaultdict = collections.defaultdict(dict) + self.values = collections.defaultdict(dict) else: - self.values: dict = self.categories + self.values = self.categories def transform_one(self, x): From 86aafa04f3184b75a388953550a65e5ab951cd2a Mon Sep 17 00:00:00 2001 From: Alexey C <54956904+ColdTeapot273K@users.noreply.github.com> Date: Tue, 10 Sep 2024 13:54:39 +0500 Subject: [PATCH 5/9] Update river/preprocessing/one_hot.py Code review fixes Co-authored-by: Max Halford --- river/preprocessing/one_hot.py | 1 - 1 file changed, 1 deletion(-) diff --git a/river/preprocessing/one_hot.py b/river/preprocessing/one_hot.py index 7d3dc556a8..0b1c3c1cf0 100644 --- a/river/preprocessing/one_hot.py +++ b/river/preprocessing/one_hot.py @@ -91,7 +91,6 @@ class OneHotEncoder(base.MiniBatchTransformer): >>> categories = {'c1': {'a', 'h'}, 'c2': {'x', 'e'}} >>> oh = preprocessing.OneHotEncoder(categories=categories) - >>> # oh = preprocessing.OneHotEncoder() >>> for x in X: ... oh.learn_one(x) ... 
pprint(oh.transform_one(x)) From cbb202c43648719c231370438597f6d3c273f1b0 Mon Sep 17 00:00:00 2001 From: Alexey C <54956904+ColdTeapot273K@users.noreply.github.com> Date: Tue, 10 Sep 2024 13:54:47 +0500 Subject: [PATCH 6/9] Update river/preprocessing/one_hot.py Code review fixes Co-authored-by: Max Halford --- river/preprocessing/one_hot.py | 1 + 1 file changed, 1 insertion(+) diff --git a/river/preprocessing/one_hot.py b/river/preprocessing/one_hot.py index 0b1c3c1cf0..0207073405 100644 --- a/river/preprocessing/one_hot.py +++ b/river/preprocessing/one_hot.py @@ -226,6 +226,7 @@ class OneHotEncoder(base.MiniBatchTransformer): 0 0 0 0 0 1 1 0 0 1 2 0 0 0 0 + """ def __init__(self, categories = "auto", drop_zeros=False, drop_first=False): From 4ce2ade7f5bc872d6634ed39e46fa5c1af10efa4 Mon Sep 17 00:00:00 2001 From: Alexey C <54956904+ColdTeapot273K@users.noreply.github.com> Date: Wed, 2 Oct 2024 02:21:28 +0500 Subject: [PATCH 7/9] Adjust default params, update respective docs --- river/preprocessing/one_hot.py | 31 +++++++++++++++++++++++-------- river/preprocessing/ordinal.py | 23 ++++++++++++++--------- 2 files changed, 37 insertions(+), 17 deletions(-) diff --git a/river/preprocessing/one_hot.py b/river/preprocessing/one_hot.py index 0207073405..609f6f6178 100644 --- a/river/preprocessing/one_hot.py +++ b/river/preprocessing/one_hot.py @@ -19,6 +19,14 @@ class OneHotEncoder(base.MiniBatchTransformer): Parameters ---------- + categories + Categories (unique values) per feature: + `None` : Determine categories automatically from the training data. + + dict of dicts : Expected categories for each feature. The outer dict maps each feature to its inner dict. + The inner dict maps each category to its code. + + The used categories can be found in the `values` attribute. drop_zeros Whether or not 0s should be made explicit or not. 
drop_first @@ -26,6 +34,12 @@ class OneHotEncoder(base.MiniBatchTransformer): This is useful in some statistical models where perfectly collinear features cause problems. + Attributes + ---------- + values + A dict of dicts. The outer dict maps each feature to its inner dict. The inner dict maps + each category to its code. + Examples -------- @@ -157,6 +171,7 @@ class OneHotEncoder(base.MiniBatchTransformer): >>> from pprint import pprint >>> import random >>> import string + >>> import pandas as pd >>> random.seed(42) >>> alphabet = list(string.ascii_lowercase) @@ -218,7 +233,7 @@ class OneHotEncoder(base.MiniBatchTransformer): >>> oh = preprocessing.OneHotEncoder(categories=categories) - # oh = preprocessing.OneHotEncoder() + >>> oh.learn_many(X) >>> df = oh.transform_many(X) >>> df.sort_index(axis="columns") @@ -226,15 +241,15 @@ class OneHotEncoder(base.MiniBatchTransformer): 0 0 0 0 0 1 1 0 0 1 2 0 0 0 0 - + """ - def __init__(self, categories = "auto", drop_zeros=False, drop_first=False): + def __init__(self, categories: dict | None = None, drop_zeros=False, drop_first=False): self.drop_zeros = drop_zeros self.drop_first = drop_first self.categories = categories - if self.categories == "auto": + if self.categories is None: self.values = collections.defaultdict(set) else: self.values = self.categories @@ -245,7 +260,7 @@ def learn_one(self, x): # NOTE: assume if category mappings are explicitly provided, # they're intended to be kept fixed. - if self.categories == "auto": + if self.categories is None: for i, xi in x.items(): if isinstance(xi, list) or isinstance(xi, set): for xj in xi: @@ -263,7 +278,7 @@ def transform_one(self, x, y=None): # Add 1 # NOTE: assume if category mappings are explicitly provided, # no other category values are allowed for output. Aligns with `sklearn` behavior. 
- if self.categories == "auto": + if self.categories is None: for i, xi in x.items(): if isinstance(xi, list) or isinstance(xi, set): for xj in xi: @@ -291,7 +306,7 @@ def learn_many(self, X): # NOTE: assume if category mappings are explicitly provided, # they're intended to be kept fixed. - if self.categories == "auto": + if self.categories is None: for col in X.columns: self.values[col].update(X[col].unique()) @@ -300,7 +315,7 @@ def transform_many(self, X): # NOTE: assume if category mappings are explicitly provided, # no other category values are allowed for output. Aligns with `sklearn` behavior. - if self.categories != "auto": + if self.categories is not None: seen_in_the_past = {f"{col}_{val}" for col, vals in self.values.items() for val in vals} to_remove = set(oh.columns) - seen_in_the_past oh.drop(columns=list(to_remove), inplace=True) diff --git a/river/preprocessing/ordinal.py b/river/preprocessing/ordinal.py index 62a93ddf68..60346fb10c 100644 --- a/river/preprocessing/ordinal.py +++ b/river/preprocessing/ordinal.py @@ -22,6 +22,14 @@ class OrdinalEncoder(base.MiniBatchTransformer): Parameters ---------- + categories + Categories (unique values) per feature: + `None` : Determine categories automatically from the training data. + + dict of dicts : Expected categories for each feature. The outer dict maps each feature to its inner dict. + The inner dict maps each category to its code. + + The used categories can be found in the `values` attribute. unknown_value The value to use for unknown categories seen during `transform_one`. Unknown categories will be mapped to an integer once they are seen during `learn_one`. This value can be set @@ -31,7 +39,7 @@ class OrdinalEncoder(base.MiniBatchTransformer): Attributes ---------- - categories + values A dict of dicts. The outer dict maps each feature to its inner dict. The inner dict maps each category to its code. 
@@ -107,7 +115,7 @@ class OrdinalEncoder(base.MiniBatchTransformer): def __init__( self, - categories = "auto", + categories: dict | None = None, unknown_value: int | None = 0, none_value: int = -1, ): @@ -116,7 +124,7 @@ def __init__( self.categories = categories self.values: collections.defaultdict | dict | None = None - if self.categories == "auto": + if self.categories is None: # We're going to have one auto-incrementing counter per feature. This counter will generate # the category codes for each feature. self._counters: collections.defaultdict = collections.defaultdict( @@ -129,7 +137,6 @@ def __init__( else: self.values = self.categories - def transform_one(self, x): return { i: self.none_value if xi is None else self.values[i].get(xi, self.unknown_value) @@ -137,7 +144,7 @@ def transform_one(self, x): } def learn_one(self, x): - if self.categories == "auto": + if self.categories is None: for i, xi in x.items(): if xi is not None and xi not in self.values[i]: self.values[i][xi] = next(self._counters[i]) @@ -146,9 +153,7 @@ def transform_many(self, X): return pd.DataFrame( { i: pd.Series( - X[i] - .map({**self.values[i], None: self.none_value}) - .fillna(self.unknown_value), + X[i].map({**self.values[i], None: self.none_value}).fillna(self.unknown_value), dtype=np.int64, ) for i in X.columns @@ -156,7 +161,7 @@ def transform_many(self, X): ) def learn_many(self, X, y=None): - if self.categories == "auto": + if self.categories is None: for i in X.columns: for xi in X[i].dropna().unique(): if xi not in self.values[i]: From 58f237c1f4203bd9de4072431aae4fafeabf574c Mon Sep 17 00:00:00 2001 From: Alexey C <54956904+ColdTeapot273K@users.noreply.github.com> Date: Wed, 2 Oct 2024 02:26:07 +0500 Subject: [PATCH 8/9] Fix pre-commit hook complaints --- river/preprocessing/one_hot.py | 1 + 1 file changed, 1 insertion(+) diff --git a/river/preprocessing/one_hot.py b/river/preprocessing/one_hot.py index 609f6f6178..3272258d3b 100644 --- a/river/preprocessing/one_hot.py +++ 
b/river/preprocessing/one_hot.py @@ -248,6 +248,7 @@ def __init__(self, categories: dict | None = None, drop_zeros=False, drop_first= self.drop_zeros = drop_zeros self.drop_first = drop_first self.categories = categories + self.values: collections.defaultdict | dict | None = None if self.categories is None: self.values = collections.defaultdict(set) From fcf01f1b6625f488ece059b17e9cec98bef06d76 Mon Sep 17 00:00:00 2001 From: Alexey C <54956904+ColdTeapot273K@users.noreply.github.com> Date: Tue, 5 Nov 2024 20:47:58 +0500 Subject: [PATCH 9/9] Upd unreleased.md --- docs/releases/unreleased.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/releases/unreleased.md b/docs/releases/unreleased.md index 6d142f06e2..676ad294ed 100644 --- a/docs/releases/unreleased.md +++ b/docs/releases/unreleased.md @@ -12,3 +12,7 @@ ## tree - Instead of letting trees grow indefinitely, setting the `max_depth` parameter to `None` will stop the trees from growing when they reach the system recursion limit. + +## preprocessing + +- Add support for expected categories in `preprocessing.OneHotEncoder` and `preprocessing.OrdinalEncoder`, akin to the scikit-learn API for the respective encoders.