From 2fce4f3ba5e7a441888668773fc4c27aecff79f6 Mon Sep 17 00:00:00 2001 From: Alexey C <54956904+ColdTeapot273K@users.noreply.github.com> Date: Sat, 24 Aug 2024 03:13:53 +0500 Subject: [PATCH 1/9] Add support for explicitly expected categories for OHE --- river/preprocessing/one_hot.py | 133 +++++++++++++++++++++++++++------ 1 file changed, 112 insertions(+), 21 deletions(-) diff --git a/river/preprocessing/one_hot.py b/river/preprocessing/one_hot.py index 59d3337fae..ad74d34fcd 100644 --- a/river/preprocessing/one_hot.py +++ b/river/preprocessing/one_hot.py @@ -46,11 +46,11 @@ class OneHotEncoder(base.MiniBatchTransformer): ... ] >>> pprint(X) [{'c1': 'u', 'c2': 'd'}, - {'c1': 'a', 'c2': 'x'}, - {'c1': 'i', 'c2': 'h'}, - {'c1': 'h', 'c2': 'e'}] + {'c1': 'a', 'c2': 'x'}, + {'c1': 'i', 'c2': 'h'}, + {'c1': 'h', 'c2': 'e'}] - e can now apply one-hot encoding. All the provided are one-hot encoded, there is therefore + We can now apply one-hot encoding. All the provided are one-hot encoded, there is therefore no need to specify which features to encode. >>> from river import preprocessing @@ -85,6 +85,45 @@ class OneHotEncoder(base.MiniBatchTransformer): {'c2_h': 1} {'c2_e': 1} + Like in `scikit-learn`, you can also specify the expected categories manually. + This is handy when you want to constrain category encoding space + to e.g. top 20% most popular category values you've picked in advance. + + X = [ + { + 'c1': random.choice(alphabet), + 'c2': random.choice(alphabet), + } + for _ in range(4) + ] + pprint(X) + + >>> categories = {'c1': {'a', 'h'}, 'c2': {'x', 'e'}} + >>> oh = preprocessing.OneHotEncoder(categories=categories) + >>> # oh = preprocessing.OneHotEncoder() + >>> for x in X: + ... oh.learn_one(x) + ... 
pprint(oh.transform_one(x)) + {'c1_a': 0, 'c1_h': 0, 'c2_e': 0, 'c2_x': 0} + {'c1_a': 1, 'c1_h': 0, 'c2_e': 0, 'c2_x': 1} + {'c1_a': 0, 'c1_h': 0, 'c2_e': 0, 'c2_x': 0} + {'c1_a': 0, 'c1_h': 1, 'c2_e': 1, 'c2_x': 0} + + >>> for key in sorted(oh.values.keys()): + ... print(key) + ... print(sorted(oh.values[key])) + c1 + ['a', 'h'] + c2 + ['e', 'x'] + + + oh.values.items() + [{'c1': {'a', 'h'}, 'c2': {'e', 'x'}}] + [{'c1': {'a', 'h'}, 'c2': {'e', 'x'}}] + + {'c1': {'a', 'h', 'i', 'u'}, 'c2': {'d', 'e', 'h', 'x'}} + A subset of the features can be one-hot encoded by piping a `compose.Select` into the `OneHotEncoder`. @@ -192,23 +231,53 @@ class OneHotEncoder(base.MiniBatchTransformer): c2_x Sparse[uint8, 0] dtype: object + Explicit categories: + + >>> oh = preprocessing.OneHotEncoder(categories=categories) + + # oh = preprocessing.OneHotEncoder() + >>> oh.learn_many(X) + >>> df = oh.transform_many(X) + >>> df.sort_index(axis="columns") + c1_a c1_h c2_e c2_x + 0 0 0 0 0 + 1 1 0 0 1 + 2 0 0 0 0 + + # c1_a c1_i c1_u c2_d c2_h c2_x + # 0 0 0 1 1 0 0 + # 1 1 0 0 0 0 1 + # 2 0 1 0 0 1 0 + + c1_a c1_h c2_e c2_x + 0 0 0 0 0 + 1 1 0 0 1 + 2 0 0 0 0 """ - def __init__(self, drop_zeros=False, drop_first=False): + def __init__(self, categories: str | dict = "auto", drop_zeros=False, drop_first=False): self.drop_zeros = drop_zeros self.drop_first = drop_first - self.values = collections.defaultdict(set) + self.categories = categories + + if self.categories == "auto": + self.values = collections.defaultdict(set) + else: + self.values = self.categories def learn_one(self, x): if self.drop_zeros: return - for i, xi in x.items(): - if isinstance(xi, list) or isinstance(xi, set): - for xj in xi: - self.values[i].add(xj) - else: - self.values[i].add(xi) + # NOTE: assume if category mappings are explicitly provided, + # they're intended to be kept fixed. 
+ if self.categories == "auto": + for i, xi in x.items(): + if isinstance(xi, list) or isinstance(xi, set): + for xj in xi: + self.values[i].add(xj) + else: + self.values[i].add(xi) def transform_one(self, x, y=None): oh = {} @@ -217,13 +286,25 @@ def transform_one(self, x, y=None): if not self.drop_zeros: oh = {f"{i}_{v}": 0 for i, values in self.values.items() for v in values} - # Add 1s - for i, xi in x.items(): - if isinstance(xi, list) or isinstance(xi, set): - for xj in xi: - oh[f"{i}_{xj}"] = 1 - else: - oh[f"{i}_{xi}"] = 1 + # Add 1 + # NOTE: assume if category mappings are explicitly provided, + # no other category values are allowed for output. Aligns with `sklearn` behavior. + if self.categories == "auto": + for i, xi in x.items(): + if isinstance(xi, list) or isinstance(xi, set): + for xj in xi: + oh[f"{i}_{xj}"] = 1 + else: + oh[f"{i}_{xi}"] = 1 + else: + for i, xi in x.items(): + if isinstance(xi, list) or isinstance(xi, set): + for xj in xi: + if xj in self.values[i]: + oh[f"{i}_{xj}"] = 1 + else: + if xi in self.values[i]: + oh[f"{i}_{xi}"] = 1 if self.drop_first: oh.pop(min(oh.keys())) @@ -234,12 +315,22 @@ def learn_many(self, X): if self.drop_zeros: return - for col in X.columns: - self.values[col].update(X[col].unique()) + # NOTE: assume if category mappings are explicitly provided, + # they're intended to be kept fixed. + if self.categories == "auto": + for col in X.columns: + self.values[col].update(X[col].unique()) def transform_many(self, X): oh = pd.get_dummies(X, columns=X.columns, sparse=True, dtype="uint8") + # NOTE: assume if category mappings are explicitly provided, + # no other category values are allowed for output. Aligns with `sklearn` behavior. 
+ if self.categories != "auto": + seen_in_the_past = {f"{col}_{val}" for col, vals in self.values.items() for val in vals} + to_remove = set(oh.columns) - seen_in_the_past + oh.drop(columns=list(to_remove), inplace=True) + if not self.drop_zeros: seen_in_the_past = {f"{col}_{val}" for col, vals in self.values.items() for val in vals} to_add = seen_in_the_past - set(oh.columns) From 537003daaa0d203b3bf24e54b35e7f5cc0d5011a Mon Sep 17 00:00:00 2001 From: Alexey C <54956904+ColdTeapot273K@users.noreply.github.com> Date: Sat, 24 Aug 2024 04:28:08 +0500 Subject: [PATCH 2/9] cleanup --- river/preprocessing/one_hot.py | 28 +--------------------------- 1 file changed, 1 insertion(+), 27 deletions(-) diff --git a/river/preprocessing/one_hot.py b/river/preprocessing/one_hot.py index ad74d34fcd..7d3dc556a8 100644 --- a/river/preprocessing/one_hot.py +++ b/river/preprocessing/one_hot.py @@ -89,15 +89,6 @@ class OneHotEncoder(base.MiniBatchTransformer): This is handy when you want to constrain category encoding space to e.g. top 20% most popular category values you've picked in advance. - X = [ - { - 'c1': random.choice(alphabet), - 'c2': random.choice(alphabet), - } - for _ in range(4) - ] - pprint(X) - >>> categories = {'c1': {'a', 'h'}, 'c2': {'x', 'e'}} >>> oh = preprocessing.OneHotEncoder(categories=categories) >>> # oh = preprocessing.OneHotEncoder() @@ -117,13 +108,6 @@ class OneHotEncoder(base.MiniBatchTransformer): c2 ['e', 'x'] - - oh.values.items() - [{'c1': {'a', 'h'}, 'c2': {'e', 'x'}}] - [{'c1': {'a', 'h'}, 'c2': {'e', 'x'}}] - - {'c1': {'a', 'h', 'i', 'u'}, 'c2': {'d', 'e', 'h', 'x'}} - A subset of the features can be one-hot encoded by piping a `compose.Select` into the `OneHotEncoder`. 
@@ -243,19 +227,9 @@ class OneHotEncoder(base.MiniBatchTransformer): 0 0 0 0 0 1 1 0 0 1 2 0 0 0 0 - - # c1_a c1_i c1_u c2_d c2_h c2_x - # 0 0 0 1 1 0 0 - # 1 1 0 0 0 0 1 - # 2 0 1 0 0 1 0 - - c1_a c1_h c2_e c2_x - 0 0 0 0 0 - 1 1 0 0 1 - 2 0 0 0 0 """ - def __init__(self, categories: str | dict = "auto", drop_zeros=False, drop_first=False): + def __init__(self, categories = "auto", drop_zeros=False, drop_first=False): self.drop_zeros = drop_zeros self.drop_first = drop_first self.categories = categories From 3dc5dd3cf88cb21d69086f476e8070988488bfb1 Mon Sep 17 00:00:00 2001 From: Alexey C <54956904+ColdTeapot273K@users.noreply.github.com> Date: Sat, 24 Aug 2024 17:54:14 +0500 Subject: [PATCH 3/9] Add expected category support for OrdinalEncoder --- river/preprocessing/ordinal.py | 62 ++++++++++++++++++++++++---------- 1 file changed, 45 insertions(+), 17 deletions(-) diff --git a/river/preprocessing/ordinal.py b/river/preprocessing/ordinal.py index 01b981ac86..79b757c189 100644 --- a/river/preprocessing/ordinal.py +++ b/river/preprocessing/ordinal.py @@ -64,6 +64,26 @@ class OrdinalEncoder(base.MiniBatchTransformer): {'country': 2, 'place': 1} {'country': -1, 'place': -1} + Like in `scikit-learn`, you can also specify the expected categories manually. + This is handy when you want to constrain category encoding space + to e.g. top 20% most popular category values you've picked in advance. + + >>> categories = {'country': {'France': 1}, + ... 'place': {'Burger King': 2, 'Starbucks': 3}} + >>> encoder = preprocessing.OrdinalEncoder(categories=categories) + >>> for x in X: + ... print(encoder.transform_one(x)) + ... 
encoder.learn_one(x) + {'country': 1, 'place': 0} + {'country': -1, 'place': -1} + {'country': 0, 'place': 2} + {'country': 1, 'place': 2} + {'country': 0, 'place': 3} + {'country': 0, 'place': 3} + {'country': 0, 'place': 0} + {'country': -1, 'place': -1} + + >>> import pandas as pd >>> xb1 = pd.DataFrame(X[0:4], index=[0, 1, 2, 3]) >>> xb2 = pd.DataFrame(X[4:8], index=[4, 5, 6, 7]) @@ -87,39 +107,46 @@ class OrdinalEncoder(base.MiniBatchTransformer): def __init__( self, + categories: str | dict = "auto", unknown_value: int | None = 0, none_value: int = -1, ): self.unknown_value = unknown_value self.none_value = none_value + self.categories = categories - # We're going to have one auto-incrementing counter per feature. This counter will generate - # the category codes for each feature. - self._counters: collections.defaultdict = collections.defaultdict( - functools.partial(make_counter, {unknown_value, none_value}) - ) + if self.categories == "auto": + # We're going to have one auto-incrementing counter per feature. This counter will generate + # the category codes for each feature. + self._counters: collections.defaultdict = collections.defaultdict( + functools.partial(make_counter, {unknown_value, none_value}) + ) + + # We're going to store the categories in a dict of dicts. The outer dict will map each + # feature to its inner dict. The inner dict will map each category to its code. + self.values: collections.defaultdict = collections.defaultdict(dict) + else: + self.values: dict = self.categories - # We're going to store the categories in a dict of dicts. The outer dict will map each - # feature to its inner dict. The inner dict will map each category to its code. 
- self.categories: collections.defaultdict = collections.defaultdict(dict) def transform_one(self, x): return { - i: self.none_value if xi is None else self.categories[i].get(xi, self.unknown_value) + i: self.none_value if xi is None else self.values[i].get(xi, self.unknown_value) for i, xi in x.items() } def learn_one(self, x): - for i, xi in x.items(): - if xi is not None and xi not in self.categories[i]: - self.categories[i][xi] = next(self._counters[i]) + if self.categories == "auto": + for i, xi in x.items(): + if xi is not None and xi not in self.values[i]: + self.values[i][xi] = next(self._counters[i]) def transform_many(self, X): return pd.DataFrame( { i: pd.Series( X[i] - .map({**self.categories[i], None: self.none_value}) + .map({**self.values[i], None: self.none_value}) .fillna(self.unknown_value), dtype=np.int64, ) @@ -128,7 +155,8 @@ def transform_many(self, X): ) def learn_many(self, X, y=None): - for i in X.columns: - for xi in X[i].dropna().unique(): - if xi not in self.categories[i]: - self.categories[i][xi] = next(self._counters[i]) + if self.categories == "auto": + for i in X.columns: + for xi in X[i].dropna().unique(): + if xi not in self.values[i]: + self.values[i][xi] = next(self._counters[i]) From 4ff7bb2f53715cd4cf25d0f6a524190c5a12ec72 Mon Sep 17 00:00:00 2001 From: Alexey C <54956904+ColdTeapot273K@users.noreply.github.com> Date: Sat, 24 Aug 2024 18:02:06 +0500 Subject: [PATCH 4/9] fix mypy complaints --- river/preprocessing/ordinal.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/river/preprocessing/ordinal.py b/river/preprocessing/ordinal.py index 79b757c189..62a93ddf68 100644 --- a/river/preprocessing/ordinal.py +++ b/river/preprocessing/ordinal.py @@ -82,7 +82,7 @@ class OrdinalEncoder(base.MiniBatchTransformer): {'country': 0, 'place': 3} {'country': 0, 'place': 0} {'country': -1, 'place': -1} - + >>> import pandas as pd >>> xb1 = pd.DataFrame(X[0:4], index=[0, 1, 2, 3]) >>> xb2 = pd.DataFrame(X[4:8], 
index=[4, 5, 6, 7]) @@ -107,13 +107,14 @@ class OrdinalEncoder(base.MiniBatchTransformer): def __init__( self, - categories: str | dict = "auto", + categories = "auto", unknown_value: int | None = 0, none_value: int = -1, ): self.unknown_value = unknown_value self.none_value = none_value self.categories = categories + self.values: collections.defaultdict | dict | None = None if self.categories == "auto": # We're going to have one auto-incrementing counter per feature. This counter will generate @@ -124,9 +125,9 @@ def __init__( # We're going to store the categories in a dict of dicts. The outer dict will map each # feature to its inner dict. The inner dict will map each category to its code. - self.values: collections.defaultdict = collections.defaultdict(dict) + self.values = collections.defaultdict(dict) else: - self.values: dict = self.categories + self.values = self.categories def transform_one(self, x): From 86aafa04f3184b75a388953550a65e5ab951cd2a Mon Sep 17 00:00:00 2001 From: Alexey C <54956904+ColdTeapot273K@users.noreply.github.com> Date: Tue, 10 Sep 2024 13:54:39 +0500 Subject: [PATCH 5/9] Update river/preprocessing/one_hot.py Code review fixes Co-authored-by: Max Halford --- river/preprocessing/one_hot.py | 1 - 1 file changed, 1 deletion(-) diff --git a/river/preprocessing/one_hot.py b/river/preprocessing/one_hot.py index 7d3dc556a8..0b1c3c1cf0 100644 --- a/river/preprocessing/one_hot.py +++ b/river/preprocessing/one_hot.py @@ -91,7 +91,6 @@ class OneHotEncoder(base.MiniBatchTransformer): >>> categories = {'c1': {'a', 'h'}, 'c2': {'x', 'e'}} >>> oh = preprocessing.OneHotEncoder(categories=categories) - >>> # oh = preprocessing.OneHotEncoder() >>> for x in X: ... oh.learn_one(x) ... 
pprint(oh.transform_one(x)) From cbb202c43648719c231370438597f6d3c273f1b0 Mon Sep 17 00:00:00 2001 From: Alexey C <54956904+ColdTeapot273K@users.noreply.github.com> Date: Tue, 10 Sep 2024 13:54:47 +0500 Subject: [PATCH 6/9] Update river/preprocessing/one_hot.py Code review fixes Co-authored-by: Max Halford --- river/preprocessing/one_hot.py | 1 + 1 file changed, 1 insertion(+) diff --git a/river/preprocessing/one_hot.py b/river/preprocessing/one_hot.py index 0b1c3c1cf0..0207073405 100644 --- a/river/preprocessing/one_hot.py +++ b/river/preprocessing/one_hot.py @@ -226,6 +226,7 @@ class OneHotEncoder(base.MiniBatchTransformer): 0 0 0 0 0 1 1 0 0 1 2 0 0 0 0 + """ def __init__(self, categories = "auto", drop_zeros=False, drop_first=False): From 4ce2ade7f5bc872d6634ed39e46fa5c1af10efa4 Mon Sep 17 00:00:00 2001 From: Alexey C <54956904+ColdTeapot273K@users.noreply.github.com> Date: Wed, 2 Oct 2024 02:21:28 +0500 Subject: [PATCH 7/9] Adjust default params, update respective docs --- river/preprocessing/one_hot.py | 31 +++++++++++++++++++++++-------- river/preprocessing/ordinal.py | 23 ++++++++++++++--------- 2 files changed, 37 insertions(+), 17 deletions(-) diff --git a/river/preprocessing/one_hot.py b/river/preprocessing/one_hot.py index 0207073405..609f6f6178 100644 --- a/river/preprocessing/one_hot.py +++ b/river/preprocessing/one_hot.py @@ -19,6 +19,14 @@ class OneHotEncoder(base.MiniBatchTransformer): Parameters ---------- + categories + Categories (unique values) per feature: + `None` : Determine categories automatically from the training data. + + dict of dicts : Expected categories for each feature. The outer dict maps each feature to its inner dict. + The inner dict maps each category to its code. + + The used categories can be found in the `values` attribute. drop_zeros Whether or not 0s should be made explicit or not. 
drop_first @@ -26,6 +34,12 @@ class OneHotEncoder(base.MiniBatchTransformer): This is useful in some statistical models where perfectly collinear features cause problems. + Attributes + ---------- + values + A dict of dicts. The outer dict maps each feature to its inner dict. The inner dict maps + each category to its code. + Examples -------- @@ -157,6 +171,7 @@ class OneHotEncoder(base.MiniBatchTransformer): >>> from pprint import pprint >>> import random >>> import string + >>> import pandas as pd >>> random.seed(42) >>> alphabet = list(string.ascii_lowercase) @@ -218,7 +233,7 @@ class OneHotEncoder(base.MiniBatchTransformer): >>> oh = preprocessing.OneHotEncoder(categories=categories) - # oh = preprocessing.OneHotEncoder() + >>> oh.learn_many(X) >>> df = oh.transform_many(X) >>> df.sort_index(axis="columns") @@ -226,15 +241,15 @@ class OneHotEncoder(base.MiniBatchTransformer): 0 0 0 0 0 1 1 0 0 1 2 0 0 0 0 - + """ - def __init__(self, categories = "auto", drop_zeros=False, drop_first=False): + def __init__(self, categories: dict | None = None, drop_zeros=False, drop_first=False): self.drop_zeros = drop_zeros self.drop_first = drop_first self.categories = categories - if self.categories == "auto": + if self.categories is None: self.values = collections.defaultdict(set) else: self.values = self.categories @@ -245,7 +260,7 @@ def learn_one(self, x): # NOTE: assume if category mappings are explicitly provided, # they're intended to be kept fixed. - if self.categories == "auto": + if self.categories is None: for i, xi in x.items(): if isinstance(xi, list) or isinstance(xi, set): for xj in xi: @@ -263,7 +278,7 @@ def transform_one(self, x, y=None): # Add 1 # NOTE: assume if category mappings are explicitly provided, # no other category values are allowed for output. Aligns with `sklearn` behavior. 
- if self.categories == "auto": + if self.categories is None: for i, xi in x.items(): if isinstance(xi, list) or isinstance(xi, set): for xj in xi: @@ -291,7 +306,7 @@ def learn_many(self, X): # NOTE: assume if category mappings are explicitly provided, # they're intended to be kept fixed. - if self.categories == "auto": + if self.categories is None: for col in X.columns: self.values[col].update(X[col].unique()) @@ -300,7 +315,7 @@ def transform_many(self, X): # NOTE: assume if category mappings are explicitly provided, # no other category values are allowed for output. Aligns with `sklearn` behavior. - if self.categories != "auto": + if self.categories is not None: seen_in_the_past = {f"{col}_{val}" for col, vals in self.values.items() for val in vals} to_remove = set(oh.columns) - seen_in_the_past oh.drop(columns=list(to_remove), inplace=True) diff --git a/river/preprocessing/ordinal.py b/river/preprocessing/ordinal.py index 62a93ddf68..60346fb10c 100644 --- a/river/preprocessing/ordinal.py +++ b/river/preprocessing/ordinal.py @@ -22,6 +22,14 @@ class OrdinalEncoder(base.MiniBatchTransformer): Parameters ---------- + categories + Categories (unique values) per feature: + `None` : Determine categories automatically from the training data. + + dict of dicts : Expected categories for each feature. The outer dict maps each feature to its inner dict. + The inner dict maps each category to its code. + + The used categories can be found in the `values` attribute. unknown_value The value to use for unknown categories seen during `transform_one`. Unknown categories will be mapped to an integer once they are seen during `learn_one`. This value can be set @@ -31,7 +39,7 @@ class OrdinalEncoder(base.MiniBatchTransformer): Attributes ---------- - categories + values A dict of dicts. The outer dict maps each feature to its inner dict. The inner dict maps each category to its code. 
@@ -107,7 +115,7 @@ class OrdinalEncoder(base.MiniBatchTransformer): def __init__( self, - categories = "auto", + categories: dict | None = None, unknown_value: int | None = 0, none_value: int = -1, ): @@ -116,7 +124,7 @@ def __init__( self.categories = categories self.values: collections.defaultdict | dict | None = None - if self.categories == "auto": + if self.categories is None: # We're going to have one auto-incrementing counter per feature. This counter will generate # the category codes for each feature. self._counters: collections.defaultdict = collections.defaultdict( @@ -129,7 +137,6 @@ def __init__( else: self.values = self.categories - def transform_one(self, x): return { i: self.none_value if xi is None else self.values[i].get(xi, self.unknown_value) @@ -137,7 +144,7 @@ def transform_one(self, x): } def learn_one(self, x): - if self.categories == "auto": + if self.categories is None: for i, xi in x.items(): if xi is not None and xi not in self.values[i]: self.values[i][xi] = next(self._counters[i]) @@ -146,9 +153,7 @@ def transform_many(self, X): return pd.DataFrame( { i: pd.Series( - X[i] - .map({**self.values[i], None: self.none_value}) - .fillna(self.unknown_value), + X[i].map({**self.values[i], None: self.none_value}).fillna(self.unknown_value), dtype=np.int64, ) for i in X.columns @@ -156,7 +161,7 @@ def transform_many(self, X): ) def learn_many(self, X, y=None): - if self.categories == "auto": + if self.categories is None: for i in X.columns: for xi in X[i].dropna().unique(): if xi not in self.values[i]: From 58f237c1f4203bd9de4072431aae4fafeabf574c Mon Sep 17 00:00:00 2001 From: Alexey C <54956904+ColdTeapot273K@users.noreply.github.com> Date: Wed, 2 Oct 2024 02:26:07 +0500 Subject: [PATCH 8/9] Fix pre-commit hook complaints --- river/preprocessing/one_hot.py | 1 + 1 file changed, 1 insertion(+) diff --git a/river/preprocessing/one_hot.py b/river/preprocessing/one_hot.py index 609f6f6178..3272258d3b 100644 --- a/river/preprocessing/one_hot.py +++ 
b/river/preprocessing/one_hot.py @@ -248,6 +248,7 @@ def __init__(self, categories: dict | None = None, drop_zeros=False, drop_first= self.drop_zeros = drop_zeros self.drop_first = drop_first self.categories = categories + self.values: collections.defaultdict | dict | None = None if self.categories is None: self.values = collections.defaultdict(set) From fcf01f1b6625f488ece059b17e9cec98bef06d76 Mon Sep 17 00:00:00 2001 From: Alexey C <54956904+ColdTeapot273K@users.noreply.github.com> Date: Tue, 5 Nov 2024 20:47:58 +0500 Subject: [PATCH 9/9] Upd unreleased.md --- docs/releases/unreleased.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/releases/unreleased.md b/docs/releases/unreleased.md index 6d142f06e2..676ad294ed 100644 --- a/docs/releases/unreleased.md +++ b/docs/releases/unreleased.md @@ -12,3 +12,7 @@ ## tree - Instead of letting trees grow indefinitely, setting the `max_depth` parameter to `None` will stop the trees from growing when they reach the system recursion limit. + +## preprocessing + +- Add support for expected categories in `preprocessing.OneHotEncoder` and `preprocessing.OrdinalEncoder`, akin to the scikit-learn API for the respective encoders.