Skip to content

Commit fdb52ce

Browse files
authored
Update change logs history (#1049)
* Investigate issue 1047 about LabelEncoder Signed-off-by: Xavier Dupre <[email protected]> * add another unit test Signed-off-by: Xavier Dupre <[email protected]> * update change logs Signed-off-by: Xavier Dupre <[email protected]> * disable a test for old version of scikit-learn Signed-off-by: Xavier Dupre <[email protected]> --------- Signed-off-by: Xavier Dupre <[email protected]>
1 parent ae29a33 commit fdb52ce

File tree

2 files changed

+98
-3
lines changed

2 files changed

+98
-3
lines changed

CHANGELOGS.md

+4-1
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,13 @@
44

55
* Supports cosine distance (LocalOutlierFactor, ...)
66
[#1050](https://github.com/onnx/sklearn-onnx/pull/1050),
7+
* Supports multiple columns for OrdinalEncoder
8+
[#1044](https://github.com/onnx/sklearn-onnx/pull/1044) (by @max-509)
79
* Add an example on how to handle FunctionTransformer
810
[#1042](https://github.com/onnx/sklearn-onnx/pull/1042),
911
Versions of `scikit-learn < 1.0` are not tested any more.
10-
* FeatureHasher, raise an error when the delimiter length is > 1,
12+
* Supports lists of strings as inputs for FeatureHasher
13+
[#1025](https://github.com/onnx/sklearn-onnx/pull/1025),
1114
[#1036](https://github.com/onnx/sklearn-onnx/pull/1036)
1215
* skl2onnx works with onnx==1.15.0,
1316
[#1034](https://github.com/onnx/sklearn-onnx/pull/1034)

tests/test_sklearn_ordinal_encoder.py

+94-2
Original file line numberDiff line numberDiff line change
@@ -2,17 +2,21 @@
22

33
"""Tests scikit-learn's OrdinalEncoder converter."""
44
import unittest
5+
from numpy.testing import assert_almost_equal
56
import packaging.version as pv
67
import numpy as np
78
import pandas as pd
89
import onnxruntime
910
from sklearn import __version__ as sklearn_version
11+
from sklearn.compose import ColumnTransformer
12+
from sklearn.pipeline import make_pipeline
13+
from sklearn.ensemble import RandomForestRegressor
1014

1115
try:
1216
from sklearn.preprocessing import OrdinalEncoder
1317
except ImportError:
1418
pass
15-
from skl2onnx import convert_sklearn
19+
from skl2onnx import convert_sklearn, to_onnx
1620
from skl2onnx.common.data_types import (
1721
Int64TensorType,
1822
StringTensorType,
@@ -30,6 +34,11 @@ def ordinal_encoder_support():
3034
return pv.Version(vers) >= pv.Version("0.20.0")
3135

3236

37+
def set_output_support():
    """Return True when the installed scikit-learn is at least version 1.2.

    Only the first two dot-separated components of ``sklearn_version``
    are kept for the comparison. The result is used to skip the tests
    that call ``set_output`` on a ``ColumnTransformer``.
    """
    major_minor = sklearn_version.split(".")[:2]
    installed = pv.Version(".".join(major_minor))
    minimum = pv.Version("1.2")
    return installed >= minimum
40+
41+
3342
class TestSklearnOrdinalEncoderConverter(unittest.TestCase):
3443
@unittest.skipIf(
3544
not ordinal_encoder_support(),
@@ -172,6 +181,89 @@ def test_model_ordinal_encoder_cat_list(self):
172181
data, model, model_onnx, basename="SklearnOrdinalEncoderCatList"
173182
)
174183

184+
@unittest.skipIf(
    not set_output_support(),
    reason="'ColumnTransformer' object has no attribute 'set_output'",
)
@unittest.skipIf(
    not ordinal_encoder_support(),
    reason="OrdinalEncoder was not available before 0.20",
)
def test_ordinal_encoder_pipeline_int64(self):
    """Compare scikit-learn and onnxruntime predictions for a small pipeline.

    The pipeline encodes one string column with an ``OrdinalEncoder``
    producing ``int64``, passes one ``float32`` column through, and feeds
    both to a ``RandomForestRegressor``. The column transformer is
    configured to output pandas frames.
    """
    from onnxruntime import InferenceSession

    # Two-row training frame: one categorical column, one float32 column.
    frame = pd.DataFrame({"cat": ["cat2", "cat1"], "num": [0, 1]})
    frame = frame.astype({"num": np.float32})
    target = np.array([0, 1], dtype=np.float32)

    column_steps = [
        ("cat", OrdinalEncoder(dtype=np.int64), ["cat"]),
        ("num", "passthrough", ["num"]),
    ]
    transformer = ColumnTransformer(
        transformers=column_steps,
        sparse_threshold=1,
        verbose_feature_names_out=False,
    )
    preprocessor = transformer.set_output(transform="pandas")
    forest = RandomForestRegressor(n_estimators=3, max_depth=2)
    pipeline = make_pipeline(preprocessor, forest)
    pipeline.fit(frame, target)
    expected = pipeline.predict(frame)

    # Convert using the first row as the input sample, then run the
    # resulting model on CPU.
    onx = to_onnx(pipeline, frame[:1], target_opset=TARGET_OPSET)
    session = InferenceSession(
        onx.SerializeToString(), providers=["CPUExecutionProvider"]
    )

    # One session input per dataframe column, each reshaped to a column vector.
    feeds = {
        column: frame[column].values.reshape((-1, 1))
        for column in ("cat", "num")
    }
    outputs = session.run(None, feeds)
    assert_almost_equal(expected, outputs[0].ravel())
223+
224+
@unittest.skipIf(
    not set_output_support(),
    reason="'ColumnTransformer' object has no attribute 'set_output'",
)
@unittest.skipIf(
    not ordinal_encoder_support(),
    reason="OrdinalEncoder was not available before 0.20",
)
def test_ordinal_encoder_pipeline_string_int64(self):
    """Compare scikit-learn and onnxruntime predictions with mixed categories.

    A single ``OrdinalEncoder`` (``int64`` output) handles two columns,
    one holding strings (``C1``) and one holding integers (``C2``). A
    ``float32`` column is passed through, and the result feeds a
    ``RandomForestRegressor``. The column transformer is configured to
    output pandas frames.
    """
    from onnxruntime import InferenceSession

    # Three-row training frame: string column, integer column, float32 column.
    frame = pd.DataFrame(
        {"C1": ["cat2", "cat1", "cat3"], "C2": [1, 0, 1], "num": [0, 1, 1]}
    )
    frame = frame.astype({"num": np.float32})
    target = np.array([0, 1, 2], dtype=np.float32)

    column_steps = [
        ("cat", OrdinalEncoder(dtype=np.int64), ["C1", "C2"]),
        ("num", "passthrough", ["num"]),
    ]
    transformer = ColumnTransformer(
        transformers=column_steps,
        sparse_threshold=1,
        verbose_feature_names_out=False,
    )
    preprocessor = transformer.set_output(transform="pandas")
    forest = RandomForestRegressor(n_estimators=3, max_depth=2)
    pipeline = make_pipeline(preprocessor, forest)
    pipeline.fit(frame, target)
    expected = pipeline.predict(frame)

    # Convert using the first row as the input sample, then run the
    # resulting model on CPU.
    onx = to_onnx(pipeline, frame[:1], target_opset=TARGET_OPSET)
    session = InferenceSession(
        onx.SerializeToString(), providers=["CPUExecutionProvider"]
    )

    # One session input per dataframe column, each reshaped to a column vector.
    feeds = {
        column: frame[column].values.reshape((-1, 1))
        for column in ("C1", "C2", "num")
    }
    outputs = session.run(None, feeds)
    assert_almost_equal(expected, outputs[0].ravel())
266+
175267

176268
if __name__ == "__main__":
177-
unittest.main()
269+
unittest.main(verbosity=2)

0 commit comments

Comments
 (0)