Skip to content

Commit fea2d97

Browse files
committed
Fix categorical encoding
1 parent 2b67a49 commit fea2d97

File tree

4 files changed

+87
-5
lines changed

4 files changed

+87
-5
lines changed

h2o-algos/src/main/java/hex/knn/KNN.java

+4
Original file line number | Diff line number | Diff line change
@@ -41,6 +41,10 @@ public boolean isSupervised() {
4141
if( null == _parms._distance) {
4242
error("_distance", "Distance parameter not set.");
4343
}
44+
if (null != _parms._categorical_encoding && Model.Parameters.CategoricalEncodingScheme.Enum != _parms._categorical_encoding
45+
&& Model.Parameters.CategoricalEncodingScheme.AUTO != _parms._categorical_encoding) {
46+
error("_categorical_encoding", "Only enum categorical encoding is supported.");
47+
}
4448
}
4549

4650
class KNNDriver extends Driver {

h2o-algos/src/main/java/hex/knn/KNNModel.java

+3-1
Original file line number | Diff line number | Diff line change
@@ -76,10 +76,12 @@ public ModelMetrics.MetricBuilder makeMetricBuilder(String[] domain) {
7676

7777
@Override
7878
protected double[] score0(double[] data, double[] preds) {
79-
Frame train = _parms._train.get();
79+
Frame train = new Frame(_parms.train());
80+
adaptTestForTrain(train, false, false);
8081
int idIndex = train.find(_parms._id_column);
8182
int responseIndex = train.find(_parms._response_column);
8283
byte idType = train.types()[idIndex];
84+
8385
preds = new KNNScoringTask(data, _parms._k, _output.nclasses(), KNNDistanceFactory.createDistance(_parms._distance), idIndex, idType,
8486
responseIndex).doAll(train).score();
8587
Scope.untrack(train);
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,69 @@
1+
import sys, os
2+
3+
sys.path.insert(1, os.path.join("..", "..", ".."))
4+
import h2o
5+
from tests import pyunit_utils, assert_equals
6+
from h2o.estimators.knn import H2OKnnEstimator
7+
import numpy as np
8+
9+
10+
def knn_categorical_data():
    """Check which categorical encodings the KNN estimator accepts.

    Trains on the first 1000 rows of the airlines training set, using two
    categorical predictors ("Origin", "Dest") and one numeric predictor
    ("Distance"). Three configurations are exercised:

    * default (AUTO) encoding -- training and prediction must succeed;
    * explicit ``categorical_encoding="enum"`` -- training must succeed;
    * ``categorical_encoding="one_hot_explicit"`` -- training must be
      rejected with the expected validation error.

    Raises:
        AssertionError: if any expectation above is not met, including the
            case where the unsupported encoding is silently accepted.
    """
    seed = 12345
    id_column = "id"
    response_column = "IsDepDelayed"
    x_names = ["Origin", "Distance", "Dest"]

    train = h2o.upload_file(pyunit_utils.locate("smalldata/testng/airlines_train.csv"))
    train[response_column] = train[response_column].asfactor()
    train[id_column] = h2o.H2OFrame(np.arange(0, train.shape[0]))
    print(train.shape)
    train = train[0:1000, :]

    # Estimator settings shared by all three scenarios.
    common_params = dict(
        k=3,
        id_column=id_column,
        distance="euclidean",
        seed=seed,
        auc_type="macroovr",
    )

    # test AUTO categorical encoding
    model = H2OKnnEstimator(**common_params)

    model.train(y=response_column, x=x_names, training_frame=train)
    assert model is not None

    preds = model.predict(train)
    assert preds is not None

    # test enum categorical encoding
    model = H2OKnnEstimator(categorical_encoding="enum", **common_params)

    model.train(y=response_column, x=x_names, training_frame=train)

    # test different categorical encoding than enum
    try:
        model = H2OKnnEstimator(categorical_encoding="one_hot_explicit", **common_params)

        model.train(y=response_column, x=x_names, training_frame=train)
    except Exception as ex:
        exception = str(ex)
        assert ("H2OModelBuilderIllegalArgumentException" in exception)
        assert ("_categorical_encoding: Only enum categorical encoding is supported." in exception)
    else:
        # Without this branch the test would pass silently if the backend
        # stopped rejecting unsupported encodings.
        raise AssertionError(
            "Training with categorical_encoding='one_hot_explicit' should have been rejected."
        )
64+
65+
66+
# When the module is imported rather than executed as a script, call the test
# function directly; as a script, hand it to pyunit_utils.standalone_test.
if __name__ != "__main__":
    knn_categorical_data()
else:
    pyunit_utils.standalone_test(knn_categorical_data)

h2o-py/tests/testdir_algos/knn/pyunit_knn_smoke.py

+11-4
Original file line number | Diff line number | Diff line change
@@ -13,9 +13,9 @@ def knn_api_smoke():
1313
response_column = "class"
1414
x_names = ["sepal_len", "sepal_wid", "petal_len", "petal_wid"]
1515

16-
train_h2o = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
17-
train_h2o[response_column] = train_h2o[response_column].asfactor()
18-
train_h2o[id_column] = h2o.H2OFrame(np.arange(0, train_h2o.shape[0]))
16+
train = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
17+
train[response_column] = train[response_column].asfactor()
18+
train[id_column] = h2o.H2OFrame(np.arange(0, train.shape[0]))
1919

2020
model = H2OKnnEstimator(
2121
k=3,
@@ -24,7 +24,7 @@ def knn_api_smoke():
2424
seed=seed,
2525
auc_type="macroovr"
2626
)
27-
model.train(y=response_column, x=x_names, training_frame=train_h2o)
27+
model.train(y=response_column, x=x_names, training_frame=train)
2828
perf = model.model_performance()
2929

3030
print(perf)
@@ -37,6 +37,13 @@ def knn_api_smoke():
3737
distances = model.distances()
3838
assert distances is not None
3939

40+
preds = model.predict(train)
41+
assert preds is not None
42+
43+
print(preds[0,:])
44+
print(preds[51, :])
45+
print(preds[101, :])
46+
4047

4148
if __name__ == "__main__":
4249
pyunit_utils.standalone_test(knn_api_smoke)

0 commit comments

Comments
 (0)