2
2
3
3
sys .path .insert (1 , os .path .join (".." , ".." , ".." ))
4
4
import h2o
5
- from tests import pyunit_utils , assert_equals
5
+ from tests import pyunit_utils
6
6
from h2o .estimators .knn import H2OKnnEstimator
7
7
import numpy as np
8
- from sklearn .neighbors import KNeighborsClassifier
9
8
from sklearn .neighbors import kneighbors_graph
10
9
import pandas as pd
11
10
@@ -15,35 +14,48 @@ def knn_sklearn_compare():
15
14
id_column = "id"
16
15
response_column = "class"
17
16
x_names = ["sepal_len" , "sepal_wid" , "petal_len" , "petal_wid" ]
17
+ k = 3
18
+ metrics = ["euclidean" , "manhattan" , "cosine" ]
18
19
19
20
train = pd .read_csv (pyunit_utils .locate ("smalldata/iris/iris_wheader.csv" ))
20
-
21
- knn = KNeighborsClassifier (n_neighbors = 3 )
22
- knn .fit (train [x_names ], train [response_column ])
23
- print (knn )
24
- knn_score = knn .score (train [x_names ], train [response_column ])
25
- print (knn_score )
26
-
27
- knn_graph = kneighbors_graph (train [x_names ], 3 , mode = 'connectivity' , include_self = False , metric = "euclidean" )
28
- print (knn_graph )
29
-
21
+
30
22
train_h2o = h2o .H2OFrame (train )
31
23
train_h2o [response_column ] = train_h2o [response_column ].asfactor ()
32
24
train_h2o [id_column ] = h2o .H2OFrame (np .arange (0 , train_h2o .shape [0 ]))
25
+
26
+ for metric in metrics :
27
+ print ("Check results for " + metric + " metric." )
28
+ sklearn_knn_graph = kneighbors_graph (train [x_names ],
29
+ k ,
30
+ mode = 'connectivity' ,
31
+ include_self = True ,
32
+ metric = metric )
33
33
34
- h2o_knn = H2OKnnEstimator (
35
- k = 3 ,
36
- id_column = id_column ,
37
- distance = "euclidean" ,
38
- seed = seed ,
39
- auc_type = "macroovr"
40
- )
34
+ h2o_knn = H2OKnnEstimator (k = k ,
35
+ id_column = id_column ,
36
+ distance = metric ,
37
+ seed = seed )
38
+
39
+ h2o_knn .train (y = response_column , x = x_names , training_frame = train_h2o )
40
+
41
+ distances_frame = h2o_knn .distances ().as_data_frame ()
42
+ assert distances_frame is not None
41
43
42
- h2o_knn .train (y = response_column , x = x_names , training_frame = train_h2o )
43
- distances_key = h2o_knn ._model_json ["output" ]["distances" ]
44
- print (distances_key )
45
- distances_frame = h2o .get_frame (distances_key )
46
- print (distances_frame )
44
+ diff = 0
45
+ allowed_diff = 20
46
+ for i in range (train .shape [0 ]):
47
+ sklearn_neighbours = sklearn_knn_graph [i ].nonzero ()[1 ]
48
+ for j in range (k ):
49
+ sklearn_n = sklearn_neighbours [j ]
50
+ h2o_n = distances_frame ["id_" + str (j + 1 )][i ]
51
+ if sklearn_n != h2o_n :
52
+ print (distances_frame .loc [[i ]])
53
+ print ("[" + str (i )+ "," + str (j )+ "] sklearn:h2o " + str (sklearn_n )+ " == " + str (h2o_n ))
54
+ diff += 1
55
+
56
+ # Some neighbours may be returned in a different order due to parallelization, so a limited number of mismatches is tolerated.
57
+ print ("Number of different neighbours: " + str (diff ))
58
+ assert diff < allowed_diff
47
59
48
60
49
61
if __name__ == "__main__" :
0 commit comments