Skip to content

Commit f8db051

Browse files
committed
Fixed discrepancy between SimString and the QuickUMLS app
1 parent 19044a1 commit f8db051

File tree

1 file changed

+8
-6
lines changed

1 file changed

+8
-6
lines changed

toolbox.py

+8 -6
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import unicodedata
88
from string import punctuation
99
from itertools import takewhile, repeat
10-
from six import xrange
10+
from six.moves import xrange
1111

1212
# installed modules
1313
import numpy
@@ -66,20 +66,22 @@ def prepare_string_for_db_input(s):
6666

6767

6868
def make_ngrams(s, n):
69-
s = u'{t}{s}{t}'.format(s=safe_unicode(s), t=('$' * (n - 1)))
69+
# s = u'{t}{s}{t}'.format(s=safe_unicode(s), t=('$' * (n - 1)))
7070
return (s[i:i + n] for i in xrange(len(s) - n + 1))
7171

7272

7373
def get_similarity(x, y, n, similarity_name):
7474
X, Y = set(make_ngrams(x, n)), set(make_ngrams(y, n))
75+
intersec = len(X.intersection(Y))
76+
7577
if similarity_name == 'dice':
76-
return 2 * len(X & Y) / (len(X) + len(Y))
78+
return 2 * intersec / (len(X) + len(Y))
7779
elif similarity_name == 'jaccard':
78-
return len(X & Y) / len(X | Y)
80+
return intersec / (len(X) + len(Y) - intersec)
7981
elif similarity_name == 'cosine':
80-
return len(X & Y) / numpy.sqrt(len(X) * len(Y))
82+
return intersec / numpy.sqrt(len(X) * len(Y))
8183
elif similarity_name == 'overlap':
82-
return len(X & Y)
84+
return intersec
8385
else:
8486
msg = 'Similarity {} not recognized'.format(similarity_name)
8587
raise TypeError(msg)

0 commit comments

Comments
 (0)