|
7 | 7 | import unicodedata
|
8 | 8 | from string import punctuation
|
9 | 9 | from itertools import takewhile, repeat
|
10 |
| -from six import xrange |
| 10 | +from six.moves import xrange |
11 | 11 |
|
12 | 12 | # installed modules
|
13 | 13 | import numpy
|
@@ -66,20 +66,22 @@ def prepare_string_for_db_input(s):
|
66 | 66 |
|
67 | 67 |
|
def make_ngrams(s, n):
    """Lazily produce every contiguous slice of length ``n`` taken from ``s``.

    Args:
        s: sliceable sequence (typically a string) to split into n-grams.
        n: length of each n-gram.

    Returns:
        A generator yielding ``s[i:i + n]`` for each start offset ``i`` from
        0 up to ``len(s) - n``; it yields nothing when ``s`` is shorter
        than ``n``.
    """
    # NOTE: boundary padding (wrapping ``s`` in ``n - 1`` '$' characters after
    # passing it through safe_unicode) is currently switched off, so ``s`` is
    # sliced exactly as given and edge n-grams carry no marker.
    window_count = len(s) - n + 1
    return (s[start:start + n] for start in xrange(window_count))
|
71 | 71 |
|
72 | 72 |
|
def get_similarity(x, y, n, similarity_name):
    """Score how similar ``x`` and ``y`` are based on their shared n-grams.

    Both inputs are split with ``make_ngrams`` and the n-grams are compared
    as sets, so an n-gram that occurs several times is counted once.

    Args:
        x: first sequence to compare.
        y: second sequence to compare.
        n: n-gram length forwarded to ``make_ngrams``.
        similarity_name: one of 'dice', 'jaccard', 'cosine' or 'overlap'.

    Returns:
        For 'dice', 'jaccard' and 'cosine', a floating point score; 0.0 is
        returned when the denominator would be zero (for example when both
        inputs are shorter than ``n`` and therefore produce no n-grams).
        For 'overlap', the integer number of shared n-grams.

    Raises:
        TypeError: if ``similarity_name`` is not one of the supported names.
    """
    X, Y = set(make_ngrams(x, n)), set(make_ngrams(y, n))
    intersec = len(X.intersection(Y))

    if similarity_name == 'dice':
        total = len(X) + len(Y)
        # Float literal keeps true division even where ``/`` on ints floors
        # (this module imports six, so it may run under Python 2).
        return 2.0 * intersec / total if total else 0.0
    elif similarity_name == 'jaccard':
        # |X u Y| computed by inclusion-exclusion; avoids building the union.
        union = len(X) + len(Y) - intersec
        return float(intersec) / union if union else 0.0
    elif similarity_name == 'cosine':
        norm = numpy.sqrt(len(X) * len(Y))
        # Guard the 0 / 0.0 case, which would otherwise yield NaN.
        return intersec / norm if norm else 0.0
    elif similarity_name == 'overlap':
        return intersec
    else:
        msg = 'Similarity {} not recognized'.format(similarity_name)
        raise TypeError(msg)
|
|
0 commit comments