|
7 | 7 | import unicodedata |
8 | 8 | from string import punctuation |
9 | 9 | from itertools import takewhile, repeat |
10 | | -from six import xrange |
| 10 | +from six.moves import xrange |
11 | 11 |
|
12 | 12 | # installed modules |
13 | 13 | import numpy |
@@ -66,20 +66,22 @@ def prepare_string_for_db_input(s): |
66 | 66 |
|
67 | 67 |
|
68 | 68 | def make_ngrams(s, n): |
69 | | - s = u'{t}{s}{t}'.format(s=safe_unicode(s), t=('$' * (n - 1))) |
| 69 | + # s = u'{t}{s}{t}'.format(s=safe_unicode(s), t=('$' * (n - 1))) |
70 | 70 | return (s[i:i + n] for i in xrange(len(s) - n + 1)) |
71 | 71 |
|
72 | 72 |
|
73 | 73 | def get_similarity(x, y, n, similarity_name): |
74 | 74 | X, Y = set(make_ngrams(x, n)), set(make_ngrams(y, n)) |
| 75 | + intersec = len(X.intersection(Y)) |
| 76 | + |
75 | 77 | if similarity_name == 'dice': |
76 | | - return 2 * len(X & Y) / (len(X) + len(Y)) |
| 78 | + return 2 * intersec / (len(X) + len(Y)) |
77 | 79 | elif similarity_name == 'jaccard': |
78 | | - return len(X & Y) / len(X | Y) |
| 80 | + return intersec / (len(X) + len(Y) - intersec) |
79 | 81 | elif similarity_name == 'cosine': |
80 | | - return len(X & Y) / numpy.sqrt(len(X) * len(Y)) |
| 82 | + return intersec / numpy.sqrt(len(X) * len(Y)) |
81 | 83 | elif similarity_name == 'overlap': |
82 | | - return len(X & Y) |
| 84 | + return intersec |
83 | 85 | else: |
84 | 86 | msg = 'Similarity {} not recognized'.format(similarity_name) |
85 | 87 | raise TypeError(msg) |
|
0 commit comments