forked from bradhackinen/nama
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlsa.py
85 lines (58 loc) · 2.42 KB
/
lsa.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import os
import pandas as pd
import numpy as np
from sparsesvd import sparsesvd
from nama.utilities import *
from nama.defaults import *
from nama.tokenizers import *
class LSAModel():
def __init__(self,matcher,hasher=None,tokenizer=lambda s: nmgrams(s,1,3),k=50,weighting='norm',no_below=1,use_components=True):
self.bow = BOW()
self.hasher = hasher
self.tokenizer = tokenizer
self.k = k
self.weighting = weighting
self.no_below = no_below
if use_components:
componentTokens = ((t for s in c for t in self.tokenizer(s)) for c in matcher.components())
C = self.bow.fit(componentTokens,no_below=self.no_below)
else:
C = self.bow.fit((self.tokenizer(s) for s in matcher.G.nodes()),no_below=self.no_below)
if weighting:
if weighting == 'norm':
self.w = 1/np.sqrt(C.power(2).sum(axis=1))
elif weighting == 'idf':
self.w = 1/(np.log(1 + (C > 0).sum(axis=1)))
elif weighting == 'entropy':
P = C.multiply(1/C.sum(axis=1))
S = P.copy()
S.data = P.data*np.log(P.data)
self.w = 1 + (S.sum(axis=1) / np.log(C.shape[1]))
else:
raise Exception('Unknown weighting type:',weighting)
else:
self.w = np.ones((len(self.bow.tokensDF),1))
V,s,_ = sparsesvd(C.multiply(self.w).tocsc(),k=k)
self.V = V / s[:,np.newaxis]
def vectorizeStrings(self,strings,epsilon=1e-9,normalize=True):
C = self.bow.countMatrix((self.tokenizer(string) for string in strings))
vecs = C.T.dot(self.V.T)
if normalize:
vecs = vecs / np.sqrt((vecs**2).sum(axis=1))[:,np.newaxis]
return vecs
if __name__ == '__main__':
# Test code
import nama
from nama.matcher import Matcher
import nama.similarity as similarity
import cProfile as profile
from nama.hashes import corpHash
# Initialize the matcher
matcher = Matcher(['ABC Inc.','abc inc','A.B.C. INCORPORATED','The XYZ Company','X Y Z CO','ABC Inc.','XYZ Co.','ABC Inc.','XYZ Co.'])
# Add some corpHash matches
matcher.matchHash(nama.hashes.corpHash)
# Initalize a new, untrained similarity model
lsa = LSAModel(matcher)
lsa.vectorizeStrings(matcher.strings())
matcher.suggestMatches(lsa,min_string_count=0)
matcher.plotMatches()