forked from bradhackinen/nama
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutilities.py
63 lines (41 loc) · 1.76 KB
/
utilities.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import pandas as pd
import numpy as np
from collections import Counter
import scipy.sparse as sparse
from unidecode import unidecode
def stringToAscii(s):
    """Transliterate *s* with ``unidecode`` and encode the result as ASCII.

    Returns whatever ``str.encode('ascii')`` produces for the transliterated
    text (a ``bytes`` object on Python 3).
    """
    return unidecode(s).encode('ascii')
def dfChunks(df, chunk_size):
    """Yield consecutive slices of *df*, each holding at most *chunk_size* items.

    The slices are taken with ``df[start:stop]`` at offsets
    ``0, chunk_size, 2*chunk_size, ...``; the last slice may be shorter.
    """
    for start in range(0, len(df), chunk_size):
        stop = start + chunk_size
        yield df[start:stop]
class BOW():
    """Bag-of-words vocabulary and sparse token-by-document count matrices.

    Documents are iterables of hashable tokens. After ``fit``:

    * ``tokensDF`` is a DataFrame with columns ``token`` and ``n_docs``
      (number of documents containing the token), sorted by ``n_docs``
      in descending order.
    * ``tokenid`` maps each kept token to its row index in ``tokensDF``,
      which is also its row in the matrices produced by this class.
    """

    def fit(self, docs, no_below=2, returnCountMatrix=True):
        """Build the vocabulary from *docs*.

        Parameters
        ----------
        docs : iterable of iterables of tokens
        no_below : tokens appearing in fewer than this many documents are
            dropped from the vocabulary.
        returnCountMatrix : if true, also return the count matrix for *docs*
            (saves a second pass of counting occurrences).

        Returns
        -------
        The sparse count matrix for *docs* when ``returnCountMatrix`` is
        true, otherwise ``None``.
        """
        occurrences, J = self.occurrences(docs)

        # Each (doc, token) key appears once, so counting the token part of
        # the keys gives the number of documents containing each token.
        docCounts = Counter(t for (_, t) in occurrences)
        kept = [(t, c) for t, c in docCounts.items() if c >= no_below]

        self.tokensDF = pd.DataFrame(kept, columns=['token', 'n_docs'])
        self.tokensDF = self.tokensDF.sort_values('n_docs', ascending=False).reset_index(drop=True)
        self.tokenid = {t: i for i, t in enumerate(self.tokensDF['token'])}

        if returnCountMatrix:
            return self.occurrencesToCountMatrix(occurrences, J)
        return None

    def countMatrix(self, docs):
        """Return the sparse (n_tokens x n_docs) count matrix for *docs*."""
        occurrences, J = self.occurrences(docs)
        return self.occurrencesToCountMatrix(occurrences, J)

    def frequencyMatrix(self, docs):
        """Return the count matrix with each column divided by its column sum.

        Columns whose sum is zero (documents with no in-vocabulary tokens)
        are left as all zeros instead of triggering a division by zero.
        """
        C = self.countMatrix(docs)
        totals = np.asarray(C.sum(axis=0))
        # Invert only the non-zero totals; zero totals keep a factor of 0.
        inverse = np.divide(1.0, totals,
                            out=np.zeros(totals.shape, dtype=float),
                            where=totals != 0)
        return C.multiply(inverse)

    def occurrencesToCountMatrix(self, occurrences, J=None):
        """Convert ``{(doc_index, token): count}`` into a sparse CSC matrix.

        Rows are token ids (from ``self.tokenid``), columns are document
        indices. Tokens not in the vocabulary are ignored.

        Parameters
        ----------
        occurrences : mapping of ``(doc_index, token)`` to count
        J : number of columns; when ``None`` it is inferred as the largest
            document index with an in-vocabulary token plus one (``0`` when
            there is none).
        """
        tokenid = self.tokenid
        triples = [(tokenid[t], j, c)
                   for (j, t), c in occurrences.items() if t in tokenid]
        # reshape keeps the array two-dimensional even when nothing matched,
        # so the column slicing below stays valid.
        C = np.array(triples, dtype=int).reshape(-1, 3)

        if J is None:
            J = int(C[:, 1].max()) + 1 if len(C) else 0

        shape = (len(self.tokensDF), J)
        return sparse.coo_matrix((C[:, 2], (C[:, 0], C[:, 1])), shape=shape).tocsc()

    def occurrences(self, docs):
        """Count tokens per document.

        Returns
        -------
        (counts, n_docs) : a ``Counter`` keyed by ``(doc_index, token)`` and
            the number of documents consumed from *docs* (``0`` for an
            empty iterable).
        """
        counts = Counter()
        n_docs = 0
        for j, doc in enumerate(docs):
            counts.update((j, t) for t in doc)
            n_docs = j + 1
        return counts, n_docs