-
Notifications
You must be signed in to change notification settings - Fork 63
Expand file tree
/
Copy pathfeature_extractor.py
More file actions
105 lines (98 loc) · 3.84 KB
/
Copy pathfeature_extractor.py
File metadata and controls
105 lines (98 loc) · 3.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
'''
'''
import io
import nltk
import numpy as np
import cPickle as pkl
import os.path
import string
from deep_dialog.tools import to_tokens
class FeatureExtractor:
    """Bag-of-n-grams featurizer with a pickled, on-disk vocabulary.

    The vocabulary maps each n-gram of length 1..N (tokens joined with '_')
    to an integer index, assigned in order of first appearance while scanning
    the corpus file and then the database file. Tokenization is delegated to
    ``to_tokens``.
    """

    def __init__(self, corpus_path, db_path, N=1):
        """Load the cached vocabulary, or build it and write the cache.

        Args:
            corpus_path: Path of a text file scanned line by line, or None to
                skip it. A path that is not an existing file is ignored.
            db_path: Path of a tab-separated file, or None to skip it. The
                cache file ``fdict_<N>.p`` lives in the same directory; when
                ``db_path`` is None no cache is read or written.
            N: Maximum n-gram length.

        Raises:
            IOError/OSError: If ``db_path`` cannot be opened or the cache
                file cannot be read or written.
        """
        # Maximum n-gram length used for both vocabulary and features.
        self.N = N

        # The cache sits next to the database file. os.path.dirname also
        # copes with a bare file name (no directory part), where splitting
        # on '/' would have produced a path *inside* the database file.
        save_path = None
        if db_path is not None:
            save_path = os.path.join(os.path.dirname(db_path),
                                     'fdict_%d.p' % N)

        if save_path is not None and os.path.isfile(save_path):
            # An existing cache wins even if the input files changed since
            # it was written; delete the file to force a rebuild.
            # NOTE(review): unpickling runs arbitrary code from the file, so
            # the cache directory must not be writable by untrusted parties.
            with open(save_path, 'rb') as f:
                # grams: dict mapping n-gram string -> feature index.
                self.grams = pkl.load(f)
                # n: number of vocabulary entries (next free index).
                self.n = pkl.load(f)
        else:
            self.grams = {}
            self.n = 0
            if corpus_path is not None:
                self._build_vocab_from_corpus(corpus_path)
            if db_path is not None:
                self._build_vocab_from_db(db_path)
            if save_path is not None:
                with open(save_path, 'wb') as f:
                    pkl.dump(self.grams, f)
                    pkl.dump(self.n, f)
        # Single-argument form prints identically on Python 2 and 3.
        print('Vocab Size = %d' % self.n)

    def _ngrams(self, tokens):
        """Yield every n-gram of length 1..N ending at each token position.

        Positions are visited left to right and, per position, shorter
        n-grams come first. Vocabulary indices depend on this order.
        """
        for end in range(1, len(tokens) + 1):
            # An n-gram cannot be longer than the prefix that ends here.
            for size in range(1, min(self.N, end) + 1):
                yield '_'.join(tokens[end - size:end])

    def _add_text(self, text):
        """Tokenize ``text`` and give each unseen n-gram the next index."""
        for ngram in self._ngrams(to_tokens(text)):
            if ngram not in self.grams:
                self.grams[ngram] = self.n
                self.n += 1

    def _scan_lines(self, path, handle_line):
        """Call ``handle_line`` on every line of the file at ``path``.

        The file is first read through ``io.open``. If a UnicodeDecodeError
        is raised at any point (while reading or inside ``handle_line``), the
        scan restarts from the first line using the builtin ``open``. Lines
        handled before the failure are handled again; ``_add_text`` skips
        n-grams that are already present. The file is closed on every path.
        """
        try:
            with io.open(path, 'r') as f:
                for line in f:
                    handle_line(line)
        except UnicodeDecodeError:
            with open(path, 'r') as f:
                for line in f:
                    handle_line(line)

    def _build_vocab_from_db(self, corpus):
        """Add n-grams from every tab-separated field after the first.

        Args:
            corpus: Path of the database file. Each line is stripped of
                trailing whitespace and split on tabs; the first field is
                skipped.
        """
        def add_fields(line):
            for field in line.rstrip().split('\t')[1:]:
                self._add_text(field)

        self._scan_lines(corpus, add_fields)

    def _build_vocab_from_corpus(self, corpus):
        """Add n-grams from every line of a text file.

        Args:
            corpus: Path of the corpus file. If it is not an existing file
                the call does nothing.
        """
        if not os.path.isfile(corpus):
            return
        self._scan_lines(corpus, lambda line: self._add_text(line.rstrip()))

    def featurize(self, text):
        """Return the n-gram count vector of ``text``.

        Args:
            text: String passed to ``to_tokens``.

        Returns:
            A float32 numpy array with one entry per vocabulary n-gram,
            holding how many times that n-gram occurs in ``text``. N-grams
            missing from the vocabulary are ignored.
        """
        vec = np.zeros(len(self.grams), dtype='float32')
        for ngram in self._ngrams(to_tokens(text)):
            index = self.grams.get(ngram)
            if index is not None:
                vec[index] += 1.
        return vec
if __name__ == '__main__':
    # Manual smoke test: build (or load) the vocabulary from the paths
    # below, which are relative to the current working directory, then dump
    # the vocabulary and two example feature vectors to stdout.
    F = FeatureExtractor('../data/corpora/selected_medium_corpus.txt',
                         '../data/selected_medium/db.txt')
    # Each print takes a single parenthesised argument, so the output is the
    # same whether print is a statement (Python 2) or a function (Python 3).
    print('\n'.join(F.grams))
    print(F.featurize('Please search for the movie with Matthew Saville as director'))
    print(F.featurize('I would like to see the movie with drama as genre'))