# vectorize.py
# Forked from soundsinteresting/perpca.
import numpy as np
import matplotlib.pyplot as plt
import re
import os
import json
# Tokens that vectorize_words() skips when counting words.
# Membership is tested with exact string equality (`word in USUAL_WORDS`),
# so capitalised variants are listed as separate entries.
# The leading '' entry discards the empty strings that re.split() can produce
# at the start or end of a dialogue.
# NOTE: 'also' appears twice below; the duplicate is harmless for membership
# tests.
USUAL_WORDS = [ '','and','And','but','But','uh','like','or','if','If','because','so','So','also',
'of','for','to','in','In','on','out','at','with','about','as','into','by','from','than',
'the','The','that','thats','Thats','this','those','there','There',
'a','an','all','too','some','very',
'is','are','be','have','has','had','not','been','Ive',
'what','when','who','how','What','When','which',
'was','were','do','does','did','will','would','should','can','Ill','going','dont',
'want','get','got','think','thank','Thank','every','also','said','say','know','knew',
'Mr',
'I','Im','my','me','he','He','his','her','she','She',
'we','We','weve','us','our','you','You','your','They','they','theyre','them','Them','their','it','It','its','Its',
'D','000']
def create_vector(word_count, vocab_size, wn):
    """Turn a word -> count mapping into a dense count vector.

    Args:
        word_count: Mapping from word to its count.
        vocab_size: Length of the returned vector.
        wn: Mapping from word to its index in the vector. Words in
            ``word_count`` that are missing from ``wn`` are ignored.

    Returns:
        A 1-D float ``numpy`` array of length ``vocab_size`` holding the
        count of each known word at its index and zeros elsewhere.
    """
    vec = np.zeros(vocab_size)
    for word, count in word_count.items():
        # Single lookup instead of a membership test followed by indexing.
        index = wn.get(word)
        if index is not None:
            vec[index] = count
    return vec
def top_words(vector, number2word, top=5):
    """Return the words whose entries in ``vector`` have the largest magnitude.

    Args:
        vector: 1-D sequence or array of numbers; position ``i`` is looked
            up as ``number2word[i]``.
        number2word: Mapping (or sequence) from position to word.
        top: Maximum number of words to return.

    Returns:
        A list of at most ``top`` words. The words are selected with
        ``np.argpartition`` and are NOT sorted by magnitude. An empty list
        is returned when ``top`` is not positive or ``vector`` is empty.
    """
    magnitudes = np.abs(vector)
    # Clamp to the vector length: argpartition raises ValueError when the
    # requested kth position is out of bounds.
    count = min(int(top), len(magnitudes))
    if count <= 0:
        # Guard needed because -0 == 0: slicing with [-0:] would otherwise
        # return every position instead of none.
        return []
    positions = np.argpartition(magnitudes, -count)[-count:]
    return [number2word[int(position)] for position in positions]
def vectorize_words(source_dir=r"debate", min_count=5):
    """Build per-year bag-of-words matrices from JSON debate files.

    Every file in ``source_dir`` is loaded as JSON. Each entry of its
    ``'content'`` list contributes one row: its ``'dialogue'`` string is
    split on non-word characters, tokens listed in ``USUAL_WORDS`` are
    dropped, and the remaining tokens are counted. Only words whose total
    count over all files is strictly greater than ``min_count`` are kept in
    the vocabulary.

    Args:
        source_dir: Directory containing the debate JSON files. The year of
            a debate is read from characters ``[-9:-5]`` of the file path
            (assumes names ending in ``YYYY.json`` -- TODO confirm).
        min_count: A word is kept only if its total count exceeds this.

    Returns:
        A tuple ``(all_data, number2word, allyears)`` where ``all_data[i]``
        is a 2-D array (speeches x vocabulary) stacking every debate of
        ``allyears[i]``, ``number2word`` maps a column index to its word,
        and ``allyears`` is the sorted list of distinct years.

    Raises:
        ValueError: If a file path does not carry a four-digit year at
            ``[-9:-5]``.
    """
    # Sorted so that vocabulary indices and row order do not depend on the
    # arbitrary order in which os.listdir returns entries.
    filenames = [os.path.join(source_dir, name)
                 for name in sorted(os.listdir(source_dir))]
    # Build the stop-word set once; a list scan per token is needlessly slow.
    stop_words = set(USUAL_WORDS)

    all_dialogue = []
    word2count = dict()
    total_words = 0
    years = []
    for name in filenames:
        years.append(int(name[-9:-5]))
        # Context manager so the handle is closed even if parsing fails.
        with open(name, encoding="utf-8") as file:
            data = json.load(file)
        one_debate = []
        for speech in data['content']:
            speech_dict = dict()
            for word in re.split(r'\W+', speech['dialogue']):
                if word in stop_words:
                    continue
                word2count[word] = word2count.get(word, 0) + 1
                speech_dict[word] = speech_dict.get(word, 0) + 1
                # NOTE(review): counts retained (non stop-word) tokens only;
                # it feeds the progress message below and nothing else.
                total_words += 1
            # Speeches with no retained token produce no row.
            if speech_dict:
                one_debate.append(speech_dict)
        all_dialogue.append(one_debate)

    # Most frequent words first; the sort is stable, so ties keep the order
    # in which the words were first seen.
    wc = {k: v
          for k, v in sorted(word2count.items(),
                             key=lambda item: item[1], reverse=True)
          if v > min_count}
    print("total words %d, total dictionary %d" % (total_words, len(wc)))
    print("total debates %d" % len(all_dialogue))

    word2number = {w: i for i, w in enumerate(wc.keys())}
    number2word = {i: w for i, w in enumerate(wc.keys())}
    vocab_size = len(word2number)

    allyears = sorted(set(years))
    all_data = []
    for year in allyears:
        year_blocks = []
        # years and all_dialogue are parallel lists (one entry per file).
        for debate_year, diags in zip(years, all_dialogue):
            if debate_year != year:
                continue
            debate = np.zeros((len(diags), vocab_size))
            for i, speech in enumerate(diags):
                debate[i] = create_vector(speech, vocab_size, word2number)
            year_blocks.append(debate)
        all_data.append(np.concatenate(year_blocks))
    return all_data, number2word, allyears