-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathalgorithm.py
More file actions
168 lines (139 loc) · 58.7 KB
/
algorithm.py
File metadata and controls
168 lines (139 loc) · 58.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
# Algorithm for discerning high level information about folks in the civic technology scene
# 1. Categorize sentences into one of the categories under consideration:
# * Funding
# * Data
# * Employment
# * Collaboration
# * Location - from investigator
# 2. Once high level labels have been assigned, parse each sentence individually to do deep semantic analysis automatically
# * We do this by pulling out the named entities and using the corresponding action words to imply causality
# 3. Save to database
import spacy
import nltk
from spacy.parts_of_speech import *
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import string
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfTransformer #consider moving from CountVectorizer to this
from nltk.classify.scikitlearn import SklearnClassifier
import itertools
# from text_classify.algorithms import *
import pickle
def preprocess(sentence,label=None):
    """Convert a sentence into a bag-of-words feature dict.

    The sentence is tokenized with ``nltk.word_tokenize``, tokens found in
    NLTK's English stopword list are dropped, and the remaining tokens are
    counted.

    Args:
        sentence: the text to featurize.
        label: optional class label for the sentence.

    Returns:
        ``(features, label)`` when a label is supplied, otherwise just
        ``features``, where ``features`` maps each kept token to the number
        of times it occurs in the sentence.
    """
    # Load the stopword list once per call instead of once per token, and
    # use a set so each membership test is O(1).
    stop_words = set(stopwords.words("english"))
    tokens = [w for w in nltk.word_tokenize(sentence) if w not in stop_words]

    # Single counting pass (the previous tokens.count() per token was O(n^2)).
    features = {}
    for token in tokens:
        features[token] = features.get(token, 0) + 1

    # Compare against None explicitly so falsy labels such as 0 or "" are
    # still attached to the features rather than silently discarded.
    if label is not None:
        return (features, label)
    return features
def svm(train_data,preprocessing=True):
    """Train a linear SVM classifier on labelled sentences.

    Args:
        train_data: iterable of indexable records whose item 0 is the
            sentence text and item 1 is its label.
        preprocessing: accepted for interface compatibility; this function
            does not read it.

    Returns:
        The trained ``SklearnClassifier`` wrapping a ``LinearSVC``.
    """
    # Featurize every (sentence, label) record before handing it to NLTK.
    featurized = [
        preprocess(record[0], label=record[1]) for record in train_data
    ]
    classifier = SklearnClassifier(LinearSVC())
    classifier.train(featurized)
    return classifier
def tokenizeText(sample):
    """Split text on single spaces and drop uninformative tokens.

    A token is discarded when it appears in the module-level ``STOPLIST``,
    appears in the module-level ``SYMBOLS``, or is one of the blank /
    newline-only strings ``""``, ``" "``, ``"\\n"`` and ``"\\n\\n"``.

    Args:
        sample: the text to tokenize.

    Returns:
        The remaining tokens, in their original order.
    """
    # Whitespace-only leftovers produced by splitting on a single space.
    blanks = ("", " ", "\n", "\n\n")
    # One linear pass replaces the three separate filters and the repeated
    # `while x in tokens: tokens.remove(x)` scans, which were quadratic.
    return [
        tok for tok in sample.split(" ")
        if tok not in STOPLIST and tok not in SYMBOLS and tok not in blanks
    ]
def accuracy(classifier_name,classifier,test_data):
    """
    Return the fraction of test_data that classifier labels correctly.

    example usage:
    from text_classify.algorithms import *
    import pickle
    training_data = pickle.load(open("training_data","rb"))
    test_data = pickle.load(open("test_data","rb"))
    cl = svm(training_data)
    accuracy("svm",cl,test_data)
    the possible choices for classifiers at present are:
    svm, decision_tree, naive_bayes

    classifier_name: one of the names listed above; it selects how the
        classifier is evaluated.
    classifier: for "svm" / "decision_tree" an object with a
        classify(features) method; for "naive_bayes" an object with an
        accuracy(test_data) method.
    test_data: sequence of records whose item 0 is the sentence text and
        item 1 is the expected label.

    Returns a float for "svm" / "decision_tree" (0.0 when test_data is
    empty), whatever classifier.accuracy returns for "naive_bayes", and the
    string "no idea!" for any other classifier_name.
    """
    if classifier_name in ["svm","decision_tree"]:
        total = len(test_data)
        # Nothing to score: report 0.0 instead of dividing by zero.
        if total == 0:
            return 0.0
        # Featurize without a label so we always get the plain feature dict,
        # regardless of whether the expected label is truthy.
        correct = sum(
            1 for sample in test_data
            if classifier.classify(preprocess(sample[0])) == sample[1]
        )
        # float() keeps true division under Python 2 as well.
        return float(correct) / total
    if classifier_name == "naive_bayes":
        return classifier.accuracy(test_data)
    else:
        return "no idea!"
def readfile_tokenizedata(file):
    """Read a UTF-8 text file and split its contents into sentences.

    Args:
        file: path of the file to read.

    Returns:
        A list of sentence strings, each stripped of surrounding
        whitespace, as segmented by spaCy.
    """
    # Read raw bytes and decode explicitly: calling .decode() on the result
    # of a text-mode read only works on Python 2, whereas bytes -> unicode
    # works on both Python 2 and Python 3.
    with open(file,"rb") as f:
        raw = f.read()
    text = raw.decode('utf-8')
    # sentence tokenization using spaCy
    # NOTE(review): the "en" shortcut, the `tag=True` call argument and
    # `Span.string` look like an early spaCy API - confirm against the
    # installed spaCy version before upgrading.
    nlp = spacy.load("en")
    doc = nlp(text, tag=True) #tokenized doc
    sentences = [sent.string.strip() for sent in doc.sents]
    return sentences
# Tokens filtered out by tokenizeText: nltk's English stopwords, a few extra
# fragments, and scikit-learn's English stopword list.
STOPLIST = (
    set(stopwords.words('english'))
    | set(["n't", "'s", "'m", "ca"])
    | set(ENGLISH_STOP_WORDS)
)
# Every single punctuation character, plus a few multi-character separators.
SYMBOLS = list(string.punctuation) + ["-----", "---", "..."]
# TODO: Fill in the labels for each training set
# funding (directional)
labels = ['Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Funding', 'Funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 
'Funding', 'Not funding', 'Not funding', 'Funding', 'Funding', 'Funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Funding', 'Funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Funding', 'Funding', 'Not funding', 'Funding', 'Not funding', 'Funding', 'Not funding', 'Funding', 'Funding', 'Not funding', 'Funding', 'Funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Funding', 'Funding', 'Not funding', 'Not funding', 'Funding', 'Funding', 'Funding', 'Funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Funding', 'Not funding', 'Funding', 'Funding', 
'Funding', 'Not funding', 'Funding', 'Funding', 'Funding', 'Funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Funding', 'Not funding', 'Funding', 'Funding', 'Funding', 'Funding', 'Funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Funding', 'Not funding', 'Not funding', 'Funding', 'Funding', 'Funding', 'Funding', 'Funding', 'Not funding', 'Funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Funding', 'Funding', 'Not funding', 'Funding', 'Funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Funding', 'Funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Funding', 'Funding', 'Funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Funding', 'Not funding', 'Funding', 'Funding', 'Funding', 'Funding', 'Funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Funding', 'Funding', 
'Funding', 'Funding', 'Funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Funding', 'Funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Funding', 'Funding', 'Not 
funding', 'Funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Funding', 'Funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not 
funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 
'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 
'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Funding', 'Funding', 'Funding', 'Funding', 'Funding', 'Funding', 'Not funding', 'Funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Funding', 'Funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding', 'Not funding']
# data (directional)
labels2 = ['Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Data', 'Not data', 'Not data', 'Not data', 'Not data', 'Data', 'Not data', 'Data', 'Not data', 'Data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Data', 'Data', 'Not data', 'Not data', 'Not data', 'Not 
data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Data', 'Not data', 'Data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Data', 'Not data', 'Data', 'Data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Data', 'Not data', 'Not data', 'Not data', 'Data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data', 'Data', 'Not data', 'Not data', 'Not data', 'Not data', 'Not data']
# employment (bidirectional)
labels3 = ['Not employment', 'Not employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Not employment', 'Not employment', 'Employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Employment', 'Employment', 'Employment', 'Employment', 'Employment', 'Not employment', 'Not employment', 'Employment', 'Not employment', 'Not employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Not employment', 'Employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 
'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Employment', 'Employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not 
employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not 
employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Not employment', 'Not employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Not employment', 'Employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Not employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Not employment', 'Not employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not 
employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Not employment', 'Employment', 'Not employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Not employment', 'Employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 
'Not employment', 'Employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Employment', 'Employment', 'Employment', 'Not employment', 'Employment', 'Not employment', 'Not employment', 'Employment', 'Not employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Employment', 'Not employment', 'Not employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not 
employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Not employment', 'Employment', 'Not employment', 'Employment', 'Not employment', 'Not employment', 'Employment', 'Employment', 'Employment', 'Not employment', 'Not employment']
# Collaboration labels (bidirectional relationship): paired positionally with the sentences read from collaboration_training.txt
labels4 = ['Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Collaboration', 'Not collaboration', 'Collaboration', 'Not collaboration', 'Collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Collaboration', 'Collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Collaboration', 'Not collaboration', 'Not 
collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not 
collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not 
collaboration', 'Collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Collaboration', 'Not collaboration', 'Collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Collaboration', 'Not collaboration', 'Not collaboration', 'Collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not 
collaboration', 'Not collaboration', 'Not collaboration', 'Collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Collaboration', 'Collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 
'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 'Not collaboration', 
'Collaboration', 'Collaboration']
# Location labels (bidirectional relationship): paired positionally with the sentences read from location_training.txt
labels5 = ['Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Location', 'Not location', 'Not location', 'Not location', 'Location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Location', 'Location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Location', 'Not location', 'Not location', 'Location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 
'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Location', 'Not location', 'Not location', 'Not location', 'Location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Location', 'Location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not 
location', 'Not location', 'Not location', 'Not location', 'Location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Location', 'Location', 'Not location', 'Not location', 'Location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Location', 'Not location', 'Not location', 'Not location', 'Location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Location', 'Location', 'Not location', 'Not location', 'Not location', 'Location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Location', 'Not 
location', 'Not location', 'Not location', 'Not location', 'Not location', 'Location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Location', 'Not location', 'Not location', 'Not location', 'Location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Location', 'Not location', 'Not location', 'Location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Location', 'Not location', 'Not location', 'Not location', 'Location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Location', 'Location', 'Not location', 'Not location', 'Location', 'Not location', 'Not location', 'Location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not 
location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Location', 'Not location', 'Not location', 'Not location', 'Location', 'Not location', 'Not location', 'Not location', 'Location', 'Location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Location', 'Not location', 'Location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Location', 'Not location', 'Location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 
'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Location', 'Not location', 'Not location', 'Not location', 'Not location', 'Location', 'Not location', 'Not location', 'Not location', 'Location', 'Not location', 'Not location', 'Not location', 'Location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Location', 'Not location', 'Location', 'Location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location', 'Not location']
# read in and tokenize each training set
# Option 1: word tokenization (removes stopwords, symbols, whitespace chunks)
# with open("funding_training.txt", 'r') as f:
# data1 = f.read()
# for d in tokenizeText(data1):
# print d
# Option 2: sentence tokenization using spaCy
# Each training file is read and tokenized by readfile_tokenizedata(); the
# resulting items are later paired positionally with the label lists.
data1 = readfile_tokenizedata("funding_training.txt")
data2 = readfile_tokenizedata("data_training.txt")
data3 = readfile_tokenizedata("employment_training.txt")
data4 = readfile_tokenizedata("collaboration_training.txt")
data5 = readfile_tokenizedata("location_training.txt")

# Run every funding item through preprocess(). The loop variable itself is
# passed; the name `data` is not defined in this part of the script.
# NOTE(review): the return value is discarded, so this loop only exercises
# preprocess() -- confirm whether the extracted features were meant to be kept.
for d in data1:
    preprocess(d)

# Generate training set for each set of labels.
# list(...) keeps each training set a reusable sequence: on Python 3 a bare
# zip() is a one-shot iterator that is exhausted after its first pass.
# NOTE(review): zip() stops at the shorter input, so a length mismatch between
# a data set and its labels is silently truncated -- verify the lengths agree.
training1 = list(zip(data1, labels))
training2 = list(zip(data2, labels2))
training3 = list(zip(data3, labels3))
training4 = list(zip(data4, labels4))
training5 = list(zip(data5, labels5))

# Create classifier for each training set
cl1 = svm(training1)
cl2 = svm(training2)
cl3 = svm(training3)
cl4 = svm(training4)
cl5 = svm(training5)

# test set - this will eventually be all of the Civicist/TP content?
# NOTE(review): preprocess() receives the literal string "sample.txt" here, not
# the contents of that file -- confirm this is the input accuracy() expects.
test_data = preprocess("sample.txt")

# Report the accuracy of each classifier against the same test data.
for classifier in (cl1, cl2, cl3, cl4, cl5):
    accuracy("svm", classifier, test_data)