extractFutureWorks.py
"""Extract the abstract, introduction, conclusion and future-work sentences from
section-labelled XML files (<sectionHeader>/<bodyText> tags) and pickle the result."""
from bs4 import BeautifulSoup
from bs4 import BeautifulStoneSoup  # only needed for the commented-out parser call in main()
import nltk
import nltk.data
import re
import glob
import pickle
import pprint
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer

sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
stopwords = nltk.corpus.stopwords.words('english') + [',', '.', '!', '``', "''", '?', "'s", ';', '$', ':', "'", '_', ')', '(']
stemmer = PorterStemmer()

# paperId -> [abstract, future work, introduction, conclusion]
FW_ABS_Index = {}
def main():
    # basic
    #indicators = ['futur', 'plan']
    # expanded
    indicators = ['futur', 'work', 'plan', 'improv', 'explor', 'approach', 'perform', 'research', 'evalu', 'extend', 'would']
    output_pickle = 'FutureWorkAndAbstractPickle_expanded.pk'
    #output_pickle = 'temp'
    input_directory = "../cleanXMLdataV2/*.out"
    #input_directory = "../cleanXMLdataV2/P11-2088.out"
    #read_files = glob.glob("/Users/aditi_khullar/Documents/Dropbox/cleanXMLdataV2/*.out")
    read_files = glob.glob(input_directory)
    for xmlFile in read_files:
        paperId = xmlFile.split("/")[-1][0:-4]
        print(paperId)
        if paperId not in FW_ABS_Index:
            FW_ABS_Index[paperId] = ["Sample Abstract", "Sample Futurework", "Sample Intro", "Sample Con"]
        with open(xmlFile, 'r') as f:
            xmlData = f.read()
        #soup = BeautifulStoneSoup(xmlData, selfClosingTags=['sectionHeader','bodyText'])
        soup = BeautifulSoup(xmlData, 'xml')
        #print(soup.prettify())
        bodiesAWK = []
        # gets all of the lines after the conclusion and before the acknowledgements
        limit = 5  # assuming that there are not many separations between conclusion and acknowledgements
        bodiesCON = []
        bodyConText = ""
        for i, header in enumerate(soup.findAll('sectionHeader')):
            #print(i)
            if header['genericHeader'] == 'abstract':
                if header.find_next('bodyText') is not None:
                    paperAbs = get_abstract(header.find_next('bodyText').get_text())
                else:
                    paperAbs = " "
                FW_ABS_Index[paperId][0] = paperAbs
            if header['genericHeader'] == 'introduction':
                #startPointer = header.find_next('bodyText')
                #print("1:" + header.find_next('bodyText').get_text())
                #if header.find_next('bodyText') is not None:
                #    #print("2:" + header.find_next('bodyText').get_text())
                #    paperIntro = get_abstract(header.find_next('bodyText').get_text())
                #else:
                #    paperIntro = " "
                FW_ABS_Index[paperId][2] = get_abstract(get_bodyText(header))
                #print(FW_ABS_Index[paperId][2])
            # gets the conclusion section
            if header['genericHeader'] == 'conclusions' and float(header['confidence']) > 0.95:
                #print("1:" + header.find_next('bodyText').get_text())
                if header.find_next('bodyText') is not None:
                    #print("2:" + header.find_next('bodyText').get_text())
                    paperCon = get_abstract(header.find_next('bodyText').get_text())
                else:
                    paperCon = " "
                FW_ABS_Index[paperId][3] = paperCon
                #print("paperCon: " + FW_ABS_Index[paperId][3])
            # gets all of what we believe is the conclusion section to extract future work from
            if header['genericHeader'] == 'conclusions':
                if float(header['confidence']) < 0.95:
                    #bodiesCON = header.find_all_previous('bodyText', limit=3)
                    #print(header['confidence'])
                    bodyConText = get_abstract(prev_bodyText(header))
                else:
                    #print('bodies')
                    #print(header)
                    #print(header.find_next())
                    bodyConText = get_abstract(get_bodyText(header))  # bodiesCON + header.find_all_next('bodyText')
                #bodyConText = ""
                #for item in bodiesCON:
                #    bodyConText += item.get_text()
                #print(bodyConText)
                #print("-----------------")
                futureWorkText = get_futureWork_extended(bodyConText, indicators)
                FW_ABS_Index[paperId][1] = futureWorkText
                #print(futureWorkText)
                #bodiesCON = header.find_next('bodyText').find_next('bodyText')
    # print(len(FW_ABS_Index))
    # print("Abstract")
    # print(FW_ABS_Index['W10-4150'][0])
    # print("Future Work")
    # print(FW_ABS_Index['W10-4150'][1])
    # Writing the dictionary to a text file
    # fileW = open("FutureWorkAndAbstractIndex.txt", "w")
    # for key in FW_ABS_Index:
    #     values = ""
    #     print(FW_ABS_Index[key])
    #     for v in FW_ABS_Index[key]:
    #         values = values + " \n----\n " + v
    #     fileW.write("%s : %s\n" % (key, values))
    # Writing the dictionary to a pickle file
    output = open(output_pickle, 'wb')
    pickle.dump(FW_ABS_Index, output)
    output.close()
    return
def get_futureWork(all_text):
    ret = ""
    pattern = re.compile("future")
    for sent in sent_detector.tokenize(all_text.strip()):
        clean_sent = sent.lower().replace('\n', ' ').strip()
        clean_sent = clean_sent.replace("- ", "")
        if pattern.search(clean_sent):
            ret = ret + "\n" + clean_sent
    return ret


def get_futureWork_plus(all_text):
    ret = ""
    pattern1 = re.compile("future")
    pattern2 = re.compile("plan to")
    for sent in sent_detector.tokenize(all_text.strip()):
        clean_sent = sent.lower().replace('\n', ' ').strip()
        clean_sent = clean_sent.replace("- ", "")
        if pattern1.search(clean_sent):
            ret = ret + "\n" + clean_sent
        # TODO: make sure the sentence is not added already
        elif pattern2.search(clean_sent):
            ret = ret + "\n" + clean_sent
    return ret
def get_futureWork_extended(all_text, indicators):
    ret = ""
    #indicators = ['futur', 'work', 'use', 'plan', 'model', 'improv', 'system', 'research', 'method', 'featur', 'includ', 'investig', 'explor', 'direct', 'languag', 'would', 'data', 'evalu', 'approach', 'perform']
    #indicators = ['futur', 'work', 'plan', 'improv', 'explor', 'approach', 'perform', 'research', 'evalu', 'extend', 'would']
    for sent in sent_detector.tokenize(all_text.strip()):
        clean_sent = sent.lower().replace('\n', ' ').strip()
        clean_sent = clean_sent.replace("- ", "")
        for token in [t.lower() for t in nltk.word_tokenize(clean_sent)]:
            if token in stopwords:
                continue
            if stemmer:
                token = stemmer.stem(token)
            if token in indicators:
                ret = ret + "\n" + clean_sent
                break
    return ret
def get_abstract(all_text):
    ret = ""
    for sent in sent_detector.tokenize(all_text.strip()):
        clean_sent = sent.lower().replace('\n', ' ').strip()
        clean_sent = clean_sent.replace("- ", "")
        ret = ret + "\n" + clean_sent
    return ret
def get_bodyText(starterPointer):
    """Collect the text of every <bodyText> node between this header and the next <sectionHeader>."""
    saveText = " "
    if starterPointer is not None:
        #saveText = starterPointer.get_text()
        currentPoint = starterPointer.find_next()
        while currentPoint is not None:
            #print(currentPoint.name)
            if currentPoint.name == 'sectionHeader':
                break
            #elif currentPoint.name == 'page':
            #    print(currentPoint.name)
            elif currentPoint.name == 'bodyText':
                saveText = saveText + currentPoint.get_text()
            currentPoint = currentPoint.find_next()
        #print(saveText)
    return saveText
def prev_bodyText(starterPointer):
    """Collect the header text plus every preceding <bodyText> node back to the previous <sectionHeader>."""
    saveText = " "
    if starterPointer is not None:
        saveText = starterPointer.get_text()
        currentPoint = starterPointer.find_previous()
        while currentPoint is not None:
            #print(currentPoint.name)
            if currentPoint.name == 'sectionHeader':
                break
            #elif currentPoint.name == 'page':
            #    print(currentPoint.name)
            elif currentPoint.name == 'bodyText':
                saveText = saveText + currentPoint.get_text()
            currentPoint = currentPoint.find_previous()
    return saveText
if __name__ == "__main__":
    main()
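
# Usage sketch (not part of the original script; the file name and key below are
# assumptions taken from the defaults and examples above): the pickle maps each
# paper ID to a list [abstract, future work, introduction, conclusion] and can be
# reloaded like this:
#
#   import pickle
#   with open('FutureWorkAndAbstractPickle_expanded.pk', 'rb') as f:
#       index = pickle.load(f)
#   abstract, future_work, intro, conclusion = index['W10-4150']
#   print(future_work)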