forked from roomthily/ISO-19115-Distribution-Structures
load_word_counts.py
# coding: utf-8
import glob
import re
from datetime import datetime
import dateutil.parser as dateparser
from itertools import chain
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import WordListCorpusReader
from semproc.rawresponse import RawResponse
from semproc.bag_parser import BagParser
import json as js # name conflict with sqla
import sqlalchemy as sqla
from sqlalchemy.orm import sessionmaker
from sqlalchemy.dialects.postgresql import *
from sqlalchemy import and_, or_, not_
from mpp.models import Response
from mpp.models import BagOfWords
import warnings
warnings.filterwarnings('ignore')
# In[25]:
def convert_header_list(headers):
    '''
    convert from the list of strings, one string
    per kvp, to a dict with keys normalized
    '''
    return dict(
        (k.strip().lower(), v.strip()) for k, v in (
            h.split(':', 1) for h in headers)
    )
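# Illustrative usage (not part of the original script): headers arrive as raw
# "Key: Value" strings, e.g.
#   convert_header_list(['Content-Type: text/xml', 'Server: Apache'])
#   -> {'content-type': 'text/xml', 'server': 'Apache'}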
def remove_stopwords(text):
    '''
    remove any known english stopwords from a
    piece of text (bag of words or otherwise)
    '''
    _stopwords = set(stopwords.words('english'))
    words = word_tokenize(text)
    words = words if isinstance(words, list) else words.split()
    return ' '.join([w for w in words if w not in _stopwords and w])
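# Note: this assumes the NLTK data used above is already installed locally,
# e.g. via nltk.download('punkt') and nltk.download('stopwords').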
def load_token_list(term_file):
    '''
    load some stopword list from the corpus
    '''
    __location__ = '../corpora/'
    tokens = WordListCorpusReader(__location__, term_file)
    return [w.replace('+', '\+') for w in tokens.words()]
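# The '+' escaping above keeps entries containing '+' (for example a mimetype
# such as 'application/rdf+xml', assuming one appears in the word list) usable
# as literal alternatives in the regex that remove_tokens() builds below.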
def remove_tokens(term_file, text):
    '''
    run this before anything like tokenize or the
    resplit option will break the mimetypes apart so
    they are no longer recognizable as such
    '''
    words = load_token_list(term_file)
    pttn = re.compile('|'.join(words))
    return pttn.sub('', text)
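# Illustrative usage (the token files live under ../corpora/, per load_token_list):
#   remove_tokens('mimetypes.txt', 'served as application/vnd.google-earth.kml+xml')
# would drop the mimetype substring, assuming that entry is in mimetypes.txt.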
def remove_numeric(text):
    match_pttn = ur'\w*\b-?\d\s*\w*'
    captures = re.findall(match_pttn, u' {0} '.format(text))

    # strip them out
    if captures:
        text = re.sub('|'.join(captures), ' ', text)
        return '' if text == '0' else text
    return text
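# Illustrative: remove_numeric(u'elevation 250 m') drops the digit-bearing
# capture, leaving roughly u'elevation   m' (whitespace is not normalized here).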
def strip_dates(text):
    # this should still make it an invalid date
    # text = text[3:] if text.startswith('NaN') else text
    try:
        d = dateparser.parse(text)
        return ''
    except ValueError:
        return text
    except OverflowError:
        return text
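# Illustrative: anything dateutil can parse is dropped, so
# strip_dates(u'2015-06-01T00:00:00Z') returns '' while a non-date token
# such as u'salinity' comes back unchanged.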
def strip_filenames(text):
    # we'll see
    exts = ('png', 'jpg', 'hdf', 'xml', 'doc', 'pdf', 'txt', 'jar', 'nc', 'XSL', 'kml', 'xsd')
    return '' if text.endswith(exts) else text
def strip_identifiers(texts):
    # chuck any urns, urls, uuids
    _pattern_set = [
('url', ur"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))"""),
# a urn that isn't a url
('urn', ur"(?![http://])(?![https://])(?![ftp://])(([a-z0-9.\S][a-z0-9-.\S]{0,}\S:{1,2}\S)+[a-z0-9()+,\-.=@;$_!*'%/?#]+)"),
('uuid', ur'([a-f\d]{8}(-[a-f\d]{4}){3}-[a-f\d]{12}?)'),
('doi', ur"(10[.][0-9]{4,}(?:[/][0-9]+)*/(?:(?![\"&\\'])\S)+)"),
('md5', ur"([a-f0-9]{32})")
    ]

    for pattern_type, pattern in _pattern_set:
        for m in re.findall(re.compile(pattern), texts):
            # findall returns tuples when a pattern has more than one group
            m = max(m) if isinstance(m, tuple) else m
            try:
                texts = texts.replace(m, '')
            except Exception as ex:
                print ex
                print m

    files = ['cat_interop_urns.txt', 'mimetypes.txt', 'namespaces.txt']
    for f in files:
        texts = remove_tokens(f, texts)

    return texts.split()
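# Illustrative: a uuid such as u'0f8fad5b-d9cb-469f-a165-70867728950e' or a DOI
# like u'10.5067/ABC123' embedded in the text is removed before the known token
# lists are stripped and the remaining text is split into a word list.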
def remove_punctuation(text):
    simple_pattern = r'[;|>+:=.,()/?!\[\]{}]'
    text = re.sub(simple_pattern, ' ', text)
    text = text.replace(' - ', ' ').strip()
    return text if text != '-' else ''
def strip_punctuation(text):
    terminal_punctuation = '(){}[].,~|":'
    return text.strip(terminal_punctuation).strip()
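# Note the distinction: remove_punctuation() replaces punctuation anywhere in
# the string with spaces, while strip_punctuation() only trims it from the ends.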
def clean(text):
    text = strip_dates(text)
    text = remove_numeric(text)
    text = remove_punctuation(text.strip()).strip()
    text = strip_punctuation(text)
    text = strip_filenames(text)
    return text
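# Illustrative: clean() chains the helpers above, so a token that is a date,
# purely numeric, bare punctuation, or a known filename collapses to '' and is
# filtered out later, e.g. clean(u'2015-06-01') -> '' but clean(u'salinity,') -> u'salinity'.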
exclude_tags = ['schemaLocation', 'noNamespaceSchemaLocation']
# In[26]:
# grab the clean text from the rds
with open('big_rds.conf', 'r') as f:
    conf = js.loads(f.read())

# our connection
engine = sqla.create_engine(conf.get('connection'))
Session = sessionmaker()
Session.configure(bind=engine)
session = Session()
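# big_rds.conf is expected to be JSON with a 'connection' key holding a
# SQLAlchemy database URL; an assumed example (not taken from the repo):
#   {"connection": "postgresql://user:password@host:5432/dbname"}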
# In[27]:
clauses = [
    Response.format == 'xml',
    not_(or_(
        Response.cleaned_content.startswith("<rdf"),
        Response.cleaned_content.startswith("<RDF")
    ))
]

# get a count of the xml responses
TOTAL = session.query(Response).filter(
    and_(*clauses)).count()
START = 0
# In[ ]:
LIMIT = 100
# total = 5
# LIMIT= total

print 'TOTAL', TOTAL

for i in xrange(START, TOTAL, LIMIT):
    # get some responses
    responses = session.query(Response).filter(
        and_(*clauses)).limit(LIMIT).offset(i).all()
    print 'processing', i, len(responses)

    for response in responses:
        cleaned_content = response.cleaned_content

        # skip anything over ~1 MB: the identifier regexes can hang on very large documents
        if len(cleaned_content.encode('utf-8')) / 1048576.0 > 1.:
            print 'SKIPPING big file', response.id
            continue

        # strip the html cruft but ignore the a tags
        bp = BagParser(cleaned_content.encode('utf-8'), True, False)
        if bp.parser.xml is None:
            print 'NOT XML: ', cleaned_content[:100]
            continue

        # we don't care about the fully qualified namespace here
        stripped_text = [b[1].split() for b in bp.strip_text(exclude_tags) if b[1]]
        stripped_text = list(chain.from_iterable(stripped_text))
        cleaned_text = [s for s in stripped_text if clean(s)]
        bow = strip_identifiers(' '.join(cleaned_text))

        bag = BagOfWords(
            generated_on=datetime.now().isoformat(),
            bag_of_words=bow,
            method="basic",
            response_id=response.id
        )

        try:
            session.add(bag)
            session.commit()
        except Exception as ex:
            print 'failed ', response.id, ex
            session.rollback()

session.close()