forked from roomthily/ISO-19115-Distribution-Structures
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathword_count_cli.py
116 lines (90 loc) · 5.16 KB
/
word_count_cli.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
# -*- coding: utf-8 -*-
from optparse import OptionParser
import sys
import re
import dateutil.parser as dateparser
from itertools import chain
from semproc.bag_parser import BagParser
from semproc.nlp_utils import *
import warnings
# With no category/module arguments this installs a blanket "ignore" filter,
# so every warning raised after this point is silenced for the whole process
# (presumably to keep the command-line output limited to the word list).
warnings.filterwarnings('ignore')
'''
a little cli for the bag of words
count to handle the race conditions
in the regex
'''
def convert_header_list(headers):
    '''
    build a dict from a list of "key: value" strings

    each string is split on its first colon only, so
    any later colons stay in the value. keys are stripped
    and lower-cased, values are stripped. when a key
    repeats, the last value wins. a string with no colon
    raises ValueError.
    '''
    normalized = {}
    for header in headers:
        key, value = header.split(':', 1)
        normalized[key.strip().lower()] = value.strip()
    return normalized
def strip_dates(text):
    '''
    return '' when dateparser.parse accepts the text,
    otherwise return the text unchanged

    the text counts as "not a date" when the parser
    raises ValueError or OverflowError; any other
    exception propagates to the caller.
    '''
    # a leading 'NaN' is intentionally not trimmed before parsing: per the
    # original author's note it should still leave the text an invalid date
    try:
        dateparser.parse(text)
    except (ValueError, OverflowError):
        return text
    return ''
def strip_filenames(text):
    '''
    return '' when the text ends with one of the known
    file extensions, otherwise return the text unchanged

    the check is a plain, case-sensitive suffix match:
    no dot is required in front of the extension.
    '''
    known_extensions = (
        'png', 'jpg', 'hdf', 'xml', 'doc', 'pdf',
        'txt', 'jar', 'nc', 'XSL', 'kml', 'xsd',
    )
    if text.endswith(known_extensions):
        return ''
    return text
def strip_identifiers(texts):
# chuck any urns, urls, uuids
_pattern_set = [
('url', ur"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@
)))"""),
# a urn that isn't a url
('urn', ur"(?![http://])(?![https://])(?![ftp://])(([a-z0-9.\S][a-z0-9-.\S]{0,}\S:{1,2}\S)+[a-z0-9()+,\-.=@;$_!*'%/?#]+)"),
('uuid', ur'([a-f\d]{8}(-[a-f\d]{4}){3}-[a-f\d]{12}?)'),
('doi', ur"(10[.][0-9]{4,}(?:[/][0-9]+)*/(?:(?![\"&\\'])\S)+)"),
('md5', ur"([a-f0-9]{32})")
]
for pattern_type, pattern in _pattern_set:
for m in re.findall(re.compile(pattern), texts):
m = max(m) if isinstance(m, tuple) else m
try:
texts = texts.replace(m, '')
except Exception as ex:
print ex
print m
files = ['cat_interop_urns.txt', 'mimetypes.txt', 'namespaces.txt']
for f in files:
texts = remove_tokens(f, texts)
return texts.split()
def clean(text):
    '''
    run one token through the cleanup steps and return
    what is left (possibly an empty string)

    strip_dates goes first and its result is used as-is;
    after that each step receives the previous result and
    its own result is whitespace-stripped: remove_numeric,
    remove_punctuation, strip_terminal_punctuation and
    strip_filenames, in that order. the three middle
    helpers are defined outside this file.
    '''
    text = strip_dates(text)
    steps = (
        remove_numeric,
        remove_punctuation,
        strip_terminal_punctuation,
        strip_filenames,
    )
    for step in steps:
        # every step's output is stripped, so the next step always
        # receives text without surrounding whitespace
        text = step(text).strip()
    return text
def main():
    '''
    command-line entry point: read the xml file named by
    --file/-f, extract its text, clean each word, strip
    identifiers and write the remaining words to stdout
    as one space-separated line

    exits through OptionParser.error when no file is given,
    and with status 1 when the parser holds no xml after
    construction.
    '''
    op = OptionParser()
    op.add_option('--file', '-f')
    options, _ = op.parse_args()

    # get it from some file: sometimes the xml is too long for the
    # args, and quoting it on the command line is tedious
    if not options.file:
        op.error('No xml file')

    exclude_tags = ['schemaLocation', 'noNamespaceSchemaLocation']

    # read the raw bytes and pass them on untouched. calling
    # .encode('utf-8') on a byte string makes python 2 decode it as ascii
    # first, which raises UnicodeDecodeError for any non-ascii document;
    # for pure-ascii files the bytes handed to the parser are the same
    # as before.
    with open(options.file, 'rb') as f:
        xml_bytes = f.read()

    bp = BagParser(xml_bytes, True, False)
    if bp.parser.xml is None:
        sys.stderr.write('Failed xml parse')
        sys.exit(1)

    # only the second item of each strip_text entry is used; empty ones
    # are skipped and the rest are split into single words
    words = chain.from_iterable(
        b[1].split() for b in bp.strip_text(exclude_tags) if b[1])
    cleaned_text = [clean(w) for w in words]
    bow = strip_identifiers(' '.join([c for c in cleaned_text if c]))

    # NOTE(review): apostrophes are written unescaped, exactly as before -
    # the earlier .replace("'", "\'") swapped each apostrophe for itself.
    # add real escaping here if a consumer of this output needs it.
    sys.stdout.write(' '.join([b.encode('utf-8') for b in bow if b]) + '\n')


if __name__ == '__main__':
    main()