This repository has been archived by the owner on Jul 22, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathcreate_frequency_file
executable file
·275 lines (236 loc) · 11.3 KB
/
create_frequency_file
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
#!/usr/bin/env python3
# =============================================================================
# @file create_frequency_file
# @brief Create a frequency.csv or frequency.pklz from raw token frequencies
# @author Michael Hucka
# @license Please see the file named LICENSE in the project directory
# @website https://github.com/casics/extractor
# =============================================================================
import csv
import math
import plac
import re
import sys
from nltk.corpus import words as nltk_words
from nltk.corpus import wordnet as nltk_wordnet
from nltk.stem import SnowballStemmer
import enchant
# Make the sibling "spiral" directory importable so that the `frequencies`
# module can be found.  `os` has to be imported for the path computation;
# without it the lookup below always failed with NameError and the script
# silently fell back to a path relative to the current working directory.
import os
try:
    sys.path.append(os.path.join(os.path.dirname(__file__), "../spiral"))
except NameError:
    # __file__ is not defined in every execution context (e.g., when the
    # code is pasted into an interactive session); fall back to a path
    # relative to the current working directory.
    sys.path.append("../spiral")
import frequencies
# Main
# .............................................................................
def _parse_record(line, is_csv):
    '''Split one non-blank input line into a (token, frequency) pair.

    When `is_csv` is true, the line is parsed as a CSV record and only its
    first two columns are used; otherwise the line is split on runs of
    whitespace and must contain exactly two fields.  Raises ValueError if the
    line does not have that shape or if the frequency is not an integer.
    '''
    if is_csv:
        fields = next(csv.reader([line]), [])[:2]
    else:
        # str.split() without arguments splits on any run of spaces or tabs
        # and discards the trailing newline.
        fields = line.split()
    if len(fields) != 2:
        raise ValueError('expected a token and a frequency')
    return fields[0], int(fields[1])


@plac.annotations(
    inputfile  = ('input text file', 'option', 'i'),
    outputfile = ('output file', 'option', 'o'),
    threshold  = ('minimum frequency threshold', 'option', 't'),
    debug      = ('drop into ipdb opening files', 'flag', 'd'),
)
def main(inputfile=None, outputfile=None, threshold=0, debug=False):
    '''The input file should be a plain-text table of frequencies, with each
    line consisting of a token, some whitespace, and an integer; alternatively,
    the input file can be a csv file in which the first column has the tokens and
    the second column has the frequencies.  The output format is based on the
    file extension: .csv for a CSV file, .pklz for a compressed pickle file.

    Tokens whose frequency is below the threshold, and tokens rejected by
    filter(), are left out of the output.  Blank lines are ignored and
    malformed lines are reported and skipped.
    '''
    threshold = int(threshold)
    if not inputfile:
        raise SystemExit('Missing input file argument.')
    if not outputfile:
        raise SystemExit('Missing output file argument.')

    is_csv = inputfile.endswith('.csv')
    try:
        data = {}
        total = 0
        kept = 0
        with open(inputfile, 'r') as infile:
            if debug:
                import ipdb
                ipdb.set_trace()
            for line in infile:
                if not line.strip():
                    continue
                total += 1
                try:
                    token, frequency = _parse_record(line, is_csv)
                except ValueError:
                    # One bad line should not abort the whole run.
                    msg('skipping malformed line: {}', line.strip())
                    continue
                if frequency < threshold:
                    msg('{} below threshold -- dropping {}', frequency, token)
                    continue
                if filter(token):
                    continue
                data[token] = frequency
                kept += 1
        msg('{} strings read, {} kept.', total, kept)

        if outputfile.endswith('.csv'):
            with open(outputfile, 'w') as output:
                # Most frequent tokens first.
                for token, frequency in sorted(data.items(), reverse=True,
                                               key=lambda x: x[1]):
                    output.write('{},{}\n'.format(token, frequency))
        else:
            frequencies.save_frequencies_to_pickle(data, outputfile)
        msg('Output saved in {}.', outputfile)
    except Exception as err:
        # Top-level boundary: report the problem instead of a traceback.
        # The error is passed as a format argument because msg() treats its
        # first argument as a format template.
        msg('{}', err)
# Filter.
# .............................................................................
# The code below is an attempt to algorithmically remove stuff that is not
# desirable in a frequency table. This is a very conservative effort;
# ideally, far more stuff would be filtered out, but it's difficult to
# come up with filter rules that won't remove stuff that should be kept.
# I also didn't want to resort to manual editing of the frequency table because
# that would produce idiosyncratic results and would not be reproducible.
#
# Note: be careful about filtering things that have mixed case and naively
# might be thought to be multiword identifiers. I did this originally,
# thinking it would be safe to filter out strings that start with a capital
# letter and then have at least one more capital letter followed by a
# lowercase letter, such as "ABCFoo". Unfortunately this will catch things
# like LaTeX and PDFLaTeX, which people do write in mixed case. This would
# be bad for our goals. The code below does a limited version of this that
# (based on experimentation) does a reasonable job of removing one kind of
# pattern.
# Short words and abbreviations that often appear glued to the front or back
# of other text in identifiers.  They are used by the filter as candidate
# prefixes and suffixes.
common_elements = {
    'char', 'check', 'code', 'config', 'data', 'db', 'decode', 'dir',
    'encode', 'err', 'error', 'field', 'file', 'filter', 'fmt', 'format',
    'get', 'handler', 'host', 'id', 'image', 'info', 'input', 'io', 'list',
    'mem', 'mode', 'next', 'node', 'number', 'offset', 'opt', 'output',
    'pointer', 'prev', 'print', 'ptr', 'put', 'set', 'start', 'stop',
    'string', 'test', 'text', 'tuple', 'unit', 'value', 'var', 'version',
}

# Suffix candidates are exactly the common elements (same set object).
common_ends = common_elements

# Prefix candidates are the common elements plus a few words that are only
# treated as prefixes; this is a separate set object.
common_starts = common_elements | {'my', 'is', 'make'}
# Word sources consulted by in_dictionary(): the union of the NLTK word list
# and all WordNet lemma names, an Enchant dictionary for US English, and a
# Snowball stemmer for English.
nltk_dictionary = set(nltk_words.words()).union(nltk_wordnet.all_lemma_names())
enchant_dictionary = enchant.Dict('en_US')
stemmer = SnowballStemmer('english')
# The following exceptions were obtained while trying to find ways of filtering
# out obvious junk from the frequency table generated from 46k repos. There
# are probably other exceptions that should be here but were missed, and if
# we had a different set of repos, we'd probably catch different exceptions.
# This is imperfect, but IMHO better than nothing.
# Lower-case tokens that filter() always keeps, regardless of the heuristic
# rules it applies afterwards.  Note: every entry must be followed by a
# comma; adjacent string literals are silently concatenated by Python
# (this previously turned 'couchdb' and 'zodb' into 'couchdbzodb').
exceptions = {'ipython', 'caching', 'revoked', 'doxygen', 'cpython',
              'slashless', 'exotica', 'mathematica',
              'chunker', 'arctanh', 'arcsinh', 'arccosh', 'arcsech',
              'coursera', 'activex', 'butterworth', 'utorrent', 'minimap',
              'xdisplay', 'xwindows', 'icontact', 'icalendar',
              'crypto', 'kmeans', 'interp', 'approx', 'latin5', 'iframe',
              'sensei', 'jquery', 'gunzip', 'xapian', 'xenstore', 'csharp',
              'eeprom', 'iomega', 'asynchronously', 'wunderground',
              'texinfo', 'pdb', 'imdb', 'gdb', 'ipdb', 'mongodb', 'dynamodb',
              'mysqldb', 'mysql', 'bsddb', 'innodb', 'couchdb', 'zodb',
              'pydb', 'uuid', 'uid', 'bio', 'mercurio', 'stdio', 'stderr',
              'stdout', 'settings', 'sets', 'setup', 'setups', 'unicode',
              'cached', 'codecs', 'ident', 'coords', 'iscsi', 'dirichlet',
              'json', 'util', 'async', 'init', 'gzip', 'cron', 'cuda', 'mbox',
              'todo', 'eval', 'dest', 'proc', 'lang', 'xcode', 'kivy', 'x509',
              'ckan', 'imap', 'frag', 'numa', 'chan', 'anim', 'zope', 'cmake',
              'rsync', 'jpeg', 'plone', 'mgmt', 'dwim', 'i386', 'sftp', 'tftp',
              'fasta', 'wget', 'priv', 'bson', 'bokeh', 'excl', 'gmail',
              'inode', 'ebay', 'tahoe', 'xray', 'xbox', 's390', 'foaf', 'i486',
              'xfer', 'nmake', 'tmux', 'vram', 'sdram', 'haxe', 'iana', 'inet',
              'i686', 'eula', 'emph', 'sata', 'pata', 'uber', 'ipad', 'cdash',
              'uboot', 'acosh', 'asech', 'atanh', 'jira', 'vmware', 'iweb',
              'msec', 'usec', 'xref', 'toc'}
def filter(s):
    '''Decide whether token `s` should be left out of the frequency table.

    Returns True if the token is to be dropped (a 'dropping ...' message is
    printed in that case) and False if it is to be kept.
    '''
    def drop(token):
        # Report the rejected token and hand back the "filtered" verdict.
        msg('dropping {}', token)
        return True

    # Pure numbers are dropped.  Fast number detector from
    # https://stackoverflow.com/a/23639915/743730
    without_one_dot = s.replace('.', '', 1)
    if without_one_dot.isdigit():
        return drop(s)

    # Strings containing 3 upper case followed by 4 lower case letters, or
    # 4 upper case followed by 3 lower case letters, are dropped.  This is
    # a conservative test for one kind of multiword string that seemed to
    # produce few-to-no false positives in testing.
    for pattern in ('[A-Z][A-Z][A-Z][a-z][a-z][a-z][a-z]',
                    '[A-Z][A-Z][A-Z][A-Z][a-z][a-z][a-z]'):
        if re.search(pattern, s):
            return drop(s)

    # Everything from here on works on the lower case version of the string.
    s = s.lower()

    # Known exceptions are always kept.
    if s in exceptions:
        return False

    # Drop stuff like "e545", "line23", "case2" etc.  There is a risk this
    # catches some acronyms, but it is considered low enough; besides, for
    # Spiral, acronyms are handled elsewhere via a separate list.
    if re.search('^(e|error|page|line|case|test)[0-9]+$', s):
        return drop(s)

    # Drop things that are recognizable words bracketed by a single letter,
    # such as "openerp" or 'xflush'.  This requires care, because some
    # things are tokens we do want in the frequency table, so the rules are
    # very limited.  Note: things are not removed only because they have a
    # number at the end.  Example: lib2to3 should be left in.
    length = len(s)
    if 3 < length < 7 and not in_dictionary(s):
        if (in_dictionary(s[1:]) and not s.startswith('pre')
                and s[-1] not in ('s',)):
            return drop(s)
    if length > 5 and not in_dictionary(s):
        if in_dictionary(s[1:]) and not s.startswith('pre'):
            return drop(s)
        if in_dictionary(s[:-1]) and s[-1] not in ('s', 'r', 'd', 'g', 'y'):
            return drop(s)

    # Drop things that start or end with certain strings that are
    # recognizable as common contractions for separate words.
    if (multiple_words_starting_with(s, common_starts)
            or multiple_words_ending_with(s, common_ends)):
        return drop(s)

    # Drop what is recognized as two or more common words concatenated.
    if length > 7 and multiple_words(s):
        return drop(s)

    return False
def multiple_words_ending_with(s, endings_list):
    '''Return True if `s` looks like some text fused to a common ending.

    That is the case when `s` is not itself a dictionary word (according to
    in_dictionary()) and it ends with one of the strings in `endings_list`
    while being at least 2 characters longer than that ending.
    '''
    # The dictionary lookup does not depend on the ending being tried, so it
    # is done once up front rather than once per ending.
    if in_dictionary(s):
        return False
    # Require 2 chars more than the length of the ending, to be safer.
    return any(s.endswith(ending) and len(s) >= len(ending) + 2
               for ending in endings_list)
def multiple_words_starting_with(s, starts_list):
    '''Return True if `s` looks like a common start fused to more text.

    That is the case when `s` is not itself a dictionary word (according to
    in_dictionary()) and it begins with one of the strings in `starts_list`
    while being at least 2 characters longer than that start.
    '''
    # The dictionary lookup does not depend on the start being tried, so it
    # is done once up front rather than once per start.
    if in_dictionary(s):
        return False
    # Require 2 chars more than the length of the start, to be safer.
    return any(s.startswith(start) and len(s) >= len(start) + 2
               for start in starts_list)
def multiple_words(s):
    '''Return True if `s` can be split into concatenated dictionary words.

    Strings shorter than 7 characters, and strings that in_dictionary()
    recognizes, are never considered multiword.  Otherwise every split point
    that leaves at least 3 characters on the left and 4 on the right is
    tried: the right part must be in the NLTK dictionary, and the left part
    must either be in it too or itself be a multiword string.
    '''
    if len(s) < 7 or in_dictionary(s):
        return False
    for split_at in range(3, len(s) - 3):
        head, tail = s[:split_at], s[split_at:]
        if tail not in nltk_dictionary:
            continue
        if head in nltk_dictionary or multiple_words(head):
            return True
    return False
def in_dictionary(s):
    '''Return a true value if `s` or its stem is a known word.

    The string and its Snowball stem are looked up first in the NLTK-based
    dictionary and then, if neither is found there, in the Enchant
    dictionary.
    '''
    stem = stemmer.stem(s)
    if s in nltk_dictionary or stem in nltk_dictionary:
        return True
    return enchant_dictionary.check(s) or enchant_dictionary.check(stem)
def msg(string, *other_args):
    '''Like the standard print(), but treats the first argument as a string
    with format specifiers, and also flushes the output immediately.  Flushing
    immediately is useful when piping the output of a script, because Python
    by default will buffer the output in that situation and this makes it
    very difficult to see what is happening in real time.

    If the first argument is not a string (for example, an exception object),
    it is converted with str().  In that case it is printed verbatim when no
    other arguments are given, so that braces in an error message are not
    misread as format specifiers.
    '''
    if isinstance(string, str):
        text = string.format(*other_args)
    else:
        # Non-string objects have no .format() method; calling it used to
        # raise AttributeError (notably when reporting a caught exception).
        text = str(string)
        if other_args:
            text = text.format(*other_args)
    print(text, flush=True)
# Entry point.
# .............................................................................
# Only run the command-line interface when this file is executed as a
# script; plac builds the argument parser from the annotations on main().
if __name__ == '__main__':
    plac.call(main)