utils/create_frequency_file

#!/usr/bin/env python3
# =============================================================================
# @file    create_frequency_file
# @brief   Create a frequency.csv or frequency.pklz from raw token frequencies
# @author  Michael Hucka
# @license Please see the file named LICENSE in the project directory
# @website https://github.com/casics/extractor
# =============================================================================

import csv
import math
import os
import re
import sys

import enchant
import plac
from nltk.corpus import words as nltk_words
from nltk.corpus import wordnet as nltk_wordnet
from nltk.stem import SnowballStemmer

# Extend the module search path with the sibling "spiral" directory, located
# relative to this script (presumably where the `frequencies` module imported
# just below lives -- confirm against the project layout).
try:
    sys.path.append(os.path.join(os.path.dirname(__file__), "../spiral"))
except NameError:
    # __file__ is not defined in every execution context (e.g. when the code
    # is pasted into an interactive session); fall back to a path relative to
    # the current working directory.  Only NameError is caught so that other
    # problems are not silently hidden.
    sys.path.append("../spiral")

import frequencies


# Main
# .............................................................................

@plac.annotations(
    inputfile  = ('input text file',                     'option', 'i'),
    outputfile = ('output file',                         'option', 'o'),
    threshold  = ('minimum frequency threshold',         'option', 't'),
    debug      = ('drop into ipdb opening files',        'flag',   'd'),
)
def main(inputfile=None, outputfile=None, threshold=0, debug=False):
    '''The input file should be a plain-text table of frequencies, with each
line consisting of a token, some whitespace, and an integer; alternatively,
the input file can be a csv file in which the first column has the tokens and
the second column has the frequencies.  The output format is based on the
file extension: .csv for a CSV file, .pklz for a compressed pickle file.
'''
    # Outline of what happens below:
    #   1. Validate the arguments; a missing file name ends the program via
    #      SystemExit.  `threshold` arrives as a string when given on the
    #      command line, hence the int() conversion.
    #   2. Read (token, frequency) pairs, dropping entries whose frequency is
    #      below `threshold` and entries rejected by filter().
    #   3. Write the survivors, either as "token,frequency" lines sorted by
    #      decreasing frequency (.csv) or, for any other output file name,
    #      through frequencies.save_frequencies_to_pickle().
    # Any exception raised in steps 2-3 is reported with msg() rather than
    # propagated.
    threshold = int(threshold)
    if not inputfile:
        raise SystemExit('Missing input file argument.')
    if not outputfile:
        raise SystemExit('Missing output file argument.')

    # str.split() accepts one separator string, or None meaning "any run of
    # whitespace"; it does not accept a tuple of alternative separators.
    delimiter = ',' if inputfile.endswith('.csv') else None
    try:
        data = {}
        with open(inputfile, 'r') as infile:
            if debug:
                import ipdb; ipdb.set_trace()
            total = 0
            kept = 0
            for lineno, line in enumerate(infile, start=1):
                line = line.strip()
                if not line:
                    # Blank lines (e.g. a trailing newline at the end of the
                    # file) carry no entry.
                    continue
                total += 1
                try:
                    token, frequency = (field.strip()
                                        for field in line.split(delimiter))
                    if not token:
                        raise ValueError('empty token')
                    count = int(frequency)
                except ValueError as err:
                    # Covers a wrong number of fields, an empty token and a
                    # non-integer frequency; say where the bad line is.
                    raise ValueError(
                        '{}, line {}: expected a token and an integer '
                        'frequency, got {!r}'.format(inputfile, lineno, line)
                    ) from err
                if count < threshold:
                    msg('{} below threshold -- dropping {}', count, token)
                    continue
                if filter(token):
                    continue
                data[token] = count
                kept += 1
            msg('{} strings read, {} kept.', total, kept)
        if outputfile.endswith('.csv'):
            with open(outputfile, 'w') as output:
                for token, count in sorted(data.items(), reverse=True,
                                           key=lambda item: item[1]):
                    output.write('{},{}\n'.format(token, count))
        else:
            frequencies.save_frequencies_to_pickle(data, outputfile)
        msg('Output saved in {}.', outputfile)
    except Exception as err:
        # msg() calls .format() on its first argument, so that argument must
        # be a format string -- never the exception object itself.
        msg('{}: {}', type(err).__name__, err)


# Filter.
# .............................................................................
# The code below is an attempt to algorithmically remove stuff that is not
# desirable in a frequency table.  This is a very conservative effort;
# ideally, far more stuff would be filtered out, but it's difficult to
# come up with filter rules that won't remove stuff that should be kept.
# I also didn't want to resort to manual editing of the frequency table because
# that would produce idiosyncratic results and would not be reproducible.
#
# Note: be careful about filtering things that have mixed case and naively
# might be thought to be multiword identifiers.  I did this originally,
# thinking it would be safe to filter out strings that start with a capital
# letter and then have at least one more capital letter followed by a
# lowercase letter, such as "ABCFoo".  Unfortunately this will catch things
# like LaTeX and PDFLaTeX, which people do write in mixed case.  This would
# be bad for our goals.  The code below does a limited version of this that
# (based on experimentation) does a reasonable job of removing one kind of
# pattern.

# Short words and abbreviations that filter() looks for at the start or at
# the end of a token, by way of multiple_words_starting_with() and
# multiple_words_ending_with().
common_elements = {
    'list', 'version', 'input', 'output', 'pointer', 'ptr', 'data', 'tuple',
    'print', 'image', 'err', 'error', 'node', 'code', 'mode', 'value',
    'number', 'handler', 'test', 'io', 'db', 'info', 'id', 'set', 'put',
    'get', 'unit', 'encode', 'decode', 'opt', 'format', 'fmt', 'text',
    'file', 'dir', 'check', 'start', 'stop', 'string', 'offset', 'mem',
    'field', 'host', 'var', 'char', 'next', 'prev', 'filter', 'config',
}

# The endings are the shared set itself (the same object, not a copy).
common_ends = common_elements

# The starts are a separate set: the shared elements plus a few extra words
# that are only used for the prefix test.
common_starts = common_elements | {'my', 'is', 'make'}

# Word set consulted by in_dictionary() and multiple_words(): everything in
# the NLTK "words" corpus combined with every WordNet lemma name.
nltk_dictionary = set(nltk_words.words()).union(nltk_wordnet.all_lemma_names())

# Additional dictionary consulted by in_dictionary(): enchant's en_US.
enchant_dictionary = enchant.Dict('en_US')

# English Snowball stemmer; in_dictionary() also looks up the stemmed form of
# the string it is given.
stemmer = SnowballStemmer('english')

# The following exceptions were obtained while trying to find ways of filtering
# out obvious junk from the frequency table generated from 46k repos.  There
# are probably other exceptions that should be here but were missed, and if
# we had a different set of repos, we'd probably catch different exceptions.
# This is imperfect, but IMHO better than nothing.

# Tokens that filter() keeps unconditionally once it has lower-cased its
# argument (the pure-number and upper/lower-case-run checks happen before
# this set is consulted).  Every entry must be lower case and must be
# followed by a comma: adjacent string literals without a comma are silently
# concatenated into a single entry.
exceptions = {'ipython', 'caching', 'revoked', 'doxygen', 'cpython',
              'slashless', 'exotica', 'mathematica',
              'chunker', 'arctanh', 'arcsinh', 'arccosh', 'arcsech',
              'coursera', 'activex', 'butterworth', 'utorrent', 'minimap',
              'xdisplay', 'xwindows', 'icontact', 'icalendar',
              'crypto', 'kmeans', 'interp', 'approx', 'latin5', 'iframe',
              'sensei', 'jquery', 'gunzip', 'xapian', 'xenstore', 'csharp',
              'eeprom', 'iomega', 'asynchronously', 'wunderground',
              'texinfo', 'pdb', 'imdb', 'gdb', 'ipdb', 'mongodb', 'dynamodb',
              'mysqldb', 'mysql', 'bsddb', 'innodb', 'couchdb', 'zodb',
              'pydb', 'uuid', 'uid', 'bio', 'mercurio', 'stdio', 'stderr',
              'stdout', 'settings', 'sets', 'setup', 'setups', 'unicode',
              'cached', 'codecs', 'ident', 'coords', 'iscsi', 'dirichlet',
              'json', 'util', 'async', 'init', 'gzip', 'cron', 'cuda', 'mbox',
              'todo', 'eval', 'dest', 'proc', 'lang', 'xcode', 'kivy', 'x509',
              'ckan', 'imap', 'frag', 'numa', 'chan', 'anim', 'zope', 'cmake',
              'rsync', 'jpeg', 'plone', 'mgmt', 'dwim', 'i386', 'sftp', 'tftp',
              'fasta', 'wget', 'priv', 'bson', 'bokeh', 'excl', 'gmail',
              'inode', 'ebay', 'tahoe', 'xray', 'xbox', 's390', 'foaf', 'i486',
              'xfer', 'nmake', 'tmux', 'vram', 'sdram', 'haxe', 'iana', 'inet',
              'i686', 'eula', 'emph', 'sata', 'pata', 'uber', 'ipad', 'cdash',
              'uboot', 'acosh', 'asech', 'atanh', 'jira', 'vmware', 'iweb',
              'msec', 'usec', 'xref', 'toc'}

def filter(s):
    '''Tell whether token `s` should be left out of the frequency table.

    Returns True (after reporting the token through msg()) when one of the
    rules below rejects the token, and False when the token should be kept.
    '''
    def announce_drop(token):
        # Report the rejected token and hand back the "filter it" verdict.
        msg('dropping {}', token)
        return True

    # Rule 1: plain numbers, with at most one decimal point.
    # Fast number detector from https://stackoverflow.com/a/23639915/743730
    if s.replace('.', '', 1).isdigit():
        return announce_drop(s)

    # Rule 2: a run of 3 upper-case letters followed by 4 lower-case letters,
    # or a run of 4 upper-case letters followed by 3 lower-case letters.
    # This is a deliberately conservative test for one kind of multiword
    # string.
    case_run = (re.search('[A-Z][A-Z][A-Z][a-z][a-z][a-z][a-z]', s)
                or re.search('[A-Z][A-Z][A-Z][A-Z][a-z][a-z][a-z]', s))
    if case_run:
        return announce_drop(s)

    # Everything from here on works with the lower-case form of the token.
    s = s.lower()

    # Tokens on the exceptions list are always kept.
    if s in exceptions:
        return False

    # Rule 3: things such as "e545", "line23" or "case2".  Some acronym might
    # get caught by this, but the risk is judged to be low.
    if re.search('^(e|error|page|line|case|test)[0-9]+$', s):
        return announce_drop(s)

    # Rule 4: a recognizable word with one extra letter in front of it or
    # behind it, such as "openerp" or "xflush".  The conditions are kept
    # narrow on purpose because many similar-looking tokens are wanted in the
    # table.  A digit at the end is not, on its own, a reason for removal
    # (lib2to3 should stay).
    length = len(s)
    if 3 < length < 7 and not in_dictionary(s):
        if (in_dictionary(s[1:]) and not s.startswith('pre')
                and s[-1] != 's'):
            return announce_drop(s)
    if length > 5 and not in_dictionary(s):
        if in_dictionary(s[1:]) and not s.startswith('pre'):
            return announce_drop(s)
        if in_dictionary(s[:-1]) and s[-1] not in ('s', 'r', 'd', 'g', 'y'):
            return announce_drop(s)

    # Rule 5: tokens that start with, or end with, one of the common short
    # words, and Rule 6: longer tokens that are two or more dictionary words
    # concatenated.  The checks run in this order and stop at the first hit.
    if (multiple_words_starting_with(s, common_starts)
            or multiple_words_ending_with(s, common_ends)
            or (len(s) > 7 and multiple_words(s))):
        return announce_drop(s)

    return False


def multiple_words_ending_with(s, endings_list):
    '''Return True if `s` looks like a word with a common ending attached.

    That is the case when `s` ends with one of the strings in `endings_list`,
    is at least two characters longer than that ending (to be safer), and is
    not itself recognized by in_dictionary().  Otherwise return False.
    '''
    # Do the cheap string comparisons first.  The dictionary verdict for `s`
    # does not depend on which ending matched, so in_dictionary() -- which
    # stems the string and consults the dictionaries -- is called at most
    # once, and only when some ending actually matches.
    has_ending = any(len(s) >= len(ending) + 2 and s.endswith(ending)
                     for ending in endings_list)
    return has_ending and not in_dictionary(s)


def multiple_words_starting_with(s, starts_list):
    '''Return True if `s` looks like a word with a common start attached.

    That is the case when `s` starts with one of the strings in
    `starts_list`, is at least two characters longer than that start (to be
    safer), and is not itself recognized by in_dictionary().  Otherwise
    return False.
    '''
    # Do the cheap string comparisons first.  The dictionary verdict for `s`
    # does not depend on which start matched, so in_dictionary() -- which
    # stems the string and consults the dictionaries -- is called at most
    # once, and only when some start actually matches.
    has_start = any(len(s) >= len(start) + 2 and s.startswith(start)
                    for start in starts_list)
    return has_start and not in_dictionary(s)


def multiple_words(s):
    '''Return True if `s` can be read as several dictionary words in a row.

    Strings shorter than 7 characters, and strings recognized as a whole by
    in_dictionary(), are never considered multiword.  Otherwise `s` is cut at
    every position that leaves at least 3 characters in front and at least 4
    behind; a cut counts when the back part is in nltk_dictionary and the
    front part either is in nltk_dictionary too or is itself multiword.
    '''
    if len(s) < 7 or in_dictionary(s):
        return False
    for cut in range(3, len(s) - 3):
        head, tail = s[:cut], s[cut:]
        if tail not in nltk_dictionary:
            continue
        # The back part is a word; the front part must be a word, or a
        # concatenation of words in its own right.
        if head in nltk_dictionary or multiple_words(head):
            return True
    return False


def in_dictionary(s):
    '''Return a true value if `s` or its stemmed form is a known word.

    Both forms are looked up first in nltk_dictionary and then, if neither is
    found there, with enchant_dictionary.check().
    '''
    stem = stemmer.stem(s)
    if s in nltk_dictionary or stem in nltk_dictionary:
        return True
    return enchant_dictionary.check(s) or enchant_dictionary.check(stem)


def msg(string, *other_args):
    '''Print `string` after filling its format specifiers from `other_args`.

    Works like the standard print(), except that the first argument is a
    str.format() template and the output is flushed right away.  Flushing
    immediately helps when the output of a script is piped, because Python
    buffers the output by default in that situation, which makes it very
    hard to follow what is happening in real time.
    '''
    text = string.format(*other_args)
    print(text, flush=True)


# Entry point.
# .............................................................................

if __name__ == '__main__':
    # Only when this file is executed as a script: hand main() to plac, whose
    # command-line options are declared by the @plac.annotations decorator on
    # main().
    plac.call(main)