diff --git a/tony/myio.py b/tony/myio.py
new file mode 100644
index 0000000..84dbe8c
--- /dev/null
+++ b/tony/myio.py
@@ -0,0 +1,56 @@
+'''
+    myio.py
+    SNLP.UdS.SS16
+    @author: Tony Hong
+'''
+
+import os
+
+
+def read_file(path):
+    '''
+    Read the file at path and return its content decoded as UTF-8.
+    '''
+    # Binary read plus explicit decode works on Python 2 and 3 alike; the
+    # with-block closes the handle even when reading or decoding fails.
+    with open(path, 'rb') as handle:
+        return handle.read().decode('utf-8')
+
+
+def get_file_dict(container, file_dir, sp_filetype=''):
+    '''
+    Map base names to paths inside file_dir.
+
+    container is a dict (its values are the file names) or a list of file
+    names; any other type gives an empty dict. Each key is the part of the
+    file name before the first dot. When sp_filetype is given, the path is
+    built from that key plus '.' + sp_filetype instead of the file name.
+    '''
+    result = dict()
+    if isinstance(container, dict):
+        filenames = container.values()
+    elif isinstance(container, list):
+        filenames = container
+    else:
+        return result
+
+    suffix = '.' + sp_filetype if sp_filetype else ''
+
+    for filename in filenames:
+        name = filename.split('.')[0]
+        if suffix:
+            result[name] = os.path.join(file_dir, name + suffix)
+        else:
+            result[name] = os.path.join(file_dir, filename)
+
+    return result
+
+
+def get_text_dict(file_dict):
+    '''
+    Read every path in file_dict and return a dict of key -> decoded text.
+    '''
+    result = dict()
+    for key, path in file_dict.items():
+        result[key] = read_file(path)
+    return result
diff --git a/tony/pandas-wrapper.py b/tony/pandas-wrapper.py
new file mode 100644
index 0000000..916e667
--- /dev/null
+++ b/tony/pandas-wrapper.py
@@ -0,0 +1,72 @@
+'''
+    Pandas module of tony
+'''
+
+import numpy as np
+import pandas as pd
+
+# TODO: only 2-level, too specific, need extension
+def getVectorL2(series, label):
+    '''Retrieve label-1-level vector from 2-level series.
+
+    Returns an empty Series when label is not in the index.
+    '''
+    try:
+        return series.loc[label]
+    except KeyError:
+        return pd.DataFrame().sum()
+
+def getValueL2(series, label1, label2):
+    '''Retrieve label-2-value from 2-level series.
+
+    Returns 0 when either label is not in the index.
+    '''
+    try:
+        return series.loc[label1].loc[label2]
+    except KeyError:
+        return 0
+
+
+def getValue(series, *labels):
+    '''Retrieve label-n-value from n-level series.
+
+    The labels are applied one level at a time, each to the result of the
+    previous lookup. Returns an empty Series when a label is missing.
+    '''
+    result = series
+    i = 0
+    try:
+        for label in labels:
+            # Look up in the partial result, not the original series, so
+            # every label descends one level deeper.
+            indexLambda = getIndexLambda(result)
+            result = indexLambda(label)
+            i = i + 1
+    except KeyError:
+        # i counts only the labels resolved so far, so it is always below
+        # len(labels) here and the 0 branch is never taken.
+        # NOTE(review): confirm whether 0 was meant for a miss on the
+        # last label.
+        if i < len(labels):
+            return pd.DataFrame().sum()
+        else:
+            return 0
+    return result
+
+def getIndexLambda(series):
+    '''Return a function that looks up one label in series via .loc.'''
+    return lambda i: series.loc[i]
+
+
+def main():
+    '''Build a small sample DataFrame; the result is not used.'''
+    df = pd.DataFrame({ 'A' : 1.,
+                        'B' : pd.Timestamp('20130102'),
+                        'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
+                        'D' : np.array([3] * 4,dtype='int32'),
+                        'E' : pd.Categorical(["test","train","test","train"]),
+                        'F' : 'foo' })
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tony/util.py b/tony/util.py
deleted file mode 100644
index 4ef9690..0000000
--- a/tony/util.py
+++ /dev/null
@@ -1,12 +0,0 @@
-'''
-    Util module of tony
-'''
-
-# Verify that an object is iterable if it implemented the iterator protocol
-def isiterable(obj):
-'''This function would return Truefor strings as well as most Python collection types'''
-    try:
-        iter(obj)
-        return True
-    except TypeError: # not iterable
-        return False
diff --git a/tony/utils.py b/tony/utils.py
new file mode 100644
index 0000000..22dd408
--- /dev/null
+++ b/tony/utils.py
@@ -0,0 +1,38 @@
+'''
+    Utilities module of Tony
+'''
+
+import re
+
+
+def isiterable(obj):
+    '''Return True if obj is iterable, i.e. iter(obj) does not raise TypeError.
+
+    This function would return True for strings as well as most Python collection types.
+    '''
+    try:
+        iter(obj)
+        return True
+    except TypeError: # not iterable
+        return False
+
+
+def remove_punctuation(value):
+    '''Return value with every '!', '#' and '?' character removed.
+    '''
+    return re.sub('[!#?]', '', value)
+
+# A ready-made ops list for clean_strings: strip, drop punctuation, title-case.
+clean_ops = [str.strip, remove_punctuation, str.title]
+
+def clean_strings(strings, ops):
+    '''Apply every function in ops, in order, to each string in strings.
+
+    Returns the cleaned strings as a new list.
+    '''
+    result = []
+    for value in strings:
+        for function in ops:
+            value = function(value)
+        result.append(value)
+    return result