-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextract_types.py
213 lines (163 loc) · 9.16 KB
/
extract_types.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
import re
from collections import Counter
import unicodedata
import math
def remove_accents(input_str):
# https://stackoverflow.com/a/517974
nfkd_form = unicodedata.normalize('NFKD', input_str)
return u"".join([c for c in nfkd_form if not unicodedata.combining(c)])
def remove_special_characters_and_shrink_whitespace(input_string):
output_string = re.sub(r'\s+', ' ', re.sub(r'[^A-Za-z0-9\s]', '', input_string))
return output_string
def clean_string(input_string):
if (isinstance(input_string, str) and input_string.strip() != '') or (not math.isnan(input_string)):
return remove_special_characters_and_shrink_whitespace(remove_accents(str(input_string).lower()))
else:
return ''
def get_unique_ngram(list_of_ordered_tokens, n):
# Get the unique ngrams in a string, given a list of ordered tokens in that string
return set(zip(*[list_of_ordered_tokens[i:i - (n - 1)] if i < n - 1 else list_of_ordered_tokens[n - 1:] for i in range(0, n)]))
def reduce_nested_ngram_freq(x, ngram_freq, mutable_ngram_freq):
max_matched_n = 0 # Length of the longest ngram(s) we have matched for this name
# Note that we have ordered the ngram_freq such that longer ngrams come first, so we can expect to match longest ngram asap
for n, ngram, count in ngram_freq:
# If the current ngram occurred in more than one name AND the ngram is in the current name
if count > 1 and ' ' + ngram + ' ' in ' ' + x + ' ':
# If the current ngram is at least the same or even longer than the max n we matched,
# just update the max n and move on
if n >= max_matched_n:
max_matched_n = n
else:
# Else if the current ngram is shorter than longest ngram we have matched in this name,
# this probably suggests the current ngram is part of the longest ngram we matched before,
# thus we need to decrease the frequency count for this shorter ngram as it is not an independent count
mutable_ngram_freq[ngram] -= 1
def strip_preposition(input_string, prepositions='al,el,at,et,la,le,les,da,de,do,du,des,das,and,of,for'):
output_string = input_string.strip()
for preposition in prepositions.split(',') if isinstance(prepositions, str) else prepositions:
output_string = re.sub(r'^' + preposition + '$', '', output_string)
output_string = re.sub(r'^' + preposition + ' ', '', output_string)
output_string = re.sub(r' ' + preposition + '$', '', output_string)
return output_string
def strip_number_prefix(input_string, numbers='i,ii,iii,iv,v,vi,vii,viii,ix,x'):
output_string = input_string.strip()
for number in numbers.split(',') if isinstance(numbers, str) else numbers:
output_string = re.sub(r'^' + number + '$', '', output_string)
output_string = re.sub(r'^' + number + ' ', '', output_string)
output_string = re.sub(r'^\d+$', '', output_string)
output_string = re.sub(r'^\d+ ', '', output_string)
return output_string
def consolidate_count_list(input_list):
output_list = [(item, count) for item, count in input_list if item.strip() != '' and len(item) > 1]
dic = {}
for item, count in output_list:
if dic.get(item) is None:
dic[item] = count
else:
dic[item] += count
output_list = sorted(list(dic.items()), key=lambda x: -x[-1])
return output_list
def health_facility_rules(input_list):
output_list = [(strip_preposition(strip_number_prefix(item)), count) for item, count in input_list]
output_list = consolidate_count_list(output_list)
return output_list
def print_table(table_data):
table_data = [list(tup) for tup in table_data]
no_percentage = False
float_rounding = 3
format_string = ''
for i in range(len(table_data[0])):
this_column = [tup[i] for tup in table_data]
if isinstance(this_column[0], str):
column_type = 'str'
max_string_len = max([len(x) for x in this_column])
max_string_len = max(10, max_string_len + 5)
format_string += '{: <' + str(max_string_len) + '}'
elif isinstance(this_column[0], float):
if no_percentage:
column_type = 'float'
else:
if max(this_column) <= 1 and min(this_column) >= 0:
column_type = 'percentage'
d = 0 # find the number of digits till which there are value other than zero
while not all([x * (10**d) == int(x * (10**d)) for x in this_column]):
d += 1
if d > 6:
break
float_rounding = d - 2
format_string += '{: >10.' + str(float_rounding) + '%}'
else:
column_type = 'float'
if column_type == 'float':
max_abs_num_len = len(str(int(max([abs(x) for x in this_column]))))
max_abs_num_len = max(10, max_abs_num_len + 5 + float_rounding)
format_string += '{: >' + str(max_abs_num_len) + '}'
this_column = [round(x, float_rounding) for x in this_column]
elif isinstance(this_column[0], int):
column_type = 'int'
max_abs_num_len = len(str(int(max([abs(x) for x in this_column]))))
max_abs_num_len = max(10, max_abs_num_len + 5)
format_string += '{: >' + str(max_abs_num_len) + '}'
if column_type == 'percentage':
for j in range(len(table_data)):
table_data[j][i] *= 100
for row in table_data:
print(format_string.format(*row))
def extract_types(input_list, thres=None, maximum_expected_number_of_types=20, return_proportion=False, no_full_match=False, custom_rule=health_facility_rules):
'''
Given a list of names, return the common types in the names along with their frequency
'''
name_list = [str(x) for x in list(input_list) if (isinstance(x, str) and x.strip() != '') or (not math.isnan(x))]
name_list = [clean_string(name) for name in name_list]
number_of_names_in_this_list = len(name_list)
phrase_counter = Counter()
# For each name in the name list, tokenize it and count the ngrams in the name,
# and add the frequency of ngrams to the phrase counter
for name in name_list:
tokens_in_name = name.strip().split(" ")
for n in range(1, min(len(tokens_in_name), 10) + 1): # Record frequency of ngrams up to 10 tokens' long
phrase_counter.update(get_unique_ngram(tokens_in_name, n))
# For each pair of (token_sequence, count) in the phrase_counter, convert it to (n, ngram, count), where n is the "n" in ngram
ngram_freq = [(len(token_sequence), ' '.join(token_sequence), count) for token_sequence, count in phrase_counter.most_common() if count >= round(len(name_list) * 0.001)]
# Sort the ngram_freq list in the reverse order of n, such that longer ngrams is earlier in the list
ngram_freq = sorted(ngram_freq, key=lambda entry: -entry[0])
# Create a mutable copy of the ngram_freq list
mutable_ngram_freq = {ngram: count for n, ngram, count in ngram_freq}
# Iterate through all the names in the name list, use the reduce the frequency count for the ngrams that are part of longer ngram
for name in name_list:
reduce_nested_ngram_freq(name, ngram_freq, mutable_ngram_freq)
sorted_consolidated_ngram_freq = sorted(list(mutable_ngram_freq.items()), key=lambda t: -t[-1])
# If thres is not provided, return the top k types found, with k defined by maximum_expected_number_of_types
if thres is None:
if len(sorted_consolidated_ngram_freq) > maximum_expected_number_of_types:
minimum_count = sorted_consolidated_ngram_freq[maximum_expected_number_of_types][-1]
else:
minimum_count = 1
# If thres is provided, calculate the minimum count by multiplying threshold with number of names in the list
else:
if thres < 1 and thres >= 0:
minimum_count = int(thres * number_of_names_in_this_list)
elif thres >= 1 and int(thres) == thres:
minimum_count = thres
minimum_count = max(1, minimum_count)
results = [item for item in sorted_consolidated_ngram_freq if item[-1] > minimum_count]
if no_full_match: # Should remove items whose occurrences are all full match in the list
results = [(item, count) for item, count in results if item not in name_list]
if custom_rule:
results = custom_rule(results)
# If return_proportion is true, return the results in the format (ngram, proportion, count)
if return_proportion:
results = [(item, round(count / number_of_names_in_this_list, 3), count) for item, count in results]
print_table(results)
return results
def match_type_string(entry, common_types):
if isinstance(common_types[0], tuple):
common_type_strings = [x[0] for x in sorted(common_types, key=lambda x: (-x[0].count(' '), -len(x[0])))]
elif isinstance(common_types[0], str):
pass
else:
raise
for type_string in common_type_strings:
if ' ' + type_string + ' ' in ' ' + entry + ' ':
return type_string.strip()
return math.nan