-
Notifications
You must be signed in to change notification settings - Fork 13
/
Copy pathlabel_processing.py
133 lines (114 loc) · 6.07 KB
/
label_processing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import logging
import scipy.cluster.hierarchy as sch
# to map the user labels
# - user_input_df: pass in original user input dataframe, return changed user input dataframe
# - sp2en: change Spanish to English
def map_labels_sp2en(user_input_df):
    """Translate the known Spanish user labels in a dataframe to English.

    Every cell whose value exactly matches one of the Spanish labels listed
    below is replaced by its English counterpart; other cells are untouched.

    :param user_input_df: dataframe of user inputs
    :return: the dataframe returned by ``DataFrame.replace`` (the input is not
        modified in place)
    """
    # Spanish label -> English label
    spanish_to_english = {
        'revisado_bike': 'test ride with bike',
        'placas_de carro': 'car plates',
        'aseguranza': 'insurance',
        'iglesia': 'church',
        'curso': 'course',
        'mi_hija recién aliviada': 'my daughter just had a new baby',
        'servicio_comunitario': 'community service',
        'pago_de aseguranza': 'insurance payment',
        'grupo_comunitario': 'community group',
        'caminata_comunitaria': 'community walk',
    }
    return user_input_df.replace(to_replace=spanish_to_english)
# to map purposes in user inputs (replaced modes are converted separately in map_labels_mode)
# - map_pur_dict: the purpose labels to replace and the labels that replace them
def map_labels_purpose(user_input_df):
    """Merge variant purpose labels into the label they should be counted as.

    Every cell whose value exactly matches a key of the mapping below is
    replaced by the mapped value; other cells are untouched.

    :param user_input_df: dataframe of user inputs
    :return: the dataframe returned by ``DataFrame.replace`` (the input is not
        modified in place)
    """
    # variant purpose label -> purpose label to use instead
    purpose_mapping = {
        'course': 'school',
        'work_- lunch break': 'lunch_break',
        'on_the way home': 'home',
        'insurance_payment': 'insurance',
    }
    return user_input_df.replace(to_replace=purpose_mapping)
def map_labels_mode(user_input_df):
    """Replace ``"same_mode"`` in ``replaced_mode`` with the row's ``mode_confirm``.

    The dataframe is updated in place and also returned.

    The assignment goes through a single ``.loc`` call. Assigning through
    ``user_input_df.iloc[a]["replaced_mode"]`` is chained indexing: ``iloc[a]``
    can hand back a temporary copy of the row, so the write may never reach
    the dataframe.

    :param user_input_df: dataframe with ``replaced_mode`` and ``mode_confirm``
        columns
    :return: the same dataframe object, with the conversion applied
    """
    # nothing to convert; this also lets an empty frame without the two
    # columns pass through unchanged
    if len(user_input_df) == 0:
        return user_input_df

    same_mode_rows = user_input_df["replaced_mode"] == "same_mode"
    if same_mode_rows.any():
        # to see which rows will be converted
        logging.debug("The following rows will be changed: %s", user_input_df[same_mode_rows])
        # .to_numpy() makes the assignment positional, so it does not depend
        # on index alignment (e.g. when the index has duplicate labels)
        user_input_df.loc[same_mode_rows, "replaced_mode"] = (
            user_input_df.loc[same_mode_rows, "mode_confirm"].to_numpy()
        )
    return user_input_df
# this function will change Spanish to English, convert purposes, and convert modes
def map_labels(user_input_df):
    """Run all label mappings: Spanish to English, then purposes, then modes.

    :param user_input_df: dataframe of user inputs
    :return: the dataframe produced by the last mapping step
    """
    # Note that the spanish -> english conversion MUST currently happen before the other
    # mode and purpose mappings (e.g. 'curso' becomes 'course', which the purpose
    # mapping then turns into 'school')
    mapping_steps = (map_labels_sp2en, map_labels_purpose, map_labels_mode)
    for apply_step in mapping_steps:
        user_input_df = apply_step(user_input_df)
    return user_input_df
# use hierarchical clustering to get labels of the second round
# - sch.linkage: perform hierarchical(agglomerative) clustering
# In this function, we set a low bound and a higher bound(cutoff) of distance in the dendrogram
# - last_d: the distance of the last cluster in the dendrogram
# - low: the lower bound of distance
# e.g., if low = 300, last_d = 250, we will assign 0s as labels for the points, irrespective of the first round labels.
# and the list of second round labels will be like [0,0,0,0,0].
# It means the points are already similar to each other after the first round of clustering, they don't need to
# go through the second round.
# - max_d: the cutoff of distance
# - dist_pct: the percentage of the last distance in the dendrogram
# - sch.fcluster: form clusters from the hierarchical clustering defined by the given linkage matrix
# e.g., if last_d = 1000, dist_pct = 0.4, max_d = 400, clusters will be assigned at the distance of 400
# - clusters: the labels from the second round clustering
def get_second_labels(x,method,low,dist_pct):
    """Get the second round labels for the points in ``x`` by hierarchical clustering.

    :param x: the observations to cluster, one entry per point
        (``len(x)`` is used as the number of points)
    :param method: linkage method passed to ``sch.linkage``
    :param low: lower bound of distance; if the last merge distance in the
        dendrogram is below it, every point gets the label 0
    :param dist_pct: fraction of the last merge distance used as the cutoff
    :return: a list of 0s when no second round split is needed, otherwise the
        array of cluster labels returned by ``sch.fcluster``
    """
    n_points = len(x)
    # sch.linkage raises on fewer than two observations; zero or one point
    # cannot be split any further, so give them the "already similar" labels
    if n_points < 2:
        return [0] * n_points

    z = sch.linkage(x, method=method, metric='euclidean')
    # distance of the last (topmost) merge in the dendrogram
    last_d = z[-1][2]
    if last_d < low:
        # the points are already similar enough after the first round
        return [0] * n_points

    max_d = last_d * dist_pct
    return sch.fcluster(z, max_d, criterion='distance')
# this function includes hierarchical clustering and changing labels from the first round to get appropriate labels for
# the second round of clustering
# appropriate labels are the label from the first round concatenated with the label from the second round
# (e.g. label from first round is 1, label from second round is 2, the new label will be 12)
# - second_round_idx_labels: a list to store the indices and labels from the first round.
# - second_labels: labels from the second round of clustering
def get_new_labels(second_labels,second_round_idx_labels,new_labels):
    """Write the combined first + second round labels into ``new_labels``.

    For each point of the second round, the first round label and the second
    round label are concatenated as digits (e.g. first round 1, second round 2
    gives 12) and stored at the point's index in ``new_labels``.

    :param second_labels: labels from the second round of clustering
    :param second_round_idx_labels: for each second round point, a pair
        ``[index, first round label]``; the index is expected to be an integer
    :param new_labels: list of labels to update; modified in place
    :return: ``new_labels``
    """
    n_labels = len(new_labels)
    for i, second_label in enumerate(second_labels):
        first_index = second_round_idx_labels[i][0]
        first_label = second_round_idx_labels[i][1]
        # concatenate labels from two rounds
        # NOTE: plain digit concatenation is ambiguous for multi-digit labels
        # (1 + 12 and 11 + 2 both give 112); kept as-is so existing labels do not change
        combined_label = int(str(first_label) + str(second_label))
        # index directly instead of scanning every position; indices outside
        # the list (including negative ones) are ignored
        if 0 <= first_index < n_labels:
            new_labels[first_index] = combined_label
    return new_labels
# group similar trips according to new_labels, store the original indices of the trips
def group_similar_trips(new_labels,track):
    """Group trips that share a label and return their original indices.

    Concretely, if the input labels are ['a','a','a','b','b','b'], the trips
    at positions [0,1,2] form one bin and the trips at [3,4,5] form another.
    Each position is then mapped to the original index of that trip, which is
    the first element of the matching entry in ``track`` (e.g. 0 -> 42).

    :param new_labels: one label per trip
    :param track: one entry per trip; ``track[i][0]`` is the original index of
        trip ``i``
    :return: a list of bins, one per unique label, each bin being the list of
        original trip indices with that label; bins follow the iteration
        order of ``set(new_labels)``
    """
    # single pass: collect the original indices of the trips under each label
    # (avoids rescanning all labels once per unique label, and the former
    # local name `bin` no longer shadows the builtin)
    original_idx_by_label = {}
    for position, label in enumerate(new_labels):
        original_idx_by_label.setdefault(label, []).append(track[position][0])

    # emit one bin per unique label, in the order the set yields them
    return [original_idx_by_label[label] for label in set(new_labels)]
# replace the first round labels with new labels
# - track: a list to store the indices and labels from the first round of clustering
# for item in track, item[0] is the original index of the trip in filter_trips
# item[1] is the label after the first round of clustering
# we change the labels from the first round with new labels from the second round here
def change_track_labels(track,new_labels):
    """Overwrite the label stored in each ``track`` entry with the new label.

    :param track: list of entries whose second element (``item[1]``) is the
        label; modified in place
    :param new_labels: the replacement labels; ``new_labels[i]`` goes to
        ``track[i]``
    :return: ``track``
    """
    for position, label in enumerate(new_labels):
        track[position][1] = label
    return track