e-mission · corinne-hcr · Jun 13, 2021
diff --git a/tour_model_eval/data_preprocessing.py b/tour_model_eval/data_preprocessing.py
@@ -13,8 +13,7 @@ def read_data(user):
 
 # - trips: all trips read from database
 # - filter_trips: valid trips that have user labels and are not points
-def filter_data(user,radius):
-    trips = read_data(user)
+def filter_data(trips,radius):
     non_empty_trips = [t for t in trips if t["data"]["user_input"] != {}]
     non_empty_trips_df = pd.DataFrame(t["data"]["user_input"] for t in non_empty_trips)
     valid_trips_df = non_empty_trips_df.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)
@@ -23,7 +22,7 @@ def filter_data(user,radius):
 
     # similarity codes can filter out trips that are points in valid_trips
     filter_trips = similarity.filter_too_short(valid_trips, radius)
-    return filter_trips,trips
+    return filter_trips
 
 
 # use KFold (n_splits=5) to split the data into 5 models (5 training sets, 5 test sets)

diff --git a/tour_model_eval/get_request_percentage.py b/tour_model_eval/get_request_percentage.py
@@ -131,4 +131,5 @@ def get_req_pct(new_labels,track,filter_trips,sim):
     new_bins = label_pro.group_similar_trips(new_labels,track)
     req_trips = get_requested_trips(new_bins,filter_trips,sim)
     pct = len(req_trips)/len(filter_trips)
+    pct = float('%.3f' % pct)
     return pct
diff --git a/tour_model_eval/get_scores.py b/tour_model_eval/get_scores.py
@@ -68,4 +68,5 @@ def score(bin_trips, labels_pred):
 
     labels_pred = labels_pred
     homo_score = skm.homogeneity_score(labels_true, labels_pred)
+    homo_score = float('%.3f' % homo_score)
     return homo_score
diff --git a/tour_model_eval/get_users.py b/tour_model_eval/get_users.py
@@ -19,7 +19,8 @@ def get_user_ls(all_users,radius):
     for i in range(len(all_users)):
         curr_user = 'user' + str(i + 1)
         user = all_users[i]
-        filter_trips,trips = preprocess.filter_data(user,radius)
+        trips = preprocess.read_data(user)
+        filter_trips = preprocess.filter_data(trips,radius)
         if valid_user(filter_trips,trips):
             valid_user_ls.append(curr_user)
             user_ls.append(curr_user)

diff --git a/tour_model_eval/label_processing.py b/tour_model_eval/label_processing.py
@@ -61,6 +61,7 @@ def map_labels(user_input_df):
 # - dist_pct: the percentage of the last distance in the dendrogram
 # - sch.fcluster: form clusters from the hierarchical clustering defined by the given linkage matrix
 # e.g., if last_d = 10000, dist_pct = 0.4, max_d = 400, clusters will be assigned at the distance of 400
+# - clusters: the labels from the second round of clustering
 def get_second_labels(x,method,low,dist_pct):
     z = sch.linkage(x, method=method, metric='euclidean')
     last_d = z[-1][2]
@@ -80,8 +81,7 @@ def get_second_labels(x,method,low,dist_pct):
 # (e.g. label from first round is 1, label from second round is 2, the new label will be 12)
 # - second_round_idx_labels: a list to store the indices and labels from the first round.
 # - second_labels: labels from the second round of clustering
-def get_new_labels(x,low,dist_pct,second_round_idx_labels,new_labels,method=None):
-    second_labels = get_second_labels(x,method,low,dist_pct)
+def get_new_labels(second_labels,second_round_idx_labels,new_labels):
     for i in range(len(second_labels)):
         first_index = second_round_idx_labels[i][0]
         new_label = second_round_idx_labels[i][1]