Skip to content

v-measure #19

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 17 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
378 changes: 320 additions & 58 deletions tour_model_eval/confirmed_trips_eval_bins_clusters.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,65 +9,327 @@
import emission.storage.decorations.analysis_timeseries_queries as esda
import pandas as pd
from numpy import *
from sklearn import metrics
from pandas.testing import assert_frame_equal

# Spanish words to English
# Maps raw Spanish purpose labels found in the user input to English equivalents.
span_eng_dict = {'revisado_bike':'test ride with bike','placas_de carro':'car plates','aseguranza':'insurance',
'iglesia':'church','curso':'course','mi_hija recién aliviada':'my daughter just had a new baby',
'servicio_comunitario':'community service','pago_de aseguranza':'insurance payment',
'grupo_comunitario':'community group','caminata_comunitaria':'community walk'}

# Convert purpose
# Collapses free-form purpose labels into canonical purpose categories.
map_pur_dict = {'course':'school','work_- lunch break':'lunch_break','on_the way home':'home',
'insurance_payment':'insurance'}

# - user_ls: labels ("user1".."userN") for every participant, in input order
# - valid_user_ls: the subset of those labels whose data passes valid_user()
def get_user_ls(all_users, radius):
    """Build display labels for all users and for the valid users only.

    Parameters:
    - all_users: list of user ids to read trips for
    - radius: similarity radius passed through to filter_data

    Returns (user_ls, valid_user_ls): a "user<i>" label for every user, and
    the labels of users that satisfy valid_user(filter_trips, trips).
    """
    user_ls = []
    valid_user_ls = []
    for i in range(len(all_users)):
        curr_user = 'user' + str(i + 1)
        user = all_users[i]
        filter_trips, sim, trips = filter_data(user, radius)
        # every user gets a label; only valid ones are also recorded separately
        user_ls.append(curr_user)
        if valid_user(filter_trips, trips):
            valid_user_ls.append(curr_user)
    return user_ls, valid_user_ls


# - trips: all trips read from database
# - filter_trips: valid trips that have user labels and are not points
def filter_data(user, radius):
    """Read a user's confirmed trips and keep only fully-labeled, non-point trips.

    Returns (filter_trips, sim, trips): the filtered trips, the similarity
    object that performed the point filtering, and the raw trip list.
    """
    trips = pipeline.read_data(uuid=user, key=esda.CONFIRMED_TRIP_KEY)
    # keep trips that carry at least some user label
    labeled_trips = [t for t in trips if t["data"]["user_input"] != {}]
    labeled_df = pd.DataFrame(t["data"]["user_input"] for t in labeled_trips)
    # drop trips whose label set is incomplete (any missing column)
    complete_idx_ls = labeled_df.dropna(axis=0, how='any').index.tolist()
    valid_trips = [labeled_trips[i] for i in complete_idx_ls]

    # the similarity code filters out trips that are points
    sim = similarity.similarity(valid_trips, radius)
    filter_trips = sim.data
    return filter_trips, sim, trips


# to determine if the user is valid:
# valid user should have >= 10 trips for further analysis and the proportion of filter_trips is >=50%
def valid_user(filter_trips, trips):
    """Return True when the user has at least 10 filtered trips and at least
    half of their raw trips survived filtering."""
    enough_trips = len(filter_trips) >= 10
    return enough_trips and len(filter_trips) / len(trips) >= 0.5


# to map the user labels
# - user_input_df: pass in original user input dataframe, return changed user input dataframe
# - sp2en: change Spanish to English
# - cvt_pur_mo: convert purposes and replaced mode
def map_labels(user_input_df, sp2en, cvt_pur_mo):
    """Normalize user-input labels in a dataframe.

    - sp2en truthy: translate Spanish labels to English only.
    - cvt_pur_mo truthy (and sp2en falsy): translate Spanish to English first,
      then collapse purpose labels, then resolve "same_mode" in the
      replaced_mode column to that row's mode_confirm value.
      Assumes the frame has "replaced_mode" and "mode_confirm" columns in
      this branch — TODO confirm against callers.

    Returns the (possibly replaced) dataframe.
    """
    # Spanish words to English
    span_eng_dict = {'revisado_bike': 'test ride with bike', 'placas_de carro': 'car plates', 'aseguranza': 'insurance',
                     'iglesia': 'church', 'curso': 'course',
                     'mi_hija recién aliviada': 'my daughter just had a new baby',
                     'servicio_comunitario': 'community service', 'pago_de aseguranza': 'insurance payment',
                     'grupo_comunitario': 'community group', 'caminata_comunitaria': 'community walk'}

    # Convert purpose
    map_pur_dict = {'course': 'school', 'work_- lunch break': 'lunch_break', 'on_the way home': 'home',
                    'insurance_payment': 'insurance'}

    if sp2en:
        # change language only
        user_input_df = user_input_df.replace(span_eng_dict)
    elif cvt_pur_mo:
        # change language first
        user_input_df = user_input_df.replace(span_eng_dict)
        # convert purpose
        user_input_df = user_input_df.replace(map_pur_dict)
        # convert mode. The previous per-row `df.iloc[a]["replaced_mode"] = ...`
        # was chained assignment, which writes to a temporary row copy and can
        # silently leave the frame unchanged; write through .loc with a mask instead.
        same_mode = user_input_df["replaced_mode"] == "same_mode"
        if same_mode.any():
            # to see which rows will be converted
            logging.debug("The following rows will be changed: %s", user_input_df[same_mode])
            user_input_df.loc[same_mode, "replaced_mode"] = user_input_df.loc[same_mode, "mode_confirm"]
    return user_input_df


# check if the user is valid
# append NaN to the score lists when the user invalid
def valid_user_check(filter_trips, trips, homo_score, comp_score, v_score):
    """Append NaN to every score list when the user fails the validity check.

    Returns the three score lists plus a skip flag: True means the caller
    should move on to the next user without computing scores.
    """
    skip = not valid_user(filter_trips, trips)
    if skip:
        for score_ls in (homo_score, comp_score, v_score):
            score_ls.append(NaN)
    return homo_score, comp_score, v_score, skip
Comment on lines +94 to +102
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not going to insist on this since you are planning to refactor anyway, but you don't need to have homo_score, comp_score and v_score as both input and output. You can create a class that encapsulates all the scores and just pass it in directly.

Also, I really don't think you need to have append code in here - you can just have this function return valid or invalid. If invalid, append the values and continue; if valid, proceed with computation.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please file an issue to clean this up later.



# This function is to get homogeneity score, complete score, and v-score
def compute_score(labels_true, labels_pred, homo_score, comp_score, v_score):
    """Append homogeneity, completeness and v-measure (each rounded to three
    decimals) for one clustering result to the corresponding score lists."""
    scorers = ((metrics.homogeneity_score, homo_score),
               (metrics.completeness_score, comp_score),
               (metrics.v_measure_score, v_score))
    for score_fn, score_ls in scorers:
        score_ls.append(float('%.3f' % score_fn(labels_true, labels_pred)))
    return homo_score, comp_score, v_score


# This function compares a trip against the first trip of a bin to see whether they happened on the same calendar day
Copy link
Contributor

@shankari shankari Apr 9, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what is the meaning of the parameters?
you have a bin and filter_trips. Why do you need to pass in both?
why not just pass in two trips?

def match_day(trip, bin, filter_trips):
    """Return True when trip started on the same year/month/day as the first
    trip of the bin; False for an empty bin or any date mismatch."""
    if not bin:
        return False
    ref_dt = filter_trips[bin[0]]['data']['start_local_dt']
    trip_dt = trip['data']['start_local_dt']
    return (trip_dt['year'], trip_dt['month'], trip_dt['day']) == \
           (ref_dt['year'], ref_dt['month'], ref_dt['day'])


# This function compares a trip against the first trip of a bin to see whether they happened in the same month
def match_month(trip, bin, filter_trips):
    """Return True when trip started in the same year/month as the first trip
    of the bin; False for an empty bin or any mismatch."""
    if not bin:
        return False
    ref_dt = filter_trips[bin[0]]['data']['start_local_dt']
    trip_dt = trip['data']['start_local_dt']
    return (trip_dt['year'], trip_dt['month']) == (ref_dt['year'], ref_dt['month'])


# This function bins trips according to ['start_local_dt']
def bin_date(trip_ls, filter_trips, day=None, month=None):
    """Group trip indices into bins of trips that started in the same period.

    - trip_ls: indices (into filter_trips) of the trips to bin
    - filter_trips: the trips the indices refer to
    - day=True: trips belong together when they share year/month/day
    - month=True: trips belong together when they share year/month
      (when both flags are set, a day match OR a month match joins the bin,
      mirroring the original two-step check)
    With neither flag set, every trip ends up in its own bin.

    Returns a list of bins, each a list of trip indices. The local result
    variable is named date_bins so it no longer shadows the function name.
    """
    date_bins = []
    for trip_index in trip_ls:
        added = False
        trip_dt = filter_trips[trip_index]['data']['start_local_dt']

        for bin in date_bins:
            # compare against the first trip of the bin; bins are built by
            # equality, so any member would give the same answer
            ref_dt = filter_trips[bin[0]]['data']['start_local_dt']
            if day and (trip_dt['year'], trip_dt['month'], trip_dt['day']) == \
                       (ref_dt['year'], ref_dt['month'], ref_dt['day']):
                bin.append(trip_index)
                added = True
                break
            if month and (trip_dt['year'], trip_dt['month']) == (ref_dt['year'], ref_dt['month']):
                bin.append(trip_index)
                added = True
                break

        if not added:
            date_bins.append([trip_index])

    return date_bins


# sanity check before scoring: the trips in bin_trips must appear in exactly the same order as the bin indices do in filter_trips, otherwise labels_true and labels_pred would be misaligned
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Improve comment. What is "trip_orders"? why does it matter?
Just repeating the function name in the comment doesn't add much

def compare_trip_orders(bins, bin_trips, filter_trips):
    """Assert that bin_trips lists trips in exactly the order the bins do.

    Both sequences are reduced to their start timestamps and compared as
    dataframes: position k of bin_trips must be the trip at the k-th index
    produced by walking bins in order. A mismatch would misalign labels_true
    (built from bin_trips) with labels_pred (built from bins).

    Raises AssertionError (via assert_frame_equal) on any mismatch; returns None.
    """
    bin_trips_ts = pd.DataFrame(data=[trip["data"]["start_ts"] for trip in bin_trips])
    # flatten the bins into one index list, preserving bin order
    flat_idx_ls = [index for bin in bins for index in bin]
    bins_ts = pd.DataFrame(data=[filter_trips[i]["data"]["start_ts"] for i in flat_idx_ls])
    assert_frame_equal(bins_ts, bin_trips_ts)


def find_first_trip(filter_trips, bin):
    """Find the earliest-starting trip in a bin.

    - filter_trips: the trips the bin indices refer to
    - bin: a non-empty list of indices into filter_trips

    Returns (early_trip_index, index): the earliest trip's index in
    filter_trips and its position within the bin. Ties keep the first
    occurrence, matching the original strict '>' comparison.
    """
    index = min(range(len(bin)),
                key=lambda pos: filter_trips[bin[pos]]['data']['start_ts'])
    early_trip_index = bin[index]
    return early_trip_index, index






# v_measure_bins takes 5 parameters
# - sp2en=True: change Spanish to English
# - cvt_pur_mo=True: convert purposes and replaced mode
# - cutoff truthy: analyze only bins above the similarity cutoff
# - cutoff None/falsy: analyze all bins
# Note: for sp2en and cvt_pur_mo, set either one to be True as needed. cvt_pur_mo will change language first
def v_measure_bins(all_users, radius, sp2en=None, cvt_pur_mo=None, cutoff=None):
    """Score the similarity binning against user labels for every user.

    For each valid user, bins their filtered trips, treats the distinct
    user_input combinations as ground truth and the bin assignment as the
    prediction, and appends homogeneity/completeness/v-measure to the three
    returned lists (NaN for invalid users).

    Returns (homo_score, comp_score, v_score), one entry per user.
    """
    homo_score = []
    comp_score = []
    v_score = []
    for user in all_users:
        filter_trips, sim, trips = filter_data(user, radius)

        # invalid users get NaN scores appended and are skipped
        homo_score, comp_score, v_score, skip = valid_user_check(
            filter_trips, trips, homo_score, comp_score, v_score)
        if skip:
            continue

        sim.bin_data()
        if cutoff:
            # keep only bins above the cutoff
            sim.delete_bins()
            bin_trips = sim.newdata
            bins = sim.bins
        else:
            # analyze every bin; using else (instead of `elif cutoff`) also
            # avoids an unbound bin_trips/bins when cutoff is falsy but not None
            bins = sim.bins
            bin_trips = [filter_trips[index] for bin in bins for index in bin]

        bin_trips_user_input_df = pd.DataFrame(data=[trip["data"]["user_input"] for trip in bin_trips])
        bin_trips_user_input_df = map_labels(bin_trips_user_input_df, sp2en, cvt_pur_mo)

        # ground truth: index of each trip's user_input among the distinct user_inputs
        bin_trips_user_input_ls = bin_trips_user_input_df.values.tolist()
        no_dup_list = bin_trips_user_input_df.drop_duplicates().values.tolist()
        labels_true = [no_dup_list.index(trip) for trip in bin_trips_user_input_ls]

        # prediction: the bin number each trip was placed in
        labels_pred = [b for b in range(len(bins)) for _ in bins[b]]

        # sanity check (was duplicated inline here; now reuses the shared helper):
        # bin order must match bin_trips order before scoring
        compare_trip_orders(bins, bin_trips, filter_trips)
        homo_score, comp_score, v_score = compute_score(
            labels_true, labels_pred, homo_score, comp_score, v_score)

    return homo_score, comp_score, v_score


# - sp2en=True: change Spanish to English
# - cvt_pur_mo=True: convert purposes and replaced mode
# Note: for sp2en and cvt_pur_mo, set either one to be True as needed. cvt_pur_mo will change language first
def v_measure_clusters(all_users, radius, sp2en=None, cvt_pur_mo=None):
    """Score the featurization clustering against user labels for every user.

    For each valid user, clusters the above-cutoff trips, treats the distinct
    user_input combinations as ground truth and the cluster labels as the
    prediction, and appends homogeneity/completeness/v-measure to the three
    returned lists (NaN for invalid users).

    Returns (homo_score, comp_score, v_score), one entry per user.
    """
    homo_score = []
    comp_score = []
    v_score = []
    for user in all_users:
        filter_trips, sim, trips = filter_data(user, radius)

        # invalid users get NaN scores appended and are skipped
        homo_score, comp_score, v_score, skip = valid_user_check(
            filter_trips, trips, homo_score, comp_score, v_score)
        if skip:
            continue

        sim.bin_data()
        sim.delete_bins()
        bin_trips = sim.newdata
        bins = sim.bins

        # cluster based on sil score alone (min_clusters = 0) rather than the
        # bin count (len(bins)); 1.5x the bin count caps the search
        feat = featurization.featurization(bin_trips)
        max_clusters_n = int(math.ceil(1.5 * len(bins)))
        feat.cluster(min_clusters=0, max_clusters=max_clusters_n)
        cluster_trips = feat.data

        cluster_user_input_df = pd.DataFrame(data=[i["data"]["user_input"] for i in cluster_trips])
        cluster_user_input_df = map_labels(cluster_user_input_df, sp2en, cvt_pur_mo)

        # ground truth: index of each trip's user_input among the distinct user_inputs
        cluster_user_input_ls = cluster_user_input_df.values.tolist()
        no_dup_list = cluster_user_input_df.drop_duplicates().values.tolist()
        labels_true = [no_dup_list.index(trip) for trip in cluster_user_input_ls]
        labels_pred = feat.labels

        # sanity check: the trips we scored must line up with the points the
        # featurization code clustered; scoring only proceeds when they match
        cluster_ps = [[trip["data"]["start_loc"]["coordinates"][0],
                       trip["data"]["start_loc"]["coordinates"][1],
                       trip["data"]["end_loc"]["coordinates"][0],
                       trip["data"]["end_loc"]["coordinates"][1]]
                      for trip in cluster_trips]
        assert_frame_equal(pd.DataFrame(data=cluster_ps), pd.DataFrame(data=feat.points))

        homo_score, comp_score, v_score = compute_score(
            labels_true, labels_pred, homo_score, comp_score, v_score)

    return homo_score, comp_score, v_score




















Loading