@@ -413,8 +413,20 @@ def calculate_cluster_features(df: pd.DataFrame, cluster_id_vars_on_2nd_level: l
413413 pd.DataFrame: clusters with descriptive variables
414414 """
415415
416- df ["tst_median" ] = pd .to_datetime (df ["tst_median" ], format = "ISO8601" )
417- df ["oday" ] = pd .to_datetime (df ["oday" ])
416+ df = df .copy ()
417+
418+ for col in ["lat_median" , "long_median" , "hdg_median" , "weight" ]:
419+ if col in df .columns :
420+ df [col ] = pd .to_numeric (df [col ], errors = "coerce" )
421+
422+ if "tst_median" in df .columns :
423+ df ["tst_median" ] = pd .to_datetime (df ["tst_median" ], errors = "coerce" , utc = True )
424+ df ["tst_median_ns" ] = df ["tst_median" ].view ("int64" )
425+ else :
426+ df ["tst_median_ns" ] = pd .Series (index = df .index , dtype = "float64" )
427+
428+ if "oday" in df .columns :
429+ df ["oday" ] = pd .to_datetime (df ["oday" ], errors = "coerce" )
418430
419431 clust_counts = df .drop_duplicates (
420432 subset = [
@@ -430,11 +442,26 @@ def calculate_cluster_features(df: pd.DataFrame, cluster_id_vars_on_2nd_level: l
430442 clust_delay_feats = df .groupby (cluster_id_vars_on_2nd_level , observed = False )["weight" ].quantile ([0.10 , 0.25 , 0.5 , 0.75 , 0.90 ]).unstack ()
431443 clust_delay_feats .columns = [(int (x * 100 )) for x in clust_delay_feats .columns ]
432444 clust_delay_feats = clust_delay_feats .add_prefix ("q_" ).reset_index ()
433- median_vars = df .groupby (cluster_id_vars_on_2nd_level , observed = False )[["lat_median" , "long_median" , "tst_median" , "hdg_median" ]].median ().reset_index ()
445+
446+ median_cols = ["lat_median" , "long_median" , "hdg_median" , "tst_median_ns" ]
447+ existing_median_cols = [c for c in median_cols if c in df .columns ]
448+
449+ median_vars = (df .groupby (cluster_id_vars_on_2nd_level , observed = False )[existing_median_cols ].median ().reset_index ())
450+
451+ if "tst_median_ns" in median_vars .columns :
452+ median_vars ["tst_median" ] = pd .to_datetime (median_vars ["tst_median_ns" ], utc = True )
453+ median_vars = median_vars .drop (columns = ["tst_median_ns" ])
454+
434455 res = median_vars .merge (clust_counts , on = cluster_id_vars_on_2nd_level , how = "outer" )
435456 res = res .merge (clust_delay_feats , on = cluster_id_vars_on_2nd_level , how = "outer" )
436- res ["oday_min" ] = df .oday .min ()
437- res ["oday_max" ] = df .oday .max ()
457+
458+ if "oday" in df .columns :
459+ res ["oday_min" ] = df ["oday" ].min ()
460+ res ["oday_max" ] = df ["oday" ].max ()
461+ else :
462+ res ["oday_min" ] = pd .NaT
463+ res ["oday_max" ] = pd .NaT
464+
438465 return res
439466
440467
0 commit comments