Skip to content

Commit 9a3a989

Browse files
authored
Merge pull request #210 from HSLdevcom/log-recluster-stack-trace
fix calculate_cluster_features() to handle different timezones
2 parents a4a31b3 + 71c89c5 commit 9a3a989

File tree

1 file changed: 32 additions and 5 deletions

1 file changed: 32 additions and 5 deletions

python/common/recluster.py

Lines changed: 32 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -413,8 +413,20 @@ def calculate_cluster_features(df: pd.DataFrame, cluster_id_vars_on_2nd_level: l
413 413
pd.DataFrame: clusters with descriptive variables
414 414
"""
415 415

416-
df["tst_median"] = pd.to_datetime(df["tst_median"], format="ISO8601")
417-
df["oday"] = pd.to_datetime(df["oday"])
416+
df = df.copy()
417+
418+
for col in ["lat_median", "long_median", "hdg_median", "weight"]:
419+
if col in df.columns:
420+
df[col] = pd.to_numeric(df[col], errors="coerce")
421+
422+
if "tst_median" in df.columns:
423+
df["tst_median"] = pd.to_datetime(df["tst_median"], errors="coerce", utc=True)
424+
df["tst_median_ns"] = df["tst_median"].view("int64")
425+
else:
426+
df["tst_median_ns"] = pd.Series(index=df.index, dtype="float64")
427+
428+
if "oday" in df.columns:
429+
df["oday"] = pd.to_datetime(df["oday"], errors="coerce")
418 430

419 431
clust_counts = df.drop_duplicates(
420 432
subset=[
@@ -430,11 +442,26 @@ def calculate_cluster_features(df: pd.DataFrame, cluster_id_vars_on_2nd_level: l
430 442
clust_delay_feats = df.groupby(cluster_id_vars_on_2nd_level, observed=False)["weight"].quantile([0.10, 0.25, 0.5, 0.75, 0.90]).unstack()
431 443
clust_delay_feats.columns = [(int(x * 100)) for x in clust_delay_feats.columns]
432 444
clust_delay_feats = clust_delay_feats.add_prefix("q_").reset_index()
433-
median_vars = df.groupby(cluster_id_vars_on_2nd_level, observed=False)[["lat_median", "long_median", "tst_median", "hdg_median"]].median().reset_index()
445+
446+
median_cols = ["lat_median", "long_median", "hdg_median", "tst_median_ns"]
447+
existing_median_cols = [c for c in median_cols if c in df.columns]
448+
449+
median_vars = (df.groupby(cluster_id_vars_on_2nd_level, observed=False)[existing_median_cols].median().reset_index())
450+
451+
if "tst_median_ns" in median_vars.columns:
452+
median_vars["tst_median"] = pd.to_datetime(median_vars["tst_median_ns"], utc=True)
453+
median_vars = median_vars.drop(columns=["tst_median_ns"])
454+
434 455
res = median_vars.merge(clust_counts, on=cluster_id_vars_on_2nd_level, how="outer")
435 456
res = res.merge(clust_delay_feats, on=cluster_id_vars_on_2nd_level, how="outer")
436-
res["oday_min"] = df.oday.min()
437-
res["oday_max"] = df.oday.max()
457+
458+
if "oday" in df.columns:
459+
res["oday_min"] = df["oday"].min()
460+
res["oday_max"] = df["oday"].max()
461+
else:
462+
res["oday_min"] = pd.NaT
463+
res["oday_max"] = pd.NaT
464+
438 465
return res
439 466

440 467

0 commit comments

Comments (0)