 # import intake
-import pandas as pd
-import geopandas as gpd
-from calitp_data_analysis import geography_utils
-
+import uuid
 
-from update_vars import ANALYSIS_DATE, BORDER_BUFFER_METERS
+import geopandas as gpd
+import pandas as pd
 from segment_speed_utils import helpers
+from update_vars import ANALYSIS_DATE, BORDER_BUFFER_METERS
 from utils import read_census_tracts
-import uuid
 
-def intersection_hash(row):
-    '''
+
+def intersection_hash(row, id_col="tract"):
+    """
     Get unique hash of intersection zones.
     No need to keep both t1 x t2 and t2 x t1
-    '''
-    t1 = int(row.tract_1[2:]) # drop state code
-    t2 = int(row.tract_2[2:])
+    """
+    t1 = int(row[f"{id_col}_1"][2:])  # drop state code
+    t2 = int(row[f"{id_col}_2"][2:])
     row_tracts = [t1, t2]
-    row_tracts.sort() # modifies inplace
+    row_tracts.sort()  # modifies in place
     return hash(tuple(row_tracts))
 
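The hash is order-invariant: both ids are reduced to sorted integers before hashing, so a zone computed as t1 x t2 collides with its mirror t2 x t1 and `drop_duplicates` keeps only one copy. A minimal sanity check with made-up GEOIDs (a plain dict stands in for a DataFrame row here):

```python
# hypothetical tract pair; both orderings hash identically
row_a = {"tract_1": "06001400100", "tract_2": "06001400200"}
row_b = {"tract_1": "06001400200", "tract_2": "06001400100"}
assert intersection_hash(row_a) == intersection_hash(row_b)
```

One design note: unlike strings, Python ints hash deterministically across processes, so casting the GEOIDs to int keeps the hash reproducible between runs.
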
-def find_borders(tracts_gdf: gpd.GeoDataFrame,
-                 border_buffer: int = BORDER_BUFFER_METERS
+
+def find_borders(
+    tracts_gdf: gpd.GeoDataFrame, border_buffer: int = BORDER_BUFFER_METERS, id_col: str = "tract"
 ) -> gpd.GeoDataFrame:
-    '''
-    '''
+    """Buffer each area by border_buffer meters, then self-overlay
+    to find the border zones where adjacent areas meet."""
     tracts_gdf = tracts_gdf.copy()
     tracts_gdf.geometry = tracts_gdf.buffer(border_buffer)
     borders = gpd.overlay(tracts_gdf, tracts_gdf)
-    borders = borders[borders['tract_1'] != borders['tract_2']]
+    borders = borders[borders[f"{id_col}_1"] != borders[f"{id_col}_2"]]
     # for dropping mirrored borders
-    borders['intersection_hash'] = borders.apply(intersection_hash, axis=1)
-    borders = borders.drop_duplicates(subset=['intersection_hash'])
+    borders["intersection_hash"] = borders.apply(intersection_hash, axis=1, id_col=id_col)
+    borders = borders.drop_duplicates(subset=["intersection_hash"])
     # for more elegant tracking
-    borders['intersection_id'] = [str(uuid.uuid4()) for _ in range(borders.shape[0])]
+    borders["intersection_id"] = [str(uuid.uuid4()) for _ in range(borders.shape[0])]
     return borders
 
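A toy sketch of the buffer-and-self-overlay idea, assuming two adjacent squares in a meter-based projected CRS (EPSG:3310, California Albers) and hypothetical tract ids; `gpd.overlay` suffixes the duplicated `tract` column as `tract_1`/`tract_2`, which is what `intersection_hash` expects:

```python
from shapely.geometry import box

toy = gpd.GeoDataFrame(
    {"tract": ["06001", "06002"]},  # made-up ids: 2-digit state code + tract
    geometry=[box(0, 0, 100, 100), box(100, 0, 200, 100)],
    crs="EPSG:3310",
)
zones = find_borders(toy, border_buffer=10)
# one row survives: a band ~2 * border_buffer wide straddling the shared edge
print(zones[["tract_1", "tract_2", "intersection_id"]])
```
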
-def find_shapes_in_tracts_borders(shape_stops, tracts, borders):
-
-    '''
+
+def find_shapes_in_areas_borders(shape_stops, areas, borders, id_col="tract"):
+    """
     sjoin stops to tracts and border zones by GTFS shape.
     create tsi_segment_id equal to tract if a single tract
     or intersection_id if a border zone
-    '''
-    shape_stops_tracts_borders = (pd.concat([tracts, borders])
-                                  .sjoin(shape_stops)
-                                  .drop(columns='index_right')
-                                  )
-
-    shape_stops_tracts_borders = shape_stops_tracts_borders.assign(
-        tsi_segment_id = shape_stops_tracts_borders.tract.combine_first(
-            shape_stops_tracts_borders.intersection_id)
-        .astype(str)
+    """
+    shape_stops_areas_borders = pd.concat([areas, borders]).sjoin(shape_stops).drop(columns="index_right")
+
+    shape_stops_areas_borders = shape_stops_areas_borders.assign(
+        tsi_segment_id=shape_stops_areas_borders[id_col]
+        .combine_first(shape_stops_areas_borders.intersection_id)
+        .astype(str)
     )
-    return shape_stops_tracts_borders
+    return shape_stops_areas_borders
 
 
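The `tsi_segment_id` assignment leans on how `pd.concat` aligns columns: rows from single areas carry a non-null `tract` (or whatever `id_col` names) and a NaN `intersection_id`, while border-zone rows are the reverse, so `combine_first` fills each row from whichever column is populated. In miniature, with hypothetical values:

```python
df = pd.DataFrame(
    {
        "tract": ["06001400100", None],            # row from a single tract
        "intersection_id": [None, "a-zone-uuid"],  # row from a border zone
    }
)
# tract where present, else the border zone's intersection_id
print(df["tract"].combine_first(df["intersection_id"]))
```
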
 if __name__ == "__main__":
-
-    print(f'prepare_tracts_borders {ANALYSIS_DATE}')
+
+    print(f"prepare_tracts_borders {ANALYSIS_DATE}")
     tracts = read_census_tracts(ANALYSIS_DATE)
     shapes = helpers.import_scheduled_shapes(ANALYSIS_DATE)
     borders = find_borders(tracts)
-    st = helpers.import_scheduled_stop_times(analysis_date=ANALYSIS_DATE,
-                                             columns=['feed_key', 'trip_id', 'stop_id'],
-                                             get_pandas=True)
-    trips = helpers.import_scheduled_trips(ANALYSIS_DATE, columns=['shape_array_key', 'trip_id', 'feed_key'])
-    stops = helpers.import_scheduled_stops(ANALYSIS_DATE, columns=['feed_key', 'stop_id', 'geometry'])
-
-    shape_stops = (stops.merge(st, on = ['feed_key', 'stop_id'])
-                   .merge(trips, on = ['feed_key', 'trip_id'])
-                   .drop_duplicates(subset=['feed_key', 'shape_array_key', 'stop_id'])
-                   .dropna()
+    st = helpers.import_scheduled_stop_times(
+        analysis_date=ANALYSIS_DATE, columns=["feed_key", "trip_id", "stop_id"], get_pandas=True
+    )
+    trips = helpers.import_scheduled_trips(ANALYSIS_DATE, columns=["shape_array_key", "trip_id", "feed_key"])
+    stops = helpers.import_scheduled_stops(ANALYSIS_DATE, columns=["feed_key", "stop_id", "geometry"])
+
+    shape_stops = (
+        stops.merge(st, on=["feed_key", "stop_id"])
+        .merge(trips, on=["feed_key", "trip_id"])
+        .drop_duplicates(subset=["feed_key", "shape_array_key", "stop_id"])
+        .dropna()
     )
 
-    borders.to_parquet(f'borders_{ANALYSIS_DATE}.parquet')
-    shape_stops_tracts_borders = find_shapes_in_tracts_borders(shape_stops, tracts, borders)
-    shape_stops_tracts_borders.to_parquet(f'shape_stops_tracts_borders_{ANALYSIS_DATE}.parquet')
+    borders.to_parquet(f"borders_{ANALYSIS_DATE}.parquet")
+    shape_stops_tracts_borders = find_shapes_in_areas_borders(shape_stops, tracts, borders)
+    shape_stops_tracts_borders.to_parquet(f"shape_stops_tracts_borders_{ANALYSIS_DATE}.parquet")
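Downstream readers can load the two outputs back as GeoParquet, e.g. (a sketch; the paths and the `segments` name are assumptions, matching the writes above):

```python
borders = gpd.read_parquet(f"borders_{ANALYSIS_DATE}.parquet")
segments = gpd.read_parquet(f"shape_stops_tracts_borders_{ANALYSIS_DATE}.parquet")
```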