33from typing import TypedDict , cast , Callable
44from pathlib import Path
55
6+ import numpy as np
67import pandas as pd
78import ujson
89from ferry .crawler .cache import load_cache_json
@@ -217,28 +218,37 @@ def aggregate_flags(
217218 return flags , course_flags
218219
219220
220- def parse_location (location : str ) -> dict [str , str ]:
221- if " - " not in location :
222- if " " not in location :
223- # Just building code
224- return {"building_name" : "" , "code" : location , "number" : "" }
225- # [code] [number]
226- code , number = location .split (" " , 1 )
227- return {"building_name" : "" , "code" : code , "number" : number }
228- abbrev , rest = location .split (" - " , 1 )
229- if " " not in abbrev :
230- if rest == abbrev :
231- rest = ""
232- # [code] - [building name]
233- return {"building_name" : rest , "code" : abbrev , "number" : "" }
234- code , number = abbrev .split (" " , 1 )
235- if not rest .endswith (number ):
221+ def parse_location (location : str ) -> dict [str , str | None ]:
222+ def do_parse ():
223+ if " - " not in location :
224+ if " " not in location :
225+ # Just building code
226+ return {"building_name" : None , "code" : location , "room" : None }
227+ # [code] [room]
228+ code , room = location .split (" " , 1 )
229+ return {"building_name" : None , "code" : code , "room" : room }
230+ abbrev , rest = location .split (" - " , 1 )
231+ if " " not in abbrev :
232+ if rest == abbrev :
233+ rest = None
234+ # [code] - [building name]
235+ return {"building_name" : rest , "code" : abbrev , "room" : None }
236+ code , room = abbrev .split (" " , 1 )
237+ if not rest .endswith (room ):
238+ raise ValueError (f"Unexpected location format: { location } " )
239+ building_full_name = rest .removesuffix (f" { room } " )
240+ if building_full_name == code :
241+ building_full_name = None
242+ # [code] [room] - [building name] [room]
243+ return {"building_name" : building_full_name , "code" : code , "room" : room }
244+
245+ res = do_parse ()
246+ for key in res :
247+ if res [key ] == "" or res [key ] == "TBA" :
248+ res [key ] = None
249+ if res ["code" ] is None :
236250 raise ValueError (f"Unexpected location format: { location } " )
237- building_full_name = rest .removesuffix (f" { number } " )
238- if building_full_name == code :
239- building_full_name = ""
240- # [code] [number] - [building name] [number]
241- return {"building_name" : building_full_name , "code" : code , "number" : number }
251+ return res
242252
243253
244254weekdays = {
@@ -254,12 +264,12 @@ def parse_location(location: str) -> dict[str, str]:
254264
255265def aggregate_locations (
256266 courses : pd .DataFrame , data_dir : Path
257- ) -> tuple [pd .DataFrame , pd .DataFrame ]:
267+ ) -> tuple [pd .DataFrame , pd .DataFrame , pd . DataFrame ]:
258268 location_data = []
259269 for times_by_day in courses ["times_by_day" ]:
260270 for day_details in ujson .loads (times_by_day ).values ():
261271 for [_ , _ , location , location_url ] in day_details :
262- if location == "" or location == "TBA" :
272+ if location == "" or location == "TBA" or location == "TBA TBA" :
263273 continue
264274 location_data .append ({** parse_location (location ), "url" : location_url })
265275 locations = pd .DataFrame (location_data ).drop_duplicates ().reset_index (drop = True )
@@ -270,29 +280,52 @@ def report_multiple_names(row: pd.DataFrame):
270280 f"Multiple names for { row .name } : { row ['building_name' ].unique ()} "
271281 )
272282
273- locations_by_name = locations [locations ["building_name" ] != "" ]. groupby (
274- [ "code" , "number" ]
283+ locations [~ locations ["building_name" ]. isna ()]. groupby ([ "code" , "room" ]). apply (
284+ report_multiple_names
275285 )
276- locations_by_name .apply (report_multiple_names )
277- locations ["building_name" ] = locations ["building_name" ].replace ({"" : None })
278- locations = locations .groupby (["code" , "number" ], as_index = False ).last ()
286+ locations = locations .groupby (["code" , "room" ], as_index = False , dropna = False ).last ()
279287
280288 locations ["location_id" ] = generate_id (
281289 locations ,
282- lambda row : f"{ row ['code' ]} { row ['number' ] } " ,
290+ lambda row : f"{ row ['code' ]} { row ['room' ] or '' } " ,
283291 data_dir / "id_cache" / "location_id.json" ,
284292 )
285- location_to_id = locations .set_index (["code" , "number " ])["location_id" ].to_dict ()
293+ location_to_id = locations .set_index (["code" , "room " ])["location_id" ].to_dict ()
286294 locations = locations .set_index ("location_id" )
287295
296+ buildings = locations .copy (deep = True )
297+ locations .rename (columns = {"code" : "building_code" }, inplace = True )
298+
299+ def report_different_info (group : pd .DataFrame ):
300+ all_names = group ["building_name" ].unique ()
301+ all_names = all_names [~ pd .isna (all_names )]
302+ if len (all_names ) > 1 :
303+ logging .warning (
304+ f"Multiple building names for building { group .name } : { all_names } ; only the last name will be used"
305+ )
306+ all_urls = group ["url" ].unique ()
307+ all_urls = all_urls [~ pd .isna (all_urls )]
308+ if len (all_urls ) > 1 :
309+ logging .warning (
310+ f"Multiple URLs for building { group .name } : { all_urls } ; only the last URL will be used"
311+ )
312+
313+ buildings .groupby ("code" ).apply (report_different_info )
314+ buildings = (
315+ buildings .sort_values (by = ["building_name" , "url" ])
316+ .groupby ("code" )
317+ .last ()
318+ .reset_index ()
319+ )
320+
288321 # For each course, go from { day_of_week: [start, end, location, url] } to
289322 # an array of { day_of_week, start_time, end_time, location_id }
290323 def to_course_meetings (times_by_day : str ):
291324 data = ujson .loads (times_by_day )
292325 meetings = []
293326 for day , day_details in data .items ():
294327 for [start , end , location , _ ] in day_details :
295- if location == "" or location == "TBA" :
328+ if location == "" or location == "TBA" or location == "TBA TBA" :
296329 meetings .append (
297330 {
298331 "days_of_week" : weekdays [day ],
@@ -309,7 +342,7 @@ def to_course_meetings(times_by_day: str):
309342 "start_time" : start ,
310343 "end_time" : end ,
311344 "location_id" : location_to_id [
312- location_info ["code" ], location_info ["number" ]
345+ location_info ["code" ], location_info ["room" ] or np . nan
313346 ],
314347 }
315348 )
@@ -318,9 +351,7 @@ def to_course_meetings(times_by_day: str):
318351 course_meetings = courses ["times_by_day" ].apply (to_course_meetings )
319352 # course_meetings is a series of course_id -> list of dicts.
320353 # Explode it to get a DataFrame with course_id, days_of_week, start_time, end_time, location_id
321- course_meetings = (
322- course_meetings .explode ().dropna ().apply (pd .Series ).reset_index ()
323- )
354+ course_meetings = course_meetings .explode ().dropna ().apply (pd .Series ).reset_index ()
324355 # Merge rows with the same start/end/location
325356 course_meetings = (
326357 course_meetings .groupby (["course_id" , "start_time" , "end_time" , "location_id" ])
@@ -329,7 +360,7 @@ def to_course_meetings(times_by_day: str):
329360 )
330361 course_meetings ["days_of_week" ] = course_meetings ["days_of_week" ].astype (int )
331362 course_meetings ["location_id" ] = course_meetings ["location_id" ].astype (int )
332- return course_meetings , locations
363+ return course_meetings , locations , buildings
333364
334365
335366class CourseTables (TypedDict ):
@@ -341,6 +372,7 @@ class CourseTables(TypedDict):
341372 flags : pd .DataFrame
342373 course_meetings : pd .DataFrame
343374 locations : pd .DataFrame
375+ buildings : pd .DataFrame
344376
345377
346378def import_courses (data_dir : Path , seasons : list [str ]) -> CourseTables :
@@ -398,7 +430,7 @@ def import_courses(data_dir: Path, seasons: list[str]) -> CourseTables:
398430
399431 professors , course_professors = aggregate_professors (courses , data_dir )
400432 flags , course_flags = aggregate_flags (courses , data_dir )
401- course_meetings , locations = aggregate_locations (courses , data_dir )
433+ course_meetings , locations , buildings = aggregate_locations (courses , data_dir )
402434
403435 print ("\033 [F" , end = "" )
404436 print ("Importing courses... ✔" )
@@ -410,8 +442,9 @@ def import_courses(data_dir: Path, seasons: list[str]) -> CourseTables:
410442 print (f"Total professors: { len (professors )} " )
411443 print (f"Total course-flags: { len (course_flags )} " )
412444 print (f"Total flags: { len (flags )} " )
413- print (f"Total locations: { len (locations )} " )
414445 print (f"Total course-meetings: { len (course_meetings )} " )
446+ print (f"Total locations: { len (locations )} " )
447+ print (f"Total buildings: { len (buildings )} " )
415448
416449 return {
417450 "courses" : courses ,
@@ -421,5 +454,6 @@ def import_courses(data_dir: Path, seasons: list[str]) -> CourseTables:
421454 "course_flags" : course_flags ,
422455 "flags" : flags ,
423456 "locations" : locations ,
457+ "buildings" : buildings ,
424458 "course_meetings" : course_meetings ,
425459 }
0 commit comments