refactor: split locations into more tables

Josh-Cena · Josh-Cena · commit a97cf7bd2abc · 2024-11-12T02:06:03.000-05:00
diff --git a/docs/db_diagram.pdf b/docs/db_diagram.pdf
diff --git a/ferry/database/__init__.py b/ferry/database/__init__.py
@@ -13,6 +13,7 @@
     EvaluationStatistics,
     Flag,
     course_meetings,
+    Building,
     Location,
     Listing,
     Professor,
diff --git a/ferry/database/models.py b/ferry/database/models.py
@@ -476,22 +476,50 @@ class Flag(BaseModel):
 )
 
 
+class Building(BaseModel):
+    """
+    Buildings table.
+    """
+
+    __tablename__ = "buildings"
+
+    code = Column(
+        String,
+        comment="Building short code/abbreviation, as in YCS",
+        index=True,
+        nullable=False,
+        primary_key=True,
+    )
+    building_name = Column(String, comment="Building full name")
+    url = Column(String, comment="Yale campus map URL")
+
+
 class Location(BaseModel):
     """
     Locations table.
     """
 
     __tablename__ = "locations"
 
-    location_id = Column(Integer, comment="Location ID", primary_key=True)
-    building_name = Column(String, comment="Building full name")
-    code = Column(
+    location_id = Column(Integer, primary_key=True)
+    building_code = Column(
         String,
-        comment="Building short code/abbreviation, as in YCS",
+        ForeignKey("buildings.code"),
+        comment="Building code",
         index=True,
         nullable=False,
     )
-    number = Column(String, comment="Room number", index=True)
+    building = relationship("Building", backref="locations", cascade="all")
+    room = Column(String, comment="Room number")
+
+    __table_args__ = (
+        Index(
+            "idx_building_code_room_unique",
+            "building_code",
+            "room",
+            unique=True,
+        ),
+    )
 
 
 class Professor(BaseModel):
diff --git a/ferry/transform/__init__.py b/ferry/transform/__init__.py
@@ -101,6 +101,7 @@ def transform(data_dir: Path) -> dict[str, pd.DataFrame]:
         ("course_flags", database.course_flags),
         ("course_meetings", database.course_meetings),
         ("locations", database.Location.__table__),
+        ("buildings", database.Building.__table__),
         ("evaluation_questions", database.EvaluationQuestion.__table__),
         ("evaluation_narratives", database.EvaluationNarrative.__table__),
         ("evaluation_statistics", database.EvaluationStatistics.__table__),
diff --git a/ferry/transform/cache_id.py b/ferry/transform/cache_id.py
@@ -67,9 +67,11 @@ def save_id_cache(tables: dict[str, pd.DataFrame], data_dir: Path):
     listing_to_id = {f"{k[0]}-{k[1]}": v for k, v in listing_to_id.items()}
     flag_to_id = tables["flags"].set_index("flag_text")["flag_id"].to_dict()
     location_to_id = (
-        tables["locations"].set_index(["code", "number"])["location_id"].to_dict()
+        tables["locations"]
+        .set_index(["building_code", "room"])["location_id"]
+        .to_dict()
     )
-    location_to_id = {f"{k[0]} {k[1]}": v for k, v in location_to_id.items()}
+    location_to_id = {f"{k[0]} {k[1] or ''}": v for k, v in location_to_id.items()}
     professor_to_id = (
         tables["professors"].set_index(["name", "email"])["professor_id"].to_dict()
     )
diff --git a/ferry/transform/import_courses.py b/ferry/transform/import_courses.py
@@ -3,6 +3,7 @@
 from typing import TypedDict, cast, Callable
 from pathlib import Path
 
+import numpy as np
 import pandas as pd
 import ujson
 from ferry.crawler.cache import load_cache_json
@@ -217,28 +218,37 @@ def aggregate_flags(
     return flags, course_flags
 
 
-def parse_location(location: str) -> dict[str, str]:
-    if " - " not in location:
-        if " " not in location:
-            # Just building code
-            return {"building_name": "", "code": location, "number": ""}
-        # [code] [number]
-        code, number = location.split(" ", 1)
-        return {"building_name": "", "code": code, "number": number}
-    abbrev, rest = location.split(" - ", 1)
-    if " " not in abbrev:
-        if rest == abbrev:
-            rest = ""
-        # [code] - [building name]
-        return {"building_name": rest, "code": abbrev, "number": ""}
-    code, number = abbrev.split(" ", 1)
-    if not rest.endswith(number):
+def parse_location(location: str) -> dict[str, str | None]:
+    def do_parse():
+        if " - " not in location:
+            if " " not in location:
+                # Just building code
+                return {"building_name": None, "code": location, "room": None}
+            # [code] [room]
+            code, room = location.split(" ", 1)
+            return {"building_name": None, "code": code, "room": room}
+        abbrev, rest = location.split(" - ", 1)
+        if " " not in abbrev:
+            if rest == abbrev:
+                rest = None
+            # [code] - [building name]
+            return {"building_name": rest, "code": abbrev, "room": None}
+        code, room = abbrev.split(" ", 1)
+        if not rest.endswith(room):
+            raise ValueError(f"Unexpected location format: {location}")
+        building_full_name = rest.removesuffix(f" {room}")
+        if building_full_name == code:
+            building_full_name = None
+        # [code] [room] - [building name] [room]
+        return {"building_name": building_full_name, "code": code, "room": room}
+
+    res = do_parse()
+    for key in res:
+        if res[key] == "" or res[key] == "TBA":
+            res[key] = None
+    if res["code"] is None:
         raise ValueError(f"Unexpected location format: {location}")
-    building_full_name = rest.removesuffix(f" {number}")
-    if building_full_name == code:
-        building_full_name = ""
-    # [code] [number] - [building name] [number]
-    return {"building_name": building_full_name, "code": code, "number": number}
+    return res
 
 
 weekdays = {
@@ -254,12 +264,12 @@ def parse_location(location: str) -> dict[str, str]:
 
 def aggregate_locations(
     courses: pd.DataFrame, data_dir: Path
-) -> tuple[pd.DataFrame, pd.DataFrame]:
+) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
     location_data = []
     for times_by_day in courses["times_by_day"]:
         for day_details in ujson.loads(times_by_day).values():
             for [_, _, location, location_url] in day_details:
-                if location == "" or location == "TBA":
+                if location == "" or location == "TBA" or location == "TBA TBA":
                     continue
                 location_data.append({**parse_location(location), "url": location_url})
     locations = pd.DataFrame(location_data).drop_duplicates().reset_index(drop=True)
@@ -270,29 +280,52 @@ def report_multiple_names(row: pd.DataFrame):
                 f"Multiple names for {row.name}: {row['building_name'].unique()}"
             )
 
-    locations_by_name = locations[locations["building_name"] != ""].groupby(
-        ["code", "number"]
+    locations[~locations["building_name"].isna()].groupby(["code", "room"]).apply(
+        report_multiple_names
     )
-    locations_by_name.apply(report_multiple_names)
-    locations["building_name"] = locations["building_name"].replace({"": None})
-    locations = locations.groupby(["code", "number"], as_index=False).last()
+    locations = locations.groupby(["code", "room"], as_index=False, dropna=False).last()
 
     locations["location_id"] = generate_id(
         locations,
-        lambda row: f"{row['code']} {row['number']}",
+        lambda row: f"{row['code']} {row['room'] or ''}",
         data_dir / "id_cache" / "location_id.json",
     )
-    location_to_id = locations.set_index(["code", "number"])["location_id"].to_dict()
+    location_to_id = locations.set_index(["code", "room"])["location_id"].to_dict()
     locations = locations.set_index("location_id")
 
+    buildings = locations.copy(deep=True)
+    locations.rename(columns={"code": "building_code"}, inplace=True)
+
+    def report_different_info(group: pd.DataFrame):
+        all_names = group["building_name"].unique()
+        all_names = all_names[~pd.isna(all_names)]
+        if len(all_names) > 1:
+            logging.warning(
+                f"Multiple building names for building {group.name}: {all_names}; only the last name will be used"
+            )
+        all_urls = group["url"].unique()
+        all_urls = all_urls[~pd.isna(all_urls)]
+        if len(all_urls) > 1:
+            logging.warning(
+                f"Multiple URLs for building {group.name}: {all_urls}; only the last URL will be used"
+            )
+
+    buildings.groupby("code").apply(report_different_info)
+    buildings = (
+        buildings.sort_values(by=["building_name", "url"])
+        .groupby("code")
+        .last()
+        .reset_index()
+    )
+
     # For each course, go from { day_of_week: [start, end, location, url] } to
     # an array of { day_of_week, start_time, end_time, location_id }
     def to_course_meetings(times_by_day: str):
         data = ujson.loads(times_by_day)
         meetings = []
         for day, day_details in data.items():
             for [start, end, location, _] in day_details:
-                if location == "" or location == "TBA":
+                if location == "" or location == "TBA" or location == "TBA TBA":
                     meetings.append(
                         {
                             "days_of_week": weekdays[day],
@@ -309,7 +342,7 @@ def to_course_meetings(times_by_day: str):
                         "start_time": start,
                         "end_time": end,
                         "location_id": location_to_id[
-                            location_info["code"], location_info["number"]
+                            location_info["code"], location_info["room"] or np.nan
                         ],
                     }
                 )
@@ -318,9 +351,7 @@ def to_course_meetings(times_by_day: str):
     course_meetings = courses["times_by_day"].apply(to_course_meetings)
     # course_meetings is a series of course_id -> list of dicts.
     # Explode it to get a DataFrame with course_id, days_of_week, start_time, end_time, location_id
-    course_meetings = (
-        course_meetings.explode().dropna().apply(pd.Series).reset_index()
-    )
+    course_meetings = course_meetings.explode().dropna().apply(pd.Series).reset_index()
     # Merge rows with the same start/end/location
     course_meetings = (
         course_meetings.groupby(["course_id", "start_time", "end_time", "location_id"])
@@ -329,7 +360,7 @@ def to_course_meetings(times_by_day: str):
     )
     course_meetings["days_of_week"] = course_meetings["days_of_week"].astype(int)
     course_meetings["location_id"] = course_meetings["location_id"].astype(int)
-    return course_meetings, locations
+    return course_meetings, locations, buildings
 
 
 class CourseTables(TypedDict):
@@ -341,6 +372,7 @@ class CourseTables(TypedDict):
     flags: pd.DataFrame
     course_meetings: pd.DataFrame
     locations: pd.DataFrame
+    buildings: pd.DataFrame
 
 
 def import_courses(data_dir: Path, seasons: list[str]) -> CourseTables:
@@ -398,7 +430,7 @@ def import_courses(data_dir: Path, seasons: list[str]) -> CourseTables:
 
     professors, course_professors = aggregate_professors(courses, data_dir)
     flags, course_flags = aggregate_flags(courses, data_dir)
-    course_meetings, locations = aggregate_locations(courses, data_dir)
+    course_meetings, locations, buildings = aggregate_locations(courses, data_dir)
 
     print("\033[F", end="")
     print("Importing courses... ✔")
@@ -410,8 +442,9 @@ def import_courses(data_dir: Path, seasons: list[str]) -> CourseTables:
     print(f"Total professors: {len(professors)}")
     print(f"Total course-flags: {len(course_flags)}")
     print(f"Total flags: {len(flags)}")
-    print(f"Total locations: {len(locations)}")
     print(f"Total course-meetings: {len(course_meetings)}")
+    print(f"Total locations: {len(locations)}")
+    print(f"Total buildings: {len(buildings)}")
 
     return {
         "courses": courses,
@@ -421,5 +454,6 @@ def import_courses(data_dir: Path, seasons: list[str]) -> CourseTables:
         "course_flags": course_flags,
         "flags": flags,
         "locations": locations,
+        "buildings": buildings,
         "course_meetings": course_meetings,
     }

Original file line number	Diff line number	Diff line change
`@@ -67,9 +67,11 @@ def save_id_cache(tables: dict[str, pd.DataFrame], data_dir: Path):`
`67`	`67`	`listing_to_id = {f"{k[0]}-{k[1]}": v for k, v in listing_to_id.items()}`
`68`	`68`	`flag_to_id = tables["flags"].set_index("flag_text")["flag_id"].to_dict()`
`69`	`69`	`location_to_id = (`
`70`		`- tables["locations"].set_index(["code", "number"])["location_id"].to_dict()`
	`70`	`+ tables["locations"]`
	`71`	`+ .set_index(["building_code", "room"])["location_id"]`
	`72`	`+ .to_dict()`
`71`	`73`	`)`
`72`		`- location_to_id = {f"{k[0]} {k[1]}": v for k, v in location_to_id.items()}`
	`74`	`+ location_to_id = {f"{k[0]} {k[1] or ''}": v for k, v in location_to_id.items()}`
`73`	`75`	`professor_to_id = (`
`74`	`76`	`tables["professors"].set_index(["name", "email"])["professor_id"].to_dict()`
`75`	`77`	`)`