Skip to content

Commit a97cf7b

Browse files
committed
refactor: split locations into more tables
1 parent e5171dd commit a97cf7b

File tree

6 files changed

+111
-45
lines changed

6 files changed

+111
-45
lines changed

docs/db_diagram.pdf

271 Bytes
Binary file not shown.

ferry/database/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
EvaluationStatistics,
1414
Flag,
1515
course_meetings,
16+
Building,
1617
Location,
1718
Listing,
1819
Professor,

ferry/database/models.py

Lines changed: 33 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -476,22 +476,50 @@ class Flag(BaseModel):
476476
)
477477

478478

479+
class Building(BaseModel):
480+
"""
481+
Buildings table.
482+
"""
483+
484+
__tablename__ = "buildings"
485+
486+
code = Column(
487+
String,
488+
comment="Building short code/abbreviation, as in YCS",
489+
index=True,
490+
nullable=False,
491+
primary_key=True,
492+
)
493+
building_name = Column(String, comment="Building full name")
494+
url = Column(String, comment="Yale campus map URL")
495+
496+
479497
class Location(BaseModel):
480498
"""
481499
Locations table.
482500
"""
483501

484502
__tablename__ = "locations"
485503

486-
location_id = Column(Integer, comment="Location ID", primary_key=True)
487-
building_name = Column(String, comment="Building full name")
488-
code = Column(
504+
location_id = Column(Integer, primary_key=True)
505+
building_code = Column(
489506
String,
490-
comment="Building short code/abbreviation, as in YCS",
507+
ForeignKey("buildings.code"),
508+
comment="Building code",
491509
index=True,
492510
nullable=False,
493511
)
494-
number = Column(String, comment="Room number", index=True)
512+
building = relationship("Building", backref="locations", cascade="all")
513+
room = Column(String, comment="Room number")
514+
515+
__table_args__ = (
516+
Index(
517+
"idx_building_code_room_unique",
518+
"building_code",
519+
"room",
520+
unique=True,
521+
),
522+
)
495523

496524

497525
class Professor(BaseModel):

ferry/transform/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@ def transform(data_dir: Path) -> dict[str, pd.DataFrame]:
101101
("course_flags", database.course_flags),
102102
("course_meetings", database.course_meetings),
103103
("locations", database.Location.__table__),
104+
("buildings", database.Building.__table__),
104105
("evaluation_questions", database.EvaluationQuestion.__table__),
105106
("evaluation_narratives", database.EvaluationNarrative.__table__),
106107
("evaluation_statistics", database.EvaluationStatistics.__table__),

ferry/transform/cache_id.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,9 +67,11 @@ def save_id_cache(tables: dict[str, pd.DataFrame], data_dir: Path):
6767
listing_to_id = {f"{k[0]}-{k[1]}": v for k, v in listing_to_id.items()}
6868
flag_to_id = tables["flags"].set_index("flag_text")["flag_id"].to_dict()
6969
location_to_id = (
70-
tables["locations"].set_index(["code", "number"])["location_id"].to_dict()
70+
tables["locations"]
71+
.set_index(["building_code", "room"])["location_id"]
72+
.to_dict()
7173
)
72-
location_to_id = {f"{k[0]} {k[1]}": v for k, v in location_to_id.items()}
74+
location_to_id = {f"{k[0]} {k[1] or ''}": v for k, v in location_to_id.items()}
7375
professor_to_id = (
7476
tables["professors"].set_index(["name", "email"])["professor_id"].to_dict()
7577
)

ferry/transform/import_courses.py

Lines changed: 72 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from typing import TypedDict, cast, Callable
44
from pathlib import Path
55

6+
import numpy as np
67
import pandas as pd
78
import ujson
89
from ferry.crawler.cache import load_cache_json
@@ -217,28 +218,37 @@ def aggregate_flags(
217218
return flags, course_flags
218219

219220

220-
def parse_location(location: str) -> dict[str, str]:
221-
if " - " not in location:
222-
if " " not in location:
223-
# Just building code
224-
return {"building_name": "", "code": location, "number": ""}
225-
# [code] [number]
226-
code, number = location.split(" ", 1)
227-
return {"building_name": "", "code": code, "number": number}
228-
abbrev, rest = location.split(" - ", 1)
229-
if " " not in abbrev:
230-
if rest == abbrev:
231-
rest = ""
232-
# [code] - [building name]
233-
return {"building_name": rest, "code": abbrev, "number": ""}
234-
code, number = abbrev.split(" ", 1)
235-
if not rest.endswith(number):
221+
def parse_location(location: str) -> dict[str, str | None]:
222+
def do_parse():
223+
if " - " not in location:
224+
if " " not in location:
225+
# Just building code
226+
return {"building_name": None, "code": location, "room": None}
227+
# [code] [room]
228+
code, room = location.split(" ", 1)
229+
return {"building_name": None, "code": code, "room": room}
230+
abbrev, rest = location.split(" - ", 1)
231+
if " " not in abbrev:
232+
if rest == abbrev:
233+
rest = None
234+
# [code] - [building name]
235+
return {"building_name": rest, "code": abbrev, "room": None}
236+
code, room = abbrev.split(" ", 1)
237+
if not rest.endswith(room):
238+
raise ValueError(f"Unexpected location format: {location}")
239+
building_full_name = rest.removesuffix(f" {room}")
240+
if building_full_name == code:
241+
building_full_name = None
242+
# [code] [room] - [building name] [room]
243+
return {"building_name": building_full_name, "code": code, "room": room}
244+
245+
res = do_parse()
246+
for key in res:
247+
if res[key] == "" or res[key] == "TBA":
248+
res[key] = None
249+
if res["code"] is None:
236250
raise ValueError(f"Unexpected location format: {location}")
237-
building_full_name = rest.removesuffix(f" {number}")
238-
if building_full_name == code:
239-
building_full_name = ""
240-
# [code] [number] - [building name] [number]
241-
return {"building_name": building_full_name, "code": code, "number": number}
251+
return res
242252

243253

244254
weekdays = {
@@ -254,12 +264,12 @@ def parse_location(location: str) -> dict[str, str]:
254264

255265
def aggregate_locations(
256266
courses: pd.DataFrame, data_dir: Path
257-
) -> tuple[pd.DataFrame, pd.DataFrame]:
267+
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
258268
location_data = []
259269
for times_by_day in courses["times_by_day"]:
260270
for day_details in ujson.loads(times_by_day).values():
261271
for [_, _, location, location_url] in day_details:
262-
if location == "" or location == "TBA":
272+
if location == "" or location == "TBA" or location == "TBA TBA":
263273
continue
264274
location_data.append({**parse_location(location), "url": location_url})
265275
locations = pd.DataFrame(location_data).drop_duplicates().reset_index(drop=True)
@@ -270,29 +280,52 @@ def report_multiple_names(row: pd.DataFrame):
270280
f"Multiple names for {row.name}: {row['building_name'].unique()}"
271281
)
272282

273-
locations_by_name = locations[locations["building_name"] != ""].groupby(
274-
["code", "number"]
283+
locations[~locations["building_name"].isna()].groupby(["code", "room"]).apply(
284+
report_multiple_names
275285
)
276-
locations_by_name.apply(report_multiple_names)
277-
locations["building_name"] = locations["building_name"].replace({"": None})
278-
locations = locations.groupby(["code", "number"], as_index=False).last()
286+
locations = locations.groupby(["code", "room"], as_index=False, dropna=False).last()
279287

280288
locations["location_id"] = generate_id(
281289
locations,
282-
lambda row: f"{row['code']} {row['number']}",
290+
lambda row: f"{row['code']} {row['room'] or ''}",
283291
data_dir / "id_cache" / "location_id.json",
284292
)
285-
location_to_id = locations.set_index(["code", "number"])["location_id"].to_dict()
293+
location_to_id = locations.set_index(["code", "room"])["location_id"].to_dict()
286294
locations = locations.set_index("location_id")
287295

296+
buildings = locations.copy(deep=True)
297+
locations.rename(columns={"code": "building_code"}, inplace=True)
298+
299+
def report_different_info(group: pd.DataFrame):
300+
all_names = group["building_name"].unique()
301+
all_names = all_names[~pd.isna(all_names)]
302+
if len(all_names) > 1:
303+
logging.warning(
304+
f"Multiple building names for building {group.name}: {all_names}; only the last name will be used"
305+
)
306+
all_urls = group["url"].unique()
307+
all_urls = all_urls[~pd.isna(all_urls)]
308+
if len(all_urls) > 1:
309+
logging.warning(
310+
f"Multiple URLs for building {group.name}: {all_urls}; only the last URL will be used"
311+
)
312+
313+
buildings.groupby("code").apply(report_different_info)
314+
buildings = (
315+
buildings.sort_values(by=["building_name", "url"])
316+
.groupby("code")
317+
.last()
318+
.reset_index()
319+
)
320+
288321
# For each course, go from { day_of_week: [start, end, location, url] } to
289322
# an array of { day_of_week, start_time, end_time, location_id }
290323
def to_course_meetings(times_by_day: str):
291324
data = ujson.loads(times_by_day)
292325
meetings = []
293326
for day, day_details in data.items():
294327
for [start, end, location, _] in day_details:
295-
if location == "" or location == "TBA":
328+
if location == "" or location == "TBA" or location == "TBA TBA":
296329
meetings.append(
297330
{
298331
"days_of_week": weekdays[day],
@@ -309,7 +342,7 @@ def to_course_meetings(times_by_day: str):
309342
"start_time": start,
310343
"end_time": end,
311344
"location_id": location_to_id[
312-
location_info["code"], location_info["number"]
345+
location_info["code"], location_info["room"] or np.nan
313346
],
314347
}
315348
)
@@ -318,9 +351,7 @@ def to_course_meetings(times_by_day: str):
318351
course_meetings = courses["times_by_day"].apply(to_course_meetings)
319352
# course_meetings is a series of course_id -> list of dicts.
320353
# Explode it to get a DataFrame with course_id, days_of_week, start_time, end_time, location_id
321-
course_meetings = (
322-
course_meetings.explode().dropna().apply(pd.Series).reset_index()
323-
)
354+
course_meetings = course_meetings.explode().dropna().apply(pd.Series).reset_index()
324355
# Merge rows with the same start/end/location
325356
course_meetings = (
326357
course_meetings.groupby(["course_id", "start_time", "end_time", "location_id"])
@@ -329,7 +360,7 @@ def to_course_meetings(times_by_day: str):
329360
)
330361
course_meetings["days_of_week"] = course_meetings["days_of_week"].astype(int)
331362
course_meetings["location_id"] = course_meetings["location_id"].astype(int)
332-
return course_meetings, locations
363+
return course_meetings, locations, buildings
333364

334365

335366
class CourseTables(TypedDict):
@@ -341,6 +372,7 @@ class CourseTables(TypedDict):
341372
flags: pd.DataFrame
342373
course_meetings: pd.DataFrame
343374
locations: pd.DataFrame
375+
buildings: pd.DataFrame
344376

345377

346378
def import_courses(data_dir: Path, seasons: list[str]) -> CourseTables:
@@ -398,7 +430,7 @@ def import_courses(data_dir: Path, seasons: list[str]) -> CourseTables:
398430

399431
professors, course_professors = aggregate_professors(courses, data_dir)
400432
flags, course_flags = aggregate_flags(courses, data_dir)
401-
course_meetings, locations = aggregate_locations(courses, data_dir)
433+
course_meetings, locations, buildings = aggregate_locations(courses, data_dir)
402434

403435
print("\033[F", end="")
404436
print("Importing courses... ✔")
@@ -410,8 +442,9 @@ def import_courses(data_dir: Path, seasons: list[str]) -> CourseTables:
410442
print(f"Total professors: {len(professors)}")
411443
print(f"Total course-flags: {len(course_flags)}")
412444
print(f"Total flags: {len(flags)}")
413-
print(f"Total locations: {len(locations)}")
414445
print(f"Total course-meetings: {len(course_meetings)}")
446+
print(f"Total locations: {len(locations)}")
447+
print(f"Total buildings: {len(buildings)}")
415448

416449
return {
417450
"courses": courses,
@@ -421,5 +454,6 @@ def import_courses(data_dir: Path, seasons: list[str]) -> CourseTables:
421454
"course_flags": course_flags,
422455
"flags": flags,
423456
"locations": locations,
457+
"buildings": buildings,
424458
"course_meetings": course_meetings,
425459
}

0 commit comments

Comments
 (0)