Skip to content

Commit 018ce41

Browse files
committed
fix: use actual course_id for same_course_id
1 parent 065265c commit 018ce41

File tree

2 files changed

+27
-15
lines changed

2 files changed

+27
-15
lines changed

README.md

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -39,6 +39,7 @@ Make sure you have all dependencies installed or are using the Dev Container.
3939

4040
```sh
4141
python main.py -f config/dev_fetch.yml
42+
python main.py -f config/dev_sync_db.yml
4243
```
4344

4445
### Options
@@ -74,6 +75,12 @@ You can selectively run certain stages of Ferry. Options below are in the order
7475
| `--sync-db` | `sync_db` | Sync the transformed data to the database. |
7576
| `--generate-diagram` | `generate_diagram` | Generate a DB visualization diagram to `docs/db_diagram.pdf` |
7677

78+
For example, to test the transformer in isolation, you may find this handy:
79+
80+
```sh
81+
python main.py --transform --snapshot-tables
82+
```
83+
7784
### Release mode
7885

7986
In release mode:

ferry/transform/same_courses.py

Lines changed: 20 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -226,7 +226,10 @@ def resolve_historical_courses(
226226
for code_1, code_2 in itertools.pairwise(course_codes):
227227
cross_listed_codes.add_edge(code_1, code_2)
228228

229-
for code_1, code_2 in [*code_changes, *build_four_digit_transition(listings).items()]:
229+
for code_1, code_2 in [
230+
*code_changes,
231+
*build_four_digit_transition(listings).items(),
232+
]:
230233
cross_listed_codes.add_edge(code_1, code_2)
231234

232235
for subject_1, subject_2 in subject_changes:
@@ -254,7 +257,7 @@ def resolve_historical_courses(
254257
courses["description"].apply(len) >= MIN_DESCRIPTION_MATCH_LEN
255258
].to_dict()
256259

257-
same_courses: list[list[int]] = []
260+
same_course_to_courses: dict[int, list[int]] = {}
258261

259262
for codes in tqdm(
260263
cross_listed_codes,
@@ -281,7 +284,8 @@ def resolve_historical_courses(
281284
title_components = [(i, t, c) for i, (t, c) in enumerate(titles.items())]
282285
# There's no title variation, nothing to match
283286
if len(title_components) == 1:
284-
same_courses.append(list(course_set))
287+
ids = list(course_set)
288+
same_course_to_courses[min(ids)] = ids
285289
continue
286290
same_course_graph = nx.Graph()
287291
# fill in the nodes first to keep courses with no same-code edges
@@ -349,15 +353,15 @@ def resolve_historical_courses(
349353
log_file.write(
350354
f"[WARNING] {'/'.join(c1)} and {'/'.join(c2)} have no code in common\n"
351355
)
352-
same_courses.append(list(x))
356+
ids = list(x)
357+
same_course_to_courses[min(ids)] = ids
353358

354359
for course in set(discussion_course_ids):
355-
same_courses.append([course])
360+
same_course_to_courses[course] = [course]
356361

357362
# map courses to unique same-courses ID, and map same-courses ID to courses
358-
connected_courses = pd.Series(same_courses, name="course_id")
363+
connected_courses = pd.Series(same_course_to_courses, name="course_id")
359364
connected_courses.index.rename("same_course_id", inplace=True)
360-
same_course_to_courses = connected_courses.to_dict()
361365

362366
# map course_id to same-course partition ID
363367
same_course_id = (
@@ -409,14 +413,15 @@ def split_same_professors(
409413
.reset_index()
410414
)
411415

412-
professors_grouped.index.rename("same_course_and_profs_id", inplace=True)
416+
professors_grouped["same_course_and_profs_id"] = professors_grouped[
417+
"course_id"
418+
].apply(min)
419+
same_prof_course_to_courses = professors_grouped.set_index(
420+
"same_course_and_profs_id"
421+
)["course_id"].to_dict()
413422

414-
same_prof_course_to_courses = professors_grouped["course_id"].to_dict()
415-
416-
same_course_and_profs_id = (
417-
professors_grouped.explode("course_id")
418-
.reset_index(drop=False)
419-
.set_index("course_id")["same_course_and_profs_id"]
420-
)
423+
same_course_and_profs_id = professors_grouped.explode("course_id").set_index(
424+
"course_id"
425+
)["same_course_and_profs_id"]
421426

422427
return same_course_and_profs_id, same_prof_course_to_courses

0 commit comments

Comments
 (0)