Skip to content

Commit c43a082

Browse files
committed
add time added and updated cols
1 parent 17bb6f2 commit c43a082

File tree

3 files changed

+77
-46
lines changed

3 files changed

+77
-46
lines changed

ferry/database/diff_db.py

Lines changed: 56 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,14 @@
1919
queries_dir = Path(__file__).parent / "queries"
2020

2121
primary_keys = {
22-
"flags" : ["flag_id"],
23-
"course_flags" : ["course_id"],
24-
"professors" : ["professor_id"],
25-
"course_professors" : ["course_id"],
26-
"courses" : ["course_id"],
27-
"listings" : ["listing_id"],
28-
}
22+
"flags": ["flag_id"],
23+
"course_flags": ["course_id"],
24+
"professors": ["professor_id"],
25+
"course_professors": ["course_id"],
26+
"courses": ["course_id"],
27+
"listings": ["listing_id"],
28+
}
29+
2930

3031
def get_dfs(database_connect_string: str):
3132
db = Database(database_connect_string)
@@ -35,7 +36,7 @@ def get_dfs(database_connect_string: str):
3536
db_meta.reflect(bind=db.Engine)
3637

3738
conn = db.Engine.connect()
38-
39+
3940
# get table names
4041
query = "SELECT table_name FROM information_schema.tables WHERE table_schema = 'public'"
4142
result = conn.execute(text(query))
@@ -51,44 +52,49 @@ def get_dfs(database_connect_string: str):
5152

5253
return dataframes
5354

55+
5456
def check_change(row, table_name):
5557
cols_to_exclude = {
56-
"flags" : [],
57-
"course_flags" : [],
58-
"professors" : [],
59-
"course_professors" : [],
60-
'courses' : ['same_course_and_profs_id', 'same_course_id', 'same_prof_id', 'last_offered_course_id'],
61-
"listings" : [],
58+
"all": ["time_added", "last_updated"], # ignore the timestamps
59+
"flags": [],
60+
"course_flags": [],
61+
"professors": [],
62+
"course_professors": [],
63+
'courses': ['same_course_and_profs_id', 'same_course_id', 'same_prof_id', 'last_offered_course_id'],
64+
"listings": [],
6265
}
6366

6467
for col_name in row.index.tolist():
6568
if ("_old" not in col_name):
6669
continue
6770
col_name = col_name.replace("_old", "")
68-
69-
if (col_name in cols_to_exclude[table_name]):
70-
return False
71-
71+
72+
if (col_name in cols_to_exclude[table_name] or col_name in cols_to_exclude["all"]):
73+
continue
74+
7275
old_value = row[col_name + "_old"]
7376
new_value = row[col_name + "_new"]
74-
77+
7578
if isinstance(old_value, list) or isinstance(new_value, list):
76-
old_value = ast.literal_eval(str(old_value).replace('"',"'")) # fix quotes
77-
new_value = ast.literal_eval(str(new_value).replace('"',"'"))
79+
old_value = ast.literal_eval(
80+
str(old_value).replace('"', "'")) # fix quotes
81+
new_value = ast.literal_eval(str(new_value).replace('"', "'"))
7882
if (old_value != new_value):
7983
return True
8084
else:
8185
if (not pd.isna(old_value) and
82-
not pd.isna(new_value)):
86+
not pd.isna(new_value)):
8387
if isinstance(old_value, dict) and isinstance(new_value, str):
8488
new_value = json.loads(new_value)
8589
elif isinstance(old_value, (int, float)) and isinstance(new_value, (int, float)):
8690
new_value = float(new_value)
8791
old_value = float(old_value)
8892
else:
89-
90-
old_value = str(old_value).replace('"',"'").replace('\\', '').strip("'")
91-
new_value = str(new_value).replace('"',"'").replace('\\', '').strip("'")
93+
94+
old_value = str(old_value).replace(
95+
'"', "'").replace('\\', '').strip("'")
96+
new_value = str(new_value).replace(
97+
'"', "'").replace('\\', '').strip("'")
9298
# old_value = normalize_unicode(old_value)
9399
# new_value = normalize_unicode(new_value)
94100
try:
@@ -97,62 +103,67 @@ def check_change(row, table_name):
97103
except:
98104
pass
99105
if old_value != new_value:
100-
print(f"column: {col_name}, old: {old_value}, new: {new_value}")
106+
print(
107+
f"column: {col_name}, old: {old_value}, new: {new_value}")
101108
return True
102-
109+
103110
return False
104111

112+
105113
def generate_diff(tables_old: dict[str, pd.DataFrame],
106-
tables_new: dict[str, pd.DataFrame], output_dir:str):
114+
tables_new: dict[str, pd.DataFrame], output_dir: str):
107115

108116
diff_dict = {}
109117

110118
for table_name in primary_keys.keys():
111119
if table_name not in tables_new.keys() or table_name not in tables_old.keys():
112120
raise ValueError(f"Table {table_name} not found in new tables")
113-
121+
114122
print(f"Computing diff for table {table_name} ...", end=" ")
115-
123+
116124
output_file_path = Path(output_dir) / (table_name + ".md")
117-
118125

119126
with open(output_file_path, "w+") as file:
120-
# check difference between old df and new df and output to above file path
127+
# check difference between old df and new df and output to above file path
121128
old_df = tables_old[table_name]
122129
new_df = tables_new[table_name]
123-
130+
124131
# TODO - better way to do this?
125132
pk = primary_keys[table_name][0]
126133

127134
# Identify rows with differences
128-
135+
129136
# check for rows that are in old df but not in new df
130137
# based on primary key
131138
file.write("## Deleted rows in new table: \n")
132139

133140
deleted_rows = old_df[~old_df[pk].isin(new_df[pk])]
134141
if not deleted_rows.empty:
135142
file.write(f"{deleted_rows.to_csv()}\n")
136-
143+
137144
file.write("## Added rows in new table: \n")
138145
# check for rows that have been added
139146
added_rows = new_df[~new_df[pk].isin(old_df[pk])]
140147
if not added_rows.empty:
141148
file.write(f"{added_rows.to_csv()}\n")
142149

143150
if table_name == "course_flags":
144-
old_df = old_df.groupby("course_id")["flag_id"].apply(frozenset)
145-
new_df = new_df.groupby("course_id")["flag_id"].apply(frozenset)
151+
old_df = old_df.groupby("course_id")[
152+
"flag_id"].apply(frozenset)
153+
new_df = new_df.groupby("course_id")[
154+
"flag_id"].apply(frozenset)
146155
elif table_name == "course_professors":
147-
old_df = old_df.groupby("course_id")["professor_id"].apply(frozenset)
148-
new_df = new_df.groupby("course_id")["professor_id"].apply(frozenset)
156+
old_df = old_df.groupby("course_id")[
157+
"professor_id"].apply(frozenset)
158+
new_df = new_df.groupby("course_id")[
159+
"professor_id"].apply(frozenset)
149160

150-
151161
merged_df = pd.merge(old_df, new_df, on=pk,
152162
how="inner", suffixes=('_old', '_new'))
153-
154-
changed_rows = merged_df[merged_df.apply(check_change, args=(table_name,), axis=1)]
155-
163+
164+
changed_rows = merged_df[merged_df.apply(
165+
check_change, args=(table_name,), axis=1)]
166+
156167
file.write("## Changed rows in new table: \n")
157168

158169
if not changed_rows.empty:
@@ -168,7 +179,8 @@ def generate_diff(tables_old: dict[str, pd.DataFrame],
168179

169180
return diff_dict
170181

182+
171183
if __name__ == "__main__":
172184
tables_old = get_dfs("postgresql://postgres:postgres@db:5432/postgres")
173185
tables_new = transform(data_dir=Path("/workspaces/ferry/data"))
174-
generate_diff(tables_old, tables_new, "/workspaces/ferry/diff")
186+
generate_diff(tables_old, tables_new, "/workspaces/ferry/diff")

ferry/database/sync_db_diff.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,9 +55,17 @@ def sync_db(tables: dict[str, pd.DataFrame], database_connect_string: str):
5555
if len(to_add) > 0:
5656
print(f"Adding {len(to_add)} new rows to {table_name}")
5757
for _, row in to_add.iterrows():
58-
columns = ', '.join(row.index)
58+
columns_list = list(row.index)
59+
columns_list.append("time_added")
60+
columns_list.append("last_updated")
61+
columns = ', '.join(columns_list)
62+
63+
values_list = list(row.values)
64+
values_list.append("NOW()")
65+
values_list.append("NOW()")
5966
values = ', '.join(
60-
f"'{str(v)}'" if v is not None else 'NULL' for v in row.values)
67+
f"'{str(v)}'" if v is not None else 'NULL' for v in values_list)
68+
6169
insert_query = f'INSERT INTO {table_name} ({columns}) VALUES ({values});'
6270
conn.execute(text(insert_query))
6371

ferry/timestamps.sql

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
ALTER TABLE courses
2+
ADD COLUMN time_added TIMESTAMP DEFAULT NULL,
3+
ADD COLUMN last_updated TIMESTAMP DEFAULT NULL;
4+
5+
ALTER TABLE listings
6+
ADD COLUMN time_added TIMESTAMP DEFAULT NULL,
7+
ADD COLUMN last_updated TIMESTAMP DEFAULT NULL;
8+
9+
ALTER TABLE course_professors
10+
ADD COLUMN time_added TIMESTAMP DEFAULT NULL,
11+
ADD COLUMN last_updated TIMESTAMP DEFAULT NULL;

0 commit comments

Comments
 (0)