1919queries_dir = Path (__file__ ).parent / "queries"
2020
2121primary_keys = {
22- "flags" : ["flag_id" ],
23- "course_flags" : ["course_id" ],
24- "professors" : ["professor_id" ],
25- "course_professors" : ["course_id" ],
26- "courses" : ["course_id" ],
27- "listings" : ["listing_id" ],
28- }
22+ "flags" : ["flag_id" ],
23+ "course_flags" : ["course_id" ],
24+ "professors" : ["professor_id" ],
25+ "course_professors" : ["course_id" ],
26+ "courses" : ["course_id" ],
27+ "listings" : ["listing_id" ],
28+ }
29+
2930
3031def get_dfs (database_connect_string : str ):
3132 db = Database (database_connect_string )
@@ -35,7 +36,7 @@ def get_dfs(database_connect_string: str):
3536 db_meta .reflect (bind = db .Engine )
3637
3738 conn = db .Engine .connect ()
38-
39+
3940 # get table names
4041 query = "SELECT table_name FROM information_schema.tables WHERE table_schema = 'public'"
4142 result = conn .execute (text (query ))
@@ -51,44 +52,49 @@ def get_dfs(database_connect_string: str):
5152
5253 return dataframes
5354
55+
5456def check_change (row , table_name ):
5557 cols_to_exclude = {
56- "flags" : [],
57- "course_flags" : [],
58- "professors" : [],
59- "course_professors" : [],
60- 'courses' : ['same_course_and_profs_id' , 'same_course_id' , 'same_prof_id' , 'last_offered_course_id' ],
61- "listings" : [],
58+ "all" : ["time_added" , "last_updated" ], # ignore the timestamps
59+ "flags" : [],
60+ "course_flags" : [],
61+ "professors" : [],
62+ "course_professors" : [],
63+ 'courses' : ['same_course_and_profs_id' , 'same_course_id' , 'same_prof_id' , 'last_offered_course_id' ],
64+ "listings" : [],
6265 }
6366
6467 for col_name in row .index .tolist ():
6568 if ("_old" not in col_name ):
6669 continue
6770 col_name = col_name .replace ("_old" , "" )
68-
69- if (col_name in cols_to_exclude [table_name ]):
70- return False
71-
71+
72+ if (col_name in cols_to_exclude [table_name ] or col_name in cols_to_exclude [ "all" ] ):
73+ continue
74+
7275 old_value = row [col_name + "_old" ]
7376 new_value = row [col_name + "_new" ]
74-
77+
7578 if isinstance (old_value , list ) or isinstance (new_value , list ):
76- old_value = ast .literal_eval (str (old_value ).replace ('"' ,"'" )) # fix quotes
77- new_value = ast .literal_eval (str (new_value ).replace ('"' ,"'" ))
79+ old_value = ast .literal_eval (
80+ str (old_value ).replace ('"' , "'" )) # fix quotes
81+ new_value = ast .literal_eval (str (new_value ).replace ('"' , "'" ))
7882 if (old_value != new_value ):
7983 return True
8084 else :
8185 if (not pd .isna (old_value ) and
82- not pd .isna (new_value )):
86+ not pd .isna (new_value )):
8387 if isinstance (old_value , dict ) and isinstance (new_value , str ):
8488 new_value = json .loads (new_value )
8589 elif isinstance (old_value , (int , float )) and isinstance (new_value , (int , float )):
8690 new_value = float (new_value )
8791 old_value = float (old_value )
8892 else :
89-
90- old_value = str (old_value ).replace ('"' ,"'" ).replace ('\\ ' , '' ).strip ("'" )
91- new_value = str (new_value ).replace ('"' ,"'" ).replace ('\\ ' , '' ).strip ("'" )
93+
94+ old_value = str (old_value ).replace (
95+ '"' , "'" ).replace ('\\ ' , '' ).strip ("'" )
96+ new_value = str (new_value ).replace (
97+ '"' , "'" ).replace ('\\ ' , '' ).strip ("'" )
9298 # old_value = normalize_unicode(old_value)
9399 # new_value = normalize_unicode(new_value)
94100 try :
@@ -97,62 +103,67 @@ def check_change(row, table_name):
97103 except :
98104 pass
99105 if old_value != new_value :
100- print (f"column: { col_name } , old: { old_value } , new: { new_value } " )
106+ print (
107+ f"column: { col_name } , old: { old_value } , new: { new_value } " )
101108 return True
102-
109+
103110 return False
104111
112+
105113def generate_diff (tables_old : dict [str , pd .DataFrame ],
106- tables_new : dict [str , pd .DataFrame ], output_dir :str ):
114+ tables_new : dict [str , pd .DataFrame ], output_dir : str ):
107115
108116 diff_dict = {}
109117
110118 for table_name in primary_keys .keys ():
111119 if table_name not in tables_new .keys () or table_name not in tables_old .keys ():
112120 raise ValueError (f"Table { table_name } not found in new tables" )
113-
121+
114122 print (f"Computing diff for table { table_name } ..." , end = " " )
115-
123+
116124 output_file_path = Path (output_dir ) / (table_name + ".md" )
117-
118125
119126 with open (output_file_path , "w+" ) as file :
120- # check difference between old df and new df and output to above file path
127+ # check difference between old df and new df and output to above file path
121128 old_df = tables_old [table_name ]
122129 new_df = tables_new [table_name ]
123-
130+
124131 # TODO - better way to do this?
125132 pk = primary_keys [table_name ][0 ]
126133
127134 # Identify rows with differences
128-
135+
129136 # check for rows that are in old df but not in new df
130137 # based on primary key
131138 file .write ("## Deleted rows in new table: \n " )
132139
133140 deleted_rows = old_df [~ old_df [pk ].isin (new_df [pk ])]
134141 if not deleted_rows .empty :
135142 file .write (f"{ deleted_rows .to_csv ()} \n " )
136-
143+
137144 file .write ("## Added rows in new table: \n " )
138145 # check for rows that have been added
139146 added_rows = new_df [~ new_df [pk ].isin (old_df [pk ])]
140147 if not added_rows .empty :
141148 file .write (f"{ added_rows .to_csv ()} \n " )
142149
143150 if table_name == "course_flags" :
144- old_df = old_df .groupby ("course_id" )["flag_id" ].apply (frozenset )
145- new_df = new_df .groupby ("course_id" )["flag_id" ].apply (frozenset )
151+ old_df = old_df .groupby ("course_id" )[
152+ "flag_id" ].apply (frozenset )
153+ new_df = new_df .groupby ("course_id" )[
154+ "flag_id" ].apply (frozenset )
146155 elif table_name == "course_professors" :
147- old_df = old_df .groupby ("course_id" )["professor_id" ].apply (frozenset )
148- new_df = new_df .groupby ("course_id" )["professor_id" ].apply (frozenset )
156+ old_df = old_df .groupby ("course_id" )[
157+ "professor_id" ].apply (frozenset )
158+ new_df = new_df .groupby ("course_id" )[
159+ "professor_id" ].apply (frozenset )
149160
150-
151161 merged_df = pd .merge (old_df , new_df , on = pk ,
152162 how = "inner" , suffixes = ('_old' , '_new' ))
153-
154- changed_rows = merged_df [merged_df .apply (check_change , args = (table_name ,), axis = 1 )]
155-
163+
164+ changed_rows = merged_df [merged_df .apply (
165+ check_change , args = (table_name ,), axis = 1 )]
166+
156167 file .write ("## Changed rows in new table: \n " )
157168
158169 if not changed_rows .empty :
@@ -168,7 +179,8 @@ def generate_diff(tables_old: dict[str, pd.DataFrame],
168179
169180 return diff_dict
170181
182+
171183if __name__ == "__main__" :
172184 tables_old = get_dfs ("postgresql://postgres:postgres@db:5432/postgres" )
173185 tables_new = transform (data_dir = Path ("/workspaces/ferry/data" ))
174- generate_diff (tables_old , tables_new , "/workspaces/ferry/diff" )
186+ generate_diff (tables_old , tables_new , "/workspaces/ferry/diff" )
0 commit comments