Add time_added and last_updated timestamp columns

reybahl · reybahl · commit c43a08212978 · 2024-11-07T23:07:44.000Z
diff --git a/ferry/database/diff_db.py b/ferry/database/diff_db.py
@@ -19,13 +19,14 @@
 queries_dir = Path(__file__).parent / "queries"
 
 primary_keys = {
-        "flags" : ["flag_id"],
-        "course_flags" : ["course_id"],
-        "professors" : ["professor_id"],
-        "course_professors" : ["course_id"],
-        "courses" : ["course_id"],
-        "listings" : ["listing_id"],
-    }
+    "flags": ["flag_id"],
+    "course_flags": ["course_id"],
+    "professors": ["professor_id"],
+    "course_professors": ["course_id"],
+    "courses": ["course_id"],
+    "listings": ["listing_id"],
+}
+
 
 def get_dfs(database_connect_string: str):
     db = Database(database_connect_string)
@@ -35,7 +36,7 @@ def get_dfs(database_connect_string: str):
     db_meta.reflect(bind=db.Engine)
 
     conn = db.Engine.connect()
-    
+
     # get table names
     query = "SELECT table_name FROM information_schema.tables WHERE table_schema = 'public'"
     result = conn.execute(text(query))
@@ -51,44 +52,49 @@ def get_dfs(database_connect_string: str):
 
     return dataframes
 
+
 def check_change(row, table_name):
     cols_to_exclude = {
-        "flags" : [],
-        "course_flags" : [],
-        "professors" : [],
-        "course_professors" : [],
-        'courses' : ['same_course_and_profs_id', 'same_course_id', 'same_prof_id', 'last_offered_course_id'],
-        "listings" : [],
+        "all": ["time_added", "last_updated"],  # ignore the timestamps
+        "flags": [],
+        "course_flags": [],
+        "professors": [],
+        "course_professors": [],
+        'courses': ['same_course_and_profs_id', 'same_course_id', 'same_prof_id', 'last_offered_course_id'],
+        "listings": [],
     }
 
     for col_name in row.index.tolist():
         if ("_old" not in col_name):
             continue
         col_name = col_name.replace("_old", "")
-        
-        if (col_name in cols_to_exclude[table_name]):
-            return False
-        
+
+        if (col_name in cols_to_exclude[table_name] or col_name in cols_to_exclude["all"]):
+            continue
+
         old_value = row[col_name + "_old"]
         new_value = row[col_name + "_new"]
-        
+
         if isinstance(old_value, list) or isinstance(new_value, list):
-            old_value = ast.literal_eval(str(old_value).replace('"',"'")) # fix quotes
-            new_value = ast.literal_eval(str(new_value).replace('"',"'"))
+            old_value = ast.literal_eval(
+                str(old_value).replace('"', "'"))  # fix quotes
+            new_value = ast.literal_eval(str(new_value).replace('"', "'"))
             if (old_value != new_value):
                 return True
         else:
             if (not pd.isna(old_value) and
-            not pd.isna(new_value)):
+                    not pd.isna(new_value)):
                 if isinstance(old_value, dict) and isinstance(new_value, str):
                     new_value = json.loads(new_value)
                 elif isinstance(old_value, (int, float)) and isinstance(new_value, (int, float)):
                     new_value = float(new_value)
                     old_value = float(old_value)
                 else:
-                
-                    old_value = str(old_value).replace('"',"'").replace('\\', '').strip("'")
-                    new_value = str(new_value).replace('"',"'").replace('\\', '').strip("'")
+
+                    old_value = str(old_value).replace(
+                        '"', "'").replace('\\', '').strip("'")
+                    new_value = str(new_value).replace(
+                        '"', "'").replace('\\', '').strip("'")
                     # old_value = normalize_unicode(old_value)
                     # new_value = normalize_unicode(new_value)
                     try:
@@ -97,62 +103,67 @@ def check_change(row, table_name):
                     except:
                         pass
                 if old_value != new_value:
-                    print(f"column: {col_name}, old: {old_value}, new: {new_value}")
+                    print(
+                        f"column: {col_name}, old: {old_value}, new: {new_value}")
                     return True
-            
+
     return False
 
+
 def generate_diff(tables_old: dict[str, pd.DataFrame],
-                    tables_new: dict[str, pd.DataFrame], output_dir:str):
+                  tables_new: dict[str, pd.DataFrame], output_dir: str):
 
     diff_dict = {}
 
     for table_name in primary_keys.keys():
         if table_name not in tables_new.keys() or table_name not in tables_old.keys():
             raise ValueError(f"Table {table_name} not found in new tables")
-        
+
         print(f"Computing diff for table {table_name} ...", end=" ")
-        
+
         output_file_path = Path(output_dir) / (table_name + ".md")
-        
 
         with open(output_file_path, "w+") as file:
-             # check difference between old df and new df and output to above file path
+            # check difference between old df and new df and output to above file path
             old_df = tables_old[table_name]
             new_df = tables_new[table_name]
-            
+
             # TODO - better way to do this?
             pk = primary_keys[table_name][0]
 
             # Identify rows with differences
-            
+
             # check for rows that are in old df but not in new df
             # based on primary key
             file.write("## Deleted rows in new table: \n")
 
             deleted_rows = old_df[~old_df[pk].isin(new_df[pk])]
             if not deleted_rows.empty:
                 file.write(f"{deleted_rows.to_csv()}\n")
-            
+
             file.write("## Added rows in new table: \n")
             # check for rows that have been added
             added_rows = new_df[~new_df[pk].isin(old_df[pk])]
             if not added_rows.empty:
                 file.write(f"{added_rows.to_csv()}\n")
 
             if table_name == "course_flags":
-                old_df = old_df.groupby("course_id")["flag_id"].apply(frozenset)
-                new_df = new_df.groupby("course_id")["flag_id"].apply(frozenset)
+                old_df = old_df.groupby("course_id")[
+                    "flag_id"].apply(frozenset)
+                new_df = new_df.groupby("course_id")[
+                    "flag_id"].apply(frozenset)
             elif table_name == "course_professors":
-                old_df = old_df.groupby("course_id")["professor_id"].apply(frozenset)
-                new_df = new_df.groupby("course_id")["professor_id"].apply(frozenset)
+                old_df = old_df.groupby("course_id")[
+                    "professor_id"].apply(frozenset)
+                new_df = new_df.groupby("course_id")[
+                    "professor_id"].apply(frozenset)
 
-            
             merged_df = pd.merge(old_df, new_df, on=pk,
                                  how="inner", suffixes=('_old', '_new'))
-            
-            changed_rows = merged_df[merged_df.apply(check_change, args=(table_name,), axis=1)]
-            
+
+            changed_rows = merged_df[merged_df.apply(
+                check_change, args=(table_name,), axis=1)]
+
             file.write("## Changed rows in new table: \n")
 
             if not changed_rows.empty:
@@ -168,7 +179,8 @@ def generate_diff(tables_old: dict[str, pd.DataFrame],
 
     return diff_dict
 
+
 if __name__ == "__main__":
     tables_old = get_dfs("postgresql://postgres:postgres@db:5432/postgres")
     tables_new = transform(data_dir=Path("/workspaces/ferry/data"))
-    generate_diff(tables_old, tables_new, "/workspaces/ferry/diff")
+    generate_diff(tables_old, tables_new, "/workspaces/ferry/diff")
diff --git a/ferry/database/sync_db_diff.py b/ferry/database/sync_db_diff.py
@@ -55,9 +55,17 @@ def sync_db(tables: dict[str, pd.DataFrame], database_connect_string: str):
         if len(to_add) > 0:
             print(f"Adding {len(to_add)} new rows to {table_name}")
             for _, row in to_add.iterrows():
-                columns = ', '.join(row.index)
+                columns_list = list(row.index)
+                columns_list.append("time_added")
+                columns_list.append("last_updated")
+                columns = ', '.join(columns_list)
+
+                values_list = list(row.values)
+                values_list.append("NOW()")
+                values_list.append("NOW()")
                 values = ', '.join(
-                    f"'{str(v)}'" if v is not None else 'NULL' for v in row.values)
+                    f"'{str(v)}'" if v is not None else 'NULL' for v in values_list)
+
                 insert_query = f'INSERT INTO {table_name} ({columns}) VALUES ({values});'
                 conn.execute(text(insert_query))
 
diff --git a/ferry/timestamps.sql b/ferry/timestamps.sql
@@ -0,0 +1,11 @@
+ALTER TABLE courses
+ADD COLUMN time_added TIMESTAMP DEFAULT NULL,
+ADD COLUMN last_updated TIMESTAMP DEFAULT NULL;
+
+ALTER TABLE listings
+ADD COLUMN time_added TIMESTAMP DEFAULT NULL,
+ADD COLUMN last_updated TIMESTAMP DEFAULT NULL;
+
+ALTER TABLE course_professors
+ADD COLUMN time_added TIMESTAMP DEFAULT NULL,
+ADD COLUMN last_updated TIMESTAMP DEFAULT NULL;