11"""
2- A module that summarizes structural and statistical differences between two DataFrame versions.
2+ A module that summarizes structural and statistical
3+ differences between two DataFrame versions.
34"""
45import pandas as pd
56
7+
68def data_version_diff (df_old , df_new ):
79 """
8- This function compares an earlier and a later version of a pandas DataFrame
9- and returns a high-level summary of how the data has changed. It is designed
10- for data auditing, version tracking, and exploratory analysis rather than
11- cell-by-cell comparison.
10+ This function compares an earlier and a later version of a pandas
11+ DataFrame and returns a high-level summary of how the data has changed.
12+ It is designed for data auditing, version tracking, and exploratory
13+ analysis rather than cell-by-cell comparison.
1214
1315 The comparison includes:
1416 - Columns that were added or removed
@@ -33,9 +35,10 @@ def data_version_diff(df_old, df_new):
3335 Notes
3436 -----
3537 - This function assumes both inputs are pandas DataFrames.
36- - Rows are compared by position only; no key-based row matching is performed.
37- - The function is intended for small to medium-sized datasets and exploratory
38- analysis rather than large-scale production pipelines.
38+ - Rows are compared by position only; no key-based row matching is
39+ performed.
40+ - The function is intended for small to medium-sized datasets and
41+ exploratory analysis rather than large-scale production pipelines.
3942
4043 Examples
4144 --------
@@ -74,43 +77,59 @@ def data_version_diff(df_old, df_new):
7477 row_difference = new_row_count - old_row_count
7578
7679 # missing_value_changes
77- shared_columns = df_old .columns .intersection (df_new .columns ) # shared columns
80+ shared_columns = df_old .columns .intersection (df_new .columns ) # shared cols
7881 missing_summary = pd .DataFrame ({
7982 "missing_old" : df_old [shared_columns ].isna ().sum (),
8083 "missing_new" : df_new [shared_columns ].isna ().sum ()
8184 })
8285
8386 # calculate the difference
84- missing_summary ["difference" ] = missing_summary ["missing_new" ] - missing_summary ["missing_old" ]
87+ missing_summary ["difference" ] = (
88+ missing_summary ["missing_new" ] - missing_summary ["missing_old" ]
89+ )
8590
8691 # reset index so 'column' is a regular column
87- missing_summary = missing_summary .reset_index ().rename (columns = {"index" : "column" })
92+ missing_summary = (
93+ missing_summary .reset_index ().rename (columns = {"index" : "column" })
94+ )
8895
8996 # print(f"Missing value summary: {missing_summary}")
9097
91- ## numeric_summary_changes
98+ # numeric_summary_changes
9299 # identify numeric cols in both DFs
93100 numeric_cols_old = df_old .select_dtypes (include = "number" ).columns
94101 numeric_cols_new = df_new .select_dtypes (include = "number" ).columns
95- shared_numeric_columns = numeric_cols_old .intersection (numeric_cols_new )
102+ shared_numeric_columns = numeric_cols_old .intersection (numeric_cols_new )
96103
97104 # compute summary statistics for shared numeric columns
98105 if len (shared_numeric_columns ) == 0 :
99106 numeric_summary_changes = pd .DataFrame (
100107 columns = ["column" , "statistic" , "old" , "new" , "difference" ]
101108 )
102109 else :
103- summary_old = df_old [shared_numeric_columns ].describe ().loc [['mean' , 'std' , 'min' , 'max' ]].T
104- summary_new = df_new [shared_numeric_columns ].describe ().loc [['mean' , 'std' , 'min' , 'max' ]].T
110+ summary_old = (
111+ df_old [shared_numeric_columns ].describe ()
112+ .loc [['mean' , 'std' , 'min' , 'max' ]].T
113+ )
114+ summary_new = (
115+ df_new [shared_numeric_columns ].describe ()
116+ .loc [['mean' , 'std' , 'min' , 'max' ]].T
117+ )
105118
106119 # convert to long format
107- summary_old_long = summary_old .reset_index ().melt (id_vars = "index" , var_name = "statistic" , value_name = "old" ).rename (columns = {"index" : "column" })
120+ summary_old_long = (
121+ summary_old .reset_index ()
122+ .melt (id_vars = "index" , var_name = "statistic" , value_name = "old" )
123+ .rename (columns = {"index" : "column" })
124+ )
108125
109- summary_new_long = summary_new .reset_index ().melt (
110- id_vars = "index" , var_name = "statistic" , value_name = "new"
111- ).rename (columns = {"index" : "column" })
126+ summary_new_long = (
127+ summary_new .reset_index ()
128+ .melt (id_vars = "index" , var_name = "statistic" , value_name = "new" )
129+ .rename (columns = {"index" : "column" })
130+ )
112131
113- # merge old and new
132+ # merge old and new
114133 numeric_summary_changes = summary_old_long .merge (
115134 summary_new_long , on = ["column" , "statistic" ], how = "inner"
116135 )
@@ -128,7 +147,8 @@ def data_version_diff(df_old, df_new):
128147 new_type = df_new [col ].dtype
129148 if old_type != new_type :
130149 dtype_changes_list .append ({
131- "column" : col , "old_dtype" : str (old_type ), "new_type" : str (new_type )
150+ "column" : col , "old_dtype" : str (old_type ),
151+ "new_type" : str (new_type )
132152 })
133153
134154 # convert to DF
@@ -149,19 +169,22 @@ def data_version_diff(df_old, df_new):
149169 "n_columns_added" : len (columns_added ),
150170 "n_columns_removed" : len (columns_removed ),
151171 "n_dtype_changes" : len (dtype_changes ),
152- "n_missing_changes" : int ((missing_summary ["difference" ] != 0 ).sum ()),
172+ "n_missing_changes" : int (
173+ (missing_summary ["difference" ] != 0 ).sum ()
174+ ),
153175 },
154176 }
155177
156178 return result
157179
180+
158181def display_data_version_diff (result ):
159182 """
160183 Print a formatted, human-readable summary of DataFrame version differences.
161184
162- This function takes the output of `data_version_diff` and prints a structured
163- console report highlighting row count changes, schema changes, missing value
164- differences, numeric summary changes, and data type changes.
185+ This function takes the output of `data_version_diff` and prints a
186+ structured console report highlighting row count changes, schema changes,
187+ missing value differences, numeric summary changes, and data type changes.
165188
166189 Parameters
167190 ----------
@@ -170,7 +193,8 @@ def display_data_version_diff(result):
170193
171194 Notes
172195 -----
173- - This function is intended for interactive use (e.g., notebooks or terminals).
196+ - This function is intended for interactive use (e.g., notebooks or
197+ terminals).
174198 - It does not return any value.
175199
176200 Examples
@@ -186,7 +210,7 @@ def display_data_version_diff(result):
186210 rc = result ["row_count_change" ]
187211 diff = rc ["row_difference" ]
188212 sign = "+" if diff > 0 else ""
189- print (f "\n ROWS CHANGE:" )
213+ print ("\n ROWS CHANGE:" )
190214 print ("-" * 60 )
191215 print (f" Old Rows: { rc ['old_row_count' ]} " )
192216 print (f" New Rows: { rc ['new_row_count' ]} " )
@@ -199,7 +223,7 @@ def display_data_version_diff(result):
199223 print (f" Columns added: { ', ' .join (result ['columns_added' ])} " )
200224 else :
201225 print (" Columns added: None" )
202-
226+
203227 if result ["columns_removed" ]:
204228 print (f" Columns removed: { ', ' .join (result ['columns_removed' ])} " )
205229 else :
@@ -208,7 +232,7 @@ def display_data_version_diff(result):
208232 # --- Missing values ---
209233 mv = result ["missing_value_changes" ]
210234 mv_changed = mv [mv ["difference" ] != 0 ]
211-
235+
212236 print ("\n MISSING VALUE CHANGES:" )
213237 print ("-" * 60 )
214238 if mv_changed .empty :
@@ -217,12 +241,12 @@ def display_data_version_diff(result):
217241 print (
218242 mv_changed .assign (
219243 change = lambda d : d ["difference" ].apply (
220- lambda x : f"+{ x } " if x > 0 else str (x )
244+ lambda x : f"+{ x } " if x > 0 else str (x )
221245 )
222246 )[["column" , "missing_old" , "missing_new" , "change" ]]
223247 .to_string (index = False )
224248 )
225-
249+
226250 # --- Numeric summary changes ---
227251 ns = result ["numeric_summary_changes" ]
228252 ns_changed = ns [ns ["difference" ] != 0 ]
0 commit comments