UBC-MDS
diff --git a/‎.flake8‎
Lines changed: 0 additions & 3 deletions b/‎.flake8‎
Lines changed: 0 additions & 3 deletions
diff --git a/‎docs/conf.py‎
Lines changed: 10 additions & 8 deletions b/‎docs/conf.py‎
Lines changed: 10 additions & 8 deletions
diff --git a/‎pyproject.toml‎
Lines changed: 1 addition & 1 deletion b/‎pyproject.toml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/csvplus/__init__.py‎
Lines changed: 1 addition & 1 deletion b/‎src/csvplus/__init__.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/csvplus/data_correction.py‎
Lines changed: 18 additions & 9 deletions b/‎src/csvplus/data_correction.py‎
Lines changed: 18 additions & 9 deletions
diff --git a/‎src/csvplus/data_version_diff.py‎
Lines changed: 55 additions & 31 deletions b/‎src/csvplus/data_version_diff.py‎
Lines changed: 55 additions & 31 deletions
diff --git a/‎src/csvplus/example.py‎
Lines changed: 1 addition & 0 deletions b/‎src/csvplus/example.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/csvplus/examples/data_version_diff_demo.py‎
Lines changed: 8 additions & 4 deletions b/‎src/csvplus/examples/data_version_diff_demo.py‎
Lines changed: 8 additions & 4 deletions
@@ -18,26 +18,28 @@
 copyright = "Copyright © 2026 "
 html_show_sphinx = False
 
-# Try to get the version info for the project you're documenting, acts as replacement for
+# Try to get the version info for the project
+# you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the
 # built documents.
 try:
     version = importlib.metadata.version("csvplus")
 except importlib.metadata.PackageNotFoundError:
     version = "0.0.0"
 
-# -- General configuration -----------------------------------------------------
+# -- General configuration --------------------------------
 
-# -- General configuration -----------------------------------------------------
+# -- General configuration -------------------------------
 
-# Add any Sphinx extension module names here, as strings. They can be extensions
+# Add any Sphinx extension module names here, as strings.
+# They can be extensions
 # coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
 extensions = [
     "myst_parser",
     "sphinx_design",
     "sphinx_copybutton",
     "sphinx.ext.intersphinx",
-    "sphinx.ext.napoleon", # Support numpy style docstrings
+    "sphinx.ext.napoleon",  # Support numpy style docstrings
     # This allows you to create :::{todo} sections that will not be rendered
     # in the live docs if you want to leave notes for future work in the docs
     "sphinx.ext.todo",
@@ -61,7 +63,7 @@
 # Usually you set "language" from the command line for these cases.
 language = "en"
 
-# -- Options for extensions ----------------------------------------------------
+# -- Options for extensions --------------------------------
 # https://myst-parser.readthedocs.io/en/latest/syntax/optional.html
 # -- Options for myst markdown formatting
 myst_enable_extensions = [
@@ -78,7 +80,7 @@
 templates_path = ["_templates"]
 exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
 
-#--------- setup autoapi defaults for your api docs ---------------
+# --------- setup autoapi defaults for your api docs ---------------
 
 # AutoAPI configuration
 autoapi_type = "python"
@@ -89,7 +91,7 @@
 autoapi_keep_files = False
 autoapi_options = ["members", "undoc-members", "show-inheritance"]
 
-# -- Options for HTML output ---------------------------------------------------
+# -- Options for HTML output ----------------------------
 
 html_theme = "pydata_sphinx_theme"
 
 
@@ -13,7 +13,7 @@ requires = ["hatchling"]
 [project]
 name = "csvplus"
 # You can chose to use dynamic versioning with hatch or static where you add it manually.
-version = "0.2.2"
+version = "0.2.3"
 
 description = "A better way to read and manipulate data"
 authors = [
 
@@ -1,6 +1,6 @@
 # MIT License
 #
-# Copyright (c) 2026 
+# Copyright (c) 2026
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
 
@@ -11,22 +11,27 @@
 def resolve_string_value(df, column_name, resolved_names, threshold):
     """
     For all the values in the column_name of the df, find the one element
-    in the `resolved_names` with highest similarity score computed with `fuzz.WRatio`
-    (case sensitive, meaning that "Google" and "google" will not have a score of 100).
-    And compare the similiarty score with the threshold to decide whether to apply
+    in the `resolved_names` with highest
+    similarity score computed with `fuzz.WRatio`
+    (case sensitive, meaning that "Google" and
+    "google" will not have a score of 100).
+    And compare the similarity score with
+    the threshold to decide whether to apply
     the string replacement inplace.
 
     Parameters
     ----------
     df : pandas.DataFrame
         The DataFrame of interest.
     column_name : str
-        The column to conduct the consolidation on. The column must exist in `df` and
+        The column to conduct the consolidation on.
+        The column must exist in `df` and
         be of type string.
     resolved_names : list
         A list of standard names for transforming the column's value to.
     threshold: float
-        The minimum similarity score (0 and 100) required to replace a value with a resolved name.
+        The minimum similarity score (between 0 and 100) required to replace
+        a value with a resolved name.
 
     Returns
     -------
@@ -45,7 +50,8 @@ def resolve_string_value(df, column_name, resolved_names, threshold):
     --------
     >>> import pandas as pd
     >>> data = pd.DataFrame({
-    ...     "company_name": ["Google", "Google Inc.", "Gogle", "Microsoftt", "Micro-soft"],
+    ...     "company_name": ["Google", "Google Inc.",
+    ...     "Gogle", "Microsoftt", "Micro-soft"],
     ...     "num_searches": [1, 2, 3, 4, 5]
     ... })
     >>> resolve_string_value(data, "company_name", ["Google", "Microsoft"], 80)
@@ -67,8 +73,10 @@ def resolve_string_value(df, column_name, resolved_names, threshold):
     elif (threshold < 0) or (threshold > 100):
         raise ValueError("The threshold value is out of range.")
 
-    # Adopted MS Copilot solution to: "How to use `rapidfuzz.process.extractOne()`"?"
-    # Return closest string in `choices` if similarity score based on `fuzz.WRatio` is
+    # Adopted MS Copilot solution to:
+    # "How to use `rapidfuzz.process.extractOne()`?"
+    # Return closest string in `choices` if similarity score based on
+    # `fuzz.WRatio` is
     # above threshold. Otherwise, return the word itself.
     def find_closest(query, choices, threshold):
         result = process.extractOne(query, choices, scorer=fuzz.WRatio)
@@ -77,4 +85,5 @@ def find_closest(query, choices, threshold):
         else:
             return query
 
-    df[column_name] = df[column_name].apply(lambda x: find_closest(x, resolved_names, threshold))
+    df[column_name] = (df[column_name].apply(
+        lambda x: find_closest(x, resolved_names, threshold)))
@@ -1,14 +1,16 @@
 """
-A module that summarizes structural and statistical differences between two DataFrame versions.
+A module that summarizes structural and statistical
+differences between two DataFrame versions.
 """
 import pandas as pd
 
+
 def data_version_diff(df_old, df_new):
     """
-    This function compares an earlier and a later version of a pandas DataFrame
-    and returns a high-level summary of how the data has changed. It is designed
-    for data auditing, version tracking, and exploratory analysis rather than
-    cell-by-cell comparison.
+    This function compares an earlier and a later version of a pandas
+    DataFrame and returns a high-level summary of how the data has changed.
+    It is designed for data auditing, version tracking, and exploratory
+    analysis rather than cell-by-cell comparison.
 
     The comparison includes:
     - Columns that were added or removed
@@ -33,9 +35,10 @@ def data_version_diff(df_old, df_new):
     Notes
     -----
     - This function assumes both inputs are pandas DataFrames.
-    - Rows are compared by position only; no key-based row matching is performed.
-    - The function is intended for small to medium-sized datasets and exploratory
-      analysis rather than large-scale production pipelines.
+    - Rows are compared by position only; no key-based row matching is
+      performed.
+    - The function is intended for small to medium-sized datasets and
+      exploratory analysis rather than large-scale production pipelines.
 
     Examples
     --------
@@ -74,43 +77,59 @@ def data_version_diff(df_old, df_new):
     row_difference = new_row_count - old_row_count
 
     # missing_value_changes
-    shared_columns = df_old.columns.intersection(df_new.columns) #shared columns
+    shared_columns = df_old.columns.intersection(df_new.columns)  # shared cols
     missing_summary = pd.DataFrame({
         "missing_old": df_old[shared_columns].isna().sum(),
         "missing_new": df_new[shared_columns].isna().sum()
     })
 
     # calculate the difference
-    missing_summary["difference"] = missing_summary["missing_new"] - missing_summary["missing_old"]
+    missing_summary["difference"] = (
+        missing_summary["missing_new"] - missing_summary["missing_old"]
+    )
 
     # reset index so 'column' is a regular column
-    missing_summary = missing_summary.reset_index().rename(columns={"index": "column"})
+    missing_summary = (
+        missing_summary.reset_index().rename(columns={"index": "column"})
+    )
 
     # print(f"Missing value summary: {missing_summary}")
 
-    ## numeric_summary_changes
+    # numeric_summary_changes
     # identify numeric cols in both DFs
     numeric_cols_old = df_old.select_dtypes(include="number").columns
     numeric_cols_new = df_new.select_dtypes(include="number").columns
-    shared_numeric_columns = numeric_cols_old.intersection(numeric_cols_new) 
+    shared_numeric_columns = numeric_cols_old.intersection(numeric_cols_new)
 
     # compute summary statistics for shared numeric columns
     if len(shared_numeric_columns) == 0:
         numeric_summary_changes = pd.DataFrame(
             columns=["column", "statistic", "old", "new", "difference"]
         )
     else:
-        summary_old = df_old[shared_numeric_columns].describe().loc[['mean', 'std', 'min', 'max']].T
-        summary_new = df_new[shared_numeric_columns].describe().loc[['mean', 'std', 'min', 'max']].T
+        summary_old = (
+            df_old[shared_numeric_columns].describe()
+            .loc[['mean', 'std', 'min', 'max']].T
+        )
+        summary_new = (
+            df_new[shared_numeric_columns].describe()
+            .loc[['mean', 'std', 'min', 'max']].T
+        )
 
         # convert to long format
-        summary_old_long = summary_old.reset_index().melt(id_vars="index", var_name="statistic", value_name="old").rename(columns={"index": "column"})
+        summary_old_long = (
+            summary_old.reset_index()
+            .melt(id_vars="index", var_name="statistic", value_name="old")
+            .rename(columns={"index": "column"})
+        )
 
-        summary_new_long = summary_new.reset_index().melt(
-        id_vars="index", var_name="statistic", value_name="new"
-        ).rename(columns={"index": "column"})
+        summary_new_long = (
+            summary_new.reset_index()
+            .melt(id_vars="index", var_name="statistic", value_name="new")
+            .rename(columns={"index": "column"})
+        )
 
-        # merge old and new 
+        # merge old and new
         numeric_summary_changes = summary_old_long.merge(
             summary_new_long, on=["column", "statistic"], how="inner"
         )
@@ -128,7 +147,8 @@ def data_version_diff(df_old, df_new):
         new_type = df_new[col].dtype
         if old_type != new_type:
             dtype_changes_list.append({
-                "column": col, "old_dtype": str(old_type), "new_type": str(new_type)
+                "column": col, "old_dtype": str(old_type),
+                "new_type": str(new_type)
                 })
 
     # convert to DF
@@ -149,19 +169,22 @@ def data_version_diff(df_old, df_new):
             "n_columns_added": len(columns_added),
             "n_columns_removed": len(columns_removed),
             "n_dtype_changes": len(dtype_changes),
-            "n_missing_changes": int((missing_summary["difference"] != 0).sum()),
+            "n_missing_changes": int(
+                (missing_summary["difference"] != 0).sum()
+            ),
         },
     }
 
     return result
 
+
 def display_data_version_diff(result):
     """
     Print a formatted, human-readable summary of DataFrame version differences.
 
-    This function takes the output of `data_version_diff` and prints a structured
-    console report highlighting row count changes, schema changes, missing value
-    differences, numeric summary changes, and data type changes.
+    This function takes the output of `data_version_diff` and prints a
+    structured console report highlighting row count changes, schema changes,
+    missing value differences, numeric summary changes, and data type changes.
 
     Parameters
     ----------
@@ -170,7 +193,8 @@ def display_data_version_diff(result):
 
     Notes
     -----
-    - This function is intended for interactive use (e.g., notebooks or terminals).
+    - This function is intended for interactive use (e.g., notebooks or
+      terminals).
     - It does not return any value.
 
     Examples
@@ -186,7 +210,7 @@ def display_data_version_diff(result):
     rc = result["row_count_change"]
     diff = rc["row_difference"]
     sign = "+" if diff > 0 else ""
-    print(f"\n  ROWS CHANGE:")
+    print("\n  ROWS CHANGE:")
     print("-" * 60)
     print(f"    Old Rows: {rc['old_row_count']}")
     print(f"    New Rows: {rc['new_row_count']}")
@@ -199,7 +223,7 @@ def display_data_version_diff(result):
         print(f"    Columns added: {', '.join(result['columns_added'])}")
     else:
         print(" Columns added: None")
-    
+
     if result["columns_removed"]:
         print(f"    Columns removed: {', '.join(result['columns_removed'])}")
     else:
@@ -208,7 +232,7 @@ def display_data_version_diff(result):
     # --- Missing values ---
     mv = result["missing_value_changes"]
     mv_changed = mv[mv["difference"] != 0]
-    
+
     print("\n   MISSING VALUE CHANGES:")
     print("-" * 60)
     if mv_changed.empty:
@@ -217,12 +241,12 @@ def display_data_version_diff(result):
         print(
             mv_changed.assign(
                 change=lambda d: d["difference"].apply(
-                    lambda x: f"+{x}" if x>0 else str(x)
+                    lambda x: f"+{x}" if x > 0 else str(x)
                 )
             )[["column", "missing_old", "missing_new", "change"]]
             .to_string(index=False)
         )
-    
+
     # --- Numeric summary changes ---
     ns = result["numeric_summary_changes"]
     ns_changed = ns[ns["difference"] != 0]
 
@@ -6,6 +6,7 @@
 that explains the purpose of the module, at the top.
 """
 
+
 def add_numbers(a, b):
     """
     Add two numbers together and return the result.
 
@@ -18,7 +18,8 @@
 #     "age": np.random.randint(18, 70, size=n_old),
 #     "income": np.random.normal(loc=5000, scale=15000, size=n_old),
 #     "email": [fake.email() for _ in range(n_old)],
-#     "signup_date": [fake.date_between(start_date="-2y", end_date="today") for _ in range(n_old)],
+#     "signup_date": [fake.date_between(start_date="-2y", end_date="today")
+#      for _ in range(n_old)],
 #     "country": [fake.country() for _ in range(n_old)],
 # })
 
@@ -36,11 +37,13 @@
 # n_new_rows = 20
 
 # new_rows = pd.DataFrame({
-#     "user_id": range(df_old["user_id"].max() + 1, df_old["user_id"].max() + 1 + n_new_rows),
+#     "user_id": range(df_old["user_id"].max() + 1, df_old["user_id"].max() +
+#       1 + n_new_rows),
 #     "age": np.random.randint(18, 70, size=n_new_rows),
 #     "income": np.random.normal(loc=60000, scale=20000, size=n_new_rows),
 #     "email": [fake.email() for _ in range(n_new_rows)],
-#     "signup_date": [fake.date_between(start_date="-1y", end_date="today") for _ in range(n_new_rows)],
+#     "signup_date": [fake.date_between(start_date="-1y", end_date="today")
+# for _ in range(n_new_rows)],
 #     "country": [fake.country() for _ in range(n_new_rows)],
 # })
 # df_new = pd.concat([df_new, new_rows], ignore_index=True)
@@ -50,7 +53,8 @@
 
 # #column addition
 # df_new["last_login_date"] = [
-#     fake.date_between(start_date="-6m", end_date="today") for _ in range(len(df_new))
+#     fake.date_between(start_date="-6m", end_date="today")
+# for _ in range(len(df_new))
 # ]
 
 # ## missing value changes
Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,6 @@`
`1`	`1`	`# MIT License`
`2`	`2`	`#`
`3`		`-# Copyright (c) 2026`
	`3`	`+# Copyright (c) 2026`
`4`	`4`	`#`
`5`	`5`	`# Permission is hereby granted, free of charge, to any person obtaining a copy`
`6`	`6`	`# of this software and associated documentation files (the "Software"), to deal`