Skip to content

Commit a773ebf

Browse files
authored
Fix linter (#98)
* Fix linter * Fix conf py * Fix lint * Version bump
1 parent eb6fd96 commit a773ebf

15 files changed

Lines changed: 225 additions & 144 deletions

.flake8

Lines changed: 0 additions & 3 deletions
This file was deleted.

docs/conf.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -18,26 +18,28 @@
1818
copyright = "Copyright © 2026 "
1919
html_show_sphinx = False
2020

21-
# Try to get the version info for the project you're documenting, acts as replacement for
21+
# Try to get the version info for the project
22+
# you're documenting, acts as replacement for
2223
# |version| and |release|, also used in various other places throughout the
2324
# built documents.
2425
try:
2526
version = importlib.metadata.version("csvplus")
2627
except importlib.metadata.PackageNotFoundError:
2728
version = "0.0.0"
2829

29-
# -- General configuration -----------------------------------------------------
30+
# -- General configuration --------------------------------
3031

31-
# -- General configuration -----------------------------------------------------
32+
# -- General configuration -------------------------------
3233

33-
# Add any Sphinx extension module names here, as strings. They can be extensions
34+
# Add any Sphinx extension module names here, as strings.
35+
# They can be extensions
3436
# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
3537
extensions = [
3638
"myst_parser",
3739
"sphinx_design",
3840
"sphinx_copybutton",
3941
"sphinx.ext.intersphinx",
40-
"sphinx.ext.napoleon", # Support numpy style docstrings
42+
"sphinx.ext.napoleon", # Support numpy style docstrings
4143
# This allows you to create :::{todo} sections that will not be rendered
4244
# in the live docs if you want to leave notes for future work in the docs
4345
"sphinx.ext.todo",
@@ -61,7 +63,7 @@
6163
# Usually you set "language" from the command line for these cases.
6264
language = "en"
6365

64-
# -- Options for extensions ----------------------------------------------------
66+
# -- Options for extensions --------------------------------
6567
# https://myst-parser.readthedocs.io/en/latest/syntax/optional.html
6668
# -- Options for myst markdown formatting
6769
myst_enable_extensions = [
@@ -78,7 +80,7 @@
7880
templates_path = ["_templates"]
7981
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
8082

81-
#--------- setup autoapi defaults for your api docs ---------------
83+
# --------- setup autoapi defaults for your api docs ---------------
8284

8385
# AutoAPI configuration
8486
autoapi_type = "python"
@@ -89,7 +91,7 @@
8991
autoapi_keep_files = False
9092
autoapi_options = ["members", "undoc-members", "show-inheritance"]
9193

92-
# -- Options for HTML output ---------------------------------------------------
94+
# -- Options for HTML output ----------------------------
9395

9496
html_theme = "pydata_sphinx_theme"
9597

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ requires = ["hatchling"]
1313
[project]
1414
name = "csvplus"
1515
# You can choose to use dynamic versioning with hatch or static where you add it manually.
16-
version = "0.2.2"
16+
version = "0.2.3"
1717

1818
description = "A better way to read and manipulate data"
1919
authors = [

src/csvplus/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# MIT License
22
#
3-
# Copyright (c) 2026
3+
# Copyright (c) 2026
44
#
55
# Permission is hereby granted, free of charge, to any person obtaining a copy
66
# of this software and associated documentation files (the "Software"), to deal

src/csvplus/data_correction.py

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -11,22 +11,27 @@
1111
def resolve_string_value(df, column_name, resolved_names, threshold):
1212
"""
1313
For all the values in the column_name of the df, find the one element
14-
in the `resolved_names` with highest similarity score computed with `fuzz.WRatio`
15-
(case sensitive, meaning that "Google" and "google" will not have a score of 100).
16-
And compare the similiarty score with the threshold to decide whether to apply
14+
in the `resolved_names` with highest
15+
similarity score computed with `fuzz.WRatio`
16+
(case sensitive, meaning that "Google" and
17+
"google" will not have a score of 100).
18+
And compare the similarity score with
19+
the threshold to decide whether to apply
1720
the string replacement inplace.
1821
1922
Parameters
2023
----------
2124
df : pandas.DataFrame
2225
The DataFrame of interest.
2326
column_name : str
24-
The column to conduct the consolidation on. The column must exist in `df` and
27+
The column to conduct the consolidation on.
28+
The column must exist in `df` and
2529
be of type string.
2630
resolved_names : list
2731
A list of standard names for transforming the column's value to.
2832
threshold: float
29-
The minimum similarity score (0 and 100) required to replace a value with a resolved name.
33+
The minimum similarity score (between 0 and 100) required to replace
34+
a value with a resolved name.
3035
3136
Returns
3237
-------
@@ -45,7 +50,8 @@ def resolve_string_value(df, column_name, resolved_names, threshold):
4550
--------
4651
>>> import pandas as pd
4752
>>> data = pd.DataFrame({
48-
... "company_name": ["Google", "Google Inc.", "Gogle", "Microsoftt", "Micro-soft"],
53+
... "company_name": ["Google", "Google Inc.",
54+
... "Gogle", "Microsoftt", "Micro-soft"],
4955
... "num_searches": [1, 2, 3, 4, 5]
5056
... })
5157
>>> resolve_string_value(data, "company_name", ["Google", "Microsoft"], 80)
@@ -67,8 +73,10 @@ def resolve_string_value(df, column_name, resolved_names, threshold):
6773
elif (threshold < 0) or (threshold > 100):
6874
raise ValueError("The threshold value is out of range.")
6975

70-
# Adopted MS Copilot solution to: "How to use `rapidfuzz.process.extractOne()`"?"
71-
# Return closest string in `choices` if similarity score based on `fuzz.WRatio` is
76+
# Adopted MS Copilot solution to:
77+
# "How to use `rapidfuzz.process.extractOne()`?"
78+
# Return closest string in `choices` if similarity score based on
79+
# `fuzz.WRatio` is
7280
# above threshold. Otherwise, return the word itself.
7381
def find_closest(query, choices, threshold):
7482
result = process.extractOne(query, choices, scorer=fuzz.WRatio)
@@ -77,4 +85,5 @@ def find_closest(query, choices, threshold):
7785
else:
7886
return query
7987

80-
df[column_name] = df[column_name].apply(lambda x: find_closest(x, resolved_names, threshold))
88+
df[column_name] = (df[column_name].apply(
89+
lambda x: find_closest(x, resolved_names, threshold)))

src/csvplus/data_version_diff.py

Lines changed: 55 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,16 @@
11
"""
2-
A module that summarizes structural and statistical differences between two DataFrame versions.
2+
A module that summarizes structural and statistical
3+
differences between two DataFrame versions.
34
"""
45
import pandas as pd
56

7+
68
def data_version_diff(df_old, df_new):
79
"""
8-
This function compares an earlier and a later version of a pandas DataFrame
9-
and returns a high-level summary of how the data has changed. It is designed
10-
for data auditing, version tracking, and exploratory analysis rather than
11-
cell-by-cell comparison.
10+
This function compares an earlier and a later version of a pandas
11+
DataFrame and returns a high-level summary of how the data has changed.
12+
It is designed for data auditing, version tracking, and exploratory
13+
analysis rather than cell-by-cell comparison.
1214
1315
The comparison includes:
1416
- Columns that were added or removed
@@ -33,9 +35,10 @@ def data_version_diff(df_old, df_new):
3335
Notes
3436
-----
3537
- This function assumes both inputs are pandas DataFrames.
36-
- Rows are compared by position only; no key-based row matching is performed.
37-
- The function is intended for small to medium-sized datasets and exploratory
38-
analysis rather than large-scale production pipelines.
38+
- Rows are compared by position only; no key-based row matching is
39+
performed.
40+
- The function is intended for small to medium-sized datasets and
41+
exploratory analysis rather than large-scale production pipelines.
3942
4043
Examples
4144
--------
@@ -74,43 +77,59 @@ def data_version_diff(df_old, df_new):
7477
row_difference = new_row_count - old_row_count
7578

7679
# missing_value_changes
77-
shared_columns = df_old.columns.intersection(df_new.columns) #shared columns
80+
shared_columns = df_old.columns.intersection(df_new.columns) # shared cols
7881
missing_summary = pd.DataFrame({
7982
"missing_old": df_old[shared_columns].isna().sum(),
8083
"missing_new": df_new[shared_columns].isna().sum()
8184
})
8285

8386
# calculate the difference
84-
missing_summary["difference"] = missing_summary["missing_new"] - missing_summary["missing_old"]
87+
missing_summary["difference"] = (
88+
missing_summary["missing_new"] - missing_summary["missing_old"]
89+
)
8590

8691
# reset index so 'column' is a regular column
87-
missing_summary = missing_summary.reset_index().rename(columns={"index": "column"})
92+
missing_summary = (
93+
missing_summary.reset_index().rename(columns={"index": "column"})
94+
)
8895

8996
# print(f"Missing value summary: {missing_summary}")
9097

91-
## numeric_summary_changes
98+
# numeric_summary_changes
9299
# identify numeric cols in both DFs
93100
numeric_cols_old = df_old.select_dtypes(include="number").columns
94101
numeric_cols_new = df_new.select_dtypes(include="number").columns
95-
shared_numeric_columns = numeric_cols_old.intersection(numeric_cols_new)
102+
shared_numeric_columns = numeric_cols_old.intersection(numeric_cols_new)
96103

97104
# compute summary statistics for shared numeric columns
98105
if len(shared_numeric_columns) == 0:
99106
numeric_summary_changes = pd.DataFrame(
100107
columns=["column", "statistic", "old", "new", "difference"]
101108
)
102109
else:
103-
summary_old = df_old[shared_numeric_columns].describe().loc[['mean', 'std', 'min', 'max']].T
104-
summary_new = df_new[shared_numeric_columns].describe().loc[['mean', 'std', 'min', 'max']].T
110+
summary_old = (
111+
df_old[shared_numeric_columns].describe()
112+
.loc[['mean', 'std', 'min', 'max']].T
113+
)
114+
summary_new = (
115+
df_new[shared_numeric_columns].describe()
116+
.loc[['mean', 'std', 'min', 'max']].T
117+
)
105118

106119
# convert to long format
107-
summary_old_long = summary_old.reset_index().melt(id_vars="index", var_name="statistic", value_name="old").rename(columns={"index": "column"})
120+
summary_old_long = (
121+
summary_old.reset_index()
122+
.melt(id_vars="index", var_name="statistic", value_name="old")
123+
.rename(columns={"index": "column"})
124+
)
108125

109-
summary_new_long = summary_new.reset_index().melt(
110-
id_vars="index", var_name="statistic", value_name="new"
111-
).rename(columns={"index": "column"})
126+
summary_new_long = (
127+
summary_new.reset_index()
128+
.melt(id_vars="index", var_name="statistic", value_name="new")
129+
.rename(columns={"index": "column"})
130+
)
112131

113-
# merge old and new
132+
# merge old and new
114133
numeric_summary_changes = summary_old_long.merge(
115134
summary_new_long, on=["column", "statistic"], how="inner"
116135
)
@@ -128,7 +147,8 @@ def data_version_diff(df_old, df_new):
128147
new_type = df_new[col].dtype
129148
if old_type != new_type:
130149
dtype_changes_list.append({
131-
"column": col, "old_dtype": str(old_type), "new_type": str(new_type)
150+
"column": col, "old_dtype": str(old_type),
151+
"new_type": str(new_type)
132152
})
133153

134154
# convert to DF
@@ -149,19 +169,22 @@ def data_version_diff(df_old, df_new):
149169
"n_columns_added": len(columns_added),
150170
"n_columns_removed": len(columns_removed),
151171
"n_dtype_changes": len(dtype_changes),
152-
"n_missing_changes": int((missing_summary["difference"] != 0).sum()),
172+
"n_missing_changes": int(
173+
(missing_summary["difference"] != 0).sum()
174+
),
153175
},
154176
}
155177

156178
return result
157179

180+
158181
def display_data_version_diff(result):
159182
"""
160183
Print a formatted, human-readable summary of DataFrame version differences.
161184
162-
This function takes the output of `data_version_diff` and prints a structured
163-
console report highlighting row count changes, schema changes, missing value
164-
differences, numeric summary changes, and data type changes.
185+
This function takes the output of `data_version_diff` and prints a
186+
structured console report highlighting row count changes, schema changes,
187+
missing value differences, numeric summary changes, and data type changes.
165188
166189
Parameters
167190
----------
@@ -170,7 +193,8 @@ def display_data_version_diff(result):
170193
171194
Notes
172195
-----
173-
- This function is intended for interactive use (e.g., notebooks or terminals).
196+
- This function is intended for interactive use (e.g., notebooks or
197+
terminals).
174198
- It does not return any value.
175199
176200
Examples
@@ -186,7 +210,7 @@ def display_data_version_diff(result):
186210
rc = result["row_count_change"]
187211
diff = rc["row_difference"]
188212
sign = "+" if diff > 0 else ""
189-
print(f"\n ROWS CHANGE:")
213+
print("\n ROWS CHANGE:")
190214
print("-" * 60)
191215
print(f" Old Rows: {rc['old_row_count']}")
192216
print(f" New Rows: {rc['new_row_count']}")
@@ -199,7 +223,7 @@ def display_data_version_diff(result):
199223
print(f" Columns added: {', '.join(result['columns_added'])}")
200224
else:
201225
print(" Columns added: None")
202-
226+
203227
if result["columns_removed"]:
204228
print(f" Columns removed: {', '.join(result['columns_removed'])}")
205229
else:
@@ -208,7 +232,7 @@ def display_data_version_diff(result):
208232
# --- Missing values ---
209233
mv = result["missing_value_changes"]
210234
mv_changed = mv[mv["difference"] != 0]
211-
235+
212236
print("\n MISSING VALUE CHANGES:")
213237
print("-" * 60)
214238
if mv_changed.empty:
@@ -217,12 +241,12 @@ def display_data_version_diff(result):
217241
print(
218242
mv_changed.assign(
219243
change=lambda d: d["difference"].apply(
220-
lambda x: f"+{x}" if x>0 else str(x)
244+
lambda x: f"+{x}" if x > 0 else str(x)
221245
)
222246
)[["column", "missing_old", "missing_new", "change"]]
223247
.to_string(index=False)
224248
)
225-
249+
226250
# --- Numeric summary changes ---
227251
ns = result["numeric_summary_changes"]
228252
ns_changed = ns[ns["difference"] != 0]

src/csvplus/example.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
that explains the purpose of the module, at the top.
77
"""
88

9+
910
def add_numbers(a, b):
1011
"""
1112
Add two numbers together and return the result.

src/csvplus/examples/data_version_diff_demo.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,8 @@
1818
# "age": np.random.randint(18, 70, size=n_old),
1919
# "income": np.random.normal(loc=5000, scale=15000, size=n_old),
2020
# "email": [fake.email() for _ in range(n_old)],
21-
# "signup_date": [fake.date_between(start_date="-2y", end_date="today") for _ in range(n_old)],
21+
# "signup_date": [fake.date_between(start_date="-2y", end_date="today")
22+
# for _ in range(n_old)],
2223
# "country": [fake.country() for _ in range(n_old)],
2324
# })
2425

@@ -36,11 +37,13 @@
3637
# n_new_rows = 20
3738

3839
# new_rows = pd.DataFrame({
39-
# "user_id": range(df_old["user_id"].max() + 1, df_old["user_id"].max() + 1 + n_new_rows),
40+
# "user_id": range(df_old["user_id"].max() + 1, df_old["user_id"].max() +
41+
# 1 + n_new_rows),
4042
# "age": np.random.randint(18, 70, size=n_new_rows),
4143
# "income": np.random.normal(loc=60000, scale=20000, size=n_new_rows),
4244
# "email": [fake.email() for _ in range(n_new_rows)],
43-
# "signup_date": [fake.date_between(start_date="-1y", end_date="today") for _ in range(n_new_rows)],
45+
# "signup_date": [fake.date_between(start_date="-1y", end_date="today")
46+
# for _ in range(n_new_rows)],
4447
# "country": [fake.country() for _ in range(n_new_rows)],
4548
# })
4649
# df_new = pd.concat([df_new, new_rows], ignore_index=True)
@@ -50,7 +53,8 @@
5053

5154
# #column addition
5255
# df_new["last_login_date"] = [
53-
# fake.date_between(start_date="-6m", end_date="today") for _ in range(len(df_new))
56+
# fake.date_between(start_date="-6m", end_date="today")
57+
# for _ in range(len(df_new))
5458
# ]
5559

5660
# ## missing value changes

0 commit comments

Comments
 (0)