Skip to content

Commit 39db742

Browse files
authored
Merge pull request #21 from vertti/regex-with-dtypes
Regex with dtypes
2 parents 18f07c4 + c92fa04 commit 39db742

File tree

6 files changed

+175
-10
lines changed

6 files changed

+175
-10
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,10 @@
22

33
All notable changes to this project will be documented in this file.
44

5+
## 0.12.0
6+
7+
- Add support for regex patterns used with column dtype validation
8+
59
## 0.11.0
610

711
- Update function parameter types for better type safety

daffy/decorators.py

Lines changed: 42 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222

2323
# Improved type definitions to support regex patterns
2424
RegexColumnDef = Tuple[str, Pattern[str]] # Tuple of (pattern_str, compiled_pattern)
25-
ColumnsDef = Union[List[Union[str, RegexColumnDef]], Dict[str, Any]]
25+
ColumnsDef = Union[List[Union[str, RegexColumnDef]], Dict[Union[str, RegexColumnDef], Any]]
2626
DataFrameType = Union[PandasDataFrame, PolarsDataFrame]
2727

2828

@@ -78,11 +78,38 @@ def _check_columns(df: DataFrameType, columns: ColumnsDef, strict: bool) -> None
7878

7979
# Handle dictionary of column names/types
8080
elif isinstance(columns, dict):
81+
# First, process dictionary keys for regex patterns
82+
processed_dict: Dict[Union[str, RegexColumnDef], Any] = {}
8183
for column, dtype in columns.items():
82-
if column not in df.columns:
83-
missing_columns.append(column)
84-
elif df[column].dtype != dtype:
85-
dtype_mismatches.append((column, df[column].dtype, dtype))
84+
if isinstance(column, str) and column.startswith("r/") and column.endswith("/"):
85+
# Pattern is in the format "r/pattern/"
86+
pattern_str = column[2:-1] # Remove "r/" prefix and "/" suffix
87+
compiled_pattern = re.compile(pattern_str)
88+
processed_dict[(column, compiled_pattern)] = dtype
89+
else:
90+
processed_dict[column] = dtype
91+
92+
# Check each column against dictionary keys
93+
regex_matched_columns = set()
94+
for column_key, dtype in processed_dict.items():
95+
if isinstance(column_key, str):
96+
# Direct column name match
97+
if column_key not in df.columns:
98+
missing_columns.append(column_key)
99+
elif df[column_key].dtype != dtype:
100+
dtype_mismatches.append((column_key, df[column_key].dtype, dtype))
101+
elif _is_regex_pattern(column_key):
102+
# Regex pattern match
103+
pattern_str, compiled_pattern = column_key
104+
matches = _match_column_with_regex(column_key, list(df.columns))
105+
if not matches:
106+
missing_columns.append(pattern_str) # Add the original pattern string
107+
else:
108+
for matched_col in matches:
109+
matched_by_regex.add(matched_col)
110+
regex_matched_columns.add(matched_col)
111+
if df[matched_col].dtype != dtype:
112+
dtype_mismatches.append((matched_col, df[matched_col].dtype, dtype))
86113

87114
if missing_columns:
88115
raise AssertionError(f"Missing columns: {missing_columns}. Got {_describe_pd(df)}")
@@ -100,7 +127,10 @@ def _check_columns(df: DataFrameType, columns: ColumnsDef, strict: bool) -> None
100127
allowed_columns = explicit_columns.union(matched_by_regex)
101128
extra_columns = set(df.columns) - allowed_columns
102129
else:
103-
extra_columns = set(df.columns) - set(columns)
130+
# For dict with regex patterns, we need to handle both direct and regex matches
131+
explicit_columns = {col for col in columns if isinstance(col, str)}
132+
allowed_columns = explicit_columns.union(matched_by_regex)
133+
extra_columns = set(df.columns) - allowed_columns
104134

105135
if extra_columns:
106136
raise AssertionError(f"DataFrame contained unexpected column(s): {', '.join(extra_columns)}")
@@ -115,7 +145,9 @@ def df_out(
115145
116146
Args:
117147
columns (ColumnsDef, optional): List or dict that describes expected columns of the DataFrame.
118-
List can contain regex patterns in format "r/pattern/" (e.g., "r/Col[0-9]+/"). Defaults to None.
148+
List can contain regex patterns in format "r/pattern/" (e.g., "r/Col[0-9]+/").
149+
Dict can use regex patterns as keys in format "r/pattern/" to validate dtypes for matching columns.
150+
Defaults to None.
119151
strict (bool, optional): If True, columns must match exactly with no extra columns.
120152
If None, uses the value from [tool.daffy] strict setting in pyproject.toml.
121153
@@ -165,7 +197,9 @@ def df_in(
165197
Args:
166198
name (Optional[str], optional): Name of the parameter that contains a DataFrame. Defaults to None.
167199
columns (ColumnsDef, optional): List or dict that describes expected columns of the DataFrame.
168-
List can contain regex patterns in format "r/pattern/" (e.g., "r/Col[0-9]+/"). Defaults to None.
200+
List can contain regex patterns in format "r/pattern/" (e.g., "r/Col[0-9]+/").
201+
Dict can use regex patterns as keys in format "r/pattern/" to validate dtypes for matching columns.
202+
Defaults to None.
169203
strict (bool, optional): If True, columns must match exactly with no extra columns.
170204
If None, uses the value from [tool.daffy] strict setting in pyproject.toml.
171205

docs/usage.md

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,26 @@ This will not only check that the specified columns are found from the DataFrame
108108
AssertionError("Column Price has wrong dtype. Was int64, expected float64")
109109
```
110110

111-
> Note: Regex pattern matching is only available for column name lists, not for dictionaries specifying data types.
111+
### Combining Regex Patterns with Data Type Validation
112+
113+
Regex patterns can also be used as dictionary keys when specifying data types:
114+
115+
```python
116+
@df_in(columns={"Brand": "object", "r/Price_\d+/": "int64"})
117+
def process_data(df):
118+
# This will check that all columns matching "Price_\d+" have int64 dtype
119+
...
120+
```
121+
122+
In this example:
123+
- The DataFrame must have a column named exactly "Brand" with dtype "object"
124+
- Any columns matching the pattern "Price_\d+" (e.g., "Price_1", "Price_2") must have dtype "int64"
125+
126+
If a column matches the regex pattern but has the wrong dtype, an error is raised:
127+
128+
```
129+
AssertionError: Column Price_2 has wrong dtype. Was float64, expected int64
130+
```
112131

113132
## Strict Mode
114133

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "daffy"
3-
version = "0.11.0"
3+
version = "0.12.0"
44
description = "Function decorators for Pandas and Polars Dataframe column name and data type validation"
55
authors = [
66
{ name="Janne Sinivirta", email="[email protected]" },

tests/test_df_in.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -255,3 +255,51 @@ def test_fn(my_input: Any) -> Any:
255255
test_fn(df)
256256

257257
assert "DataFrame contained unexpected column(s): Price" in str(excinfo.value)
258+
259+
260+
def test_regex_column_with_dtype_pandas(basic_pandas_df: pd.DataFrame) -> None:
261+
# Create a DataFrame with numbered price columns
262+
df = basic_pandas_df.copy()
263+
df["Price_1"] = df["Price"] * 1
264+
df["Price_2"] = df["Price"] * 2
265+
266+
@df_in(columns={"Brand": "object", "r/Price_[0-9]/": "int64"})
267+
def test_fn(my_input: Any) -> Any:
268+
return my_input
269+
270+
# This should pass since Price_1 and Price_2 are int64
271+
result = test_fn(df)
272+
assert "Price_1" in result.columns
273+
assert "Price_2" in result.columns
274+
275+
276+
def test_regex_column_with_dtype_mismatch_pandas(basic_pandas_df: pd.DataFrame) -> None:
277+
# Create a DataFrame with numbered price columns
278+
df = basic_pandas_df.copy()
279+
df["Price_1"] = df["Price"] * 1
280+
df["Price_2"] = df["Price"] * 2.0 # Make this a float
281+
282+
@df_in(columns={"Brand": "object", "r/Price_[0-9]/": "int64"})
283+
def test_fn(my_input: Any) -> Any:
284+
return my_input
285+
286+
# This should fail since Price_2 is float64, not int64
287+
with pytest.raises(AssertionError) as excinfo:
288+
test_fn(df)
289+
290+
assert "Column Price_2 has wrong dtype. Was float64, expected int64" in str(excinfo.value)
291+
292+
293+
def test_regex_column_with_dtype_polars(basic_polars_df: pl.DataFrame) -> None:
294+
# Create a DataFrame with numbered price columns
295+
# Polars DataFrames are immutable, so we don't need to copy
296+
df = basic_polars_df.with_columns([pl.col("Price").alias("Price_1"), pl.col("Price").alias("Price_2")])
297+
298+
@df_in(columns={"Brand": pl.datatypes.String, "r/Price_[0-9]/": pl.datatypes.Int64})
299+
def test_fn(my_input: Any) -> Any:
300+
return my_input
301+
302+
# This should pass since Price_1 and Price_2 are Int64
303+
result = test_fn(df)
304+
assert "Price_1" in result.columns
305+
assert "Price_2" in result.columns

tests/test_df_out.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,3 +131,63 @@ def test_fn() -> pd.DataFrame:
131131
test_fn()
132132

133133
assert "DataFrame contained unexpected column(s): Price" in str(excinfo.value)
134+
135+
136+
def test_regex_column_with_dtype_in_output_pandas(basic_pandas_df: pd.DataFrame) -> None:
137+
# Create a function that adds numbered price columns
138+
@df_out(columns={"Brand": "object", "r/Price_[0-9]/": "int64"})
139+
def test_fn() -> pd.DataFrame:
140+
df = basic_pandas_df.copy()
141+
df["Price_1"] = df["Price"] * 1
142+
df["Price_2"] = df["Price"] * 2
143+
return df
144+
145+
# This should pass since Price_1 and Price_2 are int64
146+
result = test_fn()
147+
assert "Price_1" in result.columns
148+
assert "Price_2" in result.columns
149+
150+
151+
def test_regex_column_with_dtype_mismatch_in_output_pandas(basic_pandas_df: pd.DataFrame) -> None:
152+
# Create a function that adds numbered price columns with one wrong dtype
153+
@df_out(columns={"Brand": "object", "r/Price_[0-9]/": "int64"})
154+
def test_fn() -> pd.DataFrame:
155+
df = basic_pandas_df.copy()
156+
df["Price_1"] = df["Price"] * 1
157+
df["Price_2"] = df["Price"] * 2.0 # Make this a float
158+
return df
159+
160+
# This should fail since Price_2 is float64, not int64
161+
with pytest.raises(AssertionError) as excinfo:
162+
test_fn()
163+
164+
assert "Column Price_2 has wrong dtype. Was float64, expected int64" in str(excinfo.value)
165+
166+
167+
def test_regex_column_with_dtype_in_output_polars(basic_polars_df: pl.DataFrame) -> None:
168+
# Create a function that adds numbered price columns
169+
@df_out(columns={"Brand": pl.datatypes.String, "r/Price_[0-9]/": pl.datatypes.Int64})
170+
def test_fn() -> pl.DataFrame:
171+
# Polars DataFrames are immutable, so we build a new one
172+
return basic_polars_df.with_columns([pl.col("Price").alias("Price_1"), pl.col("Price").alias("Price_2")])
173+
174+
# This should pass since Price_1 and Price_2 are Int64
175+
result = test_fn()
176+
assert "Price_1" in result.columns
177+
assert "Price_2" in result.columns
178+
179+
180+
def test_regex_column_with_dtype_strict_in_output_pandas(basic_pandas_df: pd.DataFrame) -> None:
181+
# Create a function that adds numbered price columns
182+
@df_out(columns={"Brand": "object", "r/Price_[0-9]/": "int64"}, strict=True)
183+
def test_fn() -> pd.DataFrame:
184+
df = basic_pandas_df.copy()
185+
df["Price_1"] = df["Price"] * 1
186+
df["Price_2"] = df["Price"] * 2
187+
return df
188+
189+
# This should fail because Price is unexpected
190+
with pytest.raises(AssertionError) as excinfo:
191+
test_fn()
192+
193+
assert "DataFrame contained unexpected column(s): Price" in str(excinfo.value)

0 commit comments

Comments
 (0)