Merge pull request #18 from vertti/support-regex-patters

vertti · web-flow · commit ba6bc9a722fe · 2025-03-04T17:51:19.000+02:00
Support regex patterns
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,10 @@
 
 All notable changes to this project will be documented in this file.
 
+## 0.10.0
+
+- Add support for regex patterns in column name validation
+
 ## 0.9.4
 
 - Fix to strict flag loading when tool config was missing
diff --git a/README.md b/README.md
@@ -22,6 +22,7 @@ Like type hints for DataFrames, Daffy helps you catch structural mismatches earl
 ## Key Features
 
 - Validate DataFrame columns at function entry and exit points
+- Support regex patterns for matching column names (e.g., `"r/column_\d+/"`)
 - Check data types of columns
 - Control strictness of validation (allow or disallow extra columns)
 - Works with both Pandas and Polars DataFrames
diff --git a/daffy/decorators.py b/daffy/decorators.py
@@ -2,27 +2,73 @@
 
 import inspect
 import logging
+import re
 from functools import wraps
-from typing import Any, Callable, Dict, List, Optional, Union
+from typing import Any, Callable, Dict, List, Optional, Pattern, Tuple, Union
 
 import pandas as pd
 import polars as pl
 
 from daffy.config import get_strict
 
-ColumnsDef = Union[List, Dict]
+# New type definition to support regex patterns
+RegexColumnDef = Tuple[str, Pattern]  # Tuple of (pattern_str, compiled_pattern)
+ColumnsDef = Union[List, Dict, List[Union[str, RegexColumnDef]]]
 DataFrameType = Union[pd.DataFrame, pl.DataFrame]
 
 
+def _is_regex_pattern(column: Any) -> bool:
+    """Check if the column definition is a regex pattern tuple."""
+    return (
+        isinstance(column, tuple) and len(column) == 2 and isinstance(column[0], str) and isinstance(column[1], Pattern)
+    )
+
+
+def _match_column_with_regex(column_pattern: RegexColumnDef, df_columns: List[str]) -> List[str]:
+    """Find all column names that match the regex pattern."""
+    _, pattern = column_pattern
+    return [col for col in df_columns if pattern.match(col)]
+
+
+def _compile_regex_patterns(columns: List) -> List:
+    """Compile regex patterns in the column list."""
+    result = []
+    for col in columns:
+        if isinstance(col, str) and col.startswith("r/") and col.endswith("/"):
+            # Pattern is in the format "r/pattern/"
+            pattern_str = col[2:-1]  # Remove "r/" prefix and "/" suffix
+            compiled_pattern = re.compile(pattern_str)
+            result.append((col, compiled_pattern))
+        else:
+            result.append(col)
+    return result
+
+
 def _check_columns(df: DataFrameType, columns: ColumnsDef, strict: bool) -> None:
     missing_columns = []
     dtype_mismatches = []
+    matched_by_regex = set()
 
+    # Handle list of column names/patterns
     if isinstance(columns, list):
-        for column in columns:
-            if column not in df.columns:
-                missing_columns.append(column)
-    if isinstance(columns, dict):
+        # First, compile any regex patterns
+        processed_columns = _compile_regex_patterns(columns)
+
+        for column in processed_columns:
+            if isinstance(column, str):
+                # Direct column name match
+                if column not in df.columns:
+                    missing_columns.append(column)
+            elif _is_regex_pattern(column):
+                # Regex pattern match
+                matches = _match_column_with_regex(column, list(df.columns))
+                if not matches:
+                    missing_columns.append(column[0])  # Add the original pattern string
+                else:
+                    matched_by_regex.update(matches)
+
+    # Handle dictionary of column names/types
+    elif isinstance(columns, dict):
         for column, dtype in columns.items():
             if column not in df.columns:
                 missing_columns.append(column)
@@ -39,18 +85,26 @@ def _check_columns(df: DataFrameType, columns: ColumnsDef, strict: bool) -> None
         raise AssertionError(mismatches)
 
     if strict:
-        extra_columns = set(df.columns) - set(columns)
+        if isinstance(columns, list):
+            # For regex matches, we need to consider all matched columns
+            explicit_columns = {col for col in columns if isinstance(col, str)}
+            allowed_columns = explicit_columns.union(matched_by_regex)
+            extra_columns = set(df.columns) - allowed_columns
+        else:
+            extra_columns = set(df.columns) - set(columns)
+
         if extra_columns:
             raise AssertionError(f"DataFrame contained unexpected column(s): {', '.join(extra_columns)}")
 
 
 def df_out(columns: Optional[ColumnsDef] = None, strict: Optional[bool] = None) -> Callable:
-    """Decorate a function that returns a Pandas DataFrame.
+    """Decorate a function that returns a Pandas or Polars DataFrame.
 
     Document the return value of a function. The return value will be validated in runtime.
 
     Args:
-        columns (ColumnsDef, optional): List or dict that describes expected columns of the DataFrame. Defaults to None.
+        columns (ColumnsDef, optional): List or dict that describes expected columns of the DataFrame.
+            List can contain regex patterns in format "r/pattern/" (e.g., "r/Col[0-9]+/"). Defaults to None.
         strict (bool, optional): If True, columns must match exactly with no extra columns.
             If None, uses the value from [tool.daffy] strict setting in pyproject.toml.
 
@@ -91,13 +145,14 @@ def _get_parameter(func: Callable, name: Optional[str] = None, *args: str, **kwa
 
 
 def df_in(name: Optional[str] = None, columns: Optional[ColumnsDef] = None, strict: Optional[bool] = None) -> Callable:
-    """Decorate a function parameter that is a Pandas DataFrame.
+    """Decorate a function parameter that is a Pandas or Polars DataFrame.
 
-    Document the contents of an inpute parameter. The parameter will be validated in runtime.
+    Document the contents of an input parameter. The parameter will be validated in runtime.
 
     Args:
         name (Optional[str], optional): Name of the parameter that contains a DataFrame. Defaults to None.
-        columns (ColumnsDef, optional): List or dict that describes expected columns of the DataFrame. Defaults to None.
+        columns (ColumnsDef, optional): List or dict that describes expected columns of the DataFrame.
+            List can contain regex patterns in format "r/pattern/" (e.g., "r/Col[0-9]+/"). Defaults to None.
         strict (bool, optional): If True, columns must match exactly with no extra columns.
             If None, uses the value from [tool.daffy] strict setting in pyproject.toml.
 
diff --git a/docs/usage.md b/docs/usage.md
@@ -63,6 +63,31 @@ def filter_cars(car_df):
     return filtered_cars_df
 ```
 
+## Column Pattern Matching with Regex
+
+You can use regex patterns to match column names that follow a specific pattern. This is useful when working with dynamic column names or when dealing with many similar columns.
+
+Define a regex pattern by using the format `"r/pattern/"`:
+
+```python
+@df_in(columns=["Brand", r"r/Price_\d+/"])
+def process_data(df):
+    # This will accept DataFrames with columns like "Brand", "Price_1", "Price_2", etc.
+    ...
+```
+
+In this example:
+- The DataFrame must have a column named exactly "Brand"
+- The DataFrame must have at least one column matching the pattern "Price_\d+" (e.g., "Price_1", "Price_2", etc.)
+
+If no columns match a regex pattern, an error is raised:
+
+```
+AssertionError: Missing columns: ['r/Price_\\d+/']. Got columns: ['Brand', 'Model']
+```
+
+Regex patterns are also considered in strict mode. Any column matching a regex pattern is considered valid.
+
 ## Data Type Validation
 
 If you want to also check the data types of each column, you can replace the column array:
@@ -83,6 +108,8 @@ This will not only check that the specified columns are found from the DataFrame
 AssertionError("Column Price has wrong dtype. Was int64, expected float64")
 ```
 
+> Note: Regex pattern matching is only available for column name lists, not for dictionaries specifying data types.
+
 ## Strict Mode
 
 You can enable strict-mode for both `@df_in` and `@df_out`. This will raise an error if the DataFrame contains columns not defined in the annotation:
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "daffy"
-version = "0.9.4"
+version = "0.10.0"
 description = "Function decorators for Pandas and Polars Dataframe column name and data type validation"
 authors = [
  { name="Janne Sinivirta", email="janne.sinivirta@gmail.com" },
diff --git a/tests/test_df_in.py b/tests/test_df_in.py
@@ -208,3 +208,50 @@ def test_fn(cars: DataFrameType, ext_cars: DataFrameType) -> int:
         return len(cars) + len(ext_cars)
 
     test_fn(basic_df, ext_cars=extended_df)
+
+
+def test_regex_column_pattern(basic_pandas_df: pd.DataFrame) -> None:
+    # Create a DataFrame with numbered price columns
+    df = basic_pandas_df.copy()
+    df["Price_1"] = df["Price"] * 1
+    df["Price_2"] = df["Price"] * 2
+    df["Price_3"] = df["Price"] * 3
+
+    @df_in(columns=["Brand", "r/Price_[0-9]/"])
+    def test_fn(my_input: Any) -> Any:
+        return my_input
+
+    # This should pass since we have Price_1, Price_2, and Price_3 columns
+    result = test_fn(df)
+    assert "Price_1" in result.columns
+    assert "Price_2" in result.columns
+    assert "Price_3" in result.columns
+
+
+def test_regex_column_pattern_missing(basic_pandas_df: pd.DataFrame) -> None:
+    @df_in(columns=["Brand", "r/NonExistent_[0-9]/"])
+    def test_fn(my_input: Any) -> Any:
+        return my_input
+
+    # This should fail since we don't have any columns matching the pattern
+    with pytest.raises(AssertionError) as excinfo:
+        test_fn(basic_pandas_df)
+
+    assert "Missing columns: ['r/NonExistent_[0-9]/']" in str(excinfo.value)
+
+
+def test_regex_column_pattern_with_strict(basic_pandas_df: pd.DataFrame) -> None:
+    # Create a DataFrame with numbered price columns
+    df = basic_pandas_df.copy()
+    df["Price_1"] = df["Price"] * 1
+    df["Price_2"] = df["Price"] * 2
+
+    @df_in(columns=["Brand", "r/Price_[0-9]/"], strict=True)
+    def test_fn(my_input: Any) -> Any:
+        return my_input
+
+    # This should fail: "Price_1" and "Price_2" match the regex, but "Price" is unexpected in strict mode
+    with pytest.raises(AssertionError) as excinfo:
+        test_fn(df)
+
+    assert "DataFrame contained unexpected column(s): Price" in str(excinfo.value)
diff --git a/tests/test_df_out.py b/tests/test_df_out.py
@@ -87,3 +87,44 @@ def test_fn(my_input: Any) -> Any:
 
     assert list(basic_pandas_df.columns) == ["Brand", "Price"]  # For sanity
     pd.testing.assert_frame_equal(extended_pandas_df, test_fn(basic_pandas_df.copy()))
+
+
+def test_regex_column_pattern_in_output(basic_pandas_df: pd.DataFrame) -> None:
+    # Create a function that adds numbered price columns
+    @df_out(columns=["Brand", "r/Price_[0-9]/"])
+    def test_fn() -> pd.DataFrame:
+        df = basic_pandas_df.copy()
+        df["Price_1"] = df["Price"] * 1
+        df["Price_2"] = df["Price"] * 2
+        return df
+
+    # This should pass since the output has Brand and Price_1, Price_2 columns
+    result = test_fn()
+    assert "Price_1" in result.columns
+    assert "Price_2" in result.columns
+
+
+def test_regex_column_pattern_missing_in_output(basic_pandas_df: pd.DataFrame) -> None:
+    @df_out(columns=["Brand", "r/NonExistent_[0-9]/"])
+    def test_fn() -> pd.DataFrame:
+        return basic_pandas_df.copy()
+
+    # This should fail since the output doesn't have columns matching the pattern
+    with pytest.raises(AssertionError) as excinfo:
+        test_fn()
+
+    assert "Missing columns: ['r/NonExistent_[0-9]/']" in str(excinfo.value)
+
+
+def test_regex_column_pattern_with_strict_in_output(basic_pandas_df: pd.DataFrame) -> None:
+    @df_out(columns=["Brand", "r/Price_[0-9]/"], strict=True)
+    def test_fn() -> pd.DataFrame:
+        df = basic_pandas_df.copy()
+        df["Price_1"] = df["Price"] * 1
+        return df
+
+    # This should raise an error because Price is unexpected
+    with pytest.raises(AssertionError) as excinfo:
+        test_fn()
+
+    assert "DataFrame contained unexpected column(s): Price" in str(excinfo.value)