Skip to content

Commit 2a62d0a

Browse files
authored
Merge pull request #22 from vertti/fix-typings
Fix typings
2 parents 39db742 + 670d36e commit 2a62d0a

File tree

6 files changed

+203
-25
lines changed

6 files changed

+203
-25
lines changed

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,12 @@
22

33
All notable changes to this project will be documented in this file.
44

5+
## 0.13.0
6+
7+
- Fix type annotation issues with decorator parameters that could cause type errors in strict type checking
8+
- Use `Sequence` instead of `List` for better type variance compatibility
9+
- Add test case that validates type compatibility
10+
511
## 0.12.0
612

713
- Add support for regex patterns used with column dtype validation

daffy/decorators.py

Lines changed: 21 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@
55
import re
66
from functools import wraps
77
from typing import Any, Callable, Dict, List, Optional, Pattern, Tuple, TypeVar, Union
8+
from typing import Sequence as Seq # Renamed to avoid collision
89

910
import pandas as pd
1011
import polars as pl
@@ -16,30 +17,30 @@
1617
from daffy.config import get_strict
1718

1819
# Type variables for preserving return types
19-
T = TypeVar("T")
20-
R = TypeVar("R")
20+
T = TypeVar("T") # Generic type var for df_log
21+
DF = TypeVar("DF", bound=Union[PandasDataFrame, PolarsDataFrame])
22+
R = TypeVar("R") # Return type for df_in
2123

24+
RegexColumnDef = Tuple[str, Pattern[str]]
2225

23-
# Improved type definitions to support regex patterns
24-
RegexColumnDef = Tuple[str, Pattern[str]] # Tuple of (pattern_str, compiled_pattern)
25-
ColumnsDef = Union[List[Union[str, RegexColumnDef]], Dict[Union[str, RegexColumnDef], Any]]
26+
ColumnsList = Seq[Union[str, RegexColumnDef]]
27+
ColumnsDict = Dict[Union[str, RegexColumnDef], Any]
28+
ColumnsDef = Union[ColumnsList, ColumnsDict, None]
2629
DataFrameType = Union[PandasDataFrame, PolarsDataFrame]
2730

2831

2932
def _is_regex_pattern(column: Any) -> bool:
30-
"""Check if the column definition is a regex pattern tuple."""
3133
return (
3234
isinstance(column, tuple) and len(column) == 2 and isinstance(column[0], str) and isinstance(column[1], Pattern)
3335
)
3436

3537

3638
def _match_column_with_regex(column_pattern: RegexColumnDef, df_columns: List[str]) -> List[str]:
37-
"""Find all column names that match the regex pattern."""
3839
_, pattern = column_pattern
3940
return [col for col in df_columns if pattern.match(col)]
4041

4142

42-
def _compile_regex_patterns(columns: List[Any]) -> List[Union[str, RegexColumnDef]]:
43+
def _compile_regex_patterns(columns: Seq[Any]) -> List[Union[str, RegexColumnDef]]:
4344
"""Compile regex patterns in the column list."""
4445
result: List[Union[str, RegexColumnDef]] = []
4546
for col in columns:
@@ -53,7 +54,7 @@ def _compile_regex_patterns(columns: List[Any]) -> List[Union[str, RegexColumnDe
5354
return result
5455

5556

56-
def _check_columns(df: DataFrameType, columns: ColumnsDef, strict: bool) -> None:
57+
def _check_columns(df: DataFrameType, columns: Union[ColumnsList, ColumnsDict], strict: bool) -> None:
5758
missing_columns = []
5859
dtype_mismatches = []
5960
matched_by_regex = set()
@@ -137,15 +138,16 @@ def _check_columns(df: DataFrameType, columns: ColumnsDef, strict: bool) -> None
137138

138139

139140
def df_out(
140-
columns: Optional[ColumnsDef] = None, strict: Optional[bool] = None
141-
) -> Callable[[Callable[..., DataFrameType]], Callable[..., DataFrameType]]:
141+
columns: Union[ColumnsList, ColumnsDict, None] = None, strict: Optional[bool] = None
142+
) -> Callable[[Callable[..., DF]], Callable[..., DF]]:
142143
"""Decorate a function that returns a Pandas or Polars DataFrame.
143144
144145
Document the return value of a function. The return value will be validated in runtime.
145146
146147
Args:
147-
columns (ColumnsDef, optional): List or dict that describes expected columns of the DataFrame.
148-
List can contain regex patterns in format "r/pattern/" (e.g., "r/Col[0-9]+/").
148+
columns (Union[Sequence[str], Dict[str, Any]], optional): Sequence or dict that describes expected columns
149+
of the DataFrame.
150+
Sequence can contain regex patterns in format "r/pattern/" (e.g., "r/Col[0-9]+/").
149151
Dict can use regex patterns as keys in format "r/pattern/" to validate dtypes for matching columns.
150152
Defaults to None.
151153
strict (bool, optional): If True, columns must match exactly with no extra columns.
@@ -155,9 +157,9 @@ def df_out(
155157
Callable: Decorated function with preserved DataFrame return type
156158
"""
157159

158-
def wrapper_df_out(func: Callable[..., DataFrameType]) -> Callable[..., DataFrameType]:
160+
def wrapper_df_out(func: Callable[..., DF]) -> Callable[..., DF]:
159161
@wraps(func)
160-
def wrapper(*args: Any, **kwargs: Any) -> DataFrameType:
162+
def wrapper(*args: Any, **kwargs: Any) -> DF:
161163
result = func(*args, **kwargs)
162164
assert isinstance(result, pd.DataFrame) or isinstance(result, pl.DataFrame), (
163165
f"Wrong return type. Expected DataFrame, got {type(result)}"
@@ -188,16 +190,17 @@ def _get_parameter(func: Callable[..., Any], name: Optional[str] = None, *args:
188190

189191

190192
def df_in(
191-
name: Optional[str] = None, columns: Optional[ColumnsDef] = None, strict: Optional[bool] = None
193+
name: Optional[str] = None, columns: Union[ColumnsList, ColumnsDict, None] = None, strict: Optional[bool] = None
192194
) -> Callable[[Callable[..., R]], Callable[..., R]]:
193195
"""Decorate a function parameter that is a Pandas or Polars DataFrame.
194196
195197
Document the contents of an input parameter. The parameter will be validated in runtime.
196198
197199
Args:
198200
name (Optional[str], optional): Name of the parameter that contains a DataFrame. Defaults to None.
199-
columns (ColumnsDef, optional): List or dict that describes expected columns of the DataFrame.
200-
List can contain regex patterns in format "r/pattern/" (e.g., "r/Col[0-9]+/").
201+
columns (Union[Sequence[str], Dict[str, Any]], optional): Sequence or dict that describes expected columns
202+
of the DataFrame.
203+
Sequence can contain regex patterns in format "r/pattern/" (e.g., "r/Col[0-9]+/").
201204
Dict can use regex patterns as keys in format "r/pattern/" to validate dtypes for matching columns.
202205
Defaults to None.
203206
strict (bool, optional): If True, columns must match exactly with no extra columns.

mypy.ini

Lines changed: 0 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -23,8 +23,3 @@ warn_unused_ignores = True
2323
implicit_reexport = False
2424
strict_optional = True
2525
strict_equality = True
26-
27-
# Relax rules for tests
28-
[mypy-tests.*]
29-
disallow_any_unimported = False
30-
disallow_any_decorated = False

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "daffy"
3-
version = "0.12.0"
3+
version = "0.13.0"
44
description = "Function decorators for Pandas and Polars Dataframe column name and data type validation"
55
authors = [
66
{ name="Janne Sinivirta", email="[email protected]" },

tests/test_df_out.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@
99

1010

1111
def test_wrong_return_type() -> None:
12-
@df_out() # type: ignore[arg-type]
12+
@df_out() # type: ignore[type-var]
1313
def test_fn() -> int:
1414
return 1
1515

tests/test_type_compatibility.py

Lines changed: 174 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,174 @@
1+
"""Test type compatibility issues that might occur in client code."""
2+
3+
from typing import Sequence
4+
5+
import pandas as pd
6+
import polars as pl
7+
8+
from daffy import df_in, df_out
9+
10+
11+
# Pass-through function for testing
12+
@df_in(columns=["Brand", "Price"])
13+
def simple_list_columns(df: pd.DataFrame) -> pd.DataFrame:
14+
return df
15+
16+
17+
def test_simple_list_columns() -> None:
18+
"""Test with a simple list of string columns."""
19+
df = pd.DataFrame({"Brand": ["Toyota"], "Price": [25000]})
20+
result = simple_list_columns(df)
21+
assert isinstance(result, pd.DataFrame)
22+
23+
24+
# This would test the Union type DataFrameType compatibility
25+
@df_out(columns=["Brand", "Price"])
26+
def return_dataframe() -> pd.DataFrame:
27+
return pd.DataFrame({"Brand": ["Toyota"], "Price": [25000]})
28+
29+
30+
def function_with_explicit_type_annotations(columns: Sequence[str]) -> None:
31+
@df_in(columns=columns)
32+
def inner_function(df: pd.DataFrame) -> pd.DataFrame:
33+
return df
34+
35+
df = pd.DataFrame({"Brand": ["Toyota"], "Price": [25000]})
36+
inner_function(df)
37+
38+
39+
def test_with_polars() -> None:
40+
df = pl.DataFrame({"Brand": ["Toyota"], "Price": [25000]})
41+
42+
@df_in(columns=["Brand", "Price"])
43+
def inner_function(df_param: pl.DataFrame) -> pl.DataFrame:
44+
return df_param
45+
46+
inner_function(df)
47+
48+
49+
def test_function_with_explicit_type_annotations() -> None:
50+
columns = ["Brand", "Price"]
51+
function_with_explicit_type_annotations(columns)
52+
53+
54+
def test_simple_list_columns_function() -> None:
55+
df = pd.DataFrame({"Brand": ["Toyota"], "Price": [25000]})
56+
simple_list_columns(df)
57+
58+
59+
def test_return_dataframe_function() -> None:
60+
result = return_dataframe()
61+
assert isinstance(result, pd.DataFrame)
62+
63+
64+
def test_dtype_with_regex_pandas() -> None:
65+
"""Test using both dtype validation and regex patterns with pandas."""
66+
# Create a DataFrame with numeric columns following a pattern
67+
df = pd.DataFrame(
68+
{
69+
"measure_2020": [10, 20, 30],
70+
"measure_2021": [15, 25, 35],
71+
"measure_2022": [18, 28, 38],
72+
"category": ["A", "B", "C"],
73+
}
74+
)
75+
76+
# Define a function using both regex patterns and dtype validation
77+
@df_in(
78+
columns={
79+
"category": "object",
80+
"r/measure_\\d{4}/": "int64", # All measure_YYYY columns should be int64
81+
}
82+
)
83+
def process_measures(data: pd.DataFrame) -> pd.DataFrame:
84+
return data
85+
86+
# This should pass type checking and runtime validation
87+
result = process_measures(df)
88+
assert "measure_2020" in result.columns
89+
assert "measure_2021" in result.columns
90+
assert "measure_2022" in result.columns
91+
92+
93+
def test_dtype_with_regex_polars() -> None:
94+
"""Test using both dtype validation and regex patterns with polars."""
95+
# Create a Polars DataFrame with numeric columns following a pattern
96+
df = pl.DataFrame(
97+
{
98+
"measure_2020": [10, 20, 30],
99+
"measure_2021": [15, 25, 35],
100+
"measure_2022": [18, 28, 38],
101+
"category": ["A", "B", "C"],
102+
}
103+
)
104+
105+
# Define a function using both regex patterns and dtype validation
106+
@df_in(
107+
columns={
108+
"category": pl.String,
109+
"r/measure_\\d{4}/": pl.Int64, # All measure_YYYY columns should be Int64
110+
}
111+
)
112+
def process_measures(data: pl.DataFrame) -> pl.DataFrame:
113+
return data
114+
115+
# This should pass type checking and runtime validation
116+
result = process_measures(df)
117+
assert "measure_2020" in result.columns
118+
assert "measure_2021" in result.columns
119+
assert "measure_2022" in result.columns
120+
121+
122+
def test_type_narrowing_with_df_out_pandas() -> None:
123+
"""Test assigning df_out decorated function result to a specific Pandas DataFrame type."""
124+
125+
# Define a function that returns a DataFrame with df_out decoration
126+
@df_out(columns=["name", "value"])
127+
def get_data() -> pd.DataFrame:
128+
return pd.DataFrame({"name": ["A", "B", "C"], "value": [1, 2, 3]})
129+
130+
# The critical test: we should be able to assign the result to a variable
131+
# explicitly typed as pd.DataFrame without mypy errors
132+
result: pd.DataFrame = get_data()
133+
assert "name" in result.columns
134+
assert "value" in result.columns
135+
136+
137+
def test_type_narrowing_with_df_out_polars() -> None:
138+
"""Test assigning df_out decorated function result to a specific Polars DataFrame type."""
139+
140+
# Define a function that returns a DataFrame with df_out decoration
141+
@df_out(columns=["name", "value"])
142+
def get_data() -> pl.DataFrame:
143+
return pl.DataFrame({"name": ["A", "B", "C"], "value": [1, 2, 3]})
144+
145+
# The critical test: we should be able to assign the result to a variable
146+
# explicitly typed as pl.DataFrame without mypy errors
147+
result: pl.DataFrame = get_data()
148+
assert "name" in result.columns
149+
assert "value" in result.columns
150+
151+
152+
def test_df_out_preserves_specific_return_type() -> None:
153+
"""Test that df_out preserves the specific DataFrame return type annotation."""
154+
155+
# Function that specifically returns pandas DataFrame with df_out
156+
@df_out(columns=["col1", "col2"])
157+
def function_with_pandas_df() -> pd.DataFrame:
158+
return pd.DataFrame({"col1": [1, 2], "col2": [3, 4]})
159+
160+
# We should be able to assign to a variable typed as pandas DataFrame
161+
# without having to cast or getting type errors
162+
result: pd.DataFrame = function_with_pandas_df()
163+
164+
# Same with a function returning polars DataFrame
165+
@df_out(columns=["col1", "col2"])
166+
def function_with_polars_df() -> pl.DataFrame:
167+
return pl.DataFrame({"col1": [1, 2], "col2": [3, 4]})
168+
169+
# Should be assignable to a variable typed as polars DataFrame
170+
polars_result: pl.DataFrame = function_with_polars_df()
171+
172+
# Both should work at runtime too
173+
assert isinstance(result, pd.DataFrame)
174+
assert isinstance(polars_result, pl.DataFrame)

0 commit comments

Comments (0)