 import logging
 import re
 from functools import wraps
-from typing import Any, Callable, Dict, List, Optional, Pattern, Tuple, TypeVar, Union
+from typing import Any, Callable, Dict, List, Optional, Pattern, Set, Tuple, TypeVar, Union
 from typing import Sequence as Seq  # Renamed to avoid collision
 
 import pandas as pd
@@ -35,113 +35,104 @@ def _is_regex_pattern(column: Any) -> bool:
     )
 
 
+def _assert_is_dataframe(obj: Any, context: str) -> None:
+    if not isinstance(obj, (pd.DataFrame, pl.DataFrame)):
+        raise AssertionError(f"Wrong {context}. Expected DataFrame, got {type(obj).__name__} instead.")
+
+
+def _make_param_info(param_name: Optional[str]) -> str:
+    return f" in parameter '{param_name}'" if param_name else ""
+
+
+def _validate_column(
+    column_spec: Union[str, RegexColumnDef], df: DataFrameType, expected_dtype: Any = None
+) -> Tuple[List[str], List[Tuple[str, Any, Any]], Set[str]]:
+    """Validate a single column specification against a DataFrame."""
+    missing_columns = []
+    dtype_mismatches = []
+    matched_by_regex = set()
+
+    if isinstance(column_spec, str):
+        if column_spec not in df.columns:
+            missing_columns.append(column_spec)
+        elif expected_dtype is not None and df[column_spec].dtype != expected_dtype:
+            dtype_mismatches.append((column_spec, df[column_spec].dtype, expected_dtype))
+    elif _is_regex_pattern(column_spec):
+        pattern_str, _ = column_spec
+        matches = _match_column_with_regex(column_spec, list(df.columns))
+        if not matches:
+            missing_columns.append(pattern_str)
+        else:
+            matched_by_regex.update(matches)
+            if expected_dtype is not None:
+                for matched_col in matches:
+                    if df[matched_col].dtype != expected_dtype:
+                        dtype_mismatches.append((matched_col, df[matched_col].dtype, expected_dtype))
+
+    return missing_columns, dtype_mismatches, matched_by_regex
+
+
 def _match_column_with_regex(column_pattern: RegexColumnDef, df_columns: List[str]) -> List[str]:
     _, pattern = column_pattern
     return [col for col in df_columns if pattern.match(col)]
 
 
+def _compile_regex_pattern(pattern_string: str) -> RegexColumnDef:
+    pattern_str = pattern_string[2:-1]
+    compiled_pattern = re.compile(pattern_str)
+    return (pattern_string, compiled_pattern)
+
+
+def _is_regex_string(column: str) -> bool:
+    return column.startswith("r/") and column.endswith("/")
+
+
 def _compile_regex_patterns(columns: Seq[Any]) -> List[Union[str, RegexColumnDef]]:
-    """Compile regex patterns in the column list."""
-    result: List[Union[str, RegexColumnDef]] = []
-    for col in columns:
-        if isinstance(col, str) and col.startswith("r/") and col.endswith("/"):
-            # Pattern is in the format "r/pattern/"
-            pattern_str = col[2:-1]  # Remove "r/" prefix and "/" suffix
-            compiled_pattern = re.compile(pattern_str)
-            result.append((col, compiled_pattern))
-        else:
-            result.append(col)
-    return result
+    return [_compile_regex_pattern(col) if isinstance(col, str) and _is_regex_string(col) else col for col in columns]
 
 
 def _check_columns(
     df: DataFrameType, columns: Union[ColumnsList, ColumnsDict], strict: bool, param_name: Optional[str] = None
 ) -> None:
-    missing_columns = []
-    dtype_mismatches = []
-    matched_by_regex = set()
+    all_missing_columns = []
+    all_dtype_mismatches = []
+    all_matched_by_regex = set()
 
-    # Handle list of column names/patterns
     if isinstance(columns, list):
-        # First, compile any regex patterns
         processed_columns = _compile_regex_patterns(columns)
+        for column_spec in processed_columns:
+            missing, mismatches, matched = _validate_column(column_spec, df)
+            all_missing_columns.extend(missing)
+            all_dtype_mismatches.extend(mismatches)
+            all_matched_by_regex.update(matched)
+    else:  # isinstance(columns, dict)
+        assert isinstance(columns, dict)
+        for column, expected_dtype in columns.items():
+            column_spec = (
+                _compile_regex_pattern(column) if isinstance(column, str) and _is_regex_string(column) else column
+            )
+            missing, mismatches, matched = _validate_column(column_spec, df, expected_dtype)
+            all_missing_columns.extend(missing)
+            all_dtype_mismatches.extend(mismatches)
+            all_matched_by_regex.update(matched)
+
+    param_info = _make_param_info(param_name)
+
+    if all_missing_columns:
+        raise AssertionError(f"Missing columns: {all_missing_columns}{param_info}. Got {_describe_pd(df)}")
 
-        for column in processed_columns:
-            if isinstance(column, str):
-                # Direct column name match
-                if column not in df.columns:
-                    missing_columns.append(column)
-            elif _is_regex_pattern(column):
-                # Regex pattern match
-                matches = _match_column_with_regex(column, list(df.columns))
-                if not matches:
-                    missing_columns.append(column[0])  # Add the original pattern string
-                else:
-                    matched_by_regex.update(matches)
-
-    # Handle dictionary of column names/types
-    elif isinstance(columns, dict):
-        # First, process dictionary keys for regex patterns
-        processed_dict: Dict[Union[str, RegexColumnDef], Any] = {}
-        for column, dtype in columns.items():
-            if isinstance(column, str) and column.startswith("r/") and column.endswith("/"):
-                # Pattern is in the format "r/pattern/"
-                pattern_str = column[2:-1]  # Remove "r/" prefix and "/" suffix
-                compiled_pattern = re.compile(pattern_str)
-                processed_dict[(column, compiled_pattern)] = dtype
-            else:
-                processed_dict[column] = dtype
-
-        # Check each column against dictionary keys
-        regex_matched_columns = set()
-        for column_key, dtype in processed_dict.items():
-            if isinstance(column_key, str):
-                # Direct column name match
-                if column_key not in df.columns:
-                    missing_columns.append(column_key)
-                elif df[column_key].dtype != dtype:
-                    dtype_mismatches.append((column_key, df[column_key].dtype, dtype))
-            elif _is_regex_pattern(column_key):
-                # Regex pattern match
-                pattern_str, compiled_pattern = column_key
-                matches = _match_column_with_regex(column_key, list(df.columns))
-                if not matches:
-                    missing_columns.append(pattern_str)  # Add the original pattern string
-                else:
-                    for matched_col in matches:
-                        matched_by_regex.add(matched_col)
-                        regex_matched_columns.add(matched_col)
-                        if df[matched_col].dtype != dtype:
-                            dtype_mismatches.append((matched_col, df[matched_col].dtype, dtype))
-
-    if missing_columns:
-        param_info = f" in parameter '{param_name}'" if param_name else ""
-        raise AssertionError(f"Missing columns: {missing_columns}{param_info}. Got {_describe_pd(df)}")
-
-    if dtype_mismatches:
-        param_info = f" in parameter '{param_name}'" if param_name else ""
-        mismatches = ", ".join(
-            [
-                f"Column {col}{param_info} has wrong dtype. Was {was}, expected {expected}"
-                for col, was, expected in dtype_mismatches
-            ]
+    if all_dtype_mismatches:
+        mismatch_descriptions = ", ".join(
+            f"Column {col}{param_info} has wrong dtype. Was {was}, expected {expected}"
+            for col, was, expected in all_dtype_mismatches
         )
-        raise AssertionError(mismatches)
+        raise AssertionError(mismatch_descriptions)
 
     if strict:
-        if isinstance(columns, list):
-            # For regex matches, we need to consider all matched columns
-            explicit_columns = {col for col in columns if isinstance(col, str)}
-            allowed_columns = explicit_columns.union(matched_by_regex)
-            extra_columns = set(df.columns) - allowed_columns
-        else:
-            # For dict with regex patterns, we need to handle both direct and regex matches
-            explicit_columns = {col for col in columns if isinstance(col, str)}
-            allowed_columns = explicit_columns.union(matched_by_regex)
-            extra_columns = set(df.columns) - allowed_columns
-
+        explicit_columns = {col for col in columns if isinstance(col, str)}
+        allowed_columns = explicit_columns.union(all_matched_by_regex)
+        extra_columns = set(df.columns) - allowed_columns
         if extra_columns:
-            param_info = f" in parameter '{param_name}'" if param_name else ""
             raise AssertionError(f"DataFrame{param_info} contained unexpected column(s): {', '.join(extra_columns)}")
 
 
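For context, a minimal sketch of how the list and dict column specifications validated by `_check_columns` are typically written at the call site. This is illustrative only and not part of the commit: the `df_in`/`df_out` decorators appear later in this diff, while the import path, example function, column names, and dtypes are assumptions.

# Illustrative sketch (not part of the commit); the import path is an assumption.
import pandas as pd
from daffy import df_in, df_out

@df_in(columns=["id", "r/^score_/"])                  # plain name plus an "r/pattern/" regex spec
@df_out(columns={"id": "int64", "total": "float64"})  # dict spec also checks dtypes
def add_total(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    out["total"] = out.filter(regex="^score_").astype("float64").sum(axis=1)
    return out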
@@ -169,9 +160,7 @@ def wrapper_df_out(func: Callable[..., DF]) -> Callable[..., DF]:
         @wraps(func)
         def wrapper(*args: Any, **kwargs: Any) -> DF:
             result = func(*args, **kwargs)
-            assert isinstance(result, pd.DataFrame) or isinstance(result, pl.DataFrame), (
-                f"Wrong return type. Expected DataFrame, got {type(result)}"
-            )
+            _assert_is_dataframe(result, "return type")
             if columns:
                 _check_columns(result, columns, get_strict(strict))
             return result
@@ -183,38 +172,27 @@ def wrapper(*args: Any, **kwargs: Any) -> DF:
 
 def _get_parameter(func: Callable[..., Any], name: Optional[str] = None, *args: Any, **kwargs: Any) -> Any:
     if not name:
-        if len(args) > 0:
-            return args[0]
-        if kwargs:
-            return next(iter(kwargs.values()))
-        return None
+        return args[0] if args else next(iter(kwargs.values()), None)
 
-    if name and (name not in kwargs):
-        func_params_in_order = list(inspect.signature(func).parameters.keys())
-        parameter_location = func_params_in_order.index(name)
-        return args[parameter_location]
+    if name in kwargs:
+        return kwargs[name]
 
-    return kwargs[name]
+    func_params_in_order = list(inspect.signature(func).parameters.keys())
+    parameter_location = func_params_in_order.index(name)
+    return args[parameter_location]
 
 
 def _get_parameter_name(
     func: Callable[..., Any], name: Optional[str] = None, *args: Any, **kwargs: Any
 ) -> Optional[str]:
-    """Get the actual parameter name being validated."""
     if name:
         return name
 
-    # If no name specified, try to get the first parameter name
-    if len(args) > 0:
-        # Get the first parameter name from the function signature
+    if args:
         func_params_in_order = list(inspect.signature(func).parameters.keys())
-        if func_params_in_order:
-            return func_params_in_order[0]
-    elif kwargs:
-        # Return the first keyword argument name
-        return next(iter(kwargs.keys()))
+        return func_params_in_order[0]
 
-    return None
+    return next(iter(kwargs.keys()), None)
 
 
 def df_in(
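A brief, comment-only sketch of the argument-resolution order that the refactored `_get_parameter` above implements; the example function and call sites are invented for illustration.

# Illustrative only: how _get_parameter resolves the DataFrame argument.
def transform(data, factor=2): ...

# name=None  -> first positional argument, else first keyword value, else None:
#   _get_parameter(transform, None, df)                   # -> df
# name given and passed as a keyword argument:
#   _get_parameter(transform, "data", factor=3, data=df)  # -> df
# name given but passed positionally -> position found via inspect.signature:
#   _get_parameter(transform, "data", df, 3)              # -> df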
@@ -243,9 +221,7 @@ def wrapper_df_in(func: Callable[..., R]) -> Callable[..., R]:
         def wrapper(*args: Any, **kwargs: Any) -> R:
             df = _get_parameter(func, name, *args, **kwargs)
             param_name = _get_parameter_name(func, name, *args, **kwargs)
-            assert isinstance(df, pd.DataFrame) or isinstance(df, pl.DataFrame), (
-                f"Wrong parameter type. Expected DataFrame, got {type(df).__name__} instead."
-            )
+            _assert_is_dataframe(df, "parameter type")
             if columns:
                 _check_columns(df, columns, get_strict(strict), param_name)
             return func(*args, **kwargs)
@@ -261,25 +237,19 @@ def _describe_pd(df: DataFrameType, include_dtypes: bool = False) -> str:
         if isinstance(df, pd.DataFrame):
             readable_dtypes = [dtype.name for dtype in df.dtypes]
             result += f" with dtypes {readable_dtypes}"
-        if isinstance(df, pl.DataFrame):
+        else:
             result += f" with dtypes {df.dtypes}"
     return result
 
 
 def _log_input(level: int, func_name: str, df: Any, include_dtypes: bool) -> None:
-    if isinstance(df, pd.DataFrame) or isinstance(df, pl.DataFrame):
-        logging.log(
-            level,
-            f"Function {func_name} parameters contained a DataFrame: {_describe_pd(df, include_dtypes)}",
-        )
+    if isinstance(df, (pd.DataFrame, pl.DataFrame)):
+        logging.log(level, f"Function {func_name} parameters contained a DataFrame: {_describe_pd(df, include_dtypes)}")
 
 
 def _log_output(level: int, func_name: str, df: Any, include_dtypes: bool) -> None:
-    if isinstance(df, pd.DataFrame) or isinstance(df, pl.DataFrame):
-        logging.log(
-            level,
-            f"Function {func_name} returned a DataFrame: {_describe_pd(df, include_dtypes)}",
-        )
+    if isinstance(df, (pd.DataFrame, pl.DataFrame)):
+        logging.log(level, f"Function {func_name} returned a DataFrame: {_describe_pd(df, include_dtypes)}")
 
 
 def df_log(level: int = logging.DEBUG, include_dtypes: bool = False) -> Callable[[Callable[..., T]], Callable[..., T]]:
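Finally, a hedged usage sketch for the `df_log` decorator whose signature closes this diff. The import path and example function are assumptions; the logged message templates come from `_log_input`/`_log_output` above.

# Illustrative sketch (not part of the commit); the import path is an assumption.
import logging
import pandas as pd
from daffy import df_log

logging.basicConfig(level=logging.INFO)

@df_log(level=logging.INFO, include_dtypes=True)
def drop_na_rows(df: pd.DataFrame) -> pd.DataFrame:
    return df.dropna()

# Logs "Function drop_na_rows parameters contained a DataFrame: ..." on the way in
# and "Function drop_na_rows returned a DataFrame: ..." on the way out.
drop_na_rows(pd.DataFrame({"id": [1.0, None, 3.0]}))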