4 changes: 2 additions & 2 deletions docs/dqx/docs/reference/quality_checks.mdx
@@ -28,8 +28,8 @@ You can also define your own custom checks (see [Creating custom checks](#creati
| `is_not_null` | Checks whether the values in the input column are not null. | `column`: column to check (can be a string column name or a column expression) |
| `is_not_empty` | Checks whether the values in the input column are not empty (but may be null). | `column`: column to check (can be a string column name or a column expression) |
| `is_not_null_and_not_empty` | Checks whether the values in the input column are not null and not empty. | `column`: column to check (can be a string column name or a column expression); `trim_strings`: optional boolean flag to trim spaces from strings |
| `is_in_list` | Checks whether the values in the input column are present in the list of allowed values (null values are allowed). This check is not suited for large lists of allowed values. In such cases, it’s recommended to use the `foreign_key` dataset-level check instead. | `column`: column to check (can be a string column name or a column expression); `allowed`: list of allowed values |
| `is_not_null_and_is_in_list` | Checks whether the values in the input column are not null and present in the list of allowed values. This check is not suited for large lists of allowed values. In such cases, it’s recommended to use the `foreign_key` dataset-level check instead. | `column`: column to check (can be a string column name or a column expression); `allowed`: list of allowed values |
| `is_in_list` | Checks whether the values in the input column are present in the list of allowed values (null values are allowed). This check is not suited for large lists of allowed values. In such cases, it’s recommended to use the `foreign_key` dataset-level check instead. It is also not suited for `MapType` or `StructType` columns. | `column`: column to check (can be a string column name or a column expression); `allowed`: list of allowed values; `case_sensitive`: optional boolean flag for case-sensitive comparison (default: True) |
| `is_not_null_and_is_in_list` | Checks whether the values in the input column are not null and present in the list of allowed values. This check is not suited for large lists of allowed values. In such cases, it’s recommended to use the `foreign_key` dataset-level check instead. It is also not suited for `MapType` or `StructType` columns. | `column`: column to check (can be a string column name or a column expression); `allowed`: list of allowed values; `case_sensitive`: optional boolean flag for case-sensitive comparison (default: True) |
| `is_not_null_and_not_empty_array` | Checks whether the values in the array input column are not null and not empty. | `column`: column to check (can be a string column name or a column expression) |
| `is_in_range` | Checks whether the values in the input column are in the provided range (inclusive of both boundaries). | `column`: column to check (can be a string column name or a column expression); `min_limit`: min limit as number, date, timestamp, column name or sql expression; `max_limit`: max limit as number, date, timestamp, column name or sql expression |
| `is_not_in_range` | Checks whether the values in the input column are outside the provided range (inclusive of both boundaries). | `column`: column to check (can be a string column name or a column expression); `min_limit`: min limit as number, date, timestamp, column name or sql expression; `max_limit`: max limit as number, date, timestamp, column name or sql expression |
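To illustrate the documented flag end to end, here is a minimal usage sketch; the SparkSession (`spark`), DataFrame, column name, and allowed values are invented for this example, and only the check function and its `case_sensitive` argument come from this change:

```python
# Hypothetical usage of the new case_sensitive flag; assumes an active
# SparkSession named `spark`. Data and column names are illustrative only.
from databricks.labs.dqx import check_funcs

df = spark.createDataFrame([("us",), ("MX",), (None,)], ["country_code"])

# With case_sensitive=False, "us" matches "US"; "MX" is flagged; null passes.
condition = check_funcs.is_in_list("country_code", ["US", "CA"], case_sensitive=False)
df.select("*", condition).show(truncate=False)
```

The returned condition column is null for passing rows and carries the failure message otherwise, as built by `make_condition` in `check_funcs.py`.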
46 changes: 35 additions & 11 deletions src/databricks/labs/dqx/check_funcs.py
@@ -18,6 +18,7 @@
is_sql_query_safe,
normalize_col_str,
get_columns_as_strings,
to_lowercase,
)
from databricks.labs.dqx.errors import MissingParameterError, InvalidParameterError, UnsafeSqlQueryError

@@ -129,12 +130,14 @@ def is_not_null(column: str | Column) -> Column:


@register_rule("row")
def is_not_null_and_is_in_list(column: str | Column, allowed: list) -> Column:
def is_not_null_and_is_in_list(column: str | Column, allowed: list, case_sensitive: bool = True) -> Column:
"""Checks whether the values in the input column are not null and present in the list of allowed values.
Can optionally perform a case-insensitive comparison.
[Reviewer comment] Maybe note the limitations for MapType and StructType here in the docstring.

Args:
column: column to check; can be a string column name or a column expression
allowed: list of allowed values (actual values or Column objects)
case_sensitive: whether to perform a case-sensitive comparison (default: True)

Returns:
Column object for condition
@@ -145,38 +148,49 @@ def is_not_null_and_is_in_list(column: str | Column, allowed: list) -> Column:
"""
if allowed is None:
raise MissingParameterError("allowed list is not provided.")

if not isinstance(allowed, list):
raise InvalidParameterError(f"allowed parameter must be a list, got {str(type(allowed))} instead.")

if not allowed:
raise InvalidParameterError("allowed list must not be empty.")

allowed_cols = [item if isinstance(item, Column) else F.lit(item) for item in allowed]
col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column)
condition = col_expr.isNull() | ~col_expr.isin(*allowed_cols)
[Reviewer comment] Can we simplify this, please? I think we can keep most of the code intact and just apply lower where needed, for example:

    col_expr_compare = F.lower(col_expr) if not case_sensitive else col_expr
    allowed_cols_compare = [F.lower(c) for c in allowed_cols_display] if not case_sensitive else allowed_cols_display

    condition = col_expr.isNull() | (~col_expr_compare.isin(*allowed_cols_compare))

# Apply case-insensitive transformation if needed
if not case_sensitive:
has_arrays = any(isinstance(item, (list, tuple)) for item in allowed if not isinstance(item, Column))
col_expr_compare = to_lowercase(col_expr, is_array=has_arrays)
allowed_cols_compare = [
to_lowercase(c, is_array=isinstance(allowed[i], (list, tuple))) for i, c in enumerate(allowed_cols)
]
else:
col_expr_compare, allowed_cols_compare = col_expr, allowed_cols

condition = col_expr.isNull() | ~col_expr_compare.isin(*allowed_cols_compare)

return make_condition(
condition,
F.concat_ws(
"",
F.lit("Value '"),
F.when(col_expr.isNull(), F.lit("null")).otherwise(col_expr.cast("string")),
F.lit(f"' in Column '{col_expr_str}' is null or not in the allowed list: ["),
F.concat_ws(", ", *allowed_cols),
F.concat_ws(", ", *[c.cast("string") for c in allowed_cols]),
F.lit("]"),
),
f"{col_str_norm}_is_null_or_is_not_in_the_list",
)


@register_rule("row")
def is_in_list(column: str | Column, allowed: list) -> Column:
def is_in_list(column: str | Column, allowed: list, case_sensitive: bool = True) -> Column:
"""Checks whether the values in the input column are present in the list of allowed values
(null values are allowed).
(null values are allowed). Can optionally perform a case-insensitive comparison.
[Reviewer comment] Maybe note the limitations for MapType and StructType here in the docstring.

Args:
column: column to check; can be a string column name or a column expression
allowed: list of allowed values (actual values or Column objects)
case_sensitive: whether to perform a case-sensitive comparison (default: True)

Returns:
Column object for condition
@@ -187,24 +201,34 @@ def is_in_list(column: str | Column, allowed: list) -> Column:
"""
if allowed is None:
raise MissingParameterError("allowed list is not provided.")

if not isinstance(allowed, list):
raise InvalidParameterError(f"allowed parameter must be a list, got {str(type(allowed))} instead.")

if not allowed:
raise InvalidParameterError("allowed list must not be empty.")

allowed_cols = [item if isinstance(item, Column) else F.lit(item) for item in allowed]
col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column)
condition = ~col_expr.isin(*allowed_cols)

# Apply case-insensitive transformation if needed
if not case_sensitive:
has_arrays = any(isinstance(item, (list, tuple)) for item in allowed if not isinstance(item, Column))
col_expr_compare = to_lowercase(col_expr, is_array=has_arrays)
allowed_cols_compare = [
to_lowercase(c, is_array=isinstance(allowed[i], (list, tuple))) for i, c in enumerate(allowed_cols)
]
else:
col_expr_compare, allowed_cols_compare = col_expr, allowed_cols

condition = ~col_expr_compare.isin(*allowed_cols_compare)

return make_condition(
condition,
F.concat_ws(
"",
F.lit("Value '"),
F.when(col_expr.isNull(), F.lit("null")).otherwise(col_expr.cast("string")),
F.lit(f"' in Column '{col_expr_str}' is not in the allowed list: ["),
F.concat_ws(", ", *allowed_cols),
F.concat_ws(", ", *[c.cast("string") for c in allowed_cols]),
F.lit("]"),
),
f"{col_str_norm}_is_not_in_the_list",
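For reviewers, a standalone sketch of the comparison the new code builds when `case_sensitive=False` (plain PySpark for the scalar case only; the column name and allowed values are made up, and this is not DQX code itself):

```python
# Equivalent plain-PySpark expressions for the scalar, case-insensitive path:
# both the column and the allowed literals are lowercased before isin().
import pyspark.sql.functions as F

allowed = ["Active", "Inactive"]   # illustrative allowed values
col_expr = F.col("status")         # illustrative column

col_expr_compare = F.lower(col_expr)
allowed_compare = [F.lower(F.lit(v)) for v in allowed]

# is_in_list: nulls are allowed, so only non-null, non-matching values flag.
condition_is_in_list = ~col_expr_compare.isin(*allowed_compare)

# is_not_null_and_is_in_list: nulls flag as well.
condition_not_null_and_in_list = col_expr.isNull() | ~col_expr_compare.isin(*allowed_compare)
```

Array-typed allowed values instead go through `to_lowercase(..., is_array=True)`, which lowercases each element with `F.transform` (see `utils.py` below).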
18 changes: 17 additions & 1 deletion src/databricks/labs/dqx/utils.py
@@ -18,7 +18,7 @@
from databricks.labs.blueprint.limiter import rate_limited
from databricks.labs.dqx.errors import InvalidParameterError
from databricks.sdk.errors import NotFound

import pyspark.sql.functions as F

logger = logging.getLogger(__name__)

@@ -433,3 +433,19 @@ def _match_table_patterns(table: str, patterns: list[str]) -> bool:
bool: True if the table name matches any of the patterns, False otherwise.
"""
return any(fnmatch(table, pattern) for pattern in patterns)


def to_lowercase(col_expr: Column, is_array: bool = False) -> Column:
"""Converts a column expression to lowercase, handling both scalar and array types.

Args:
col_expr: Column expression to convert
is_array: Whether the column contains array values

Returns:
Column expression with lowercase transformation applied
"""
if is_array:
return F.transform(col_expr, lambda x: F.lower(x))
else:
return F.lower(col_expr)
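
A small, hypothetical demonstration of the helper on a scalar versus an array column; it assumes an active SparkSession named `spark`, and the data is illustrative:

```python
# Illustrative only: to_lowercase on a scalar column vs. an array column.
import pyspark.sql.functions as F
from databricks.labs.dqx.utils import to_lowercase

df = spark.createDataFrame([("US", ["Aa", "bB"])], ["code", "tags"])

df.select(
    to_lowercase(F.col("code")).alias("code_lower"),                 # -> "us"
    to_lowercase(F.col("tags"), is_array=True).alias("tags_lower"),  # -> ["aa", "bb"]
).show(truncate=False)
```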