Skip to content
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions docs/dqx/docs/reference/quality_checks.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ You can also define your own custom checks (see [Creating custom checks](#creati
| `is_not_null` | Checks whether the values in the input column are not null. | `column`: column to check (can be a string column name or a column expression) |
| `is_not_empty` | Checks whether the values in the input column are not empty (but may be null). | `column`: column to check (can be a string column name or a column expression) |
| `is_not_null_and_not_empty` | Checks whether the values in the input column are not null and not empty. | `column`: column to check (can be a string column name or a column expression); `trim_strings`: optional boolean flag to trim spaces from strings |
| `is_in_list` | Checks whether the values in the input column are present in the list of allowed values (null values are allowed). This check is not suited for large lists of allowed values. In such cases, it’s recommended to use the `foreign_key` dataset-level check instead. | `column`: column to check (can be a string column name or a column expression); `allowed`: list of allowed values |
| `is_not_null_and_is_in_list` | Checks whether the values in the input column are not null and present in the list of allowed values. This check is not suited for large lists of allowed values. In such cases, it’s recommended to use the `foreign_key` dataset-level check instead. | `column`: column to check (can be a string column name or a column expression); `allowed`: list of allowed values |
| `is_in_list` | Checks whether the values in the input column are present in the list of allowed values (null values are allowed). This check is not suited for large lists of allowed values. In such cases, it’s recommended to use the `foreign_key` dataset-level check instead. | `column`: column to check (can be a string column name or a column expression); `allowed`: list of allowed values; `case_sensitive`: optional boolean flag for case-sensitive comparison (default: True) |
| `is_not_null_and_is_in_list` | Checks whether the values in the input column are not null and present in the list of allowed values. This check is not suited for large lists of allowed values. In such cases, it’s recommended to use the `foreign_key` dataset-level check instead. | `column`: column to check (can be a string column name or a column expression); `allowed`: list of allowed values; `case_sensitive`: optional boolean flag for case-sensitive comparison (default: True) |
| `is_not_null_and_not_empty_array` | Checks whether the values in the array input column are not null and not empty. | `column`: column to check (can be a string column name or a column expression) |
| `is_in_range` | Checks whether the values in the input column are in the provided range (inclusive of both boundaries). | `column`: column to check (can be a string column name or a column expression); `min_limit`: min limit as number, date, timestamp, column name or sql expression; `max_limit`: max limit as number, date, timestamp, column name or sql expression |
| `is_not_in_range` | Checks whether the values in the input column are outside the provided range (inclusive of both boundaries). | `column`: column to check (can be a string column name or a column expression); `min_limit`: min limit as number, date, timestamp, column name or sql expression; `max_limit`: max limit as number, date, timestamp, column name or sql expression |
Expand Down
33 changes: 22 additions & 11 deletions src/databricks/labs/dqx/check_funcs.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,12 +129,14 @@ def is_not_null(column: str | Column) -> Column:


@register_rule("row")
def is_not_null_and_is_in_list(column: str | Column, allowed: list) -> Column:
def is_not_null_and_is_in_list(column: str | Column, allowed: list, case_sensitive: bool = True) -> Column:
"""Checks whether the values in the input column are not null and present in the list of allowed values.
Can optionally perform a case-insensitive comparison.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe note the limitations for MapType and StructType here in the docstring.


Args:
column: column to check; can be a string column name or a column expression
allowed: list of allowed values (actual values or Column objects)
case_sensitive: whether to perform a case-sensitive comparison (default: True)

Returns:
Column object for condition
Expand All @@ -145,38 +147,43 @@ def is_not_null_and_is_in_list(column: str | Column, allowed: list) -> Column:
"""
if allowed is None:
raise MissingParameterError("allowed list is not provided.")

if not isinstance(allowed, list):
raise InvalidParameterError(f"allowed parameter must be a list, got {str(type(allowed))} instead.")

if not allowed:
raise InvalidParameterError("allowed list must not be empty.")

allowed_cols = [item if isinstance(item, Column) else F.lit(item) for item in allowed]
col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column)
condition = col_expr.isNull() | ~col_expr.isin(*allowed_cols)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we simplify this, please?
I think we can keep most of the code intact and just apply `F.lower` when a case-insensitive comparison is requested.

for example:

col_expr_compare = F.lower(col_expr) if not case_sensitive else col_expr
allowed_cols_compare = [F.lower(c) for c in allowed_cols] if not case_sensitive else allowed_cols

condition = col_expr.isNull() | (~col_expr_compare.isin(*allowed_cols_compare))


# Apply case-insensitive transformation if needed
col_expr_compare = F.lower(col_expr) if not case_sensitive else col_expr
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need to define and handle what case insensitivity means for complex objects (e.g. arrays, maps, structs). We probably want case insensitivity for all keys and values in MapType and StructType columns and case insensitivity for all items in ArrayType columns.

`F.lower(col)` will throw an error for complex data types, so we might want to create a private helper function that handles casing based on the column type.

allowed_cols_compare = [F.lower(c) if not case_sensitive else c for c in allowed_cols]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should move the conditional expression outside of the list comprehension so that `case_sensitive` is not re-evaluated for every element:

allowed_cols_compare = [F.lower(c) for c in allowed_cols] if not case_sensitive else allowed_cols


condition = col_expr.isNull() | ~col_expr_compare.isin(*allowed_cols_compare)

return make_condition(
condition,
F.concat_ws(
"",
F.lit("Value '"),
F.when(col_expr.isNull(), F.lit("null")).otherwise(col_expr.cast("string")),
F.lit(f"' in Column '{col_expr_str}' is null or not in the allowed list: ["),
F.concat_ws(", ", *allowed_cols),
F.concat_ws(", ", *allowed_cols), # Use original allowed_cols for display
F.lit("]"),
),
f"{col_str_norm}_is_null_or_is_not_in_the_list",
)


@register_rule("row")
def is_in_list(column: str | Column, allowed: list) -> Column:
def is_in_list(column: str | Column, allowed: list, case_sensitive: bool = True) -> Column:
"""Checks whether the values in the input column are present in the list of allowed values
(null values are allowed).
(null values are allowed). Can optionally perform a case-insensitive comparison.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe note the limitations for MapType and StructType here in the docstring.


Args:
column: column to check; can be a string column name or a column expression
allowed: list of allowed values (actual values or Column objects)
case_sensitive: whether to perform a case-sensitive comparison (default: True)

Returns:
Column object for condition
Expand All @@ -187,24 +194,28 @@ def is_in_list(column: str | Column, allowed: list) -> Column:
"""
if allowed is None:
raise MissingParameterError("allowed list is not provided.")

if not isinstance(allowed, list):
raise InvalidParameterError(f"allowed parameter must be a list, got {str(type(allowed))} instead.")

if not allowed:
raise InvalidParameterError("allowed list must not be empty.")

allowed_cols = [item if isinstance(item, Column) else F.lit(item) for item in allowed]
col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column)
condition = ~col_expr.isin(*allowed_cols)

# Apply case-insensitive transformation if needed
col_expr_compare = F.lower(col_expr) if not case_sensitive else col_expr
allowed_cols_compare = [F.lower(c) if not case_sensitive else c for c in allowed_cols]

condition = ~col_expr_compare.isin(*allowed_cols_compare)

return make_condition(
condition,
F.concat_ws(
"",
F.lit("Value '"),
F.when(col_expr.isNull(), F.lit("null")).otherwise(col_expr.cast("string")),
F.lit(f"' in Column '{col_expr_str}' is not in the allowed list: ["),
F.concat_ws(", ", *allowed_cols),
F.concat_ws(", ", *allowed_cols), # Use original allowed_cols for display
F.lit("]"),
),
f"{col_str_norm}_is_not_in_the_list",
Expand Down
Loading
Loading