Load and save checks from a Delta table #339

Merged · 24 commits · May 27, 2025

34 changes: 31 additions & 3 deletions demos/dqx_demo_library.py
@@ -62,12 +62,15 @@
user_name = spark.sql("select current_user() as user").collect()[0]["user"]
checks_file = f"/Workspace/Users/{user_name}/dqx_demo_checks.yml"
dq_engine = DQEngine(ws)
dq_engine.save_checks_in_workspace_file(checks, workspace_path=checks_file)
dq_engine.save_checks_in_workspace_file(checks=checks, workspace_path=checks_file)

# save generated checks in a Delta table
dq_engine.save_checks_in_table(checks=checks, table_name="main.default.dqx_checks_table", mode="overwrite")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Loading and applying quality checks
# MAGIC ## Loading and applying quality checks from a file

# COMMAND ----------

@@ -76,7 +79,7 @@

input_df = spark.createDataFrame([[1, 3, 3, 2], [3, 3, None, 1]], schema)

# load checks
# load checks from a file
dq_engine = DQEngine(WorkspaceClient())
checks = dq_engine.load_checks_from_workspace_file(workspace_path=checks_file)

@@ -91,6 +94,31 @@

# COMMAND ----------

# MAGIC %md
# MAGIC ## Loading and applying quality checks from a Delta table

# COMMAND ----------

from databricks.labs.dqx.engine import DQEngine
from databricks.sdk import WorkspaceClient

input_df = spark.createDataFrame([[1, 3, 3, 2], [3, 3, None, 1]], schema)

# load checks from a Delta table
dq_engine = DQEngine(WorkspaceClient())
checks = dq_engine.load_checks_from_table(table_name="main.default.dqx_checks_table")

# Option 1: apply quality rules and quarantine invalid records
valid_df, quarantined_df = dq_engine.apply_checks_by_metadata_and_split(input_df, checks)
display(valid_df)
display(quarantined_df)

# Option 2: apply quality rules and flag invalid records as additional columns (`_warning` and `_error`)
valid_and_quarantined_df = dq_engine.apply_checks_by_metadata(input_df, checks)
display(valid_and_quarantined_df)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Validating syntax of quality checks defined in yaml

62 changes: 57 additions & 5 deletions docs/dqx/docs/guide.mdx
@@ -160,17 +160,41 @@ Fields:
- `criticality`: either "error" (data going only into "bad/quarantine" dataframe) or "warn" (data going into both "good" and "bad" dataframes). If not provided, the default is "error".
- `check`: column expression containing "function" (check function to apply), "arguments" (check function arguments), and "col_name" (column name as `str` or sql expression the check will be applied for) or "col_names" (column names as `array` the check will be applied for).
- (optional) `name` for the check: autogenerated if not provided.
- (optional) `filter` to filter the rows for which the check is applied (e.g. `"business_unit = 'Finance'"`)
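
For illustration, a minimal check using these fields might look as follows (a rough sketch; `city` and `country` are placeholder column names mirroring the table example below):

```python
import yaml

# One check defined as metadata: warn when `city` is null, but only for Polish records
checks = yaml.safe_load("""
- criticality: warn
  filter: "country = 'Poland'"
  check:
    function: is_not_null
    arguments:
      col_name: city
""")
```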

### Quality rules configured in a Delta table

Quality rules can also be stored in a Delta table in Unity Catalog. Each row represents a check with column values for the `name`, `check`, `criticality`, `filter`, and `run_config_name`.

```python
# The checks table will contain the following columns:
# +------------------+-----------------------+---------------------------------------+----------------------+--------------------+
# | name | criticality | check | filter | run_config_name |
# +------------------+-----------------------+---------------------------------------+----------------------+--------------------+
# | "city_is_null" | "warn" | {function: 'is_not_null', | "country = 'Poland'" | "default" |
# | | | arguments: {'col_name': 'city'}} | | |
# | ... | ... | ... | ... | ... |
# +------------------+-----------------------+---------------------------------------+----------------------+--------------------+
```
Fields:
- `criticality`: either "error" (data going only into "bad/quarantine" dataframe) or "warn" (data going into both "good" and "bad" dataframes). If not provided, the default is "error".
- `check`: a `StructType` value with the following fields:
- `function`: Name of the DQX check function to apply
- `arguments`: A `MapType` value with the function's keyword arguments as key-value pairs
- (optional) `name`: Name to use for the check
- (optional) `filter`: Spark expression to filter the rows for which the check is applied (e.g. `"business_unit = 'Finance'"`)
- `run_config_name`: A run or workflow name. Can be used to load and apply a subset of checks to specific workflows. Default value is `"default"`.
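
The table does not need to be populated by hand. As a rough sketch, checks defined as metadata can be written to it with `save_checks_in_table` (the table name reuses the one from the demo notebook; the check and column names are illustrative):

```python
from databricks.labs.dqx.engine import DQEngine
from databricks.sdk import WorkspaceClient

dq_engine = DQEngine(WorkspaceClient())

# Checks expressed as metadata; `name` and `filter` are optional
checks = [
    {
        "name": "city_is_null",
        "criticality": "warn",
        "filter": "country = 'Poland'",
        "check": {"function": "is_not_null", "arguments": {"col_name": "city"}},
    },
]

# Writes one row per check; rows are tagged with run_config_name="default" unless overridden
dq_engine.save_checks_in_table(
    checks=checks,
    table_name="main.default.dqx_checks_table",
    run_config_name="default",
    mode="overwrite",
)
```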

### Loading and execution methods

Checks can be loaded from a file in the installation folder, workspace, or local file system. The engine will raise an error if the checks file contains invalid JSON or YAML definition.
Checks can be loaded from a Delta table in Unity Catalog or from a file in the installation folder, workspace, or local file system. The engine will raise an error if the stored checks (either from a file or table) contain an invalid definition.

Checks loaded from a file can be applied using one of the following methods:
Checks loaded from a file or table can be applied using one of the following methods:
* `apply_checks_by_metadata_and_split`: splits the input data into valid and invalid (quarantined) dataframes.
* `apply_checks_by_metadata`: reports issues as additional columns.

Syntax of the loaded checks is validated automatically as part of these methods.
In addition, you can also perform a standalone syntax validation of the checks as described [here](#validating-syntax-of-quality-checks-defined-in-yamljson).
In addition, you can also perform a standalone syntax validation of the checks as described [here](#validating-syntax-of-quality-checks-defined-in-configuration).

#### Method 1: Loading checks from a workspace file in the installation folder

@@ -239,6 +263,34 @@ In addition, you can also perform a standalone syntax validation of the checks a
</TabItem>
</Tabs>

#### Method 4: Loading checks from a Delta table

<Tabs>
<TabItem value="Python" label="Python" default>
```python
from databricks.labs.dqx.engine import DQEngine
from databricks.sdk import WorkspaceClient

dq_engine = DQEngine(WorkspaceClient())

# Load all checks with the "default" `run_config_name`:
default_checks = dq_engine.load_checks_from_table("dq.config.checks_table")

# Load checks with the "workflow_001" `run_config_name`:
workflow_checks = dq_engine.load_checks_from_table("dq.config.checks_table", "workflow_001")

checks = default_checks + workflow_checks
input_df = spark.read.table("catalog1.schema1.table1")

# Option 1: apply quality rules on the dataframe and provide valid and invalid (quarantined) dataframes
valid_df, quarantined_df = dq_engine.apply_checks_by_metadata_and_split(input_df, checks)

# Option 2: apply quality rules on the dataframe and report issues as additional columns (`_warning` and `_error`)
valid_and_quarantined_df = dq_engine.apply_checks_by_metadata(input_df, checks)
```
</TabItem>
</Tabs>

### Quality rules defined in code

#### Method 1: Using DQX classes
@@ -427,9 +479,9 @@ The DQX integration with DLT does not use DLT Expectations but DQX's own methods
</TabItem>
</Tabs>

## Validating syntax of quality checks defined in yaml/json
## Validating syntax of quality checks defined in configuration

You can validate the syntax of checks defined as metadata in `yaml` or `json` format before applying them. This validation ensures that the checks are correctly defined and can be interpreted by the DQX engine.
You can validate the syntax of checks defined as metadata in a Delta table or file (either `yaml` or `json`) before applying them. This validation ensures that the checks are correctly defined and can be interpreted by the DQX engine.
The validation cannot be used for checks defined using [DQX classes](#method-1-using-dqx-classes). When checks are defined with DQX classes, syntax validation is unnecessary because the application will fail to interpret them if the DQX objects are constructed incorrectly.
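
As a rough sketch, checks loaded from a Delta table can be validated the same way as file-based checks, assuming `validate_checks` returns a status object exposing `has_errors` and `errors` as in the examples below:

```python
from databricks.labs.dqx.engine import DQEngine
from databricks.sdk import WorkspaceClient

dq_engine = DQEngine(WorkspaceClient())

# Load checks stored in a Delta table (table name reused from the loading example above)
checks = dq_engine.load_checks_from_table("dq.config.checks_table")

# Standalone syntax validation before the checks are applied
status = dq_engine.validate_checks(checks)
if status.has_errors:
    raise ValueError(f"Invalid checks: {status.errors}")
```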

<Tabs>
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -413,7 +413,7 @@ max-args = 10
max-positional-arguments=10

# Maximum number of attributes for a class (see R0902).
max-attributes = 15
max-attributes = 16

# Maximum number of boolean expressions in an if statement (see R0916).
max-bool-expr = 5
1 change: 1 addition & 0 deletions src/databricks/labs/dqx/config.py
@@ -17,6 +17,7 @@ class RunConfig:
output_table: str | None = None # output data table
quarantine_table: str | None = None # quarantined data table
checks_file: str | None = "checks.yml" # file containing quality rules / checks
checks_table: str | None = None # table containing quality rules / checks
profile_summary_stats_file: str | None = "profile_summary_stats.yml" # file containing profile summary statistics
override_clusters: dict[str, str] | None = None # cluster configuration for jobs
spark_conf: dict[str, str] | None = None # extra spark configs
113 changes: 112 additions & 1 deletion src/databricks/labs/dqx/engine.py
@@ -3,12 +3,13 @@
import functools as ft
import inspect
import itertools
import warnings
from pathlib import Path
from collections.abc import Callable
from typing import Any
import yaml
import pyspark.sql.functions as F
from pyspark.sql import DataFrame
from pyspark.sql import DataFrame, SparkSession

from databricks.labs.blueprint.installation import Installation
from databricks.labs.dqx import row_checks
@@ -30,6 +31,7 @@
from databricks.sdk import WorkspaceClient

logger = logging.getLogger(__name__)
COLLECT_LIMIT_WARNING = 500


class DQEngineCore(DQEngineCoreBase):
Expand Down Expand Up @@ -142,6 +144,72 @@ def save_checks_in_local_file(checks: list[dict], filepath: str):
msg = f"Checks file {filepath} missing"
raise FileNotFoundError(msg) from None

@staticmethod
def build_quality_rules_from_dataframe(df: DataFrame, run_config_name: str = "default") -> list[dict]:
"""Build checks from a Spark DataFrame based on check specifications, i.e. function name plus arguments.

:param df: Spark DataFrame with data quality check rules. Each row should define a check. Rows should
have the following columns:
* `name` - Name that will be given to a resulting column. Autogenerated if not provided
* `criticality` (optional) - Possible values are `error` (data going only into "bad" dataframe) and `warn` (data is going into both dataframes)
* `check` - A `StructType` column defining the DQX check function and its arguments
* `filter` - Expression for filtering data quality checks
* `run_config_name` (optional) - Run configuration name for storing checks across runs
:param run_config_name: Run configuration name for filtering quality rules
:return: List of data quality check specifications as a Python dictionary
"""
check_rows = df.where(f"run_config_name = '{run_config_name}'").collect()
if len(check_rows) > COLLECT_LIMIT_WARNING:
warnings.warn(
f"Collecting large number of rows from Spark DataFrame: {len(check_rows)}",
category=UserWarning,
stacklevel=2,
)
checks = []
for row in check_rows:
check = {"name": row.name, "criticality": row.criticality, "check": row.check.asDict()}
if row.filter is not None:
check["filter"] = row.filter
checks.append(check)
return checks

@staticmethod
def build_dataframe_from_quality_rules(
checks: list[dict], run_config_name: str = "default", spark: SparkSession | None = None
) -> DataFrame:
"""Build a Spark DataFrame from a set of check specifications, i.e. function name plus arguments.

:param checks: list of check specifications as Python dictionaries. Each check consists of the following fields:
* `check` - Column expression to evaluate. The expression should return a string value (used as the error/warning message) if it evaluates to true, or `null` if it evaluates to false
* `name` - Name that will be given to a resulting column. Autogenerated if not provided
* `criticality` (optional) - Possible values are `error` (data going only into "bad" dataframe) and `warn` (data is going into both dataframes)
* `filter` (optional) - Expression for filtering data quality checks
:param run_config_name: Run configuration name for storing quality checks across runs
:param spark: SparkSession to use for DataFrame operations
:return: Spark DataFrame with data quality check rules
"""
if spark is None:
spark = SparkSession.builder.getOrCreate()
schema = "name STRING, criticality STRING, check STRUCT<function STRING, arguments MAP<STRING, STRING>>, filter STRING, run_config_name STRING"
dq_rule_checks = DQEngineCore.build_checks_by_metadata(checks)
dq_rule_rows = []
for dq_rule_check in dq_rule_checks:
arguments = dq_rule_check.check_func_kwargs
if isinstance(dq_rule_check, DQColSetRule):
arguments["col_names"] = dq_rule_check.columns
if isinstance(dq_rule_check, DQColRule):
arguments["col_name"] = dq_rule_check.col_name
dq_rule_rows.append(
[
dq_rule_check.name,
dq_rule_check.criticality,
{"function": dq_rule_check.check_func.__name__, "arguments": arguments},
dq_rule_check.filter,
run_config_name,
]
)
return spark.createDataFrame(dq_rule_rows, schema)

@staticmethod
def build_checks_by_metadata(checks: list[dict], custom_checks: dict[str, Any] | None = None) -> list[DQColRule]:
"""Build checks based on check specification, i.e. function name plus arguments.
@@ -590,6 +658,23 @@ def load_checks_from_installation(
raise ValueError(f"Invalid or no checks in workspace file: {installation.install_folder()}/{filename}")
return parsed_checks

def load_checks_from_table(
self, table_name: str, run_config_name: str = "default", spark: SparkSession | None = None
) -> list[dict]:
"""
Load checks (dq rules) from a Delta table in the workspace.
:param table_name: Unity catalog or Hive metastore table name
:param run_config_name: Run configuration name for filtering checks
:param spark: Optional SparkSession
:return: List of dq rules or raise an error if the checks table is missing or contains invalid checks.
"""
logger.info(f"Loading quality rules (checks) from table {table_name}")
if not self.ws.tables.exists(table_name).table_exists:
raise NotFound(f"Table {table_name} does not exist in the workspace")
if spark is None:
spark = SparkSession.builder.getOrCreate()
return DQEngine._load_checks_from_table(table_name, run_config_name, spark)

@staticmethod
def save_checks_in_local_file(checks: list[dict], path: str):
return DQEngineCore.save_checks_in_local_file(checks, path)
@@ -633,6 +718,20 @@ def save_checks_in_workspace_file(self, checks: list[dict], workspace_path: str)
workspace_path, yaml.safe_dump(checks).encode('utf-8'), format=ImportFormat.AUTO, overwrite=True
)

@staticmethod
def save_checks_in_table(
checks: list[dict], table_name: str, run_config_name: str = "default", mode: str = "append"
):
"""
Save checks to a Delta table in the workspace.
:param checks: list of dq rules to save
:param table_name: Unity catalog or Hive metastore table name
:param run_config_name: Run configuration name for identifying groups of checks
:param mode: Output mode for writing checks to Delta (e.g. `append` or `overwrite`)
"""
logger.info(f"Saving quality rules (checks) to table {table_name}")
DQEngine._save_checks_in_table(checks, table_name, run_config_name, mode)

def load_run_config(
self, run_config_name: str | None = "default", assume_user: bool = True, product_name: str = "dqx"
) -> RunConfig:
@@ -670,3 +769,15 @@ def _load_checks_from_file(installation: Installation, filename: str) -> list[di
except NotFound:
msg = f"Checks file {filename} missing"
raise NotFound(msg) from None

@staticmethod
def _load_checks_from_table(table_name: str, run_config_name: str, spark: SparkSession | None = None) -> list[dict]:
if spark is None:
spark = SparkSession.builder.getOrCreate()
rules_df = spark.read.table(table_name)
return DQEngineCore.build_quality_rules_from_dataframe(rules_df, run_config_name=run_config_name)

@staticmethod
def _save_checks_in_table(checks: list[dict], table_name: str, run_config_name: str, mode: str):
rules_df = DQEngineCore.build_dataframe_from_quality_rules(checks, run_config_name=run_config_name)
rules_df.write.saveAsTable(table_name, mode=mode)
19 changes: 3 additions & 16 deletions tests/integration/conftest.py
@@ -80,12 +80,7 @@ def config(self) -> WorkspaceConfig:
class MockInstallationContext(MockRuntimeContext):
__test__ = False

def __init__(
self,
env_or_skip_fixture,
ws,
check_file,
):
def __init__(self, env_or_skip_fixture, ws, check_file):
super().__init__(env_or_skip_fixture, ws)
self.check_file = check_file

@@ -170,16 +165,8 @@ def workspace_installation(self) -> WorkspaceInstallation:


@pytest.fixture
def installation_ctx(
ws,
env_or_skip,
check_file="checks.yml",
) -> Generator[MockInstallationContext, None, None]:
ctx = MockInstallationContext(
env_or_skip,
ws,
check_file,
)
def installation_ctx(ws, env_or_skip, check_file="checks.yml") -> Generator[MockInstallationContext, None, None]:
ctx = MockInstallationContext(env_or_skip, ws, check_file)
yield ctx.replace(workspace_client=ws)
ctx.workspace_installation.uninstall()
